# Imports

In [None]:
import pickle
import torch
import random
import math
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torchvision import models, transforms
from torchsummary import summary
from torchvision.transforms import v2
from google.colab import drive
from google.colab import runtime
from PIL import Image
!pip install torchinfo
from torchinfo import summary

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


# Data

File Loading

In [None]:
drive.mount('/content/drive')


def _load_pickle(path):
    """Load one pickled object from ``path`` and close the file afterwards.

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in the
    file, so only point this at files you created yourself.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Front-camera images for the training and test splits.
train_front_data = _load_pickle("/content/drive/My Drive/AV Research/train_front_data_80k_256_cropped_color")
test_front_data = _load_pickle("/content/drive/My Drive/AV Research/test_front_data_80k_256_cropped_color")

# Regression targets matching the image arrays above.
output_data = _load_pickle("/content/drive/My Drive/AV Research/train_output_data_80k_256_cropped_color")
test_output_data = _load_pickle("/content/drive/My Drive/AV Research/test_output_data_80k_256_cropped_color")

Mounted at /content/drive


Data Transformations

In [None]:
class TrainDataAugmentation(nn.Module):
    """Training-time image pipeline.

    Converts the input to a single-channel image, applies random affine,
    rotation and colour-jitter perturbations, then scales to float32 and
    normalizes with a fixed mean/std.
    """

    def __init__(self):
        super().__init__()
        steps = [
            v2.ToImage(),
            v2.Grayscale(num_output_channels=1),
            # Random geometric perturbations.
            v2.RandomAffine(degrees=(0, 30), translate=(0.2, 0.2)),
            v2.RandomRotation(degrees=(0, 180)),
            # Random photometric perturbation.
            v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
            v2.ToDtype(torch.float32, scale=True),
            # NOTE(review): mean/std presumably measured on the training set -- confirm.
            v2.Normalize(mean=(0.5437,), std=(0.1366,)),
        ]
        self.transforms = v2.Compose(steps)

    def forward(self, image):
        """Return ``image`` after the full augmentation pipeline."""
        return self.transforms(image)

class ValDataAugmentation(nn.Module):
    """Deterministic validation pipeline.

    Converts the input to a single-channel image, scales it to float32 and
    normalizes it with a fixed mean/std. No random transforms are applied.
    """

    def __init__(self):
        super().__init__()
        steps = [
            v2.ToImage(),
            v2.Grayscale(num_output_channels=1),
            v2.ToDtype(torch.float32, scale=True),
            v2.Normalize(mean=(0.5437,), std=(0.1366,)),
        ]
        self.transforms = v2.Compose(steps)

    def forward(self, image):
        """Return ``image`` after conversion and normalization."""
        return self.transforms(image)

Data Preprocessing

In [None]:
# From: https://stackoverflow.com/questions/60101240/finding-mean-and-standard-deviation-across-image-channels-pytorch
def dataset_mean_std(data_loader):
  """Estimate per-channel mean and std of the images yielded by ``data_loader``.

  Each item of ``data_loader`` must be an ``(images, labels)`` pair where
  ``images`` is a 4-D channels-last tensor ``[B, H, W, C]`` (the channel count
  is read from dimension 3). Labels are ignored.

  The std is the average of the per-image stds, not the std of all pixels
  pooled together.

  Returns:
    ``(mean, std)``: two tensors of shape ``[C]``.

  Raises:
    ValueError: if the loader yields no images.
  """
  nimages = 0
  mean = 0.
  std = 0.
  for batch, _ in data_loader:
    # Move channels in front of the spatial dims before flattening. A bare
    # view() on channels-last data would interleave pixels of different
    # channels instead of grouping them per channel.
    per_channel = batch.float().permute(0, 3, 1, 2).flatten(2)  # [B, C, H * W]
    # Update total number of images
    nimages += per_channel.size(0)
    # Accumulate the per-image statistics, summed over the batch
    mean += per_channel.mean(2).sum(0)
    std += per_channel.std(2).sum(0)

  if nimages == 0:
    raise ValueError("data_loader yielded no images")

  # Final step: average over all images
  mean /= nimages
  std /= nimages

  print("Training set mean", mean)
  print("Training set std", std)

  return mean, std

def show_img(loader, index=70):
  """Display one sample from the first batch of ``loader`` and print its label.

  Args:
    loader: iterable yielding ``(features, labels)`` batches.
    index: position of the sample inside the first batch (default 70, the
      value that used to be hard-coded). Must be smaller than the batch size.
  """
  features, labels = next(iter(loader))
  img = features[index].squeeze()
  label = labels[index]
  # The image is transposed before plotting.
  # NOTE(review): for a [H, W] tensor this swaps rows and columns -- confirm intended.
  plt.imshow(img.T)
  plt.show()
  print(f"Label: {label}")

In [None]:
class RandomHorizontalFlipWithSteeringAngle(object):
    """Randomly mirror an image left-right and negate the first label entry.

    With probability ``p`` the image is horizontally flipped and ``output[0]``
    (the steering angle, going by the class name) changes sign so the label
    stays consistent with the mirrored image.
    """

    def __init__(self, p=0.5):
        # Probability of applying the flip to a given sample.
        self.p = p

    def __call__(self, input, output):
        """Return ``(input, output)``, flipped together with probability ``p``.

        ``output`` must support item assignment and either ``.clone()``
        (tensors) or ``.copy()`` (NumPy arrays, lists).
        """
        if random.random() < self.p:
            # hflip is not in-place: its return value must be kept, otherwise
            # the label is negated while the image stays unflipped.
            input = v2.functional.hflip(input)
            # Negate a copy so the label object held by the dataset is not
            # permanently changed between epochs.
            output = output.clone() if hasattr(output, "clone") else output.copy()
            output[0] = -output[0]
        return input, output


class AVDataset(Dataset):
    """Dataset pairing input images with their regression targets.

    Args:
        input_images: indexable collection of images.
        output_values: indexable collection of targets; its length defines
            the dataset length.
        transform: callable applied to each image, or ``None`` to return the
            raw image and target untouched.
        flip_p: probability of the joint horizontal flip / first-target
            negation applied after ``transform``. Defaults to 0.5; pass 0 to
            disable the flip (e.g. for a deterministic validation set).
    """

    def __init__(self, input_images, output_values, transform, flip_p=0.5):
        self.input_images = input_images
        self.output_values = output_values
        self.transform = transform
        # Build the flip transform once instead of on every __getitem__ call.
        # It is only needed when a transform is configured.
        self.flip = RandomHorizontalFlipWithSteeringAngle(flip_p) if transform else None

    def __len__(self):
        return len(self.output_values)

    def __getitem__(self, idx):
        """Return the ``(image, target)`` pair at ``idx``, transformed if configured."""
        input_image = self.input_images[idx]
        output_value = self.output_values[idx]

        if self.transform:
            input_image = self.transform(input_image)
            input_image, output_value = self.flip(input_image, output_value)

        return input_image, output_value

train_data_augmentation = TrainDataAugmentation()
val_data_augmentation = ValDataAugmentation()

train_dataset = AVDataset(input_images=train_front_data, output_values=output_data, transform=train_data_augmentation)
val_dataset = AVDataset(input_images=test_front_data, output_values=test_output_data, transform=val_data_augmentation)

batch_size = 128
# Shuffle the training samples every epoch. Without it each epoch replays the
# exact same batches in the stored order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True)
# Validation order does not affect the averaged loss, so it is left unshuffled.
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=1, pin_memory=True)
# Drop the notebook-level names; the loaders keep their own references to the data.
del train_dataset, val_dataset, train_front_data, test_front_data, output_data, test_output_data

# Model

Neural Network

In [None]:
# From: https://github.com/wzlxjtu/PositionalEncoding2D
class PositionalEncoding2d(nn.Module):
    """Fixed 2D sinusoidal positional encoding added to a feature map.

    The first half of the ``d_model`` channels encodes the column (width)
    position and the second half the row (height) position, each as
    interleaved sin/cos pairs.

    Args:
        d_model: number of channels of the feature map; must be divisible by 4.
        height: feature-map height the encoding is built for.
        width: feature-map width the encoding is built for.

    Raises:
        ValueError: if ``d_model`` is not divisible by 4.
    """

    def __init__(self, d_model, height, width):
        super(PositionalEncoding2d, self).__init__()
        if d_model % 4 != 0:
            raise ValueError("Cannot use 2D sin/cos positional encoding with "
                             "a dimension not divisible by 4 (got dim={:d})".format(d_model))
        pe = torch.zeros(d_model, height, width)
        # Each spatial axis uses half of the channels.
        half = d_model // 2
        div_term = torch.exp(torch.arange(0., half, 2) *
                             -(math.log(10000.0) / half))
        pos_w = torch.arange(0., width).unsqueeze(1)
        pos_h = torch.arange(0., height).unsqueeze(1)
        # Width encoding: constant along the height axis.
        pe[0:half:2, :, :] = torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
        pe[1:half:2, :, :] = torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
        # Height encoding: constant along the width axis.
        pe[half::2, :, :] = torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
        pe[half + 1::2, :, :] = torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
        # Buffer: follows the module across devices but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` with the ``[d_model, height, width]`` encoding added (broadcast over the batch)."""
        return x + self.pe

In [None]:
# Inspired by: https://github.com/reshalfahsi/separableconv-torch
class SeparableConv2d(nn.Module):
    """Depthwise-separable 2D convolution.

    A per-channel (``groups=in_channels``) spatial convolution followed by a
    1x1 convolution that mixes channels and maps to ``out_channels``.
    ``bias`` is applied to both convolutions.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, bias):
        super().__init__()
        # Spatial filtering, one filter per input channel.
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=in_channels,
            bias=bias,
        )
        # Channel mixing.
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias)

    def forward(self, x):
        """Apply the depthwise convolution, then the pointwise one."""
        return self.pointwise(self.depthwise(x))

class ResidualBlock(nn.Module):
    """Stack of separable convolutions with optional pooling and a residual shortcut.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the block.
        kernel_size: kernel size of the separable convolutions and of the
            max-pool. Use an odd value so that ``kernel_size // 2`` padding
            preserves the spatial size and the main and shortcut paths match.
        num_layers: total number of separable convolutions (the input
            convolution plus ``num_layers - 1`` more).
        pool: if True, halve the resolution with a stride-2 max-pool on the
            main path and a stride-2 1x1 convolution on the shortcut.
        short: if True, add the 1x1-projected input to the main path and
            apply a final SELU.
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, num_layers: int, pool: bool, short: bool):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.pooling = pool
        self.short = short

        # "Same" padding derived from the kernel size (1 for a 3x3 kernel)
        # instead of a hard-coded 1 that only fits 3x3 kernels.
        padding = kernel_size // 2

        self.inconv = nn.Sequential(
            SeparableConv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1, padding=padding, bias=False),
            nn.SELU()
        )

        layers = []
        for _ in range(num_layers - 1):
            layers.append(SeparableConv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1, padding=padding, bias=False))
            layers.append(nn.SELU())
        self.convlayers = nn.Sequential(*layers)

        if self.pooling:
            self.pool = nn.MaxPool2d(kernel_size=kernel_size, stride=2, padding=padding)
            # The shortcut must downsample by the same factor as the pool.
            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=2, bias=False)
        else:
            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)

        self.sact = nn.SELU()

    def forward(self, x):
        """Run the main path and, if enabled, add the projected shortcut."""
        out = self.inconv(x)
        out = self.convlayers(out)

        if self.pooling:
            out = self.pool(out)

        if self.short:
            shortcut = self.shortcut(x)
            out = out + shortcut
            out = self.sact(out)

        return out

In [None]:
class AVModel(nn.Module):
    """CNN + attention regressor producing three bounded outputs.

    Pipeline: 3x3 conv stem -> SELU -> additive 2D positional encoding ->
    layer norm -> four pooled residual blocks (64 -> 512 channels) ->
    multi-head self-attention over the flattened feature map -> global
    average pooling -> three-layer MLP -> linear output of size 3.

    The input must have shape ``(batch, 1, 171, 256)``. The positional
    encoding is built for 171x256, and the attention ``embed_dim`` of 176 is
    the 11x16 feature map left after the four stride-2 stages.

    Output: ``(batch, 3)``; column 0 is clamped to [-1, 1] and columns 1-2
    to [0, 1].
    """

    def __init__(self):
        super(AVModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
        self.act = nn.SELU()
        self.positional_encoding = PositionalEncoding2d(64, 171, 256)

        self.conv_layers = nn.Sequential(
            ResidualBlock(in_channels=64, out_channels=64, kernel_size=3, num_layers=4, pool=True, short=True),
            ResidualBlock(in_channels=64, out_channels=128, kernel_size=3, num_layers=4, pool=True, short=True),
            ResidualBlock(in_channels=128, out_channels=256, kernel_size=3, num_layers=4, pool=True, short=True),
            ResidualBlock(in_channels=256, out_channels=512, kernel_size=3, num_layers=4, pool=True, short=True),
        )

        # embed_dim = 11 * 16 flattened spatial positions; the 512 channels act as the sequence.
        self.attention = nn.MultiheadAttention(embed_dim=176, num_heads=8, dropout=0.5, batch_first=True)

        self.dense_layers = nn.Sequential(
            nn.Linear(512, 1024, bias=False),
            nn.SELU(),
            nn.Linear(1024, 1024, bias=False),
            nn.SELU(),
            nn.Linear(1024, 1024, bias=False),
            nn.SELU(),
            nn.Dropout(0.5)
        )

        self.output_layer = nn.Linear(1024, 3)

    def forward(self, x):
        """Map a ``(batch, 1, 171, 256)`` image batch to ``(batch, 3)`` bounded outputs."""
        x = self.conv1(x)
        x = self.act(x)
        # Normalize each sample over (C, H, W) only. Including the batch
        # dimension in normalized_shape would share statistics across samples
        # and make a prediction depend on the rest of its batch.
        x = F.layer_norm(self.positional_encoding(x), x.shape[1:])
        x = self.conv_layers(x)

        n_samples, channels, height, width = x.size()
        x = x.view(n_samples, channels, height * width)
        attention_output, _ = self.attention(x, x, x)
        # Residual connection followed by per-sample normalization (batch dim excluded).
        x = F.layer_norm(x + attention_output, x.shape[1:])

        x = torch.mean(x.view(x.size(0), x.size(1), -1), dim=2) # GlobalAveragePooling2D
        x = self.dense_layers(x)
        x = self.output_layer(x)

        # Column 0 is clamped to [-1, 1]; the remaining columns to [0, 1].
        steering_output = F.hardtanh(x[:, 0:1])
        throttle_brake_output = F.hardtanh(x[:, 1:], min_val=0)
        out = torch.cat((steering_output, throttle_brake_output), dim=1)
        return out

# Util

MFPE Loss

In [None]:
class MFPELoss(nn.Module):
    """Mean fourth-power error: ``mean((input - target) ** 4)`` over all elements."""

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        """Return the scalar mean of the element-wise error raised to the 4th power."""
        return torch.mean((input - target) ** 4)

Early Stopping

In [None]:
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    Args:
        patience: number of "worse" epochs tolerated before stopping.
        min_delta: margin above the best loss that a value must exceed to
            count as worse.
    """

    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        """Record ``validation_loss`` and return True once patience is exhausted.

        A new best loss resets the counter. A loss above ``best + min_delta``
        increments it. Anything in between leaves the counter unchanged.
        """
        if validation_loss < self.min_validation_loss:
            # New best: remember it and start counting again.
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        if validation_loss > self.min_validation_loss + self.min_delta:
            self.counter += 1
            return self.counter >= self.patience

        return False

Autoclip

In [None]:
# From: https://github.com/pseeth/autoclip/blob/master/autoclip.py
class AutoClip:
    """Adaptive gradient clipping based on the history of gradient norms.

    On every call the model's current total gradient norm is appended to
    ``grad_history``, and gradients are clipped to the ``percentile``-th
    percentile of that history.
    """

    def __init__(self, percentile):
        self.grad_history = []
        self.percentile = percentile

    def compute_grad_norm(self, model):
        """Return the L2 norm of all gradients of ``model`` as a Python number.

        Parameters without a gradient are skipped.
        """
        squared_sum = 0
        for param in model.parameters():
            if param.grad is None:
                continue
            squared_sum += param.grad.data.norm(2).item() ** 2
        return squared_sum ** (1. / 2)

    def __call__(self, model):
        """Record the current gradient norm and clip ``model``'s gradients in place."""
        self.grad_history.append(self.compute_grad_norm(model))
        threshold = np.percentile(self.grad_history, self.percentile)
        torch.nn.utils.clip_grad_norm_(model.parameters(), threshold)

# Training Loop

In [None]:
# Hyperparameters
num_epochs = 50
learning_rate = 0.00001

# Backend flags: anomaly detection off, cuDNN autotuning on.
torch.autograd.set_detect_anomaly(False)
torch.backends.cudnn.benchmark = True

# Model, loss and optimizer on the first GPU when available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = AVModel().to(device)
criterion = MFPELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training helpers: early stopping, mixed-precision loss scaling, adaptive clipping.
early_stopping = EarlyStopping(patience=3)
scaler = torch.cuda.amp.GradScaler()
autoclipper = AutoClip(percentile=10)

print(summary(model, input_size=(batch_size, 1, 171, 256)))

def train_loop(train_loader, model, criterion, optimizer, device):
    """Train ``model`` for one epoch with mixed precision and adaptive clipping.

    Uses the notebook-level ``scaler`` (GradScaler) and ``autoclipper``
    (AutoClip) objects.

    Args:
        train_loader: DataLoader yielding ``(inputs, targets)`` batches.
        model: network to train; switched to train mode.
        criterion: loss callable ``criterion(outputs, targets)``.
        optimizer: optimizer stepping ``model``'s parameters.
        device: device the batches are moved to.

    Returns:
        The mean of the per-batch losses over the epoch.
    """
    size = len(train_loader.dataset)
    model.train()
    train_loss = 0.0
    num_batches = len(train_loader)
    seen = 0  # samples processed so far, for the progress message

    # Training loop
    for batch, (inputs, targets) in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()
        inputs = inputs.float().to(device)
        targets = targets.float().to(device)
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
          outputs = model(inputs)
          loss = criterion(outputs, targets)

        batch_loss = loss.item()
        train_loss += batch_loss
        seen += len(inputs)

        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale here. Unscale them
        # first so the clipper records and clips true gradient norms.
        scaler.unscale_(optimizer)
        autoclipper(model)

        # step() skips the update if the unscaled gradients contain inf/NaN.
        scaler.step(optimizer)
        scaler.update()

        if batch % 100 == 0:
            print(f"Training loss: {batch_loss:>7f}  [{seen:>5d}/{size:>5d}]")

    train_loss = train_loss / num_batches
    return train_loss

def val_loop(val_loader, model, criterion, device):
    """Evaluate ``model`` on ``val_loader`` and return the mean per-batch loss.

    The model is switched to eval mode and no gradients are tracked.
    """
    model.eval()
    num_batches = len(val_loader)
    running_loss = 0.0

    with torch.no_grad():
        for inputs, targets in val_loader:
            predictions = model(inputs.float().to(device, non_blocking=True))
            expected = targets.float().to(device, non_blocking=True)
            running_loss += criterion(predictions, expected).item()

    return running_loss / num_batches

# Track the best-performing weights so the model left in memory after training
# is the one with the lowest validation loss, not the last epoch's.
best_val_loss = float('inf')
best_state = None

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loss = train_loop(train_loader, model, criterion, optimizer, device)
    val_loss = val_loop(val_loader, model, criterion, device)
    print(f"Train loss: {train_loss:>8f} - Val loss: {val_loss:>8f} \n")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot on the CPU so the copy does not take up GPU memory.
        best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

    if early_stopping.early_stop(val_loss):
        print(f"Early stopping after {epoch+1} epochs \n")
        print(f"Best val loss: {early_stopping.min_validation_loss} \n")
        break

# Restore the best weights (no-op if no epoch ran).
if best_state is not None:
    model.load_state_dict(best_state)

Layer (type:depth-idx)                        Output Shape              Param #
AVModel                                       [128, 3]                  --
├─Conv2d: 1-1                                 [128, 64, 171, 256]       576
├─SELU: 1-2                                   [128, 64, 171, 256]       --
├─PositionalEncoding2d: 1-3                   [128, 64, 171, 256]       --
├─Sequential: 1-4                             [128, 512, 11, 16]        --
│    └─ResidualBlock: 2-1                     [128, 64, 86, 128]        --
│    │    └─Sequential: 3-1                   [128, 64, 171, 256]       4,672
│    │    └─Sequential: 3-2                   [128, 64, 171, 256]       14,016
│    │    └─MaxPool2d: 3-3                    [128, 64, 86, 128]        --
│    │    └─Conv2d: 3-4                       [128, 64, 86, 128]        4,096
│    │    └─SELU: 3-5                         [128, 64, 86, 128]        --
│    └─ResidualBlock: 2-2                     [128, 128, 43, 64]        --
│    │   

  self.pid = os.fork()
  0%|          | 1/625 [00:10<1:52:58, 10.86s/it]

Training loss: 0.123673  [  128/80000]


 16%|█▌        | 101/625 [03:35<17:54,  2.05s/it]

Training loss: 0.010814  [12928/80000]


 32%|███▏      | 201/625 [07:00<14:29,  2.05s/it]

Training loss: 0.004182  [25728/80000]


 48%|████▊     | 301/625 [10:25<11:04,  2.05s/it]

Training loss: 0.063306  [38528/80000]


 64%|██████▍   | 401/625 [13:50<07:39,  2.05s/it]

Training loss: 0.013785  [51328/80000]


 80%|████████  | 501/625 [17:15<04:14,  2.05s/it]

Training loss: 0.028335  [64128/80000]


 96%|█████████▌| 601/625 [20:40<00:49,  2.05s/it]

Training loss: 0.072845  [76928/80000]


100%|██████████| 625/625 [21:29<00:00,  2.06s/it]


Train loss: 0.048502 - Val loss: 0.025264 

Epoch 2
-------------------------------


  0%|          | 1/625 [00:02<29:23,  2.83s/it]

Training loss: 0.038063  [  128/80000]


 16%|█▌        | 101/625 [03:28<17:56,  2.05s/it]

Training loss: 0.007250  [12928/80000]


 32%|███▏      | 201/625 [06:53<14:31,  2.05s/it]

Training loss: 0.002970  [25728/80000]


 48%|████▊     | 301/625 [10:19<11:05,  2.05s/it]

Training loss: 0.066907  [38528/80000]


 64%|██████▍   | 401/625 [13:44<07:40,  2.05s/it]

Training loss: 0.012815  [51328/80000]


 80%|████████  | 501/625 [17:10<04:14,  2.05s/it]

Training loss: 0.023119  [64128/80000]


 96%|█████████▌| 601/625 [20:35<00:49,  2.06s/it]

Training loss: 0.032667  [76928/80000]


100%|██████████| 625/625 [21:25<00:00,  2.06s/it]


Train loss: 0.040115 - Val loss: 0.025216 

Epoch 3
-------------------------------


  0%|          | 1/625 [00:02<29:25,  2.83s/it]

Training loss: 0.040887  [  128/80000]


 16%|█▌        | 101/625 [03:28<17:56,  2.06s/it]

Training loss: 0.008815  [12928/80000]


 32%|███▏      | 201/625 [06:53<14:31,  2.05s/it]

Training loss: 0.003057  [25728/80000]


 48%|████▊     | 301/625 [10:19<11:05,  2.05s/it]

Training loss: 0.054825  [38528/80000]


 64%|██████▍   | 401/625 [13:44<07:40,  2.05s/it]

Training loss: 0.012686  [51328/80000]


 80%|████████  | 501/625 [17:10<04:14,  2.05s/it]

Training loss: 0.021046  [64128/80000]


 96%|█████████▌| 601/625 [20:35<00:49,  2.06s/it]

Training loss: 0.029863  [76928/80000]


100%|██████████| 625/625 [21:25<00:00,  2.06s/it]


Train loss: 0.037136 - Val loss: 0.026138 

Epoch 4
-------------------------------


  0%|          | 1/625 [00:02<29:14,  2.81s/it]

Training loss: 0.039149  [  128/80000]


 16%|█▌        | 101/625 [03:28<17:56,  2.05s/it]

Training loss: 0.013129  [12928/80000]


 32%|███▏      | 201/625 [06:53<14:31,  2.06s/it]

Training loss: 0.004478  [25728/80000]


 48%|████▊     | 301/625 [10:19<11:05,  2.06s/it]

Training loss: 0.049832  [38528/80000]


 64%|██████▍   | 401/625 [13:44<07:40,  2.06s/it]

Training loss: 0.011052  [51328/80000]


 80%|████████  | 501/625 [17:10<04:14,  2.06s/it]

Training loss: 0.018290  [64128/80000]


 96%|█████████▌| 601/625 [20:35<00:49,  2.06s/it]

Training loss: 0.027291  [76928/80000]


100%|██████████| 625/625 [21:25<00:00,  2.06s/it]


Train loss: 0.034216 - Val loss: 0.028801 

Epoch 5
-------------------------------


  0%|          | 1/625 [00:02<29:31,  2.84s/it]

Training loss: 0.039552  [  128/80000]


 16%|█▌        | 101/625 [03:28<17:56,  2.06s/it]

Training loss: 0.018531  [12928/80000]


 32%|███▏      | 201/625 [06:53<14:31,  2.05s/it]

Training loss: 0.002902  [25728/80000]


 48%|████▊     | 301/625 [10:19<11:05,  2.06s/it]

Training loss: 0.048477  [38528/80000]


 64%|██████▍   | 401/625 [13:44<07:40,  2.06s/it]

Training loss: 0.012001  [51328/80000]


 80%|████████  | 501/625 [17:10<04:14,  2.06s/it]

Training loss: 0.016658  [64128/80000]


 96%|█████████▌| 601/625 [20:35<00:49,  2.05s/it]

Training loss: 0.024397  [76928/80000]


100%|██████████| 625/625 [21:25<00:00,  2.06s/it]


Train loss: 0.033283 - Val loss: 0.030582 

Early stopping after 5 epochs 

Best val loss: 0.025216350458562374 



In [None]:
# Persist the model's current weights (state_dict only, not the full module) to Drive.
torch.save(model.state_dict(), "/content/drive/My Drive/AV Research/summer_torch_model256.pt")

In [None]:
# Release the Colab runtime once everything above has finished.
# `runtime` is already imported in the imports cell; this repeat import is redundant.
from google.colab import runtime
runtime.unassign()