In [None]:
import sys
# Put the l5kit copy shipped in the Kaggle input dataset at the front of the import path,
# presumably so the `l5kit` imports below resolve to this copy rather than any installed one.
sys.path.insert(0, '/kaggle/input/kbl5kit-v2/l5kit/')

In [None]:
# from IPython.core.debugger import set_trace

In [None]:
import os
from pprint import pprint
from typing import Dict

from tempfile import gettempdir
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from torchvision.models.resnet import resnet50
# from torchvision.models.mobilenet import mobilenet_v2
from tqdm.notebook import tqdm
import wandb

from l5kit.configs import load_config_data
from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset, EgoDataset
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_pred_csv, compute_metrics_csv, read_gt_csv, create_chopped_dataset
from l5kit.evaluation.chop_dataset import MIN_FUTURE_STEPS
from l5kit.evaluation.metrics import neg_multi_log_likelihood, time_displace
from l5kit.geometry import transform_points
from l5kit.visualization import PREDICTED_POINTS_COLOR, TARGET_POINTS_COLOR, draw_trajectory
from prettytable import PrettyTable
from pathlib import Path

## Prepare Data path and load cfg
By setting the L5KIT_DATA_FOLDER variable, we can point the script to the folder where the data lies.

Then, we load our config file with relative paths and other configurations (rasteriser, training params...).

In [None]:
# Point l5kit at the competition data: L5KIT_DATA_FOLDER is set before the data manager is created.
os.environ["L5KIT_DATA_FOLDER"] = "/kaggle/input/lyft-motion-prediction-autonomous-vehicles/"
# Passing None gives no explicit folder; presumably the manager then falls back to the
# environment variable set above -- confirm against the l5kit version in use.
dm = LocalDataManager(None)
# Load the baseline agent-motion config that ships with the bundled l5kit examples
# (an alternative config location is kept below for reference).
# cfg = load_config_data("/kaggle/input/lyft-config-files/agent_motion_config.yaml")
cfg = load_config_data("/kaggle/input/kbl5kit-v2/examples/agent_motion_prediction/agent_motion_config.yaml")
# Show the loaded configuration so the values in effect are visible in the notebook output.
pprint(cfg)

In [None]:
# Notebook-specific settings layered on top of the YAML config, grouped by config section.
cfg_overrides = {
    'model_params': {'history_num_frames': 10},
    'train_params': {'max_num_steps': 25000, 'checkpoint_every_n_steps': 5000},
    'train_data_loader': {'batch_size': 12, 'num_workers': 4, 'key': 'scenes/train.zarr'},
    'val_data_loader': {'batch_size': 12, 'num_workers': 4, 'key': 'scenes/validate.zarr'},
}
# Write each group of values into the matching section of the loaded config.
for section_name, section_overrides in cfg_overrides.items():
    cfg[section_name].update(section_overrides)

## Model

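Our model builds on a `resnet50` backbone pretrained on ImageNet. We replace its input layer to match the number of channels produced by the rasterizer, and replace its final layer with an identity so that the image features can be combined with an LSTM encoding of the agent's motion history before an LSTM decoder predicts the future trajectory.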

In [None]:
class Identity(nn.Module):
    """Pass-through layer: ``forward`` hands its input back unchanged.

    Used to replace a layer of an existing network (e.g. a classification
    head) so that the features feeding that layer become the network output.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # No computation: the very same object is returned.
        return x
    
class Resnet50LSTM(nn.Module):
    """ResNet-50 image encoder fused with an LSTM encoder/decoder over motion history.

    The raster image goes through a ResNet-50 whose classification head is
    replaced by an identity (2048 features). The per-timestep kinematic history
    (position, yaw, velocity, acceleration, yaw rate; 8 values per step) goes
    through a small linear layer and an LSTM encoder, whose last output is
    concatenated with the image features. The fused vector is repeated once per
    future frame, decoded by a second LSTM and projected to
    ``2 * future_num_frames`` outputs.

    Args:
        config: configuration dictionary. Reads
            ``config['train_data_loader']['batch_size']``,
            ``config['model_params']['history_num_frames']`` and
            ``config['model_params']['future_num_frames']``.
    """

    def __init__(self, config: Dict):
        super(Resnet50LSTM, self).__init__()
        self.cfg = config
        self.batch_size = self.cfg['train_data_loader']['batch_size']
        self.hist_frames = self.cfg['model_params']['history_num_frames']
        # One (x, y) pair per future frame.
        self.num_targets = 2 * self.cfg["model_params"]["future_num_frames"]
        self.input_size = 8  # pos(2) + yaw(1) + vel(2) + accel(2) + yawrate(1)
        self.hidden_size = 64
        self.cnn = self.build_basecnn()
        # 2048 image features (ResNet-50 with its head removed) + encoder LSTM output.
        self.fc_infeatures = 2048 + self.hidden_size
        # Per-timestep embedding of the raw kinematic features.
        self.fc0 = nn.Sequential(
            nn.Linear(in_features=self.input_size, out_features=self.input_size),
            nn.LeakyReLU(inplace=True)
        )
        # Fuses image features with the history encoding into a 128-d vector.
        self.fc1 = nn.Sequential(
            nn.Dropout(p=0.2, inplace=False),
            nn.Linear(in_features=self.fc_infeatures, out_features=4096),
            nn.LeakyReLU(inplace=True),
            nn.Dropout(p=0.2, inplace=False),
            nn.Linear(in_features=4096, out_features=128),
            nn.LeakyReLU(inplace=True)
        )
        self.encoder = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=1, batch_first=True)
        self.decoder = nn.LSTM(input_size=128, hidden_size=self.hidden_size, num_layers=1, batch_first=True)
        # The output width follows the configured horizon instead of a hard-coded 100
        # (the two are equal when future_num_frames == 50).
        self.fc2 = nn.Linear(in_features=(self.num_targets//2) * self.hidden_size, out_features=self.num_targets)

    def forward(self, x, pos, yaw, vel, accel, yawrate):
        """Predict the flattened future trajectory.

        Args:
            x: image batch fed to the CNN.
            pos, yaw, vel, accel, yawrate: history tensors that are concatenated
                along dim 2, so they must agree on the first two dimensions and
                their last dimensions must add up to ``self.input_size``.

        Returns:
            Tensor of shape ``[batch, self.num_targets]``.
        """
        enc_inputs = torch.cat([pos, yaw, vel, accel, yawrate], dim=2)
        # nn.Linear acts on the last dimension, so the whole sequence can be
        # embedded in one call instead of looping over timesteps.
        enc_inputs = self.fc0(enc_inputs)
        enc_output, _ = self.encoder(enc_inputs)
        enc_output = enc_output[:, -1, :]  # keep only the output at the last timestep
        x = self.cnn(x)
        x = torch.cat([x, enc_output], dim=1)
        x = self.fc1(x)
        future_steps = self.num_targets // 2
        # Feed the same fused vector to the decoder at every future step.
        x = torch.repeat_interleave(x.unsqueeze(1), repeats=future_steps, dim=1)  # [batch, future_steps, 128]
        dec_output, _ = self.decoder(x)  # [batch, future_steps, hidden_size]
        x = self.fc2(dec_output.reshape(-1, future_steps * self.hidden_size))

        return x

    def build_basecnn(self):
        """Build the image backbone: pretrained ResNet-50 with a new first conv and no head.

        The first convolution is rebuilt (and therefore freshly initialised) so
        that it accepts ``3 + 2 * (history_num_frames + 1)`` input channels; the
        final fully connected layer is replaced by ``Identity``.
        """
        # load pre-trained Conv2D model
        model = resnet50(pretrained=True)

        # Change the number of input channels to match the rasterizer's output.
        # The instance's own config is used (not the notebook-level `cfg` global),
        # so the model is consistent with whatever config it was constructed with.
        num_history_channels = (self.cfg["model_params"]["history_num_frames"] + 1) * 2
        num_in_channels = 3 + num_history_channels
        model.conv1 = nn.Conv2d(
            num_in_channels,
            model.conv1.out_channels,
            kernel_size=model.conv1.kernel_size,
            stride=model.conv1.stride,
            padding=model.conv1.padding,
            bias=False,
        )

        # Drop the classification head; the pooled features are consumed by fc1.
        model.fc = Identity()

        return model

In [None]:
# class Identity(nn.Module):
#     def __init__(self):
#         super(Identity, self).__init__()

#     def forward(self, x):
#         return x
    
# class MobilenetV2LSTM(nn.Module):
#     def __init__(self, config: Dict):
#         super(MobilenetV2LSTM, self).__init__()
#         self.cfg = config
#         self.batch_size = self.cfg['train_data_loader']['batch_size']
#         self.hist_frames = self.cfg['model_params']['history_num_frames']
#         self.num_targets = 2 * self.cfg["model_params"]["future_num_frames"]
#         self.input_size = 8  # pos(2) + yaw(1) + vel(2) + accel(2) + yawrate(1)
#         self.hidden_size = 64
#         self.cnn = self.build_basecnn()
#         self.fc_infeatures = 1280 + self.hidden_size
#         self.fc0 = nn.Sequential(
#             nn.Linear(in_features=self.input_size, out_features=self.input_size),
#             nn.LeakyReLU(inplace=True)
#         )
#         self.fc1 = nn.Sequential(
#             nn.Dropout(p=0.2, inplace=False),
#             nn.Linear(in_features=self.fc_infeatures, out_features=4096),
#             nn.LeakyReLU(inplace=True),
#             nn.Dropout(p=0.2, inplace=False),
#             nn.Linear(in_features=4096, out_features=128),
#             nn.LeakyReLU(inplace=True)
#         )
#         self.encoder = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=1, batch_first=True)
#         self.decoder = nn.LSTM(input_size=128, hidden_size=self.hidden_size, num_layers=1, batch_first=True)
#         self.fc2 = nn.Linear(in_features=(self.num_targets//2) * self.hidden_size, out_features=100)
# #         self.hidden_cell = (torch.zeros(self.batch_size, 1, self.hidden_size),
# #                             torch.zeros(self.batch_size, 1, self.hidden_size))

#     def forward(self, x, pos, yaw, vel, accel, yawrate):
#         enc_inputs = torch.cat([pos, yaw, vel, accel, yawrate], dim=2)
#         enc_inputs = [self.fc0(enc_inputs[:, i, :].reshape(-1, self.input_size)).unsqueeze(1) for i in range(enc_inputs.shape[1])]
#         enc_inputs = torch.cat(enc_inputs, dim=1)
#         enc_output, _ = self.encoder(enc_inputs)
#         enc_output = enc_output[:, -1, :]  # get ouput at last timestep
#         x = self.cnn(x)
#         x = torch.cat([x, enc_output], dim=1)
#         x = self.fc1(x)
#         x = torch.repeat_interleave(x.unsqueeze(1), repeats=self.num_targets//2, dim=1)  # shape: [batch, 50, 128]
#         dec_output, _ = self.decoder(x)  # shape: [batch, 50, 64]
#         x = self.fc2(dec_output.reshape(-1, (self.num_targets//2) * self.hidden_size))

#         return x

#     def build_basecnn(self):
#         # change input channels number to match the rasterizer's output
#         mnet = mobilenet_v2(pretrained=True)
#         num_history_channels = (self.cfg["model_params"]["history_num_frames"] + 1) * 2
#         num_in_channels = 3 + num_history_channels
#         mnet.features[0][0] = nn.Conv2d(
#             num_in_channels,
#             mnet.features[0][0].out_channels,
#             kernel_size=mnet.features[0][0].kernel_size,
#             stride=mnet.features[0][0].stride,
#             padding=mnet.features[0][0].padding,
#             bias=False,
#         )

#         mnet.classifier = Identity()
        
#         return mnet

In [None]:
def forward(data, model, device, criterion):
    """Run one batch through ``model`` and return ``(loss, outputs)``.

    Args:
        data: batch dictionary providing ``image``, the ``history_*`` tensors
            (positions, yaws, velocities, accels, yawrates),
            ``target_availabilities`` and ``target_positions``.
        model: callable taking ``(image, pos, yaw, vel, accel, yawrate)``; must
            expose ``hist_frames``.
        device: device the tensors are moved to before the model call.
        criterion: element-wise loss (must not reduce, since the result is
            masked with the target availabilities afterwards).

    Returns:
        Tuple of the scalar mean masked loss and the model outputs reshaped to
        the shape of ``target_positions``.
    """
    image = data["image"].to(device)

    # Accelerations are the shortest history series, so the other series are
    # trimmed at the end to line up with it (positions/yaws by two steps,
    # velocities/yaw rates by one).
    history = [
        data["history_positions"][:, :-2, :].to(device),
        data["history_yaws"][:, :-2, :].to(device),
        data["history_velocities"][:, :-1, :].to(device),
        data["history_accels"].to(device),
        data["history_yawrates"][:, :-1, :].to(device),
    ]

    # When the batch carries fewer history frames than the model expects,
    # append zero rows along the time axis of every series.
    expected_frames = model.hist_frames + 1
    available_frames = data['history_positions'].shape[1]
    if available_frames < expected_frames:
        time_padding = (0, 0, 0, expected_frames - available_frames)
        history = [F.pad(series, time_padding) for series in history]

    availability_mask = data["target_availabilities"].unsqueeze(-1).to(device)
    targets = data["target_positions"].to(device)

    # Forward pass
    outputs = model(image, *history).reshape(targets.shape)

    # Unavailable future steps are zeroed out of the per-element loss before
    # averaging over all elements.
    masked_loss = criterion(outputs, targets) * availability_mask
    return masked_loss.mean(), outputs

## Load the Train Data

Our data pipeline maps a raw `.zarr` folder into a multi-processing instance ready for training by:
- loading the `zarr` into a `ChunkedDataset` object. This object holds references to the different arrays in the zarr (e.g. agents and traffic lights);
- wrapping the `ChunkedDataset` into an `AgentDataset`, which inherits from torch `Dataset` class;
- passing the `AgentDataset` into a torch `DataLoader`

In [None]:
# Load train dataset
# Data-loader settings for training (key, batch_size, shuffle, num_workers are read below).
train_cfg = cfg["train_data_loader"]
# The rasterizer is built from the config and data manager and handed to the dataset.
rasterizer = build_rasterizer(cfg, dm)
# Locate the zarr named by the config key through the data manager and open it.
train_zarr = ChunkedDataset(dm.require(train_cfg["key"])).open()
# Wrap the opened zarr, together with the config and rasterizer, in an AgentDataset.
train_dataset = AgentDataset(cfg, train_zarr, rasterizer)
# Batch the dataset with the configured shuffle flag, batch size and worker count.
train_dataloader = DataLoader(train_dataset, shuffle=train_cfg["shuffle"], batch_size=train_cfg["batch_size"], 
                             num_workers=train_cfg["num_workers"])
# Sanity output: dataset summary, number of samples, number of batches per pass.
print(train_dataset)
print(len(train_dataset))
print(len(train_dataloader))

In [None]:
# ==== INIT MODEL
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model = MobilenetV2LSTM(cfg).to(device)
model = Resnet50LSTM(cfg).to(device)
# AdamW over all model parameters with a fixed base learning rate of 1e-5.
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
# reduction="none" keeps per-element losses so that forward() can mask out
# unavailable target steps before averaging.
criterion = nn.MSELoss(reduction="none")
# Cosine annealing with warm restarts: T_0=2500 scheduler steps for the first cycle,
# learning rate annealed down to eta_min=1e-8. train() steps the scheduler once per batch.
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=2500, eta_min=1e-8)

# Training loop

In [None]:
def train(model, device, train_loader, criterion, optimizer, scheduler, max_steps, checkpoint_steps):
    """Train ``model`` for at most ``max_steps`` optimisation steps.

    Iterates over ``train_loader`` once, performing one optimizer step and one
    scheduler step per batch, and logs the per-step loss, the running average
    loss and the current learning rate to wandb.

    Args:
        model: network to train; switched to training mode.
        device: device passed through to ``forward``.
        train_loader: iterable of batches accepted by ``forward``.
        criterion: loss passed through to ``forward``.
        optimizer: optimizer updating the model parameters.
        scheduler: learning-rate scheduler, stepped after every optimizer step.
        max_steps: number of batches after which training stops. Training also
            stops earlier if ``train_loader`` is exhausted. At least one batch
            is processed whenever the loader is non-empty.
        checkpoint_steps: checkpoint interval; currently unused because
            periodic checkpointing is disabled.

    Returns:
        The average training loss over the processed steps, or ``nan`` when
        ``train_loader`` yields no batches.
    """
    # Training mode matters for layers such as dropout and batchnorm, which
    # behave differently during training and evaluation.
    model.train()

    loss_total = 0.0
    step = 0
    # Defined up front so an empty loader returns a value instead of raising.
    avg_train_loss = float("nan")

    train_pbar = tqdm(train_loader, desc="Training steps", leave=True, total=max_steps)
    for data in train_pbar:
        # Forward pass to calculate loss
        loss, _ = forward(data, model, device, criterion)

        # Reset the gradients, backpropagate, then update weights and learning rate.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running average of the loss over the steps so far (kept as a running
        # sum rather than re-averaging the whole history every step).
        step += 1
        step_loss = loss.item()
        loss_total += step_loss
        avg_train_loss = loss_total / step

        # wandb logging - metrics to track
        wandb.log({'Training Loss': step_loss, 'Avg Training Loss': avg_train_loss,
                   'Learning rate': optimizer.param_groups[0]["lr"]})

        train_pbar.set_description(f" Avg train loss: {avg_train_loss}")

        # Periodic checkpointing is disabled:
        # if step % checkpoint_steps == 0:
        #     torch.save(model.state_dict(), f'l5run1_iter{step}_resnet50.pth')

        # Stop once exactly max_steps batches have been processed (the counter
        # is incremented before this check, so no extra step is run).
        if step >= max_steps:
            break

    return avg_train_loss

# Main

In [None]:
# Main training run: start a Weights & Biases run, record the hyperparameters, then train.

# Start the W&B run; anonymous='must' requests an anonymous run.
wandb.init(project='cs535-project', name='lstm-resnet50-25k', anonymous='must')

# wandb.config stores the hyperparameters of this run. The values below are recorded
# for the dashboard; only `steps` is passed on to train() further down.
wandb_cfg = wandb.config                                         # handle to the run's config object
wandb_cfg.batch_size = cfg['train_data_loader']['batch_size']    # training batch size, taken from cfg
wandb_cfg.test_batch_size = cfg['val_data_loader']['batch_size'] # validation batch size, taken from cfg
wandb_cfg.steps = cfg['train_params']['max_num_steps']           # number of training steps, passed to train()
wandb_cfg.epochs = 1                                             # recorded only: training below is step-based
wandb_cfg.lr = 1e-5                                              # recorded only: the optimizer's lr is set where the optimizer is built -- keep in sync
wandb_cfg.seed = 42                                              # recorded only: the seeding calls below are commented out
wandb_cfg.log_interval = 0                                       # recorded only: not read by the code visible in this notebook

# Seeding is currently disabled, so runs are not reproducible.
# NOTE(review): `random` is not imported in this notebook; import it before enabling these lines.
# random.seed(wandb_cfg.seed)       # python random seed
# torch.manual_seed(wandb_cfg.seed) # pytorch random seed
# np.random.seed(wandb_cfg.seed)    # numpy random seed
# torch.backends.cudnn.deterministic = True

# wandb.watch() would log layer dimensions, gradients and model parameters to the dashboard
# (log="all" adds histograms of parameter values). It is currently disabled.
# wandb.watch(model)

# Earlier epoch-based loop, kept for reference; it does not match the current train() signature.
# pbar = tqdm(range(1, wandb_cfg.epochs+1), desc="Epochs")
# for epoch in pbar:
#     train_loss = train(model, device, train_dataloader, criterion, optimizer, epoch)
#     val_loss = val(model, device, val_dataloader, criterion, epoch)
#     print(f"Epoch: {epoch}, Train loss: {train_loss}, Val loss: {val_loss}")

# Step-based training: run train() for the configured number of steps.
checkpoint_steps =  cfg['train_params']['checkpoint_every_n_steps']   
train_loss = train(model, device, train_dataloader, criterion, optimizer, scheduler, wandb_cfg.steps, checkpoint_steps)

In [None]:
# Save model
# Only the state_dict (parameters and buffers) is written, not the module object itself;
# to reload, construct the model class first and call load_state_dict on it.
torch.save(model.state_dict(), 'cs535_lstm_resnet50.pth')