In [None]:
# Prepend the l5kit copy shipped under /kaggle/input to the module search path.
# Presumably this is the l5kit build the imports below are meant to pick up — TODO confirm.
import sys
sys.path.insert(0, '/kaggle/input/kb-l5kit/l5kit/')

In [None]:
import os
import psutil
from pprint import pprint
from typing import Dict

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset, EgoDataset
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_pred_csv, compute_metrics_csv, read_gt_csv
from l5kit.evaluation.metrics import neg_multi_log_likelihood, time_displace

## Prepare Data path and load cfg
By setting the L5KIT_DATA_FOLDER variable, we can point the script to the folder where the data lies.

Then, we define our config as a dictionary holding the relative data keys and the other settings (rasterizer, model and data-loader parameters).

In [None]:
# Root of the competition data; exported below as L5KIT_DATA_FOLDER
INPUT_DIR = "/kaggle/input/lyft-motion-prediction-autonomous-vehicles"
# Folder holding the chopped validation artefacts read later (validate.zarr, mask.npz, gt.csv)
DATA_DIR = "/kaggle/input/lyft-prediction-chopped-validation-dataset/validate_chopped_100"
# Saved state dict that is loaded into the model once it has been built
WEIGHTS_FILE = "/kaggle/input/lyft-training-mobilenetv2/l5run3_mobilenetv2.pth"

In [None]:
# set env variable for data
os.environ["L5KIT_DATA_FOLDER"] = INPUT_DIR
# No explicit folder is passed; the data manager is expected to fall back on
# L5KIT_DATA_FOLDER (as described in the note above this cell) — TODO confirm against l5kit
dm = LocalDataManager(None)

In [None]:
# Single configuration dictionary handed to the rasterizer, the dataset and the model.
cfg = {
    'format_version': 4,
    # Temporal layout of a sample. In this notebook history_num_frames sizes the
    # model's input channels and auxiliary feature vectors, and future_num_frames
    # sizes its output (2 values per future frame).
    'model_params': {
        'history_num_frames': 10,
        'history_step_size': 1,
        'history_delta_time': 0.1,
        'future_num_frames': 50,
        'future_step_size': 1,
        'future_delta_time': 0.1
    },
    
    # Passed to build_rasterizer as part of cfg; the exact meaning and units of
    # each entry are defined by l5kit, not by this notebook.
    'raster_params': {
        'raster_size': [224, 224],
        'pixel_size': [0.5, 0.5],
        'ego_center': [0.25, 0.5],
        'map_type': 'py_semantic',
        'satellite_map_key': 'aerial_map/aerial_map.png',
        'semantic_map_key': 'semantic_map/semantic_map.pb',
        'dataset_meta_key': 'meta.json',
        'filter_agents_threshold': 0.5
    },
    
    # batch_size, shuffle and num_workers are forwarded to the evaluation DataLoader.
    # NOTE(review): 'key' is not read by the cells shown here; the zarr path is
    # built from DATA_DIR instead.
    'val_data_loader': {
        'key': 'validate.zarr',
        'batch_size': 8,
        'shuffle': False,
        'num_workers': 0
    }

}

## Model

Select model to build.

In [None]:
# # ===resnet50===
# from torchvision.models.resnet import resnet50

# def build_model(cfg: Dict) -> torch.nn.Module:
#     # load pre-trained Conv2D model
#     model = resnet50(pretrained=False)

#     # change input channels number to match the rasterizer's output
#     num_history_channels = (cfg["model_params"]["history_num_frames"] + 1) * 2
#     num_in_channels = 3 + num_history_channels
#     model.conv1 = nn.Conv2d(
#         num_in_channels,
#         model.conv1.out_channels,
#         kernel_size=model.conv1.kernel_size,
#         stride=model.conv1.stride,
#         padding=model.conv1.padding,
#         bias=False,
#     )
#     # change output size to (X, Y) * number of future states
#     num_targets = 2 * cfg["model_params"]["future_num_frames"]
#     model.fc = nn.Linear(in_features=2048, out_features=num_targets)

#     return model

In [None]:
# # ===mobilenetV2===
# from torchvision.models.mobilenet import mobilenet_v2

# def build_model(cfg: Dict) -> torch.nn.Module:
#     # load pre-trained Conv2D model
#     model = mobilenet_v2(pretrained=False)

#     # change input channels number to match the rasterizer's output
#     num_history_channels = (cfg["model_params"]["history_num_frames"] + 1) * 2
#     num_in_channels = 3 + num_history_channels
#     model.features[0][0] = nn.Conv2d(
#         num_in_channels,
#         model.features[0][0].out_channels,
#         kernel_size=model.features[0][0].kernel_size,
#         stride=model.features[0][0].stride,
#         padding=model.features[0][0].padding,
#         bias=False,
#     )
#     # change output size to (X, Y) * number of future states
#     num_targets = 2 * cfg["model_params"]["future_num_frames"]
#     model.classifier[1] = nn.Linear(in_features=model.classifier[1].in_features, out_features=num_targets)

#     return model

In [None]:
# ===mobilenetV2===
from torchvision.models.mobilenet import mobilenet_v2

class Identity(nn.Module):
    """Pass-through module whose ``forward`` hands back its input unchanged.

    Useful as a drop-in replacement for a layer that should be disabled
    without changing the structure of the surrounding network.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # No computation is performed: the very same object is returned.
        return x
    
class MobilenetV2LSTM(nn.Module):
    """MobileNetV2 image encoder followed by a fully connected trajectory head.

    The CNN output is concatenated with the flattened velocity, acceleration
    and yaw histories and mapped by ``fc1`` to ``2 * future_num_frames`` values.
    Despite the class name, no recurrent layer takes part in the forward pass.
    """

    def __init__(self, config: Dict):
        """Build the backbone and the head from ``config``.

        Args:
            config: dictionary providing ``val_data_loader.batch_size``,
                ``model_params.history_num_frames`` and
                ``model_params.future_num_frames``.
        """
        super().__init__()
        self.cfg = config
        self.batch_size = config['val_data_loader']['batch_size']
        self.hist_frames = config['model_params']['history_num_frames']

        # Widths of the three auxiliary vectors appended to the CNN features.
        vel_width = 2 * (self.hist_frames + 1)
        accel_width = 2 * self.hist_frames
        yaw_width = self.hist_frames + 1
        # 1280 is presumably the width of the MobileNetV2 feature vector once
        # its classifier has been replaced by Identity — TODO confirm.
        self.fc_infeatures = 1280 + vel_width + accel_width + yaw_width
        self.num_targets = 2 * config["model_params"]["future_num_frames"]

        self.cnn = self.build_basecnn()

        # Layer order (and hence the Sequential indices) must stay as-is so
        # that state-dict keys keep matching previously saved weights.
        head_layers = [
            nn.Dropout(p=0.2, inplace=False),
            nn.Linear(in_features=self.fc_infeatures, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.2, inplace=False),
            nn.Linear(in_features=4096, out_features=self.num_targets),
        ]
        self.fc1 = nn.Sequential(*head_layers)

    def forward(self, x, vel, accel, yaw):
        """Predict the flattened future coordinates for a batch.

        Args:
            x: image batch fed to the CNN backbone.
            vel: velocity history, reshaped to ``2 * (hist_frames + 1)`` values per row.
            accel: acceleration history, reshaped to ``2 * hist_frames`` values per row.
            yaw: yaw history, reshaped to ``hist_frames + 1`` values per row.

        Returns:
            Tensor with ``num_targets`` values per row.
        """
        image_features = self.cnn(x)

        n_hist = self.hist_frames
        aux_features = (
            vel.reshape(-1, 2 * (n_hist + 1)),
            accel.reshape(-1, 2 * n_hist),
            yaw.reshape(-1, n_hist + 1),
        )

        # Fuse image and kinematic features along the feature axis.
        fused = torch.cat((image_features,) + aux_features, dim=1)
        return self.fc1(fused)

    def build_basecnn(self):
        """Create the MobileNetV2 backbone adapted to the rasterizer's channel count."""
        backbone = mobilenet_v2(pretrained=False)

        # Input planes: 3 plus two for every frame (current frame + history).
        n_frames = self.cfg["model_params"]["history_num_frames"] + 1
        num_in_channels = 3 + 2 * n_frames

        # Swap the first convolution for one with the new input channel count,
        # keeping its output channels, kernel size, stride and padding.
        stem = backbone.features[0][0]
        backbone.features[0][0] = nn.Conv2d(
            num_in_channels,
            stem.out_channels,
            kernel_size=stem.kernel_size,
            stride=stem.stride,
            padding=stem.padding,
            bias=False,
        )

        # Disable the classification head so the network returns its features.
        backbone.classifier = Identity()

        return backbone

In [None]:
# ==== INIT MODEL
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MobilenetV2LSTM(cfg).to(device)
# model = build_model(cfg).to(device)
# Restore the saved weights, remapping the stored tensors onto `device`.
# NOTE(review): torch.load unpickles the file — only load weights from a trusted source.
model.load_state_dict(torch.load(WEIGHTS_FILE, map_location=device))

# Evaluation

Evaluation follows a slightly different protocol than training. When working with time series, we must be absolutely sure to avoid leaking the future in the data.

If we followed the same protocol as in training, one could simply read ahead in the `.zarr` and forge a perfect solution at run time, even for a private test set.

As such, **the private test set for the competition has been "chopped" using the `chop_dataset` function**.

The result is that **each scene has been reduced to only 100 frames**, and **only valid agents in the 100th frame will be used to compute the metrics**. Because following frames in the scene have been chopped off, we can't just look ahead to get the future of those agents.

In this example, we simulate this pipeline on the validation set: the notebook reads an already-chopped copy of it (the output of `chop_dataset`) from `DATA_DIR`. The function stores:
- a new chopped `.zarr` dataset, in which each scene has only the first 100 frames;
- a numpy mask array where only valid agents in the 100th frame are True;
- a ground-truth file with the future coordinates of those agents;

In [None]:
# Rasterizer built from the settings in cfg
rasterizer = build_rasterizer(cfg, dm)

# Validation data paths: chopped zarr, agent mask and ground-truth CSV
eval_cfg = cfg["val_data_loader"]
eval_zarr_path = f'{DATA_DIR}/validate.zarr'
eval_mask_path = f'{DATA_DIR}/mask.npz'
eval_gt_path = f'{DATA_DIR}/gt.csv'

eval_zarr = ChunkedDataset(eval_zarr_path).open()
# np.load on an .npz returns an archive object backed by an open file handle;
# read the mask inside a `with` block so the handle is closed right away.
with np.load(eval_mask_path) as mask_archive:
    eval_mask = mask_archive["arr_0"]

# ===== INIT DATASET AND LOAD MASK
# The mask restricts the dataset to the agents selected for evaluation
eval_dataset = AgentDataset(cfg, eval_zarr, rasterizer, agents_mask=eval_mask)
eval_dataloader = DataLoader(
    eval_dataset,
    shuffle=eval_cfg["shuffle"],
    batch_size=eval_cfg["batch_size"],
    num_workers=eval_cfg["num_workers"],
)

# Dataset summary, number of samples and number of batches
print(eval_dataset)
print(len(eval_dataset))
print(len(eval_dataloader))

Please note how `Num Frames==(Num Scenes)*num_frames_to_chop`. 

The remaining frames in the scene have been successfully chopped off from the data.

In [None]:
# Put the model in inference mode (this disables the dropout layers of the head)
model.eval()

# Per-batch results, concatenated later when the predictions are written out
future_coords_offsets_pd = []
timestamps = []
agent_ids = []

# No gradients are needed for inference
with torch.no_grad():
    # Iterating the loader directly lets tqdm read the number of batches from it
    pbar = tqdm(eval_dataloader)
    for data in pbar:
        im_inputs = data["image"].to(device)
        vel_inputs = data["history_velocities"].to(device)
        # removing last history frame since we don't have accel for it
        accel_inputs = data["history_accels"][:, :-1, :].to(device)
        yaw_inputs = data["history_yaws"].to(device)

        # Only the shape of the target tensor is needed (to un-flatten the
        # prediction), so it is not copied to the device; the target
        # availabilities are not used during inference and are not loaded.
        target_shape = data["target_positions"].shape

        # Forward pass
        outputs = model(im_inputs, vel_inputs, accel_inputs, yaw_inputs).reshape(target_shape)

        future_coords_offsets_pd.append(outputs.cpu().numpy().copy())
        timestamps.append(data["timestamp"].numpy().copy())
        agent_ids.append(data["track_id"].numpy().copy())

        pbar.set_description(f'RAM used: {psutil.virtual_memory().percent}%')

## Save results
After the model has predicted trajectories for our evaluation set, we can save them in a csv file.

During the competition, only the `.zarr` and the mask will be provided for the private test set evaluation. Your solution is expected to generate a csv file, which will be compared to the ground-truth file on a separate server.

In [None]:
pred_path = "/kaggle/working/pred.csv"

# Join the per-batch arrays collected during inference along the first axis
all_timestamps = np.concatenate(timestamps)
all_track_ids = np.concatenate(agent_ids)
all_coords = np.concatenate(future_coords_offsets_pd)

# Write the predictions to pred_path
write_pred_csv(
    pred_path,
    timestamps=all_timestamps,
    track_ids=all_track_ids,
    coords=all_coords,
)

## Perform Evaluation
Please note that our metric supports multi-modal predictions (i.e. multiple predictions for a single GT trajectory). In that case, you will need to provide a confidence for each prediction (confidences must all be between 0 and 1 and sum to 1).

In this simple example we don't generate multiple trajectories, so we won't pass any confidence vector. Internally, the metric computation will assume a single trajectory with confidence equal to 1.

In [None]:
# Score the predictions file against the ground-truth file with both metrics
metric_fns = [neg_multi_log_likelihood, time_displace]
metrics = compute_metrics_csv(eval_gt_path, pred_path, metric_fns)

# Report one line per metric: its name followed by its value
for metric_name in metrics:
    print(metric_name, metrics[metric_name])