# Analysis on the Distributions of Features and Targets

Analyze the distributions of the features (except images) and the targets for a random batch of 8192 samples.

Change log:
- v7 - randomly selected samples

In [None]:
from typing import Dict

from tempfile import gettempdir
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torchvision
from torchvision.models.resnet import resnet50, resnet18, resnet34, resnet101
from tqdm import tqdm

import l5kit
from l5kit.configs import load_config_data
from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset, EgoDataset
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_pred_csv, compute_metrics_csv, read_gt_csv, create_chopped_dataset
from l5kit.evaluation.chop_dataset import MIN_FUTURE_STEPS
from l5kit.evaluation.metrics import neg_multi_log_likelihood, time_displace
from l5kit.geometry import transform_points
from l5kit.visualization import PREDICTED_POINTS_COLOR, TARGET_POINTS_COLOR, draw_trajectory
from prettytable import PrettyTable
from pathlib import Path

import matplotlib.pyplot as plt

import os
import random
import time

from IPython.display import display
from tqdm import tqdm_notebook
import gc, psutil

import warnings
warnings.filterwarnings("ignore")

l5kit.__version__

In [None]:
# Memory measurement
def memory(verbose=True):
    """Report how much physical memory is currently in use.

    The figures come from ``psutil.virtual_memory()``; "used" is computed
    as total minus available.

    Args:
        verbose: When true, print the used, available and total memory in GB.

    Returns:
        The used physical memory in GB (1 GB = 1024**3 bytes).
    """
    stats = psutil.virtual_memory()
    bytes_per_gb = 1024 ** 3
    used_gb = (stats.total - stats.available) / bytes_per_gb
    if verbose:
        available_gb = stats.available / bytes_per_gb
        total_gb = stats.total / bytes_per_gb
        print('Physical memory:',
              f'{used_gb:.2f} GB (used),',
              f'{available_gb:.2f} GB (available)', '/',
              f'{total_gb:.2f} GB')
    return used_gb

def gc_memory(verbose=True):
    """Run a garbage collection pass and optionally report the result.

    Args:
        verbose: When true, print the value returned by ``gc.collect()``
            followed (on the same line) by the report of ``memory()``.

    Returns:
        None.
    """
    collected = gc.collect()
    if not verbose:
        return
    # Keep the GC count and the memory report on a single output line.
    print('GC:', collected, end=' | ')
    memory()

memory();

In [None]:
# Root folder under which the competition data is looked up (see cfg['data_path']).
# '/kaggle' is presumably the Kaggle kernel layout; '..' is the commented-out alternative.
# folder = '..' 
folder = '/kaggle'
# When True, the data loaders configured in cfg use a batch size of 512 instead of 8192.
test_run = False

In [None]:
# --- Lyft configs ---
# Configuration dict handed to build_rasterizer() and AgentDataset() in the
# cells below; the *_data_loader sections are also read when building the
# torch DataLoaders.
cfg = {
    'format_version': 4,
    'data_path': f'{folder}/input/lyft-motion-prediction-autonomous-vehicles',
    'model_params': {
        'model_architecture': 'resnet34',
        'history_num_frames': 10,
        'history_step_size': 1,
        'history_delta_time': 0.1,
        'future_num_frames': 50,
        'future_step_size': 1,
        'future_delta_time': 0.1,
    },
    'raster_params': {
        # A 2x2 raster is used instead of [224, 224]; the 'image' entries are
        # deleted from the batches further down, so presumably the raster is
        # kept tiny only to make batch loading cheap (TODO confirm).
        'raster_size': [2, 2],  # [224, 224],
        'pixel_size': [0.5, 0.5],
        'ego_center': [0.25, 0.5],
        'map_type': 'py_semantic',
        'satellite_map_key': 'aerial_map/aerial_map.png',
        'semantic_map_key': 'semantic_map/semantic_map.pb',
        'dataset_meta_key': 'meta.json',
        'filter_agents_threshold': 0.5,
    },
    'train_data_loader': {
        'key': 'scenes/train.zarr',
        # One batch is the whole analysed sample: 8192 rows, or 512 in a test run.
        'batch_size': 512 if test_run else 8192,
        'shuffle': True,  # shuffle so that the batch is not concentrated on a few scenes
        'num_workers': 4,  # 4
    },    
    'test_data_loader': {
        'key': 'scenes/test.zarr',
        # One batch is the whole analysed sample: 8192 rows, or 512 in a test run.
        'batch_size': 512 if test_run else 8192,
        'shuffle': True,  # shuffle so that the batch is not concentrated on a few scenes
        'num_workers': 4,  # 4
    },
}

In [None]:
%%time
# set env variable for data
# LocalDataManager() is created without arguments right after
# L5KIT_DATA_FOLDER is set, so it presumably resolves the data root from
# that environment variable (confirm against the l5kit documentation).
DIR_INPUT = cfg["data_path"]
os.environ["L5KIT_DATA_FOLDER"] = DIR_INPUT
dm = LocalDataManager()
# Build rasterizer
# The rasterizer is configured from cfg and shared by the train and test
# AgentDataset objects created in the following cells.
rasterizer = build_rasterizer(cfg, dm)

In [None]:
%%time
# Train dataset
# Open the train zarr (path resolved through the data manager), wrap it in an
# AgentDataset, and expose it through a DataLoader whose shuffle, batch size
# and worker count all come from cfg["train_data_loader"].
train_cfg = cfg["train_data_loader"]
train_zarr = ChunkedDataset(dm.require(train_cfg["key"])).open(cached=False)  # try to turn off cache
train_dataset = AgentDataset(cfg, train_zarr, rasterizer)
train_dataloader = DataLoader(train_dataset, shuffle=train_cfg["shuffle"], 
                              batch_size=train_cfg["batch_size"], num_workers=train_cfg["num_workers"])
# Printing the dataset shows its summary representation.
print(train_dataset)

In [None]:
%%time
# Test dataset
# Same pipeline as for the train set, except that the agents are restricted
# by the mask stored under "arr_0" in scenes/mask.npz (passed as agents_mask).
test_cfg = cfg["test_data_loader"]
test_zarr = ChunkedDataset(dm.require(test_cfg["key"])).open(cached=False)  # try to turn off cache
# np.load() on an .npz archive returns a lazy NpzFile that keeps the file
# open; the with-block closes it as soon as the mask array has been read.
with np.load(f"{DIR_INPUT}/scenes/mask.npz") as mask_file:
    test_mask = mask_file["arr_0"]
test_dataset = AgentDataset(cfg, test_zarr, rasterizer, agents_mask=test_mask)
test_dataloader = DataLoader(test_dataset, shuffle=test_cfg["shuffle"],
                             batch_size=test_cfg["batch_size"], num_workers=test_cfg["num_workers"])
# Printing the dataset shows its summary representation.
print(test_dataset)

In [None]:
# Length of each dataset object (the test label is padded so the two lines align).
print('train set size:', len(train_dataset))
print(' test set size:', len(test_dataset))

# Single sample via dataset API

In [None]:
data = train_dataset[0]

In [None]:
data.keys()

In [None]:
# List every field of the single sample: arrays are shown with their shape
# and type, everything else with its type only.
print('datakey, shape, type:')
for field, value in data.items():
    label = f'{field!s:>30}'  # right-align the key in a 30-character column
    if isinstance(value, np.ndarray):
        details = (value.shape, type(value))
    else:
        details = (type(value),)
    print(label, *details)

In [None]:
# Example of single data
# Scalars are printed next to their key; arrays get their own line and are
# truncated to the first three entries along the first axis.
for field, value in data.items():
    if not isinstance(value, np.ndarray):
        print(field, value)
        continue
    print(field)
    print(' ', value[:3])

# Distribution of a batch

In [None]:
%%time
# Draw a single batch from the train loader (batch size and shuffling come
# from cfg["train_data_loader"]), then report the memory in use.
train_data = next(iter(train_dataloader))
memory();

In [None]:
%%time
# Draw a single batch from the test loader (batch size and shuffling come
# from cfg["test_data_loader"]), then report the memory in use.
test_data = next(iter(test_dataloader))
memory();

In [None]:
# Remove the image entries from both batches; only the non-image fields are
# analysed in the rest of the notebook.
del train_data['image']
del test_data['image']

In [None]:
# Batch keys whose distributions and means are analysed below. This list is
# the default `metrics` argument of plot_distribution() and describe_mean().
metrics = [
    'target_positions',
    'target_availabilities',
    'history_positions',
    'history_yaws',
    'history_availabilities',
    'centroid',
    'yaw',
    'extent',
]

In [None]:
# List every field of the train batch: tensors are shown with their shape
# and type, everything else with its type only.
print('datakey, shape, type:')
for field, value in train_data.items():
    label = f'{field!s:>30}'  # right-align the key in a 30-character column
    if isinstance(value, torch.Tensor):
        details = (value.shape, type(value))
    else:
        details = (type(value),)
    print(label, *details)

In [None]:
# Count how many different track ids occur in each batch.
n_train_tracks = pd.Series(train_data['track_id'].numpy()).nunique()
n_test_tracks = pd.Series(test_data['track_id'].numpy()).nunique()
print('distinct track_id in the batch (train, test):',
      n_train_tracks, ',', n_test_tracks)

In [None]:
# Layout shared by all histogram figures.
nx = 5             # subplot columns per figure
plt_bins = 50      # bins per histogram
figure_width = 15  # width passed to figsize for the grid figures


def plot_distribution(databatch, metrics=metrics):
    """Draw histograms for the requested entries of a batch.

    The layout depends on the number of dimensions of each entry after
    conversion with ``.numpy()``:

    * 1-D: a single histogram. Entries whose name ends with
      ``'availabilities'`` are flattened first and therefore land here.
    * 2-D: one subplot per column, ``nx`` subplots per row.
    * 3-D: one subplot per index of the second axis, with the histograms of
      the last axis overlaid. Entries named ``target_*`` / ``history_*`` are
      weighted by the matching ``*_availabilities`` entry, and the whole
      figure is skipped when every weight is zero.

    Entries of any other dimensionality produce no figure.

    Args:
        databatch: Mapping from entry name to an object exposing ``.numpy()``.
        metrics: Names of the entries to plot.
    """
    for name in metrics:
        values = databatch[name].numpy()
        print(name, values.shape)
        if name.endswith('availabilities'):
            # Pool the flags of all frames into one histogram.
            values = values.reshape(-1)
        rank = values.ndim

        if rank == 1:
            plt.figure(figsize=(figure_width*2/5, 2))
            plt.hist(values, bins=plt_bins)
            plt.xlabel(name)
            plt.ylabel('N')
            plt.grid()
            plt.title(name)

        elif rank == 2:
            n_panels = values.shape[1]
            n_rows = int(np.ceil(n_panels / nx))
            plt.figure(figsize=(figure_width, n_rows*2))
            for panel in range(n_panels):
                plt.subplot(n_rows, nx, panel+1)
                plt.hist(values[:, panel], bins=plt_bins)
                plt.xlabel(f'{name} ({panel})')
                plt.grid()
                if panel % nx == 0:
                    # Only the left-most column carries the y label.
                    plt.ylabel('N')
            plt.suptitle(name)
            plt.tight_layout(rect=[0, 0.03, 1, 0.95])

        elif rank == 3:
            # availabilities filter
            avail = None
            for prefix in ('target_', 'history_'):
                if name.startswith(prefix):
                    avail = databatch[prefix + 'availabilities'].numpy()
                    break
            if avail is not None and not avail.any():
                # Nothing is available at all: skip the figure entirely.
                continue
            n_panels, n_components = values.shape[1:]
            n_rows = int(np.ceil(n_panels / nx))
            plt.figure(figsize=(figure_width, n_rows*2))
            for panel in range(n_panels):
                plt.subplot(n_rows, nx, panel+1)
                panel_weights = None if avail is None else avail[:, panel]
                for comp in range(n_components):
                    plt.hist(values[:, panel, comp], weights=panel_weights,
                             bins=plt_bins, alpha=0.7, label=f'{comp}')
                plt.xlabel(f'{name} ({panel})')
                plt.grid()
                if panel % nx == 0:
                    plt.ylabel('N')
                if panel % nx == nx-1:
                    # The legend is drawn in the right-most column only.
                    plt.legend()
            plt.suptitle(name)
            plt.tight_layout(rect=[0, 0.03, 1, 0.95])

        plt.show()

## Train set distribution

In [None]:
plot_distribution(train_data)

## Test set distribution

In [None]:
plot_distribution(test_data)

# Mean values of each metric

In [None]:
def get_weights(databatch, m):
    """Return the frame availabilities of entry ``m`` as averaging weights.

    Only 3-D entries whose name starts with ``'target_'`` or ``'history_'``
    get weights: the matching ``*_availabilities`` array is repeated along a
    new trailing axis so that it has the same shape as ``databatch[m]``.

    Args:
        databatch: Mapping from entry name to a tensor-like object exposing
            ``.shape`` and ``.numpy()``.
        m: Name of the entry the weights are for.

    Returns:
        A numpy array shaped like ``databatch[m]``, or ``None`` when the
        entry is not 3-D or has neither of the two prefixes.
    """
    shape = databatch[m].shape
    if len(shape) != 3:
        return None
    if m.startswith('target_'):
        avail = databatch['target_availabilities'].numpy()
    elif m.startswith('history_'):
        avail = databatch['history_availabilities'].numpy()
    else:
        return None
    # Copy each per-frame flag once per component of the last axis.
    return np.repeat(avail[..., np.newaxis], shape[-1], axis=-1)

def describe_mean(databatch, metrics=metrics):
    """Print the mean over the first (sample) axis of each metric in a batch.

    For 3-D ``target_*`` / ``history_*`` entries the mean is weighted by the
    frame availabilities returned by ``get_weights``, so unavailable points
    do not contribute. When a frame has no available sample at all, its mean
    is reported as NaN instead of aborting the whole report.

    Args:
        databatch: Mapping from metric name to an object exposing ``.numpy()``.
        metrics: Names of the entries to summarise.
    """
    for m in metrics:
        data = databatch[m].numpy()
        weights = get_weights(databatch, m)
        try:
            # use weights to exclude not available points
            mean = np.average(data, axis=0, weights=weights)
        except ZeroDivisionError:
            # np.average refuses to normalise when the weights along the
            # axis sum to zero. Redo the weighted mean by hand so the
            # affected frames come out as NaN and the others stay valid.
            weight_sum = weights.sum(axis=0)
            with np.errstate(divide='ignore', invalid='ignore'):
                mean = (data * weights).sum(axis=0) / weight_sum
        print(m, mean)

In [None]:
# Report the mean of every entry listed in `metrics` for the train batch.
print('train set average')
describe_mean(train_data, metrics=metrics)

In [None]:
# Same report for the test batch, with the target_* entries left out —
# presumably because the test set carries no usable targets (TODO confirm).
print('test set mean')
describe_mean(test_data, metrics=[m for m in metrics if not m.startswith('target_')])

We see that the mean target sits at an oddly biased position: y is always slightly negative (|y| is about 0.1) and x always goes to about 8. Perhaps this is due to the small batch.

# target_positions and history_positions

Note that the stdev includes the non-available points.

In [None]:
# average across all timesteps
# Axes 0 and 1 are reduced together, leaving one value per index of the last
# axis. The availabilities from get_weights() serve as weights, so
# unavailable points do not contribute to the means.
train_mean_target_positions_all = np.average(
    train_data['target_positions'].numpy(), axis=(0, 1), weights=get_weights(train_data, 'target_positions'))
train_mean_history_positions_all = np.average(
    train_data['history_positions'].numpy(), axis=(0, 1), weights=get_weights(train_data, 'history_positions'))
test_mean_history_positions_all = np.average(
    test_data['history_positions'].numpy(), axis=(0, 1), weights=get_weights(test_data, 'history_positions'))

In [None]:
# For each position entry, print the availability-weighted mean computed in
# the previous cell next to the plain (unweighted) standard deviation taken
# over axes 0 and 1.
for section, mean_value, batch, key in (
    ('train target_positions', train_mean_target_positions_all, train_data, 'target_positions'),
    ('train history_positions', train_mean_history_positions_all, train_data, 'history_positions'),
    ('test history_positions', test_mean_history_positions_all, test_data, 'history_positions'),
):
    print(f'== {section} ==')
    print('mean', mean_value)
    print('stdev', np.std(batch[key].numpy(), axis=(0, 1)))

In [None]:
# average per timestep
# Only axis 0 is reduced here, so each result keeps the remaining axes of the
# entry. The availabilities from get_weights() serve as weights, so
# unavailable points do not contribute to the means.
train_mean_target_positions = np.average(
    train_data['target_positions'].numpy(), axis=0, weights=get_weights(train_data, 'target_positions'))
train_mean_history_positions = np.average(
    train_data['history_positions'].numpy(), axis=0, weights=get_weights(train_data, 'history_positions'))
test_mean_history_positions = np.average(
    test_data['history_positions'].numpy(), axis=0, weights=get_weights(test_data, 'history_positions'))

In [None]:
# Scatter plot of the per-timestep mean target position of the train batch.
fig, ax = plt.subplots(figsize=(6, 6))
mean_x, mean_y = train_mean_target_positions.T
ax.scatter(mean_x, mean_y, s=10)
ax.set_title('train mean target_positions')
ax.set_xlabel('x (m)')
ax.set_ylabel('y (m)')
ax.grid()
plt.show()

In [None]:
# Scatter plot of the per-timestep mean history position of the train batch.
fig, ax = plt.subplots(figsize=(8, 8))
mean_x, mean_y = train_mean_history_positions.T
ax.scatter(mean_x, mean_y, s=10)
ax.set_title('train mean history_positions')
ax.set_xlabel('x (m)')
ax.set_ylabel('y (m)')
ax.grid()
plt.show()

In [None]:
# Scatter plot of the per-timestep mean history position of the test batch.
fig, ax = plt.subplots(figsize=(6, 6))
mean_x, mean_y = test_mean_history_positions.T
ax.scatter(mean_x, mean_y, s=10)
ax.set_title('test mean history_positions')
ax.set_xlabel('x (m)')
ax.set_ylabel('y (m)')
ax.grid()
plt.show()

In [None]:
# nx = 5
# def plot_distribution(databatch, metrics=metrics):
#     for m in metrics:
#         data = databatch[m].numpy()
#         print(data.shape)
#         dim = len(data.shape)
#         if dim == 1:
#             plt.hist(data, bins=100)
#             plt.xlabel(m); plt.ylabel('N')
#             plt.title(m)
#         elif dim == 2:
#             n = data.shape[1]
#             ny = np.ceil(n / nx).astype('int')
#             plt.figure(figsize=(12, ny*3))
#             for i in range(n):
#                 plt.subplot(ny, nx, i+1).set_title(m)
#                 plt.hist(data[:, i], bins=100)
#                 plt.xlabel(m+f' ({i})')
#                 if i%nx==0:
#                     plt.ylabel('N')
#         plt.show()            

In [None]:
# tr_it = iter(train_dataloader)
# n_batch = 1
# for i in tqdm_notebook(range(n_batch)):
#     data = next(tr_it)
#     data['target_positions']
#     data['target_availabilities']
#     data['history_positions']
#     data['history_yaws']
#     data['history_availabilities']
#     data['centroid']
#     data['yaw']
#     data['extent']

In [None]:
# %%time
# test_data = next(iter(test_dataloader))
# memory()