# Make masks for sub-sampling the training dataset

This notebook will make a mask that returns only agents between `start_frame` and `end_frame` in steps of `interval`.  
This mask can be used as input to an AgentDataset.  

The rationale behind this is that frames are sampled at a rate of 10Hz, so if you look at an agent in frame `n`, the scene won't look much different when you look at the same agent in frame `n+1`. We can therefore sample the dataset more coarsely and still get a good set of training data.  

Additionally, in the test set we are asked to predict agents at frame 100 for another 50 frames. By selecting only frames that have at least 100 preceding frames and at least 50 more frames to come, we pick a subset of the training data that looks more like the test set.  

**NOTE:** I view this as a prototyping tool, i.e. a useful way of reducing training time without reducing the data quality by much. It does still reduce the quality of the data, so you probably shouldn't use this for your final training.

In [None]:
!pip install --no-index -f ../input/kaggle-l5kit pip==20.2.2 >/dev/nul
!pip install --no-index -f ../input/kaggle-l5kit -U l5kit > /dev/nul


In [None]:
from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_pred_csv
from l5kit.data.filter import get_agents_slice_from_frames
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

import os

In [None]:
# Point l5kit at the competition data. LocalDataManager() is created without an
# explicit folder, so this has to be set first (presumably it reads this variable — confirm).
os.environ["L5KIT_DATA_FOLDER"] = "/kaggle/input/lyft-motion-prediction-autonomous-vehicles"

# Resolve the training set location through the data manager
dm = LocalDataManager()
dataset_path = dm.require('scenes/train.zarr')

# Wrap the zarr store and open it in one step (open() returns the dataset itself)
chunked_dataset = ChunkedDataset(dataset_path).open()

In [None]:
# l5kit-style configuration dictionary.
# The history/future lengths match the framing described above: 99 history frames
# plus the current one, and 50 future frames. The raster is 1x1 with a debug
# map type, i.e. as small as possible.
cfg = dict(
    format_version=4,
    model_params=dict(
        history_num_frames=99,
        history_step_size=1,
        history_delta_time=0.1,
        future_num_frames=50,
        future_step_size=1,
        future_delta_time=0.1,
    ),
    raster_params=dict(
        raster_size=[1, 1],
        pixel_size=[0.5, 0.5],
        ego_center=[0.5, 0.5],
        map_type='box_debug',
        satellite_map_key='aerial_map/aerial_map.png',
        semantic_map_key='semantic_map/semantic_map.pb',
        dataset_meta_key='meta.json',
        filter_agents_threshold=0.5,
        disable_traffic_light_faces=False,
    ),
    sample_data_loader=dict(
        key='scenes/sample.zarr',
        batch_size=4,
        shuffle=False,
        num_workers=8,
    ),
)

## First we make a basic mask by selecting only the agents between `start_frame` and `end_frame` in steps of `interval`

In [None]:
# One mask entry per frame in the whole dataset; 0 means "frame not selected"
n_frames = len(chunked_dataset.frames)
frame_mask = np.zeros(n_frames)

In [None]:
# Sub-sampling parameters, expressed as frame offsets relative to the start of each scene
interval = 10      # keep every `interval`-th frame
start_frame = 100  # first selected offset within a scene
end_frame = 200    # last selected offset within a scene (inclusive)

# Start from a clean mask so that re-running this cell with different
# parameters does not accumulate frames selected by a previous run.
frame_mask[:] = 0

for scene in chunked_dataset.scenes:
    # [f1, f2) is the range of this scene's rows in chunked_dataset.frames
    f1, f2 = scene['frame_index_interval']
    # Never step past the end of the scene: indices >= f2 belong to the next
    # scene (or lie outside the array for the last scene).
    stop = min(f1 + end_frame + 1, f2)
    # Strided slice marks f1 + start_frame, f1 + start_frame + interval, ...
    # An empty slice (scene shorter than start_frame) selects nothing.
    frame_mask[f1 + start_frame:stop:interval] = 1


In [None]:
# Build the output file name from the sub-sampling parameters
outfile = f"frame_mask_{start_frame}_{end_frame}_{interval}"

# Store the mask as booleans; it is passed positionally, so np.savez
# writes it to "<outfile>.npz" under the default key "arr_0".
np.savez(outfile, frame_mask.astype(bool))