# Lyft/Zarr cache, data sampler benchmark


| Zarr cache | Sampler                   | max it/s | time /1000 iterations |
| ---------- | ------------------------- | -------- | --------------------- |
| Disabled   | RandomSampler             | 3.94     | 5:01                  |
| Enabled    | RandomSampler             | 4.16     | 4:02                  |
| Enabled    | LyftZarrCacheFixedSampler | 4.62     | 3:37                  |

*A SequentialSampler would also solve the caching issue, but consecutive samples are so similar that training would quickly start to overfit.*

## Notes
If you want to get the most speed out of your dataloader, you'll need to adjust a few settings.

You should take a look at these:
- **num_workers** (`DataLoader(...)`): at most the number of CPU cores you have.
- **cache_size_bytes** (`ChunkedDataset.open(...)`): should be something like `available RAM * 0.8 / num_workers` (I haven't tested this).
- **chunk_size** (`LyftZarrCacheFixedSampler(...)`): should be the largest value that still fits in the cache.


In [None]:
import l5kit
# Stop immediately if the installed l5kit is not the 1.1.0 release this notebook was written for.
assert l5kit.__version__ == "1.1.0"

In [None]:
import numpy as np
import os
import torch
import random
import plotly.express as px

from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data.sampler import Sampler
from tqdm.notebook import tqdm
from typing import Dict

from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset
from l5kit.rasterization import build_rasterizer

from numcodecs import blosc

# Limit Blosc to a single thread and switch off its multi-threading flag.
# NOTE(review): presumably done because the DataLoader already runs several
# worker processes in parallel -- confirm against the numcodecs documentation.
blosc.set_nthreads(1)
blosc.use_threads = False

In [None]:
# Alternative dataset location, kept for reference:
# os.environ["L5KIT_DATA_FOLDER"] = "../input/lyft-full-training-set"
os.environ["L5KIT_DATA_FOLDER"] = "../input/lyft-motion-prediction-autonomous-vehicles"
# NOTE(review): with None as argument, LocalDataManager presumably resolves
# paths relative to L5KIT_DATA_FOLDER set above -- confirm.
dm = LocalDataManager(None)

# Configuration passed to build_rasterizer(...) and AgentDataset(...) in the
# benchmark cells; the 'train_data_loader' section is also read directly when
# the DataLoaders are built.
cfg = {
    'format_version': 4,
    'model_params': {
        'history_num_frames': 10,
        'history_step_size': 1,
        'history_delta_time': 0.1,
        'future_num_frames': 50,
        'future_step_size': 1,
        'future_delta_time': 0.1,
    },

    'raster_params': {
        'raster_size': [224, 224],
        'pixel_size': [0.5, 0.5],
        'ego_center': [0.25, 0.5],
        'map_type': 'stub_debug',
        # The semantic map key is left disabled for this benchmark.
        # 'semantic_map_key': 'semantic_map/semantic_map.pb',
        'dataset_meta_key': 'meta.json',
        'filter_agents_threshold': 0.5,
        'disable_traffic_light_faces': False,
    },

    # 'key' is resolved through dm.require(...); 'batch_size' and
    # 'num_workers' feed every DataLoader, while 'shuffle' is only used by the
    # two loaders that do not receive a custom sampler.
    'train_data_loader': {
        'key': 'scenes/train.zarr',
        'batch_size': 12,
        'shuffle': True,
        'num_workers': 4
    }

}

In [None]:
class TqdmExtra(tqdm):
    """Progress bar that also logs the running throughput on every redraw.

    Whenever the bar formats its meter, ``n / elapsed`` (iterations per second
    averaged since the bar was started) is appended to the module-level list
    ``itps``.  That list must exist before the bar is drawn for the first
    time; each benchmark cell resets it to ``[]``.
    """

    @staticmethod
    def format_meter(n, total, elapsed, ncols=None, prefix='', ascii=False,
                     unit='it', unit_scale=False, rate=None, bar_format=None,
                     postfix=None, unit_divisor=1000, initial=0, colour=None,
                     **extra_kwargs):
        """Format the meter with ``tqdm.format_meter`` and record ``n / elapsed``.

        All arguments are forwarded unchanged to the parent implementation.

        Returns:
            The meter string produced by ``tqdm.format_meter``.
        """
        meter = tqdm.format_meter(n, total, elapsed, ncols=ncols, prefix=prefix, ascii=ascii,
                                  unit=unit, unit_scale=unit_scale, rate=rate,
                                  bar_format=bar_format, postfix=postfix,
                                  unit_divisor=unit_divisor, initial=initial,
                                  colour=colour, **extra_kwargs)

        # `elapsed` can be 0 (e.g. a redraw right at start-up); skip that
        # sample instead of raising ZeroDivisionError in the middle of a run.
        if elapsed:
            # `itps` is only mutated, never rebound, so no `global` is needed.
            itps.append(n / elapsed)

        return meter

# Without zarr cache

In [None]:
# Benchmark 1: open the zarr dataset with caching switched off (cached=False).
train_cfg = cfg["train_data_loader"]
ds_zarr = ChunkedDataset(dm.require(train_cfg["key"])).open(cached=False)

rasterizer = build_rasterizer(cfg, dm)

train_dataset = AgentDataset(cfg, ds_zarr, rasterizer)
# No sampler is passed: ordering is controlled by the 'shuffle' flag from cfg
# (True), i.e. the "RandomSampler" row of the results table.
train_dataloader = DataLoader(train_dataset,
                             shuffle=train_cfg["shuffle"],
                             batch_size=train_cfg["batch_size"],
                             num_workers=train_cfg["num_workers"])


In [None]:
# Fresh throughput log; TqdmExtra.format_meter appends to this global list.
itps = []
progress_bar = TqdmExtra(train_dataloader, total=1000)

# Only loading is timed: the batch in `data` is fetched and discarded.
for i, data in enumerate(progress_bar):

    # i counts from 0, so this stops after 1000 completed iterations; the
    # 1001st batch has already been fetched when the check triggers.
    if i == 1000:
        break
    

In [None]:
# One point per TqdmExtra.format_meter call (not necessarily one per iteration).
fig = px.line(x=range(len(itps)), y=itps, title='it/s without zarr cache')
fig.show()

In [None]:
# Drop every object of the uncached run; the next benchmark rebuilds the
# dataset, rasterizer and loader from scratch with caching enabled.
del ds_zarr
del rasterizer
del train_dataset
del train_dataloader
del progress_bar
del data

# With cache (random sampler)

In [None]:
# Benchmark 2: same pipeline, but the zarr dataset is opened with caching
# enabled and a cache budget of 5e9 bytes.
# NOTE(review): whether that budget applies per DataLoader worker process is
# not visible here -- confirm before raising num_workers.
train_cfg = cfg["train_data_loader"]
ds_zarr = ChunkedDataset(dm.require(train_cfg["key"])).open(cached=True, cache_size_bytes=int(5e9))

rasterizer = build_rasterizer(cfg, dm)

train_dataset = AgentDataset(cfg, ds_zarr, rasterizer)
# Still no custom sampler: ordering comes from the 'shuffle' flag in cfg.
train_dataloader = DataLoader(train_dataset,
                             shuffle=train_cfg["shuffle"],
                             batch_size=train_cfg["batch_size"],
                             num_workers=train_cfg["num_workers"])


In [None]:
# Reset the throughput log so it only contains samples from this run.
itps = []
progress_bar = TqdmExtra(train_dataloader, total=1000)

# Only loading is timed: the batch in `data` is fetched and discarded.
for i, data in enumerate(progress_bar):

    # Stops after 1000 completed iterations (the 1001st batch is already
    # fetched when the check triggers).
    if i == 1000:
        break
    

In [None]:
# Throughput samples recorded by TqdmExtra during the cached run with random
# sampling; one point per format_meter call.
fig = px.line(x=range(len(itps)), y=itps, title='it/s with zarr cache + random sampler')
fig.show()

In [None]:
# Only the loader, the bar and the last batch are dropped. ds_zarr, rasterizer
# and train_dataset stay alive because the custom-sampler benchmark below
# builds its DataLoader on the same train_dataset.
del progress_bar
del train_dataloader
del data

# With cache (custom sampler)

In [None]:
class LyftZarrCacheFixedSampler(Sampler):
    """Chunk-wise shuffled sampler over an ``AgentDataset``.

    On construction the dataset's ``agents_indices`` array is sorted in place.
    Sampler positions ``0 .. len(agents_indices) - 1`` are then split into
    consecutive chunks of ``chunk_size``.  Each pass yields one strided subset
    of every chunk (every ``n_subsets``-th position, starting at the current
    ``epoch`` offset), shuffled inside the chunk, with the chunks themselves
    visited in random order.  ``n_subsets`` consecutive passes together visit
    every position exactly once.

    NOTE(review): the intent is presumably that samples drawn back to back
    come from the same chunk and therefore from the same cached zarr region
    -- confirm against the cache behaviour.

    Args:
        datasource: dataset exposing a sortable ``agents_indices`` array.
        chunk_size: number of consecutive positions per chunk; must be > 0.
        n_subsets: number of passes over which one chunk is spread; must be
            > 0.  The default of 4 reproduces the original hard-coded value.

    Raises:
        ValueError: if ``chunk_size`` or ``n_subsets`` is not positive.
    """

    def __init__(self, datasource: "AgentDataset", chunk_size=1000000, n_subsets=4):
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if n_subsets <= 0:
            raise ValueError(f"n_subsets must be positive, got {n_subsets}")

        # Sampler.__init__ has required `data_source` positionally in older
        # torch releases and deprecated it in newer ones; try the argument-free
        # form first and fall back to the historical call.
        try:
            super().__init__()
        except TypeError:
            super().__init__(datasource)

        self.chunk_size = chunk_size
        self.n_subsets = n_subsets
        self.datasource = datasource
        # In-place sort: this mutates the dataset that was passed in.
        self.datasource.agents_indices.sort(kind='stable')

        # Offset of the strided subset used by the next pass (0 .. n_subsets-1).
        self.epoch = 0
        self.n_chunks, self.n_last_chunk = divmod(
            len(self.datasource.agents_indices), chunk_size)

        # A trailing partial chunk counts as a chunk of its own.
        if self.n_last_chunk > 0:
            self.n_chunks += 1

        print(f"Number of chunks: {self.n_chunks}")
        print(f"Number of agents: {len(self.datasource.agents_indices)}")

    def _chunk_bounds(self):
        """Yield the ``(start, stop)`` position range of every chunk in order."""
        n_agents = len(self.datasource.agents_indices)
        for start in range(0, n_agents, self.chunk_size):
            yield start, min(start + self.chunk_size, n_agents)

    def __len__(self) -> int:
        """Return how many indices the next ``__iter__`` call will yield.

        The count is exact for the current ``epoch`` offset; it can differ by
        up to one per chunk between passes because chunk sizes are not
        necessarily multiples of ``n_subsets``.
        """
        return sum(
            len(range(self.epoch, stop - start, self.n_subsets))
            for start, stop in self._chunk_bounds()
        )

    def __iter__(self):
        """Return an iterator over this pass's positions and advance ``epoch``."""
        positions = np.arange(len(self.datasource.agents_indices))

        parts = []
        for start, stop in self._chunk_bounds():
            # Strided view into `positions`; shuffling it in place is safe
            # because chunks are disjoint and `positions` is local.
            part = positions[start:stop][self.epoch::self.n_subsets]
            np.random.shuffle(part)
            parts.append(part)

        # The next pass picks the next strided subset, wrapping around.
        self.epoch = (self.epoch + 1) % self.n_subsets

        # Randomise the order in which chunks are visited.
        random.shuffle(parts)

        # np.hstack rejects an empty list; an empty dataset yields nothing.
        if not parts:
            return iter(positions)
        return iter(np.hstack(parts))

In [None]:
# Benchmark 3: reuse the cached train_dataset from the previous benchmark, but
# let LyftZarrCacheFixedSampler decide the order instead of the 'shuffle' flag.
# Constructing the sampler sorts train_dataset.agents_indices in place.
train_dataloader = DataLoader(train_dataset,
                             sampler=LyftZarrCacheFixedSampler(train_dataset, chunk_size=500000),
                             batch_size=train_cfg["batch_size"],
                             num_workers=train_cfg["num_workers"])

In [None]:
# Reset the throughput log so it only contains samples from this run.
itps = []
progress_bar = TqdmExtra(train_dataloader, total=1000)

# Only loading is timed: the batch in `data` is fetched and discarded.
for i, data in enumerate(progress_bar):

    # Stops after 1000 completed iterations (the 1001st batch is already
    # fetched when the check triggers).
    if i == 1000:
        break
    

In [None]:
# One point per TqdmExtra.format_meter call during the custom-sampler run.
fig = px.line(x=range(len(itps)), y=itps, title='it/s with zarr cache + custom sampler')
fig.show()