# Download full recordings

author: laquitainesteeve@gmail.com

purpose: download all full recordings from Dandi Archive

Description:
* model: biophysical simulation
* duration: 34 min
* size: 100 GB
* layers: 6 layers (L1, L2/3, L4, L5, L6; L2 and L3 are grouped as L2/3)
* noise: background noise fitted to Marques-Smith
* network state: spontaneous

Execution time: 45 min
* note on writing speed: maximize chunk_size and use many parallel jobs (the cells below use n_jobs=30 and chunk_size=800000). This maximizes speed while avoiding overhead [1].

Hardware: CPU

Tested on: 32 cores, 2TB storage, 188GB RAM Ubuntu machine




# Setup 

Activate the virtual environment (envs/spikebias.yml), then register it as a Jupyter kernel:

```bash
python -m ipykernel install --user --name dandi --display-name "dandi"
```

In [None]:
%%time 

# import python packages
import os
import numpy as np
from time import time
from dandi.dandiapi import DandiAPIClient
import spikeinterface.extractors as se
import spikeinterface.sorters as ss
import spikeinterface
import uuid
from datetime import datetime
from dateutil.tz import tzlocal
from dandi.dandiapi import DandiAPIClient
import spikeinterface.extractors as se
print("spikeinterface", spikeinterface.__version__)

# set the project path
# The SPIKEBIAS_PROJ_PATH environment variable overrides the default so the
# notebook can run on another machine without editing this cell; when it is
# unset, the original hard-coded location is used.
PROJ_PATH = os.environ.get("SPIKEBIAS_PROJ_PATH", "/home/steeve/steeve/epfl/code/spikebias")

# set the raw dataset path (all recordings and sortings are written under it)
RAW_DATASET = os.path.join(PROJ_PATH, "dataset/00_raw/")

spikeinterface 0.101.2
CPU times: user 982 ms, sys: 1.91 s, total: 2.89 s
Wall time: 286 ms


  from .autonotebook import tqdm as notebook_tqdm


## Custom functions

In [None]:
class DataLoader:
    """Data loader for dandi datasets

    Streams one NWB asset of a dandiset as a RecordingExtractor and a
    SortingExtractor (``load_data``), then writes both to local folders
    (``save_data``). ``load_data`` must be called before ``save_data``.

    Attributes:
        raw_dataset_path (str): path given by the caller; stored only, it is
            not read by the methods of this class
        dandiset_id (str): identifier of the dandiset (e.g., '001250')
        filepath (str): path of the asset inside the dandiset
        recording: NwbRecordingExtractor set by ``load_data`` (None before)
        sorting: NwbSortingExtractor set by ``load_data`` (None before)
        s3_path: content URL of the asset set by ``load_data`` (None before)
    """
    def __init__(self, raw_dataset_path:str, dandiset_id:str, filepath:str):
        """Store the dataset coordinates; no network access happens here.

        Args:
            raw_dataset_path (str): local root folder of the raw datasets
            dandiset_id (str): identifier of the dandiset
            filepath (str): path of the asset inside the dandiset
        """
        self.raw_dataset_path = raw_dataset_path
        self.dandiset_id = dandiset_id
        self.filepath = filepath
        self.recording = None
        self.sorting = None
        self.s3_path = None

    def load_data(self):
        """Resolve the asset URL and open the recording and sorting in streaming mode.

        Sets ``self.s3_path``, ``self.recording`` and ``self.sorting`` and
        prints a short report of each.

        Raises:
            AssertionError: if the recording has no "layers" property
        """
        # Get the file path on S3 (draft version of the dandiset)
        with DandiAPIClient() as client:
            asset = client.get_dandiset(self.dandiset_id, 'draft').get_asset_by_path(self.filepath)
            self.s3_path = asset.get_content_url(follow_redirects=1, strip_query=True)
        print("s3_path:", self.s3_path)

        # Get RecordingExtractor and SortingExtractor
        # both read the same remote NWB file through "remfile" streaming
        self.recording = se.NwbRecordingExtractor(file_path=self.s3_path, stream_mode="remfile")
        self.sorting = se.NwbSortingExtractor(file_path=self.s3_path, stream_mode="remfile")

        # Report
        print('\n', self.recording)
        print('\n', self.sorting)

        # Unit-test
        assert "layers" in self.recording.get_property_keys(), "RecordingExtractor should contain layer property"

    def save_data(self, recording_folder:str, sorting_folder:str, n_jobs=30, chunk_size=800000, dtype='float32'):
        """Write the loaded recording and sorting to local folders.

        Both saves are called with ``overwrite=True``.

        Args:
            recording_folder (str): destination folder of the recording
            sorting_folder (str): destination folder of the sorting
            n_jobs (int): number of parallel jobs used to write the recording
            chunk_size (int): chunk size passed to the recording writer
            dtype (str): dtype passed to the recording writer

        Raises:
            RuntimeError: if ``load_data`` has not been called first
        """
        # fail early with a clear message instead of an AttributeError on None
        if self.recording is None or self.sorting is None:
            raise RuntimeError("No data loaded: call load_data() before save_data().")

        self.recording.save(folder=recording_folder, n_jobs=n_jobs, verbose=True, progress_bar=True, overwrite=True, dtype=dtype, chunk_size=chunk_size)
        self.sorting.save(folder=sorting_folder, progress_bar=True, overwrite=True)

### npx_spont_fitted_40Khz_35min_f32

In [None]:
# where the asset lives on the Dandi Archive
# NOTE(review): the saved output of this cell reports 20.0kHz and 1.00 hours,
# which does not match the "40Khz_35min" folder names below - confirm the asset path.
dandiset_id = '001250'
filepath = 'sub-001-fitted/sub-001-fitted_ecephys.nwb'

# where the recording and the sorting are written locally
recording_folder = os.path.join(RAW_DATASET, "recording_npx_spont_fitted_40Khz_35min_f32")
sorting_folder = os.path.join(RAW_DATASET, "sorting_npx_spont_fitted_40Khz_35min_f32")

# stream the asset, then write it to disk
data_loader = DataLoader(
    raw_dataset_path=RAW_DATASET,
    dandiset_id=dandiset_id,
    filepath=filepath,
)
data_loader.load_data()
data_loader.save_data(
    recording_folder=recording_folder,
    sorting_folder=sorting_folder,
    n_jobs=30,
    chunk_size=800000,
    dtype='float32',
)

s3_path: https://dandiarchive.s3.amazonaws.com/blobs/9d6/6ed/9d66ed40-af31-43aa-b4ba-246d2206dcad

 NwbRecordingExtractor: 384 channels - 20.0kHz - 1 segments - 72,359,964 samples 
                       3,618.00s (1.00 hours) - float32 dtype - 103.51 GiB
  file_path: https://dandiarchive.s3.amazonaws.com/blobs/9d6/6ed/9d66ed40-af31-43aa-b4ba-246d2206dcad

 NwbSortingExtractor: 1836 units - 1 segments - 20.0kHz
  file_path: https://dandiarchive.s3.amazonaws.com/blobs/9d6/6ed/9d66ed40-af31-43aa-b4ba-246d2206dcad
write_binary_recording 
n_jobs=30 - samples_per_chunk=800,000 - chunk_memory=1.14 GiB - total_memory=34.33 GiB - chunk_duration=40.00s


write_binary_recording: 100%|██████████| 91/91 [37:49<00:00, 24.93s/it]   


### npx_evoked_fitted_20Khz_60min_f32

In [None]:
# where the asset lives on the Dandi Archive
dandiset_id = '001250'
filepath = 'sub-002-fitted/sub-002-fitted_ecephys.nwb'

# where the recording and the sorting are written locally
recording_folder = os.path.join(RAW_DATASET, "recording_npx_evoked_fitted_20Khz_60min_f32")
sorting_folder = os.path.join(RAW_DATASET, "sorting_npx_evoked_fitted_20Khz_60min_f32")

# stream the asset, then write it to disk
data_loader = DataLoader(
    raw_dataset_path=RAW_DATASET,
    dandiset_id=dandiset_id,
    filepath=filepath,
)
data_loader.load_data()
data_loader.save_data(
    recording_folder=recording_folder,
    sorting_folder=sorting_folder,
    n_jobs=30,
    chunk_size=800000,
    dtype='float32',
)

### recording_dense_fitted_probe1

In [None]:
%%time 

# where the asset lives on the Dandi Archive
dandiset_id = '001250'
filepath = 'sub-003-fitted/sub-003-fitted_ecephys.nwb'

# where the recording and the sorting are written locally
recording_folder = os.path.join(RAW_DATASET, "recording_dense_fitted_probe1")
sorting_folder = os.path.join(RAW_DATASET, "sorting_dense_fitted_probe1")

# stream the asset, then write it to disk
data_loader = DataLoader(
    raw_dataset_path=RAW_DATASET,
    dandiset_id=dandiset_id,
    filepath=filepath,
)
data_loader.load_data()
data_loader.save_data(
    recording_folder=recording_folder,
    sorting_folder=sorting_folder,
    n_jobs=30,
    chunk_size=800000,
    dtype='float32',
)

s3_path: https://dandiarchive.s3.amazonaws.com/blobs/dec/e65/dece6568-cee4-4ade-80bf-c1166a03fe2a

 NwbRecordingExtractor: 128 channels - 20.0kHz - 1 segments - 34,299,965 samples 
                       1,715.00s (28.58 minutes) - float32 dtype - 16.36 GiB
  file_path: https://dandiarchive.s3.amazonaws.com/blobs/dec/e65/dece6568-cee4-4ade-80bf-c1166a03fe2a

 NwbSortingExtractor: 287 units - 1 segments - 20.0kHz
  file_path: https://dandiarchive.s3.amazonaws.com/blobs/dec/e65/dece6568-cee4-4ade-80bf-c1166a03fe2a
write_binary_recording 
n_jobs=30 - samples_per_chunk=800,000 - chunk_memory=390.62 MiB - total_memory=11.44 GiB - chunk_duration=40.00s


write_binary_recording: 100%|██████████| 43/43 [07:57<00:00, 11.10s/it]  


### recording_dense_fitted_probe2

In [25]:
%%time 

# where the asset lives on the Dandi Archive
dandiset_id = '001250'
filepath = 'sub-004-fitted/sub-004-fitted_ecephys.nwb'

# where the recording and the sorting are written locally
recording_folder = os.path.join(RAW_DATASET, "recording_dense_fitted_probe2")
sorting_folder = os.path.join(RAW_DATASET, "sorting_dense_fitted_probe2")

# stream the asset, then write it to disk
data_loader = DataLoader(
    raw_dataset_path=RAW_DATASET,
    dandiset_id=dandiset_id,
    filepath=filepath,
)
data_loader.load_data()
data_loader.save_data(
    recording_folder=recording_folder,
    sorting_folder=sorting_folder,
    n_jobs=30,
    chunk_size=800000,
    dtype='float32',
)

s3_path: https://dandiarchive.s3.amazonaws.com/blobs/eef/9e9/eef9e95c-fb5b-46d2-a24c-878d8170b5e0

 NwbRecordingExtractor: 128 channels - 20.0kHz - 1 segments - 23,519,976 samples 
                       1,176.00s (19.60 minutes) - float32 dtype - 11.22 GiB
  file_path: https://dandiarchive.s3.amazonaws.com/blobs/eef/9e9/eef9e95c-fb5b-46d2-a24c-878d8170b5e0

 NwbSortingExtractor: 770 units - 1 segments - 20.0kHz
  file_path: https://dandiarchive.s3.amazonaws.com/blobs/eef/9e9/eef9e95c-fb5b-46d2-a24c-878d8170b5e0
write_binary_recording 
n_jobs=30 - samples_per_chunk=800,000 - chunk_memory=390.62 MiB - total_memory=11.44 GiB - chunk_duration=40.00s


write_binary_recording: 100%|██████████| 30/30 [04:30<00:00,  9.02s/it]  


CPU times: user 245 ms, sys: 241 ms, total: 486 ms
Wall time: 4min 37s


### recording_dense_fitted_probe3

In [26]:
%%time 

# where the asset lives on the Dandi Archive
dandiset_id = '001250'
filepath = 'sub-005-fitted/sub-005-fitted_ecephys.nwb'

# where the recording and the sorting are written locally
recording_folder = os.path.join(RAW_DATASET, "recording_dense_fitted_probe3")
sorting_folder = os.path.join(RAW_DATASET, "sorting_dense_fitted_probe3")

# stream the asset, then write it to disk
data_loader = DataLoader(
    raw_dataset_path=RAW_DATASET,
    dandiset_id=dandiset_id,
    filepath=filepath,
)
data_loader.load_data()
data_loader.save_data(
    recording_folder=recording_folder,
    sorting_folder=sorting_folder,
    n_jobs=30,
    chunk_size=800000,
    dtype='float32',
)

s3_path: https://dandiarchive.s3.amazonaws.com/blobs/ee2/816/ee2816de-d861-4b55-9cde-416a52e54049

 NwbRecordingExtractor: 128 channels - 20.0kHz - 1 segments - 35,279,964 samples 
                       1,764.00s (29.40 minutes) - float32 dtype - 16.82 GiB
  file_path: https://dandiarchive.s3.amazonaws.com/blobs/ee2/816/ee2816de-d861-4b55-9cde-416a52e54049

 NwbSortingExtractor: 1123 units - 1 segments - 20.0kHz
  file_path: https://dandiarchive.s3.amazonaws.com/blobs/ee2/816/ee2816de-d861-4b55-9cde-416a52e54049
write_binary_recording 
n_jobs=30 - samples_per_chunk=800,000 - chunk_memory=390.62 MiB - total_memory=11.44 GiB - chunk_duration=40.00s


write_binary_recording: 100%|██████████| 45/45 [07:44<00:00, 10.32s/it]  


CPU times: user 313 ms, sys: 216 ms, total: 529 ms
Wall time: 7min 51s


# References

[1] https://github.com/SpikeInterface/spikeinterface/issues/3252
* discusses the effect of n_jobs and chunk_size on writing speed