Run `python scrape_dandi.py` first to generate the JSON files in `dandi_json`.

In [16]:
import glob
import json
import os
import warnings

## Define a function that runs a function on the first JSON file of each dandiset

In [17]:
def run_on_first_json_per_dandiset(callable: callable):
    """Run a function on the first JSON file of each Dandiset in the "dandi_json" directory.

    For each directory inside "dandi_json" (one per Dandiset):

    * if the directory itself contains ``*.json`` files, use the first of them;
    * otherwise look in the first sub-folder (subject folder) of the directory
      and use the first ``*.json`` file found there.

    "First" means first in sorted (lexicographic) order, so repeated runs pick
    the same file regardless of the order the file system lists entries in.

    Directories for which no JSON file can be found are reported with a
    warning and skipped.

    Args:
        callable: Function called with the path (str) of each selected JSON
            file. Its return value is ignored. Inside this function the
            parameter shadows the ``callable`` builtin.
    """
    root = "dandi_json"

    for directory in sorted(os.listdir(root)):
        folder_path = os.path.join(root, directory)
        if not os.path.isdir(folder_path):
            # Skip stray files sitting next to the Dandiset folders.
            continue

        # glob.escape keeps characters such as "[" in folder names from being
        # interpreted as glob wildcards.
        json_files = sorted(glob.glob(os.path.join(glob.escape(folder_path), "*.json")))
        if not json_files:
            # JSON files are in subject folders; take the first one, if any.
            subject_folders = sorted(next(os.walk(folder_path), (folder_path, [], []))[1])
            if subject_folders:
                inner_folder_path = os.path.join(folder_path, subject_folders[0])
                json_files = sorted(
                    glob.glob(os.path.join(glob.escape(inner_folder_path), "*.json"))
                )

        if not json_files:
            warnings.warn(f"No JSON files found in {folder_path}")
            # Nothing to process for this Dandiset; move on instead of
            # indexing into an empty list.
            continue

        # Call the function on the first JSON file
        callable(json_files[0])


## Print the file name of the first JSON file in each Dandiset

In [15]:
# run_on_first_json_per_dandiset(print)

['sub-M322']
dandi_json/000546/sub-M322/sub-M322_ecephys.nwb.json
['sub-P4']
dandi_json/000122/sub-P4/sub-P4_ses-20200104T031322.nwb.json
['sub-10']
dandi_json/000579/sub-10/sub-10_ses-Mouse-10-widefield-retinotopy-and-window-vessel_ophys.nwb.json
['sub-ROV40']
dandi_json/000114/sub-ROV40/sub-ROV40_ses-Day 1-obs_ophys.nwb.json
['sub-1029212302']
dandi_json/000570/sub-1029212302/sub-1029212302_ses-1030953959_icephys.nwb.json
['sub-Fig3-Bi5-06']
dandi_json/000548/sub-Fig3-Bi5-06/sub-Fig3-Bi5-06_ses-BPAE-bipolar-5pulses_image.nwb.json
['sub-403491']
dandi_json/000711/sub-403491/sub-403491_ses-20180824T145125_image.nwb.json
['sub-P1']
dandi_json/000147/sub-P1/sub-P1_ses-2018-09-17a_ecephys.nwb.json
['sub-Jenkins']
dandi_json/000140/sub-Jenkins/sub-Jenkins_ses-small_desc-test_ecephys.nwb.json
['sub-92130c1b-4fdb-4acc-86e0-1853d429c41a']
dandi_json/000149/sub-92130c1b-4fdb-4acc-86e0-1853d429c41a/sub-92130c1b-4fdb-4acc-86e0-1853d429c41a_ses-c7bd79c9-c47e-4ea5-aea3-74dda991b48e_behavior+ecephy

## What is the listed species of each dandiset?

In [None]:
# Maps JSON file path -> species value of the subject (None if not present).
subject_species_all = dict()
def collect_subject_species(json_file: str) -> None:
    """Record the species of the subject described in the JSON file.

    The value is read from ``refs -> "/" -> groups -> general -> groups ->
    subject -> datasets -> species -> data`` and stored in
    ``subject_species_all`` under the file path. If any level from "general"
    downwards is missing, ``None`` is stored instead.

    Args:
        json_file: Path of the JSON file to read.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    root_groups = data["refs"]["/"]["groups"]
    species = (
        root_groups.get("general", {})
        .get("groups", {})
        .get("subject", {})
        .get("datasets", {})
        .get("species", {})
        .get("data", None)
    )
    subject_species_all[json_file] = species

# Apply collect_subject_species to the first JSON file of each dandiset,
# filling subject_species_all.
run_on_first_json_per_dandiset(collect_subject_species)
# subject_species_all  # uncomment to display the collected mapping

## What neurodata types are used by each dandiset?

In [None]:
# Maps JSON file path -> list of every "neurodata_type" value found in the file.
neurodata_types_all = dict()
def collect_all_neurodata_types(json_file: str) -> None:
    """Record all of the neurodata types used in the JSON file.

    The parsed JSON is walked depth-first, in key order. Every value stored
    under a key named "neurodata_type" is appended to a list (duplicates are
    kept), and the list is stored in ``neurodata_types_all`` under the file
    path.

    Args:
        json_file: Path of the JSON file to read.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Ideally only the attributes of groups and datasets would be inspected;
    # this is less specific and checks every dictionary key for
    # "neurodata_type".
    neurodata_types = []

    def _recurse(node):
        for key, value in node.items():
            if key == "neurodata_type":
                neurodata_types.append(value)
            elif isinstance(value, dict):
                _recurse(value)
    _recurse(data)

    neurodata_types_all[json_file] = neurodata_types

# Apply collect_all_neurodata_types to the first JSON file of each dandiset,
# filling neurodata_types_all.
run_on_first_json_per_dandiset(collect_all_neurodata_types)
# neurodata_types_all  # uncomment to display the collected mapping

## Which NWB schemas, including extensions, are used by each dandiset?

In [None]:
# Maps JSON file path -> {spec name: [versions]} for the specs cached in the file.
specs_all = dict()
def collect_all_specs(json_file: str) -> None:
    """Record all of the specs and their versions used in the JSON file.

    Reads the sub-groups of the root-level "specifications" group: each
    sub-group name is taken as a spec name, and the names of that sub-group's
    own sub-groups are taken as the versions. The resulting mapping is stored
    in ``specs_all`` under the file path; it is empty when the file has no
    "specifications" group.

    Args:
        json_file: Path of the JSON file to read.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # NOTE: some files have no "specifications" group
    specs = data["refs"]["/"]["groups"].get("specifications", {}).get("groups", {})

    # Map spec name to list of versions (the keys of each spec's groups).
    spec_versions = {
        spec_name: list(spec_group["groups"].keys())
        for spec_name, spec_group in specs.items()
    }

    specs_all[json_file] = spec_versions

# Apply collect_all_specs to the first JSON file of each dandiset,
# filling specs_all.
run_on_first_json_per_dandiset(collect_all_specs)
# specs_all  # uncomment to display the collected mapping

## What dataset filters, such as compression, are used by each dandiset? 

In [None]:
# Maps JSON file path -> list with the "filters" entry of every dataset that has one.
filters_all = dict()
def collect_all_filters(json_file: str) -> None:
    """Record all of the dataset filters used in the JSON file.

    The parsed JSON is walked depth-first, in key order. Whenever a key named
    "datasets" is found, each dataset under it that has a "filters" entry
    contributes that entry (even an empty one) to a list. Datasets without a
    "filters" key are skipped. The list is stored in ``filters_all`` under the
    file path.

    Args:
        json_file: Path of the JSON file to read.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # for every dataset, if there is a key called "filters", add it to the list.
    filters = []

    def _recurse(node):
        for key, value in node.items():
            if key == "datasets":
                for dataset_dict in value.values():
                    # Not every dataset declares filters; skip those that do
                    # not instead of failing with a KeyError.
                    if "filters" in dataset_dict:
                        filters.append(dataset_dict["filters"])
            elif isinstance(value, dict):
                _recurse(value)
    _recurse(data)

    filters_all[json_file] = filters

# Apply collect_all_filters to the first JSON file of each dandiset,
# filling filters_all.
run_on_first_json_per_dandiset(collect_all_filters)
# Bare expression: as the last line of a notebook cell it displays the mapping.
filters_all

## Which dandisets have a "VoltageClampSeries/data" or "CurrentClampSeries/data" dataset with an attribute "IGORWaveNote"?

In [19]:
# Maps JSON file path -> neurodata type of the first series found whose "data"
# dataset carries an "IGORWaveNote" attribute. Files without one get no entry.
igor_attribute_all = dict()
def collect_all_with_igor_attr(json_file: str) -> None:
    """Record whether the JSON file has a clamp series "data" dataset with an "IGORWaveNote" attribute.

    The parsed JSON is walked depth-first, in key order, looking for a node
    whose attributes give a neurodata_type of "VoltageClampSeries" or
    "CurrentClampSeries" and whose "data" dataset has an "IGORWaveNote"
    attribute. The search stops at the first such node, and its neurodata type
    is stored in ``igor_attribute_all`` under the file path. Nothing is stored
    when no such node exists.

    Series without a "data" dataset, or whose "data" dataset has no
    attributes, are treated as not matching.

    Args:
        json_file: Path of the JSON file to read.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    series_types = ("VoltageClampSeries", "CurrentClampSeries")

    def _find_series_type(node):
        """Return the neurodata type of the first matching series, or None."""
        for key, value in node.items():
            if key == "attributes":
                if not isinstance(value, dict):
                    continue
                neurodata_type = value.get("neurodata_type")
                if neurodata_type in series_types:
                    data_attrs = (
                        node.get("datasets", {}).get("data", {}).get("attributes", {})
                    )
                    if "IGORWaveNote" in data_attrs:
                        return neurodata_type
            elif isinstance(value, dict):
                # Propagate a hit upwards so the whole traversal stops,
                # not just the innermost loop.
                found = _find_series_type(value)
                if found is not None:
                    return found
        return None

    found_type = _find_series_type(data)
    if found_type is not None:
        igor_attribute_all[json_file] = found_type

# Apply collect_all_with_igor_attr to the first JSON file of each dandiset,
# filling igor_attribute_all.
run_on_first_json_per_dandiset(collect_all_with_igor_attr)
# Bare expression: as the last line of a notebook cell it displays the mapping.
igor_attribute_all