Run `python scrape_dandi.py` first to generate the JSON files in `dandi_json`.

## Define a function that runs a function on the first JSON file of each dandiset

In [None]:
import glob
import json
import os
import warnings

def run_on_first_json_per_dandiset(callable: callable):
    """Call a function on the first JSON file of each Dandiset in "dandi_json".

    Every sub-directory of "dandi_json" is treated as one Dandiset. All JSON
    files below it are collected recursively, the matches are sorted, and the
    first path in sorted order is passed to ``callable``.

    Sorting matters: ``glob.glob`` returns matches in a filesystem-dependent
    order, so without it the "first" file could differ between machines or
    runs.

    Args:
        callable: Function invoked with one argument, the path (str) of the
            selected JSON file. Its return value is ignored. (The parameter
            name shadows the builtin; it is kept for backward compatibility.)

    Raises:
        FileNotFoundError: If the "dandi_json" directory does not exist.

    Warns:
        UserWarning: For each Dandiset directory that contains no JSON files.
    """
    # Sorted so that dandisets are always visited in the same order
    for directory in sorted(os.listdir("dandi_json")):
        folder_path = os.path.join("dandi_json", directory)
        # Skip stray files that sit next to the dandiset directories
        if not os.path.isdir(folder_path):
            continue

        # Get all JSON files no matter how many levels deep they are
        # (Most dandisets have a single level of folders, but some have zero or two)
        json_files = sorted(
            glob.glob(os.path.join(folder_path, "**/*.json"), recursive=True)
        )

        if not json_files:
            warnings.warn(f"No JSON files found in {folder_path}")
            continue

        # Call the function on the first JSON file in sorted order
        callable(json_files[0])


## Print the file name of the first JSON file in each Dandiset

In [None]:
# Passing `print` as the callback simply echoes the path of the JSON file selected for each dandiset
run_on_first_json_per_dandiset(print)

## What is the listed species of each dandiset?

In [None]:
# Maps JSON file path -> species value read from that file (None when absent)
subject_species_all = {}


def collect_subject_species(json_file: str):
    """Record the subject species stored in ``json_file``.

    The value is looked up at
    file/groups/general/groups/subject/datasets/species/data and stored in
    ``subject_species_all`` under the file path. Anything missing from
    "subject" downward yields ``None``; a missing "file", "groups" or
    "general" level raises ``KeyError``.
    """
    with open(json_file, "r") as handle:
        contents = json.load(handle)

    # These upper levels are indexed directly, so their absence is an error
    node = contents["file"]["groups"]["general"]["groups"]

    # From here on every level is optional; fall back to an empty dict
    for level in ("subject", "datasets", "species"):
        node = node.get(level, {})

    subject_species_all[json_file] = node.get("data", None)

# Fill subject_species_all from the selected JSON file of every dandiset, then display the mapping
run_on_first_json_per_dandiset(collect_subject_species)
subject_species_all

In [None]:
# Number of JSON files processed (one entry is stored per file passed to the collector)
len(subject_species_all)

In [None]:
from collections import Counter
# Tally how many sampled files report each species value (None means no species was found in the JSON)
Counter(subject_species_all.values())

## What neurodata types are used by each dandiset?

In [None]:
# Maps JSON file path -> list of "namespace.neurodata_type" strings found in that file
neurodata_types_all = {}


def collect_all_neurodata_types(json_file: str):
    """Record every neurodata type that appears in ``json_file``.

    The parsed JSON is walked depth-first through nested dictionaries (lists
    are not descended into). Whenever a dictionary has a "neurodata_type" key,
    the string "<namespace>.<neurodata_type>" is emitted, where the namespace
    is read from the "namespace" key of the same dictionary. A missing
    "namespace" raises ``KeyError``. Results are stored, in encounter order
    and with duplicates, in ``neurodata_types_all`` under the file path.
    """
    with open(json_file, "r") as handle:
        contents = json.load(handle)

    def _iter_types(node):
        # This is deliberately unspecific: any dictionary key named
        # "neurodata_type" counts, wherever it appears in the tree.
        for name, child in node.items():
            if name == "neurodata_type":
                yield f"{node['namespace']}.{child}"
            elif isinstance(child, dict):
                yield from _iter_types(child)

    neurodata_types_all[json_file] = list(_iter_types(contents))

# Fill neurodata_types_all from the selected JSON file of every dandiset, then display the mapping
run_on_first_json_per_dandiset(collect_all_neurodata_types)
neurodata_types_all

In [None]:
from collections import Counter
# Flatten the per-file lists into a single list of "namespace.type" strings
keys = [key for dandiset_nd_types in neurodata_types_all.values() for key in dandiset_nd_types]
# Count every occurrence; a type used several times within one file is counted each time
counts = Counter(keys)
counts

In [None]:
# True if any sampled file contains the "core.FeatureExtraction" type
"core.FeatureExtraction" in counts

## What NWB schemas, including extensions, are used by each dandiset?

In [None]:
# Maps JSON file path -> {spec name: [version, ...]} found in that file
specs_all = {}


def collect_all_specs(json_file: str):
    """Record the cached specs, and their versions, stored in ``json_file``.

    Specs are read from file/groups/specifications/groups. Each key there is
    a spec name, and the keys of that spec's own "groups" mapping are taken as
    its versions. The result is stored in ``specs_all`` under the file path.
    """
    with open(json_file, "r") as handle:
        contents = json.load(handle)

    # NOTE: some files have no "specifications" group, hence the defaults
    spec_groups = contents["file"]["groups"].get("specifications", {}).get("groups", {})

    # The keys below each spec are its versions
    specs_all[json_file] = {
        name: list(spec_groups[name]["groups"].keys()) for name in spec_groups
    }

# Fill specs_all from the selected JSON file of every dandiset, then display the mapping
run_on_first_json_per_dandiset(collect_all_specs)
specs_all

In [None]:
from collections import Counter
# Collect the spec names of every sampled file (versions are ignored here)
keys = [key for spec_versions in specs_all.values() for key in spec_versions.keys()]
# Each file contributes a spec name at most once, so this counts files per spec
Counter(keys)

## What dataset filters, such as compression, are used by each dandiset? 

In [None]:
# Maps JSON file path -> list of "filters" entries, one per dataset that has one
filters_all = dict()


def collect_all_filters(json_file: str):
    """Record the dataset filters used in ``json_file``.

    The parsed JSON is walked depth-first through nested dictionaries. For
    every "datasets" mapping encountered, the "filters" entry of each dataset
    is appended to a list, which is stored in ``filters_all`` under the file
    path.

    Datasets without a "filters" key are skipped instead of raising
    ``KeyError``, so one such dataset no longer aborts the whole scan.

    Args:
        json_file: Path of the JSON file to read.
    """
    with open(json_file, "r") as f:
        data = json.load(f)

    # for every dataset, if there is a key called "filters", add it to the list.
    filters = []

    def _recurse(node):
        for key, value in node.items():
            if key == "datasets" and isinstance(value, dict):
                # The contents of a "datasets" mapping are inspected but not
                # descended into any further.
                for dataset_dict in value.values():
                    if isinstance(dataset_dict, dict) and "filters" in dataset_dict:
                        filters.append(dataset_dict["filters"])
            elif isinstance(value, dict):
                _recurse(value)

    _recurse(data)

    filters_all[json_file] = filters

# Fill filters_all from the selected JSON file of every dandiset, then display the mapping
run_on_first_json_per_dandiset(collect_all_filters)
filters_all

## Which dandisets have a "VoltageClampSeries/data" or "CurrentClampSeries/data" dataset with an attribute "IGORWaveNote"?

In [None]:
# Maps JSON file path -> neurodata type of the first series found with an "IGORWaveNote" attribute
igor_attribute_all = dict()


def collect_all_with_igor_attr(json_file: str):
    """Record whether ``json_file`` has a clamp series whose data carries "IGORWaveNote".

    The parsed JSON is walked depth-first through nested dictionaries. A
    dictionary matches when its "attributes" give a "neurodata_type" of
    "VoltageClampSeries" or "CurrentClampSeries" and its "datasets"/"data"
    dataset has an "IGORWaveNote" attribute. The neurodata type of the FIRST
    match is stored in ``igor_attribute_all`` under the file path. Files
    without a match get no entry.

    The search really stops at the first match: the result is returned up
    through the recursion rather than only breaking out of the innermost
    loop, which would let later matches overwrite earlier ones. A clamp
    series without a "datasets"/"data"/"attributes" entry is treated as a
    non-match instead of raising ``KeyError``.

    Args:
        json_file: Path of the JSON file to read.
    """
    with open(json_file, "r") as f:
        data = json.load(f)

    clamp_types = ("VoltageClampSeries", "CurrentClampSeries")

    def _find_first(node):
        """Return the neurodata type of the first match below ``node``, else None."""
        for key, value in node.items():
            if key == "attributes":
                nd_type = value.get("neurodata_type") if isinstance(value, dict) else None
                if nd_type in clamp_types:
                    # Every level is optional: a series may lack the dataset
                    data_attrs = (
                        node.get("datasets", {}).get("data", {}).get("attributes", {})
                    )
                    if "IGORWaveNote" in data_attrs:
                        return nd_type
            elif isinstance(value, dict):
                found = _find_first(value)
                if found is not None:
                    # stop when one is found
                    return found
        return None

    match = _find_first(data)
    if match is not None:
        igor_attribute_all[json_file] = match

# Fill igor_attribute_all from the selected JSON file of every dandiset
run_on_first_json_per_dandiset(collect_all_with_igor_attr)

# Build a copy of the mapping whose entries are ordered by file path
keys = list(igor_attribute_all.keys())
keys.sort()
igor_attribute_all_sorted = {i: igor_attribute_all[i] for i in keys}
# Uncomment the next line to display the sorted mapping
# igor_attribute_all_sorted


## What is the shape of each ElectricalSeries in each dandiset?

In [None]:
# JSON Path query
# $..groups[?(@.attributes?.neurodata_type == "ElectricalSeries")].datasets.data.shape