In [2]:
from _main_utils import ROOT_DIR
from preprocess.config import PREPROCESS_CONFIG
import numpy as np
import os
import pandas as pd
import pickle
# Dataset names listed under the "EXPERIMENT_DATASETS" key of the preprocessing
# config; the rest of the notebook iterates over this collection.
EXPERIMENT_DATASETS = PREPROCESS_CONFIG["EXPERIMENT_DATASETS"]

In [18]:
def load_dataset(name):
    """Read and return the unpickled contents of one processed neural dataset.

    The pickle is looked up at ``<ROOT_DIR>/data/processed/neural/<name>.pickle``.

    Args:
        name: Dataset name; must be a member of ``EXPERIMENT_DATASETS``.

    Returns:
        Whatever object was stored in the pickle file.

    Raises:
        AssertionError: If ``name`` is not a known dataset, or the pickle file
            is missing on disk.

    Note:
        ``pickle.load`` can execute arbitrary code, so only point this at
        files produced by a trusted preprocessing run.
    """
    assert name in EXPERIMENT_DATASETS, "Unrecognized dataset!"

    pickle_path = os.path.join(
        ROOT_DIR, "data", "processed", "neural", f"{name}.pickle"
    )
    assert os.path.exists(pickle_path), f"The file {pickle_path} does not exist."

    with open(pickle_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

# Pair each dataset name with its position in the configured list.
ordered_datasets = list(enumerate(EXPERIMENT_DATASETS))
print(f"DATASETS\n{ordered_datasets}")

DATASETS
[(0, 'Kato2015'), (1, 'Nichols2017'), (2, 'Skora2018'), (3, 'Kaplan2020'), (4, 'Nejatbakhsh2020'), (5, 'Yemini2021'), (6, 'Uzel2022'), (7, 'Dag2023'), (8, 'Leifer2023'), (9, 'Lin2023'), (10, 'Flavell2023'), (11, 'Venkatachalam2024')]


In [41]:
# Loading Datasets 
def helper(dataset_name: str):
    """Print summary statistics for one dataset and return its headline averages.

    Function we will call over and over again in this notebook.

    Args:
        dataset_name: Name of the dataset, forwarded to ``load_dataset``.

    Returns:
        A tuple ``(num_worms, total_avg, named_avg)`` where ``num_worms`` is the
        number of entries in the dataset, ``total_avg`` is the mean of the
        per-worm ``"num_neurons"`` values and ``named_avg`` is the mean of the
        per-worm ``"num_labeled_neurons"`` values (both unrounded). For an
        empty dataset this is ``(0, 0, 0)``.
    """
    # load the dataset
    print(f"Loading {dataset_name}...")
    dataset = load_dataset(dataset_name)
    print(len(dataset), dataset.keys(), end="\n\n")

    # The worm count does not change while iterating, so compute it once.
    num_worms = len(dataset)

    # number of ID'd neurons versus number of neurons measured
    neurons_stats = dict(min=float("inf"), max=float("-inf"))
    timesteps_stats = dict(min=float("inf"), max=float("-inf"))
    total_avg, named_avg, neuron_to_slot_avg = 0, 0, 0
    for worm in list(dataset.keys()):
        single_worm_dataset = dataset[worm]
        neuron_to_slot = len(single_worm_dataset["neuron_to_slot"])
        total_neurons = single_worm_dataset["num_neurons"]
        labeled_neurons = single_worm_dataset["num_labeled_neurons"]
        max_timesteps = single_worm_dataset["max_timesteps"]
        # Each worm contributes value / num_worms, so the running totals end
        # up as the means over all worms once the loop has finished.
        neuron_to_slot_avg += neuron_to_slot / num_worms
        total_avg += total_neurons / num_worms
        named_avg += labeled_neurons / num_worms
        neurons_stats["min"] = min(neurons_stats["min"], labeled_neurons)
        neurons_stats["max"] = max(neurons_stats["max"], labeled_neurons)
        timesteps_stats["min"] = min(timesteps_stats["min"], max_timesteps)
        timesteps_stats["max"] = max(timesteps_stats["max"], max_timesteps)

    print(f"Avg num. neuron ID'd/recorded : {int(named_avg)}/{int(total_avg)}")
    print(f"Neuron_to_slot avg {int(neuron_to_slot_avg)}")
    print(f"Range num. ID'd neurons : ({neurons_stats['min']}, {neurons_stats['max']})")
    print(
        f"Range len. calcium data : ({timesteps_stats['min']}, {timesteps_stats['max']})"
    )
    print(f"Avg num. ID'd : {int(named_avg)}/{int(total_avg)}")

    total_neurons_counted = sum(dataset[worm]["num_neurons"] for worm in dataset)
    print(f"Total neurons counted in METHOD 1: {total_neurons_counted}")

    # Return only after every worm has been processed; returning from inside
    # the loop would report statistics for the first worm alone and skip the
    # summary printed above.
    return (num_worms, total_avg, named_avg)

In [48]:
# Build one summary row per dataset, then render them as a single table
results = []

for dataset_name in EXPERIMENT_DATASETS:
    dataset = load_dataset(dataset_name)
    num_worms = len(dataset)

    # Running means: every worm adds its share (value / num_worms)
    total_avg = 0
    named_avg = 0
    for worm in dataset:
        single_worm_dataset = dataset[worm]
        total_neurons = single_worm_dataset["num_neurons"]
        labeled_neurons = single_worm_dataset["num_labeled_neurons"]
        total_avg = total_avg + total_neurons / num_worms
        named_avg = named_avg + labeled_neurons / num_worms

    row = {}
    row['Dataset'] = dataset_name
    row['Num Worms'] = num_worms
    row['Named Avg'] = round(named_avg)
    row['Total Avg'] = round(total_avg)
    results.append(row)

print("Table from data_dict (before parquet generation)")
# Note: parquet generation removes some all-zero calcium traces. In testing
# this only affected Yemini2021; otherwise the data_dict and parquet dataset
# info should be identical.

df = pd.DataFrame(results)
# Collapse the two rounded averages into one "(named, total)" display column
df['(Named Avg, Total Avg)'] = df.apply(
    lambda record: f"({record['Named Avg']}, {record['Total Avg']})",
    axis=1,
)
df = df[['Dataset', 'Num Worms', '(Named Avg, Total Avg)']]
df

Table from data_dict (before parquet generation)


Unnamed: 0,Dataset,Num Worms,"(Named Avg, Total Avg)"
0,Kato2015,12,"(42, 128)"
1,Nichols2017,44,"(35, 108)"
2,Skora2018,12,"(47, 129)"
3,Kaplan2020,19,"(37, 115)"
4,Nejatbakhsh2020,21,"(174, 175)"
5,Yemini2021,49,"(111, 126)"
6,Uzel2022,6,"(50, 139)"
7,Dag2023,7,"(101, 143)"
8,Leifer2023,110,"(64, 68)"
9,Lin2023,577,"(8, 8)"
