# My Scaling Law Plots
---

Parameters to vary:
1. Sequence length (time steps recorded)
2. Number of worms (number of individuals)
3. Number of neurons recorded (labeling neurons)
4. Do we need labeled neurons or we can atribute randomly?

Another question could be:
1. How the optimization time evolves with the amount of data?
2. And with the amount of used neurons?

In [1]:
from omegaconf import OmegaConf
from data._main import *
from models._main import *
from train._main import *

config_train = OmegaConf.load("../../../conf/train.yaml")
print("config:", OmegaConf.to_yaml(config_train), end="\n\n")

model = get_model(OmegaConf.load("../../../conf/model.yaml"))

dataset = get_dataset(OmegaConf.load("../../../conf/dataset.yaml"))

model, log_dir, config_train = train_model(
    config_train,
    model,
    dataset,
    shuffle_worms=False,
    log_dir=os.path.join("logs", "{}".format(datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))),
)

In [2]:
from tests.leandro.mini_connectome import *
from tests.leandro.plots import *
from omegaconf import OmegaConf
from data._main import *
from train._utils import split_train_test
from statsmodels.tsa.stattools import grangercausalitytests
import torch
from torch.utils.data import DataLoader, random_split, ConcatDataset
import random

In [3]:
miniconnectome = MiniConnectome(direction='TD', group_by='four')
connected_to_M3L = miniconnectome.get_connected_neurons('M3L', index=False)
miniconnectome.minigraph.add_nodes('M3L', connected_to_M3L)
miniconnectome.minigraph.add_nodes(connected_to_M3L[0], miniconnectome.get_connected_neurons(connected_to_M3L[0], index=False)[2:3])

miniconnectome.minigraph.display(save=False, filename='M3L.png')

# Create the dataloader
---

In [4]:
# Load the dict dataset
config = OmegaConf.load("../../../conf/dataset.yaml")
dict_dataset = get_dataset(config)

Chosen dataset(s): ['Flavell2023']
Num. worms: 50



In [59]:
# Get one worm relevant data for training

def get_calcium_data(worm_data, neuron_names=None, use_residual=False):

    if use_residual:
        calcium_data = worm_data['smooth_residual_calcium']
    else:
        calcium_data = worm_data['smooth_calcium_data']

    time_vector = worm_data['time_in_seconds']

    # Verify if we have the neurons in the dataset
    if neuron_names is not None:
        for neuron in neuron_names:
            assert neuron in worm_data['named_neuron_to_slot'], f"We don't have data of neuron {neuron} for this worm"
        source_neurons_idx = [worm_data['named_neuron_to_slot'][neuron] for neuron in neuron_names] # Subset of labeled neurons
        return time_vector, calcium_data[:, source_neurons_idx]
    
    else:
        return time_vector, calcium_data

In [60]:
time_vec, calcium_data = get_calcium_data(dict_dataset['worm0'], neuron_names=None)

In [62]:
# === Train and test loaders ===

# One cohort = one epoch
# In one cohort we have X worms
# We can extract Y time series samples from each worm
# In each epoch we have sum_i=1^X Y_i samples

number_cohorts = 2

# This is a way of keeping track of the number of worms that we need.

# TODO: In order to chose the number of worms in a cohort, we can select the desired ones in the dict_dataset


# === Parameters ===
use_residual = False
k_splits = 2 # Number of chunks to split the data: 1 chunk = 1 train/test split. Order: train, test, train, test, ...
seq_len = 100 # Number of time steps to extract from each chunk (time steps of each example)
num_samples = 100 # Total number of sample pairs (input,target) to extract from each worm
reverse = False # If True, the time series is reversed
tau = 10 # Number of time steps to shift the target (number of time steps we want to predict ahead)

batch_size = 32
shuffle_samples = True

desired_neurons_to_train_with = ['M3L']

shuffle_worms = True

cohorts = sorted(dict_dataset.items()) * number_cohorts

assert (len(cohorts) == number_cohorts * len(dict_dataset)), "Invalid number of worms."

if shuffle_worms == True:
	cohorts = random.sample(cohorts, k=len(cohorts))

# Split one cohort per epoch
cohorts = np.array(np.array_split(cohorts, number_cohorts)) # Shape: (number_cohorts, number_worms, 2 - wormID and wormData)

# === Creating the datasets ===

# Memoize creation of data loaders and masks for speedup
memo = {}

cohort_trainloaders = np.empty(cohorts.shape[0], dtype=object)
cohort_testloaders = np.empty(cohorts.shape[0], dtype=object)

for cohort_idx, cohort in enumerate(cohorts):
	# Train and test datasets for each worm
	train_datasets = np.empty(cohort.shape[0], dtype=object)
	if cohort_idx == 0:  # Keep the validation dataset the same
		test_datasets = np.empty(cohort.shape[0], dtype=object)
	# Used neurons masks for each worm
	neuron_masks = np.empty(cohort.shape[0], dtype=object)
	
	# Iterate over worms
	for worm_idx, (worm_id, worm_data) in enumerate(cohort):
		# If we have already loaded the worm...
		if worm_id in memo:
			train_datasets[worm_idx] = memo[worm_id]["train_dataset"] # Recover the train dataset of this worm
			if cohort_idx == 0:
				test_datasets[worm_idx] = memo[worm_id]["test_dataset"] # Recover the test dataset of this worm
		
		# If we have not loaded the worm...
		else:

			time_vec, calcium_data = get_calcium_data(worm_data, neuron_names=desired_neurons_to_train_with)

			train_datasets[worm_idx], test_dataset_tmp, _, _ = split_train_test(
				data = calcium_data,
				k_splits = k_splits,
				seq_len = seq_len,
				num_samples = num_samples,
				time_vec = time_vec,
				reverse = reverse,
				tau = tau,
				use_residual = use_residual,
			)
			if cohort_idx == 0:  # Keep the validation dataset the same
					test_datasets[worm_idx] = test_dataset_tmp

			# Add to memo
			memo[worm_id] = dict(
				train_dataset=train_datasets[worm_idx],
				test_dataset=test_dataset_tmp,
			)

	cohort_train_dataset = ConcatDataset(list(train_datasets))
	cohort_test_dataset = ConcatDataset(list(test_datasets))

	cohort_trainloaders[cohort_idx] = DataLoader(
            cohort_train_dataset,
            batch_size=batch_size,
            shuffle=shuffle_samples,
            pin_memory=True,
            num_workers=0,
        )  # returns (X, Y, Dict) when iterating over it
	
	cohort_testloaders[cohort_idx] = DataLoader(
            cohort_test_dataset,
            batch_size=batch_size,
            shuffle=shuffle_samples,
            pin_memory=True,
            num_workers=0,
        )  # returns (X, Y, Dict) when iterating over it
	
# Print number of examples in each cohort
for cohort_idx, cohort in enumerate(cohorts):
    print(f"Cohort {cohort_idx}: {len(cohort)} worms")
    print(f"Train dataset: {len(cohort_trainloaders[cohort_idx])*batch_size} samples")
    print(f"Test dataset: {len(cohort_testloaders[cohort_idx])*batch_size} samples")

Cohort 0: 50 worms
Train dataset: 5024 samples
Test dataset: 5024 samples
Cohort 1: 50 worms
Train dataset: 5024 samples
Test dataset: 5024 samples


In [63]:
next(iter(cohort_trainloaders[0]))[0].shape

torch.Size([32, 100, 1])