### Create worm datasets of various sizes by subsampling a larger combined dataset
---
*Last updated: 14 November 2023*

In [1]:
import os
import pickle

from pprint import PrettyPrinter
from itertools import combinations
from utils import init_random_seeds
from data._utils import generate_all_subsets, generate_subsets_of_size

CUDA device found.


In [2]:
# Set random seed for reproducibility
seed = 42
init_random_seeds(seed)

# Settings for pretty printing
pp = PrettyPrinter(indent=4, width=100, compact=True)

# Location of the pickled dataset to subsample.
# Alternative previously used here: datadir = "custom" with "combined_dataset.pickle".
datadir = "processed/neural"
file = os.path.join(datadir, "VanDerPol0000.pickle")

# Load the dataset (a few GBs large). The context manager closes the file handle as
# soon as unpickling finishes instead of leaving it open for the life of the kernel.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(file, "rb") as f:
    combined_dataset = pickle.load(f)
num_worms = len(combined_dataset.keys())

In [3]:
# Print some information about the dataset
all_worm_ids = list(combined_dataset.keys())
print(f"number of worms (N): {num_worms}", end="\n\n")
pp.pprint("all worm IDs: {}".format(all_worm_ids))
print()
print("data keys for each worm's data:")
# Inspect the first worm actually present instead of hard-coding the "worm0" key,
# so the cell also works for datasets whose worm IDs are named differently.
pp.pprint(list(combined_dataset[all_worm_ids[0]].keys()))
print()

# Example of choosing all subsets of size n = N-1 of the wormIDs,
# where N is the number of worms in the combined dataset
N = num_worms
worm_subsets = combinations(all_worm_ids, N - 1)

# Print some information about the subsets
first_n_subsets = 3  # can be very long if we display all
last_x_items = 5  # can be very long if we display all
print(f"Last {last_x_items} items of the first {first_n_subsets} subsets:")
# zip(range(...)) caps the number of subsets shown, including when the cap is 0
# (an enumerate/break loop would print every subset in that case).
for i, subset in zip(range(first_n_subsets), worm_subsets):
    print(f"\t{i}: {subset[-last_x_items:]}")
print()

number of worms (N): 6

"all worm IDs: ['worm0', 'worm1', 'worm2', 'worm3', 'worm4', 'worm5']"

data keys for each worm' data:
[   'dataset', 'smooth_method', 'worm', 'calcium_data', 'smooth_calcium_data', 'residual_calcium',
    'smooth_residual_calcium', 'max_timesteps', 'time_in_seconds', 'dt', 'resample_median_dt',
    'num_neurons', 'num_named_neurons', 'num_unknown_neurons', 'named_neurons_mask',
    'unknown_neurons_mask', 'neurons_mask', 'slot_to_named_neuron', 'named_neuron_to_slot',
    'slot_to_unknown_neuron', 'unknown_neuron_to_slot', 'slot_to_neuron', 'neuron_to_slot',
    'original_time_in_seconds', 'original_dt', 'original_median_dt', 'original_calcium_data',
    'original_smooth_calcium_data', 'original_residual_calcium',
    'original_smooth_residual_calcium']

Last 5 items of the first 3 subset sizes:
	0: ('worm0', 'worm1', 'worm2', 'worm3', 'worm4')
	1: ('worm0', 'worm1', 'worm2', 'worm3', 'worm5')
	2: ('worm0', 'worm1', 'worm2', 'worm4', 'worm5')



In [4]:
# Request up to `num_examples` subsets of size `n` from the combined dataset.
# In the recorded run this returned a list of dicts keyed by dataset name
# (see the printed types below); `as_assignment=True` is passed through as-is.
num_examples = 5
n = 3
subsets_n = generate_subsets_of_size(
    combined_dataset,
    subset_size=n,
    max_subsets=num_examples,
    as_assignment=True,
)
print(f"type(subsets_n): {type(subsets_n)}")
print(f"len(subsets_n): {len(subsets_n)}")
print(f"type(subsets_n[0]): {type(subsets_n[0])}")
print()
print("\tExample # \t Dataset/Worms")
print(f"\t{'-'*40}")
# Use a named, 1-based index: `_` is IPython's "last output" variable and is
# conventionally a throwaway name, so it should not carry a value that is read.
for example_idx, subset in enumerate(subsets_n, start=1):
    print(f"\t\t{example_idx}: \t {tuple(subset.keys())}")
print()

type(subsets_n): <class 'list'>
len(subsets_n): 5
type(subsets_n[0]): <class 'dict'>

	Example # 	 Dataset/Worms
	----------------------------------------
		1: 	 ('VanDerPol0000',)
		2: 	 ('VanDerPol0000',)
		3: 	 ('VanDerPol0000',)
		4: 	 ('VanDerPol0000',)
		5: 	 ('VanDerPol0000',)



In [5]:
# Ask for 2 examples per subset size, for every size n from 1 to N-1
# (N = number of worms in the combined dataset). The recorded run returned a
# dict with one entry per size.
num_examples = 2
N = len(combined_dataset)
subset_options = dict(
    max_subsets_per_size=num_examples,  # number of examples requested per size
    max_size=N - 1,  # size of the largest subset to generate
    as_assignment=True,
)
all_subsets = generate_all_subsets(combined_dataset, **subset_options)

# Two summary lines followed by a blank line
print(
    f"\ttype(all_subsets): {type(all_subsets)}",
    f"\tlen(all_subsets): {len(all_subsets)}",
    "",
    sep="\n",
)

	type(all_subsets): <class 'dict'>
	len(all_subsets): 5



In [6]:
# By using the `as_assignment=True` argument, we get a way to distribute worms to different
# experimental datasets such that we achieve the desired combined dataset size.
# Show the first and the second example generated for every subset size; this
# requires at least two examples per size in `all_subsets`.
for assignment_idx, ordinal in enumerate(("first", "second")):
    print(f"Distribution of worms in the {ordinal} assignment:")
    for size, subsets in all_subsets.items():
        print(
            f"size {size}: \t num. subsets: {len(subsets)} \t example: {subsets[assignment_idx]}"
        )
    print()

Distribution of worms in the first assignment:
size 1: 	 num. subsets: 2 	 example: {'VanDerPol0000': 1}
size 2: 	 num. subsets: 2 	 example: {'VanDerPol0000': 2}
size 3: 	 num. subsets: 2 	 example: {'VanDerPol0000': 3}
size 4: 	 num. subsets: 2 	 example: {'VanDerPol0000': 4}
size 5: 	 num. subsets: 2 	 example: {'VanDerPol0000': 5}

Distribution of worms in the the second assignment:
size 1: 	 num. subsets: 2 	 example: {'VanDerPol0000': 1}
size 2: 	 num. subsets: 2 	 example: {'VanDerPol0000': 2}
size 3: 	 num. subsets: 2 	 example: {'VanDerPol0000': 3}
size 4: 	 num. subsets: 2 	 example: {'VanDerPol0000': 4}
size 5: 	 num. subsets: 2 	 example: {'VanDerPol0000': 5}



In [8]:
# Generate a single assignment for each data subset size from 1 through N-1
num_examples = 1
N = len(combined_dataset)
single_example_options = dict(
    max_subsets_per_size=num_examples,
    # original note: `max_size=None` / `max_size=N` -> subsets of all sizes from 1 to N
    max_size=N - 1,
    as_assignment=True,
)
all_subsets = generate_all_subsets(combined_dataset, **single_example_options)

# Summary of the returned container, followed by a blank line
print(
    f"\ttype(all_subsets): {type(all_subsets)}",
    f"\tlen(all_subsets): {len(all_subsets)}",
    "",
    sep="\n",
)
print("Distribution of worms in assignment:", end="\n\n")
for size in all_subsets:
    subsets = all_subsets[size]
    first_example = subsets[0]  # the only example when num_examples == 1
    print(f"size {size}: \t num. subsets: {len(subsets)} \t example: {first_example}")
print()

	type(all_subsets): <class 'dict'>
	len(all_subsets): 5

Distribution of worms in assignment:

size 1: 	 num. subsets: 1 	 example: {'VanDerPol0000': 1}
size 2: 	 num. subsets: 1 	 example: {'VanDerPol0000': 2}
size 3: 	 num. subsets: 1 	 example: {'VanDerPol0000': 3}
size 4: 	 num. subsets: 1 	 example: {'VanDerPol0000': 4}
size 5: 	 num. subsets: 1 	 example: {'VanDerPol0000': 5}



In [9]:
# Example of how to get the Hydra config string:
# one comma-separated entry per subset size, built from that size's first example.
hydra_entries = []
for subsets in all_subsets.values():
    entry = str(subsets[0])
    # removing quotes is important as hydra config parser treats parameters as a string
    entry = entry.replace("'", "")
    # need to remove the space after the colon (:) as hydra config parser throws an error otherwise
    entry = entry.replace(": ", ":")
    hydra_entries.append(entry)
# Joining avoids having to trim a trailing comma afterwards
hydra_str = ",".join(hydra_entries)
print(hydra_str)

{VanDerPol0000:1},{VanDerPol0000:2},{VanDerPol0000:3},{VanDerPol0000:4},{VanDerPol0000:5}
