### Create worm datasets of various sizes by subsampling a larger combined dataset
---
*Last updated: 14 November 2023*

In [72]:
import os
import pickle

from pprint import PrettyPrinter
from itertools import combinations
from utils import init_random_seeds
from data._utils import generate_all_subsets, generate_subsets_of_size

In [73]:
# Set random seed for reproducibility
seed = 42
init_random_seeds(seed)

# Settings for pretty printing
pp = PrettyPrinter(indent=4, width=100, compact=True)

# Choose which dataset to work with: exactly one `file = ...` line should be
# left uncommented across the two sections below.
### Synthetic datasets ###
datadir = "processed/neural"
# file = os.path.join(datadir, "Sines0000.pickle")
# file = os.path.join(datadir, "Lorenz0000.pickle")
# file = os.path.join(datadir, "WhiteNoise0000.pickle")
# file = os.path.join(datadir, "RandWalk0000.pickle")
file = os.path.join(datadir, "VanDerPol0000.pickle")
###########################################

# ### Combined experimental datasets ###
# datadir = "combined_AllExperimental"
# file = os.path.join(datadir, "combined_dataset.pickle")
# ############################################

# Load the combined dataset (can be a few GBs large).
# The context manager guarantees the file handle is closed once loading is
# done, even if unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(file, "rb") as pickle_file:
    combined_dataset = pickle.load(pickle_file)

# The dataset is keyed by worm ID, so its length is the number of worms.
num_worms = len(combined_dataset)

In [74]:
# Print some information about the dataset
all_worm_ids = list(combined_dataset.keys())
print(f"number of worms (N): {num_worms}", end="\n\n")
pp.pprint("all worm IDs: {}".format(all_worm_ids))
print()
print("data keys for each worm's data:")
# Inspect the first worm that is actually present rather than assuming that a
# worm called "worm0" exists; skip the listing entirely for an empty dataset.
if all_worm_ids:
    pp.pprint(list(combined_dataset[all_worm_ids[0]].keys()))
print()

# Example of choosing all subsets of size n = N-1 of the wormIDs,
# where N is the number of worms in the combined dataset.
# `combinations` returns a lazy iterator, so nothing is materialized here.
N = num_worms
worm_subsets = combinations(all_worm_ids, N - 1)

# Print some information about the subsets
first_n_subsets = 3  # can be very long if we display all
last_x_items = 5  # can be very long if we display all
print(f"Last {last_x_items} items of the first {first_n_subsets} subset sizes:")
# Pairing with `range` (listed first) stops after `first_n_subsets` subsets
# without pulling an extra one from the iterator, and correctly prints nothing
# when `first_n_subsets` is 0.
for i, subset in zip(range(first_n_subsets), worm_subsets):
    print(f"\t{i}: {subset[-last_x_items:]}")
print()

number of worms (N): 101

("all worm IDs: ['worm0', 'worm1', 'worm2', 'worm3', 'worm4', 'worm5', 'worm6', 'worm7', 'worm8', "
 "'worm9', 'worm10', 'worm11', 'worm12', 'worm13', 'worm14', 'worm15', 'worm16', 'worm17', "
 "'worm18', 'worm19', 'worm20', 'worm21', 'worm22', 'worm23', 'worm24', 'worm25', 'worm26', "
 "'worm27', 'worm28', 'worm29', 'worm30', 'worm31', 'worm32', 'worm33', 'worm34', 'worm35', "
 "'worm36', 'worm37', 'worm38', 'worm39', 'worm40', 'worm41', 'worm42', 'worm43', 'worm44', "
 "'worm45', 'worm46', 'worm47', 'worm48', 'worm49', 'worm50', 'worm51', 'worm52', 'worm53', "
 "'worm54', 'worm55', 'worm56', 'worm57', 'worm58', 'worm59', 'worm60', 'worm61', 'worm62', "
 "'worm63', 'worm64', 'worm65', 'worm66', 'worm67', 'worm68', 'worm69', 'worm70', 'worm71', "
 "'worm72', 'worm73', 'worm74', 'worm75', 'worm76', 'worm77', 'worm78', 'worm79', 'worm80', "
 "'worm81', 'worm82', 'worm83', 'worm84', 'worm85', 'worm86', 'worm87', 'worm88', 'worm89', "
 "'worm90', 'worm91', 'worm92

In [75]:
# Draw at most `num_examples` of the possible size-n subsets of the dataset.
# The result is a list with one entry per example; with `as_assignment=True`
# each entry is a dict (in the recorded output its keys are source dataset
# names, e.g. 'VanDerPol0000').
n = 3
num_examples = 5
subsets_n = generate_subsets_of_size(
    combined_dataset,
    subset_size=n,
    max_subsets=num_examples,
    as_assignment=True,
)

# Summarize the container types before listing the individual examples.
print(f"type(subsets_n): {type(subsets_n)}")
print(f"len(subsets_n): {len(subsets_n)}")
print(f"type(subsets_n[0]): {type(subsets_n[0])}", end="\n\n")

# One row per example, numbered from 1, showing the keys of that example.
print("\tExample # \t Dataset/Worms")
print("\t" + "-" * 40)
for example_number, subset in enumerate(subsets_n, start=1):
    print(f"\t\t{example_number}: \t {tuple(subset)}")
print()

type(subsets_n): <class 'list'>
len(subsets_n): 5
type(subsets_n[0]): <class 'dict'>

	Example # 	 Dataset/Worms
	----------------------------------------
		1: 	 ('VanDerPol0000',)
		2: 	 ('VanDerPol0000',)
		3: 	 ('VanDerPol0000',)
		4: 	 ('VanDerPol0000',)
		5: 	 ('VanDerPol0000',)



In [76]:
# Build up to two examples for every subset size n = 1, ..., N-1, where N is
# the number of worms in the combined dataset. The result is a dict keyed by
# subset size; each value is the list of examples generated for that size.
N = len(combined_dataset)
num_examples = 2
all_subsets = generate_all_subsets(
    combined_dataset,
    max_size=N - 1,  # size of the largest subset to generate
    max_subsets_per_size=num_examples,  # number of examples kept per size
    as_assignment=True,
)
print(f"\ttype(all_subsets): {type(all_subsets)}")
print(f"\tlen(all_subsets): {len(all_subsets)}", end="\n\n")

	type(all_subsets): <class 'dict'>
	len(all_subsets): 100



In [77]:
# By using the `as_assignment=True` argument, we get a way to distribute worms to different
# experimental datasets such that we achieve the desired combined dataset size.
# Each subset size holds two examples here (generated with `num_examples = 2`),
# so show them one after the other: index 0 is the first assignment and
# index 1 is the second.
for example_index, ordinal in enumerate(("first", "second")):
    print(f"Distribution of worms in the {ordinal} assignment:")
    for size, subsets in all_subsets.items():
        print(
            f"size {size}: \t num. subsets: {len(subsets)} \t example: {subsets[example_index]}"
        )
    print()

Distribution of worms in the first assignment:
size 1: 	 num. subsets: 2 	 example: {'VanDerPol0000': 1}
size 2: 	 num. subsets: 2 	 example: {'VanDerPol0000': 2}
size 3: 	 num. subsets: 2 	 example: {'VanDerPol0000': 3}
size 4: 	 num. subsets: 2 	 example: {'VanDerPol0000': 4}
size 5: 	 num. subsets: 2 	 example: {'VanDerPol0000': 5}
size 6: 	 num. subsets: 2 	 example: {'VanDerPol0000': 6}
size 7: 	 num. subsets: 2 	 example: {'VanDerPol0000': 7}
size 8: 	 num. subsets: 2 	 example: {'VanDerPol0000': 8}
size 9: 	 num. subsets: 2 	 example: {'VanDerPol0000': 9}
size 10: 	 num. subsets: 2 	 example: {'VanDerPol0000': 10}
size 11: 	 num. subsets: 2 	 example: {'VanDerPol0000': 11}
size 12: 	 num. subsets: 2 	 example: {'VanDerPol0000': 12}
size 13: 	 num. subsets: 2 	 example: {'VanDerPol0000': 13}
size 14: 	 num. subsets: 2 	 example: {'VanDerPol0000': 14}
size 15: 	 num. subsets: 2 	 example: {'VanDerPol0000': 15}
size 16: 	 num. subsets: 2 	 example: {'VanDerPol0000': 16}
size 17: 	 

In [78]:
# Generate a single assignment for every data subset size from 1 to N-1,
# where N is the number of worms in the combined dataset.
N = len(combined_dataset)
num_examples = 1
all_subsets = generate_all_subsets(
    combined_dataset,
    # Per the original note: `max_size=None` / `max_size=N` would instead give
    # subsets of all sizes from 1 to N.
    max_size=N - 1,
    max_subsets_per_size=num_examples,
    as_assignment=True,
)
print(f"\ttype(all_subsets): {type(all_subsets)}")
print(f"\tlen(all_subsets): {len(all_subsets)}", end="\n\n")

# Show the one example stored for each subset size.
print("Distribution of worms in assignment:")
print()
for subset_size, examples in all_subsets.items():
    print(f"size {subset_size}: \t num. subsets: {len(examples)} \t example: {examples[0]}")
print()

	type(all_subsets): <class 'dict'>
	len(all_subsets): 100

Distribution of worms in assignment:

size 1: 	 num. subsets: 1 	 example: {'VanDerPol0000': 1}
size 2: 	 num. subsets: 1 	 example: {'VanDerPol0000': 2}
size 3: 	 num. subsets: 1 	 example: {'VanDerPol0000': 3}
size 4: 	 num. subsets: 1 	 example: {'VanDerPol0000': 4}
size 5: 	 num. subsets: 1 	 example: {'VanDerPol0000': 5}
size 6: 	 num. subsets: 1 	 example: {'VanDerPol0000': 6}
size 7: 	 num. subsets: 1 	 example: {'VanDerPol0000': 7}
size 8: 	 num. subsets: 1 	 example: {'VanDerPol0000': 8}
size 9: 	 num. subsets: 1 	 example: {'VanDerPol0000': 9}
size 10: 	 num. subsets: 1 	 example: {'VanDerPol0000': 10}
size 11: 	 num. subsets: 1 	 example: {'VanDerPol0000': 11}
size 12: 	 num. subsets: 1 	 example: {'VanDerPol0000': 12}
size 13: 	 num. subsets: 1 	 example: {'VanDerPol0000': 13}
size 14: 	 num. subsets: 1 	 example: {'VanDerPol0000': 14}
size 15: 	 num. subsets: 1 	 example: {'VanDerPol0000': 15}
size 16: 	 num. subse

In [79]:
# Example of how to get the Hydra config string:
# a comma-separated list with one assignment per selected subset size.
skip_every = 3  # keep only subset sizes that are a multiple of this value
hydra_str = ",".join(
    # Take the first assignment for each size and turn its dict repr into the
    # form used on the command line:
    #   - quotes are removed, as the hydra config parser treats parameters as
    #     strings already;
    #   - the space after the colon (:) is removed, as the hydra config parser
    #     throws an error otherwise.
    str(subsets[0]).replace("'", "").replace(": ", ":")
    for size, subsets in all_subsets.items()
    if size % skip_every == 0
)
print(hydra_str)

{VanDerPol0000:3},{VanDerPol0000:6},{VanDerPol0000:9},{VanDerPol0000:12},{VanDerPol0000:15},{VanDerPol0000:18},{VanDerPol0000:21},{VanDerPol0000:24},{VanDerPol0000:27},{VanDerPol0000:30},{VanDerPol0000:33},{VanDerPol0000:36},{VanDerPol0000:39},{VanDerPol0000:42},{VanDerPol0000:45},{VanDerPol0000:48},{VanDerPol0000:51},{VanDerPol0000:54},{VanDerPol0000:57},{VanDerPol0000:60},{VanDerPol0000:63},{VanDerPol0000:66},{VanDerPol0000:69},{VanDerPol0000:72},{VanDerPol0000:75},{VanDerPol0000:78},{VanDerPol0000:81},{VanDerPol0000:84},{VanDerPol0000:87},{VanDerPol0000:90},{VanDerPol0000:93},{VanDerPol0000:96},{VanDerPol0000:99}
