### Create worm datasets of various sizes by subsampling a larger combined dataset
---
*Last updated: 14 November 2023*

In [9]:
import os
import pickle

from pprint import PrettyPrinter
from itertools import combinations
from utils import init_random_seeds
from data._utils import generate_all_subsets, generate_subsets_of_size

In [10]:
# NOTE(review): random seeding is currently disabled. If the subset helpers
# used below sample randomly (TODO confirm), results will differ between runs;
# uncomment the two lines below to make the notebook reproducible.
# seed = 42
# init_random_seeds(seed)

# Settings for pretty printing
pp = PrettyPrinter(indent=4, width=100, compact=True)

# Load the combined dataset (a dict-like object keyed by worm ID).
datadir = "custom"
file = os.path.join(datadir, "combined_dataset.pickle")
# Open through a context manager so the file handle is closed as soon as the
# load finishes instead of lingering for the lifetime of the kernel.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(file, "rb") as f:
    combined_dataset = pickle.load(f)  # a few GBs large
num_worms = len(combined_dataset.keys())

In [11]:
# Print some information about the dataset
all_worm_ids = list(combined_dataset.keys())
print(f"number of worms (N): {num_worms}", end="\n\n")
pp.pprint("all worm IDs: {}".format(all_worm_ids))
print()
print("data keys for each worm's data:")
# Inspect the first worm in the dataset rather than a hard-coded "worm0" key,
# so the cell still works if the worm IDs are named or ordered differently.
first_worm_id = all_worm_ids[0]
pp.pprint(list(combined_dataset[first_worm_id].keys()))
print()

# Example of choosing all subsets of size N-1 of the worm IDs, where N is the
# total number of worms. `combinations` is lazy, so only the subsets that are
# actually printed get materialized.
worm_subsets = combinations(all_worm_ids, len(all_worm_ids) - 1)
print("Last five worm IDs of the first three size-(N-1) subsets:")
# zip with range(3) stops after three subsets without a manual counter/break.
for i, subset in zip(range(3), worm_subsets):
    print(f"\t{i}: {subset[-5:]}")
print()

number of worms (N): 284

("all worm IDs: ['worm0', 'worm1', 'worm2', 'worm3', 'worm4', 'worm5', 'worm6', 'worm7', 'worm8', "
 "'worm9', 'worm10', 'worm11', 'worm12', 'worm13', 'worm14', 'worm15', 'worm16', 'worm17', "
 "'worm18', 'worm19', 'worm20', 'worm21', 'worm22', 'worm23', 'worm24', 'worm25', 'worm26', "
 "'worm27', 'worm28', 'worm29', 'worm30', 'worm31', 'worm32', 'worm33', 'worm34', 'worm35', "
 "'worm36', 'worm37', 'worm38', 'worm39', 'worm40', 'worm41', 'worm42', 'worm43', 'worm44', "
 "'worm45', 'worm46', 'worm47', 'worm48', 'worm49', 'worm50', 'worm51', 'worm52', 'worm53', "
 "'worm54', 'worm55', 'worm56', 'worm57', 'worm58', 'worm59', 'worm60', 'worm61', 'worm62', "
 "'worm63', 'worm64', 'worm65', 'worm66', 'worm67', 'worm68', 'worm69', 'worm70', 'worm71', "
 "'worm72', 'worm73', 'worm74', 'worm75', 'worm76', 'worm77', 'worm78', 'worm79', 'worm80', "
 "'worm81', 'worm82', 'worm83', 'worm84', 'worm85', 'worm86', 'worm87', 'worm88', 'worm89', "
 "'worm90', 'worm91', 'worm92

In [16]:
# Generate 5 examples of all the possible size-3 subsets;
# returns a list of datasets (dicts), each containing 3 worms.
# NOTE(review): the recorded output shows source-dataset names as the keys of
# each subset (sometimes fewer than 3 keys) -- presumably with
# as_reference=True a subset is keyed by source dataset rather than by worm ID.
# TODO confirm against data._utils.generate_subsets_of_size.
subsets_3 = generate_subsets_of_size(
    combined_dataset,
    subset_size=3,
    max_subsets=5,
    as_reference=True,
)
print(f"\ttype(subsets_3): {type(subsets_3)}")
print(f"\tlen(subsets_3): {len(subsets_3)}")
print(f"\ttype(subsets_3[0]): {type(subsets_3[0])}")
print()
# Number the subsets from 1 with enumerate(start=1) instead of binding `_`,
# which is conventionally a throwaway name and is also IPython's variable for
# the last cell output.
for idx, subset in enumerate(subsets_3, start=1):
    print(f"{idx}: {tuple(subset.keys())}")
print()

	type(subsets_3): <class 'list'>
	len(subsets_3): 5
	type(subsets_3[0]): <class 'dict'>

1: ('Nichols2017', 'Yemini2021', 'Kato2015')
2: ('Nichols2017', 'Yemini2021', 'Leifer2023')
3: ('Nichols2017', 'Yemini2021', 'Leifer2023')
4: ('Nichols2017', 'Yemini2021')
5: ('Nichols2017', 'Yemini2021', 'Kato2015')



In [23]:
# Build 2 example subsets for every subset size n from 1 to 20.
# The result is a dict keyed by n whose values are lists of datasets
# containing n worms each.
all_subsets = generate_all_subsets(
    combined_dataset, max_subsets_per_size=2, max_size=20, as_reference=True
)
# Report the container type and how many subset sizes were generated,
# followed by a blank line.
print(
    f"\ttype(all_subsets): {type(all_subsets)}",
    f"\tlen(all_subsets): {len(all_subsets)}",
    "",
    sep="\n",
)

	type(all_subsets): <class 'dict'>
	len(all_subsets): 20



In [22]:
# Show, for every subset size, how many subsets were generated together with
# one example subset. The same report is printed for the first and then the
# second example of each size, so a single parameterized loop replaces the two
# copy-pasted ones.
for ordinal, example_idx in (("first", 0), ("second", 1)):
    print(f"Distribution of the {ordinal} examples:")
    for size, subsets in all_subsets.items():
        # Guard against sizes that produced fewer subsets than requested,
        # which would otherwise raise IndexError; None is printed instead.
        example = subsets[example_idx] if example_idx < len(subsets) else None
        print(f"size {size}: \t num. subsets: {len(subsets)} \t example: {example}")
    print()

Distribution of the first examples:
size 1: 	 num. subsets: 2 	 example: {'Uzel2022': 1}
size 2: 	 num. subsets: 2 	 example: {'Leifer2023': 2}
size 3: 	 num. subsets: 2 	 example: {'Yemini2021': 2, 'Skora2018': 1}
size 4: 	 num. subsets: 2 	 example: {'Nichols2017': 2, 'Flavell2023': 2}
size 5: 	 num. subsets: 2 	 example: {'Leifer2023': 2, 'Nichols2017': 2, 'Flavell2023': 1}
size 6: 	 num. subsets: 2 	 example: {'Yemini2021': 2, 'Flavell2023': 1, 'Nichols2017': 1, 'Leifer2023': 2}
size 7: 	 num. subsets: 2 	 example: {'Nichols2017': 2, 'Leifer2023': 4, 'Kaplan2020': 1}
size 8: 	 num. subsets: 2 	 example: {'Yemini2021': 3, 'Leifer2023': 4, 'Nichols2017': 1}
size 9: 	 num. subsets: 2 	 example: {'Nichols2017': 2, 'Leifer2023': 4, 'Kaplan2020': 1, 'Flavell2023': 1, 'Yemini2021': 1}
size 10: 	 num. subsets: 2 	 example: {'Leifer2023': 3, 'Kaplan2020': 1, 'Yemini2021': 2, 'Flavell2023': 1, 'Nichols2017': 3}
size 11: 	 num. subsets: 2 	 example: {'Nichols2017': 1, 'Leifer2023': 8, 'Yemini