### Create worm datasets of various sizes by subsampling a larger combined dataset
---
*Last updated: 14 November 2023*

In [5]:
import os
import pickle

from itertools import combinations
from utils import init_random_seeds
from data._utils import generate_all_subsets, generate_subsets_of_size

In [6]:
# Set random seed for reproducibility
seed = 42
init_random_seeds(seed)

# Load combined dataset and print some information about it
datadir = "combined_dataset"
file = os.path.join(datadir, "combined_dataset.pickle")
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open for the rest of the session.
with open(file, "rb") as fh:
    combined_dataset = pickle.load(fh)  # a few GBs large
num_worms = len(combined_dataset.keys())
print(f"number of worms (N): {num_worms}", end="\n\n")
print(f"all worm Ids:\n {list(combined_dataset.keys())}", end="\n\n")
# Inspect the per-worm data keys using the entry stored under 'worm0'
print(f"data keys for each worm' data:\n {list(combined_dataset['worm0'].keys())}")

# Example: enumerate the subsets of size N-1 of the worm IDs (N being the
# number of worm IDs) and show the first three of them. Only the last five
# IDs of each subset are printed to keep the output short.
all_worm_ids = list(combined_dataset.keys())
num_ids = len(all_worm_ids)
worm_subsets = combinations(all_worm_ids, num_ids - 1)
# Pairing with range(3) stops after three subsets; range is advanced first,
# so no fourth subset is pulled from the combinations iterator.
for i, subset in zip(range(3), worm_subsets):
    print(f"{i}: {subset[-5:]}")
print()

# Generate 10 of all the possible size-3 subsets;
# returns list of datasets containing 3 worms
subsets_3 = generate_subsets_of_size(combined_dataset, subset_size=3, max_subsets=10)
print(type(subsets_3))
print(len(subsets_3))
print(type(subsets_3[0]))
print()
# Use a real name for the index: `_` is conventionally a throwaway, and in an
# interactive session it also serves as the "last result" variable, so it
# should not be assigned and then read back.
for idx, subset in enumerate(subsets_3):
    print(f"{idx}: {tuple(subset.keys())}")
print()

# Build subsets of the combined dataset for a range of sizes, requesting at
# most 10 subsets per size and sizes up to 20. The result is iterated below
# as a mapping from subset size to the list of subsets of that size.
all_subsets = generate_all_subsets(combined_dataset, max_subsets_per_size=10, max_size=20)
print(type(all_subsets))
print(len(all_subsets), end="\n\n")
# Report how many subsets were produced for each size
for size, subsets in all_subsets.items():
    num_subsets = len(subsets)
    print(f"size {size}: \t num. subsets: {num_subsets}")
print()