In [1]:
# @title Imports

import os
import subprocess
import numpy as np
import matplotlib.pyplot as plt

from omegaconf import OmegaConf
from utils import init_random_seeds
from torch_geometric.data import download_url
from sklearn.preprocessing import StandardScaler
from _utils import Dag2023Preprocessor, reshape_calcium_data
from utils import NEURON_LABELS, ROOT_DIR, EXPERIMENT_DATASETS

# Fix the random seed (42) through the project's `init_random_seeds` helper so that
# reruns of this notebook are repeatable.
# NOTE(review): which RNGs are seeded (Python / NumPy / torch) is defined in `utils` — confirm there.
init_random_seeds(42)

CUDA device found.
	 GPU: NVIDIA A100 80GB PCIe


In [2]:
# @title Setup: Download the dataset

# Resolve where the dataset archive comes from and where it gets extracted to.
dataset_name = "Dag2023"
assert dataset_name in EXPERIMENT_DATASETS, f"{dataset_name} is not a valid dataset."
config = OmegaConf.load("../configs/submodule/preprocess.yaml")
url = config.preprocess.opensource_url
zipfile = config.preprocess.opensource_zipfile
zip_path = os.path.join(ROOT_DIR, zipfile)

# Drop only a trailing ".zip". `str.strip(".zip")` would instead remove any of the
# characters '.', 'z', 'i', 'p' from BOTH ends of the name
# (e.g. "zip_data.zip" -> "_data"), silently pointing at the wrong folder.
zip_stem = zipfile[: -len(".zip")] if zipfile.endswith(".zip") else zipfile
source_path = os.path.join(ROOT_DIR, zip_stem)

# Folder that holds the extracted files of this dataset; used by the cells below.
DATA_DIR = os.path.join(source_path, dataset_name)

if not os.path.exists(DATA_DIR):
    download_url(url=url, folder=ROOT_DIR, filename=zipfile)
    # Extract only this dataset's folder from the archive.
    bash_command = [
        "unzip",
        zip_path,
        "{}/*".format(dataset_name),
        "-d",
        source_path,
    ]
    unzip_result = subprocess.run(bash_command, text=True)  # Run the bash command
    print(unzip_result, end="\n\n")
    # Only delete the archive once the extraction is known to have produced the
    # dataset folder; otherwise keep it so the failure can be inspected/retried.
    if not os.path.isdir(DATA_DIR):
        raise RuntimeError(
            f"Extraction of {zip_path} did not create {DATA_DIR} "
            f"(unzip exit code {unzip_result.returncode})."
        )
    os.unlink(zip_path)
else:
    print(f"{dataset_name} dataset already downloaded and unzipped.")

Dag2023 dataset already downloaded and unzipped.


In [3]:
# Settings handed to the Dag2023 preprocessor.
# NOTE(review): the meaning of each positional argument is defined by
# `Dag2023Preprocessor` in `_utils`; the descriptions below are inferred from the
# variable names — confirm against that class.
transform = StandardScaler()  # scikit-learn scaler object passed as the transform
smooth_method = "exponential"  # name of the smoothing method
interpolate_method = "linear"  # name of the interpolation method
resample_dt = 0.333  # resampling time step — presumably in seconds, TODO confirm

preprocessor = Dag2023Preprocessor(transform, smooth_method, interpolate_method, resample_dt)

In [4]:
 # Load and preprocess data
preprocessed_data = dict()  # Accumulates the preprocessed data across all files
worm_idx = 0  # Running worm index, threaded through every preprocess_traces call

# There are two subfolders in the Dag2023 dataset: 'swf415_no_id' and 'swf702_with_id'
withid_data_files = os.path.join(DATA_DIR, "swf702_with_id")
noid_data_files = os.path.join(DATA_DIR, "swf415_no_id")

# 'NeuroPAL_labels_dict.json' maps data file names to a dictionary of neuron label information
labels_file = "NeuroPAL_labels_dict.json"

# Process the labeled recordings ('swf702_with_id') first, then the unlabeled ones
# ('swf415_no_id').
# NOTE: per the original author, the unlabeled recordings don't get used at all as
# they are skipped in BasePreprocessor.preprocess_traces.
for subfolder, folder_path in (
    ("swf702_with_id", withid_data_files),
    ("swf415_no_id", noid_data_files),
):
    # os.listdir returns entries in arbitrary order; sort so that the order in which
    # files are processed (and hence the running worm index) is the same on every
    # machine and every rerun.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".h5"):
            continue
        # extract_data is given the path relative to the dataset folder
        data_file = os.path.join(subfolder, file)
        neurons, raw_traces, time_vector_seconds = preprocessor.extract_data(data_file, labels_file)
        preprocessed_data, worm_idx = preprocessor.preprocess_traces(
            neurons, raw_traces, time_vector_seconds, preprocessed_data, worm_idx
        )  # preprocess

# Reshape calcium data (values are replaced in place; the set of keys is unchanged)
for worm in preprocessed_data.keys():
    preprocessed_data[worm] = reshape_calcium_data(preprocessed_data[worm])