In [1]:
import os
import numpy as np
import math
import pickle
import torch
from preprocess._utils import *

In [3]:
# Directory holding the raw Leifer2023 text files. It is read as a module-level
# global by pickle_Leifer2023. The path is relative, so it is resolved against
# the current working directory, which is printed to make a wrong location easy to spot.
# TODO: filename should depend on the opensource_data folder
filename = "../opensource_data/Leifer2023"
print(os.getcwd())

def str_to_float(str_num):
    '''
    Change textual scientific notation into a floating-point number

    The built-in ``float`` already parses scientific notation, so it is used
    directly instead of splitting the mantissa and exponent by hand. Compared
    with manual parsing this also accepts an upper-case ``E``, an exponent
    without an explicit sign (``"1e3"``), plain decimals without an exponent
    (``"42.5"``), and surrounding whitespace such as a trailing newline.

    Args:
        str_num: text representation of a number, e.g. ``"1.5e+03"``.

    Returns:
        The parsed value as a ``float``.

    Raises:
        ValueError: if ``str_num`` is not a valid number literal.
    '''
    return float(str_num)

def pickle_Leifer2023(transform, smooth_method="fft", data_dir=None):
    r"""
    Pickles the worm neural activity data from Leifer et al., 2022,
    Neural signal propagation atlas of \textit{C. elegans}.

    For every worm ``i`` the files ``<i>_gcamp.txt`` (calcium traces, one line
    per time point), ``<i>_labels.txt`` (one neuron label per line) and
    ``<i>_t.txt`` (one timestamp per line) are read from the data directory.
    The traces are normalized with ``transform``, smoothed, and stored in a
    per-worm dictionary. All worms are written to ``Leifer2023.pickle`` in the
    current working directory, and the file is read back to print its keys.

    Args:
        transform: object exposing ``fit_transform``; it is re-fit on each
            worm's trace matrix.
        smooth_method: forwarded unchanged to ``smooth_data_preprocess``.
        data_dir: directory containing the raw text files. Defaults to the
            notebook-level ``filename`` so existing calls keep working.

    Returns:
        None. The result is written to disk.
    """
    # TODO: the default location is still hard coded via the global `filename`
    if data_dir is None:
        data_dir = filename

    # calculate the number of worms
    files = os.listdir(data_dir)
    num_worms = len(files) // 6  # every worm has 6 txt files
    data_dict = {}

    for i in range(num_worms):
        worm = "worm" + str(i)
        prefix = os.path.join(data_dir, str(i))

        # Calcium traces: whitespace-separated values, one row per time point.
        # split() without an argument tolerates trailing/double spaces, and
        # blank lines are skipped so a trailing newline does not break parsing.
        with open(prefix + "_gcamp.txt", "r") as f:
            real_data = np.array(
                [list(map(float, line.split())) for line in f if line.strip()]
            )  # format: (time, neuron)

        with open(prefix + "_labels.txt", "r") as f:
            label_list = [line.strip("\n") for line in f]

        # Keep one label per recorded neuron (column of the trace matrix).
        label_list = label_list[: real_data.shape[1]]

        num_unnamed = 0
        seen = set()  # final labels assigned so far, for O(1) duplicate checks
        for j, item in enumerate(label_list):
            # if the neuron is unnamed or repeats an earlier label, give it a
            # placeholder name derived from its column index
            if item == '' or item == 'smthng else' or item in seen:
                label_list[j] = str(j + 300)
                num_unnamed += 1
            seen.add(label_list[j])

        neuron_to_idx = {name: k for k, name in enumerate(label_list)}

        # normalize data (the transform is fit on this worm only)
        real_data = transform.fit_transform(real_data)
        real_data = torch.tensor(real_data, dtype=torch.float32)  # convert to tensor
        smooth_real_data, residual, smooth_residual = smooth_data_preprocess(
            real_data, smooth_method
        )

        with open(prefix + "_t.txt", "r") as f:
            timestamps = [
                str_to_float(line.strip("\n")) for line in f if line.strip()
            ]

        time_in_seconds = torch.tensor(np.array(timestamps)).to(torch.float32)
        # dt[0] stays 0; every other entry is the gap to the previous timestamp
        dt = torch.zeros_like(time_in_seconds).to(torch.float32)
        dt[1:] = time_in_seconds[1:] - time_in_seconds[:-1]

        num_neurons = real_data.shape[1]
        num_named = num_neurons - num_unnamed
        max_time = real_data.shape[0]

        print(
            "len. Ca recording %s, total num. neurons %s, num. ID'd neurons %s"
            % (max_time, num_neurons, num_named),
            end="\n\n",
        )

        data_dict[worm] = {
            "dataset": "Leifer2023",
            "worm": worm,
            "calcium_data": real_data,
            "smooth_calcium_data": smooth_real_data,
            "residual_calcium": residual,
            "smooth_residual_calcium": smooth_residual,
            "neuron_to_idx": neuron_to_idx,
            "idx_to_neuron": dict((v, k) for k, v in neuron_to_idx.items()),
            "max_time": int(max_time),
            "time_in_seconds": time_in_seconds,
            "dt": dt,
            "num_neurons": int(num_neurons),
            "num_named_neurons": num_named,
            "num_unknown_neurons": num_unnamed,
        }

        # standardize the shape of calcium data to 302 x time
        data_dict[worm] = reshape_calcium_data(data_dict[worm])
        # The counts are recomputed from the mask produced by the reshape, so
        # they can differ from the preliminary numbers printed above.
        data_dict[worm]["num_named_neurons"] = data_dict[worm]["named_neurons_mask"].sum().item()
        data_dict[worm]["num_unknown_neurons"] = data_dict[worm]["num_neurons"] - data_dict[worm]["num_named_neurons"]

    # pickle the data; context managers close the handles even on error
    file = os.path.join(os.getcwd(), "Leifer2023.pickle")
    with open(file, "wb") as pickle_out:
        pickle.dump(data_dict, pickle_out)
    # read the file back as a sanity check that it round-trips
    with open(file, "rb") as pickle_in:
        Leifer2023 = pickle.load(pickle_in)
    print(Leifer2023.keys(), end="\n\n")

In [4]:
# Smoothing method handed to pickle_Leifer2023, which forwards it to smooth_data_preprocess
smooth_method = 'fft'
# Scaler applied to each worm's calcium traces with target range (-1, 1).
# NOTE(review): MinMaxScaler is not imported explicitly in this notebook; presumably
# it is provided by the star import from preprocess._utils — confirm.
transform = MinMaxScaler(feature_range=(-1, 1))

In [5]:
# Build the dataset and write Leifer2023.pickle into the current working directory;
# one summary line is printed per worm.
pickle_Leifer2023(transform, smooth_method=smooth_method)

len. Ca recording 4454, total num. neurons 114, num. ID'd neurons 51

len. Ca recording 5395, total num. neurons 136, num. ID'd neurons 65

len. Ca recording 3789, total num. neurons 132, num. ID'd neurons 51

len. Ca recording 4469, total num. neurons 155, num. ID'd neurons 48

len. Ca recording 3818, total num. neurons 149, num. ID'd neurons 65

len. Ca recording 3776, total num. neurons 163, num. ID'd neurons 39

len. Ca recording 5427, total num. neurons 95, num. ID'd neurons 55

len. Ca recording 4981, total num. neurons 118, num. ID'd neurons 54

len. Ca recording 4494, total num. neurons 103, num. ID'd neurons 61

len. Ca recording 3795, total num. neurons 137, num. ID'd neurons 65

len. Ca recording 3804, total num. neurons 136, num. ID'd neurons 46

len. Ca recording 4041, total num. neurons 161, num. ID'd neurons 42

len. Ca recording 3847, total num. neurons 136, num. ID'd neurons 61

len. Ca recording 4147, total num. neurons 90, num. ID'd neurons 55

len. Ca recording 5093

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


len. Ca recording 3800, total num. neurons 97, num. ID'd neurons 58

len. Ca recording 393, total num. neurons 105, num. ID'd neurons 50

len. Ca recording 416, total num. neurons 109, num. ID'd neurons 49

len. Ca recording 4663, total num. neurons 111, num. ID'd neurons 76

len. Ca recording 4209, total num. neurons 91, num. ID'd neurons 0

len. Ca recording 2733, total num. neurons 95, num. ID'd neurons 45

len. Ca recording 760, total num. neurons 80, num. ID'd neurons 62

len. Ca recording 4161, total num. neurons 97, num. ID'd neurons 65



  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


len. Ca recording 5056, total num. neurons 103, num. ID'd neurons 40



  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


len. Ca recording 3869, total num. neurons 101, num. ID'd neurons 67

len. Ca recording 3913, total num. neurons 95, num. ID'd neurons 63

len. Ca recording 4139, total num. neurons 90, num. ID'd neurons 56

len. Ca recording 4411, total num. neurons 102, num. ID'd neurons 49

len. Ca recording 4535, total num. neurons 95, num. ID'd neurons 45

len. Ca recording 4409, total num. neurons 108, num. ID'd neurons 56

len. Ca recording 3853, total num. neurons 132, num. ID'd neurons 62

len. Ca recording 4215, total num. neurons 99, num. ID'd neurons 52

len. Ca recording 4170, total num. neurons 127, num. ID'd neurons 65

len. Ca recording 4176, total num. neurons 100, num. ID'd neurons 59

len. Ca recording 3329, total num. neurons 90, num. ID'd neurons 62

len. Ca recording 893, total num. neurons 90, num. ID'd neurons 39

len. Ca recording 1175, total num. neurons 99, num. ID'd neurons 46

len. Ca recording 939, total num. neurons 85, num. ID'd neurons 22

len. Ca recording 1240, total 

In [19]:
# Reload the pickle written by pickle_Leifer2023 to inspect its contents.
# The context manager closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files produced locally.
with open("Leifer2023.pickle", "rb") as pickle_in:
    dataset = pickle.load(pickle_in)
print(dataset.keys())

dict_keys(['worm0', 'worm1', 'worm2', 'worm3', 'worm4', 'worm5', 'worm6', 'worm7', 'worm8', 'worm9', 'worm10', 'worm11', 'worm12', 'worm13', 'worm14', 'worm15', 'worm16', 'worm17', 'worm18', 'worm19', 'worm20', 'worm21', 'worm22', 'worm23', 'worm24', 'worm25', 'worm26', 'worm27', 'worm28', 'worm29', 'worm30', 'worm31', 'worm32', 'worm33', 'worm34', 'worm35', 'worm36', 'worm37', 'worm38', 'worm39', 'worm40', 'worm41', 'worm42', 'worm43', 'worm44', 'worm45', 'worm46', 'worm47', 'worm48'])


In [20]:
# Use the first worm as a representative entry for the inspection cells that follow
single_worm_dataset = dataset['worm0']

In [21]:
# List the per-worm fields: those set in pickle_Leifer2023 plus the extra entries
# (masks, slot mappings) present after the reshape_calcium_data call.
print(single_worm_dataset.keys())

dict_keys(['dataset', 'worm', 'calcium_data', 'smooth_calcium_data', 'residual_calcium', 'smooth_residual_calcium', 'neuron_to_idx', 'idx_to_neuron', 'max_time', 'time_in_seconds', 'dt', 'num_neurons', 'num_named_neurons', 'num_unknown_neurons', 'named_neurons_mask', 'unknown_neurons_mask', 'neurons_mask', 'named_neuron_to_idx', 'idx_to_named_neuron', 'unknown_neuron_to_idx', 'idx_to_unknown_neuron', 'slot_to_named_neuron', 'named_neuron_to_slot', 'slot_to_unknown_neuron', 'unknown_neuron_to_slot', 'slot_to_neuron', 'neuron_to_slot'])


In [22]:
# Shape check: after reshape_calcium_data the traces are expected as (max_time, 302)
print(single_worm_dataset['calcium_data'].shape)

torch.Size([4454, 302])


In [23]:
# Shape check for the smoothed traces; should match calcium_data
print(single_worm_dataset['smooth_calcium_data'].shape)

torch.Size([4454, 302])


In [24]:
# Shape check for the residual returned by smooth_data_preprocess; should match calcium_data
print(single_worm_dataset['residual_calcium'].shape)

torch.Size([4454, 302])


In [25]:
# Shape check for the smoothed residual returned by smooth_data_preprocess; should match calcium_data
print(single_worm_dataset['smooth_residual_calcium'].shape)

torch.Size([4454, 302])


In [26]:
# Label -> column index mapping. Unnamed or duplicated labels were replaced in
# pickle_Leifer2023 by the numeric string str(column_index + 300).
print(single_worm_dataset['neuron_to_idx'])

{'300': 0, 'IL2V': 1, 'BAGL': 2, 'ASEL': 3, '304': 4, 'SMDVL': 5, 'AMsoL': 6, 'I1L': 7, '308': 8, '309': 9, 'RMED': 10, '311': 11, 'NSML': 12, 'M3L': 13, 'M1': 14, 'RMDVL': 15, '316': 16, 'AVDL': 17, 'AWBL': 18, 'AVEL': 19, 'FLPL': 20, '321': 21, '322': 22, '323': 23, '324': 24, '325': 25, '326': 26, 'I3': 27, 'URXL': 28, '329': 29, '330': 30, 'AVAL': 31, '332': 32, '333': 33, 'SMDDL': 34, 'AIYL': 35, 'AIZL': 36, 'AMsoR': 37, 'OLQDR': 38, '339': 39, '340': 40, 'IL1VL': 41, '342': 42, 'SAAD': 43, '344': 44, 'AQR': 45, 'RMDDL': 46, '347': 47, '348': 48, '349': 49, 'AVJR': 50, 'AWAR': 51, '352': 52, '353': 53, 'I1R': 54, 'OLLR': 55, 'AVDR': 56, 'ASGR': 57, 'M3R': 58, '359': 59, 'OLQVR': 60, '361': 61, '362': 62, '363': 63, '364': 64, '365': 65, 'DB1': 66, '367': 67, 'VA1': 68, '369': 69, '370': 70, '371': 71, 'I2R': 72, '373': 73, '374': 74, 'FLPR': 75, '376': 76, 'AWBR': 77, '41': 78, '379': 79, 'AIZR': 80, '48': 81, '382': 82, 'DB2': 83, 'VB1': 84, 'RIG': 85, '386': 86, '387': 87, '388'

In [27]:
# Inverse of neuron_to_idx (column index -> label)
print(single_worm_dataset['idx_to_neuron'])

{0: '300', 1: 'IL2V', 2: 'BAGL', 3: 'ASEL', 4: '304', 5: 'SMDVL', 6: 'AMsoL', 7: 'I1L', 8: '308', 9: '309', 10: 'RMED', 11: '311', 12: 'NSML', 13: 'M3L', 14: 'M1', 15: 'RMDVL', 16: '316', 17: 'AVDL', 18: 'AWBL', 19: 'AVEL', 20: 'FLPL', 21: '321', 22: '322', 23: '323', 24: '324', 25: '325', 26: '326', 27: 'I3', 28: 'URXL', 29: '329', 30: '330', 31: 'AVAL', 32: '332', 33: '333', 34: 'SMDDL', 35: 'AIYL', 36: 'AIZL', 37: 'AMsoR', 38: 'OLQDR', 39: '339', 40: '340', 41: 'IL1VL', 42: '342', 43: 'SAAD', 44: '344', 45: 'AQR', 46: 'RMDDL', 47: '347', 48: '348', 49: '349', 50: 'AVJR', 51: 'AWAR', 52: '352', 53: '353', 54: 'I1R', 55: 'OLLR', 56: 'AVDR', 57: 'ASGR', 58: 'M3R', 59: '359', 60: 'OLQVR', 61: '361', 62: '362', 63: '363', 64: '364', 65: '365', 66: 'DB1', 67: '367', 68: 'VA1', 69: '369', 70: '370', 71: '371', 72: 'I2R', 73: '373', 74: '374', 75: 'FLPR', 76: '376', 77: 'AWBR', 78: '41', 79: '379', 80: 'AIZR', 81: '48', 82: '382', 83: 'DB2', 84: 'VB1', 85: 'RIG', 86: '386', 87: '387', 88: '

In [28]:
# Number of time points in the recording (rows of the trace matrix)
print(single_worm_dataset['max_time'])

4454


In [29]:
# Timestamps parsed from the worm's _t.txt file; the length should equal max_time
print(single_worm_dataset['time_in_seconds'].shape)

torch.Size([4454])


In [30]:
# dt holds the difference between consecutive timestamps, with dt[0] left at 0,
# so it has the same length as time_in_seconds
print(single_worm_dataset['dt'].shape)

torch.Size([4454])


In [31]:
# Number of neurons recorded in the raw trace matrix (not the 302 standardized columns)
print(single_worm_dataset['num_neurons'])

114


In [32]:
# This value is recomputed from named_neurons_mask after reshape_calcium_data, so it
# can be lower than the "num. ID'd neurons" figure printed while pickling.
print(single_worm_dataset['num_named_neurons'])

43


In [33]:
# Stored as num_neurons - num_named_neurons (recomputed after reshape_calcium_data)
print(single_worm_dataset['num_unknown_neurons'])

71


In [34]:
# Name of the source dataset stored with every worm entry
print(single_worm_dataset['dataset'])

Leifer2023


In [35]:
# Worm identifier; matches the key used in the top-level dataset dict
print(single_worm_dataset['worm'])

worm0


In [36]:
# Consistency check: named + unknown neuron counts must add up to num_neurons.
# The last line prints True when the bookkeeping is consistent.
print(single_worm_dataset['num_named_neurons'])
print(single_worm_dataset['num_unknown_neurons'])
print(single_worm_dataset['num_neurons'])
print(single_worm_dataset['num_named_neurons'] + single_worm_dataset['num_unknown_neurons'] == single_worm_dataset['num_neurons'])

43
71
114
True
