In [1]:
import torch
from torch_geometric.data import download_url, extract_zip

In [2]:
import os
import shutil
import mat73
import pickle
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn import preprocessing

In [3]:
# Download the raw data archive once and unpack it next to this notebook.
root = os.getcwd()
url = 'https://www.dropbox.com/s/9dnzrlh12hf5p89/opensource_data.zip?dl=1'
filename = 'opensource_data.zip'
data_path = os.path.join(root, 'opensource_data')

# `root` is the current working directory, so this is normally a no-op;
# it is kept so the cell still works if `root` is pointed somewhere else.
os.makedirs(root, exist_ok=True)

# An existing data folder is taken to mean the archive was already unpacked.
if not os.path.exists(data_path):
    # Use the path returned by download_url rather than a cwd-relative name,
    # so extraction and cleanup always act on the file that was just fetched.
    zip_path = download_url(url=url, folder=root, filename=filename)
    extract_zip(zip_path, folder=data_path) # extract zip file
    os.unlink(zip_path) # remove zip file

Downloading https://www.dropbox.com/s/9dnzrlh12hf5p89/opensource_data.zip?dl=1
Extracting opensource_data.zip


In [4]:
#@title Nguyen et al., PLOS CompBio 2017, *Automatically tracking neurons in a moving and deforming brain* 

def _load_nguyen_worm(mat_file):
    """Load one Nguyen2017 `heatData_worm*.mat` file and build its dataset entry.

    Prints the keys found in the .mat file and the shape of the recording.

    Args:
        mat_file: path to the .mat file, readable by `scipy.io.loadmat`.

    Returns:
        dict with keys 'data' (min-max scaled array, time x neurons),
        'neuron_ids' (1-based index -> index as a string), 'max_time' and
        'num_neurons'.
    """
    arr = loadmat(mat_file) # load .mat file
    print(list(arr.keys()))
    print()
    # NOTE(review): the earlier comment here described the ratio signal
    # (gPhotoCorr/rPhotoCorr, normalized as delta R / R0), but the variable
    # read is 'G2' -- confirm that G2 (and not 'Ratio2') is the intended signal.
    G2 = arr['G2']
    # ordered indices derived from hierarchically clustering the correlation
    # matrix; presumably 1-based (MATLAB), hence the -1 below -- TODO confirm.
    cgIdx = arr['cgIdx'].squeeze()
    traces = G2[cgIdx-1, :].T # reorder rows by cluster order, then put time on axis 0
    traces = np.nan_to_num(traces) # replace NaNs
    max_time, num_neurons = traces.shape
    # No neuron names are read from this file, so the ID is just the position.
    # Keys are 1-based for every worm, matching the other datasets in this notebook.
    neuron_ids = {k: str(k) for k in range(1, num_neurons + 1)}
    print('len. Ca recording %s, num. neurons %s'%(max_time, num_neurons))
    print()
    # normalize the data (each column is scaled independently)
    scaled = preprocessing.MinMaxScaler().fit_transform(traces)
    return {'data': scaled, 'neuron_ids': neuron_ids,
            'max_time': max_time, 'num_neurons': num_neurons}


# one entry per recorded worm
data_dict = {
    'worm1': _load_nguyen_worm(os.path.join(data_path, 'Nguyen2017', 'heatData_worm1.mat')),
    'worm2': _load_nguyen_worm(os.path.join(data_path, 'Nguyen2017', 'heatData_worm2.mat')),
}

# pickle the data, then read it back as a round-trip check
file = os.path.join(root, "Nguyen2017.pickle")
with open(file, "wb") as pickle_out:
    pickle.dump(data_dict, pickle_out)
with open(file, "rb") as pickle_in:
    Nguyen2017 = pickle.load(pickle_in)
print(Nguyen2017.keys())
print()


['__header__', '__version__', '__globals__', 'hasPointsTime', 'ethoTrack', 'R2', 'G2', 'Ratio2', 'acorr', 'cgIdx', 'cgIdxRev', 'DmatAll']

len. Ca recording 1516, num. neurons 77

['__header__', '__version__', '__globals__', 'hasPointsTime', 'ethoTrack', 'R2', 'G2', 'Ratio2', 'acorr', 'cgIdx', 'cgIdxRev', 'rRaw', 'gRaw', 'rPhotoCorr', 'gPhotoCorr']

len. Ca recording 2849, num. neurons 156

dict_keys(['worm1', 'worm2'])



In [5]:
#@title Kaplan et al., Neuron 2020, *Nested Neuronal Dynamics Orchestrate a Behavioral Hierarchy across Timescales*

data_dict = dict()

# (.mat file name, name of the top-level struct inside it), in processing order
kaplan_files = [
    ('Neuron2019_Data_RIShisCl.mat', 'RIShisCl_Neuron2019'),
    ('Neuron2019_Data_MNhisCl_RIShisCl.mat', 'MNhisCl_RIShisCl_Neuron2019'),
    ('Neuron2019_Data_SMDhisCl_RIShisCl.mat', 'SMDhisCl_RIShisCl_Neuron2019'),
]

for mat_name, struct_name in kaplan_files:
    # load the .mat file
    arr = mat73.loadmat(os.path.join(data_path, 'Kaplan2020', mat_name))[struct_name]
    print(list(arr.keys()))
    print()
    # get data for all worms
    all_IDs = arr['neuron_ID'] # identified neuron IDs (only subset have neuron names)
    all_traces = arr['traces_bleach_corrected'] # neural activity traces corrected for bleaching
    print('num. worms:', len(all_IDs))
    print()

    for k, real_data in enumerate(all_traces):
        # Worms are numbered consecutively across the files. Deriving the
        # number from the entries stored so far avoids relying on loop
        # variables left over from a previous file.
        worm = "worm"+str(len(data_dict)+1)
        # 1-based position -> ID entry as a string
        neuron_IDs = {pos: str(label) for pos, label in enumerate(all_IDs[k], start=1)}
        max_time, num_neurons = real_data.shape
        # NOTE(review): the last number printed is the count of all ID entries,
        # named or not -- in the recorded output it always equals num_neurons.
        print("len. Ca recording %s, total num. neurons %s, num. ID'd neurons %s"%(
            max_time, num_neurons, len(neuron_IDs)))
        # normalize data (each column is scaled independently)
        real_data = preprocessing.MinMaxScaler().fit_transform(real_data)
        data_dict.update({worm: {'data': real_data, 'neuron_ids': neuron_IDs,
                                 'max_time': max_time, 'num_neurons': num_neurons},
                         })

# pickle the data, then read it back as a round-trip check
file = os.path.join(root, "Kaplan2020.pickle")
with open(file, "wb") as pickle_out:
    pickle.dump(data_dict, pickle_out)
with open(file, "rb") as pickle_in:
    Kaplan2020 = pickle.load(pickle_in)
print(Kaplan2020.keys())
print()

ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)


['five_state_annotation', 'neuron_ID', 'parameters', 'peaks', 'raw_traces', 'time_vector', 'traces_bleach_corrected', 'traces_bleach_corrected_detrended', 'traces_derivatives', 'volumes_per_second']

num. worms: 10

len. Ca recording 5455, total num. neurons 103, num. ID'd neurons 103
len. Ca recording 5455, total num. neurons 129, num. ID'd neurons 129
len. Ca recording 5455, total num. neurons 122, num. ID'd neurons 122
len. Ca recording 5455, total num. neurons 119, num. ID'd neurons 119
len. Ca recording 5455, total num. neurons 124, num. ID'd neurons 124
len. Ca recording 3578, total num. neurons 111, num. ID'd neurons 111
len. Ca recording 3269, total num. neurons 128, num. ID'd neurons 128
len. Ca recording 2941, total num. neurons 127, num. ID'd neurons 127
len. Ca recording 4010, total num. neurons 129, num. ID'd neurons 129
len. Ca recording 3692, total num. neurons 124, num. ID'd neurons 124


ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)


['five_state_annotation', 'neuron_ID', 'parameters', 'peaks', 'raw_traces', 'time_vector', 'traces_bleach_corrected', 'traces_bleach_corrected_detrended', 'traces_derivatives', 'volumes_per_second']

num. worms: 4

len. Ca recording 2812, total num. neurons 102, num. ID'd neurons 102
len. Ca recording 2917, total num. neurons 99, num. ID'd neurons 99
len. Ca recording 3163, total num. neurons 107, num. ID'd neurons 107
len. Ca recording 2795, total num. neurons 97, num. ID'd neurons 97


ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)
ERROR:root:ERROR: MATLAB type not supported: function_handle_workspace, (uint32)


['five_state_annotation', 'neuron_ID', 'parameters', 'peaks', 'raw_traces', 'time_vector', 'traces_bleach_corrected', 'traces_bleach_corrected_detrended', 'traces_derivatives', 'volumes_per_second']

num. worms: 5

len. Ca recording 3805, total num. neurons 105, num. ID'd neurons 105
len. Ca recording 3803, total num. neurons 114, num. ID'd neurons 114
len. Ca recording 3657, total num. neurons 123, num. ID'd neurons 123
len. Ca recording 2808, total num. neurons 97, num. ID'd neurons 97
len. Ca recording 3157, total num. neurons 117, num. ID'd neurons 117
dict_keys(['worm1', 'worm2', 'worm3', 'worm4', 'worm5', 'worm6', 'worm7', 'worm8', 'worm9', 'worm10', 'worm11', 'worm12', 'worm13', 'worm14', 'worm15', 'worm16', 'worm17', 'worm18', 'worm19'])



In [11]:
#@title Uzel et al 2022., Cell CurrBio 2022, *A set of hub neurons and non-local connectivity features support global brain dynamics in C. elegans*

data_dict = dict()

# load .mat file
arr = mat73.loadmat(os.path.join(data_path, 'Uzel2022', 'Uzel_WT.mat'))['Uzel_WT']
print(list(arr.keys()))
print()

# get data for all worms
all_IDs = arr['IDs'] # identified neuron IDs (only subset have neuron names)
# neural activity traces -- NOTE(review): an earlier comment said these are
# bleach-corrected; nothing here confirms that for the 'traces' field.
all_traces = arr['traces']
print('num. worms:', len(all_IDs))
print()
for i, real_data in enumerate(all_traces):
    worm = "worm"+str(i+1)
    # unwrap each ID entry to a plain Python scalar (a str or a number)
    labels = [np.array(entry).item() for entry in all_IDs[i]]
    # 1-based position -> ID; string entries are kept as they are, numeric
    # entries are written as integer strings
    neuron_IDs = {pos: (label if isinstance(label, str) else str(int(label)))
                  for pos, label in enumerate(labels, start=1)}
    max_time, num_neurons = real_data.shape
    print("len. Ca recording %s, total num. neurons %s, num. ID'd neurons %s"%(
        max_time, num_neurons, len(neuron_IDs)))
    # normalize data (each column is scaled independently)
    real_data = preprocessing.MinMaxScaler().fit_transform(real_data)
    data_dict.update({worm: {'data': real_data, 'neuron_ids': neuron_IDs,
                             'max_time': max_time, 'num_neurons': num_neurons},
                     })

# pickle the data, then read it back as a round-trip check
file = os.path.join(root, "Uzel2022.pickle")
with open(file, "wb") as pickle_out:
    pickle.dump(data_dict, pickle_out)
with open(file, "rb") as pickle_in:
    Uzel2022 = pickle.load(pickle_in)
print(Uzel2022.keys())
print()


['IDs', 'derivatives', 'fps', 'options', 'states', 'statesKey', 'traces', 'tv']

num. worms: 6

len. Ca recording 3312, total num. neurons 154, num. ID'd neurons 154
len. Ca recording 3749, total num. neurons 154, num. ID'd neurons 154
len. Ca recording 4126, total num. neurons 133, num. ID'd neurons 133
len. Ca recording 5450, total num. neurons 124, num. ID'd neurons 124
len. Ca recording 3313, total num. neurons 136, num. ID'd neurons 136
len. Ca recording 3311, total num. neurons 133, num. ID'd neurons 133
dict_keys(['worm1', 'worm2', 'worm3', 'worm4', 'worm5', 'worm6'])



In [13]:
#@title Delete the downloaded raw datasets.
#@markdown The files are too large to push to GitHub.
# Only remove the folder if it is still there, so re-running this cell after
# the data has already been deleted does not raise FileNotFoundError.
if os.path.isdir(data_path):
    shutil.rmtree(data_path)