In [1]:
def load_hdf_group(data_dir, hdf_filename, group="/"):
    '''
    Loads any datasets from the given hdf group into a dictionary. Also will
    recursively load other groups if any exist under the given group

    Args:
        data_dir (str): folder where data is located
        hdf_filename (str): name of hdf file
        group (str): name of the group to load
    
    Returns:
        dict: all the datasets contained in the given group. Nested groups
            become nested dictionaries keyed by the group name.

    Raises:
        ValueError: if ``group`` is not present in the file
    '''
    full_file_name = os.path.join(data_dir, hdf_filename)

    # Recursively load groups until datasets are reached
    def _load_hdf_group(hdf):
        data = dict()
        for k in hdf.keys():
            item = hdf[k]
            if isinstance(item, h5py.Group):
                data[k] = _load_hdf_group(item)
            else:
                # The dataset loader may return a modified key (see _load_hdf_dataset)
                k_, v = _load_hdf_dataset(item, k)
                data[k_] = v
        return data

    # The context manager closes the file on every exit path, including the
    # missing-group error below and any exception raised while loading.
    with h5py.File(full_file_name, 'r') as hdf:
        if group not in hdf:
            raise ValueError('No such group in file {}'.format(hdf_filename))
        return _load_hdf_group(hdf[group])

def _load_hdf_dataset(dataset, name):
    '''
    Internal function for loading hdf datasets. Decodes json and unicode data automatically.

    Args:
        dataset (hdf object): dataset to load
        name (str): name of the dataset

    Returns:
        tuple: Tuple containing:
            | **name (str):** name of the dataset (might be modified)
            | **data (object):** loaded data
    '''
    data = dataset[()]
    if '_json' in name:
        import json
        name = name.replace('_json', '')
        data = json.loads(data)
    try:
        data = data.decode('utf-8')
    except:
        pass
    return name, data

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import aopy
import os
import pandas as pds
from db import dbfunctions as db
from ipywidgets import interactive, widgets
import scipy
import h5py
from tqdm.auto import tqdm 
import seaborn as sn
import sklearn
from sklearn.decomposition import PCA, FactorAnalysis
from itertools import compress
import multiprocessing as mp
import time
import math
from scipy.fft import fft
import glob
from datetime import date



# Set parameters

In [3]:
# Subject whose data is processed and the event names kept for this analysis
subject = 'beignet'
align_events = ['TARGET ONSET', 'GO CUE', 'MOVEMENT ONSET']

# Root results directory, and the folder / file names used underneath it
base_save_dir = "/media/moor-data/results/Ryan/neuropixel_targeting/"
np_preproc_data_folder = 'np_analysis_preproc_data'
ecog_dec_acc_file_name = 'ecog_decoding_maps/npinsert_ecog_decoding'

# Figure-saving switch
save_figs = False

In [4]:
# Decoding calculation parameters
# (not referenced by the other cells shown in this notebook section)
tbefore = 0.5
tafter = 1
nlda_lags = 1
niter_match = 50
min_trial_prop = .85
ntrial_bin_size = 96
nfolds = 4

# Visualization parameters
colors = sn.color_palette(n_colors=9)
# Recording-site numbers grouped under a brain-area label
recording_brain_areas={'M1': [30, 56, 47, 40, 121, 48, 120, 98], 'PM':[11, 9, 18, 22, 10, 45]}
# NOTE(review): 'dodgerblue', 'lightblue' and 'purple' each appear twice - confirm this is intended
day_colors = ['dodgerblue', 'indigo', 'violet', 'lightblue', 'mediumorchid',
              'purple', 'steelblue', 'dodgerblue', 'lightblue', 'red', 'black', 'green', 'purple', 'cyan', 'gray', 'yellow'] 

# Folder the pseudopopulations are written to. Built from the path settings
# defined earlier instead of repeating the absolute path; the resulting string
# is identical to the previously hard-coded value.
save_dir = os.path.join(base_save_dir, np_preproc_data_folder)

# Load and extract relevant data

In [5]:
# Load the preprocessed data for this subject and report how long it took
start = time.time()
aopy.utils.release_memory_limit()
preproc_dir = os.path.join(base_save_dir, np_preproc_data_folder)
df, rasters, preproc_metadata = aopy.data.base.pkl_read(f"{subject}_np_preprocessed", preproc_dir)
elapsed_min = np.round((time.time() - start) / 60)
print(f"{elapsed_min} min to load preprocessed data")

# Per-recording metadata
nrecs = preproc_metadata['nrecs']
recording_site = preproc_metadata['recording_sites'] # will be the same for all align events

# Collapse implant names to two labels: anything that is not exactly
# 'NP_Insert72' is labelled 'NP_Insert137' (original TODO: the name stored in
# bmi3d is slightly different).
implant_names = preproc_metadata['implant']
implants = [
    'NP_Insert72' if implant_names[irec] == 'NP_Insert72' else 'NP_Insert137'
    for irec in range(len(implant_names))
]

# Sorted unique values of the 'date' column
dates = np.unique(df['date'])

4.0 min to load preprocessed data


In [6]:
# Read the whole hdf file (root group) into a nested dictionary
ecog_dec_acc = load_hdf_group(
    data_dir=base_save_dir,
    hdf_filename=ecog_dec_acc_file_name,
)

In [7]:
# Load the unit quality-control results for this subject
qc_dir = os.path.join(base_save_dir, np_preproc_data_folder)
qc_results, ksdrift = aopy.data.base.pkl_read(f"{subject}_QCunits", qc_dir)


def _qc_per_recording(key):
    """Return ``qc_results[key][irec]`` for every recording index below ``nrecs``."""
    return [qc_results[key][irec] for irec in range(nrecs)]


# The 'manual_*' entries are used for every subject. An earlier variant of this
# cell read 'final_good_unit_labels', 'final_good_unit_idx' and 'position'
# instead.
stable_unit_labels = _qc_per_recording('manual_good_unit_labels')
stable_unit_idx = _qc_per_recording('manual_good_unit_idx')
nstable_unit = np.array([len(unit_idx) for unit_idx in stable_unit_idx])
neuron_pos = _qc_per_recording('manual_position')

In [8]:
# Surface position assigned to each recording: a constant for 'beignet',
# a hand-entered list for 'affi'.
if subject == 'beignet':
    surface_pos = np.ones(nrecs)*3840
elif subject == 'affi':
    # NOTE(review): the last entry (35000) is an order of magnitude larger than
    # every other value - looks like a typo for 3500; confirm before relying on it.
    surface_pos = np.array([3840, 3840, 3400, 3250,3840,3800,3800,3900,3500,3700,2500,3250,2100,3840,3100,1600,3250,2500,3100,3300,3800,3100,3250,2750,2900, 2750,
                           3000,3000, 3000,3700, 3000,3700, 3000,3700, 35000])
else:
    # Without this branch an unknown subject only fails later with a NameError
    raise ValueError(f"No surface positions defined for subject '{subject}'")

# surface_pos is indexed by recording number further down, so it needs at
# least one entry per recording.
if len(surface_pos) < nrecs:
    raise ValueError(f"surface_pos has {len(surface_pos)} entries but there are {nrecs} recordings")

print(preproc_metadata['recording_sites'])
print(surface_pos.shape, nrecs, surface_pos)

[ 11  30  55   9  55  40  18  11   9 121  22   9 120  98  45]
(15,) 15 [3840. 3840. 3840. 3840. 3840. 3840. 3840. 3840. 3840. 3840. 3840. 3840.
 3840. 3840. 3840.]


In [9]:
# Compile per-unit information (one list entry per unit) for conversion to a dataframe.
# Columns cover: recording number/date/site/location, implant, unit label and index,
# absolute and relative depth, waveform, surface position and penetration number.
unit_info = {'rec_number': [], 'date': [], 'rec_site': [], 'rec_xpos': [], 'rec_ypos': [], 'rec_rcaxis': [], 'implant': [],
             'unit_label': [], 'unit_idx': [], 'abs_depth': [], 'rel_depth': [], 'waveform': [], 'surface_pos': [], 'penetration': []}
surface_buffer = 200 # Only needed by the (currently disabled) surface filter described below
for irec in tqdm(range(nrecs)):
    # Nothing to add for recordings without units; skipping also keeps the
    # per-recording lookups below from running on empty data.
    if nstable_unit[irec] == 0:
        continue

    # Values shared by every unit of this recording are computed once here
    # instead of once per unit.
    rec_date = list(df[df['penetration']==irec]['date'])[0] # date (assumes each recording is done on a different day)
    shallowest_offset = np.min(3840-neuron_pos[irec])

    for iunit in range(nstable_unit[irec]):
        # Disabled filter: only keep units below the estimated brain surface, i.e.
        # neuron_pos[irec][iunit] <= (surface_pos[irec]+surface_buffer)

        unit_info['rec_number'].append(irec) # Recording number
        unit_info['date'].append(rec_date)
        unit_info['rec_site'].append(preproc_metadata['recording_sites'][irec]) # Recording site
        unit_info['rec_xpos'].append(ecog_dec_acc[subject]['rec_locations'][irec,0]) # Recording site x pos in chamber
        unit_info['rec_ypos'].append(ecog_dec_acc[subject]['rec_locations'][irec,1]) # Recording site y pos in chamber
        unit_info['rec_rcaxis'].append(ecog_dec_acc[subject]['rc_axis'][irec,0]) # First column of 'rc_axis' for this recording
        unit_info['implant'].append(implants[irec]) # Implant
        unit_info['unit_label'].append(stable_unit_labels[irec][iunit]) # unit label output from kilosort
        unit_info['unit_idx'].append(stable_unit_idx[irec][iunit]) # unit index
        unit_info['abs_depth'].append(surface_pos[irec] - neuron_pos[irec][iunit]) #Absolute depth [um]
        # Relative depth [um]: the unit with the smallest (3840 - position) in this recording gets 0
        unit_info['rel_depth'].append(3840-neuron_pos[irec][iunit]-shallowest_offset)
        # NOTE(review): waveforms are read from 'mean_wfs' by position iunit while the unit
        # lists come from the 'manual_*' entries - confirm the rows line up
        # (an alternative 'manual_wfs' entry was used in an earlier variant).
        unit_info['waveform'].append(qc_results['mean_wfs'][irec][iunit,:]) #waveform
        unit_info['surface_pos'].append(surface_pos[irec])
        unit_info['penetration'].append(irec)
        

  0%|          | 0/15 [00:00<?, ?it/s]

In [10]:
# One row per unit, one column per key of unit_info
unit_df = pds.DataFrame(data=unit_info)

# Create pseudopopulations

In [11]:
# Pseudopopulation configurations
pp_configs = [
    'random',
    'column',
    'depth',
]

## Random

In [12]:
# Create random neuron populations
nrandom_units = 10 # Number of random units chosen in each group
nrandom_groups = 1000 # Number of random groups to make
random_seed = 0 # Fixed seed so that re-running the notebook draws the same groups
rng = np.random.default_rng(random_seed)
random_group_info = []
for igroup in range(nrandom_groups):
    # Draw row positions without replacement, so a unit appears at most once per group
    group_idx = rng.choice(len(unit_df), size=nrandom_units, replace=False)
    # The drawn values are positions, hence positional indexing
    random_group_info.append(unit_df.iloc[group_idx])

## Column

In [13]:
# Unique rows of the recording locations, plus the index of the first recording
# at each of them. (The next cell repeats this computation.)
unique_rec_pos, unique_rec_pos_idx = np.unique(
    ecog_dec_acc[subject]['rec_locations'],
    return_index=True,
    axis=0,
)
column_group_info = list()

In [14]:
# Group units by recording location: one dataframe per unique (x, y) location
min_probe_depth = 0 #[um]
unique_rec_pos, unique_rec_pos_idx = np.unique(
    ecog_dec_acc[subject]['rec_locations'], axis=0, return_index=True
)
column_group_info = []
for ipos, pos in enumerate(unique_rec_pos):
    # Units recorded at this location whose surface position exceeds the threshold
    at_location = (unit_df['rec_xpos'] == pos[0]) & (unit_df['rec_ypos'] == pos[1])
    above_threshold = unit_df['surface_pos'] > min_probe_depth
    new_df = unit_df.loc[at_location & above_threshold].reset_index()

    # No units here: insert a single all-NaN placeholder row so later code can
    # still read row 0, then fill in the site and implant of the first
    # recording found at this location.
    if new_df.shape[0] == 0:
        first_rec = unique_rec_pos_idx[ipos]
        new_df.loc[0] = [np.nan] * len(new_df.columns)
        new_df['rec_site'] = ecog_dec_acc[subject]['rec_sites'][first_rec]
        new_df['implant'] = implants[first_rec]

    column_group_info.append(new_df)

    # print(unit_df.loc[(unit_df['rec_xpos']==pos[0]) & (unit_df['rec_ypos']==pos[1])].reset_index())
    # print(len(column_group_info[ipos]),column_group_info[ipos]['rec_site'][0])

In [15]:
# Summary of the column groups: rows per group, and the site in row 0 of each
print('nUnits:  ', [len(group) for group in column_group_info])
print('Rec Site:', [group['rec_site'][0] for group in column_group_info])

# Unique recording locations, followed by the site of every recording
print(len(unique_rec_pos))
print(unique_rec_pos)
print(recording_site)

nUnits:   [5, 3, 5, 27, 10, 17, 89, 22, 33, 85, 40, 46]
Rec Site: [9, 9, 18, 11, 22, 45, 55, 30, 40, 98, 120, 121]
12
[[-4.55999021  3.03999347]
 [-3.99998992  1.99999496]
 [-1.99999496  1.99999496]
 [-1.99999496  3.99998992]
 [-1.51999674  4.55999021]
 [ 0.          3.03999347]
 [ 0.99999748 -2.99999244]
 [ 1.99999496  3.99998992]
 [ 2.99999244  2.99999244]
 [ 3.79999184  0.75999837]
 [ 4.55999021 -1.51999674]
 [ 5.31998858 -0.75999837]]
[ 11  30  55   9  55  40  18  11   9 121  22   9 120  98  45]


## Depth

In [16]:
# depth_ranges = [(0,1000), (1000, 2000), (2000, 100000)] #Boundary distinguishing shallow and deep neurons 

In [17]:
# Group units by relative depth.
# Windows are 1000 wide and start every 250 from 0 to 2750; one extra window
# covers the 1000 below the largest rel_depth in unit_df.
window_width = 1000
depth_ranges = [(lower, lower + window_width) for lower in range(0, 3000, 250)]
max_rel_depth = np.max(unit_df['rel_depth'])
depth_ranges.append((max_rel_depth - window_width, max_rel_depth))

# Lower bound inclusive, upper bound exclusive - so a unit whose rel_depth
# equals the maximum is not part of the final window.
depth_group_info = [
    unit_df.loc[(unit_df['rel_depth'] >= lower) & (unit_df['rel_depth'] < upper)].reset_index()
    for lower, upper in depth_ranges
]

In [18]:
# Group units by recording site and, within each site, by relative-depth window.
# Result: {site: [dataframe for depth_ranges[0], dataframe for depth_ranges[1], ...]}
depth_group_info_by_site = {}
unique_sites = np.unique(unit_df['rec_site'])
for isite, site in enumerate(unique_sites):
    # The site mask does not depend on the depth window, so build it once per site
    site_mask = unit_df['rec_site'] == site
    depth_group_info_by_site[site] = []
    for irange, depth_range in enumerate(depth_ranges):
        # Lower bound inclusive, upper bound exclusive
        depth_mask = (unit_df['rel_depth'] >= depth_range[0]) & (unit_df['rel_depth'] < depth_range[1])
        # Boolean masks are combined with '&' (logical and) rather than arithmetic '*'
        depth_group_info_by_site[site].append(unit_df.loc[depth_mask & site_mask].reset_index())

In [19]:
# For every site, print the number of units in each depth window
for site in unique_sites:
    units_per_window = [len(group) for group in depth_group_info_by_site[site]]
    print(site, units_per_window)

9 [7, 5, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0]
11 [9, 8, 7, 5, 3, 4, 3, 7, 6, 8, 12, 9, 9]
18 [4, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1]
22 [3, 0, 1, 1, 1, 3, 3, 4, 4, 3, 3, 2, 2]
30 [2, 2, 4, 5, 8, 7, 5, 7, 7, 10, 11, 8, 8]
40 [4, 5, 9, 11, 12, 14, 12, 12, 11, 10, 10, 7, 7]
45 [3, 4, 4, 7, 7, 6, 7, 7, 7, 6, 4, 1, 2]
55 [10, 16, 19, 23, 26, 25, 33, 39, 39, 45, 34, 21, 25]
98 [13, 17, 16, 15, 13, 16, 21, 27, 36, 34, 35, 34, 34]
120 [8, 10, 10, 9, 12, 11, 11, 13, 9, 11, 14, 11, 11]
121 [5, 7, 7, 14, 17, 18, 20, 19, 16, 17, 16, 9, 11]


In [20]:
# Spot-check one column group: the fourth-from-last entry of column_group_info
print(column_group_info[-4])

    index  rec_number        date  rec_site  rec_xpos  rec_ypos  rec_rcaxis  \
0     116           5  2023-08-29        40  2.999992  2.999992    2.999992   
1     117           5  2023-08-29        40  2.999992  2.999992    2.999992   
2     118           5  2023-08-29        40  2.999992  2.999992    2.999992   
3     119           5  2023-08-29        40  2.999992  2.999992    2.999992   
4     120           5  2023-08-29        40  2.999992  2.999992    2.999992   
5     121           5  2023-08-29        40  2.999992  2.999992    2.999992   
6     122           5  2023-08-29        40  2.999992  2.999992    2.999992   
7     123           5  2023-08-29        40  2.999992  2.999992    2.999992   
8     124           5  2023-08-29        40  2.999992  2.999992    2.999992   
9     125           5  2023-08-29        40  2.999992  2.999992    2.999992   
10    126           5  2023-08-29        40  2.999992  2.999992    2.999992   
11    127           5  2023-08-29        40  2.99999

# Save

In [21]:
# Settings used to build the pseudopopulations, stored next to the data
pseudopopulation_metadata = dict(
    nrandom_units=nrandom_units,
    nrandom_groups=nrandom_groups,
    depth_ranges=depth_ranges,
)
print(pseudopopulation_metadata)

{'nrandom_units': 10, 'nrandom_groups': 1000, 'depth_ranges': [(0, 1000), (250, 1250), (500, 1500), (750, 1750), (1000, 2000), (1250, 2250), (1500, 2500), (1750, 2750), (2000, 3000), (2250, 3250), (2500, 3500), (2750, 3750), (2720.0, 3720.0)]}


In [22]:
# Save everything as a single tuple; readers must unpack it in this order.
# The file name keeps its existing spelling ("psuedopopulations") because
# it is the name written to disk.
aopy.data.base.pkl_write(
    f"{subject}_np_psuedopopulations",
    (
        random_group_info,
        column_group_info,
        depth_group_info,
        depth_group_info_by_site,
        pseudopopulation_metadata,
        unit_df,
    ),
    save_dir,
)

In [23]:
# Number of units in each of the first three depth windows
for idepth in range(3):
    print(len(depth_group_info[idepth]))


68
75
82


In [24]:
# Scratch checks.
# NOTE: a cell only displays its last expression, so the return value of this
# call is not shown.
aopy.utils.get_memory_available_gb()
# Type of one 'unit_labels' entry from the rows recorded on the first date.
# NOTE(review): the trailing [0] is applied to a filtered Series - presumably it
# relies on index label 0 being among those rows; confirm.
type(df['unit_labels'][df['date']==dates[0]][0])

numpy.ndarray