This notebook is based on: https://github.com/AllenInstitute/MicronsBinder/tree/master/notebooks/mm3_intro 

# setting up workspace

In [1]:
# import stuff
from caveclient import CAVEclient
import pandas as pd
import numpy as np
import scipy.spatial as sci_spatial
from scipy.spatial import distance_matrix
from tqdm import tqdm
import csv
import pickle #how to use pickle: https://www.datacamp.com/tutorial/pickle-python-tutorial 
import utils
from nglui import statebuilder
import plotly.figure_factory as ff
import networkx as nx
from itertools import chain, combinations
from scipy.cluster.hierarchy import dendrogram
import random

# client created without a datastack name; `client` is rebound to the
# 'minnie65_public_v117' datastack in a later cell
client = CAVEclient()

In [2]:
# uncomment the line below to request a new token if you have not done so before; keep it commented out otherwise
# client.auth.get_new_token()

# uncomment the line below to save your token; keep it commented out if the token has already been saved
# NOTE(review): a literal token used to be pasted here -- never commit real tokens; treat that one as leaked and revoke it
# client.auth.save_token(token="<YOUR_TOKEN>")

# double checking the token number 
# auth = client.auth
# print(f"My current token is: {auth.token}")

In [3]:
# connect to the 'minnie65_public_v117' datastack; the queries in the cells below go through this client
# there is no query support for minnie35: https://github.com/seung-lab/CAVEclient/issues/49 
client = CAVEclient('minnie65_public_v117')
# client2 = CAVEclient('minnie35_public_v0 ')

In [4]:
# list the tables that can be queried from the materialization engine of this datastack
client.materialize.get_tables()

['nucleus_detection_v0',
 'synapses_pni_2',
 'nucleus_neuron_svm',
 'proofreading_status_public_release',
 'func_unit_em_match_release',
 'allen_soma_ei_class_model_v1',
 'allen_visp_column_soma_coarse_types_v1']

Working to connect the pre-/post-synaptic graph. Source: https://github.com/AllenInstitute/MicronsBinder/blob/master/notebooks/mm3_intro/SynapseAndAnnotationQuery.ipynb 

In [5]:
# this shows you the basic information about this datastack within CAVE
# client.info.get_datastack_info()

In [6]:
# Load the proofreading status table: one row per proofread neuron,
# with separate status columns for the dendrite and the axon
prf_df=client.materialize.query_table('proofreading_status_public_release')

In [7]:
# number of rows in the proofreading status table
len(prf_df)

601

In [8]:
# preview the first rows to see which columns the table provides
prf_df.head()

Unnamed: 0,id,valid,pt_supervoxel_id,pt_root_id,valid_id,status_dendrite,status_axon,pt_position
0,1,t,89529934389098311,864691136296964635,864691136296964635,extended,non,"[179808, 216672, 23361]"
1,2,t,90584228533843146,864691136311986237,864691136311986237,extended,non,"[187840, 207232, 22680]"
2,3,t,89528353773943370,864691135355207119,864691135355207119,extended,non,"[180016, 204592, 22798]"
3,4,t,91077153340676495,864691135355207375,864691135355207375,extended,non,"[191424, 209888, 22845]"
4,5,t,88897234233461709,864691136422983727,864691136422983727,extended,non,"[175248, 220944, 23561]"


In [9]:
# axon proofreading status of every neuron in the proofreading table
statuses_axon = pd.Series(prf_df["status_axon"])
print(statuses_axon.size)

601


In [10]:
# not_non_axon_count = 0 
# for i in range(len(statuses_axon)):
#     if statuses_axon[i] != "non":
#         not_non_axon_count += 1
# print(not_non_axon_count)

In [11]:
# valid_id column of the proofreading table, before any filtering
raw_valid_ids = pd.Series(prf_df["valid_id"])

In [12]:
# number of ids before any filtering
len(raw_valid_ids)

601

In [13]:
# len(raw_valid_ids)

In [14]:
# Collect the valid_id of every row of the proofreading table.
# (An earlier version of this cell kept only the rows whose status_axon was
# not "non"; that filter is disabled here, so all rows are kept.)
valid_ids = [raw_valid_ids[row] for row in range(len(statuses_axon))]
print(len(valid_ids))

601


In [15]:
# verify that pt_root_id == valid_id for the selected neurons:
# count the selected ids that come from a table row whose pt_root_id equals
# its valid_id (the previous loop only counted the entries of valid_ids and
# never compared anything, so it could not detect a mismatch)
ids_with_matching_root = set(
    prf_df.loc[prf_df["pt_root_id"] == prf_df["valid_id"], "valid_id"]
)
verified_ids_len = sum(1 for the_id in valid_ids if the_id in ids_with_matching_root)
print('verified_ids_len', verified_ids_len)

verified_ids_len 601


In [16]:
# load the soma excitatory/inhibitory classification table (provides cell_type per pt_root_id)
neuron_type_df_coverall_notallright=client.materialize.query_table('allen_soma_ei_class_model_v1')

In [17]:
# preview the first rows of the cell-type table
neuron_type_df_coverall_notallright.head()

Unnamed: 0,id,valid,classification_system,cell_type,pt_supervoxel_id,pt_root_id,pt_position
0,485509,t,aibs_coarse_excitatory,excitatory,103588564537113366,864691136740606812,"[282608, 103808, 20318]"
1,113721,t,aibs_coarse_excitatory,excitatory,79951332685465031,864691135366988025,"[110208, 153664, 23546]"
2,263203,t,aibs_coarse_excitatory,excitatory,87694643458256575,864691135181741826,"[166512, 174176, 24523]"
3,456177,t,aibs_coarse_excitatory,excitatory,102677963354799688,864691135337690598,"[275616, 135120, 24873]"
4,364447,t,aibs_coarse_excitatory,excitatory,94449079618306553,864691136883828334,"[216064, 166800, 15025]"


In [18]:
# root ids and cell types of the classification table, as two row-aligned Series
ids_of_coverall_notallright_ser, types_of_coverall_notallright_ser = (
    pd.Series(neuron_type_df_coverall_notallright[column_name])
    for column_name in ("pt_root_id", "cell_type")
)

In [19]:
# keep the classified neurons that are both proofread (present in valid_ids)
# and labelled 'excitatory'
# membership is tested against a set: testing against the valid_ids list
# rescanned the whole list for every row of the classification table
valid_id_lookup = set(valid_ids)
ext_neuron_valid_ids_lst = [
    root_id
    for root_id, cell_type in zip(
        ids_of_coverall_notallright_ser.to_numpy(),
        types_of_coverall_notallright_ser.to_numpy(),
    )
    if cell_type == 'excitatory' and root_id in valid_id_lookup
]
print(len(ext_neuron_valid_ids_lst))

492


In [20]:
# from here on valid_ids means: proofread AND excitatory neurons
# (this rebinds the name that previously held all proofread ids)
valid_ids = ext_neuron_valid_ids_lst

In [21]:
# Two lookup tables that let a short integer stand in for the long-digit id of
# a pre-synaptic neuron:
#   num_rep_2_presypneuron: position in valid_ids -> neuron id
#   presypneuron_2_num_rep: neuron id -> position in valid_ids
num_rep_2_presypneuron = dict(enumerate(valid_ids))
presypneuron_2_num_rep = {
    neuron_id: position for position, neuron_id in enumerate(valid_ids)
}

In [22]:
#500 Server Error reported but should try again
# neuron_type_df_notcoverall_allright=client.materialize.query_table('allen_visp_column_soma_coarse_types_v1')

# working on sheezneat

In [23]:
def _pre_neuron_seq_around_synapse(row, syp_xyz_pos, syp_pos_tracking, syp_dict, kd_tree, radius):
    """Return ``(post_id, ordered_pre_ids)`` for the synapse stored at ``row``.

    The synapses within ``radius`` of the query synapse are looked up in
    ``kd_tree``; only those that share the query synapse's post-synaptic id
    are kept. The kept synapses are ordered by their distance from an anchor,
    which is one end of the most distant pair among them, and the
    pre-synaptic ids are returned in that order as a tuple.
    """
    # syp_dict[synapse_id][0] is used as a (pre_id, post_id) pair
    post_id = syp_dict[syp_pos_tracking[row]][0][1]
    neighbor_rows = kd_tree.query_ball_point(
        syp_xyz_pos[row], radius, p=2.0, eps=0, workers=3,
        return_sorted=None, return_length=False)

    pre_ids = []
    positions = []
    for neighbor_row in neighbor_rows:
        neighbor_pre_id, neighbor_post_id = syp_dict[syp_pos_tracking[neighbor_row]][0]
        if neighbor_post_id == post_id:
            pre_ids.append(neighbor_pre_id)
            # neighbor_row already is the row of this synapse in syp_xyz_pos, so
            # no search through syp_pos_tracking is needed
            # (assumes synapse ids in syp_pos_tracking are unique -- TODO confirm)
            positions.append(syp_xyz_pos[neighbor_row])

    # the query synapse is always its own neighbor, so positions is never empty
    dist = distance_matrix(positions, positions)
    anchor = np.where(dist == np.amax(dist))[0][0]
    order = np.argsort(dist[anchor])
    return post_id, tuple(np.array(pre_ids)[order])


def len_3_to_6_num_seqs_and_num_syps_of_n_randomly_chosen_valid_neurons_with_m_trials(n, m, valid_ids, radius=5e3, seed=None):
    """Count spatially ordered pre-synaptic sequences for random neuron samples.

    Each of the ``m`` trials draws ``n`` ids from ``valid_ids`` without
    replacement, fetches their synapse information through
    ``utils.creating_syp_information`` (using the notebook-level ``client``),
    and, for every synapse, builds the ordered tuple of pre-synaptic ids of the
    nearby synapses that share its post-synaptic id. Distinct tuples of length
    3, 4, 5 and 6 are then counted.

    Parameters
    ----------
    n : int
        Number of neurons sampled per trial.
    m : int
        Number of trials.
    valid_ids : sequence
        Neuron ids to sample from.
    radius : float, optional
        Neighborhood radius for the KD-tree query, in the units of the scaled
        positions (voxel index times [4, 4, 40]). Defaults to 5e3, the value
        that was previously hard-coded.
    seed : optional
        When given, sampling uses a private ``random.Random(seed)`` so a run
        can be reproduced. When None, the global ``random`` module state is
        used, exactly as before.

    Returns
    -------
    num_seqs : dict
        Keys 'len3'..'len6'; each value is a list with one entry per trial:
        the number of distinct sequences of that length.
    num_syps : list
        Number of synapses found in each trial.
    """
    sampler = random if seed is None else random.Random(seed)
    num_seqs = {'len3':[], 'len4':[], 'len5':[], 'len6':[]}
    num_syps = [] #running with all valid neuron the number is 105162
    conversion_array = np.array([4, 4, 40])

    for _ in tqdm(range(m)): #30, 60, 120, 240, 480
        #randomly choosing n elements without replacement
        chose_ones = sampler.sample(valid_ids, n)

        #getting the info of the sample
        syp_dict, syp_voxel_pos, syp_pos_tracking = utils.creating_syp_information(chose_ones, len(chose_ones), client)
        # the first row is dropped; the remaining rows are indexed in step with syp_pos_tracking
        syp_voxel_pos = np.delete(syp_voxel_pos, 0, 0)
        # scale all rows at once; cast back so the dtype matches the input array
        syp_xyz_pos = (syp_voxel_pos * conversion_array).astype(syp_voxel_pos.dtype, copy=False)
        num_syps.append(len(syp_voxel_pos))

        #setting up kd tree
        syp_pos_kd_tree = sci_spatial.KDTree(syp_xyz_pos, leafsize=3)

        # ordered pre-synaptic sequence -> post-synaptic ids it was seen with;
        # keying on the sequence keeps the same sequence from being counted
        # more than once
        seqs_with_post_neuron_lst = {}
        for row in range(np.shape(syp_xyz_pos)[0]):
            post_id, ordered_neur_ids = _pre_neuron_seq_around_synapse(
                row, syp_xyz_pos, syp_pos_tracking, syp_dict, syp_pos_kd_tree, radius)
            post_ids_seen = seqs_with_post_neuron_lst.setdefault(ordered_neur_ids, [])
            if post_id not in post_ids_seen:
                post_ids_seen.append(post_id)

        #### getting the seqs with diff len from 3 to 6
        for seq_len in range(3, 7):
            num_seqs['len'+str(seq_len)].append(
                sum(1 for key in seqs_with_post_neuron_lst if len(key) == seq_len))
    return num_seqs, num_syps

In [24]:
# 20 trials with 30 randomly sampled neurons each; both results are saved
# under their variable names so they can be reloaded later
num_seqs_of_30_neurons, num_syps_of_30_neurons = (
    len_3_to_6_num_seqs_and_num_syps_of_n_randomly_chosen_valid_neurons_with_m_trials(
        n=30, m=20, valid_ids=valid_ids
    )
)
utils.save_obj_with_name(num_seqs_of_30_neurons, 'num_seqs_of_30_neurons')
utils.save_obj_with_name(num_syps_of_30_neurons, 'num_syps_of_30_neurons')

100%|██████████████████████████████████████████████| 20/20 [02:10<00:00,  6.52s/it]


In [26]:
# 20 trials with 60 randomly sampled neurons each; both results are saved
# under their variable names so they can be reloaded later
num_seqs_of_60_neurons, num_syps_of_60_neurons = (
    len_3_to_6_num_seqs_and_num_syps_of_n_randomly_chosen_valid_neurons_with_m_trials(
        n=60, m=20, valid_ids=valid_ids
    )
)
utils.save_obj_with_name(num_seqs_of_60_neurons, 'num_seqs_of_60_neurons')
utils.save_obj_with_name(num_syps_of_60_neurons, 'num_syps_of_60_neurons')

100%|██████████████████████████████████████████████| 20/20 [03:45<00:00, 11.27s/it]


In [28]:
# 20 trials with 120 randomly sampled neurons each; both results are saved
# under their variable names so they can be reloaded later
num_seqs_of_120_neurons, num_syps_of_120_neurons = (
    len_3_to_6_num_seqs_and_num_syps_of_n_randomly_chosen_valid_neurons_with_m_trials(
        n=120, m=20, valid_ids=valid_ids
    )
)
utils.save_obj_with_name(num_seqs_of_120_neurons, 'num_seqs_of_120_neurons')
utils.save_obj_with_name(num_syps_of_120_neurons, 'num_syps_of_120_neurons')

100%|██████████████████████████████████████████████| 20/20 [08:26<00:00, 25.31s/it]


In [31]:
# 20 trials with 240 randomly sampled neurons each; both results are saved
# under their variable names so they can be reloaded later
num_seqs_of_240_neurons, num_syps_of_240_neurons = (
    len_3_to_6_num_seqs_and_num_syps_of_n_randomly_chosen_valid_neurons_with_m_trials(
        n=240, m=20, valid_ids=valid_ids
    )
)
utils.save_obj_with_name(num_seqs_of_240_neurons, 'num_seqs_of_240_neurons')
utils.save_obj_with_name(num_syps_of_240_neurons, 'num_syps_of_240_neurons')

100%|██████████████████████████████████████████████| 20/20 [20:51<00:00, 62.58s/it]


In [33]:
# 20 trials with 480 randomly sampled neurons each; both results are saved
# under their variable names so they can be reloaded later
num_seqs_of_480_neurons, num_syps_of_480_neurons = (
    len_3_to_6_num_seqs_and_num_syps_of_n_randomly_chosen_valid_neurons_with_m_trials(
        n=480, m=20, valid_ids=valid_ids
    )
)
utils.save_obj_with_name(num_seqs_of_480_neurons, 'num_seqs_of_480_neurons')
utils.save_obj_with_name(num_syps_of_480_neurons, 'num_syps_of_480_neurons')

100%|█████████████████████████████████████████████| 20/20 [59:03<00:00, 177.16s/it]


In [30]:
# Reload the saved per-trial results for every sample size, in the same order
# as they were saved (sequence counts first, then synapse counts).
# NOTE(review): presumably utils.load_obj_from_filename unpickles the file --
# only load files produced by this notebook; confirm against utils.
(
    num_seqs_of_30_neurons, num_syps_of_30_neurons,
    num_seqs_of_60_neurons, num_syps_of_60_neurons,
    num_seqs_of_120_neurons, num_syps_of_120_neurons,
    num_seqs_of_240_neurons, num_syps_of_240_neurons,
    num_seqs_of_480_neurons, num_syps_of_480_neurons,
) = (
    utils.load_obj_from_filename(result_name)
    for result_name in (
        'num_seqs_of_30_neurons', 'num_syps_of_30_neurons',
        'num_seqs_of_60_neurons', 'num_syps_of_60_neurons',
        'num_seqs_of_120_neurons', 'num_syps_of_120_neurons',
        'num_seqs_of_240_neurons', 'num_syps_of_240_neurons',
        'num_seqs_of_480_neurons', 'num_syps_of_480_neurons',
    )
)

In [None]:
# For each sequence length from 3 to 6, plot the empirical CDF of the number of
# post-synaptic ids recorded per sequence (x axis on a log scale, y axis
# limited to the upper half of the distribution).
# NOTE(review): `plt` is not imported in the import cell and
# `seqs_with_post_neuron_lst` is only assigned inside
# len_3_to_6_num_seqs_and_num_syps_of_n_randomly_chosen_valid_neurons_with_m_trials,
# so this cell presumably raises NameError on a fresh kernel -- confirm and
# either import matplotlib.pyplot and expose the dictionary, or remove the cell.
print('using radius = 5um for the kd tree')
legends = []
for i in range(3, 7):
    # one entry per sequence of length i: how many post-synaptic ids it was seen with
    occurences = [len(seqs_with_post_neuron_lst[key]) for key in seqs_with_post_neuron_lst if len(key)==i]
    occ_array = np.array(occurences)
    print('ttl number of seqs with len' + str(i) + ' = ' + str(len(occ_array)))
    # sorted values against evenly spaced fractions in [0, 1) gives the empirical CDF
    plt.plot(np.sort(occ_array), np.linspace(0, 1, len(occ_array), endpoint=False))
    plt.ylim([0.5, 1])
    plt.xscale("log")
    legends.append('len' + str(i))
plt.legend(legends)
plt.title("cdf of number of occurance on log scale")
plt.show()