# Match cell ids to other cell ids for intracolumnar neurons

This notebook contains code for:
<br><br> A) Reordering ids based on a reference list
<br> B) Matching ids based on connectivity

#### Importing packages

In [1]:
import os
import pandas as pd
import numpy as np
import fafbseg
from fafbseg import flywire
from helpers.synapse_queries import combine_xyz



#### Defining some useful functions

In [2]:
def reorder_lists(list_A, list_B, list_B2):
    
    """
    Reorders the items of list_A so that they follow the order given by list_B.

    list_A and list_B2 are paired position by position (list_A[i] belongs to
    list_B2[i]). The entries of list_B2 are sorted by the position at which
    they first appear in list_B, and the paired items of list_A are returned
    in that order. Entries of list_B2 that share the same position keep their
    original relative order, and each keeps its own paired item of list_A.

    Parameters:
    list_A (list): The items to be reordered; list_A[i] is paired with list_B2[i].
    list_B (list): The target order of the keys. Items must be hashable.
    list_B2 (list): The keys of list_B in the order that matches list_A.
        Items must be hashable.

    Returns:
    list: A new list containing the items from list_A reordered according to the order of list_B.

    Raises:
    ValueError: If an item of list_B2 does not occur in list_B.
    IndexError: If list_A is shorter than list_B2.

    Example:
    >>> list_A = [1, 2, 3, 4, 5]
    >>> list_B = ['apple', 'banana', 'orange', 'grape', 'pear']
    >>> list_B2 = ['grape', 'banana', 'apple', 'orange', 'pear']
    >>> list_A2 = reorder_lists(list_A, list_B, list_B2)
    >>> print(list_A2)
    [3, 2, 4, 1, 5]
    """

    # Position of the first occurrence of every item of list_B. This gives the
    # same answer as list_B.index(item) but needs a single pass over list_B.
    first_position = {}
    for position, item in enumerate(list_B):
        first_position.setdefault(item, position)

    # Same failure mode as list.index for items that are absent from list_B
    for item in list_B2:
        if item not in first_position:
            raise ValueError(f'{item!r} is not in list')

    # Sort the *indices* of list_B2 rather than its values, so that repeated
    # values in list_B2 still map to their own entry of list_A.
    order = sorted(range(len(list_B2)), key=lambda i: first_position[list_B2[i]])

    list_A2 = [list_A[i] for i in order]

    return list_A2

### A) Reordering ids based on a reference list

In [47]:
# Excel workbook that holds the Tm1 / Tm4 id lists: folder, file name and full path
fileName = 'sort_Tm1_and_Tm4_lists.xlsx'
dataPath = r'C:\Connectomics-Data\FlyWire\Excels'
filePath = os.path.join(dataPath, fileName)

In [48]:
# Read the workbook into a DataFrame. Every column is loaded as text
# (dtype=str) - presumably so the long numeric ids stay exact; TODO confirm.
df = pd.read_excel(filePath, dtype=str)
df.head()

Unnamed: 0,Notes,optic_lobe_id,Tm1 ID RIGHT ORDER,Tm1 ID,Tm4 ID
0,,R3,720575940628535356,720575940623656935,720575940616866699
1,,R5,720575940625170360,720575940644820168,720575940643520200
2,,R6,720575940635695864,720575940641585460,720575940627872503
3,,R17,720575940624135242,720575940627774952,720575940628998460
4,,R18,720575940638748093,720575940617451238,720575940627052681


In [50]:
# Apply reorder_lists to the Tm1 / Tm4 table:
# 'Tm4 ID' is paired row by row with 'Tm1 ID', and the result follows the
# order given in 'Tm1 ID RIGHT ORDER'.
list_B = df['Tm1 ID RIGHT ORDER'].tolist()   # target order
list_B2 = df['Tm1 ID'].tolist()              # same keys, current order
list_A = df['Tm4 ID'].tolist()               # items to reorder

list_A2 = reorder_lists(list_A, list_B, list_B2)

# Store the reordered ids next to the reference column
df['Tm4 ID RIGHT ORDER'] = list_A2
df.head()

Unnamed: 0,Notes,optic_lobe_id,Tm1 ID RIGHT ORDER,Tm1 ID,Tm4 ID,Tm14 ID RIGHT ORDER
0,,R3,720575940628535356,720575940623656935,720575940616866699,720575940623233100
1,,R5,720575940625170360,720575940644820168,720575940643520200,720575940617633117
2,,R6,720575940635695864,720575940641585460,720575940627872503,720575940633696096
3,,R17,720575940624135242,720575940627774952,720575940628998460,720575940628538630
4,,R18,720575940638748093,720575940617451238,720575940627052681,720575940613481955


In [52]:
# Inspect the last rows of the table
df.tail()

Unnamed: 0,Notes,optic_lobe_id,Tm1 ID RIGHT ORDER,Tm1 ID,Tm4 ID,Tm14 ID RIGHT ORDER
206,,R675,720575940612294835,720575940625713907,7.205759406365216e+17,7.205759406227758e+17
207,,R683,720575940614365343,720575940617570193,7.205759406177074e+17,7.205759406180415e+17
208,,R688,720575940640734883,720575940626245932,7.205759406270351e+17,
209,,R689,720575940630797164,720575940634995225,,
210,,R0,0,0,,


In [55]:
# Creating string for the date (day, abbreviated month, year), used in the sheet name
import datetime
x = datetime.datetime.now()
date_str = x.strftime("%d%b%Y")

# Writing in an existing excel file.
# The workbook is opened in append mode so its existing sheets are kept and a
# new sheet is added. This replaces the older recipe of assigning `writer.book`
# and calling `writer.save()`, both of which were removed in pandas 2.0.
# The `with` block saves and closes the file even if `to_excel` fails.
# if_sheet_exists='new' (pandas >= 1.3) adds another sheet instead of raising
# when a sheet with today's name already exists.
df = df.astype(str)  # every value is written as text
with pd.ExcelWriter(filePath, engine='openpyxl', mode='a', if_sheet_exists='new') as writer:
    df.to_excel(writer, sheet_name='Updated_table_' + date_str)

### B) Matching ids based on connectivity

### 1. Loading curated data sets

In [9]:
# Loading information from excel files: one proofreading table for the
# presynaptic neuron type and one for the postsynaptic neuron type
PC_disc = 'D'
# Raw f-string: the backslashes of the Windows path are taken literally
# instead of being parsed as (invalid) escape sequences such as '\C' or '\d'
dataPath = rf'{PC_disc}:\Connectomics-Data\FlyWire\Excels\drive-data-sets'
date = '20231002'
pre_neuron_type = 'L3'
post_neuron_type = 'Tm9'
fileName_post = f'{post_neuron_type} proofreadings_{date}.xlsx'
filePath_post = os.path.join(dataPath, fileName_post)
fileName_pre = f'{pre_neuron_type} proofreadings_{date}.xlsx'
filePath_pre = os.path.join(dataPath, fileName_pre)

# Loading files as DataFrames
post_df = pd.read_excel(filePath_post)
pre_df = pd.read_excel(filePath_pre)

#### 1.1 Selecting data

In [10]:
# Selection criteria
hemisphere = 'R' # 'R', 'L'
neuropile_mesh = 'ME_L'  # neuropil label used later to filter the synapses

# Keep only the rows of each table that belong to the chosen hemisphere
R_pre_df = pre_df.loc[pre_df['hemisphere'] == hemisphere].copy()
R_post_df = post_df.loc[post_df['hemisphere'] == hemisphere].copy()

# Segment ids of the selected rows
pre_ids = R_pre_df['Updated_seg_id'].tolist()
post_ids = R_post_df['Updated_seg_id'].tolist()

print(f'Total number of presynaptic ids: {len(pre_ids)}')
print(f'Total number of postsynaptic ids: {len(post_ids)}')

Total number of presynaptic ids: 708
Total number of postsynaptic ids: 766


### 2. Matching all presynaptic cell ids connecting to each postsynaptic cell

In [11]:
# Make sure the presynaptic ids are current before matching.
# np.unique sorts its result, so element [0] is only truthy when no id was
# reported as outdated.
if np.unique(flywire.is_latest_root(pre_ids))[0]:
    # Nothing to update
    up_to_date_pre_ids = pre_ids
    print('Good to go and match ids')
else:
    # At least one id is outdated: ask for the new ids
    update_df = flywire.update_ids(pre_ids, stop_layer=2, supervoxels=None, timestamp=None, dataset='production', progress=True)
    up_to_date_pre_ids = update_df['new_id'].tolist()
    if (update_df['confidence'] < 1).any():
        # Some updates are not fully confident: show the ids that changed
        display(update_df[update_df['changed'] == True])
        print('Consider updating your presynaptic neuron ids in the original data set')
    else:
        print('Good to go and match ids')

Good to go and match ids


In [12]:
## Getting all synaptic connections
# Updates the postsynaptic ids if needed, fetches their input synapses, keeps
# the synapses located in `neuropile_mesh` that come from the selected
# presynaptic cells, and counts the contacts of every pre-post pair.

# Quick update
if not np.unique(flywire.is_latest_root(post_ids))[0]: # if not up-to-date
    #Updating the IDs via Fafbseg
    update_df = flywire.update_ids(post_ids, stop_layer=2, supervoxels=None, timestamp=None, dataset='production', progress=True)
    # Plain list, like `post_ids` in the other branch and like `up_to_date_pre_ids`
    up_to_date_post_ids = update_df['new_id'].tolist()
    if len(update_df[update_df['confidence']< 1]) == 0:
        print('Good to go and match ids')
    else:
        display(update_df[update_df['changed'] == True])
        print('Consider updating your postsynaptic neuron ids in the original data set')
else:
    up_to_date_post_ids = post_ids
    print('Good to go and match ids')

# Fetch the neuron's inputs
post_inputs = flywire.synapses.fetch_synapses(up_to_date_post_ids, pre=False, post=True, attach=True, 
                                             min_score=50, clean=True, transmitters=False, 
                                             neuropils=True, batch_size=30, 
                                             dataset='production', progress=True,mat= "live")

# Combining pre- and postsynapse XYZ values in single columns
# (the return value is not used, so the helper presumably modifies
# post_inputs in place - TODO confirm in helpers.synapse_queries)
combine_xyz(post_inputs)

# Filtering: keeping only synapses whose neuropil matches neuropile_mesh
post_inputs = post_inputs[post_inputs['neuropil'] == neuropile_mesh].copy()

# Filter connections: just the selected presynaptic cells
pre_post_match_df = post_inputs[post_inputs['pre_pt_root_id'].isin(up_to_date_pre_ids)].copy()

# Aggregating data frame based on unique post and pre segment IDs
# While aggregating, counting the number of contacts for each pre-post pair
pre_post_counts = pre_post_match_df.groupby(['post_pt_root_id', 'pre_pt_root_id'])['pre_pt_root_id'].count().reset_index(name='pre_syn_count')
display(pre_post_counts.head())

Good to go and match ids


Fetching synapses:   0%|          | 0/26 [00:00<?, ?it/s]

Unnamed: 0,post_pt_root_id,pre_pt_root_id,pre_syn_count
0,720575940600084489,720575940620369243,39
1,720575940600084489,720575940628226310,2
2,720575940602880736,720575940613776365,1
3,720575940602880736,720575940632981293,35
4,720575940603557920,720575940616803513,32


In [13]:
## For every postsynaptic cell, keep the presynaptic partner with the most synaptic contacts

# Row label of the largest pre_syn_count within each post_pt_root_id group
syn_count_by_post = pre_post_counts.groupby('post_pt_root_id')['pre_syn_count']
idx = syn_count_by_post.idxmax()

# One row per postsynaptic id: its most strongly connected presynaptic id
pre_post_counts_max = pre_post_counts.loc[idx].copy()
display(pre_post_counts_max.head())

# Summary of how many ids went in and how many found a partner
print(f'Total number of presynpatic ids: {len(up_to_date_pre_ids)}')
print(f'Total number of postsynpatic ids: {len(up_to_date_post_ids)}')
print(f'Total number of ids with a pre-post match: {len(pre_post_counts_max)}')


Unnamed: 0,post_pt_root_id,pre_pt_root_id,pre_syn_count
0,720575940600084489,720575940620369243,39
3,720575940602880736,720575940632981293,35
4,720575940603557920,720575940616803513,32
6,720575940604080318,720575940626822960,31
7,720575940604668094,720575940614277357,23


Total number of presynpatic ids: 708
Total number of postsynpatic ids: 766
Total number of ids with a pre-post match: 665


### 3. Merging tables to match optic_lobe_ids in both pre and post databases

In [14]:
# Adding info about optic lobe ids (ids need to be up-to-date in the database):
# each post-pre pair receives the optic_lobe_id of its postsynaptic cell
pre_optic_lobe_ids = pd.merge(pre_post_counts_max, R_post_df[['optic_lobe_id', 'Updated_seg_id']], left_on='post_pt_root_id', right_on='Updated_seg_id', how='left')

# Taking just the pre - post most connected pair (some pre can give input to more than one post)
idx = pre_optic_lobe_ids.groupby('pre_pt_root_id')['pre_syn_count'].idxmax()
pre_optic_lobe_ids_max = pre_optic_lobe_ids.loc[idx].copy()

# Getting a dataframe where post cells are orphan (no strong pre cell connected specifically to that post)
pre_optic_lobe_ids_not_max = pre_optic_lobe_ids.loc[~pre_optic_lobe_ids.index.isin(idx)].copy()

# Briefly checking for duplicates
pre_optic_lobe_ids_max_duplicates = pre_optic_lobe_ids_max[pre_optic_lobe_ids_max.duplicated(subset='pre_pt_root_id', keep=False)]

# Briefly getting all postsynaptic ids with no match.
# The matched ids are collected once in a set, so every membership test is a
# constant-time lookup instead of rebuilding and scanning a list per id.
# This must run before 'post_pt_root_id' is dropped below.
matched_post_ids = set(pre_optic_lobe_ids_max['post_pt_root_id'].tolist())
not_matched_post_ids = [item for item in up_to_date_post_ids if item not in matched_post_ids]

# Discarding redundant columns (only pre_pt_root_id and optic_lobe_id remain)
pre_optic_lobe_ids_max = pre_optic_lobe_ids_max.drop(columns=['post_pt_root_id', 'Updated_seg_id', 'pre_syn_count'])
display(pre_optic_lobe_ids_max.head())


Unnamed: 0,pre_pt_root_id,optic_lobe_id
206,720575940604889568,R368
199,720575940605268140,R395
324,720575940605273772,R81
449,720575940605345410,R424
212,720575940605586622,R53


In [15]:
# Try to find strongly-connected pre cells in the columns of these post cells
display(pre_optic_lobe_ids_not_max)
print('Find better pre for these post cells')


Unnamed: 0,post_pt_root_id,pre_pt_root_id,pre_syn_count,optic_lobe_id,Updated_seg_id
5,720575940604810208,720575940628215378,1,R307,720575940604810208
8,720575940605034161,720575940612345038,1,R158,720575940605034161
50,720575940610445922,720575940632452626,6,R132,720575940610445922
118,720575940615329602,720575940620629697,1,R676,720575940615329602
136,720575940616410909,720575940627401561,1,R63,720575940616410909
143,720575940616954297,720575940627832392,3,R182,720575940616954297
166,720575940617634772,720575940634800888,1,R578,720575940617634772
176,720575940618287064,720575940619123483,3,R384,720575940618287064
180,720575940618339284,720575940627937301,8,R690,720575940618339284
197,720575940619730590,720575940620949089,15,R275,720575940619730590


Find better pre for these post cells


### 4. Transferring the optic_lobe_id information to the presynaptic cell database

In [16]:
# Keep a single row per presynaptic id so the left merge cannot multiply rows
pre_optic_lobe_ids_max_no_duplicates = pre_optic_lobe_ids_max.drop_duplicates(subset='pre_pt_root_id')

# Attach the matched optic_lobe_id to every presynaptic cell of the table.
# Both tables carry an 'optic_lobe_id' column, so the result holds
# 'optic_lobe_id_x' (from R_pre_df) and 'optic_lobe_id_y' (from the match).
R_pre_df_new = R_pre_df.merge(
    pre_optic_lobe_ids_max_no_duplicates,
    how='left',
    left_on='Updated_seg_id',
    right_on='pre_pt_root_id',
)

R_pre_df_new.columns

Index(['XYZ-ME', 'symbol', 'XYZ-LO', 'quality comment', 'rim_area (Y/N)',
       'hemisphere', 'lab', 'author', 'name', 'seg_id', 'identified_in',
       'Updated_seg_id', 'Update_confidence', 'lab authorship (Y/N)',
       'optic_lobe_id_x', 'patch_id', 'column_id', 'backbone proofread (Y/N)',
       'twigs proofread (Y/N)', 'inputs_proofread (Y/N)', 'dark_neurons (Y/N)',
       'bad_quality_rim (Y/N)', 'healthy_L3 (Y/N)', 'detached_lamina (Y/N)',
       'pre_pt_root_id', 'optic_lobe_id_y'],
      dtype='object')

### 5. Saving the updated dataframe back to the database file

In [17]:
# Creating string for the date (day, abbreviated month, year), used in the sheet name
import datetime
x = datetime.datetime.now()
date_str = x.strftime("%d%b%Y")

# Writing in an existing excel file.
# The workbook is opened in append mode so its existing sheets are kept and a
# new sheet is added. This replaces the older recipe of assigning `writer.book`
# and calling `writer.save()`, both of which were removed in pandas 2.0.
# The `with` block saves and closes the file even if `to_excel` fails.
# if_sheet_exists='new' (pandas >= 1.3) adds another sheet instead of raising
# when a sheet with today's name already exists.
with pd.ExcelWriter(filePath_pre, engine='openpyxl', mode='a', if_sheet_exists='new') as writer:
    R_pre_df_new.to_excel(writer, sheet_name='Updated_table_' + date_str)