# Database (db) creator and ID transfer to input file


This notebook loads Excel files and combines them into a database.

In [34]:
# Import packages
import os
from glob import glob
import pandas as pd
import fafbseg

In [35]:
# Some custom functions

def update_dataframe_single_column(source_df, target_df, reference_column):
    """Overwrite rows of ``target_df`` with the matching row of ``source_df``.

    Rows are matched on the values of ``reference_column``. When a key occurs
    several times in ``source_df`` the first non-null value of every column is
    used (``groupby(...).first()``). A matching target row is replaced column
    by column: columns that exist in both frames take the source value,
    columns that only exist in ``target_df`` become NaN, and source columns
    that ``target_df`` does not have are ignored. Target rows whose key is
    not found in ``source_df`` are left untouched.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Frame providing the new values; must contain ``reference_column``.
    target_df : pandas.DataFrame
        Frame to update; must contain ``reference_column``. It is modified
        in place.
    reference_column : str
        Name of the key column shared by both frames.

    Returns
    -------
    pandas.DataFrame
        ``target_df`` itself (the same object), after the update.
    """
    # One row per key; the key is restored as a regular column so that it is
    # written back together with the other columns.
    lookup = source_df.groupby(reference_column).first()
    lookup[reference_column] = lookup.index

    keys = target_df[reference_column]
    matched = keys.isin(lookup.index).to_numpy()

    if matched.any():
        # Align the replacement rows with the target's column order. Keys may
        # repeat in the target; the lookup index is unique, so reindex is safe.
        replacement = lookup.reindex(index=keys[matched].to_numpy(),
                                     columns=target_df.columns)
        # Positional assignment: correct even when the target index contains
        # repeated labels, where label-based ``.loc`` would hit several rows.
        target_df.iloc[matched.nonzero()[0]] = replacement.to_numpy()

    return target_df

### 1. Loading all data sets of interest in a loop

The original Excel files need to be stored in the same folder. All `.xlsx` files in that folder will be loaded.


In [22]:
# Data paths
# Choose the drive letter and the folder that holds the Excel files
PC_disc = 'D'
# Raw f-string: the backslashes are Windows path separators, not escape
# sequences (a plain f-string triggers invalid-escape warnings for \C, \F, ...)
dataPath = rf'{PC_disc}:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database'

# glob already returns the full path of every match, so no further joining is
# needed. Sorting makes the load order (and the database row order) reproducible.
fileName_ls = sorted(glob(os.path.join(dataPath, "*.xlsx")))


# Creating the database in a loop: one DataFrame per Excel file
df_ls = []
for fileName in fileName_ls:
    print(f'Importing: {fileName}')
    df = pd.read_excel(fileName)
    df_ls.append(df)

Importing: D:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\C2 proofreadings_20230906.xlsx
Importing: D:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\C3 proofreadings_20230906.xlsx
Importing: D:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Dm12 proofreadings_20230912.xlsx
Importing: D:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\L3 proofreadings_20231002.xlsx
Importing: D:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Mi4 proofreadings_20230912.xlsx
Importing: D:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Tm16 proofreadings_20230912.xlsx
Importing: D:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Tm9 proofreadings_20231002.xlsx


### 2. Creating, updating and filtering the database (db) of interest

In [55]:
### Creation
# Stack the per-file tables into one database
db = pd.concat(df_ls)
cell_types = db.symbol.unique()
print(f'\n\nCell types in the database: {cell_types}, total = {len(cell_types)}')

### Filtering and updating database
## Choosing the optic lobe of interest:
_hemisphere = 'R'
db_R = db.loc[db['hemisphere'] == _hemisphere].copy()

## Updating segment ids
# List of the IDs stored in the spreadsheets
curr_ID_ls = db_R["Updated_seg_id"].tolist()
# All IDs are updated in a single call
updated_ID_df = fafbseg.flywire.update_ids(
    curr_ID_ls,
    stop_layer=2,
    supervoxels=None,
    timestamp=None,
    dataset='production',
    progress=True,
)
# Store the refreshed IDs and their confidence as text, row by row
new_id_strings = updated_ID_df['new_id'].astype(str).tolist()
confidence_strings = updated_ID_df['confidence'].astype(str).tolist()
db_R['Updated_seg_ids'] = new_id_strings
db_R['Updated_confidence'] = confidence_strings



Cell types in the database: ['C2' 'C3' 'Dm12' 'L3' 'Mi4' 'Tm16' 'Tm9'], total = 7


Updating:   0%|          | 0/3377 [00:00<?, ?it/s]

### 3. Creating an input file of interest

In [42]:
# Selecting postsynaptic neuron of interest
neuron = 'Tm9'
neuron_df = db_R[db_R['symbol'] == neuron].copy()

# Filtering for valid segment ids based on a given criteria:
# backbone proofread ('Y') and inputs not proofread ('N')
neuron_selected_df = neuron_df[
    (neuron_df['backbone proofread (Y/N)'] == 'Y')
    & (neuron_df['inputs_proofread (Y/N)'] == 'N')
].copy()

print(f'\n\nTotal number of postsynaptic cells: {len(neuron_selected_df)}\n\n')

post_ID_ls = neuron_selected_df['Updated_seg_ids'].tolist()

# Fetching the neurons' inputs (synapses in which the selected cells are postsynaptic)
neurons_inputs = fafbseg.flywire.synapses.fetch_synapses(post_ID_ls, pre=False, post=True, attach=True,
                                             min_score=50, clean=True, transmitters=False,
                                             neuropils=True, batch_size=30,
                                             dataset='production', progress=True, mat="live")


# Counting inputs per postsynaptic ID.
# One small frame per postsynaptic cell is collected in a list and concatenated
# once at the end: DataFrame.append no longer exists in pandas >= 2.0, and
# growing a frame inside a loop is quadratic.
input_count_frames = []
for post_id, post_synapses in neurons_inputs.groupby('post', sort=False):
    # value_counts returns the presynaptic IDs sorted by descending count
    pre_counts = post_synapses['pre'].astype(str).value_counts()
    input_count_df = (
        pre_counts
        .rename('counts')
        .rename_axis('presynaptic_ID')
        .to_frame()
    )
    input_count_df['postsynaptic_ID'] = str(post_id)
    input_count_frames.append(input_count_df)

if input_count_frames:
    final_input_df = pd.concat(input_count_frames)
else:
    # No synapse returned: keep the expected layout so later cells do not fail on missing columns
    final_input_df = pd.DataFrame(columns=['counts', 'postsynaptic_ID'],
                                  index=pd.Index([], name='presynaptic_ID'))

# Everything as text, as expected by the following cells
input_count_str_df = final_input_df.astype(str)
input_count_str_df.head()




Total number of postsynaptic cells: 416




Fetching synapses:   0%|          | 0/14 [00:00<?, ?it/s]

Unnamed: 0_level_0,counts,postsynaptic_ID
presynaptic_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
720575940628842517,26,720575940617429131
720575940614885010,15,720575940617429131
720575940616172405,12,720575940617429131
720575940627905566,6,720575940617429131
720575940623276938,5,720575940617429131


### 4. Adding useful information to the data frame

In [45]:
# Selecting dataframe
# Refreshing the partner (presynaptic) IDs via fafbseg
partner_ID = input_count_str_df.index.tolist()
updated_ID_df = fafbseg.flywire.update_ids(
    partner_ID,
    stop_layer=2,
    supervoxels=None,
    timestamp=None,
    dataset='production',
    progress=True,
)
partner_ID_ls = updated_ID_df["new_id"].tolist()

# Identifying user-based annotations about cell identity;
# when a root ID has several annotations only the last one returned is kept
identification_df = fafbseg.flywire.find_celltypes(
    partner_ID_ls,
    user=None,
    exact=False,
    case=False,
    regex=True,
    update_roots=False,
)
identification_no_duplicates_df = identification_df.drop_duplicates(subset='pt_root_id', keep='last').copy()

# Function inputs
source_cols = ['guess', 'author', 'presynaptic_ID']
target_cols = ['guess', 'author', 'presynaptic_ID']
reference_column = 'presynaptic_ID'

# Source table: one annotation per root ID, every value converted to text
source_df = pd.DataFrame({
    'guess': identification_no_duplicates_df['tag'].tolist(),
    'author': identification_no_duplicates_df['user_id'].tolist(),
    'presynaptic_ID': identification_no_duplicates_df['pt_root_id'].tolist(),
})
source_df = source_df[source_cols].astype(str)

# Target table: one row per counted (presynaptic, postsynaptic) pair, with
# placeholder annotation columns that the helper fills in where a match exists
target_df = pd.DataFrame({'presynaptic_ID': input_count_str_df.index.astype(str)})
target_df = target_df.assign(guess=None, author=None)
target_df = target_df[source_cols].astype(str)


# Running the function and completing the dataset
result_df = update_dataframe_single_column(source_df, target_df, reference_column)
for carried_column in ('counts', 'postsynaptic_ID'):
    # Row order is unchanged, so the values can be copied over positionally
    result_df[carried_column] = input_count_str_df[carried_column].tolist()
result_df.head()

Updating:   0%|          | 0/14150 [00:00<?, ?it/s]

Unnamed: 0,guess,author,presynaptic_ID,counts,postsynaptic_ID
0,CT1,95,720575940628842517,26,720575940617429131
1,Mi4; Medullary intrinsic neuron 4; FBbt_00003779,392,720575940614885010,15,720575940617429131
2,Tm16; Transmedullary neuron 16; FBbt_00003804,392,720575940616172405,12,720575940617429131
3,Lamina monopolar 3; L3,96,720575940627905566,6,720575940617429131
4,"Transmedullary neuron 20, Tm20, [FBbt_00003808...",1185,720575940623276938,5,720575940617429131


### 5. Transferring information from the main database (db)

In [52]:
# Matching data types: the merge keys must be strings on both sides
db_R['Updated_seg_id'] = db_R['Updated_seg_id'].astype(str)  # kept so db_R looks the same to later cells
result_df['presynaptic_ID'] = result_df['presynaptic_ID'].astype(str)

# Lookup table: root ID -> cell type.
# The join uses 'Updated_seg_ids' (the IDs refreshed with update_ids), not the
# spreadsheet column 'Updated_seg_id': the presynaptic IDs come from the live
# synapse query, so a neuron edited after the spreadsheet was written would
# otherwise silently lose its symbol.
# NOTE(review): assumes the refreshed column is the intended key - confirm.
# Duplicated keys are dropped so the left merge cannot multiply rows.
symbol_lookup = (
    db_R[['Updated_seg_ids', 'symbol']]
    .astype({'Updated_seg_ids': str})
    .drop_duplicates(subset='Updated_seg_ids')
)

# Merging the DataFrames based on common values
merged_df = pd.merge(result_df, symbol_lookup, left_on='presynaptic_ID', right_on='Updated_seg_ids', how='left')

# Drop the extra key column brought in by the merge
merged_df = merged_df.drop(columns=['Updated_seg_ids'])

# A left merge against unique keys must keep exactly one row per input row
assert len(merged_df) == len(result_df)

# Display the merged DataFrame
display(merged_df)

Unnamed: 0,guess,author,presynaptic_ID,counts,postsynaptic_ID,symbol
0,CT1,95,720575940628842517,26,720575940617429131,
1,Mi4; Medullary intrinsic neuron 4; FBbt_00003779,392,720575940614885010,15,720575940617429131,Mi4
2,Tm16; Transmedullary neuron 16; FBbt_00003804,392,720575940616172405,12,720575940617429131,Tm16
3,Lamina monopolar 3; L3,96,720575940627905566,6,720575940617429131,
4,"Transmedullary neuron 20, Tm20, [FBbt_00003808...",1185,720575940623276938,5,720575940617429131,
...,...,...,...,...,...,...
14145,Mti_unknown_3,100,720575940616294194,1,720575940639232575,
14146,Centrifugal 3; C3,28,720575940629320704,1,720575940639232575,C3
14147,Tm16; Transmedullary neuron 16; FBbt_00003804,392,720575940627958168,1,720575940639232575,Tm16
14148,,,720575940633561566,1,720575940639232575,


### 6. Adding more useful information for later analysis

In [54]:
# Matching data types: the merge keys must be strings on both sides
neuron_df['Updated_seg_id'] = neuron_df['Updated_seg_id'].astype(str)  # kept so neuron_df looks the same to later cells
merged_df['postsynaptic_ID'] = merged_df['postsynaptic_ID'].astype(str)

# Lookup table: root ID -> optic lobe id and dorso-ventral label.
# The postsynaptic IDs were taken from 'Updated_seg_ids' (the IDs refreshed
# with update_ids), so that is the column to join on. Joining on the
# spreadsheet column 'Updated_seg_id' leaves the extra columns empty for every
# cell whose ID changed.
# Duplicated keys are dropped so the left merge cannot multiply rows.
position_lookup = (
    neuron_df[['Updated_seg_ids', 'optic_lobe_id', 'dorso-ventral']]
    .astype({'Updated_seg_ids': str})
    .drop_duplicates(subset='Updated_seg_ids')
)

# Merging the DataFrames based on common values
merged_2_df = pd.merge(merged_df, position_lookup, left_on='postsynaptic_ID', right_on='Updated_seg_ids', how='left')

# Drop the extra key column brought in by the merge
merged_2_df = merged_2_df.drop(columns=['Updated_seg_ids'])

# A left merge against unique keys must keep exactly one row per input row
assert len(merged_2_df) == len(merged_df)

# Display the merged DataFrame
display(merged_2_df)

Unnamed: 0,guess,author,presynaptic_ID,counts,postsynaptic_ID,symbol,optic_lobe_id,dorso-ventral
0,CT1,95,720575940628842517,26,720575940617429131,,R39,D
1,Mi4; Medullary intrinsic neuron 4; FBbt_00003779,392,720575940614885010,15,720575940617429131,Mi4,R39,D
2,Tm16; Transmedullary neuron 16; FBbt_00003804,392,720575940616172405,12,720575940617429131,Tm16,R39,D
3,Lamina monopolar 3; L3,96,720575940627905566,6,720575940617429131,,R39,D
4,"Transmedullary neuron 20, Tm20, [FBbt_00003808...",1185,720575940623276938,5,720575940617429131,,R39,D
...,...,...,...,...,...,...,...,...
14145,Mti_unknown_3,100,720575940616294194,1,720575940639232575,,R739,
14146,Centrifugal 3; C3,28,720575940629320704,1,720575940639232575,C3,R739,
14147,Tm16; Transmedullary neuron 16; FBbt_00003804,392,720575940627958168,1,720575940639232575,Tm16,R739,
14148,,,720575940633561566,1,720575940639232575,,R739,


### Saving 

In [57]:
# Saving data in your computer
PC_disc = 'D'
# Raw f-string: the backslashes are Windows path separators, not escape sequences
outDir = rf'{PC_disc}:\Connectomics-Data\FlyWire\Excels\min-score-50' # YOUR-PATH for saving excel file
save_excel_file = True

import datetime
x = datetime.datetime.now()
# Day, abbreviated month name and year, e.g. 02Oct2023
date_str = x.strftime("%d%b%Y")

if save_excel_file:
    # Create the output folder when it does not exist yet
    os.makedirs(outDir, exist_ok=True)

    ## Input count
    file_name = f'{neuron}_neurons_input_count_{_hemisphere}_{date_str}.xlsx'
    savePath = os.path.join(outDir, file_name)
    merged_2_df.to_excel(savePath, sheet_name='Buhmann synapses')