# Database (db) creator and ID trasnsfer to input file


This notebook loads excel files and combines them into a database

In [1]:
# Import packages
import os
from glob import glob
import pandas as pd
import fafbseg



In [2]:
# Some custom functions

def update_dataframe_single_column(source_df, target_df, reference_column):
    # Create a dictionary mapping from the reference column to the source DataFrame
    reference_dict = source_df.groupby(reference_column).first().reset_index().to_dict(orient='records')
    reference_dict = {row[reference_column]: row for row in reference_dict}

    # Update the target DataFrame based on the reference column
    for i, row in target_df.iterrows():
        ref = row[reference_column]
        if ref in reference_dict:
            source_row = reference_dict[ref]
            target_df.loc[i] = source_row

    return target_df

### 1. Loading all data sets of interest in a loop

The original excell files need tzo be stored in the same folder. All files in that folder will be loaded


In [3]:
# Data paths
# Choose path and file
PC_disc = 'E'
dataPath = f'{PC_disc}:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database'

fileName_ls = glob(dataPath +"\\"+ "*.xlsx")


#Creating the database in a loop
df_ls = []
for fileName in fileName_ls:
    print(f'Importing: {fileName}')
    filePath = os.path.join(dataPath,fileName)
    df = pd.read_excel(filePath)
    #Dropping the fisrt row ('asdf' was added as a walk-around to set that column values as type str)
    if df["seg_id"][0] == 'asdf': 
        df = df.iloc[1: , :]
        df.reset_index(inplace=True,drop=True)
    df_ls.append(df)

Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\C2 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\C3 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Dm1 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Dm10 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Dm12 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Dm15 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Dm2 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Dm4 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\Dm6 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\drive-data-sets\database\L1 proofreadings.xlsx
Importing: E:\Connectomics-Data\FlyWire\Excels\dri

### 2. Creating, updating and filtering the database (db) of interest

In [4]:
### Creation
db = pd.concat(df_ls)
print(f'\n\nCell types in the database: {db.symbol.unique()}, total = {len(db.symbol.unique())}')

## Removing any Nan columns
db = db[db["Updated_seg_id"].notna()]

### Filtering and updating database
## Chossing optic lobe of interest:
_hemisphere = 'R'
db_R = db[db.hemisphere != 'L'].copy()

## Updating segmnet ids
#Getting the lists of IDs to update
curr_ID_ls = db_R["Updated_seg_id"].tolist()
curr_ID_ls = db_R["seg_id"].tolist()
#Updating all IDs at once
updated_ID_df = fafbseg.flywire.update_ids(curr_ID_ls, stop_layer=2, supervoxels=None, timestamp=None, 
                                           dataset='production', progress=True)
db_R['Updated_seg_ids'] = updated_ID_df['new_id'].astype(str).tolist()
db_R['Updated_confidence'] = updated_ID_df['confidence'].astype(str).tolist()



Cell types in the database: ['C2' 'C3' 'Dm1' 'Dm10' 'Dm12' 'Dm15' 'Dm2a' 'Dm2b' 'Dm2c' 'Dm4' 'Dm6'
 'L1' 'L2' 'L3' 'L4' 'L5' 'Me-Lo-2-N.I.' 'Mi1' 'Mi10' 'Mi13' 'Mi4' 'Mi9'
 'ML1' 'Pm1_Tm1' 'Pm1_Tm2' 'Pm1a_Tm1' 'Pm1a_Mi1' 'Pm2' 'Pm3' 'Pm4'
 'OA-AL2b2-L1' 'OA-AL2b2-L2' 'OA-AL2b2-R1' 'OA-AL2b2-R2' 'CT1' 'pMP7'
 'PS125' 'R7' 'R8' 'T1' 'T3' 'T4a' 'T4b' 'T4c' 'T4d' 'Tm1' 'Tm1/2' 'Tm16'
 'Tm2' 'Tm20' 'Tm20?' 'Tm3' 'Tm4' 'Tm4a' 'Tm9' 'Tm10' 'Tm11' 'Tm12'
 'TmY10' 'TmY15' 'TmY17'], total = 61


Updating:   0%|          | 0/21757 [00:00<?, ?it/s]

In [5]:
db_R['symbol'].unique()

array(['C2', 'C3', 'Dm1', 'Dm10', 'Dm12', 'Dm15', 'Dm2a', 'Dm2b', 'Dm2c',
       'Dm4', 'Dm6', 'L1', 'L2', 'L3', 'L4', 'L5', 'Me-Lo-2-N.I.', 'Mi1',
       'Mi10', 'Mi13', 'Mi4', 'Mi9', 'ML1', 'Pm1_Tm1', 'Pm1_Tm2',
       'Pm1a_Tm1', 'Pm1a_Mi1', 'Pm2', 'Pm3', 'Pm4', 'OA-AL2b2-L1',
       'OA-AL2b2-L2', 'OA-AL2b2-R1', 'OA-AL2b2-R2', 'CT1', 'pMP7',
       'PS125', 'R7', 'R8', 'T1', 'T3', 'T4a', 'T4b', 'T4c', 'T4d', 'Tm1',
       'Tm1/2', 'Tm16', 'Tm2', 'Tm20', 'Tm20?', 'Tm3', 'Tm4', 'Tm4a',
       'Tm9', 'TmY10', 'TmY15', 'TmY17'], dtype=object)

### 3. Creating a input file of interest

In [20]:
#Selecting postsynaptic neuron of interest
neuron = 'C3'
neuron_df = db_R[db_R['symbol'] == neuron].copy()

#Filtering for valid segment ids based on a given criteria
neuron_selected_df = neuron_df.copy()
neuron_selected_df = neuron_selected_df[neuron_selected_df['backbone proofread (Y/N)'] == 'Y'].copy()
neuron_selected_df = neuron_selected_df.iloc[:135].copy() # Temp just for C3
#neuron_selected_df = neuron_selected_df[neuron_selected_df['inputs_proofread (Y/N)'] == 'N'].copy()

print(f'\n\nTotal number of postsynaptic cells: {len(neuron_selected_df)}\n\n')

ID_ls = neuron_selected_df['Updated_seg_ids'].tolist()

#Fetching the neuron's inputs and putputs
neurons_inputs = fafbseg.flywire.synapses.fetch_synapses(ID_ls, pre=False, post=True, attach=True, 
                                             min_score=50, clean=True, transmitters=False, 
                                             neuropils=True, batch_size=30, 
                                             dataset='production', progress=True,mat= "live")

neurons_outputs = fafbseg.flywire.synapses.fetch_synapses(ID_ls, pre=True, post=False, attach=True, 
                                             min_score=50, clean=True, transmitters=False, 
                                             neuropils=True, batch_size=30, 
                                             dataset='production', progress=True,mat= "live")


#Counting inputs per ID, option joining dataframes
final_input_df = pd.DataFrame()
for n in neurons_inputs['post'].unique():
    inputs_count = {}
    curr_inputs = neurons_inputs[neurons_inputs['post'] == n]
    inputs_str = curr_inputs.applymap(str)
    
    for c in inputs_str['pre'].to_list():
        inputs_count[c] = inputs_count.get(c, 0) + 1
    input_count_df = pd.DataFrame(inputs_count, index=[0])
    input_count_df = input_count_df.T
    input_count_df.rename(columns={0: "counts"},inplace=True)
    input_count_df.index.names = ['presynaptic_ID']
    input_count_df = input_count_df.sort_values(by="counts",ascending=False)
    input_count_df['postsynaptic_ID'] = inputs_str['post'].to_list()[0:len(input_count_df)]
    final_input_df = final_input_df.append(input_count_df)
    #print(f'Counting done for: {n}')
input_count_str_df = final_input_df.applymap(str)
print('INPUTS: ')
display(input_count_str_df.head())


#Counting outputs per ID, option joining dataframes
final_output_df = pd.DataFrame()
for n in neurons_outputs['pre'].unique():
    outputs_count = {}
    curr_outputs = neurons_outputs[neurons_outputs['pre'] == n]
    outputs_str = curr_outputs.applymap(str)
    
    for c in outputs_str['post'].to_list():
        outputs_count[c] = outputs_count.get(c, 0) + 1
    output_count_df = pd.DataFrame(outputs_count, index=[0])
    output_count_df = output_count_df.T
    output_count_df.rename(columns={0: "counts"},inplace=True)
    output_count_df.index.names = ['postsynaptic_ID']
    output_count_df = output_count_df.sort_values(by="counts",ascending=False)
    output_count_df['presynaptic_ID'] = outputs_str['pre'].to_list()[0:len(output_count_df)]
    final_output_df = final_output_df.append(output_count_df)
    #print(f'Counting done for: {n}')
output_count_str_df = final_output_df.applymap(str)
print('OUTPUTS: ')
display(output_count_str_df.head())




Total number of postsynaptic cells: 135




Fetching synapses:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching synapses:   0%|          | 0/5 [00:00<?, ?it/s]

INPUTS: 


Unnamed: 0_level_0,counts,postsynaptic_ID
presynaptic_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
720575940611749233,42,720575940620786342
720575940616464434,22,720575940620786342
720575940620233899,12,720575940620786342
720575940622931992,11,720575940620786342
720575940623331208,10,720575940620786342


OUTPUTS: 


Unnamed: 0_level_0,counts,presynaptic_ID
postsynaptic_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
720575940618171878,84,720575940627019876
720575940629905487,79,720575940627019876
720575940620814171,18,720575940627019876
720575940637576158,16,720575940627019876
720575940611803000,11,720575940627019876


### 4. Adding useful information to the data frame

In [21]:
## For INPUTS


# Selecting dataframe
#Updating the IDs via Fafbseg
partner_ID = input_count_str_df.index.tolist()
updated_ID_df = fafbseg.flywire.update_ids(partner_ID, stop_layer=2, supervoxels=None, timestamp=None, dataset='production', progress=True)
partner_ID_ls = updated_ID_df["new_id"].tolist()

# Identifying user-based annotations about cell identity

identification_df = fafbseg.flywire.find_celltypes(partner_ID_ls, user=None, exact=False, case=False, regex=True, update_roots=False)
identification_no_duplicates_df = identification_df.drop_duplicates(subset='pt_root_id', keep='last', inplace=False, ignore_index=False).copy()

# Adding info to the current data set

# Selecting dataframes and resetting index
source_df = identification_no_duplicates_df.copy()
source_df.reset_index(inplace = True, drop = True)
target_df = input_count_str_df.copy()
target_df.reset_index(inplace = True, drop = True)


# Adding columns for the function to properly work
target_df['presynaptic_ID'] = input_count_str_df.index.astype(str)
source_df['presynaptic_ID'] = identification_no_duplicates_df['pt_root_id'].tolist()
target_df['guess'] = None
source_df['guess'] = identification_no_duplicates_df['tag'].tolist()
target_df['author'] = None
source_df['author'] = identification_no_duplicates_df['user_id'].tolist()

# Function inputs
source_cols = ['guess', 'author','presynaptic_ID']
target_cols = ['guess', 'author', 'presynaptic_ID']
reference_column = 'presynaptic_ID'

source_df = source_df[source_cols].copy()
target_df = target_df[source_cols].copy()

source_df = source_df.astype(str)
target_df = target_df.astype(str)


# Running the function and compleating the dataset
result_input_df = update_dataframe_single_column(source_df, target_df,reference_column)
result_input_df['counts'] = input_count_str_df['counts'].tolist()
result_input_df['postsynaptic_ID'] = input_count_str_df['postsynaptic_ID'].tolist()
result_input_df.head()

Updating:   0%|          | 0/6617 [00:00<?, ?it/s]

Unnamed: 0,guess,author,presynaptic_ID,counts,postsynaptic_ID
0,Lamina monopolar 1; L1,96,720575940611749233,42,720575940620786342
1,Lamina monopolar 5; L5,96,720575940616464434,22,720575940620786342
2,Mi1,100,720575940620233899,12,720575940620786342
3,Centrifugal 2; C2; FBbt_00003743,392,720575940622931992,11,720575940620786342
4,Lamina monopolar 3; L3,96,720575940623331208,10,720575940620786342


In [22]:
## For OUTPUTS

# Selecting dataframe
#Updating the IDs via Fafbseg
partner_ID = output_count_str_df.index.tolist()
updated_ID_df = fafbseg.flywire.update_ids(partner_ID, stop_layer=2, supervoxels=None, timestamp=None, dataset='production', progress=True)
partner_ID_ls = updated_ID_df["new_id"].tolist()

# Identifying user-based annotations about cell identity

identification_df = fafbseg.flywire.find_celltypes(partner_ID_ls, user=None, exact=False, case=False, regex=True, update_roots=False)
identification_no_duplicates_df = identification_df.drop_duplicates(subset='pt_root_id', keep='last', inplace=False, ignore_index=False).copy()

# Adding info to the current data set

# Selecting dataframes and resetting index
source_df = identification_no_duplicates_df.copy()
source_df.reset_index(inplace = True, drop = True)
target_df = output_count_str_df.copy()
target_df.reset_index(inplace = True, drop = True)


# Adding columns for the function to properly work
target_df['postsynaptic_ID'] = output_count_str_df.index.astype(str)
source_df['postsynaptic_ID'] = identification_no_duplicates_df['pt_root_id'].tolist()
target_df['guess'] = None
source_df['guess'] = identification_no_duplicates_df['tag'].tolist()
target_df['author'] = None
source_df['author'] = identification_no_duplicates_df['user_id'].tolist()

# Function inputs
source_cols = ['guess', 'author','postsynaptic_ID']
target_cols = ['guess', 'author', 'postsynaptic_ID']
reference_column = 'postsynaptic_ID'

source_df = source_df[source_cols].copy()
target_df = target_df[source_cols].copy()

source_df = source_df.astype(str)
target_df = target_df.astype(str)


# Running the function and compleating the dataset
result_output_df = update_dataframe_single_column(source_df, target_df,reference_column)
result_output_df['counts'] = output_count_str_df['counts'].tolist()
result_output_df['presynaptic_ID'] = output_count_str_df['presynaptic_ID'].tolist()
result_output_df.head()

Updating:   0%|          | 0/25069 [00:00<?, ?it/s]

Unnamed: 0,guess,author,postsynaptic_ID,counts,presynaptic_ID
0,Lamina monopolar 2; L2,96,720575940618171878,84,720575940627019876
1,T1,14,720575940629905487,79,720575940627019876
2,Tm1; Transmedullary neuron 1; FBbt_00003789,392,720575940620814171,18,720575940627019876
3,Lamina monopolar 5; L5,96,720575940637576158,16,720575940627019876
4,Dm1; Distal medullary amacrine neuron 1; FBbt_...,392,720575940611803000,11,720575940627019876


### 5. Transfering information from the main database (db)

In [23]:
## For INPUTS

# Matcing data types
db_R['Updated_seg_id'] = db_R['Updated_seg_id'].astype(str)
result_input_df['presynaptic_ID'] = result_input_df['presynaptic_ID'].astype(str)

# Merging the DataFrames based on common values
merged_input_df = pd.merge(result_input_df, db_R[['Updated_seg_id', 'symbol']], left_on='presynaptic_ID', right_on='Updated_seg_id', how='left')

# Drop the extra 'seg_id' column
merged_input_df.drop(columns=['Updated_seg_id'], inplace=True)

# Display the merged DataFrame
print('For INPUTS')
display(merged_input_df)


## For OUTPUTS

# Matcing data types
db_R['Updated_seg_id'] = db_R['Updated_seg_id'].astype(str)
result_output_df['postynaptic_ID'] = result_output_df['postsynaptic_ID'].astype(str)

# Merging the DataFrames based on common values
merged_output_df = pd.merge(result_output_df, db_R[['Updated_seg_id', 'symbol']], left_on='postsynaptic_ID', right_on='Updated_seg_id', how='left')

# Drop the extra 'seg_id' column
merged_output_df.drop(columns=['Updated_seg_id'], inplace=True)

# Display the merged DataFrame
print('For OUTPUTS')
display(merged_output_df)

For INPUTS


Unnamed: 0,guess,author,presynaptic_ID,counts,postsynaptic_ID,symbol
0,Lamina monopolar 1; L1,96,720575940611749233,42,720575940620786342,L1
1,Lamina monopolar 5; L5,96,720575940616464434,22,720575940620786342,L5
2,Mi1,100,720575940620233899,12,720575940620786342,Mi1
3,Centrifugal 2; C2; FBbt_00003743,392,720575940622931992,11,720575940620786342,C2
4,Lamina monopolar 3; L3,96,720575940623331208,10,720575940620786342,L3
...,...,...,...,...,...,...
6687,Lamina monopolar 5; L5,96,720575940630893817,1,720575940633319628,L5
6688,Tm2: Transmedullary neuron 2; FBbt_00003790,392,720575940631458252,1,720575940633319628,Tm2
6689,Transmedullary 25; Tm25,103,720575940621823103,1,720575940633319628,
6690,Tm4; Transmedullary neuron 4; FBbt_00003792,392,720575940620063435,1,720575940633319628,Tm4


For OUTPUTS


Unnamed: 0,guess,author,postsynaptic_ID,counts,presynaptic_ID,postynaptic_ID,symbol
0,Lamina monopolar 2; L2,96,720575940618171878,84,720575940627019876,720575940618171878,L2
1,T1,14,720575940629905487,79,720575940627019876,720575940629905487,T1
2,Tm1; Transmedullary neuron 1; FBbt_00003789,392,720575940620814171,18,720575940627019876,720575940620814171,Tm1
3,Lamina monopolar 5; L5,96,720575940637576158,16,720575940627019876,720575940637576158,L5
4,Dm1; Distal medullary amacrine neuron 1; FBbt_...,392,720575940611803000,11,720575940627019876,720575940611803000,Dm1
...,...,...,...,...,...,...,...
25282,,,720575940566381636,1,720575940640298429,720575940566381636,
25283,Proximal medulla 2; Pm2,17,720575940625615368,1,720575940640298429,720575940625615368,
25284,,,720575940608848602,1,720575940640298429,720575940608848602,
25285,Putative Transmedullary Y 5a; TmY5a,103,720575940617841153,1,720575940640298429,720575940617841153,


### 6. Adding more useful information for later analysis

In [24]:
## For INPUTS

# Matcing data types
neuron_df['Updated_seg_id'] = neuron_df['Updated_seg_id'].astype(str)
merged_input_df['postsynaptic_ID'] = merged_input_df['postsynaptic_ID'].astype(str)

# Merging the DataFrames based on common values
merged_input_2_df = pd.merge(merged_input_df, neuron_df[['Updated_seg_id', 'optic_lobe_id','dorso-ventral']], left_on='postsynaptic_ID', right_on='Updated_seg_id', how='left')

# Drop the extra 'seg_id' column
merged_input_2_df.drop(columns=['Updated_seg_id'], inplace=True)

# Display the merged DataFrame
print('For INPUTS:')
display(merged_input_2_df)


## For OUTPUTS

# Matcing data types
neuron_df['Updated_seg_id'] = neuron_df['Updated_seg_id'].astype(str)
merged_output_df['presynaptic_ID'] = merged_output_df['presynaptic_ID'].astype(str)

# Merging the DataFrames based on common values
merged_output_2_df = pd.merge(merged_output_df, neuron_df[['Updated_seg_id', 'optic_lobe_id','dorso-ventral']], left_on='presynaptic_ID', right_on='Updated_seg_id', how='left')

# Drop the extra 'seg_id' column
merged_output_2_df.drop(columns=['Updated_seg_id'], inplace=True)

# Display the merged DataFrame
print('For OUTPUTS:')
display(merged_output_2_df)

For INPUTS:


Unnamed: 0,guess,author,presynaptic_ID,counts,postsynaptic_ID,symbol,optic_lobe_id,dorso-ventral
0,Lamina monopolar 1; L1,96,720575940611749233,42,720575940620786342,L1,C3-R216,
1,Lamina monopolar 5; L5,96,720575940616464434,22,720575940620786342,L5,C3-R216,
2,Mi1,100,720575940620233899,12,720575940620786342,Mi1,C3-R216,
3,Centrifugal 2; C2; FBbt_00003743,392,720575940622931992,11,720575940620786342,C2,C3-R216,
4,Lamina monopolar 3; L3,96,720575940623331208,10,720575940620786342,L3,C3-R216,
...,...,...,...,...,...,...,...,...
6687,Lamina monopolar 5; L5,96,720575940630893817,1,720575940633319628,L5,C3-R70,
6688,Tm2: Transmedullary neuron 2; FBbt_00003790,392,720575940631458252,1,720575940633319628,Tm2,C3-R70,
6689,Transmedullary 25; Tm25,103,720575940621823103,1,720575940633319628,,C3-R70,
6690,Tm4; Transmedullary neuron 4; FBbt_00003792,392,720575940620063435,1,720575940633319628,Tm4,C3-R70,


For OUTPUTS:


Unnamed: 0,guess,author,postsynaptic_ID,counts,presynaptic_ID,postynaptic_ID,symbol,optic_lobe_id,dorso-ventral
0,Lamina monopolar 2; L2,96,720575940618171878,84,720575940627019876,720575940618171878,L2,C3-R27,
1,T1,14,720575940629905487,79,720575940627019876,720575940629905487,T1,C3-R27,
2,Tm1; Transmedullary neuron 1; FBbt_00003789,392,720575940620814171,18,720575940627019876,720575940620814171,Tm1,C3-R27,
3,Lamina monopolar 5; L5,96,720575940637576158,16,720575940627019876,720575940637576158,L5,C3-R27,
4,Dm1; Distal medullary amacrine neuron 1; FBbt_...,392,720575940611803000,11,720575940627019876,720575940611803000,Dm1,C3-R27,
...,...,...,...,...,...,...,...,...,...
25282,,,720575940566381636,1,720575940640298429,720575940566381636,,,
25283,Proximal medulla 2; Pm2,17,720575940625615368,1,720575940640298429,720575940625615368,,,
25284,,,720575940608848602,1,720575940640298429,720575940608848602,,,
25285,Putative Transmedullary Y 5a; TmY5a,103,720575940617841153,1,720575940640298429,720575940617841153,,,


### Saving 

In [25]:
# Saving data in your computer
PC_disc = 'E'
outDir = f'{PC_disc}:\Connectomics-Data\FlyWire\Excels\min-score-50' # YOUR-PATH for saving excel file
save_excel_file = True

import datetime
x = datetime.datetime.now()
date_str = x.strftime("%d") + x.strftime("%b") + x.strftime("%Y")

if save_excel_file: 
    ## Input count
    file_name = f'{neuron}_neurons_input_count_{_hemisphere}_{date_str}.xlsx'
    savePath = os.path.join(outDir, file_name)
    merged_input_2_df.to_excel(savePath, sheet_name='Buhmann synapses, inputs')
    
    ## Output count
    file_name = f'{neuron}_neurons_output_count_{_hemisphere}_{date_str}.xlsx'
    savePath = os.path.join(outDir, file_name)
    merged_output_2_df.to_excel(savePath, sheet_name='Buhmann synapses, outputs')