# Combining information from two tables 

This notebook compares two dataframes and copies information from one (the source) into the other (the target) for rows whose identifying columns match in both.

In [2]:
import os
import pandas as pd
import numpy as np
from fafbseg import flywire
from caveclient import CAVEclient
# CAVE client created for the 'flywire_fafb_production' datastack
# (not referenced again in the cells shown in this notebook section).
client = CAVEclient('flywire_fafb_production')



## A) Choosing, loading and updating tables


### Choosing files of interest and data to transfer

In [3]:
# Folder that holds the Excel files, and the names of the files to combine.
# Further down, the first file is loaded as 'df_01' (transfer source) and the
# second as 'df_02' (transfer target).
dataPath = r'D:\Connectomics-Data\FlyWire\Excels\drive-data-sets'

# Date stamp that is part of both file names
date = '20230526'
fileName_list = [
    f'All_Tm9_neurons_input_count_ME_R_{date}.xlsx',
    f'All_Tm9_neurons_input_count_ME_R_no_twig_proofread_{date}.xlsx',
]

# Full path of every file, in the same order as fileName_list
filePath_list = [os.path.join(dataPath, fileName) for fileName in fileName_list]


In [308]:
# Column selections used by the rest of the notebook.
# The two ID columns below open every one of the three lists.
_reference_id_columns = ['presynaptic_ID', 'postsynaptic_ID']

# ID columns whose values are updated before the dataframes are compared
columns_to_compare = _reference_id_columns + ['seg_id']

# Columns copied in the first (presynaptic-based) transfer
columns_to_transfer_based_on_pre = _reference_id_columns + [
    'symbol',
    'guess',
    'hemisphere',
    'lab',
    'author',
    'name',
    'twigs proofread (Y/N)',
    'FlyWire proofread (Y/N)',
    'identified_in',
    'lab authorship (Y/N)',
    'Extra notes as comments (initials)',
]

# Columns copied in the second (postsynaptic-based) transfer
columns_to_transfer_based_on_post = _reference_id_columns + [
    'optic_lobe_id',
    'column_id',
    'detached_lamina (Y/N)',
    'healthy_L3 (Y/N)',
]


### Loading files as distinct dataframes

In [175]:
# Each Excel file is read into its own dataframe and stored in a dictionary
# under the keys 'df_01', 'df_02', ... (same order as filePath_list).

data_frames = {}

for i, file_path in enumerate(filePath_list, start=1):
    df_key = 'df_%02d' % i
    data_frames[df_key] = pd.read_excel(file_path)
    # Drop the first row when it holds the 'asdf' placeholder
    # ('asdf' was added as a workaround to set that column's values as type str)
    first_postsynaptic_id = data_frames[df_key]["postsynaptic_ID"][0]
    if first_postsynaptic_id == 'asdf':
        data_frames[df_key] = data_frames[df_key].iloc[1:, :].reset_index(drop=True)

### Updating column information

In [177]:
# Independent copies of every column that needs updating, one entry per
# dataframe/column pair. Keys follow the pattern '<df_name>_<column_name>'.

columns_to_update = {
    f'{df_name}_{column_name}': curr_df[column_name].copy()
    for df_name, curr_df in data_frames.items()
    for column_name in columns_to_compare
}

In [179]:
# Replacing the 'INPUTS PROOFREAD' labelled rows of each column with a
# well-known marker id (presumably so that every entry handed to the ID
# update further down is a segment id -- TODO confirm).

marker_id = '720575940628553731' # VM1 ORN
marker_id_update_df = flywire.update_ids(marker_id, stop_layer=2, supervoxels=None, timestamp=None, dataset='production', progress=True)
# Positional access: take the first returned row regardless of its index labels
marker_id_updated = marker_id_update_df["new_id"].iloc[0]
for key, value in columns_to_update.items():
    # A boolean mask replaces the element-wise loop: it does not shadow the
    # builtin `id` and does not use an enumerate position as an index label.
    is_label_row = value == 'INPUTS PROOFREAD'
    if is_label_row.any():
        # In-place edit of the Series stored in columns_to_update
        value[is_label_row] = marker_id_updated # Replacement by the marker
    

In [180]:
# Update every collected column through flywire.update_ids.
# Both dictionaries use the same keys as columns_to_update.
updated_columns = {}             # key -> "new_id" column of the update result
updated_columns_confidence = {}  # key -> "confidence" column of the update result
for key, value in columns_to_update.items():
    update_result_df = flywire.update_ids(value.tolist(), stop_layer=2, supervoxels=None, timestamp=None, dataset='production', progress=True)
    new_ids, confidence = update_result_df["new_id"], update_result_df["confidence"]
    updated_columns[key] = new_ids
    updated_columns_confidence[key] = confidence
    

Updating:   0%|          | 0/1351 [00:00<?, ?it/s]

Updating:   0%|          | 0/1351 [00:00<?, ?it/s]

Updating:   0%|          | 0/1351 [00:00<?, ?it/s]

Updating:   0%|          | 0/294 [00:00<?, ?it/s]

Updating:   0%|          | 0/294 [00:00<?, ?it/s]

Updating:   0%|          | 0/294 [00:00<?, ?it/s]

### Updating dataframe information

In [182]:
# Restoring the initial 'INPUTS PROOFREAD' marker.
# The label rows were filled with `marker_id_updated`, which differs from the
# original `marker_id` whenever the marker id has been superseded, so both
# values are mapped back to the label.
marker_values = list({int(marker_id), int(marker_id_updated)})
for key, series in updated_columns.items():
    updated_columns[key] = series.replace(to_replace = marker_values, value = 'INPUTS PROOFREAD')
    

In [184]:
# Write the updated ID columns back into each dataframe (the frames stored in
# data_frames are modified in place).
for df_name, frame in data_frames.items():
    for column_name in columns_to_compare:
        frame[column_name] = updated_columns[f'{df_name}_{column_name}']


## B) Transferring data from a source data frame into another

### Define a function that performs the copy-paste operation

In [374]:
def update_dataframe(source_df, target_df, reference_column1, reference_column2):
    """Copy source rows into the target rows that share the same key pair.

    A target row matches a source row when both reference columns hold equal
    values. For every match, the columns that exist in BOTH dataframes are
    overwritten with the source values; columns present only in the target
    are left untouched.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Dataframe providing the values. When several rows share a key pair,
        the first one (in row order) is used, exactly as stored.
    target_df : pandas.DataFrame
        Dataframe receiving the values. It is modified in place.
    reference_column1, reference_column2 : str
        Names of the two columns that together identify a row. Both must be
        present in `source_df` and `target_df`.

    Returns
    -------
    pandas.DataFrame
        The same `target_df` object, after the update.
    """
    reference_columns = [reference_column1, reference_column2]

    # Keep the first source row per key pair as a whole row. Grouping with
    # .first() would instead take the first non-null value of each column and
    # could stitch together values that come from different rows.
    unique_source = source_df.drop_duplicates(subset=reference_columns, keep='first')

    # Only columns known to both frames are transferred, so target-only
    # columns are not overwritten with missing values.
    shared_columns = [col for col in target_df.columns if col in unique_source.columns]

    # Lookup table: (ref1, ref2) -> source row as a plain dict
    reference_dict = {
        (record[reference_column1], record[reference_column2]): record
        for record in unique_source.to_dict(orient='records')
    }

    # Update the target DataFrame based on the reference columns
    for i, row in target_df.iterrows():
        source_row = reference_dict.get((row[reference_column1], row[reference_column2]))
        if source_row is not None:
            target_df.loc[i, shared_columns] = [source_row[col] for col in shared_columns]

    return target_df

### Provide the user-chosen columns and reference column as inputs to the function

In [375]:
# Presynaptic-based transfer: source and target use the same column selection
source_cols = target_cols = columns_to_transfer_based_on_pre
# Pair of ID columns on which rows are matched
reference_column1, reference_column2 = 'presynaptic_ID', 'postsynaptic_ID'

### Call the function with the source and target data frames and the provided inputs

In [390]:
# 'df_01' provides the values and 'df_02' receives them. Both selections are
# copied and every value is cast to str before the rows are matched.
source_df = data_frames['df_01'][source_cols].copy().astype(str)
target_df = data_frames['df_02'][target_cols].copy().astype(str)

result_df = update_dataframe(source_df, target_df, reference_column1, reference_column2)
# Independent copy, kept under its own name for the final combination step
presynaptic_result_df = result_df.copy()

### Repeating the same process but for postsynaptic_id-based information

In [391]:
# Postsynaptic-based transfer: same procedure, different column selection
source_cols = target_cols = columns_to_transfer_based_on_post
# Rows are still matched on the same pair of ID columns
reference_column1, reference_column2 = 'presynaptic_ID', 'postsynaptic_ID'

# 'df_01' provides the values and 'df_02' receives them. Both selections are
# copied and every value is cast to str before the rows are matched.
source_df = data_frames['df_01'][source_cols].copy().astype(str)
target_df = data_frames['df_02'][target_cols].copy().astype(str)

result_df = update_dataframe(source_df, target_df, reference_column1, reference_column2)
# Independent copy, kept under its own name for the final combination step
postsynaptic_result_df = result_df.copy()

### Combining the data frames and saving the data in an Excel file

In [395]:
# Both result frames carry the reference ID columns ('presynaptic_ID' and
# 'postsynaptic_ID'). Only the columns not already present in the presynaptic
# result are taken from the postsynaptic result, so that the combined table
# has unique column names.
extra_post_columns = [col for col in postsynaptic_result_df.columns
                      if col not in presynaptic_result_df.columns]
final_df = pd.concat([presynaptic_result_df, postsynaptic_result_df[extra_post_columns]], axis=1)

In [398]:
# Save the combined table as a new Excel file in the same folder as the inputs

# Every value is written out as text
str_final_df = final_df.astype(str)

import datetime
# Date stamp built from day of month, abbreviated month name and year
date_str = datetime.datetime.now().strftime("%d%b%Y")
# The new name records the target file, the source file and the date of the update
file_name = f'{fileName_list[1]}_UPDATED_from_{fileName_list[0]}_{date_str}.xlsx'
savePath = os.path.join(dataPath, file_name)
str_final_df.to_excel(savePath, sheet_name='Data frame update')