# Segment IDs authorship information


For publication purposes following the FlyWire principles, this notebook is used to retrive following information:
- Number of edits aggregated by lab for each ID
- ID identification, first person and lab that did it
- ID completion status, the person and lab that did it

In [92]:
# Importing packages

import os
import pandas as pd
from fafbseg import flywire
from caveclient import CAVEclient
client = CAVEclient('flywire_fafb_production')
from openpyxl import load_workbook

In [93]:
# Loading data set with IDs of interest

# Choose path and file
PC_disc = 'D'
dataPath = f'{PC_disc}:\Connectomics-Data\FlyWire\Excels\drive-data-sets\submission_nature'
date = '20230823'
fileName = f'Updated_unique_seg_ids_in_publication_{date}.xlsx'
filePath = os.path.join(dataPath,fileName)

#Loading file as DataFrame
df = pd.read_excel(filePath)


#Dropping the fisrt row ('asdf' was added as a walk-around to set that column values as type str)
if df["seg_id"][0] == 'asdf': 
    df = df.iloc[1: , :]
    df.reset_index(inplace=True,drop=True)
    
#Dropping dupllicates
segment_publication_df = df.drop_duplicates(subset=["Updated_seg_ids"], keep='first')

#Quick look on the dataframe
display(segment_publication_df.head())
    
#Getting the lists of IDs to update
segmentIDs = df["Updated_seg_ids"].tolist()


Unnamed: 0,seg_id,symbol,optic_lobe_id,hemisphere,Updated_seg_ids
0,720575940617931604,Tm9,,,720575940617931604
1,720575940611032530,Tm9,,,720575940611032530
2,720575940630963275,Tm9,,,720575940630963275
3,720575940612424470,Tm9,,,720575940612424470
4,720575940622307753,Tm9,,,720575940622307753


In [94]:

len(segmentIDs)

4608

In [95]:
# Preventing errors before running the hard part
for s in segmentIDs:
    if not type(s) == str:
        print(f'{s} is not a string')

### Getting number of edits and aggregate by lab

In [96]:
# Getting edits
#edits = flywire.fetch_edit_history(unique_segmentIDs[0:500])
#few_segmentIDs = segmentIDs[0:1000]
#edits = flywire.fetch_edit_history(few_segmentIDs)


# Getting edits in a for loop

starting_round = 0 # usually it must starts from 0
_steps = 500
_start = _steps*starting_round
_last = (_steps*starting_round) + _steps
_rounds =  round(len(segmentIDs)/ _steps)

print(f'Rounds to perform: {_rounds}')
edits = pd.DataFrame()
for i in range(starting_round,_rounds):
    #print(f'Round {i}')
    # getting edits in batches
    try:
        curr_edits = flywire.fetch_edit_history(segmentIDs[_start:_last])
        edits = pd.concat([edits,curr_edits])
        
    # getting edits from single ids   
    except:
        for i in range(_start,_last):
            curr_id = segmentIDs[i]
            try:
                curr_edits = flywire.fetch_edit_history(curr_id)
                edits = pd.concat([edits,curr_edits])
            except:
                print(f'Faulty id: {curr_id}')
        
    _start = _start + _steps
    _last = _last + _steps
    

Rounds to perform: 9


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Faulty id: 720575940640126141


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Faulty id: 720575940609934019
Faulty id: 720575940617282747


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

In [108]:
# Calculating the total number of entries for each "user_name" within each unique "segment"
user_name_counts = edits.groupby(['segment', 'user_name'])['user_name'].count().reset_index(name='user_edit_count')
# Merging with the original DataFrame to get 'user_affiliation'
user_name_counts = user_name_counts.merge(edits[['segment', 'user_name', 'user_affiliation']].drop_duplicates(), on=['segment', 'user_name'])
# Creating a new DataFrame for segment counts
segment_counts_df = edits.groupby('segment').size().reset_index(name='total_edit_count')
# Merging with the segment_counts_df DataFrame
user_name_counts = user_name_counts.merge(segment_counts_df, on='segment')
# Merging with the segment_publication_df DataFrame to get 'symbol'
user_name_counts = user_name_counts.merge(segment_publication_df[['Updated_seg_ids', 'symbol']], left_on='segment', right_on='Updated_seg_ids', how='left')
# Drop the redundant 'Updated_seg_ids' column
user_name_counts = user_name_counts.drop('Updated_seg_ids', axis=1)
# Calculating the 'user_percent_edited'
user_name_counts['user_percent_edited'] = round(user_name_counts['user_edit_count'] / user_name_counts['total_edit_count'] * 100)
# Calculating the percentage sum for each 'user_affiliation' group within the same 'segment'
user_affiliation_percentage_sum = user_name_counts.groupby(['segment', 'user_affiliation'])['user_percent_edited'].sum().reset_index(name='lab_percent_edited')
# Merging back into the user_name_counts DataFrame
user_name_counts = user_name_counts.merge(user_affiliation_percentage_sum, on=['segment', 'user_affiliation'])
# Sort the new DataFrame by 'user_affiliation'
user_name_counts = user_name_counts.sort_values(by='user_affiliation')


# Summing user_edit_count for each user_name
user_name_counts = user_name_counts[user_name_counts['lab_percent_edited'] >= 10].copy()
unique_users_df = user_name_counts.groupby('user_name', as_index=False).agg({
    'user_affiliation': 'first',  # Assuming user_affiliation is the same for all rows of a user
    'user_edit_count': 'sum'
})

# Sort the new DataFrame by 'user_affiliation'
unique_users_df = unique_users_df.sort_values(by='user_affiliation')

# Reordering columns
unique_users_df = unique_users_df[['user_name', 'user_affiliation', 'user_edit_count']]

Unnamed: 0,user_name,user_affiliation,user_edit_count
100,ShanaBanana,,1
107,Wes Murfin,,2
6,Amalia Braun,Alexander Borst Lab,42
2,Alisa Poh,Barry Dickson Lab,7
17,Beas Dutta,Bassem Hassan Lab,12
...,...,...,...
105,Tommy Crahan,Sung Soo Kim Lab,2
31,Dustin Garner,Sung Soo Kim Lab,556
66,Lucy Houghton,Sung Soo Kim Lab,13
51,Jasper Phelps,Wei-Chung Lee lab,1


In [109]:
user_name_counts_final = user_name_counts[['segment', 'symbol', 'user_name', 'user_affiliation','user_edit_count','total_edit_count', 'lab_percent_edited']].copy()
user_name_counts_final = user_name_counts_final[user_name_counts_final['lab_percent_edited'] >= 10]
user_name_counts_final.head()

Unnamed: 0,segment,symbol,user_name,user_affiliation,user_edit_count,total_edit_count,lab_percent_edited
11022,720575940638919002,Dm12,ShanaBanana,,1,8,12.0
1960,720575940614774959,Tm16,Wes Murfin,,2,4,50.0
2175,720575940615464098,Tm9,Amalia Braun,Alexander Borst Lab,1,4,25.0
4169,720575940621386843,Dm12,Amalia Braun,Alexander Borst Lab,1,8,12.0
669,720575940609986680,Tm9,Amalia Braun,Alexander Borst Lab,2,2,100.0


In [111]:
#Saving in a new file

import datetime
x = datetime.datetime.now()
date_str = x.strftime("%d") + x.strftime("%b") + x.strftime("%Y")
file_name = f'segment_ids_edit_counts_{date_str}.xlsx'
savePath = os.path.join(dataPath, file_name)
unique_users_final.to_excel(savePath, sheet_name='User-Lab summary')


#More dataframes in same excel file
book = load_workbook(savePath)
writer = pd.ExcelWriter(savePath, engine = 'openpyxl')
writer.book = book
user_name_counts_final.to_excel(writer, sheet_name='Edits passed 10 percent')
writer.save()
writer.close()

In [86]:
savePath

'D:\\Connectomics-Data\\FlyWire\\Excels\\drive-data-sets\\submission_nature\\segment_ids_edit_counts_30Nov2023.xlsx'

In [12]:
# Step 1: Count the number of entries for each unique "segment"
segment_counts = edits['segment'].value_counts().reset_index()
segment_counts.columns = ['segment', 'total_segment_count']

# Step 2: Calculate the total number of entries for each "user_affiliation" within each unique "segment"
user_affiliation_counts = edits.groupby(['segment', 'user_affiliation'])['user_affiliation'].count().reset_index(name='user_affiliation_count')
user_affiliation_counts = user_affiliation_counts.merge(segment_counts, on='segment')
user_affiliation_counts['affiliation_percentage'] = round(user_affiliation_counts['user_affiliation_count'] / user_affiliation_counts['total_segment_count'] * 100)

# Step 3: Calculate the total number of entries for each "user_name" within each unique "segment"
user_name_counts = edits.groupby(['segment', 'user_name'])['user_name'].count().reset_index(name='user_name_count')
user_name_counts = user_name_counts.merge(segment_counts, on='segment')
user_name_counts['user_percentage'] = round(user_name_counts['user_name_count'] / user_name_counts['total_segment_count'] * 100)

In [20]:
user_name_counts.head()


Unnamed: 0,segment,user_name,user_name_count,total_segment_count,user_percentage
0,720575940607889609,Daril Bautista,1,3,33.0
1,720575940607889609,Nelsie Panes,1,3,33.0
2,720575940607889609,marchan manaytay,1,3,33.0
3,720575940610854776,J. Dolorosa,1,3,33.0
4,720575940610854776,Philip,1,3,33.0


In [21]:
user_affiliation_counts.head()

Unnamed: 0,segment,user_affiliation,user_affiliation_count,total_segment_count,affiliation_percentage
0,720575940607889609,"Mala Murthy Lab, Sebastian Seung Lab",3,3,100.0
1,720575940610854776,Greg Jefferis Lab,1,3,33.0
2,720575940610854776,"Mala Murthy Lab, Sebastian Seung Lab",2,3,67.0
3,720575940611032530,"Mala Murthy Lab, Sebastian Seung Lab",3,3,100.0
4,720575940612198835,"Mala Murthy Lab, Sebastian Seung Lab",3,3,100.0


In [17]:
# Merge the two DataFrames on the 'segment' column
edit_counts = pd.merge(user_affiliation_counts[['segment', 'user_affiliation','user_affiliation_count', 'affiliation_percentage', 'total_segment_count']], user_name_counts[['segment', 'user_name', 'user_name_count','user_percentage']], on='segment')


# Sort the DataFrame by 'user_name'
edit_counts = edit_counts.sort_values(by='user_name')

# Reset the index to have a clean index order
edit_counts = edit_counts.reset_index(drop=True)

# Filtering the data set to >10% edits bb lab
edit_counts_final = edit_counts[['segment','user_name', 'user_name_count', 'user_affiliation','user_affiliation_count', 'total_segment_count','affiliation_percentage']].copy()
edit_counts_final = edit_counts_final[edit_counts_final['affiliation_percentage']>= 10].copy()

# Print the merged DataFrame
display(edit_counts_final)

Unnamed: 0,segment,user_name,user_name_count,user_affiliation,user_affiliation_count,total_segment_count,affiliation_percentage
0,720575940618701888,Amalia Braun,1,"Mala Murthy Lab, Sebastian Seung Lab",1,2,50.0
1,720575940618701888,Amalia Braun,1,Alexander Borst Lab,1,2,50.0
2,720575940622307753,Anjali Pandey,1,Greg Jefferis Lab,1,4,25.0
3,720575940622307753,Anjali Pandey,1,"Mala Murthy Lab, Sebastian Seung Lab",3,4,75.0
4,720575940644074656,Ariel Dagohoy,3,"Mala Murthy Lab, Sebastian Seung Lab",3,5,60.0
5,720575940644074656,Ariel Dagohoy,3,Greg Jefferis Lab,2,5,40.0
6,720575940616503669,Ariel Dagohoy,1,"Mala Murthy Lab, Sebastian Seung Lab",14,14,100.0
7,720575940630963275,Arti Yadav,1,Eyewire,6,7,86.0
8,720575940630963275,Arti Yadav,1,Greg Jefferis Lab,1,7,14.0
9,720575940624226795,AzureJay,17,Eyewire,17,17,100.0


In [11]:
#Saving in a new file

import datetime
x = datetime.datetime.now()
date_str = x.strftime("%d") + x.strftime("%b") + x.strftime("%Y")
file_name = f'segment_ids_edit_counts_{date_str}.xlsx'
savePath = os.path.join(dataPath, file_name)
edit_counts_final.to_excel(savePath, sheet_name='Edits passed 10 percent')


#More dataframes in same excel file
book = load_workbook(savePath)
writer = pd.ExcelWriter(savePath, engine = 'openpyxl')
writer.book = book
user_name_counts.to_excel(writer, sheet_name='Edits counts by user')
user_affiliation_counts.to_excel(writer, sheet_name='Edits counts by affiliation')
writer.save()
writer.close()

In [None]:
savePath