# Segment IDs authorship information


For publication purposes following the FlyWire principles, this notebook is used to retrive following information:
- Number of edits aggregated by lab for each ID
- ID identification, first person and lab that did it
- ID completion status, the person and lab that did it

In [1]:
# Importing packages

import os
import pandas as pd
from fafbseg import flywire
from caveclient import CAVEclient
client = CAVEclient('flywire_fafb_production')
from openpyxl import load_workbook



In [2]:
# Loading data set with IDs of interest

# Choose path and file
PC_disc = 'D'
dataPath = f'{PC_disc}:\Connectomics-Data\FlyWire\Excels\drive-data-sets\submission_nature'
date = '20230823'
fileName = f'Updated_unique_seg_ids_in_publication_{date}.xlsx'
filePath = os.path.join(dataPath,fileName)

#Loading file as DataFrame
df = pd.read_excel(filePath)


#Dropping the fisrt row ('asdf' was added as a walk-around to set that column values as type str)
if df["seg_id"][0] == 'asdf': 
    df = df.iloc[1: , :]
    df.reset_index(inplace=True,drop=True)
    
#Dropping dupllicates
result_df = df.drop_duplicates(subset=["Updated_seg_ids"], keep='first')

#Quick look on the dataframe
display(result_df.head())
    
#Getting the lists of IDs to update
segmentIDs = df["Updated_seg_ids"].tolist()


Unnamed: 0,seg_id,symbol,optic_lobe_id,hemisphere,Updated_seg_ids
0,720575940617931604,Tm9,,,720575940617931604
1,720575940611032530,Tm9,,,720575940611032530
2,720575940630963275,Tm9,,,720575940630963275
3,720575940612424470,Tm9,,,720575940612424470
4,720575940622307753,Tm9,,,720575940622307753


In [3]:
# Preventing errors before running the hard part
for s in segmentIDs:
    if not type(s) == str:
        print(f'{s} is not a string')

### Getting number of edits and aggregate by lab

In [4]:
# Getting edits
#edits = flywire.fetch_edit_history(unique_segmentIDs[0:500])
#few_segmentIDs = segmentIDs[0:1000]
#edits = flywire.fetch_edit_history(few_segmentIDs)


# Getting edits in a for loop

starting_round = 0 # usually it must starts from 0
_steps = 500
_start = _steps*starting_round
_last = (_steps*starting_round) + _steps
_rounds =  round(len(segmentIDs)/ _steps)

print(f'Rounds to perform: {_rounds}')
edits = pd.DataFrame()
for i in range(starting_round,_rounds):
    #print(f'Round {i}')
    # getting edits in batches
    try:
        curr_edits = flywire.fetch_edit_history(segmentIDs[_start:_last])
        edits = pd.concat([edits,curr_edits])
        
    # getting edits from single ids   
    except:
        for i in range(_start,_last):
            curr_id = segmentIDs[i]
            try:
                curr_edits = flywire.fetch_edit_history(curr_id)
                edits = pd.concat([edits,curr_edits])
            except:
                print(f'Faulty id: {curr_id}')
        
    _start = _start + _steps
    _last = _last + _steps
    

Rounds to perform: 9
Round 0


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Round 1


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Round 2


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Round 3


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Round 4


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Faulty id: 720575940640126141
Round 5


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Round 6


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Faulty id: 720575940609934019
Round 7


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

Round 8


Fetching:   0%|          | 0/500 [00:00<?, ?it/s]

In [8]:
# Step 1: Count the number of entries for each unique "segment"
segment_counts = edits['segment'].value_counts().reset_index()
segment_counts.columns = ['segment', 'total_segment_count']

# Step 2: Calculate the total number of entries for each "user_affiliation" within each unique "segment"
user_affiliation_counts = edits.groupby(['segment', 'user_affiliation'])['user_affiliation'].count().reset_index(name='user_affiliation_count')
user_affiliation_counts = user_affiliation_counts.merge(segment_counts, on='segment')
user_affiliation_counts['affiliation_percentage'] = round(user_affiliation_counts['user_affiliation_count'] / user_affiliation_counts['total_segment_count'] * 100)

# Step 3: Calculate the total number of entries for each "user_name" within each unique "segment"
user_name_counts = edits.groupby(['segment', 'user_name'])['user_name'].count().reset_index(name='user_name_count')
user_name_counts = user_name_counts.merge(segment_counts, on='segment')
user_name_counts['user_percentage'] = round(user_name_counts['user_name_count'] / user_name_counts['total_segment_count'] * 100)

In [6]:
# Merge the two DataFrames on the 'segment' column
edit_counts = pd.merge(user_affiliation_counts[['segment', 'user_affiliation','user_affiliation_count', 'affiliation_percentage', 'total_segment_count']], user_name_counts[['segment', 'user_name', 'user_name_count','user_percentage']], on='segment')


# Sort the DataFrame by 'user_name'
edit_counts = edit_counts.sort_values(by='user_name')

# Reset the index to have a clean index order
edit_counts = edit_counts.reset_index(drop=True)

# Filtering the data set to >10% edits bb lab
edit_counts_final = edit_counts[['segment','user_name', 'user_name_count', 'user_affiliation','user_affiliation_count', 'total_segment_count','affiliation_percentage']].copy()
edit_counts_final = edit_counts_final[edit_counts_final['affiliation_percentage']>= 10].copy()

# Print the merged DataFrame
display(edit_counts_final)

Unnamed: 0,segment,user_name,user_name_count,user_affiliation,user_affiliation_count,total_segment_count,affiliation_percentage
9,720575940635571552,A. Javier,2,"Mala Murthy Lab, Sebastian Seung Lab",212,231,92.0
10,720575940626044942,A. Javier,4,"Mala Murthy Lab, Sebastian Seung Lab",189,205,92.0
13,720575940637971363,Albert Lin,1,"Mala Murthy Lab, Sebastian Seung Lab",2,3,67.0
14,720575940613149034,Albert Lin,3,Gerit Linneweber Lab,1,5,20.0
15,720575940616392514,Albert Lin,4,Mala Murthy Lab,4,34,12.0
...,...,...,...,...,...,...,...
27117,720575940645803630,twotwos,1,"Mala Murthy Lab, Sebastian Seung Lab",4,5,80.0
27118,720575940629485584,twotwos,10,Eyewire,24,25,96.0
27119,720575940626012761,twotwos,17,"Mala Murthy Lab, Sebastian Seung Lab",2,20,10.0
27121,720575940626012761,twotwos,17,Eyewire,17,20,85.0


In [11]:
#Saving in a new file

import datetime
x = datetime.datetime.now()
date_str = x.strftime("%d") + x.strftime("%b") + x.strftime("%Y")
file_name = f'segment_ids_edit_counts_{date_str}.xlsx'
savePath = os.path.join(dataPath, file_name)
edit_counts_final.to_excel(savePath, sheet_name='Edits passed 10 percent')


#More dataframes in same excel file
book = load_workbook(savePath)
writer = pd.ExcelWriter(savePath, engine = 'openpyxl')
writer.book = book
user_name_counts.to_excel(writer, sheet_name='Edits counts by user')
user_affiliation_counts.to_excel(writer, sheet_name='Edits counts by affiliation')
writer.save()
writer.close()

In [None]:
savePath