# Process Exported Collection Data from Proteinbase

Source: https://proteinbase.com/api/proteins/download?collectionId=019adf02-13db-93aa-9f8f-2321ce34c976&slug=nipah-binder-competition-all-submissions

In [38]:
import pandas as pd
import json

In [None]:
## Read the Proteinbase collection export (CSV) into a dataframe
# Path is relative to the working directory the notebook is run from.
collection_csv_path = '../data/proteinbase_collection_nipah-binder-competition-all-submissions.csv'
collection_df = pd.read_csv(collection_csv_path)

# Preview the first rows
collection_df.head()

Unnamed: 0,id,name,sequence,author,designMethod,evaluations
0,swift-otter-reed,target_binder_design_cdr3_fixed_run_45_cycle_8...,QVQLVESGGGLVQPGGSLRLSCAASGFSFSYYWLGWFRQAPGQGLE...,willv,cdr3-optimization-with-protein-hunter-ranking-...,"[{""type"":""computational"",""value"":""Mainly Beta""..."
1,azure-fox-flint,target_binder_design_cdr3_fixed_run_14_cycle_8...,QVQLVESGGGLVQPGGSLRLSCAASGFSFSYYWLGWFRQAPGQGLE...,willv,cdr3-optimization-with-protein-hunter-ranking-...,"[{""type"":""computational"",""value"":[{""chain"":""B""..."
2,dark-seal-lava,d26,SKPGEKGPKIVLTPKPGYKVYMVDKDQADRCTIKKENTPLLNAAKP...,sasha-murrell,,"[{""type"":""computational"",""value"":""Few Secondar..."
3,calm-orca-ruby,1,KSTAIKATELQLKLLDALENDAPFEEIVAILRELLELLSDLGAAQL...,tom-pan,bg-top-hits-w8qr9efM38,"[{""type"":""computational"",""unit"":""%"",""value"":87..."
4,noble-lynx-lotus,5,SATVTLTALSDFEITVTVTGEGVKEVDVFTASAVDCGFERIKAGGS...,tom-pan,bg-top-hits-w8qr9efM38,"[{""type"":""computational"",""unit"":""%"",""value"":0...."


In [None]:
## Define function to parse JSON evaluations and add ID
def parse_json_evaluations(row):
    """Expand one row's JSON ``evaluations`` string into a long-format frame.

    Parameters
    ----------
    row : mapping-like (e.g. one ``pd.Series`` row of the collection frame)
        Must provide ``row['evaluations']`` (a JSON string) and ``row['id']``.

    Returns
    -------
    pd.DataFrame
        One row per evaluation record, with the record's top-level keys as
        columns (``max_level=0`` leaves nested values un-flattened) plus an
        ``id`` column holding ``row['id']``. When the evaluations cell is
        missing (``None`` / NaN / ``pd.NA``) an empty frame with only an
        ``id`` column is returned.

    Raises
    ------
    json.JSONDecodeError
        If the evaluations value is a string that is not valid JSON.
    """
    raw = row['evaluations']
    # A missing cell is not a string, so json.loads would raise TypeError on
    # it; return an empty frame instead so the row simply contributes no
    # evaluation records.
    if raw is None or raw is pd.NA or (isinstance(raw, float) and pd.isna(raw)):
        return pd.DataFrame({'id': pd.Series(dtype=object)})

    evals_df = pd.json_normalize(json.loads(raw), max_level=0)
    # Tag every record with its row's id so the frames can be stacked and
    # later related back to the originating row.
    evals_df['id'] = row['id']
    return evals_df

In [None]:
## Expand each row's JSON evaluations into one long table (one row per evaluation record)
# apply(axis=1) hands parse_json_evaluations one row at a time, so the result
# is a Series holding one DataFrame per collection row. The function is passed
# directly; a wrapping lambda adds nothing.
per_row_evals = collection_df.apply(parse_json_evaluations, axis=1)

# Stack the per-row frames; ignore_index renumbers the combined rows from 0.
evals_expanded = pd.concat(list(per_row_evals), ignore_index=True)

evals_expanded.head()

Unnamed: 0,type,value,metric,valueType,target,unit,id
0,computational,Mainly Beta,classification,label,,,swift-otter-reed
1,computational,0.54831,boltz2_complex_pde,numeric,nipah-glycoprotein-g,,swift-otter-reed
2,computational,7.962377,isoelectric_point,numeric,,pH,swift-otter-reed
3,computational,0.859505,boltz2_plddt,numeric,nipah-glycoprotein-g,%,swift-otter-reed
4,computational,0.814842,boltz2_complex_iplddt,numeric,nipah-glycoprotein-g,%,swift-otter-reed


In [None]:
## Reshape long -> wide: one row per id, one column per metric, cells filled from value
# aggfunc='first' keeps the first non-null value whenever an (id, metric)
# pair appears more than once.
wide_evals = pd.pivot_table(
    evals_expanded,
    values='value',
    index='id',
    columns='metric',
    aggfunc='first',
)
# Move id out of the index and back into a regular column.
pivoted_evals = wide_evals.reset_index()

pivoted_evals.head()

In [None]:
## Attach the per-metric columns to the original collection dataframe
# Left join on id: every collection row is kept, with NaN in the metric
# columns for ids that have no entry in pivoted_evals.
merged_df = pd.merge(collection_df, pivoted_evals, how='left', on='id')

# Drop the raw JSON column from the joined result.
final_df = merged_df.drop('evaluations', axis=1)

final_df.head()

Unnamed: 0,id,name,sequence,author,designMethod,aligned-lengthafdb50,aligned-lengthcath50,aligned-lengthpdb100,boltz2_complex_iplddt,boltz2_complex_pde,...,rmsdcath50,rmsdpdb100,seqidentityafdb50,seqidentitycath50,seqidentitypdb100,shape_complimentarity_boltz2_binder_ss,ted_confidence,tm-scoreafdb50,tm-scorecath50,tm-scorepdb100
0,swift-otter-reed,target_binder_design_cdr3_fixed_run_45_cycle_8...,QVQLVESGGGLVQPGGSLRLSCAASGFSFSYYWLGWFRQAPGQGLE...,willv,cdr3-optimization-with-protein-hunter-ranking-...,,,,0.814842,0.54831,...,,,,,,52.394688,2,,,
1,azure-fox-flint,target_binder_design_cdr3_fixed_run_14_cycle_8...,QVQLVESGGGLVQPGGSLRLSCAASGFSFSYYWLGWFRQAPGQGLE...,willv,cdr3-optimization-with-protein-hunter-ranking-...,,,,0.818778,1.029762,...,,,,,,49.784588,2,,,
2,dark-seal-lava,d26,SKPGEKGPKIVLTPKPGYKVYMVDKDQADRCTIKKENTPLLNAAKP...,sasha-murrell,,,,,0.849479,0.686919,...,,,,,,61.836276,1,,,
3,calm-orca-ruby,1,KSTAIKATELQLKLLDALENDAPFEEIVAILRELLELLSDLGAAQL...,tom-pan,bg-top-hits-w8qr9efM38,,,,0.769909,0.907971,...,,,,,,49.103452,2,,,
4,noble-lynx-lotus,5,SATVTLTALSDFEITVTVTGEGVKEVDVFTASAVDCGFERIKAGGS...,tom-pan,bg-top-hits-w8qr9efM38,,,,0.809155,0.601708,...,,,,,,57.400839,2,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3744,ivory-deer-willow,protein_5,WTSCGKCDDNGWYTFSEVSSHGGEGKGRALFSSGLPSVFAHYIYNF...,theo,,80,90,98,0.653559,1.512914,...,5.73,5.32,14,8,15.3,43.000664,2,0.3536,0.29989,0.34485
3745,hollow-lion-reed,protein_2,FSYVKDLNFYNVCSKGSWPFRSAYTFRAVNWLSGFQQNLRACCSSC...,theo,,,24,,0.638545,0.5661,...,2.42,,,13.7,,47.904722,1,,0.39079,
3746,brisk-hawk-orchid,protein_10,YTTFYKNFVNREPWCYKQQGVELSSIFRDKERRGSFIYADSKVRFP...,theo,,,,,0.655195,0.797355,...,,,,,,44.404086,2,,,
3747,rapid-toad-oak,protein_1,NDIAACCVNFIYMEWASWKNSELYESKTLQTVYWKQFPDLYCRGPG...,theo,,27,,,0.51944,0.670441,...,,,23.6,,,53.551863,1,0.30675,,


In [None]:
## Write the processed dataframe to CSV
processed_csv_path = '../data/proteinbase_collection_nipah-binder-competition-all-submissions_processed.csv'
# index=False leaves the dataframe's row index out of the file.
final_df.to_csv(processed_csv_path, index=False)