# Process Exported Experimental Data from Proteinbase

Source: https://proteinbase.com/api/proteins/download?collectionId=019be357-ae36-ec95-4bc6-9db0046b0600&slug=nipah-binder-competition-results

In [1]:
import pandas as pd
import json

In [2]:
## Read the collection export downloaded from Proteinbase into a dataframe and preview it
collection_csv_path = '../data/proteinbase_collection_nipah-binder-competition-results.csv'
collection_df = pd.read_csv(collection_csv_path)

collection_df.head()

Unnamed: 0,id,name,sequence,author,designMethod,evaluations
0,azure-wolf-maple,control-6CMI_3,EVQLVQSGAEVKKRGSSVKVSCKSSGGTFSNYAINWVRQAPGQGLE...,adaptyv-bio,,"[{""type"":""experimental"",""value"":""Strong"",""metr..."
1,calm-panda-fern,control-7TXZ_2,EVKLEESGGGLVQPGGSMKLSCVASGFSFSYYWMNWVRQSPEKGLE...,adaptyv-bio,,"[{""type"":""computational"",""value"":0.0073,""metri..."
2,deep-heron-rose,control-ephrin-B2,KSIVLEPIYWNSSNSKFLPGQGLVLYPQIGDKLDIICPKVDSKTVG...,adaptyv-bio,,"[{""type"":""computational"",""unit"":""pH"",""value"":6..."
3,ivory-orca-fern,control-8K3C_2,EVQLVQSGGGLVQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLE...,adaptyv-bio,,"[{""type"":""computational"",""value"":53.9529645499..."
4,azure-fox-flint,target_binder_design_cdr3_fixed_run_14_cycle_8...,QVQLVESGGGLVQPGGSLRLSCAASGFSFSYYWLGWFRQAPGQGLE...,willv,cdr3-optimization-with-protein-hunter-ranking-...,"[{""type"":""experimental"",""unit"":""μg/ml"",""value""..."


In [3]:
## Define function to parse JSON evaluations and add ID
def parse_json_evaluations(row):
    """Expand one row's JSON-encoded evaluations into a long-format dataframe.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide ``row['evaluations']`` (a JSON string) and ``row['id']``.

    Returns
    -------
    pandas.DataFrame
        One row per evaluation entry. Columns are the top-level keys of the
        entries (nested objects are kept as dicts because ``max_level=0``),
        plus an ``id`` column holding ``row['id']``. When the evaluations
        value is missing (None / NaN / pd.NA) or a blank string, the frame
        has zero rows and only the ``id`` column.

    Raises
    ------
    json.JSONDecodeError
        If the evaluations value is a non-blank string that is not valid JSON.
    """
    raw_evals = row['evaluations']

    # pandas reads an empty CSV cell as NaN; json.loads would raise TypeError
    # on it, so a row without evaluations is treated as an empty list instead.
    is_missing = (
        raw_evals is None
        or raw_evals is pd.NA
        or (isinstance(raw_evals, float) and pd.isna(raw_evals))
        or (isinstance(raw_evals, str) and not raw_evals.strip())
    )
    records = [] if is_missing else json.loads(raw_evals)

    evals_df = pd.json_normalize(records, max_level=0)
    # Tag every evaluation with its parent id so the rows can be pivoted/joined later.
    evals_df['id'] = row['id']
    return evals_df

In [4]:
## Parse each row's evaluations into its own frame, then stack all frames into one long table
per_protein_evals = collection_df.apply(parse_json_evaluations, axis=1)
evals_expanded = pd.concat(list(per_protein_evals), ignore_index=True)

evals_expanded.head()

Unnamed: 0,type,value,metric,target,valueType,unit,id
0,experimental,Strong,binding_strength,nipah-glycoprotein-g,label,,azure-wolf-maple
1,computational,scFv,design_class,,label,,azure-wolf-maple
2,computational,"{'metrics': [{'name': 'SeqIdentity(afdb50)', '...",domainmatch,,json,,azure-wolf-maple
3,computational,EEEEEEEEE,foldstring,,label,,azure-wolf-maple
4,computational,1.503,proteinmpnn_score,,numeric,,azure-wolf-maple


In [5]:
## Reshape long -> wide: one row per id, one column per metric, cells hold the metric's value
## (aggfunc='first' keeps the first non-null value when an id has the same metric more than once)
pivoted_evals = (
    evals_expanded
    .pivot_table(
        values='value',
        index='id',
        columns='metric',
        aggfunc='first',
    )
    .reset_index()
)

pivoted_evals.head()

metric,id,aligned-lengthafdb50,aligned-lengthcath50,aligned-lengthpdb100,binding,binding_strength,bli_kinetic_curves,boltz2_complex_iplddt,boltz2_complex_pde,boltz2_complex_plddt,...,seqidentity,seqidentityafdb50,seqidentitycath50,seqidentitypdb100,shape_complimentarity_boltz2_binder_ss,spr_kinetic_curves,ted_confidence,tm-scoreafdb50,tm-scorecath50,tm-scorepdb100
0,amber-bat-vine,,,,False,,,0.893818,0.452474,0.915093,...,"{'date': '2026-01-23', 'value': 0, 'databases'...",,,,62.859129,{'url': 'https://proteinbase-pub.t3.storage.de...,2,,,
1,amber-bee-plume,,,,False,,,0.830967,0.617701,0.850396,...,"{'date': '2026-01-23', 'match': {'db': 'pdb', ...",,,,49.727932,{'url': 'https://proteinbase-pub.t3.storage.de...,2,,,
2,amber-boar-reed,,,,False,,,0.863948,0.428005,0.852939,...,"{'date': '2026-01-23', 'value': 0, 'databases'...",,,,57.050979,{'url': 'https://proteinbase-pub.t3.storage.de...,2,,,
3,amber-crow-willow,97.0,102.0,73.0,False,,,0.862624,0.668106,0.862654,...,"{'date': '2026-01-23', 'value': 0, 'databases'...",13.4,10.7,24.3,56.282231,{'url': 'https://proteinbase-pub.t3.storage.de...,2,0.75749,0.80638,0.56119
4,amber-eagle-ember,,,,False,,,0.802026,1.060194,0.843329,...,"{'date': '2026-01-23', 'value': 0, 'databases'...",,,,56.101999,{'url': 'https://proteinbase-pub.t3.storage.de...,2,,,


In [8]:
## Columns to retain in the processed output: identifier columns first,
## followed by the metric columns selected from the merged dataframe
cols_to_keep = [
    'id',
    'name',
    'binding',
    'binding_strength',
    'bli_kinetic_curves',
    'expressed',
    'expression-yield',
    'kd',
    'koff',
    'kon',
    'neutralization',
]


In [9]:
## Left-join the wide metrics onto the collection rows (every collection row is kept),
## drop the raw JSON column, then narrow the result to the columns listed in cols_to_keep
final_df = (
    collection_df
    .merge(pivoted_evals, how='left', on='id')
    .drop(columns=['evaluations'])
    [cols_to_keep]
)

final_df.head()

Unnamed: 0,id,name,binding,binding_strength,bli_kinetic_curves,expressed,expression-yield,kd,koff,kon,neutralization
0,azure-wolf-maple,control-6CMI_3,True,Strong,{'url': 'https://proteinbase-pub.t3.storage.de...,True,,0.0,0.000199,51499.707222,
1,calm-panda-fern,control-7TXZ_2,True,Strong,{'url': 'https://proteinbase-pub.t3.storage.de...,True,,0.0,0.000445,44544.734509,
2,deep-heron-rose,control-ephrin-B2,True,Strong,{'url': 'https://proteinbase-pub.t3.storage.de...,True,,0.0,0.000704,815100.4157,
3,ivory-orca-fern,control-8K3C_2,True,Strong,{'url': 'https://proteinbase-pub.t3.storage.de...,True,,0.0,9e-05,8228.26963,
4,azure-fox-flint,target_binder_design_cdr3_fixed_run_14_cycle_8...,False,,,True,54.107455,,,,


In [10]:
## Write the processed table to CSV, omitting the dataframe index
processed_csv_path = '../data/proteinbase_collection_nipah-binder-competition-results_processed.csv'
final_df.to_csv(processed_csv_path, index=False)