# Generate ML Dataset

In [4]:
import pandas as pd

In [8]:
## Read the three processed CSV inputs (paths are relative to this notebook)

# all-submissions collection export
collection_df = pd.read_csv(
    '../data/proteinbase_collection_nipah-binder-competition-all-submissions_processed.csv'
)

# competition results export
experimental_df = pd.read_csv(
    '../data/proteinbase_collection_nipah-binder-competition-results_processed.csv'
)

# haddock3 score table
haddock_df = pd.read_csv(
    '../data/haddock3_scores.csv'
)

In [10]:
## Combine the three tables into a single frame keyed on `id`
# Both merges are inner joins, so only ids present in every input are kept.
# Non-key columns that share a name across inputs receive pandas' default
# `_x` / `_y` suffixes (e.g. `name_x` in the preview).
ml_dataset_df = pd.merge(
    pd.merge(collection_df, experimental_df, how='inner', on='id'),
    haddock_df,
    how='inner',
    on='id',
)

# Preview the first rows of the joined frame
ml_dataset_df.head()

Unnamed: 0,id,name_x,sequence,author,designMethod,aligned-lengthafdb50,aligned-lengthcath50,aligned-lengthpdb100,boltz2_complex_iplddt,boltz2_complex_pde,...,kd,koff,kon,neutralization,score,vdw,elec,desolv,bsa,total
0,swift-otter-reed,target_binder_design_cdr3_fixed_run_45_cycle_8...,QVQLVESGGGLVQPGGSLRLSCAASGFSFSYYWLGWFRQAPGQGLE...,willv,cdr3-optimization-with-protein-hunter-ranking-...,,,,0.814842,0.54831,...,,,,,-127.6897,-64.9667,-215.195,-19.684,3463.02,-280.1617
1,azure-fox-flint,target_binder_design_cdr3_fixed_run_14_cycle_8...,QVQLVESGGGLVQPGGSLRLSCAASGFSFSYYWLGWFRQAPGQGLE...,willv,cdr3-optimization-with-protein-hunter-ranking-...,,,,0.818778,1.029762,...,,,,,-49.2013,-35.1036,3.56907,-14.8115,1675.19,-31.53453
2,quick-hawk-moss,Nipah_2408_0_v2,EVQLVESGGGLVQPGGSLRLSCAASSGFTSSDGMAWFRQAPGQERE...,velaa2,boltzgen,,,,0.863335,0.43199,...,,,,,-40.8789,-19.0934,-113.416,0.897653,2146.18,-132.5094
3,violet-swan-sand,run6_nipah_g_hotspot1_truncated_463,EVQLVESGGGLVQPGGSLRLSCAASGETIPDYAGMAWYRQAPGKGR...,nvith,boltzgen,,,,0.820958,0.592882,...,,,,,-125.2337,-90.9761,-162.599,-1.73779,2794.07,-253.5751
4,silent-crane-birch,protenix_design_dcc15bd4_7_T0.1_v5_T0.2_v6_T0....,KDPEKELEESAREATNEFIEGYKNLGGTLTEEEVEELEKSLLEVAT...,nappenstance,protenix-solublempnn-refinement-7HziZU0e71,,,,0.890597,0.582612,...,,,,,-124.0925,-49.5994,-464.014,18.3097,3079.9,-513.6134


In [12]:
## Columns of the joined frame that are carried into the ML dataset
cols_to_keep = [
    # identifier and categorical labels
    'id',
    'classification',
    'design_class',
    # boltz2_* metrics
    'boltz2_complex_iplddt',
    'boltz2_complex_pde',
    'boltz2_complex_plddt',
    'boltz2_ipsae',
    'boltz2_iptm',
    'boltz2_lis',
    'boltz2_min_ipsae',
    'boltz2_pdockq',
    'boltz2_pdockq2',
    'boltz2_plddt',
    'boltz2_ptm',
    # esmfold / physicochemical / proteinmpnn columns
    'esmfold_plddt',
    'isoelectric_point',
    'molecular_weight',
    'proteinmpnn_score',
    'proteinmpnn_seq_recovery',
    'redesigned_proteinmpnn_score',
    # NOTE: "complimentarity" is spelled as the column appears in the data
    'shape_complimentarity_boltz2_binder_ss',
    # expression yield and kd / koff / kon (kd is used later to filter rows)
    'expression-yield',
    'kd',
    'koff',
    'kon',
    # trailing columns of the joined frame; presumably the haddock3 scores (TODO confirm)
    'score',
    'vdw',
    'elec',
    'desolv',
    'bsa',
    'total',
]


In [13]:
## Restrict the joined frame to the columns listed in `cols_to_keep`
# Label-based selection: raises KeyError if any listed column is absent.
ml_dataset_df = ml_dataset_df.loc[:, cols_to_keep]

# Preview the reduced frame
ml_dataset_df.head()

Unnamed: 0,id,classification,design_class,boltz2_complex_iplddt,boltz2_complex_pde,boltz2_complex_plddt,boltz2_ipsae,boltz2_iptm,boltz2_lis,boltz2_min_ipsae,...,expression-yield,kd,koff,kon,score,vdw,elec,desolv,bsa,total
0,swift-otter-reed,Mainly Beta,Nanobody,0.814842,0.54831,0.880486,0.448441,0.775582,0.1769,0.240606,...,45.283466,,,,-127.6897,-64.9667,-215.195,-19.684,3463.02,-280.1617
1,azure-fox-flint,Mainly Beta,Nanobody,0.818778,1.029762,0.835774,0.0,0.269881,0.0,0.0,...,54.107455,,,,-49.2013,-35.1036,3.56907,-14.8115,1675.19,-31.53453
2,quick-hawk-moss,Mainly Beta,Nanobody,0.863335,0.43199,0.892859,0.832161,0.928313,0.5502,0.74766,...,67.698711,,,,-40.8789,-19.0934,-113.416,0.897653,2146.18,-132.5094
3,violet-swan-sand,Mainly Beta,Nanobody,0.820958,0.592882,0.855316,0.70903,0.886572,0.4871,0.586014,...,73.929433,,,,-125.2337,-90.9761,-162.599,-1.73779,2794.07,-253.5751
4,silent-crane-birch,Mainly Alpha,Other,0.890597,0.582612,0.87954,0.807099,0.930599,0.6263,0.747279,...,510.520508,,,,-124.0925,-49.5994,-464.014,18.3097,3079.9,-513.6134


In [16]:
## Drop every row that has no kd value
ml_dataset_df = ml_dataset_df.dropna(subset=['kd'])

# (rows, columns) remaining after the filter
ml_dataset_df.shape

(97, 31)

In [17]:
## Write the final ML dataset to CSV (the DataFrame index is not written)
ml_dataset_df.to_csv(
    '../data/proteinbase_nipah-binder-competition_ml-dataset.csv',
    index=False,
)