In [1]:
import pandas as pd
import os
import pathlib
import sys
from Bio import SeqIO

# Resolve `root` as the directory three levels above the current working
# directory, then make it the first entry on the import path and the new
# working directory.
# NOTE: the result depends on the current working directory, so running this
# cell a second time (after the chdir below) climbs a further three levels.
root = pathlib.Path.cwd().parents[2]
sys.path.insert(0, str(root))
os.chdir(root)
root

PosixPath('/Users/lu.zhu/Documents/Codebase/ValenceLab/polaris-recipes')


## &#x1F64C; Additional dataset from Adaptyv competition participant @agitter

Anthony Gitter has shared the results from 11 additional sequences he ordered from Adaptyv for post-EGFR round 1 analysis. \
You can check out more details on the data and project here: https://github.com/agitter/adaptyvbio-egfr.

Note that `P01133-971-1023` is the positive control (human EGF).

In [2]:
# Read the additional results table straight from the agitter/adaptyvbio-egfr
# GitHub repository; the table has one row per (design, replicate).
additional_results_url = "https://raw.githubusercontent.com/agitter/adaptyvbio-egfr/refs/heads/main/round1-second-submission-data.csv"
res_df = pd.read_csv(additional_results_url)
res_df

Unnamed: 0,name,replicate,expression,binding,kd,kon,koff,sequence,binding_strength
0,P01133-971-1023,1,high,true,9.598217e-08,70048.032467,0.006723,NSDSECPLSHDGYCLHDGVCMYIEALDKYACNCVVGYIGERCQYRD...,medium
1,P01133-971-1023,2,high,true,7.39916e-08,70316.60687,0.005203,NSDSECPLSHDGYCLHDGVCMYIEALDKYACNCVVGYIGERCQYRD...,medium
2,P01133-971-1023,3,high,true,1.084595e-07,71749.349728,0.007782,NSDSECPLSHDGYCLHDGVCMYIEALDKYACNCVVGYIGERCQYRD...,medium
3,deepsatflow-design:7 n:0|mpnn:1.320|plddt:0.92...,1,high,false,,,,SESELEAKRQELKELMEKTEEKARELLAKGDVAGARAAYGAYLLKAAE,none
4,deepsatflow-design:7 n:0|mpnn:1.320|plddt:0.92...,2,high,false,,,,SESELEAKRQELKELMEKTEEKARELLAKGDVAGARAAYGAYLLKAAE,none
5,deepsatflow-design:7 n:0|mpnn:1.320|plddt:0.92...,3,high,false,,,,SESELEAKRQELKELMEKTEEKARELLAKGDVAGARAAYGAYLLKAAE,none
6,gitter-yolo10,1,high,false,,,,QVQLVESGPGLVKPSQTLSLTCTVSGGSVSSGDYYWTWIRQPPGKG...,none
7,gitter-yolo10,2,low,false,,,,QVQLVESGPGLVKPSQTLSLTCTVSGGSVSSGDYYWTWIRQPPGKG...,none
8,gitter-yolo10,3,medium,false,,,,QVQLVESGPGLVKPSQTLSLTCTVSGGSVSSGDYYWTWIRQPPGKG...,none
9,gitter-yolo2,1,high,false,,,,TVSGFDLTDYGVHWVRQSPGKGLEWLGVIWSGGNTDYNTPFTSRLS...,none


In [3]:
# Label a row as a binder exactly when a kd value was measured for it.
res_df["binding_class"] = res_df["kd"].notna()

# Columns that identify one design together with its label.
cols = ["name", "binding_class"]

In [4]:
# Collapse the replicate measurements of each design into a single row that
# carries the per-design median of the kinetic constants.
#
# The medians are computed with `transform`, which returns values aligned to
# the original row index. Assigning the `.values` of `groupby(...).median()`
# positionally is only correct when the de-duplicated rows happen to be in the
# same sorted-by-name order that `groupby` produces.
kinetic_cols = ["kd", "kon", "koff"]
res_df[kinetic_cols] = res_df.groupby("name")[kinetic_cols].transform("median")
res_df = res_df.drop_duplicates(subset=cols)

# Exactly one row per design is expected; replicates of a design that disagree
# on `binding_class` would survive the de-duplication above.
assert res_df["name"].is_unique, "replicates of a design disagree on binding_class"

In [5]:
# Write the per-design summary of the additional results to the bucket,
# leaving the DataFrame index out of the file.
additional_summary_uri = "gs://polaris-public/polaris-recipes/org-AdaptyvBio/EGFR_binders/raw/round1_agitter_additional_11_results_summary_with_class.csv"
res_df.to_csv(additional_summary_uri, index=False)

In [6]:
# Load the v0 result summary (with binding classes) from the bucket.
v0_summary_uri = "gs://polaris-public/polaris-recipes/org-AdaptyvBio/EGFR_binders/raw/result_summary_with_class.csv"
res_df_v0 = pd.read_csv(v0_summary_uri)

In [7]:
# Show the v0 table transposed, so each record is displayed as a column.
res_df_v0.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,200,201
name,Cetuximab_scFv,ahmedsameh-Q3,ahmedsameh-yy2,martin.pacesa-EGFR_l138_s90285_mpnn2,x.rustamov-m_18_41,alecl-Sequence1,alan.blakely-design:5 n:6|mpnn:1.247|plddt:0.8...,adrian.tripp-egfr_cetuxi_0133_0002_A,alex.naka-158511,alex.naka-dda1c8,...,"colby-TUPEGFR_018 | anti-EGFR, Fv-like protein...",tim-silica_corpora_sampled_epitope_0_generated...,tim-silica_corpora_sampled_epitope_1_generated...,tim-silica_corpora_sampled_epitope_6_generated...,tim-silica_corpora_sampled_epitope_0_generated...,tim-silica_corpora_sampled_epitope_1_generated...,ahmedsameh-y4,ahmedsameh-y6,ahmedsameh-s3,deepsatlow-design:0 n:0|mpnn:1.164|plddt:0.823...
username,,ahmedsameh,ahmedsameh,martin.pacesa,x.rustamov,alecl,alan.blakely,adrian.tripp,alex.naka,alex.naka,...,colby,tim,tim,tim,tim,tim,ahmedsameh,ahmedsameh,ahmedsameh,deepsatflow
sequence_name,,Q3,yy2,EGFR_l138_s90285_mpnn2,m_18_41,Sequence1,design:5 n:6|mpnn:1.247|plddt:0.825|ptm:0.709|...,egfr_cetuxi_0133_0002_A,158511,dda1c8,...,"TUPEGFR_018 | anti-EGFR, Fv-like protein, diff...",silica_corpora_sampled_epitope_0_generated_var...,silica_corpora_sampled_epitope_1_generated_var...,silica_corpora_sampled_epitope_6_generated_var...,silica_corpora_sampled_epitope_0_generated_var...,silica_corpora_sampled_epitope_1_generated_var...,y4,y6,s3,design:0 n:0|mpnn:1.164|plddt:0.823|i_ptm:0.75...
kd,0.0,0.0,0.0,0.0,0.000005,0.00001,0.00001,0.000023,,,...,,,,,,,,,,
sequence,QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE...,WVQLQESGGGLVQPGGSLRLSCAASGRTFSSYAMGWFRQAPGKQRE...,QVQLQESGGGLVQPGGSLRLSCAASGRTFSSHAMGWFRQAPGKQRE...,SPFDLFLDRLPEQDPEMTEEGKWWAEEMKRMVGPHFEELEEYIRNN...,SAGQAQIEEVKARADKAKTLEELKELRKEAYEKNWKAYMAVVDETE...,SVDEECPASYEGFCQNDGTCLYLEKLDRYACRCREGYIGERCEFRD...,DSECPLSHDGYCLHDGVCMYIEALDKYACNCVVGYIGERCQYRDLK...,MAKLIIANSEEALKEYLEKLGEEAKDYEKVVVPLGDGSVVQSAQNA...,PSFSACPSNYDGVCCNGGVCHLAESLTSYTCQCILGYSGHRVQTFD...,PSFSACPSNYDGVCCNGGVCHLAESLTSYTCQCIIGYSGDRVQTFD...,...,SQSLSITCTVSGFDLTSYGVHWVRQSPGKGLEWLGVIWSDGSTDYN...,EVQLVESGGGLVQPGGSLRLSCAASGFTSSNNPMGWFRQAPGKGRE...,QVQLVESGGGLVKPGGSLRLSCAASGFTSSSYAAAWFRQAPGKERE...,QVQLVESGGGLVKPGGSLRLSCAASGGTSSSYAAAWFRQAPGKERE...,EVQLLESGGGEVQPGGSLRLSCAASGSTFSNYGMGWFRQAPGKERE...,QVQLVESGGGLVKPGGSLRLSCAASGSTSSNYAAAWFRQAPGKERE...,QVQLQESGGGLVQPGGSLRLSCAASGRTFSSYAMGWFRQAPGKQRE...,QVQLQESGGGLVQPGGSLRLSCAASGRTFSSYAMGWFRQAPGKQRE...,QVQLQESGGGLVQPGGSLRLSCAASGRTFSSYAMGWFRQAPGKQRE...,SLKEKEEKLIEELEEELKKIKEEYEKKIKEYLEEGNIEKAEKLKEE...
dna,ATGCAGGTGCAGCTGAAACAGAGCGGCCCGGGCCTGGTGCAGCCAT...,ATGTGGGTGCAGCTGCAGGAAAGCGGCGGCGGCTTAGTGCAACCAG...,ATGCAGGTGCAGCTGCAGGAAAGCGGCGGCGGCTTAGTGCAACCAG...,ATGAGCCCGTTTGATCTGTTTCTGGATCGCCTGCCGGAACAGGATC...,ATGAGCGCGGGCCAGGCGCAGATTGAAGAAGTGAAAGCGCGCGCAG...,ATGAGCGTGGATGAAGAATGCCCGGCGAGCTATGAAGGCTTTTGCC...,ATGGACTCTGAGTGTCCTTTGTCACACGACGGGTACTGTTTGCATG...,ATGGCGAAACTGATTATTGCGAACAGCGAAGAAGCGCTGAAAGAGT...,ATGCCGAGCTTTAGCGCGTGCCCGAGCAACTATGATGGCGTGTGCT...,ATGCCGAGCTTTAGCGCGTGCCCGAGCAACTATGATGGCGTGTGCT...,...,ATGAGCCAGAGCCTGAGCATTACCTGCACCGTGAGCGGCTTTGATC...,ATGGAAGTGCAGCTGGTGGAAAGCGGCGGCGGCCTGGTTCAACCAG...,ATGCAGGTGCAGCTGGTGGAAAGCGGCGGCGGCCTGGTTAAACCAG...,ATGCAGGTGCAGCTGGTGGAAAGCGGCGGCGGCCTGGTTAAACCAG...,ATGGAAGTGCAGCTGCTGGAAAGCGGCGGCGGCGAAGTGCAACCAG...,ATGCAGGTGCAGCTGGTGGAAAGCGGCGGCGGCCTGGTTAAACCAG...,ATGCAGGTGCAGCTGCAGGAAAGCGGCGGCGGCTTAGTGCAACCAG...,ATGCAGGTGCAGCTGCAGGAAAGCGGCGGCGGCTTAGTGCAACCAG...,ATGCAGGTGCAGCTGCAGGAAAGCGGCGGCGGCTTAGTGCAACCAG...,ATGAGCCTGAAAGAAAAAGAAGAAAAGCTGATTGAAGAACTGGAAG...
plddt,,77.840455,77.288939,88.653551,89.5806,84.862264,49.140108,90.535933,92.5664,92.4346,...,59.76645,72.963984,72.938862,70.989268,74.955968,74.065691,77.784242,77.753258,77.456894,56.370435
pae_interaction,,28.217942,28.17707,16.878782,14.921833,9.206467,21.431173,16.086371,7.562282,7.596206,...,28.589824,28.063192,28.264615,28.289215,28.305615,28.327152,28.178994,28.208359,28.246956,21.55856
similarity_check,,0.992,0.992,,,0.584164,0.263,,0.61946,0.65988,...,0.42973,0.859,0.878,0.878,0.903,0.886,1.0,0.992,0.992,
model_names,,"[""Rosetta""]","[""Rosetta""]","[""AF2 Backprop""]","[""AF2 Backprop""]","[""ProteinMPNN""]",[],[],"[""Custom (Active Learning)""]","[""Custom (Active Learning)""]",...,[],"[""Custom (Generative)""]","[""Custom (Generative)""]","[""Custom (Generative)""]","[""Custom (Generative)""]","[""Custom (Generative)""]","[""Rosetta""]","[""Rosetta""]","[""Rosetta""]",[]


### &#x26A0;	For the v1 version, we consider the weak binders as non-binders because their interactions are relatively weak and may not be stable or effective in inhibiting EGFR.

In [8]:
# Designs that v1 treats as non-binders (the weak binders).
weak_binder_names = [
    "alecl-Sequence1",
    "alan.blakely-design:5 n:6|mpnn:1.247|plddt:0.825|ptm:0.709|pae:10.151|rmsd:3.535",
]
is_weak_binder = res_df_v0["name"].isin(weak_binder_names)
res_df_v0.loc[is_weak_binder, "binding_class"] = False

In [9]:
# Stack the additional results underneath the v0 table and renumber the rows;
# columns present in only one of the two frames are filled with NaN.
frames_for_v1 = [res_df_v0, res_df]
res_df_v1 = pd.concat(frames_for_v1, ignore_index=True)

In [10]:
# Sanity check: list every row whose name occurs more than once in v1.
has_repeated_name = res_df_v1.duplicated(subset="name", keep=False)
res_df_v1[has_repeated_name]

Unnamed: 0,name,username,sequence_name,kd,sequence,dna,plddt,pae_interaction,similarity_check,model_names,methods,binding_class,replicate,expression,binding,kon,koff,binding_strength


In [13]:
# Write the combined v1 table to the bucket, leaving the DataFrame index out
# of the file.
v1_summary_uri = "gs://polaris-public/polaris-recipes/org-AdaptyvBio/EGFR_binders/raw/round1_results_summary_with_class_v1.csv"
res_df_v1.to_csv(v1_summary_uri, index=False)