In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from Bio.PDB import PDBParser, PPBuilder
import py3Dmol, pathlib
import pcmap
import math
from tqdm import trange

# Restrict this process to one GPU and configure the CUDA allocator.
# Both variables are set before torch is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch  # ESM models are fetched later through torch.hub from https://github.com/facebookresearch/esm
# Index 0 here is relative to CUDA_VISIBLE_DEVICES, i.e. the first visible device.
# NOTE(review): the previous comment claimed this is "actually GPU 3 on the node",
# which does not match CUDA_VISIBLE_DEVICES="0" above — confirm against the node's numbering.
torch.cuda.set_device(0)
print(torch.cuda.get_device_name(0))

Tesla V100-SXM2-32GB


### Loading interaction metadata (source_data.csv)

In [2]:
# Load the meta-analysis table and keep only complexes that have both sequences
# and whose target is exactly chain "B".
interaction_df = pd.read_csv("/work3/s232958/data/meta_analysis/source_data.csv")
has_both_seqs = interaction_df["A_seq"].notna() & interaction_df["B_seq"].notna()
target_is_chain_B = interaction_df["target_chains"] == "[\"B\"]"
cols_remain = ["binder_id", "target_id", "binder_chain", "target_chains", "binder", "A_seq", "B_seq"]
# .copy() makes this an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
interaction_df = (
    interaction_df.loc[has_both_seqs & target_is_chain_B, cols_remain]
    .reset_index(drop=True)
    .copy()
)

# Adding new target id
# A target_id can occur with several distinct target sequences. The first sequence
# seen keeps the plain target_id; each further distinct sequence gets a numeric
# suffix starting at 2 (e.g. "EGFR", "EGFR_2", "EGFR_3", ...).
dfu = interaction_df.drop_duplicates(["target_id", "B_seq"])
targets_Dict = {}
target_tracking = {target: 2 for target in dfu.target_id.unique()}

for target_id, target_seq in zip(dfu["target_id"], dfu["B_seq"]):
    if target_id not in targets_Dict:
        targets_Dict[target_id] = target_seq
    else:
        targets_Dict[f"{target_id}_{target_tracking[target_id]}"] = target_seq
        target_tracking[target_id] += 1

# Reverse lookup: target sequence -> disambiguated target id.
seq_to_target_id = {target_seq: name for name, target_seq in targets_Dict.items()}
interaction_df["target_id_mod"] = interaction_df["B_seq"].map(seq_to_target_id)

# Running 1-based counter of binders within each disambiguated target.
cnt = interaction_df.groupby("target_id_mod").cumcount() + 1
interaction_df["target_binder_ID"] = interaction_df["target_id_mod"].astype(str) + "_" + cnt.astype(str)
interaction_df

Unnamed: 0,binder_id,target_id,binder_chain,target_chains,binder,A_seq,B_seq,target_id_mod,target_binder_ID
0,EHEE_rd4_0110_min_rise1_21_000000015_0001,VirB8,A,"[""B""]",False,LDFIVFAGPEKAIKFYKEMAKRNLEVKIWIDGDWAVVQVK,ANPYISVANIMLQNYVKQREKYNYDTLKEQFTFIKNASTSIVYMQF...,VirB8,VirB8_1
1,Grafting_Motif0040_ems_3hM_3083_0001_0002,FGFR2,A,"[""B""]",False,SEQDETMHRIVRSVIQHAYKHNDEMAEYFAQNAAEIYKEQNKSEEA...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_1
2,Grafting_Motif0042_ems_3hM_148_0001,FGFR2,A,"[""B""]",False,DYKQLKKHATKLLELAKKDPSSKRDLLRTAASYANKVLFEDSDPRA...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_2
3,Grafting_Motif0042_ems_3hM_1661_0001,FGFR2,A,"[""B""]",False,DEKEELERRANRVAFLAIQIQNEEYHRILAELYVQFMKAAENNDTE...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_3
4,Grafting_Motif0042_ems_3hM_257_0001,FGFR2,A,"[""B""]",False,PDNKEKLMSIAVQLILRINEAARSEEQWRYANRAAFAAVEASSGSD...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_4
...,...,...,...,...,...,...,...,...,...
3527,il2ra_site1_2b5i_sap_19_mot_HHH_b2_06055_af2_0...,IL2Ra,A,"[""B""]",False,DLRKYAAELVDRLAEKYNLDSDQYNALVRLASELVWQGKSKEEIEK...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_62
3528,il2ra_site1_2b5i_sap_20_mot_HHH_b1_01934_af2_0...,IL2Ra,A,"[""B""]",False,SKEEIKKEAEELIEELKKKGYNLPLRILEFALKEIEETNSEKYYEQ...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_63
3529,il2ra_site1_2b5i_sap_23_mot_88fc943612ced423dc...,IL2Ra,A,"[""B""]",False,SPEYKKFLELIKEAEAARKAGDLDKAKELLEKALELAKKMKAKSLI...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_64
3530,il2ra_site1_2b5i_sap_25_mot_HHH_b2_01943_00000...,IL2Ra,A,"[""B""]",False,DPLLAYKLLKLSQKALEKAYAEDRERAEELLEEAEAALRSLGDEAG...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_65


In [4]:
# Spot-check one complex by its generated identifier.
interaction_df.loc[interaction_df["target_binder_ID"] == "EGFR_2_279"]

Unnamed: 0,binder_id,target_id,binder_chain,target_chains,binder,A_seq,B_seq,target_id_mod,target_binder_ID
3160,jakublala_full_dibinder_45aa_pdb,EGFR,A,"[""B""]",False,DGYPESNIMTRNPEENIDQFLEKQLYKINLFAREDQQMQQYDRDP,LEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYV...,EGFR_2,EGFR_2_279


In [5]:
# Per-binder working table. Take an explicit copy: the scoring cell further down adds
# "avg_log_likelihood" and "pseudo_perplexity" columns to this frame, and writing into
# a column slice of interaction_df would raise pandas' SettingWithCopyWarning.
meta_binders_Df = interaction_df[["target_binder_ID", "binder_id", "A_seq"]].copy()
meta_binders_Df

Unnamed: 0,target_binder_ID,binder_id,A_seq
0,VirB8_1,EHEE_rd4_0110_min_rise1_21_000000015_0001,LDFIVFAGPEKAIKFYKEMAKRNLEVKIWIDGDWAVVQVK
1,FGFR2_1,Grafting_Motif0040_ems_3hM_3083_0001_0002,SEQDETMHRIVRSVIQHAYKHNDEMAEYFAQNAAEIYKEQNKSEEA...
2,FGFR2_2,Grafting_Motif0042_ems_3hM_148_0001,DYKQLKKHATKLLELAKKDPSSKRDLLRTAASYANKVLFEDSDPRA...
3,FGFR2_3,Grafting_Motif0042_ems_3hM_1661_0001,DEKEELERRANRVAFLAIQIQNEEYHRILAELYVQFMKAAENNDTE...
4,FGFR2_4,Grafting_Motif0042_ems_3hM_257_0001,PDNKEKLMSIAVQLILRINEAARSEEQWRYANRAAFAAVEASSGSD...
...,...,...,...
3527,IL2Ra_62,il2ra_site1_2b5i_sap_19_mot_HHH_b2_06055_af2_0...,DLRKYAAELVDRLAEKYNLDSDQYNALVRLASELVWQGKSKEEIEK...
3528,IL2Ra_63,il2ra_site1_2b5i_sap_20_mot_HHH_b1_01934_af2_0...,SKEEIKKEAEELIEELKKKGYNLPLRILEFALKEIEETNSEKYYEQ...
3529,IL2Ra_64,il2ra_site1_2b5i_sap_23_mot_88fc943612ced423dc...,SPEYKKFLELIKEAEAARKAGDLDKAKELLEKALELAKKMKAKSLI...
3530,IL2Ra_65,il2ra_site1_2b5i_sap_25_mot_HHH_b2_01943_00000...,DPLLAYKLLKLSQKALEKAYAEDRERAEELLEEAEAALRSLGDEAG...


### Loading ESM2 model

In [6]:
# Fetch the ESM-2 650M checkpoint and its alphabet through torch.hub.
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")
model = model.eval()      # inference mode; eval() hands back the same module
model = model.to("cuda")  # move the weights onto the GPU
batch_converter = alphabet.get_batch_converter()

# Show the architecture as the cell output.
model

Using cache found in /zhome/c9/0/203261/.cache/torch/hub/facebookresearch_esm_main


ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0-32): 33 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=660, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_after): LayerNorm((1280,), eps=1

### Loading meta-analysis sequences and pdb files

In [9]:
# Build (a) the ESM input list of (label, binder sequence) pairs and
# (b) a lookup from target_binder_ID to the gzipped PDB path of that complex.
pdb_dir = "/work3/s232958/data/meta_analysis/input_pdbs/"
data = []
pdbs_Dict = {}
# Iterate over the three needed columns directly rather than over `interaction_df.iloc()`,
# which only worked through Python's legacy __getitem__/IndexError iteration fallback.
for target_binder_id, source_binder_id, binder_seq in zip(
    interaction_df["target_binder_ID"], interaction_df["binder_id"], interaction_df["A_seq"]
):
    pdbs_Dict[target_binder_id] = os.path.join(pdb_dir, f"{source_binder_id}.pdb.gz")
    data.append((target_binder_id, binder_seq))

batch_labels, batch_strs, batch_tokens = batch_converter(data)
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)  # non-pad tokens per sequence

print(len(batch_labels), "e.g.:", batch_labels[1]) # binder ids
print(len(batch_strs), "e.g.:", batch_strs[1]) # sequences
print(batch_tokens.shape, "e.g.:", batch_tokens[1]) # shape (batch_size, max_len_in_batch)
# All sequences in the batch are padded to the same length.
# Sanity check of the length computation, using the alphabet's pad id instead of a literal 1.
assert int(batch_lens[1]) == (batch_tokens[1] != alphabet.padding_idx).sum().item()
# batch_lens counts all non-PAD tokens, i.e. it includes BOS/CLS and EOS too. So each batch_lens[i] equals:
# true_residue_length + 2   # (+1 for BOS/CLS at pos 0, +1 for EOS at the end)

3532 e.g.: FGFR2_1
3532 e.g.: SEQDETMHRIVRSVIQHAYKHNDEMAEYFAQNAAEIYKEQNKSEEAKKLVRSLQEYLESILS
torch.Size([3532, 252]) e.g.: tensor([ 0,  8,  9, 16, 13,  9, 11, 20, 21, 10, 12,  7, 10,  8,  7, 12, 16, 21,
         5, 19, 15, 21, 17, 13,  9, 20,  5,  9, 19, 18,  5, 16, 17,  5,  5,  9,
        12, 19, 15,  9, 16, 17, 15,  8,  9,  9,  5, 15, 15,  4,  7, 10,  8,  4,
        16,  9, 19,  4,  9,  8, 12,  4,  8,  2,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  

In [10]:
# Preview the first five (target_binder_ID, pdb path) entries.
[(key, pdbs_Dict[key]) for key in list(pdbs_Dict)[:5]]

[('VirB8_1',
  '/work3/s232958/data/meta_analysis/input_pdbs/EHEE_rd4_0110_min_rise1_21_000000015_0001.pdb.gz'),
 ('FGFR2_1',
  '/work3/s232958/data/meta_analysis/input_pdbs/Grafting_Motif0040_ems_3hM_3083_0001_0002.pdb.gz'),
 ('FGFR2_2',
  '/work3/s232958/data/meta_analysis/input_pdbs/Grafting_Motif0042_ems_3hM_148_0001.pdb.gz'),
 ('FGFR2_3',
  '/work3/s232958/data/meta_analysis/input_pdbs/Grafting_Motif0042_ems_3hM_1661_0001.pdb.gz'),
 ('FGFR2_4',
  '/work3/s232958/data/meta_analysis/input_pdbs/Grafting_Motif0042_ems_3hM_257_0001.pdb.gz')]

In [11]:
# Report GPU memory on device 0, converted from bytes to GB (1024**3 bytes).
allocated_gb = torch.cuda.memory_allocated(0)/1024/1024/1024
print("torch.cuda.memory_allocated: %fGB"%(allocated_gb))
reserved_gb = torch.cuda.memory_reserved(0)/1024/1024/1024
print("torch.cuda.memory_reserved: %fGB"%(reserved_gb))
peak_reserved_gb = torch.cuda.max_memory_reserved(0)/1024/1024/1024
print("torch.cuda.max_memory_reserved: %fGB"%(peak_reserved_gb))

torch.cuda.memory_allocated: 2.425342GB
torch.cuda.memory_reserved: 2.445312GB
torch.cuda.max_memory_reserved: 2.445312GB


### Pseudo-perplexity (torch.exp(-average_log_likelihood)) calculation

In [12]:
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items each.

    ``iterable`` must support ``len()`` and slicing; the final slice may be shorter.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = start + n
        if stop > total:
            stop = total
        yield iterable[start:stop]

def _generate_masked_sequences(sequence: str, mask_length: list):
    """
    Generates sequences with a sliding window of masks of length N.

    Args:
        sequence (str): The input protein sequence.
        mask_length (int): The number of adjacent tokens to mask (N).

    Yields:
        A tuple containing the masked sequence string, the start index of the
        mask, and the end index of the mask.
    """
    np.random.seed(0)
    seq_indexes = np.arange(0,len(sequence))
    np.random.shuffle(seq_indexes)
    # print(seq_indexes)
    batched_seq_indexes = list(batch(seq_indexes,n=1))
    # print(batched_seq_indexes)
    all_sequences = []
    for masked_index in batched_seq_indexes: # if mask_length=1, then only 1 index
        seq_copy = list(sequence).copy() # make sequence into list
        for index in masked_index:
            seq_copy[index] = "<mask>"
            
        all_sequences.append((masked_index,"".join(seq_copy)))
    
    return all_sequences

# Example binder sequence used to sanity-check the masking helper (one mask per copy).
seq = "LDFIVFAGPEKAIKFYKEMAKRNLEVKIWIDGDWAVVQVK"
all_sequences = _generate_masked_sequences(seq, 1)

In [13]:
# Preview the first five (masked positions, masked sequence) pairs.
all_sequences[:5]

[(array([22]), 'LDFIVFAGPEKAIKFYKEMAKR<mask>LEVKIWIDGDWAVVQVK'),
 (array([20]), 'LDFIVFAGPEKAIKFYKEMA<mask>RNLEVKIWIDGDWAVVQVK'),
 (array([25]), 'LDFIVFAGPEKAIKFYKEMAKRNLE<mask>KIWIDGDWAVVQVK'),
 (array([4]), 'LDFI<mask>FAGPEKAIKFYKEMAKRNLEVKIWIDGDWAVVQVK'),
 (array([10]), 'LDFIVFAGPE<mask>AIKFYKEMAKRNLEVKIWIDGDWAVVQVK')]

In [14]:
# First entry: the masked position(s), then the sequence carrying the <mask> token.
print(*all_sequences[0], sep="\n")

[22]
LDFIVFAGPEKAIKFYKEMAKR<mask>LEVKIWIDGDWAVVQVK


In [15]:
# Quick tour of the ESM alphabet, one value per output line.
print(
    len(alphabet),           # vocabulary size
    alphabet.all_toks,       # list of token strings
    alphabet.get_idx('A'),   # int id for 'A'
    alphabet.get_tok(0),     # token string for id 0
    sep="\n",
)

33
['<cls>', '<pad>', '<eos>', '<unk>', 'L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-', '<null_1>', '<mask>']
5
<cls>


In [16]:
# NOTE: this repeats the checkpoint load from the "Loading ESM2 model" section.
# torch.nn.Module instances start in training mode, so the reloaded model must be
# switched to eval() again before it is used for scoring.
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")
model = model.eval().to("cuda")
batch_converter = alphabet.get_batch_converter()

Using cache found in /zhome/c9/0/203261/.cache/torch/hub/facebookresearch_esm_main


In [17]:
# Second (label, binder sequence) pair of the ESM input list built earlier.
data[1]

('FGFR2_1', 'SEQDETMHRIVRSVIQHAYKHNDEMAEYFAQNAAEIYKEQNKSEEAKKLVRSLQEYLESILS')

In [20]:
@torch.no_grad()
def calculate_pll_score(sequence: str, mask_length: int = 1, batch_size: int = None) -> "tuple[float, float, int, int]":
    """
    Score a sequence by how well the ESM model recovers its masked residues.

    Every residue is masked once (see ``_generate_masked_sequences``), the masked
    copies are run through the module-level ``model``, and the log-probability that
    the model assigns to the true residue at each masked position is collected.
    When ``mask_length=1`` this is the standard pseudo-log-likelihood (pLL).

    Relies on the module-level ``model``, ``alphabet`` and ``batch_converter``.

    Args:
        sequence (str): The input protein sequence.
        mask_length (int): Number of positions masked in each copy.
        batch_size (int | None): Number of masked copies per forward pass.
            ``None`` (default) runs all copies in a single pass.

    Returns:
        tuple: ``(avg_log_likelihood, pseudo_perplexity, count_all, count_correct)``
            - avg_log_likelihood (float): summed true-residue log-probability divided
              by ``len(sequence)``; higher is better.
            - pseudo_perplexity (float): ``exp(-avg_log_likelihood)``; lower is better.
            - count_all (int): number of masked positions scored.
            - count_correct (int): positions where the arg-max token is the true residue.

    Raises:
        ValueError: If ``sequence`` is empty or not a string, or ``batch_size`` < 1.
    """
    if not sequence or not isinstance(sequence, str):
        raise ValueError("Input sequence must be a non-empty string.")
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None.")

    # 1. One masked copy per group of masked positions. The helper seeds its own
    #    shuffle, so no global reseeding is needed here.
    masked_data = _generate_masked_sequences(sequence, mask_length)

    # 2. Tokenise; every copy has the same token length (CLS + residues + EOS).
    ESM_input = [(i, masked_seq) for i, (_, masked_seq) in enumerate(masked_data)]
    _, _, batch_tokens = batch_converter(ESM_input)

    # 3. Forward pass(es) on whatever device the model lives on.
    device = next(model.parameters()).device
    step = batch_tokens.shape[0] if batch_size is None else batch_size
    log_prob_chunks = []
    for start in range(0, batch_tokens.shape[0], step):
        tokens = batch_tokens[start:start + step].to(device)
        logits = model(tokens, repr_layers=[33], return_contacts=False)["logits"]
        # log-softmax over the vocabulary (last) dimension
        log_prob_chunks.append(torch.nn.functional.log_softmax(logits, dim=-1))
    log_probs = torch.cat(log_prob_chunks, dim=0)  # (n_copies, n_tokens, vocab)

    # 4. Gather the scores at the masked positions in one indexing operation.
    variant_rows, token_cols, true_ids = [], [], []
    for i, (masked_index, _) in enumerate(masked_data):
        for j in masked_index:  # more than one entry when mask_length > 1
            variant_rows.append(i)
            token_cols.append(int(j) + 1)  # +1 shifts past the leading CLS token
            true_ids.append(alphabet.get_idx(sequence[j]))

    rows = torch.tensor(variant_rows, device=device)
    cols = torch.tensor(token_cols, device=device)
    targets = torch.tensor(true_ids, device=device)

    site_log_probs = log_probs[rows, cols]  # (n_masked_positions, vocab)
    log_likelihood = site_log_probs.gather(1, targets.unsqueeze(1)).sum()

    count_meta_all = len(true_ids)
    count_meta_correct = int((site_log_probs.argmax(dim=-1) == targets).sum().item())

    # Average log-likelihood per residue (the higher the better).
    avg_log_likelihood = log_likelihood / len(sequence)

    # Pseudo-perplexity (the lower the better). avg_log_likelihood is already a tensor,
    # so it is used directly instead of being re-wrapped in torch.tensor().
    pseudo_perplexity = torch.exp(-avg_log_likelihood).item()
    return float(avg_log_likelihood), pseudo_perplexity, count_meta_all, count_meta_correct

In [None]:
# Score every binder sequence and attach the results as two new columns.
# Results are collected in lists and assigned once at the end. This avoids mixing
# positional (.iloc[i]) with label-based (.at[i]) access and writing cell by cell into
# a frame that may be a slice of interaction_df.
Count_Meta_all, Count_Meta_correct = 0, 0
avg_log_likelihoods, pseudo_perplexities = [], []

binder_sequences = meta_binders_Df["A_seq"].tolist()
for i in trange(len(binder_sequences)):
    seq = binder_sequences[i]
    avg_log_likelihood, pll, count_all, count_correct = calculate_pll_score(sequence=seq, mask_length=1)
    Count_Meta_all += count_all
    Count_Meta_correct += count_correct

    avg_log_likelihoods.append(avg_log_likelihood)
    pseudo_perplexities.append(pll)

# The lists are in row order, so they line up with the frame regardless of its index labels.
meta_binders_Df = meta_binders_Df.assign(
    avg_log_likelihood=avg_log_likelihoods,
    pseudo_perplexity=pseudo_perplexities,
)

  pll = torch.exp(-torch.tensor(avg_log_likelihood)).item() # the lower the better
  1%|▋                                                                              | 32/3532 [01:07<2:34:06,  2.64s/it]

In [18]:
# Give the binder sequence column its export name.
meta_binders_Df = meta_binders_Df.rename({"A_seq": "seq_binder"}, axis="columns")
meta_binders_Df

Unnamed: 0,target_binder_ID,binder_id,seq_binder,avg_log_likelihood,pseudo_perplexity
0,VirB8_1,EHEE_rd4_0110_min_rise1_21_000000015_0001,LDFIVFAGPEKAIKFYKEMAKRNLEVKIWIDGDWAVVQVK,-2.749460,15.634179
1,FGFR2_1,Grafting_Motif0040_ems_3hM_3083_0001_0002,SEQDETMHRIVRSVIQHAYKHNDEMAEYFAQNAAEIYKEQNKSEEA...,-2.732485,15.371030
2,FGFR2_2,Grafting_Motif0042_ems_3hM_148_0001,DYKQLKKHATKLLELAKKDPSSKRDLLRTAASYANKVLFEDSDPRA...,-2.527573,12.523080
3,FGFR2_3,Grafting_Motif0042_ems_3hM_1661_0001,DEKEELERRANRVAFLAIQIQNEEYHRILAELYVQFMKAAENNDTE...,-2.289752,9.872489
4,FGFR2_4,Grafting_Motif0042_ems_3hM_257_0001,PDNKEKLMSIAVQLILRINEAARSEEQWRYANRAAFAAVEASSGSD...,-2.577624,13.165817
...,...,...,...,...,...
3527,IL2Ra_62,il2ra_site1_2b5i_sap_19_mot_HHH_b2_06055_af2_0...,DLRKYAAELVDRLAEKYNLDSDQYNALVRLASELVWQGKSKEEIEK...,-2.042634,7.710890
3528,IL2Ra_63,il2ra_site1_2b5i_sap_20_mot_HHH_b1_01934_af2_0...,SKEEIKKEAEELIEELKKKGYNLPLRILEFALKEIEETNSEKYYEQ...,-1.827751,6.219881
3529,IL2Ra_64,il2ra_site1_2b5i_sap_23_mot_88fc943612ced423dc...,SPEYKKFLELIKEAEAARKAGDLDKAKELLEKALELAKKMKAKSLI...,-1.441779,4.228210
3530,IL2Ra_65,il2ra_site1_2b5i_sap_25_mot_HHH_b2_01943_00000...,DPLLAYKLLKLSQKALEKAYAEDRERAEELLEEAEAALRSLGDEAG...,-2.069157,7.918145


In [19]:
# Persist the per-binder scores without the index column.
meta_binders_Df.to_csv(
    path_or_buf="/work3/s232958/data/meta_analysis/averageLL_pPLL.csv",
    index=False,
)