In [1]:
# --- Standard library -------------------------------------------------------
import ast
import os
import random
import sys

# Pin the physical GPU and allocator behaviour BEFORE torch is imported.
# These variables are read when CUDA is initialised, so setting them first
# makes the pinning independent of anything the imports below might trigger.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# --- Third party ------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm, trange

# --- Project local ----------------------------------------------------------
import training_utils.partitioning_utils as pat_utils

print("Kernel Python:", sys.executable)
print("PyTorch:", torch.__version__)

# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == "cuda":
    # Only touch the CUDA API when a GPU is actually present, so the cell
    # still runs (on CPU) instead of raising.
    torch.cuda.set_device(0)  # 0 == "first visible" -> actually GPU 2 on the node
    print(torch.cuda.get_device_name(0))
print('Using device:', device)
print("Current location:", os.getcwd())

print(torch.version.cuda)   # shows the CUDA version PyTorch was built against
print(torch.backends.cudnn.version())  # cuDNN version
print(torch.cuda.is_available())

Kernel Python: /work3/s232958/envs/esm_gpu/bin/python
PyTorch: 2.5.1
Tesla V100-SXM2-32GB
Using device: cuda
Current location: /zhome/c9/0/203261/DBL046_PP_osaul/DBL046_PP_osaul/tmp/ona_drafts
12.1
90100
True


In [2]:
def print_mem_consumption(device_index=0):
    """Print a memory snapshot, in GB (bytes / 1e9), for one CUDA device.

    Four figures are printed: the device's total memory, the memory PyTorch
    has reserved from CUDA, the part of that reservation currently allocated
    to tensors, and the remainder (reserved minus allocated).

    Args:
        device_index: CUDA device index to query. Defaults to 0.
    """
    total_bytes = torch.cuda.get_device_properties(device_index).total_memory
    reserved_bytes = torch.cuda.memory_reserved(device_index)
    allocated_bytes = torch.cuda.memory_allocated(device_index)
    # Reserved by the caching allocator but not handed out to any tensor.
    free_in_pool_bytes = reserved_bytes - allocated_bytes

    print("Total memory: ", total_bytes / 1e9)
    print("Reserved memory: ", reserved_bytes / 1e9)
    # True division like the other lines: floor division would report 0.0
    # for every allocation below 1 GB.
    print("Allocated memory: ", allocated_bytes / 1e9)
    print("Free memory: ", free_in_pool_bytes / 1e9)
# Snapshot taken before any model has been moved to the GPU in this notebook.
print_mem_consumption()

Total memory:  34.072559616
Reserved memory:  0.0
Allocated memory:  0.0
Free memory:  0.0


### Loading meta-analysis

In [3]:
# Read the meta-analysis interaction table, keep the four columns used later
# and rename them to the binder/target vocabulary used in this notebook.
meta_interactions_df = (
    pd.read_csv("/work3/s232958/data/meta_analysis/interaction_df_metaanal.csv")
    .loc[:, ["A_seq", "B_seq", "target_id_mod", "target_binder_ID"]]
    .rename(
        columns={
            "A_seq": "seq_binder",
            "B_seq": "seq_target",
            "target_id_mod": "target_id",
            "target_binder_ID": "binder_id",
        }
    )
)
meta_interactions_df

Unnamed: 0,seq_binder,seq_target,target_id,binder_id
0,LDFIVFAGPEKAIKFYKEMAKRNLEVKIWIDGDWAVVQVK,ANPYISVANIMLQNYVKQREKYNYDTLKEQFTFIKNASTSIVYMQF...,VirB8,VirB8_1
1,SEQDETMHRIVRSVIQHAYKHNDEMAEYFAQNAAEIYKEQNKSEEA...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_1
2,DYKQLKKHATKLLELAKKDPSSKRDLLRTAASYANKVLFEDSDPRA...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_2
3,DEKEELERRANRVAFLAIQIQNEEYHRILAELYVQFMKAAENNDTE...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_3
4,PDNKEKLMSIAVQLILRINEAARSEEQWRYANRAAFAAVEASSGSD...,RSPHRPILQAGLPANASTVVGGDVEFVCKVYSDAQPHIQWIKHVPY...,FGFR2,FGFR2_4
...,...,...,...,...
3527,DLRKYAAELVDRLAEKYNLDSDQYNALVRLASELVWQGKSKEEIEK...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_62
3528,SKEEIKKEAEELIEELKKKGYNLPLRILEFALKEIEETNSEKYYEQ...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_63
3529,SPEYKKFLELIKEAEAARKAGDLDKAKELLEKALELAKKMKAKSLI...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_64
3530,DPLLAYKLLKLSQKALEKAYAEDRERAEELLEEAEAALRSLGDEAG...,ELCDDDPPEIPHATFKAMAYKEGTMLNCECKRGFRRIKSGSLYMLC...,IL2Ra,IL2Ra_65


### Loading PPint data

In [4]:
# Read the clustered PPint sample, then add the length of each chain sequence
# and a pair identifier built from the two chain IDs.
# (Alternative input kept for reference:
#  /zhome/c9/0/203261/DBL046_PP_osaul/DBL046_PP_osaul/tmp/data/PPint_DB/PPint_sample.csv)
PPint_interactions_df = pd.read_csv("/work3/s232958/data/PPint_DB/PPint_sample_muClustering.csv")
PPint_interactions_df["seq1_len"] = PPint_interactions_df["seq1"].map(len)
PPint_interactions_df["seq2_len"] = PPint_interactions_df["seq2"].map(len)
PPint_interactions_df["target_binder_id"] = (
    PPint_interactions_df["ID1"] + "_" + PPint_interactions_df["ID2"]
)
PPint_interactions_df

Unnamed: 0,interface_id,seq1,seq2,ID1,ID2,dimer,seq_target_len,seq_binder_len,target_binder_id,cluster,seq1_len,seq2_len
0,1JEB_2,SLTKTERTIIVSMWAKISTQADTIGTETLERLFLSHPQTKTYFPHF...,VHLTDAEKAAVSGLWGKVNADEVGGEALGRLLVVYPWTQRYFDSFG...,1JEB_2_C,1JEB_2_D,False,141,146,1JEB_2_C_1JEB_2_D,852,141,146
1,7B12_23,TTTLAFKFQHGVIAAVDSRASAGSYISALRVNKVIEINPYLLGTMS...,DRGVNTFSPEGRLFQVEYAIEAIKLGSTAIGIQTSEGVCLAVEKRI...,7B12_23_Z,7B12_23_s,False,203,230,7B12_23_Z_7B12_23_s,523,203,230
2,6VCD_1,ITHLPPEVMLSIFSYLNPQELCRCSQVSMKWSQLTKTGSLWKHLYP...,PSIKLQSSDGEIFEVDVEIAKQSVTIKTMLEDLGDPVPLPNVNAAI...,6VCD_1_B,6VCD_1_C,False,255,135,6VCD_1_B_6VCD_1_C,2294,255,135
3,2OKG_0,NAKDVLGLTLLEKTLKERLNLKDAIIVSGDSDQSPWVKKEGRAAVA...,AKDVLGLTLLEKTLKERLNLKDAIIVSGDSDQSPWVKKEGRAAVAC...,2OKG_0_A,2OKG_0_B,True,241,243,2OKG_0_A_2OKG_0_B,8029,241,243
4,3MBX_0,DIVMSQSPSSLAVSVGEKVTMSCKSSQSLLYNNNQKNYLAWYQQKP...,VTLKESGPGILQPSQTLSLTCSFSGFSLSTYGMGVGWIRQPSGKGL...,3MBX_0_L,3MBX_0_H,False,220,229,3MBX_0_L_3MBX_0_H,82,220,229
...,...,...,...,...,...,...,...,...,...,...,...,...
2467,3HR4_0,REIPLKVLVKAVLFACMLMRKTMASRVRVTILFATETGKSEALAWD...,QLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAEL...,3HR4_0_A,3HR4_0_B,False,189,145,3HR4_0_A_3HR4_0_B,218,189,145
2468,6D3M_0,NKYRFIDVQPLTGVLGAEITGVDLREPLDDSTWNEILDAFHTYQVI...,KYRFIDVQPLTGVLGAEITGVDLREPLDDSTWNEILDAFHTYQVIY...,6D3M_0_A,6D3M_0_B,True,286,285,6D3M_0_A_6D3M_0_B,630,286,285
2469,4LUB_0,TMKIAYLGPSGSFTHNVALHAFPAADLLPFENITEVIKAYESKQVC...,TMKIAYLGPSGSFTHNVALHAFPAADLLPFENITEVIKAYESKQVC...,4LUB_0_A,4LUB_0_B,True,188,190,4LUB_0_A_4LUB_0_B,9344,188,190
2470,4MN4_2,ERDEVGARKNAVDEEIERLSQPGDQRLNALAERFGGVLLSEIYDDV...,EPVTIVLSQGWVRSAKGHDIDAPGLNYKAGDSFKAAVKGKSNQPVV...,4MN4_2_D,4MN4_2_B,False,154,236,4MN4_2_D_4MN4_2_B,1366,154,236


In [7]:
# Inspect a single chain pair by its pair identifier.
PPint_interactions_df.loc[PPint_interactions_df["target_binder_id"] == "6M9S_0_A_6M9S_0_B"]

Unnamed: 0,interface_id,seq1,seq2,ID1,ID2,dimer,seq_target_len,seq_binder_len,target_binder_id,cluster,seq1_len,seq2_len
411,6M9S_0,HVPPHVPFELSGAELRDAIVQYATNPIYHDNLDWLNHDNPYRRQLR...,HVPPHVPFELSGAELRDAIVQYATNPIYHDNLDWLNHDNPYRRQLR...,6M9S_0_A,6M9S_0_B,True,450,443,6M9S_0_A_6M9S_0_B,9046,450,443


In [10]:
# Pick whole clusters for validation until they hold 20% of the rows.
# Clusters are visited in a seeded random order; a cluster that would push the
# running total past the target is skipped, and the walk stops as soon as the
# target is hit exactly.
clusters_counts_Dict = dict(PPint_interactions_df["cluster"].value_counts())
clusters_counts = list(clusters_counts_Dict.items())

random.seed(0)
random.shuffle(clusters_counts)

ceil = round(len(PPint_interactions_df) * 0.2)  # target number of validation rows
counter = 0
val_clusters = []

for cluster_id, cluster_size in clusters_counts:
    if counter + cluster_size > ceil:
        # Would overshoot the target: leave this cluster out and keep looking.
        continue

    counter += cluster_size
    val_clusters.append(cluster_id)

    if counter == ceil:
        break

# Recount from the chosen clusters as a sanity check against the target.
counter = sum(clusters_counts_Dict[cluster_id] for cluster_id in val_clusters)
print(counter)
print(ceil)

494
494


In [None]:
# Sizes of the 20 most populated clusters.
PPint_interactions_df["cluster"].value_counts().head(20)

### Clusters from Christian

In [31]:
path_to_mmseqs_clustering = "/work3/s232958/data/PPint_DB/3_å_dataset5_singlefasta/clusterRes40"
all_seqs, clust, clust_keys = pat_utils.mmseqs_parser(path_to_mmseqs_clustering)

path_to_interaction_df = "/work3/s232958/data/PPint_DB/disordered_interfaces_no_cutoff_filtered_nonredundant80_3å_5.csv.gz"
disordered_interfaces_df = pd.read_csv(path_to_interaction_df, index_col=0).reset_index(drop=True)

# Per-chain lookup key "<PDB>_<chain>_<row number>"; built with vectorised
# string concatenation instead of a Python-level pass over every row.
disordered_interfaces_df["PDB_chain_name"] = (
    disordered_interfaces_df["PDB"] + "_" + disordered_interfaces_df["chainname"]
)
disordered_interfaces_df["index_num"] = np.arange(len(disordered_interfaces_df))
disordered_interfaces_df["chain_name_index"] = (
    disordered_interfaces_df["PDB_chain_name"]
    + "_"
    + disordered_interfaces_df["index_num"].astype(str)
)
disordered_interfaces_df = disordered_interfaces_df.set_index("PDB_interface_name")

# The CSV stores the residue lists as text; parse them back into Python objects.
disordered_interfaces_df["interface_residues"] = disordered_interfaces_df["interface_residues"].apply(ast.literal_eval)
# "inter_chain_hamming" is read from the CSV as-is; it is not recomputed here.
disordered_interfaces_df["dimer"] = disordered_interfaces_df["inter_chain_hamming"] > 0.60
# Chains absent from clust_keys get None, exactly as with dict.get.
disordered_interfaces_df["clust_keys"] = [
    clust_keys.get(chain_key) for chain_key in disordered_interfaces_df["chain_name_index"]
]

# Collect the cluster keys of every chain belonging to the same interface.
# Grouping on the index keeps first-appearance order, gives tqdm an exact
# integer total, and also copes with an interface that has a single row
# (a per-label .loc lookup returns a scalar there and has no .values).
clust_keys_by_interface = disordered_interfaces_df.groupby(level=0, sort=False)["clust_keys"]
pdb_interface_and_clust_keys = {
    interface_name: interface_keys.tolist()
    for interface_name, interface_keys in tqdm(
        clust_keys_by_interface, total=clust_keys_by_interface.ngroups
    )
}
new_clusters, new_clusters_clustkeys = pat_utils.recluster_mmseqs_keys_to_non_overlapping_groups(pdb_interface_and_clust_keys)

### Creating train and test datasets based on train and test indexes
train_indexes, test_indexes = pat_utils.run_train_test_partition(interaction_df=disordered_interfaces_df,
                                                    clustering=new_clusters, # Clusters from Bidentate-graphs
                                                    train_ratio=0.8, 
                                                    test_ratio=0.2, 
                                                    v=True, 
                                                    seed=0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 24725/24725.0 [00:31<00:00, 772.83it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 27834/27834 [00:00<00:00, 883261.89it/s]

0.8
0.2





In [42]:
# Per-chain identifier "<PDB>_<interface_index>_<chain>", built column-wise
# rather than with a Python loop over rows.
disordered_interfaces_df["ID"] = (
    disordered_interfaces_df["PDB"]
    + "_"
    + disordered_interfaces_df["interface_index"].astype(str)
    + "_"
    + disordered_interfaces_df["chainname"]
)
disordered_interfaces_df["PDB_interface_name"] = disordered_interfaces_df.index

# Gather, per interface, the sequences and IDs of its chains in row order.
grouped = {}
chain_rows = zip(
    disordered_interfaces_df["PDB_interface_name"],
    disordered_interfaces_df["sequence"],
    disordered_interfaces_df["ID"],
    disordered_interfaces_df["dimer"],
)
for iface, seq, rid, dimer in chain_rows:
    entry = grouped.get(iface)
    if entry is None:
        # First chain of this interface: remember its dimer flag.
        entry = grouped[iface] = {"sequences": [], "IDs": [], "dimer": dimer}
    elif entry["dimer"] != dimer:
        # Sanity check: the flag is expected to be the same for every chain
        # of one interface.
        print(f"Warning: multiple dimers for interface {iface}:",
              entry['dimer'], "vs", dimer)

    entry["sequences"].append(seq)
    entry["IDs"].append(rid)

# One record per interface with at least two chains; only the first two
# chains are used as (seq1, seq2).
records = [
    {
        "interface_id": iface,
        "seq1": vals["sequences"][0],
        "seq2": vals["sequences"][1],
        "ID1": vals["IDs"][0],
        "ID2": vals["IDs"][1],
        "dimer": vals["dimer"],
    }
    for iface, vals in grouped.items()
    if len(vals["sequences"]) >= 2 and len(vals["IDs"]) >= 2
]

# Explicit columns keep the frame well-formed even if no interface qualifies.
PPint_interactions_NEW = pd.DataFrame(
    records, columns=["interface_id", "seq1", "seq2", "ID1", "ID2", "dimer"]
)
PPint_interactions_NEW["seq_target_len"] = PPint_interactions_NEW["seq1"].map(len)
PPint_interactions_NEW["seq_binder_len"] = PPint_interactions_NEW["seq2"].map(len)
PPint_interactions_NEW["target_binder_id"] = PPint_interactions_NEW["ID1"] + "_" + PPint_interactions_NEW["ID2"]

# sample random 10% of the train and of the test interface ids (seeded)
random.seed(0)
train_indexes_sample = random.sample(train_indexes, int(len(train_indexes) * 0.1))
test_indexes_sample = random.sample(test_indexes, int(len(test_indexes) * 0.1))

In [44]:
# Keep only the interfaces that were drawn into either sample (train or test).
sampled_interface_ids = [*train_indexes_sample, *test_indexes_sample]
mask = PPint_interactions_NEW["interface_id"].isin(sampled_interface_ids)

PPint_interactions_df = PPint_interactions_NEW[mask].copy()
PPint_interactions_df

Unnamed: 0,interface_id,seq1,seq2,ID1,ID2,dimer,seq_target_len,seq_binder_len,target_binder_id
4,6IDB_0,DKICLGHHAVSNGTKVNTLTERGVEVVNATETVERTNIPRICSKGK...,GLFGAIAGFIENGWEGLIDGWYGFRHQNAQGEGTAADYKSTQSAID...,6IDB_0_A,6IDB_0_B,False,317,172,6IDB_0_A_6IDB_0_B
8,2WZP_3,VQLQESGGGLVQAGGSLRLSCTASRRTGSNWCMGWFRQLAGKEPEL...,TIKNFTFFSPNSTEFPVGSNNDGKLYMMLTGMDYRTIRRKDWSSPL...,2WZP_3_D,2WZP_3_G,False,122,266,2WZP_3_D_2WZP_3_G
11,1ZKP_0,LYFQSNAKTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGVLA...,AKTVVGFWGGFPEAGEATSGYLFEHDGFRLLVDCGSGVLAQLQKYI...,1ZKP_0_A,1ZKP_0_C,True,246,240,1ZKP_0_A_1ZKP_0_C
25,6GRH_3,SKHELSLVEVTHYTDPEVLAIVKDFHVRGNFASLPEFAERTFVSAV...,MINVYSNLMSAWPATMAMSPKLNRNMPTFSQIWDYERITPASAAGE...,6GRH_3_C,6GRH_3_D,False,266,396,6GRH_3_C_6GRH_3_D
35,8R57_1,DLMTALQLVMKKSSAHDGLVKGLREAAKAIEKHAAQICVLAEDCDQ...,PKKQKHKHKKVKLAVLQFYKVDDATGKVTRLRKECPNADCGAGTFM...,8R57_1_M,8R57_1_f,False,118,64,8R57_1_M_8R57_1_f
...,...,...,...,...,...,...,...,...,...
24688,3CKI_0,DPMKNTCKLLVVADHRFYRYMGRGEESTTTNYLIELIDRVDDIYRN...,CTCSPSHPQDAFCNSDIVIRAKVVGKKLVKEGPFGTLVYTIKQMKM...,3CKI_0_A,3CKI_0_B,False,256,121,3CKI_0_A_3CKI_0_B
24697,7MHY_1,QVQLRQSGAELAKPGASVKMSCKASGYTFTNYWLHWIKQRPGQGLE...,DVLMTQTPLSLPVSLGDQVSISCRSSQSIVHNTYLEWYLQKPGQSP...,7MHY_1_M,7MHY_1_N,False,118,109,7MHY_1_M_7MHY_1_N
24698,7MHY_2,IQLVQSGPELVKISCKASGYTFTNYGMNWVRQAPGKGLKWMGWINT...,VLMTQTPLSLPVSISCRSSQSIVHSNGNTYLEWYLQKPGQSPKLLI...,7MHY_2_O,7MHY_2_P,False,100,94,7MHY_2_O_7MHY_2_P
24715,6WDS_0,SVLTQPPSASGTPGQRVTISCSGSSSNIEYNYVYWYQKFPGTAPKL...,VQLVESGGGLVKPGGLRLSCAASGFTFSTYIMTWVRQAPGRGLEWV...,6WDS_0_L,6WDS_0_H,False,106,115,6WDS_0_L_6WDS_0_H


### Loading ESM2

In [12]:
# Load ESM-2 (33 layers, 650M parameters) through torch.hub so that this cell
# does not depend on an `import esm`, which the notebook never performs.
# Larger alternative checkpoint: "esm2_t48_15B_UR50D".
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")
# Inference only: disable dropout and move the weights to the GPU.
model = model.eval().to("cuda")

# Turns (label, sequence) pairs into the token tensor the model consumes.
batch_converter = alphabet.get_batch_converter()

Using cache found in /work3/s232958/torch/hub/facebookresearch_esm_main


ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0-32): 33 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=660, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_after): LayerNorm((1280,), eps=1

### Encoding PPint_DB

In [13]:
path_to_output_embeddings_targets = "/work3/s232958/data/PPint_DB/targets_embeddings_esm2"
path_to_output_embeddings_binders = "/work3/s232958/data/PPint_DB/binders_embeddings_esm2"

os.makedirs(path_to_output_embeddings_targets, exist_ok=True)
os.makedirs(path_to_output_embeddings_binders, exist_ok=True)

# Send tokens to wherever the model weights live instead of hard-coding "cuda".
model_device = next(model.parameters()).device


def to_numpy(x):
    """Return `x` as a NumPy array; torch tensors are detached and moved to CPU first."""
    try:
        return x.detach().cpu().numpy()
    except AttributeError:
        return np.asarray(x)


def embed_sequence(seq_id, sequence):
    """Return the final-layer ESM2 token representations of one sequence.

    Args:
        seq_id: Label passed to the batch converter alongside the sequence.
        sequence: Sequence string to encode.

    Returns:
        NumPy array holding one row per non-padding token of the sequence.
    """
    _, _, tokens = batch_converter([(seq_id, sequence)])
    # Number of non-padding tokens.
    # NOTE(review): this count presumably still includes the special start/end
    # tokens added by the batch converter, so the saved array would have
    # len(sequence) + 2 rows -- confirm this is what downstream code expects.
    n_tokens = int((tokens != alphabet.padding_idx).sum(1)[0])
    final_layer = model.num_layers
    with torch.no_grad():
        reps = model(
            tokens.to(model_device),
            repr_layers=[final_layer],
            return_contacts=False,
        )["representations"][final_layer]  # shape: [1, seq_len, hidden_dim]
    return to_numpy(reps[0, :n_tokens])


seen_targets = set()
seen_binders = set()

pair_rows = zip(
    PPint_interactions_df["ID1"],
    PPint_interactions_df["seq1"],
    PPint_interactions_df["ID2"],
    PPint_interactions_df["seq2"],
)
for target_id, seq_target, binder_id, seq_binder in tqdm(
    pair_rows, total=len(PPint_interactions_df), desc="Embedding"
):
    # Same treatment for both chains of the pair; each ID is embedded and
    # written at most once per run.
    for seq_id, sequence, out_dir, seen in (
        (target_id, seq_target, path_to_output_embeddings_targets, seen_targets),
        (binder_id, seq_binder, path_to_output_embeddings_binders, seen_binders),
    ):
        if seq_id in seen:
            continue
        np.save(os.path.join(out_dir, f"{seq_id}.npy"), embed_sequence(seq_id, sequence))
        seen.add(seq_id)

Embedding: 100%|████████████████████████████████████████████████████████████████████████████████████████| 2472/2472 [03:38<00:00, 11.34it/s]


### Encoding meta analysis

In [None]:
path_to_output_embeddings_targets = "/work3/s232958/data/meta_analysis/targets_embeddings_esm2"
path_to_output_embeddings_binders = "/work3/s232958/data/meta_analysis/binders_embeddings_esm2"

os.makedirs(path_to_output_embeddings_targets, exist_ok=True)
os.makedirs(path_to_output_embeddings_binders, exist_ok=True)

# Send tokens to wherever the model weights live instead of hard-coding "cuda".
model_device = next(model.parameters()).device


def to_numpy(x):
    """Return `x` as a NumPy array; torch tensors are detached and moved to CPU first."""
    try:
        return x.detach().cpu().numpy()
    except AttributeError:
        return np.asarray(x)


def embed_sequence(seq_id, sequence):
    """Return the final-layer ESM2 token representations of one sequence.

    Args:
        seq_id: Label passed to the batch converter alongside the sequence.
        sequence: Sequence string to encode.

    Returns:
        NumPy array holding one row per non-padding token of the sequence.
    """
    _, _, tokens = batch_converter([(seq_id, sequence)])
    # Number of non-padding tokens.
    # NOTE(review): this count presumably still includes the special start/end
    # tokens added by the batch converter, so the saved array would have
    # len(sequence) + 2 rows -- confirm this is what downstream code expects.
    n_tokens = int((tokens != alphabet.padding_idx).sum(1)[0])
    final_layer = model.num_layers
    with torch.no_grad():
        reps = model(
            tokens.to(model_device),
            repr_layers=[final_layer],
            return_contacts=False,
        )["representations"][final_layer]  # shape: [1, seq_len, hidden_dim]
    return to_numpy(reps[0, :n_tokens])


seen_targets = set()
seen_binders = set()

pair_rows = zip(
    meta_interactions_df["target_id"],
    meta_interactions_df["seq_target"],
    meta_interactions_df["binder_id"],
    meta_interactions_df["seq_binder"],
)
for target_id, seq_target, binder_id, seq_binder in tqdm(
    pair_rows, total=len(meta_interactions_df), desc="Embedding"
):
    # Same treatment for both chains of the pair; each ID is embedded and
    # written at most once per run (many rows share the same target).
    for seq_id, sequence, out_dir, seen in (
        (target_id, seq_target, path_to_output_embeddings_targets, seen_targets),
        (binder_id, seq_binder, path_to_output_embeddings_binders, seen_binders),
    ):
        if seq_id in seen:
            continue
        np.save(os.path.join(out_dir, f"{seq_id}.npy"), embed_sequence(seq_id, sequence))
        seen.add(seq_id)

### Contact maps PPint_DB

In [14]:
path_to_output_contacts_targets = "/work3/s232958/data/PPint_DB/targets_contacts"
path_to_output_contacts_binders = "/work3/s232958/data/PPint_DB/binders_contacts"

os.makedirs(path_to_output_contacts_targets, exist_ok=True)
os.makedirs(path_to_output_contacts_binders, exist_ok=True)

# Send tokens to wherever the model weights live instead of hard-coding "cuda".
model_device = next(model.parameters()).device


def to_numpy(x):
    """Return `x` as a NumPy array; torch tensors are detached and moved to CPU first."""
    try:
        return x.detach().cpu().numpy()
    except AttributeError:
        return np.asarray(x)


def predict_contacts(seq_id, sequence):
    """Return the ESM2 predicted contact map of one sequence.

    Args:
        seq_id: Label passed to the batch converter alongside the sequence.
        sequence: Sequence string to run through the model.

    Returns:
        Square NumPy array taken from the model's "contacts" output.
    """
    _, _, tokens = batch_converter([(seq_id, sequence)])
    n_tokens = int((tokens != alphabet.padding_idx).sum(1)[0])
    with torch.no_grad():
        # model returns contacts if return_contacts=True
        out = model(
            tokens.to(model_device),
            repr_layers=[],          # no embeddings needed
            return_contacts=True,
        )
    contact_map = out["contacts"][0]  # shape [seq_len, seq_len] for this sequence
    # NOTE(review): n_tokens is the non-padding token count; if the contact
    # map is already limited to sequence positions this slice is a no-op --
    # confirm against the model's contact-head behaviour.
    return to_numpy(contact_map[:n_tokens, :n_tokens])


seen_targets = set()
seen_binders = set()

pair_rows = zip(
    PPint_interactions_df["ID1"],
    PPint_interactions_df["seq1"],
    PPint_interactions_df["ID2"],
    PPint_interactions_df["seq2"],
)
for target_id, seq_target, binder_id, seq_binder in tqdm(
    pair_rows, total=len(PPint_interactions_df), desc="Contact maps"
):
    # Same treatment for both chains of the pair; each ID is processed and
    # written at most once per run.
    for seq_id, sequence, out_dir, seen in (
        (target_id, seq_target, path_to_output_contacts_targets, seen_targets),
        (binder_id, seq_binder, path_to_output_contacts_binders, seen_binders),
    ):
        if seq_id in seen:
            continue
        np.save(os.path.join(out_dir, f"{seq_id}.npy"), predict_contacts(seq_id, sequence))
        seen.add(seq_id)

Contact maps: 100%|█████████████████████████████████████████████████████████████████████████████████████| 2472/2472 [03:48<00:00, 10.83it/s]


### Contact maps meta-analysis dataset

In [None]:
path_to_output_contacts_targets = "/work3/s232958/data/meta_analysis/targets_contacts"
path_to_output_contacts_binders = "/work3/s232958/data/meta_analysis/binders_contacts"

os.makedirs(path_to_output_contacts_targets, exist_ok=True)
os.makedirs(path_to_output_contacts_binders, exist_ok=True)

# Send tokens to wherever the model weights live instead of hard-coding "cuda".
model_device = next(model.parameters()).device


def to_numpy(x):
    """Return `x` as a NumPy array; torch tensors are detached and moved to CPU first."""
    try:
        return x.detach().cpu().numpy()
    except AttributeError:
        return np.asarray(x)


def predict_contacts(seq_id, sequence):
    """Return the ESM2 predicted contact map of one sequence.

    Args:
        seq_id: Label passed to the batch converter alongside the sequence.
        sequence: Sequence string to run through the model.

    Returns:
        Square NumPy array taken from the model's "contacts" output.
    """
    _, _, tokens = batch_converter([(seq_id, sequence)])
    n_tokens = int((tokens != alphabet.padding_idx).sum(1)[0])
    with torch.no_grad():
        # model returns contacts if return_contacts=True
        out = model(
            tokens.to(model_device),
            repr_layers=[],          # no embeddings needed
            return_contacts=True,
        )
    contact_map = out["contacts"][0]  # shape [seq_len, seq_len] for this sequence
    # NOTE(review): n_tokens is the non-padding token count; if the contact
    # map is already limited to sequence positions this slice is a no-op --
    # confirm against the model's contact-head behaviour.
    return to_numpy(contact_map[:n_tokens, :n_tokens])


seen_targets = set()
seen_binders = set()

pair_rows = zip(
    meta_interactions_df["target_id"],
    meta_interactions_df["seq_target"],
    meta_interactions_df["binder_id"],
    meta_interactions_df["seq_binder"],
)
for target_id, seq_target, binder_id, seq_binder in tqdm(
    pair_rows, total=len(meta_interactions_df), desc="Contact maps"
):
    # Same treatment for both chains of the pair; each ID is processed and
    # written at most once per run (many rows share the same target).
    for seq_id, sequence, out_dir, seen in (
        (target_id, seq_target, path_to_output_contacts_targets, seen_targets),
        (binder_id, seq_binder, path_to_output_contacts_binders, seen_binders),
    ):
        if seq_id in seen:
            continue
        np.save(os.path.join(out_dir, f"{seq_id}.npy"), predict_contacts(seq_id, sequence))
        seen.add(seq_id)

### Plotting ESM2 embeddings: meta-analysis binders vs. PPint_DB binders

In [None]:
# Number of binder embeddings drawn from each dataset for the comparison plot.
N_EMBEDDINGS_PER_SET = 1000

# Dedicated, seeded RNG: the draw is reproducible and does not depend on how
# much of the global `random` state earlier cells have consumed.
embedding_sampler = random.Random(0)

PPint_encodings_path = "/work3/s232958/data/PPint_DB/binders_embeddings_esm2"
# os.listdir returns entries in arbitrary order, so sort before shuffling;
# otherwise even a seeded shuffle could select different files between runs.
PPint_DB_encodings_files = sorted(os.listdir(PPint_encodings_path))
embedding_sampler.shuffle(PPint_DB_encodings_files)
PPint_DB_embedings = [
    np.load(os.path.join(PPint_encodings_path, f))
    for f in PPint_DB_encodings_files[:N_EMBEDDINGS_PER_SET]
]

meta_encodings_path = "/work3/s232958/data/meta_analysis/binders_embeddings_esm2"
meta_binders_embedding_files = sorted(os.listdir(meta_encodings_path))
embedding_sampler.shuffle(meta_binders_embedding_files)
meta_binders_embedings = [
    np.load(os.path.join(meta_encodings_path, f))
    for f in meta_binders_embedding_files[:N_EMBEDDINGS_PER_SET]
]

In [None]:
# Both groups must contribute the same number of embeddings; this fails if
# either directory held fewer files than the slice size used when loading.
assert len(PPint_DB_embedings) == len(meta_binders_embedings)

In [None]:
# 1) Pool per-sample (mean over the first axis of each saved array) -> (Ni, hidden_dim)
# NOTE(review): the saved arrays presumably include the special start/end
# token rows, which would then be part of this mean -- confirm that is intended.
pooled_PPintDB_binders = np.stack([arr.mean(axis=0) for arr in PPint_DB_embedings], axis=0)   # (N1, hidden_dim)
pooled_meta_binders = np.stack([arr.mean(axis=0) for arr in meta_binders_embedings], axis=0)  # (N2, hidden_dim)

# 2) Standardize jointly (important!) so both groups share one feature scaling
X_all = np.vstack([pooled_PPintDB_binders, pooled_meta_binders])
X_all_std = StandardScaler().fit_transform(X_all)

# 3) PCA on combined
pca = PCA(n_components=2, random_state=0)
Z_all = pca.fit_transform(X_all_std)  # (N1+N2, 2)

# 4) Split back by counts
N1 = pooled_PPintDB_binders.shape[0]
N2 = pooled_meta_binders.shape[0]

Z_PPintDB_binders = Z_all[:N1]
Z_meta_binders = Z_all[N1:N1 + N2]

# 5) Plot
# Read the embedding width from the data so the title cannot drift from the
# model actually used (the loaded ESM2 checkpoint is 1280-d, not 1152-d).
embedding_dim = X_all.shape[1]

fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(Z_PPintDB_binders[:, 0], Z_PPintDB_binders[:, 1], s=18, alpha=0.5, label="PPintDB_binders", marker='o')
ax.scatter(Z_meta_binders[:, 0], Z_meta_binders[:, 1], s=18, alpha=0.5, label="Meta binders", marker='o')

ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)")
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)")
ax.set_title(f"PCA of pooled {embedding_dim}-d embeddings")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Fraction of the total variance captured by each of the two fitted components
# (the same values shown as percentages in the axis labels of the plot).
pca.explained_variance_ratio_