In [63]:
import sys, os
print("Kernel Python:", sys.executable)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import trange
import re
from collections import defaultdict

import torch
print("PyTorch:", torch.__version__)

from tqdm import tqdm
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print("Current location:", os.getcwd())

import torch
print(torch.version.cuda)   # shows the CUDA version PyTorch was built against
print(torch.backends.cudnn.version())  # cuDNN version
print(torch.cuda.is_available())

import random

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import training_utils.partitioning_utils as pat_utils
import ast

Kernel Python: /work3/s232958/envs/esm_gpu/bin/python
PyTorch: 2.5.1
Using device: cuda
Current location: /zhome/c9/0/203261/DBL046_PP_osaul/DBL046_PP_osaul/tmp/ona_drafts
12.1
90100
True


In [29]:
def print_mem_consumption():
    # 1. Total memory available on the GPU (device 0)
    t = torch.cuda.get_device_properties(0).total_memory
    # 2. How much memory PyTorch has *reserved* from CUDA
    r = torch.cuda.memory_reserved(0)
    # 3. How much of that reserved memory is actually *used* by tensors
    a = torch.cuda.memory_allocated(0)
    # 4. Reserved but not currently allocated (so “free inside PyTorch’s pool”)
    f = r - a

    print("Total memory: ", t/1e9)      # total VRAM in GB
    print("Reserved memory: ", r/1e9)   # PyTorch’s reserved pool in GB
    print("Allocated memory: ", a//1e9) # actually in use (integer division)
    print("Free memory: ", f/1e9)       # slack in the reserved pool in GB
print_mem_consumption()

Total memory:  34.072559616
Reserved memory:  0.0
Allocated memory:  0.0
Free memory:  0.0


### Loading PPint data

In [30]:
### Laodinf dataframe
PPint_interactions = pd.read_csv("/work3/s232958/data/PPint_DB/disordered_interfaces_no_cutoff_filtered_nonredundant80_3å_5.csv.gz",index_col=0).reset_index(drop=True)

### Adding ID column
ids = []
for idx, row in PPint_interactions.iterrows():
    ids.append(f"{row.PDB_interface_name}_{row.chainname}")
PPint_interactions["ID"] = ids

PPint_interactions.head(2)

Unnamed: 0,PDB,interface_index,chainname,sequence,interface_residues,disorderscore_predictions,mean disorder interface,seq. len,PDB_interface_name,protien_interface_sequences,interacting_seq_aa,inter_chain_hamming,ID
0,6NZA,0,A,MNTVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIH...,"[184, 185, 186, 187, 188, 191, 192, 194, 198, ...","[0.7864, 0.7974, 0.7796, 0.7675, 0.7414, 0.720...",0.038646,461,6NZA_0,MNTVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIH...,GRTHLATLQESGWAMHYAVTGLNFEDVQGAGASMKNDMVNGASGNF...,0.995662,6NZA_0_A
1,6NZA,0,B,TVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIHAL...,"[182, 183, 184, 185, 186, 190, 192, 196, 197, ...","[0.6701, 0.7181, 0.6893, 0.671, 0.6561, 0.6428...",0.037456,459,6NZA_0,MNTVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIH...,GRTHLATLQESGWAMHYAVTGLNFEDVQGAGASMKNDMVNGASGNF...,0.995662,6NZA_0_B


You are labeling an interaction as a dimer (likely homodimer) when the two chains are similar (**Hamming similarity > 0.60**). 

- That means `dimer = True` → likely same/similar chains; 

- `dimer = False` → likely a complex/heterodimer (different proteins)

#### Hamming similarity > 0.6 (arbitrary number) - we have homo-dimer (homo-interaction)

In [31]:
# make an explicit copy to avoid chained assignment issues
PPint_interactions_long = PPint_interactions[
    ["ID", "PDB_interface_name", "chainname", "sequence", "inter_chain_hamming"]
].copy()

# vectorized boolean (much faster than iterrows)
PPint_interactions_long["dimer"] = PPint_interactions_long["inter_chain_hamming"].gt(0.60)
PPint_interactions_long = PPint_interactions_long.drop(columns = ["inter_chain_hamming", "chainname"])
# If NaNs should be treated as False, do:
# PPint_interactions_long["dimer"] = PPint_interactions_long["inter_chain_hamming"].fillna(0).gt(0.60)

# your uniqueness check
assert PPint_interactions_long["ID"].nunique() == len(PPint_interactions_long)
### --> it means that ID col is unique per chain

In [32]:
PPint_interactions_long.head()

Unnamed: 0,ID,PDB_interface_name,sequence,dimer
0,6NZA_0_A,6NZA_0,MNTVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIH...,True
1,6NZA_0_B,6NZA_0,TVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIHAL...,True
2,9JKA_1_B,9JKA_1,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,True
3,9JKA_1_C,9JKA_1,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,True
4,8DQ6_1_B,8DQ6_1,PTLNLFTNIPVDAVTCSDILKDATKAVAKIIGKPESYVMILLNSGV...,True


In [33]:
grouped = {}
for _, row in PPint_interactions_long.iterrows():
    iface = row["PDB_interface_name"]
    seq = row["sequence"]
    rid = row["ID"]
    dimer = row["dimer"]
    
    if iface not in grouped:
        grouped[iface] = {
            "sequences": [],
            "IDs": [],
            "dimer": dimer,        # keep the dimer value for this interface
        }
    else:
        # Optional: sanity-check it's consistent per interface
        if grouped[iface]["dimer"] != dimer:
            print(f"Warning: multiple dimers for interface {iface}:",
                  grouped[iface]['dimer'], "vs", dimer)

    grouped[iface]["sequences"].append(seq)
    grouped[iface]["IDs"].append(rid)

records = []
for iface, vals in grouped.items():
    seqs = vals["sequences"]
    ids = vals["IDs"]
    if len(seqs) >= 2 and len(ids) >= 2:
        records.append({
            "interface_id": iface,
            "seq1": seqs[0],
            "seq2": seqs[1],
            "ID1": ids[0],
            "ID2": ids[1],
            "dimer": vals["dimer"],   # <- add dimer to final record
        })

PPint_interactions_NEW = pd.DataFrame(records)
PPint_interactions_NEW["seq_target_len"] = [len(row.seq1) for __, row in PPint_interactions_NEW.iterrows()]
PPint_interactions_NEW["seq_binder_len"] = [len(row.seq2) for __, row in PPint_interactions_NEW.iterrows()]
PPint_interactions_NEW["target_binder_id"] = PPint_interactions_NEW["ID1"] + "_" + PPint_interactions_NEW["ID2"]

PPint_interactions_NEW.head()

Unnamed: 0,interface_id,seq1,seq2,ID1,ID2,dimer,seq_target_len,seq_binder_len,target_binder_id
0,6NZA_0,MNTVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIH...,TVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIHAL...,6NZA_0_A,6NZA_0_B,True,461,459,6NZA_0_A_6NZA_0_B
1,9JKA_1,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,9JKA_1_B,9JKA_1_C,True,362,362,9JKA_1_B_9JKA_1_C
2,8DQ6_1,PTLNLFTNIPVDAVTCSDILKDATKAVAKIIGKPESYVMILLNSGV...,PTLNLFTNIPVDAVTCSDILKDATKAVAKIIGKPESYVMILLNSGV...,8DQ6_1_B,8DQ6_1_C,True,109,97,8DQ6_1_B_8DQ6_1_C
3,2YMZ_0,ARMFEMFNLDWKSGGTMKIKGHISEDAESFAINLGCKSSDLALHFN...,ARMFEMFNLDWKSGGTMKIKGHISEDAESFAINLGCKSSDLALHFN...,2YMZ_0_A,2YMZ_0_B,True,130,130,2YMZ_0_A_2YMZ_0_B
4,6IDB_0,DKICLGHHAVSNGTKVNTLTERGVEVVNATETVERTNIPRICSKGK...,GLFGAIAGFIENGWEGLIDGWYGFRHQNAQGEGTAADYKSTQSAID...,6IDB_0_A,6IDB_0_B,False,317,172,6IDB_0_A_6IDB_0_B


In [34]:
assert 2*len(PPint_interactions_NEW) == len(PPint_interactions_long)
### --> it means that all the sequences were paired

In [35]:
PPint_interactions_NEW.to_csv("/work3/s232958/data/PPint_DB/PPint_interactions_filetered.csv", index=False)

In [36]:
PPint_interactions_NEW["dimer"].value_counts()

dimer
True     16269
False     8456
Name: count, dtype: int64

### From df to .fasta file

In [37]:
PPint_interactions_long.head()

Unnamed: 0,ID,PDB_interface_name,sequence,dimer
0,6NZA_0_A,6NZA_0,MNTVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIH...,True
1,6NZA_0_B,6NZA_0,TVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIHAL...,True
2,9JKA_1_B,9JKA_1,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,True
3,9JKA_1_C,9JKA_1,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,True
4,8DQ6_1_B,8DQ6_1,PTLNLFTNIPVDAVTCSDILKDATKAVAKIIGKPESYVMILLNSGV...,True


In [38]:
records = {}
for _, row in PPint_interactions_long.iterrows():
    records[row.ID] = row.sequence

with open("all_sequences.fasta", "w") as f:
    for header, seq in records.items():
        f.write(f">{header}\n")
        f.write(f"{seq}\n")

### MMSeq2 clustering

In command line:

```mkdir -p clusterings/0.4_0.8/tmp```
```python
  mmseqs easy-cluster all_sequences.fasta clusterings/0.4_0.8/0.4_0.8 clusterings/0.4_0.8/tmp \
  --min-seq-id 0.4 \
  -c 0.8 \
  --cov-mode 3 \
  -s 7.5
  ```

In [44]:
def mmseqs_parser(tsv_file = None, fasta_file = None):
    """ From POMC_utils - function to parse mmseqs files"""
    all_seqs = {}
    cur_id = ''
    cur_seq = ''
    clust = {}
    clust_keys=[]
    ## parsing the sequence file - it has a fasta-like format with a few exceptions (double headers lines)
    try:
        with open(fasta_file) as file:
            for line in file:
                m = re.search(">(.+)", line.rstrip(),re.IGNORECASE)
                if m:
                    if cur_seq!='':
                        all_seqs[cur_id] = cur_seq
                        cur_seq = ''
                    cur_id = m.groups()[0]
                        
                else:
                    cur_seq+=line.rstrip()
            all_seqs[cur_id] = cur_seq
            #print(all_seqs)
        print(f"File {fasta_file} successfuly processed!")
    except IOError:
        print(f"Error: File {fasta_file} does not appear to exist.")
   
    ## parsing the cluster file
    try:
        with open(tsv_file) as file:
            for line in file:
                m = re.search("(.+)\t(.+)", line.rstrip(),re.IGNORECASE)
                if m:
                    clust.setdefault(m.groups()[0], [])
                    clust[m.groups()[0]].append( m.groups()[1])
        print(f"File {tsv_file} successfuly processed!")
    except IOError:
        print(f"Error: File {tsv_file} does not appear to exist.") 
    return all_seqs, clust
tsv_file="clusterings/0.4_0.8/0.4_0.8_cluster.tsv"
fasta_file = "clusterings/0.4_0.8/0.4_0.8_rep_seq.fasta"
all_seqs, clust = mmseqs_parser(fasta_file=fasta_file, tsv_file = tsv_file)

File clusterings/0.4_0.8/0.4_0.8_rep_seq.fasta successfuly processed!
File clusterings/0.4_0.8/0.4_0.8_cluster.tsv successfuly processed!


In [45]:
# def mmseqs_parser(file_prefix):
#     """ From POMC_utils - function to parse mmseqs files"""

#     """
#     all_seq = {sequence_id: amino_acid_sequence}
#     clust = {cluster_rep_id: [member_id_1, member_id_2, ...]}
#     clust_keys = {member_id: cluster_rep_id} chain → cluster key
#     """

#     all_seqs = {}
#     cur_id = ''
#     cur_seq = ''
#     clust = {}
#     clust_keys={}
#     ## parsing the sequence file - it has a fasta-like format with a few exceptions (double headers lines)
#     try:
#         with open(file_prefix+"_all_seqs.fasta") as file:
#             for line in file:
#                 m = re.search(">(.+)", line.rstrip(),re.IGNORECASE)
#                 if m:
#                     if cur_seq!='':
#                         all_seqs[cur_id] = cur_seq
#                         cur_seq = ''
#                     cur_id = m.groups()[0]
                        
#                 else:
#                     cur_seq+=line.rstrip()
#             all_seqs[cur_id] = cur_seq
#             #print(all_seqs)
#     except IOError:
#         print( "Error: File does not appear to exist.")
   
#     ## parsing the cluster file
#     try:
#         with open(file_prefix+"_cluster.tsv") as file:
#             for line in file:
#                 m = re.search("(.+)\t(.+)", line.rstrip(),re.IGNORECASE)
#                 if m:
#                     clust.setdefault(m.groups()[0], [])
#                     clust[m.groups()[0]].append( m.groups()[1] ) 
#                     clust_keys[m.groups()[1]] = m.groups()[0]
#     except IOError:
#         print( "Error: File does not appear to exist.") 
#     return all_seqs, clust, clust_keys

# path_to_mmseqs_clustering = "/work3/s232958/data/PPint_DB/3_å_dataset5_singlefasta/clusterRes40"
# all_seqs, clust, clust_keys = mmseqs_parser(path_to_mmseqs_clustering)

In [46]:
cluster_sizes = [len(v) for v in clust.values()]
unique_clusters = sum(1 for s in cluster_sizes if s == 1)
redundant_clusters = sum(1 for s in cluster_sizes if s > 1)

print(f"Total sequences: {sum(cluster_sizes)}")
print(f"Total clusters: {len(cluster_sizes)}")
print(f"Unique clusters (size 1): {unique_clusters}")
print(f"Redundant clusters (size >1): {redundant_clusters}")
print(f"Mean cluster size: {np.mean(cluster_sizes):.2f}")

Total sequences: 49450
Total clusters: 16203
Unique clusters (size 1): 4337
Redundant clusters (size >1): 11866
Mean cluster size: 3.05


In [47]:
list(clust.items())[:20]

[('9B62_3_E', ['9B62_3_E', '9B62_2_E']),
 ('6YIP_0_B', ['6YIP_0_B', '6YIP_0_A']),
 ('3GXU_0_B',
  ['3GXU_0_B',
   '4UF7_0_C',
   '6P7Y_0_B',
   '2WO2_0_B',
   '2VSM_0_B',
   '6PDL_0_B',
   '2HLE_0_B']),
 ('4QXZ_0_B', ['4QXZ_0_B', '4QXZ_0_A']),
 ('12AS_0_B', ['12AS_0_B', '12AS_0_A']),
 ('3TSA_0_B', ['3TSA_0_B', '3TSA_0_A']),
 ('3N0T_0_B', ['3N0T_0_B', '3N0T_0_A']),
 ('2ED6_1_C', ['2ED6_1_C', '2ED6_1_B']),
 ('1I36_0_B', ['1I36_0_B', '1I36_0_A']),
 ('8DCL_0_A', ['8DCL_0_A', '8DCL_0_B', '7ANG_0_A', '7ANG_0_D']),
 ('1TLJ_0_B', ['1TLJ_0_B', '1TLJ_0_A']),
 ('3HHW_0_A', ['3HHW_0_A']),
 ('3HHW_0_K', ['3HHW_0_K', '3HHW_6_L', '3HHW_6_M']),
 ('2RG7_0_C', ['2RG7_0_C', '2RG7_0_A']),
 ('6THL_0_A', ['6THL_0_A']),
 ('6THL_0_B', ['6THL_0_B']),
 ('5JP2_0_E', ['5JP2_0_E']),
 ('5JP2_0_A', ['5JP2_0_A']),
 ('4Y7S_1_C', ['4Y7S_1_C', '4Y7S_1_B']),
 ('3MGJ_0_B', ['3MGJ_0_B', '3MGJ_0_A'])]

In [48]:
clustID_sedIDs = {}

for idx, (ikey, ivalues) in enumerate(clust.items()):
    # Check if this cluster shares entries with an existing one
    ivalues = set([value.split("_")[0]+"_"+value.split("_")[1] for value in ivalues])
    merged = False
    for existing_key, existing_values in clustID_sedIDs.items():
        existing_values = set([value.split("_")[0]+"_"+value.split("_")[1] for value in existing_values])
        if ivalues.intersection(existing_values):
            clustID_sedIDs[existing_key].update(ivalues)
            merged = True
            break
    
    # If not merged, create a new entry
    if not merged:
        clustID_sedIDs[idx] = ivalues

# Convert sets back to lists
for k in clustID_sedIDs:
    clustID_sedIDs[k] = list(clustID_sedIDs[k])

list(clustID_sedIDs.items())[:20]

[(0,
  ['1BYU_0',
   '7MO0_0',
   '5AEK_0',
   '3UIO_1',
   '5D2M_1',
   '5EQL_0',
   '2G4D_0',
   '9B0Z_1',
   '7MNX_0',
   '3QHT_0',
   '7R2E_0',
   '5CLL_0',
   '4HAX_1',
   '9B62_2',
   '7MNY_0',
   '7MNZ_0',
   '6UYX_0',
   '5JP1_0',
   '5TVQ_0',
   '2PE6_0',
   '9G8I_0',
   '3UIP_1',
   '7P99_0',
   '9B62_3',
   '3A9K_0',
   '1WYW_0',
   '2D07_0',
   '1EUV_0',
   '2EKE_0',
   '3GJ3_0']),
 (1, ['6YIP_0']),
 (2,
  ['6P7Y_0',
   '7CZE_1',
   '2BBA_0',
   '2X9M_0',
   '7SYY_0',
   '4W4Z_0',
   '3CZU_0',
   '7B7N_0',
   '8K0D_2',
   '3GXU_0',
   '8K0D_1',
   '6PDL_0',
   '6VY6_0',
   '2WO3_0',
   '2VSM_0',
   '2HLE_0',
   '6NK1_0',
   '4L0P_0',
   '2WO2_0',
   '6VY4_1',
   '4UF7_0',
   '2QBX_0']),
 (3, ['4QXZ_0']),
 (4, ['12AS_0']),
 (5, ['3TSA_0']),
 (6, ['3N0T_0']),
 (7, ['2ED6_1']),
 (8, ['1I36_0']),
 (9, ['8DCL_0', '7ANG_0']),
 (10, ['1TLJ_0']),
 (11, ['3HHW_0', '3HHW_6']),
 (13, ['2RG7_0']),
 (14, ['6THL_0']),
 (16, ['5JP2_0']),
 (18, ['4Y7S_1']),
 (19, ['3MGJ_0']),
 (20, ['5YYL_

In [49]:
sedID_clusterIndex = {}
for idx, (key, values) in enumerate(clustID_sedIDs.items()):
    for val in values:
        sedID_clusterIndex[val] = idx
list(sedID_clusterIndex.items())[:20]

[('1BYU_0', 0),
 ('7MO0_0', 0),
 ('5AEK_0', 6165),
 ('3UIO_1', 7493),
 ('5D2M_1', 0),
 ('5EQL_0', 82),
 ('2G4D_0', 6165),
 ('9B0Z_1', 82),
 ('7MNX_0', 0),
 ('3QHT_0', 82),
 ('7R2E_0', 0),
 ('5CLL_0', 0),
 ('4HAX_1', 0),
 ('9B62_2', 0),
 ('7MNY_0', 0),
 ('7MNZ_0', 0),
 ('6UYX_0', 3244),
 ('5JP1_0', 0),
 ('5TVQ_0', 7621),
 ('2PE6_0', 1281)]

In [50]:
PPint_interactions_NEW

Unnamed: 0,interface_id,seq1,seq2,ID1,ID2,dimer,seq_target_len,seq_binder_len,target_binder_id
0,6NZA_0,MNTVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIH...,TVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIHAL...,6NZA_0_A,6NZA_0_B,True,461,459,6NZA_0_A_6NZA_0_B
1,9JKA_1,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,9JKA_1_B,9JKA_1_C,True,362,362,9JKA_1_B_9JKA_1_C
2,8DQ6_1,PTLNLFTNIPVDAVTCSDILKDATKAVAKIIGKPESYVMILLNSGV...,PTLNLFTNIPVDAVTCSDILKDATKAVAKIIGKPESYVMILLNSGV...,8DQ6_1_B,8DQ6_1_C,True,109,97,8DQ6_1_B_8DQ6_1_C
3,2YMZ_0,ARMFEMFNLDWKSGGTMKIKGHISEDAESFAINLGCKSSDLALHFN...,ARMFEMFNLDWKSGGTMKIKGHISEDAESFAINLGCKSSDLALHFN...,2YMZ_0_A,2YMZ_0_B,True,130,130,2YMZ_0_A_2YMZ_0_B
4,6IDB_0,DKICLGHHAVSNGTKVNTLTERGVEVVNATETVERTNIPRICSKGK...,GLFGAIAGFIENGWEGLIDGWYGFRHQNAQGEGTAADYKSTQSAID...,6IDB_0_A,6IDB_0_B,False,317,172,6IDB_0_A_6IDB_0_B
...,...,...,...,...,...,...,...,...,...
24720,6O42_0,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPR...,QVQLVQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLE...,6O42_0_L,6O42_0_H,False,214,220,6O42_0_L_6O42_0_H
24721,2QIB_0,GVEERRQQLIGVALDLFSRRSPDEVSIDEIASAAGISRPLVYHYFP...,RRQQLIGVALDLFSRRSPDEVSIDEIASAAGISRPLVYHYFPGKLS...,2QIB_0_A,2QIB_0_B,True,217,204,2QIB_0_A_2QIB_0_B
24722,7OXG_0,MKVGQDKVVTIRYTLQVEGEVLDQGELSYLHGHRNLIPGLEEALEG...,TRYWNAKALPFAFG,7OXG_0_A,7OXG_0_C,False,99,14,7OXG_0_A_7OXG_0_C
24723,6UUM_0,EIVLTQSPVTLSLSSGETGTLSCRASQNISSSWIAWYQQRRGQVPR...,QVQLVQSGAEVRKPGSSVTISCKPVGGTFTNFAIHWVRQAPGQGLE...,6UUM_0_F,6UUM_0_H,False,215,227,6UUM_0_F_6UUM_0_H


In [51]:
clusters = [sedID_clusterIndex[row.interface_id] for idx, row in PPint_interactions_NEW.iterrows()]
PPint_interactions_NEW["cluster"] = clusters
PPint_interactions_NEW

Unnamed: 0,interface_id,seq1,seq2,ID1,ID2,dimer,seq_target_len,seq_binder_len,target_binder_id,cluster
0,6NZA_0,MNTVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIH...,TVRSEKDSMGAIDVPADKLWGAQTQRSLEHFRISTEKMPTSLIHAL...,6NZA_0_A,6NZA_0_B,True,461,459,6NZA_0_A_6NZA_0_B,2161
1,9JKA_1,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,VAAGATLALLSFLTPLAFLLLPPLLWREELEPCGTACEGLFISVAF...,9JKA_1_B,9JKA_1_C,True,362,362,9JKA_1_B_9JKA_1_C,11277
2,8DQ6_1,PTLNLFTNIPVDAVTCSDILKDATKAVAKIIGKPESYVMILLNSGV...,PTLNLFTNIPVDAVTCSDILKDATKAVAKIIGKPESYVMILLNSGV...,8DQ6_1_B,8DQ6_1_C,True,109,97,8DQ6_1_B_8DQ6_1_C,10817
3,2YMZ_0,ARMFEMFNLDWKSGGTMKIKGHISEDAESFAINLGCKSSDLALHFN...,ARMFEMFNLDWKSGGTMKIKGHISEDAESFAINLGCKSSDLALHFN...,2YMZ_0_A,2YMZ_0_B,True,130,130,2YMZ_0_A_2YMZ_0_B,8863
4,6IDB_0,DKICLGHHAVSNGTKVNTLTERGVEVVNATETVERTNIPRICSKGK...,GLFGAIAGFIENGWEGLIDGWYGFRHQNAQGEGTAADYKSTQSAID...,6IDB_0_A,6IDB_0_B,False,317,172,6IDB_0_A_6IDB_0_B,59
...,...,...,...,...,...,...,...,...,...,...
24720,6O42_0,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPR...,QVQLVQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLE...,6O42_0_L,6O42_0_H,False,214,220,6O42_0_L_6O42_0_H,82
24721,2QIB_0,GVEERRQQLIGVALDLFSRRSPDEVSIDEIASAAGISRPLVYHYFP...,RRQQLIGVALDLFSRRSPDEVSIDEIASAAGISRPLVYHYFPGKLS...,2QIB_0_A,2QIB_0_B,True,217,204,2QIB_0_A_2QIB_0_B,932
24722,7OXG_0,MKVGQDKVVTIRYTLQVEGEVLDQGELSYLHGHRNLIPGLEEALEG...,TRYWNAKALPFAFG,7OXG_0_A,7OXG_0_C,False,99,14,7OXG_0_A_7OXG_0_C,933
24723,6UUM_0,EIVLTQSPVTLSLSSGETGTLSCRASQNISSSWIAWYQQRRGQVPR...,QVQLVQSGAEVRKPGSSVTISCKPVGGTFTNFAIHWVRQAPGQGLE...,6UUM_0_F,6UUM_0_H,False,215,227,6UUM_0_F_6UUM_0_H,82


In [52]:
PPint_interactions_NEW.cluster.value_counts().head(20)

cluster
59      1109
82       951
401      218
626      129
90        84
311       59
935       51
1702      51
2329      50
7668      50
1956      48
706       45
653       43
3418      42
3727      42
852       41
537       39
1361      39
3107      39
2145      38
Name: count, dtype: int64

In [53]:
PPint_interactions_NEW_sample = PPint_interactions_NEW.sample(n=int(len(PPint_interactions_NEW) * 0.1), random_state=0) #sampling 10% of datapoints
PPint_interactions_NEW_sample

Unnamed: 0,interface_id,seq1,seq2,ID1,ID2,dimer,seq_target_len,seq_binder_len,target_binder_id,cluster
17393,1JEB_2,SLTKTERTIIVSMWAKISTQADTIGTETLERLFLSHPQTKTYFPHF...,VHLTDAEKAAVSGLWGKVNADEVGGEALGRLLVVYPWTQRYFDSFG...,1JEB_2_C,1JEB_2_D,False,141,146,1JEB_2_C_1JEB_2_D,852
15228,7B12_23,TTTLAFKFQHGVIAAVDSRASAGSYISALRVNKVIEINPYLLGTMS...,DRGVNTFSPEGRLFQVEYAIEAIKLGSTAIGIQTSEGVCLAVEKRI...,7B12_23_Z,7B12_23_s,False,203,230,7B12_23_Z_7B12_23_s,523
18220,6VCD_1,ITHLPPEVMLSIFSYLNPQELCRCSQVSMKWSQLTKTGSLWKHLYP...,PSIKLQSSDGEIFEVDVEIAKQSVTIKTMLEDLGDPVPLPNVNAAI...,6VCD_1_B,6VCD_1_C,False,255,135,6VCD_1_B_6VCD_1_C,2294
13480,2OKG_0,NAKDVLGLTLLEKTLKERLNLKDAIIVSGDSDQSPWVKKEGRAAVA...,AKDVLGLTLLEKTLKERLNLKDAIIVSGDSDQSPWVKKEGRAAVAC...,2OKG_0_A,2OKG_0_B,True,241,243,2OKG_0_A_2OKG_0_B,8029
7781,3MBX_0,DIVMSQSPSSLAVSVGEKVTMSCKSSQSLLYNNNQKNYLAWYQQKP...,VTLKESGPGILQPSQTLSLTCSFSGFSLSTYGMGVGWIRQPSGKGL...,3MBX_0_L,3MBX_0_H,False,220,229,3MBX_0_L_3MBX_0_H,82
...,...,...,...,...,...,...,...,...,...,...
8120,3HR4_0,REIPLKVLVKAVLFACMLMRKTMASRVRVTILFATETGKSEALAWD...,QLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAEL...,3HR4_0_A,3HR4_0_B,False,189,145,3HR4_0_A_3HR4_0_B,218
1963,6D3M_0,NKYRFIDVQPLTGVLGAEITGVDLREPLDDSTWNEILDAFHTYQVI...,KYRFIDVQPLTGVLGAEITGVDLREPLDDSTWNEILDAFHTYQVIY...,6D3M_0_A,6D3M_0_B,True,286,285,6D3M_0_A_6D3M_0_B,630
8507,4LUB_0,TMKIAYLGPSGSFTHNVALHAFPAADLLPFENITEVIKAYESKQVC...,TMKIAYLGPSGSFTHNVALHAFPAADLLPFENITEVIKAYESKQVC...,4LUB_0_A,4LUB_0_B,True,188,190,4LUB_0_A_4LUB_0_B,9344
9488,4MN4_2,ERDEVGARKNAVDEEIERLSQPGDQRLNALAERFGGVLLSEIYDDV...,EPVTIVLSQGWVRSAKGHDIDAPGLNYKAGDSFKAAVKGKSNQPVV...,4MN4_2_D,4MN4_2_B,False,154,236,4MN4_2_D_4MN4_2_B,1366


In [54]:
PPint_interactions_NEW_sample.cluster.value_counts().head(20)

cluster
82      105
59       89
401      24
626      13
90        8
311       8
3418      8
4651      7
1817      7
1956      7
7668      6
1992      6
537       5
653       5
3727      5
706       5
1361      5
7974      4
7071      4
5187      4
Name: count, dtype: int64

In [56]:
PPint_interactions_NEW_sample.to_csv("/work3/s232958/data/PPint_DB/PPint_sample_muClustering.csv", index=False)

### Comparing clustering

In [62]:
path_to_mmseqs_clustering = "/work3/s232958/data/PPint_DB/3_å_dataset5_singlefasta/clusterRes40"
all_seqs, clust, clust_keys = pat_utils.mmseqs_parser(path_to_mmseqs_clustering)

path_to_interaction_df = "/work3/s232958/data/PPint_DB/disordered_interfaces_no_cutoff_filtered_nonredundant80_3å_5.csv.gz"
disordered_interfaces_df = pd.read_csv(path_to_interaction_df,index_col=0).reset_index(drop=True)
disordered_interfaces_df["PDB_chain_name"] = (disordered_interfaces_df["PDB"] + "_" + disordered_interfaces_df["chainname"]).tolist()
disordered_interfaces_df["index_num"] = np.arange(len(disordered_interfaces_df))
disordered_interfaces_df["chain_name_index"] = [row["PDB_chain_name"] + "_" + str(row["index_num"]) for index, row in disordered_interfaces_df.iterrows()]
disordered_interfaces_df = disordered_interfaces_df.set_index("PDB_interface_name")
disordered_interfaces_df["interface_residues"] = disordered_interfaces_df["interface_residues"].apply(lambda x: ast.literal_eval(x))
# disordered_interfaces_df["inter_chain_hamming"] = [1 - (Ldistance(seq.split("-")[0], seq.split("-")[1]))/np.max([len(seq.split("-")[0]), len(seq.split("-")[1])]) for seq in disordered_interfaces_df["protien_interface_sequences"]]
disordered_interfaces_df["dimer"] = disordered_interfaces_df["inter_chain_hamming"] > 0.60
disordered_interfaces_df["clust_keys"] = [clust_keys.get(row["chain_name_index"]) for index, row in disordered_interfaces_df.iterrows()] 

pdb_interface_and_clust_keys = {index:disordered_interfaces_df.loc[index,"clust_keys"].values.tolist() for index in tqdm(disordered_interfaces_df.index.drop_duplicates(), total=len(disordered_interfaces_df)/2)}
new_clusters, new_clusters_clustkeys = pat_utils.recluster_mmseqs_keys_to_non_overlapping_groups(pdb_interface_and_clust_keys)

### Creating train and test datasets based on train and test-idexes
train_indexes, test_indexes = pat_utils.run_train_test_partition(interaction_df=disordered_interfaces_df,
                                                    clustering=new_clusters, # Clusters from Bidentate-graphs
                                                    train_ratio=0.8, 
                                                    test_ratio=0.2, 
                                                    v=True, 
                                                    seed=0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 24725/24725.0 [00:33<00:00, 732.22it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 27834/27834 [00:00<00:00, 846494.27it/s]


0.8
0.2


In [65]:
# sample random 10%
random.seed(0)
train_indexes_sample = random.sample(train_indexes, int(len(train_indexes) * 0.1))
test_indexes_sample = random.sample(test_indexes, int(len(test_indexes) * 0.1))

In [66]:
PPint_interactions_NEW_sample

Unnamed: 0,interface_id,seq1,seq2,ID1,ID2,dimer,seq_target_len,seq_binder_len,target_binder_id,cluster
17393,1JEB_2,SLTKTERTIIVSMWAKISTQADTIGTETLERLFLSHPQTKTYFPHF...,VHLTDAEKAAVSGLWGKVNADEVGGEALGRLLVVYPWTQRYFDSFG...,1JEB_2_C,1JEB_2_D,False,141,146,1JEB_2_C_1JEB_2_D,852
15228,7B12_23,TTTLAFKFQHGVIAAVDSRASAGSYISALRVNKVIEINPYLLGTMS...,DRGVNTFSPEGRLFQVEYAIEAIKLGSTAIGIQTSEGVCLAVEKRI...,7B12_23_Z,7B12_23_s,False,203,230,7B12_23_Z_7B12_23_s,523
18220,6VCD_1,ITHLPPEVMLSIFSYLNPQELCRCSQVSMKWSQLTKTGSLWKHLYP...,PSIKLQSSDGEIFEVDVEIAKQSVTIKTMLEDLGDPVPLPNVNAAI...,6VCD_1_B,6VCD_1_C,False,255,135,6VCD_1_B_6VCD_1_C,2294
13480,2OKG_0,NAKDVLGLTLLEKTLKERLNLKDAIIVSGDSDQSPWVKKEGRAAVA...,AKDVLGLTLLEKTLKERLNLKDAIIVSGDSDQSPWVKKEGRAAVAC...,2OKG_0_A,2OKG_0_B,True,241,243,2OKG_0_A_2OKG_0_B,8029
7781,3MBX_0,DIVMSQSPSSLAVSVGEKVTMSCKSSQSLLYNNNQKNYLAWYQQKP...,VTLKESGPGILQPSQTLSLTCSFSGFSLSTYGMGVGWIRQPSGKGL...,3MBX_0_L,3MBX_0_H,False,220,229,3MBX_0_L_3MBX_0_H,82
...,...,...,...,...,...,...,...,...,...,...
8120,3HR4_0,REIPLKVLVKAVLFACMLMRKTMASRVRVTILFATETGKSEALAWD...,QLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAEL...,3HR4_0_A,3HR4_0_B,False,189,145,3HR4_0_A_3HR4_0_B,218
1963,6D3M_0,NKYRFIDVQPLTGVLGAEITGVDLREPLDDSTWNEILDAFHTYQVI...,KYRFIDVQPLTGVLGAEITGVDLREPLDDSTWNEILDAFHTYQVIY...,6D3M_0_A,6D3M_0_B,True,286,285,6D3M_0_A_6D3M_0_B,630
8507,4LUB_0,TMKIAYLGPSGSFTHNVALHAFPAADLLPFENITEVIKAYESKQVC...,TMKIAYLGPSGSFTHNVALHAFPAADLLPFENITEVIKAYESKQVC...,4LUB_0_A,4LUB_0_B,True,188,190,4LUB_0_A_4LUB_0_B,9344
9488,4MN4_2,ERDEVGARKNAVDEEIERLSQPGDQRLNALAERFGGVLLSEIYDDV...,EPVTIVLSQGWVRSAKGHDIDAPGLNYKAGDSFKAAVKGKSNQPVV...,4MN4_2_D,4MN4_2_B,False,154,236,4MN4_2_D_4MN4_2_B,1366


In [68]:
clusters_counts_Dict = dict(PPint_interactions_NEW_sample.cluster.value_counts())
clusters_counts = list(clusters_counts_Dict.items())

random.seed(0)
random.shuffle(clusters_counts)

counter = 0
ceil = round(len(PPint_interactions_NEW_sample) * 0.2)
val_clusters = []

for (clust, num) in clusters_counts:
    
    if num + counter < ceil:
        counter += num
        val_clusters.append(clust)

    elif num + counter > ceil:
        continue

    elif num + counter == ceil:
        counter += num
        val_clusters.append(clust)
        break

counter = 0
for c in val_clusters:
    counter += clusters_counts_Dict[c]
print(counter)
print(ceil)

494
494


In [82]:
Df_val = PPint_interactions_NEW_sample[PPint_interactions_NEW_sample.cluster.isin(val_clusters)]
Df_train = PPint_interactions_NEW_sample.drop(Df_val.index)
val_interfaces = [row.interface_id for __, row in Df_val.iterrows()]

In [81]:
print(len(train_indexes), len(test_indexes))

19781 4944


In [83]:
print(len(val_interfaces))

494


In [84]:
for i in val_interfaces:
    if i in train_indexes and i in test_indexes:
        print(i, "in BOTH!")
    elif i in train_indexes:
        print(i, "in train!")
    elif i in test_indexes:
        print(i, "in test!")

6D40_0 in test!
5N11_0 in train!
6NB5_0 in train!
6CK9_1 in train!
2Z0G_1 in train!
4MZM_0 in train!
8XYT_0 in train!
4GHB_0 in train!
3T5X_0 in test!
4QRI_0 in train!
7V1X_3 in train!
3LQ6_0 in train!
8DWC_3 in train!
3KRZ_1 in train!
2YYR_0 in train!
4ZKJ_0 in train!
4GR2_0 in test!
2W81_0 in train!
2H32_1 in train!
6GG9_0 in test!
5GJ4_1 in train!
6BE3_0 in train!
1FIW_0 in train!
7W40_0 in train!
8VSR_0 in train!
6OL7_1 in train!
7E4D_0 in train!
8BDK_0 in train!
7PVX_0 in train!
2GAO_0 in train!
3RCZ_0 in train!
3FDI_0 in train!
9FY0_10 in train!
3HHW_6 in train!
2HL1_0 in train!
1T72_0 in test!
8IB1_1 in train!
4QNP_1 in train!
6JKU_2 in test!
6T3U_0 in test!
3VIE_2 in train!
3O1N_0 in test!
7AM0_0 in train!
3TWE_0 in train!
7LX0_4 in train!
1AUV_0 in train!
1BBR_2 in test!
2PNZ_0 in train!
2C0U_1 in train!
2X3D_4 in train!
6BZV_0 in train!
1HWU_1 in train!
1C4T_1 in train!
2H3N_2 in train!
7N4J_1 in train!
5OCX_1 in train!
8W8Q_0 in train!
2B0U_1 in train!
8G0P_0 in train!
2IG6_