In [78]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from itertools import combinations
from Levenshtein import distance
from scipy.cluster.hierarchy import fcluster, linkage


# Making clusters: extracting the heavy + light chain sequences

In [79]:
# Load the paraplume dataset table; later cells read its 'pdb', 'Hchain' and 'Lchain' columns.
paraplume = pd.read_csv("/home/athenes/paratope_model/datasets/paraplume/paraplume.csv")


In [67]:
from typing import Optional

import pandas as pd
from biopandas.pdb import PandasPdb

def read_pdb_to_dataframe(
    pdb_path: Optional[str] = None,
) -> pd.DataFrame:
    """
    Read a PDB file and return its non-hydrogen ATOM records as a Pandas DataFrame.

    Args:
        pdb_path (str, optional): Path to a local PDB file to read. Defaults to None.
            NOTE(review): the value is passed straight to ``PandasPdb().read_pdb``;
            presumably a real path is always required -- confirm.

    Returns:
        pd.DataFrame: The ATOM records whose ``element_symbol`` is not 'H', one row
        per atom, with an extra ``IMGT`` column holding the residue number
        concatenated with the insertion code (both as strings).
    """
    # .copy() makes the filtered frame independent of the parsed PDB frame, so that
    # adding the "IMGT" column below does not trigger pandas' SettingWithCopyWarning.
    atomic_df = (
        PandasPdb()
        .read_pdb(pdb_path)
        .df["ATOM"]
        .query("element_symbol!='H'")
        .copy()
    )
    atomic_df["IMGT"] = (
        atomic_df["residue_number"].astype(str) + atomic_df["insertion"].astype(str)
    )
    return atomic_df
# Three-letter residue names mapped to their one-letter amino-acid codes
# (the 20 standard amino acids, ordered alphabetically by one-letter code).
amino_acid_dict = dict(
    ALA="A", CYS="C", ASP="D", GLU="E", PHE="F",
    GLY="G", HIS="H", ILE="I", LYS="K", LEU="L",
    MET="M", ASN="N", PRO="P", GLN="Q", ARG="R",
    SER="S", THR="T", VAL="V", TRP="W", TYR="Y",
)


In [68]:
# Build one concatenated heavy+light sequence per structure listed in `paraplume`
# and save the result as a CSV with columns 'pdb' and 'sequences'.
records=[]
for pdb,heavy,light in tqdm(paraplume[['pdb','Hchain','Lchain']].values):

    hl_chains=[heavy,light]
    pdb_path = f"/home/athenes/all_structures/imgt/{pdb}.pdb"
    if not Path(pdb_path).exists():
        # No structure file for this entry: print its id and skip it.
        print(pdb)
        continue
    pdb_df = (
        read_pdb_to_dataframe(pdb_path)
        # One CA atom per residue is enough to read off the residue name.
        .query("chain_id in @hl_chains and atom_name=='CA'")
        # A residue with alternate locations has several CA rows; keep only the
        # first so the residue is not repeated in the sequence.
        .drop_duplicates(subset=["chain_id", "residue_number", "insertion"])
    )
    # Heavy chain: residues numbered below 129; light chain: below 128.
    triplets_heavy= pdb_df.query("chain_id==@heavy and residue_number<129")["residue_name"].tolist()
    sequence_heavy = "".join([amino_acid_dict[each] for each in triplets_heavy])
    triplets_light= pdb_df.query("chain_id==@light and residue_number<128")["residue_name"].tolist()
    sequence_light = "".join([amino_acid_dict[each] for each in triplets_light])
    records.append({"pdb":pdb,"sequences":sequence_heavy+sequence_light})
df = pd.DataFrame.from_records(records)
df.to_csv("/home/athenes/paratope_model/datasets/paraplume/paraplume_sequences.csv")


100%|██████████| 3157/3157 [05:31<00:00,  9.53it/s]


In [82]:
# Reload the per-structure sequences. The file was written with its DataFrame index
# as the first (unnamed) column, so read that column back as the index instead of
# letting it show up as a spurious "Unnamed: 0" data column.
df = pd.read_csv(
    "/home/athenes/paratope_model/datasets/paraplume/paraplume_sequences.csv",
    index_col=0,
)


In [83]:
# Sanity check: distinct lengths of the concatenated heavy+light sequences.
sequence_lengths = df["sequences"].str.len()
print(sequence_lengths.unique())


[224 223 230 231 226 220 225 229 222 219 217 237 232 228 227 234 221 235
 236 218 233 214 239 242 216 244 213 240 241 238 163 243 249 215 212 189
 245 277 247 248 198 194 209 211 191 206 207 210 188 202 192 246 276 208
 106 109]


# Making the train, validation and test sets

In [70]:
# Load the three benchmark test sets (MIPE, Paragraph, PECAN).
mipe_test = pd.read_csv("/home/athenes/paratope_model/datasets/mipe/test_set.csv")
paragraph_test = pd.read_csv("/home/athenes/paratope_model/datasets/paragraph/test_set.csv")
pecan_test = pd.read_csv("/home/athenes/paratope_model/datasets/pecan/test_set.csv")

# Distinct PDB ids of each benchmark, and of all three together.
mipe_pdbs = {*mipe_test["pdb"].unique()}
paragraph_pdbs = {*paragraph_test["pdb"].unique()}
pecan_pdbs = {*pecan_test["pdb"].unique()}
testset_pdbs = mipe_pdbs | pecan_pdbs | paragraph_pdbs


In [71]:
def exclude_from_trainset(pdb_set, threshold=0.05):
    """
    List the PDB ids in the global ``df`` whose sequence is near-identical to a
    sequence of any PDB in ``pdb_set``.

    Two sequences count as near-identical when their Levenshtein distance divided
    by the mean of their lengths is strictly below ``threshold``.

    Args:
        pdb_set: Collection of PDB ids (e.g. a test set) to compare against. Only
            ids that are present in ``df`` contribute a reference sequence.
        threshold (float): Normalised-distance cutoff. Defaults to 0.05.

    Returns:
        list: Unique PDB ids from ``df`` to exclude from training, in no particular
        order.
    """
    # The reference sequences do not depend on the outer loop: select them once
    # instead of re-running the query for every row of `df`.
    reference_sequences = df.query("pdb in @pdb_set")["sequences"].tolist()
    exclude_train_pdbs = set()
    for pdb1, seq1 in tqdm(df[["pdb","sequences"]].values):
        for seq2 in reference_sequences:
            if distance(seq1, seq2) / (np.mean([len(seq1), len(seq2)])) < threshold:
                exclude_train_pdbs.add(pdb1)
                # One match is enough to exclude pdb1; skip the remaining comparisons.
                break
    return list(exclude_train_pdbs)


In [72]:
exclude_mipe = exclude_from_trainset(mipe_pdbs)
exclude_paragraph = exclude_from_trainset(paragraph_pdbs)
exclude_pecan = exclude_from_trainset(pecan_pdbs)


100%|██████████| 3157/3157 [00:04<00:00, 647.14it/s]
100%|██████████| 3157/3157 [00:07<00:00, 427.80it/s]
100%|██████████| 3157/3157 [00:05<00:00, 539.53it/s]


In [73]:
# Everything that must stay out of train/val: the PDBs whose sequence is close to a
# test-set sequence, plus the test-set PDB ids themselves. The latter matters for
# test-set entries that have no extracted sequence (e.g. missing structure file):
# they are never compared by exclude_from_trainset and would otherwise leak into
# the training data.
exclude_everything = set.union(
    set(exclude_paragraph),
    set(exclude_pecan),
    set(exclude_mipe),
    testset_pdbs,
)


In [74]:
# Rows whose PDB id is in the exclusion set form the test split; the rest is
# divided 90% / 10% into train and validation with a fixed seed.
is_excluded = paraplume["pdb"].isin(exclude_everything)
test_paraplume_everything = paraplume[is_excluded]
train_val_paraplume_everything = paraplume[~is_excluded]

train_paraplume_everything = train_val_paraplume_everything.sample(
    frac=0.9,
    random_state=42,
)
# Validation = whatever was not sampled into train.
val_paraplume_everything = train_val_paraplume_everything.drop(
    train_paraplume_everything.index
)


In [75]:
# Save the training split (the 90% sample of the non-excluded rows) as CSV.
train_paraplume_everything.to_csv("/home/athenes/paratope_model/datasets/paraplume/train.csv")


In [76]:
# Save the validation split (the non-excluded rows not sampled into train) as CSV.
val_paraplume_everything.to_csv("/home/athenes/paratope_model/datasets/paraplume/val.csv")


In [77]:
# Save the test split (the rows whose PDB id is in the exclusion set) as CSV.
test_paraplume_everything.to_csv("/home/athenes/paratope_model/datasets/paraplume/test.csv")
