In [1]:
from pathlib import Path
from plinder.data.pipeline import utils, tasks
from tqdm import tqdm
import networkx as nx
import json
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from plinder.data.utils import cluster
from dataclasses import dataclass, field

In [None]:
from rdkit import Chem

In [2]:
@dataclass
class TestCriteria:
    """Thresholds a holo system must satisfy to be flagged as high quality.

    ``max_*`` values are inclusive upper bounds and ``min_*`` values are
    inclusive lower bounds, as applied by ``get_high_quality_systems``.

    Every threshold carries a type annotation: ``@dataclass`` only turns
    *annotated* names into fields, so un-annotated ones would be plain class
    attributes that ``__init__`` cannot override and ``repr``/``==`` ignore.
    """

    # These seven were the only real dataclass fields originally; they stay
    # first and in their original order so positional construction is unchanged.
    max_entry_resolution: float = 3.5
    ligand_min_average_occupancy: float = 0.8
    ligand_min_average_rscc: float = 0.8
    ligand_max_average_rsr: float = 0.3
    pocket_min_average_occupancy: float = 0.8
    pocket_min_average_rscc: float = 0.8
    pocket_max_average_rsr: float = 0.3

    # ENTRY
    max_entry_r: float = 0.4
    max_entry_rfree: float = 0.45
    max_entry_r_minus_rfree: float = 0.05
    # LIGAND
    ligand_max_num_unresolved_heavy_atoms: int = 0
    ligand_max_alt_count: int = 1
    ligand_max_percent_outliers_clashes: float = 0
    # POCKET
    pocket_max_num_unresolved_heavy_atoms: int = 0
    pocket_max_alt_count: int = 1
    pocket_max_percent_outliers_clashes: float = 100


# Default thresholds; passed as `criteria` to get_high_quality_systems when the
# passes_quality column is computed.
quality_config = TestCriteria()

def get_high_quality_systems(
    row: pd.Series,
    criteria: "TestCriteria"
) -> bool:
    """Decide whether one annotation-table row describes a high-quality holo system.

    Parameters
    ----------
    row
        A row exposing ``system_type``, the ``entry_*`` metrics, the
        ``system_ligand_*`` / ``system_pocket_*`` metrics and
        ``system_num_covalent_ligands`` as attributes.
    criteria
        Object carrying the ``max_*`` / ``min_*`` thresholds as attributes.

    Returns
    -------
    bool
        ``True`` only if the system is ``"holo"``, every metric is present,
        and every metric is within its threshold. A missing metric
        (``None``, ``NaN`` or ``pd.NA``) yields ``False`` instead of raising
        on the comparison.
    """
    if row.system_type != "holo":
        return False
    # No validation data at all -> cannot be called high quality.
    if pd.isna(row.entry_r) or pd.isna(row.system_ligand_average_rscc):
        return False
    num_covalent = row.system_num_covalent_ligands
    if pd.isna(num_covalent):
        return False

    # (observed value, inclusive upper bound)
    upper_bounded = [
        # ENTRY
        (row.entry_resolution, criteria.max_entry_resolution),
        (row.entry_r, criteria.max_entry_r),
        (row.entry_rfree, criteria.max_entry_rfree),
        (row.entry_r_minus_rfree, criteria.max_entry_r_minus_rfree),
        # LIGAND
        # The unresolved-atom allowance is widened by one per covalent ligand.
        (
            row.system_ligand_num_unresolved_heavy_atoms,
            num_covalent + criteria.ligand_max_num_unresolved_heavy_atoms,
        ),
        # NOTE: max_alt_count is a misnomer - this counts number of total conformers!
        (row.system_ligand_max_alt_count, criteria.ligand_max_alt_count),
        (row.system_ligand_average_rsr, criteria.ligand_max_average_rsr),
        (row.system_ligand_percent_outliers_clashes, criteria.ligand_max_percent_outliers_clashes),
        # POCKET
        (row.system_pocket_num_unresolved_heavy_atoms, criteria.pocket_max_num_unresolved_heavy_atoms),
        (row.system_pocket_max_alt_count, criteria.pocket_max_alt_count),
        (row.system_pocket_average_rsr, criteria.pocket_max_average_rsr),
        (row.system_pocket_percent_outliers_clashes, criteria.pocket_max_percent_outliers_clashes),
    ]
    # (observed value, inclusive lower bound)
    lower_bounded = [
        (row.system_ligand_average_occupancy, criteria.ligand_min_average_occupancy),
        (row.system_ligand_average_rscc, criteria.ligand_min_average_rscc),
        (row.system_pocket_average_occupancy, criteria.pocket_min_average_occupancy),
        (row.system_pocket_average_rscc, criteria.pocket_min_average_rscc),
    ]

    # Comparing None / pd.NA would raise or give an ambiguous truth value, so
    # treat any missing metric as a failed check up front.
    if any(pd.isna(value) for value, _ in upper_bounded + lower_bounded):
        return False
    return all(value <= limit for value, limit in upper_bounded) and all(
        value >= limit for value, limit in lower_bounded
    )

In [3]:
# Raw ECOD domain table (tab-separated; the first four lines are skipped).
ecod_df = pd.read_csv("v0/dbs/ecod/ecod_raw.tsv", sep="\t", skiprows=4)
# One lookup per annotation column, keyed by ECOD domain id.
ecod_mapping = {
    column: dict(zip(ecod_df["ecod_domain_id"], ecod_df[column]))
    for column in ("t_id", "t_name")
}

In [4]:
# Annotation table plus three derived columns used throughout the notebook.
df = pd.read_parquet("v1/index/annotation_table.parquet")

# Number of covalent ligands per system, repeated on every row of that system.
covalent_per_system = df.groupby("system_id")["ligand_is_covalent"].transform("sum")
df["system_num_covalent_ligands"] = covalent_per_system

# Row-wise quality flag evaluated against the default thresholds.
df["passes_quality"] = df.apply(
    lambda system_row: get_high_quality_systems(system_row, criteria=quality_config),
    axis=1,
)

# ECOD t_id looked up from the pocket's ECOD domain id.
ecod_t_id_by_domain = ecod_mapping["t_id"]
df["system_pocket_ECOD_t_id"] = df["system_pocket_ECOD"].map(ecod_t_id_by_domain)

In [6]:
# MMP series table; the summary text reads its mms_unique_count, system_id and
# congeneric_id columns.
mms_df = pd.read_parquet("v1/mmp/plinder_mmp_series.parquet")

In [7]:
# Cluster assignment at threshold=100 for the protein_fident_qcov_weighted_sum metric.
clusters = pd.read_parquet("v1/clusters/metric=protein_fident_qcov_weighted_sum/directed=False/threshold=100.parquet")
# system_id -> component label (a later duplicate system_id overwrites an earlier one).
system_to_cluster = {
    system_id: component
    for system_id, component in zip(clusters["system_id"], clusters["component"])
}

In [8]:
# Attach the cluster component of each system as its receptor cluster.
df.loc[:, "receptor_cluster"] = df["system_id"].map(system_to_cluster)

# Holo systems with at most five interacting protein chains and five ligand chains.
holo_rows = df["system_type"] == "holo"
at_most_five_protein_chains = df["system_num_interacting_protein_chains"] <= 5
at_most_five_ligand_chains = df["system_num_ligand_chains"] <= 5
df_holo = df[holo_rows & at_most_five_protein_chains & at_most_five_ligand_chains]

In [9]:
# Minimum pocket_fident similarity for a holo system to count as linked.
threshold = 95

# Both score tables are read with the same projection and row filters.
pocket_link_query = dict(
    columns=["query_system", "target_system", "similarity"],
    filters=[
        ("similarity", ">=", threshold),
        ("metric", "==", "pocket_fident"),
    ],
)

apo = pd.read_parquet("v1/scores/search_db=apo/", **pocket_link_query)
pred = pd.read_parquet("v1/scores/search_db=pred/", **pocket_link_query)

In [57]:
# Builds the LaTeX summary paragraph from the loaded tables and prints it.
#
# The template is a *raw* f-string: `\plinder`, `\_` and `\ref` are LaTeX, and
# in a normal string `\p` / `\_` are invalid escape sequences while `\r` would
# become a carriage return. Literal braces are doubled (`{{` / `}}`).
#
# TODO: the "X chains from X PDB entries" apo counts below are still
# placeholders; no apo chain table is loaded in this notebook to fill them in.


def _n_systems(frame: pd.DataFrame) -> int:
    """Return the number of distinct ``system_id`` values in ``frame``."""
    return frame["system_id"].nunique()


def _count_with_share(count, total) -> str:
    """Format ``count`` followed by its percentage of ``total``, e.g. ``"12 (34.5%)"``."""
    return f"{count} ({100 * count / total:.1f}%)"


n_holo = _n_systems(df_holo)
n_ligands = df_holo.shape[0]  # one row per ligand
n_artifact = _n_systems(df[df["system_type"] == "artifact"])
n_ion = _n_systems(df[df["system_type"] == "ion"])

# Holo systems excluded from df_holo because of the five-chain limits.
holo_mask = df["system_type"] == "holo"
over_five_chains = (df["system_num_interacting_protein_chains"] > 5) | (df["system_num_ligand_chains"] > 5)
n_oversized = _n_systems(df[holo_mask & over_five_chains])

n_multi_ligand = _n_systems(df_holo[df_holo["system_num_ligand_chains"] > 1])
n_multi_chain = _n_systems(df_holo[df_holo["system_num_interacting_protein_chains"] > 1])
n_pass = _n_systems(df_holo[df_holo["passes_quality"]])
n_xray = _n_systems(df_holo[df_holo["entry_determination_method"] == "X-RAY DIFFRACTION"])

# Series with at least three distinct ligands.
mms_series = mms_df[mms_df["mms_unique_count"] > 2]
n_invalid = df_holo[df_holo["ligand_is_invalid"]].shape[0]

text = rf"""At the time of writing of this article, \plinder contains {_n_systems(df)} PLI systems extracted 
from {df['entry_pdb_id'].nunique()} PDB entries of which 
{n_holo} are holo systems (with the remaining consisting of {n_artifact} artifact systems, 
{n_ion} ion systems, and 
{n_oversized} systems containing more than five protein and or ligand chains). 
Within the holo systems, {100 * n_multi_ligand / n_holo:.1f}% have more than one ligand, 
{100 * n_multi_chain / n_holo:.1f}% have more than one interacting protein chain, 
and {n_pass} ({100 * n_pass / n_xray:.1f}% of systems determined by X-ray diffraction) pass the X-ray quality criteria listed in Table~\ref{{appendix:quality}}.  

These systems contain {df_holo['receptor_cluster'].nunique()} protein sequence combinations, as measured by the strongly connected components at 100% protein\_fident\_qcov.
As the curation workflow operates on the entire PDB, the collection and labelling of holo systems allowed us to simultaneously identify X chains 
from X PDB entries as being apo. Thus, \plinder also provides an automatically curated dataset of apo chains with no detectable ligand 
interactions (except with artifacts or ions). Using this, {apo['query_system'].nunique()} single protein chain holo systems could be linked to at least one 
corresponding apo structure of the same protein, and {pred['query_system'].nunique()} are linked to corresponding AFDB models.

Of the {n_ligands} ligands in holo systems covering {df_holo['ligand_unique_ccd_code'].nunique()} unique CCD codes, 
{_count_with_share(df_holo['ligand_is_lipinski'].sum(), n_ligands)} pass the Lipinski Ro5 criteria, 
{_count_with_share(df_holo['ligand_is_covalent'].sum(), n_ligands)} have a covalent linkage, 
{_count_with_share(df_holo['ligand_is_cofactor'].sum(), n_ligands)} are cofactors, 
{_count_with_share(df_holo['ligand_is_oligo'].sum(), n_ligands)} are oligo-saccharides, -nucleotides or -peptides, 
and {_count_with_share(df_holo['ligand_is_fragment'].sum(), n_ligands)} are fragments. 

{_n_systems(mms_series)} systems are part of {mms_series['congeneric_id'].nunique()} congeneric MMS, 
each with at least 3 ligands containing a common core. 

Thanks to extensive molecule processing and cleaning efforts to correct bond, valence and chirality issues, only {n_invalid} ligands in \plinder 
could not be processed by RDKit - thus, unlike many other datasets, the vast majority of \plinder can readily be processed and used by the 
typical pre-processing and feature extraction routines employed by deep learning methods, and those which cannot are clearly annotated.
"""

print(text)

At the time of writing of this article, \plinder contains 1344214 PLI systems extracted 
from 162978 PDB entries of which 
449383 are holo systems (with the remaining consisting of 573169 artifact systems, 
318060 ion systems, and 
3602 systems containing more than five protein and or ligand chains). 
Within the holo systems, 26.7% have more than one ligand, 
24.8% have more than one interacting protein chain, 
and 113498 (34.1% of systems determined by X-ray diffraction) pass the X-ray quality criteria listed in Tablerefappendix:quality.  

These systems contain 74256 protein sequence combinations, as measured by the strongly connected components at 100% protein\_fident\_qcov
As the curation workflow operates on the entire PDB, the collection and labelling of holo systems allowed us to simultaneously identify X chains 
from X PDB entries as being apo. Thus, plinder also provides an automatically curated dataset of apo chains with no detectable ligand 
interactions (except with artifac

In [39]:
# Collect (pdb_id, ligand name) pairs from the PDBbind general index.
pdb_id_ccds = set()
with open("v1/other_splits/pdbbind/INDEX_general_PL.2020") as index_file:
    for line_number, record in enumerate(index_file):
        # The first six lines are skipped, not parsed.
        if line_number >= 6:
            tokens = record.split()
            # The ligand name is the first token that opens with "(".
            ligand_tokens = [token for token in tokens if token.startswith("(")]
            ligand_name = ligand_tokens[0].replace('(', '').replace(')', '').strip().split('.')[0].upper()
            pdb_id_ccds.add((tokens[0], ligand_name))

# Restrict the annotation table to entries named in the index.
pdbbind_entry_ids = {pdb_id for pdb_id, _ in pdb_id_ccds}
df_smaller = df.loc[
    df["entry_pdb_id"].isin(pdbbind_entry_ids),
    ["system_id", "entry_pdb_id", "ligand_ccd_code", "ligand_unique_ccd_code"],
].reset_index(drop=True)

def map_peptide_id_pdbbind(peptidecode):
    """Turn a dash-joined multi-residue code into a ``"<n>-MER"`` name.

    ``n`` is the number of dash-separated parts, e.g.
    ``"HSR-HSR-BMA-MAN-MAN"`` -> ``"5-MER"``.
    """
    n_parts = peptidecode.count('-') + 1
    return f"{n_parts}-MER"

def _pdbbind_ligand_name(ccd_code: str) -> str:
    """Normalise one CCD code for comparison with the parsed index names.

    Codes are upper-cased; dash-joined multi-residue codes are replaced by
    their ``"<n>-MER"`` name via ``map_peptide_id_pdbbind``.
    """
    upper_code = ccd_code.upper()
    return map_peptide_id_pdbbind(upper_code) if "-" in ccd_code else upper_code


# Same normalisation for both code columns (previously two copy-pasted blocks).
df_smaller["new_ligand_code"] = df_smaller["ligand_unique_ccd_code"].map(_pdbbind_ligand_name)
df_smaller["new_ligand_code2"] = df_smaller["ligand_ccd_code"].map(_pdbbind_ligand_name)

# Index system ids by (pdb_id, normalised code) in one pass, so each index
# pair below is a dict lookup instead of a boolean scan of the whole frame.
systems_by_entry_and_code = defaultdict(set)
for code_column in ("new_ligand_code", "new_ligand_code2"):
    for entry_id, code, system_id in zip(
        df_smaller["entry_pdb_id"], df_smaller[code_column], df_smaller["system_id"]
    ):
        systems_by_entry_and_code[(entry_id, code)].add(system_id)

pdbbind_systems = set()
missing = []  # index pairs with no system matching on either code column
for pdb_id, ccd_code in tqdm(pdb_id_ccds):
    matched_systems = systems_by_entry_and_code.get((pdb_id, ccd_code))
    if matched_systems:
        pdbbind_systems |= matched_systems
    else:
        missing.append((pdb_id, ccd_code))

100%|██████████| 19443/19443 [06:08<00:00, 52.82it/s]


In [69]:
# Fallback for unmatched index pairs: if the PDB entry is present at all, take
# every system of that entry and record the pair as inconsistent.
inconsistent = set()
for pdb_id, ccd_code in tqdm(missing):
    entry_system_ids = df_smaller.loc[df_smaller["entry_pdb_id"] == pdb_id, "system_id"]
    if len(entry_system_ids) > 0:
        pdbbind_systems.update(entry_system_ids)
        inconsistent.add((pdb_id, ccd_code))

100%|██████████| 1811/1811 [00:11<00:00, 153.15it/s]


In [70]:
# (pairs recovered by the entry-level fallback, pairs still unmatched after it)
len(inconsistent), len(set(missing).difference(inconsistent))

(1375, 436)

In [71]:
# All annotation rows belonging to the matched PDBbind systems.
df_pdbbind = df[df["system_id"].isin(pdbbind_systems)].reset_index(drop=True)
# Row labels ordered by biounit id compared as integers.
biounit_order = df_pdbbind["system_biounit_id"].astype(int).sort_values().index
# Reorder, then keep the first row per (entry, biounit, ligand code).
df_pdbbind = df_pdbbind.reindex(index=biounit_order).drop_duplicates(
    subset=["entry_pdb_id", "system_biounit_id", "ligand_ccd_code"]
)

In [85]:
# Write the distinct PDBbind system ids, one per line (no trailing newline).
pdbbind_system_ids = df_pdbbind["system_id"].unique()
with open("v1/other_splits/pdbbind/pdbbind_systems", "w") as systems_file:
    systems_file.write("\n".join(pdbbind_system_ids))

In [86]:
# LP-PDBBind table; its unnamed first CSV column is renamed to entry_pdb_id.
pdbbind_lp = pd.read_csv("v1/other_splits/pdbbind_lp/LP_PDBBind.csv").rename(
    columns={"Unnamed: 0": "entry_pdb_id"}
)

In [89]:
# Preview the first rows of the LP-PDBBind table.
pdbbind_lp.head()

Unnamed: 0,entry_pdb_id,header,smiles,category,seq,resolution,date,type,new_split,CL1,CL2,CL3,remove_for_balancing_val,kd/ki,value,covalent
0,6r8o,isomerase,CSc1ccccc1[C@H]1CCCN1C(=O)CNC(=O)NCc1ccc2c(c1)...,refined,GNPLVYLDVDANGKPLGRVVLELKADVVPKTAENFRALCTGEKGFG...,1.36,2019-11-27,isomerase,test,True,True,True,False,Kd=0.006uM,8.22,False
1,3fh7,hydrolase/hydrolase inhibitor,O=C([O-])CCC[N@H+]1CCC[C@H]1COc1ccc(Oc2ccc(Cl)...,refined,VDTCSLASPASVCRTKHLHLRCSVDFTRRTLTGTAALTVQSQEDNL...,2.05,2010-01-05,hydrolase,test,True,True,True,False,Kd=25nM,7.6,False
2,4b7r,hydrolase,CCC(CC)O[C@@H]1C[C@H](C(=O)[O-])C[C@H]([NH3+])...,refined,VKLAGNSSLCPVSGWAIYSKDNSVRIGSKGDVFVIREPFISCSPLE...,1.9,2012-10-03,hydrolase,,True,True,True,False,Ki=0.23nM,9.64,False
3,3qfd,immune system,CC[C@H](C)[C@H](NC(=O)CNC(=O)[C@H](C)NC(=O)[C@...,refined,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,1.68,2011-09-28,other,train,False,False,False,False,Kd=68uM,4.17,False
4,3fvn,membrane protein,[NH3+][C@@H](C[C@]1(C(=O)[O-])C[C@H]2OCC[C@@H]...,refined,ANRTLIVTTILEEPYVMYRKSDKPLYGNDRFEGYCLDLLKELSNIL...,1.5,2010-01-19,membrane,val,True,True,True,False,Ki=169nM,6.77,False


In [93]:
# Map the PDBbind category to a split label. Order matters: an entry listed
# under several categories ends up with the label assigned last.
for category, split_label in (("general", "train"), ("refined", "val"), ("core", "test")):
    category_entry_ids = set(pdbbind_lp.loc[pdbbind_lp["category"] == category, "entry_pdb_id"])
    df_pdbbind.loc[df_pdbbind["entry_pdb_id"].isin(category_entry_ids), "original_split"] = split_label
df_pdbbind["original_split"].value_counts()

original_split
train    28059
val       9730
test       517
Name: count, dtype: int64

In [109]:
# Copy the LP-PDBBind `new_split` label onto every row of the matching entry.
# Assigned in train -> val -> test order, so a later label wins on overlap.
for split_label in ("train", "val", "test"):
    split_entry_ids = set(pdbbind_lp.loc[pdbbind_lp["new_split"] == split_label, "entry_pdb_id"])
    df_pdbbind.loc[df_pdbbind["entry_pdb_id"].isin(split_entry_ids), "lp_split"] = split_label
df_pdbbind["lp_split"].value_counts()

lp_split
train    22898
test      9493
val       4643
Name: count, dtype: int64

In [100]:
# Label rows with the EquiBind time split named by each timesplit_* file.
for filename in Path("v1/other_splits/equibind/").glob("timesplit_*"):
    with open(filename) as split_file:
        pdb_ids = {entry.strip() for entry in split_file}
    # Reduce the file stem to the bare split name by deleting these tokens in order.
    split = filename.stem
    for token in ("timesplit_", "no_lig_overlap", "no_rec_overlap", "_"):
        split = split.replace(token, "")
    in_split = df_pdbbind["entry_pdb_id"].isin(pdb_ids)
    df_pdbbind.loc[in_split, "equibind_split"] = split

In [101]:
# Row counts per equibind_split label.
df_pdbbind["equibind_split"].value_counts()

equibind_split
train    32018
val       1955
test       277
Name: count, dtype: int64

In [110]:
# Export each split assignment as a two-column CSV (system_id, split).
for split_column, csv_path in (
    ("original_split", "v1/other_splits/pdbbind/pdbbind.csv"),
    ("lp_split", "v1/other_splits/pdbbind_lp/pdbbind_lp.csv"),
    ("equibind_split", "v1/other_splits/equibind/equibind.csv"),
):
    split_table = df_pdbbind[["system_id", split_column]].rename(columns={split_column: "split"})
    split_table.to_csv(csv_path, index=False)

In [118]:
# Number of distinct PDB entries represented in df_pdbbind.
df_pdbbind["entry_pdb_id"].nunique()

19007

In [72]:
# Number of distinct systems in df_pdbbind.
df_pdbbind["system_id"].nunique()

30337

In [117]:
# System-type breakdown of df_pdbbind, counting each system_id once.
df_pdbbind.drop_duplicates("system_id")["system_type"].value_counts()

system_type
holo        28184
artifact     1442
ion           711
Name: count, dtype: int64

In [119]:
# FIXME(review): df_dockgen is only assigned in a later cell (the one that
# filters df by dockgen_systems). On a fresh kernel run top to bottom this
# line raises NameError; move this cell below that definition.
# Number of distinct PDB entries represented in df_dockgen.
df_dockgen["entry_pdb_id"].nunique()

16881

In [77]:
# Parse the DockGen split files into {file stem: {(pdb_id, biounit, ccd_code)}}.
dockgen_ids = defaultdict(set)
for name in ["train_ccd.txt", "val_ccd.txt", "test_ccd.txt"]:
    filename = Path("v1/other_splits/dockgen/") / name
    with open(filename) as f:
        for line in f:
            # Strip first: otherwise a three-field line leaves "\n" glued to
            # the CCD code and it can never match the annotation table.
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            parts = line.split("_")
            dockgen_ids[filename.stem].add((parts[0], parts[1], parts[2]))

# Restrict the annotation table to entries named in any split file.
dockgen_entry_ids = {key[0] for keys in dockgen_ids.values() for key in keys}
df_smaller = df.loc[
    df["entry_pdb_id"].isin(dockgen_entry_ids),
    ["system_id", "system_biounit_id", "entry_pdb_id", "ligand_ccd_code"],
].reset_index(drop=True)

# Index system ids by (pdb_id, biounit, ccd_code) in one pass, so each split
# entry below is a dict lookup instead of a boolean scan of the whole frame.
systems_by_key = defaultdict(set)
for entry_id, biounit_id, ligand_code, system_id in zip(
    df_smaller["entry_pdb_id"],
    df_smaller["system_biounit_id"],
    df_smaller["ligand_ccd_code"],
    df_smaller["system_id"],
):
    systems_by_key[(entry_id, biounit_id, ligand_code)].add(system_id)

dockgen_systems = defaultdict(set)
for n in dockgen_ids:
    for pdb_id, biounit, ccd_code in tqdm(dockgen_ids[n]):
        matched_systems = systems_by_key.get((pdb_id, biounit, ccd_code))
        # A split only gets an entry in dockgen_systems once something matched.
        if matched_systems:
            dockgen_systems[n] |= matched_systems

100%|██████████| 25698/25698 [08:10<00:00, 52.44it/s]
100%|██████████| 141/141 [00:02<00:00, 52.49it/s]
100%|██████████| 189/189 [00:03<00:00, 52.48it/s]


In [82]:
# All annotation rows of any system matched to a DockGen split.
df_dockgen = df[df["system_id"].isin(set.union(*dockgen_systems.values()))].reset_index(drop=True)
# Assigned in train -> val -> test order, so a system matched by several
# splits keeps the label assigned last.
for source_key, split_label in (("train_ccd", "train"), ("val_ccd", "val"), ("test_ccd", "test")):
    in_split = df_dockgen["system_id"].isin(dockgen_systems[source_key])
    df_dockgen.loc[in_split, "split"] = split_label
df_dockgen["system_id"].nunique()

41791

In [104]:
# Write the DockGen assignment (system_id, split) to CSV without the index column.
df_dockgen[["system_id", "split"]].to_csv("v1/other_splits/dockgen/dockgen.csv", index=False)

In [121]:
# System-type breakdown of df_dockgen, counting each system_id once.
df_dockgen.drop_duplicates("system_id")["system_type"].value_counts()

system_type
holo        37656
ion          3212
artifact      923
Name: count, dtype: int64

In [122]:
# Every PDB id mentioned in any DockGen split file (first element of each key).
all_dockgen_pdb_ids = {
    key[0]
    for split_keys in dockgen_ids.values()
    for key in split_keys
}

In [129]:
# Number of distinct PDB ids listed across the DockGen split files.
len(all_dockgen_pdb_ids)

18504

In [126]:
# DockGen PDB ids that do not occur in df's entry_pdb_id column.
len(all_dockgen_pdb_ids.difference(df["entry_pdb_id"]))

113

In [125]:
# DockGen PDB ids with no row in df_dockgen.
len(all_dockgen_pdb_ids.difference(df_dockgen["entry_pdb_id"]))

1623

In [74]:
# (distinct PDB entries, distinct ligand CCD codes) in df_pdbbind.
df_pdbbind["entry_pdb_id"].nunique(), df_pdbbind["ligand_ccd_code"].nunique()

(19007, 15070)

In [75]:
# PDBbind index PDB ids with no row in df_pdbbind.
len(set(x[0] for x in pdb_id_ccds).difference(df_pdbbind["entry_pdb_id"]))

436

In [128]:
# Number of distinct systems in df_dockgen.
df_dockgen["system_id"].nunique()

41791

In [105]:
# System ids of the PoseBusters set, one per line in the file.
with open("v1/other_splits/posebusters/posebuster_systems") as systems_file:
    posebuster_systems = {system_line.strip() for system_line in systems_file}
# Annotation rows belonging to those systems.
in_posebusters = df["system_id"].isin(posebuster_systems)
df_posebusters = df[in_posebusters].reset_index(drop=True)

In [108]:
stats_dataset = []
for subdf, name in zip([df_holo, df_pdbbind, df_dockgen], ["plinder", "pdbbind", 'dockgen', 'posebusters']):
    stats_dataset.append([name, subdf["system_id"].nunique(), 
    subdf['entry_pdb_id'].nunique(),
    subdf[subdf['passes_quality']]['system_id'].nunique(),
    subdf['receptor_cluster'].nunique(),
    subdf['ligand_rdkit_canonical_smiles'].nunique(),
    subdf['ligand_unique_ccd_code'].nunique(),
    subdf['system_pocket_CATH'].nunique(),
    subdf['system_pocket_SCOP2B'].nunique(),
    subdf['system_pocket_ECOD_t_name'].nunique(),
    subdf['system_pocket_ECOD_t_id'].nunique(),
    subdf['system_pocket_Kinase name'].nunique(),
    subdf[subdf['system_has_kinase_inhibitor']]['system_id'].nunique(),
    ])
stats_dataset = pd.DataFrame(stats_dataset, columns=['dataset', 'systems', 'pdb ids', 'pass quality',
 'receptors', 'SMILES', 'ccd codes', 'CATH', 'SCOP2B', 'ECOD t name','ECOD t id', 'protein kinase', 'kinase inhibitors'])
print(stats_dataset[stats_dataset.dataset!='posebuster'].T.to_latex().lower())


\begin{tabular}{llll}
\toprule
 & 0 & 1 & 2 \\
\midrule
dataset & plinder & pdbbind & dockgen \\
systems & 449383 & 30337 & 41791 \\
pdb ids & 110791 & 19007 & 16881 \\
pass quality & 113498 & 10818 & 19355 \\
receptors & 74256 & 5425 & 7961 \\
smiles & 51573 & 15279 & 9174 \\
ccd codes & 46988 & 15064 & 9164 \\
cath & 1641 & 649 & 603 \\
scop2b & 11154 & 2423 & 2817 \\
ecod t name & 1332 & 528 & 478 \\
ecod t id & 4458 & 1444 & 1513 \\
protein kinase & 297 & 184 & 174 \\
kinase inhibitors & 48064 & 4682 & 5605 \\
\bottomrule
\end{tabular}

