In [13]:
from glob import glob
import pandas as pd
import numpy as np
from Bio import PDB
import os
os.environ["PATH"] += os.pathsep + "/opt/conda/envs/team05/bin"
from rdkit import Chem
from rdkit.Chem import MolToSmiles, MolFromMol2File
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser

In [33]:
# NOTE(review): `data_df` is not assigned in any cell visible here; presumably it was
# created in a cell that was later removed or run out of order. Confirm this cell
# still works after Restart Kernel -> Run All.
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS
0,5svl,"A,C",KSVVVKSWTIGIINRVVQLLIISYFVGWVFLHEKAYQVRDTAIESS...,666,333333,"81,83,97,100,113,114,115,142,144,221,255,258,2..."
1,5ali,A,TLRAAVFDLDGVLALPAVFGVLGRTEEALALPRGLLNDAFQKGGPE...,546,546,"264,265,266,267,308,332,333,334,335,336,337,33..."
2,5a5d,A,AMLPISMSDEGDSFLVKDSLGENKIPKNPSKVVILDLGILDTFDAL...,289,289,"57,58,74,75,76,77,94,95,96,97,98,99,100,117,16..."
3,6bh5,A,EFVPPPECPVFEPSWEEFTDPLSFIGRIRPLAEKTGICKIRPPKDW...,292,292,"61,63,123,124,125,126,127,173,174,175,176,177,..."
4,1hqh,"A,B,C",KPIEIIGAPFSKGQPRGGVEKGPAALRKAGLVEKLKETEYNVRDHG...,942,314314314,"12,13,14,15,18,62,94,95,96,116,118,119,120,121..."
...,...,...,...,...,...,...
14117,6g6y,A,KNIQVVVRCRPFNLAERKASAHSIVECDPVRKEVSVRTGSSRKTYT...,319,319,"56,90,91,92,93,94,95,96,97,98,103,104,105,106,..."
14118,3wc7,A,GHSYEKYNNWETIEAWTKQVTSENPDLISRTAIGTTFLGNNIYLLK...,304,304,"62,63,65,66,69,119,121,136,137,138,139,140,143..."
14119,5zhm,"A,B",SMLWVGVVSIFPEMFRAISDYGITSRAVKQGLLTLTCWNPRVYTED...,476,239237,"59,60,61,62,63,88,89,90,91,92,93,95,96,113,114..."
14120,3fts,A,VDTCSLASPASVCRTKHLHLRCSVDFTRRTLTGTAALTVQSQEDNL...,607,607,"3,130,131,132,133,134,135,146,196,262,263,266,..."


In [35]:
import re
from math import log10
from typing import Optional

import pandas as pd

# Index file that the parsing loop below reads line by line.
# NOTE(review): despite the name `core_path`, this path points into the
# general-set directory (INDEX_general_PL_data.2020), not a core set.
core_path = "/workspace/datasets/PDBbind/general-set/index/INDEX_general_PL_data.2020"
# Destination CSV for the parsed affinity table written at the end of this cell.
out_csv   = "/workspace/binding_affinity/PDBbind_affinity.csv"

def to_nM(value: float, unit: str) -> Optional[float]:
    """Convert a concentration to nanomolar.

    Args:
        value: Magnitude expressed in ``unit``.
        unit: Molar unit label. Matched case-insensitively and ignoring
            surrounding whitespace; one of fM, pM, nM, uM, mM or M.

    Returns:
        ``value`` expressed in nM, or ``None`` when ``unit`` is not one of
        the recognised labels (callers must handle the ``None`` case).
    """
    # Multipliers that take one <unit> to nanomolar. The nM entry is the
    # integer 1 so that an nM input is returned with its type unchanged.
    factors_to_nM = {
        "fm": 1e-6,
        "pm": 1e-3,
        "nm": 1,
        "um": 1e3,
        "mm": 1e6,
        "m": 1e9,
    }
    factor = factors_to_nM.get(unit.strip().lower())
    if factor is None:
        return None
    return value * factor

# Matches an exact measurement of the form "<Ki|Kd|IC50>=<number><unit>".
# Fields where the measurement type is not immediately followed by "="
# do not match, so those rows keep only their logKa value.
pat_meas = re.compile(r"(Ki|Kd|IC50)=([0-9]*\.?[0-9]+)(pM|nM|uM|mM|M)", re.IGNORECASE)

rows = []
n_malformed = 0
with open(core_path) as f:
    for line in f:
        # Skip comment lines and blank lines.
        if line.startswith("#") or not line.strip():
            continue
        parts = line.split()
        try:
            # Column 0 is the PDB id, column 3 the logKa value and column 4
            # the raw measurement string.
            pdb_id, logKa, Ka_str = parts[0], float(parts[3]), parts[4]
        except (IndexError, ValueError):
            # Too few fields or a non-numeric logKa: count the line and move on
            # instead of aborting the whole parse.
            n_malformed += 1
            continue

        affinity_nM, pAff = None, None
        m = pat_meas.search(Ka_str)
        if m:
            affinity_nM = to_nM(float(m.group(2)), m.group(3))
            # log10 is undefined for zero, so only strictly positive values
            # get a pAff; anything else leaves pAff as None.
            if affinity_nM is not None and affinity_nM > 0:
                pAff = 9 - log10(affinity_nM)

        rows.append((pdb_id, affinity_nM, pAff, logKa))

if n_malformed:
    print(f"⚠️ Skipped {n_malformed} malformed line(s) in {core_path}")

df_aff = pd.DataFrame(rows, columns=["PDB", "Affinity_nM", "pAff", "logKa"])

# Cast to object dtype and replace missing entries with None.
df_aff = df_aff.astype(object).where(pd.notnull(df_aff), None)

df_aff.to_csv(out_csv, index=False)
print(f"✅ Saved parsed affinities to {out_csv}")

print(df_aff.head())


✅ Saved parsed affinities to /workspace/binding_affinity/PDBbind_affinity.csv
    PDB  Affinity_nM      pAff logKa
0  3zzf  400000000.0   0.39794   0.4
1  3gww  355000000.0  0.449772  0.45
2  1w8l  320000000.0   0.49485  0.49
3  3fqa  320000000.0   0.49485  0.49
4  1zsb  250000000.0   0.60206   0.6


In [46]:
import pandas as pd

# Load the sequence / binding-site table and the parsed affinity table.
sequence_df = pd.read_csv("/workspace/binding_site/datasets/PDBbind_data.tsv", sep="\t")
affinity_df = pd.read_csv("/workspace/binding_affinity/PDBbind_affinity.csv")

# Keep only the PDB entries that appear in both tables.
merged = sequence_df.merge(affinity_df, on="PDB", how="inner")

output_columns = [
    "PDB", "Chain", "Sequence", "Total_seq_lengths",
    "Chain_seq_lengths", "BS", "Affinity_nM", "pAff", "logKa"
]

rows = []
for record in merged.to_dict("records"):
    # The Sequence field holds one sequence per chain, separated by commas.
    chains = str(record["Sequence"]).split(",")
    chain_sizes = [len(chain) for chain in chains]

    # A missing binding-site entry is written out as an empty string.
    binding_site = "" if pd.isnull(record["BS"]) else str(record["BS"])

    rows.append(dict(
        PDB=record["PDB"],
        Chain="All",
        Sequence=",".join(chains),
        Total_seq_lengths=sum(chain_sizes),
        Chain_seq_lengths=",".join(str(size) for size in chain_sizes),
        BS=binding_site,
        Affinity_nM=record["Affinity_nM"],
        pAff=record["pAff"],
        logKa=record["logKa"],
    ))

final_df = pd.DataFrame(rows, columns=output_columns)

out_tsv = "/workspace/binding_affinity/datasets/train2020/merged_protein_ligand.tsv"
final_df.to_csv(out_tsv, index=False, sep="\t")

print(f"✅ Saved merged table to {out_tsv}")
print(final_df.head())


✅ Saved merged table to /workspace/binding_affinity/datasets/train/merged_protein_ligand.tsv
    PDB Chain                                           Sequence  \
0  5svl   All  KSVVVKSWTIGIINRVVQLLIISYFVGWVFLHEKAYQVRDTAIESS...   
1  5ali   All  TLRAAVFDLDGVLALPAVFGVLGRTEEALALPRGLLNDAFQKGGPE...   
2  5a5d   All  AMLPISMSDEGDSFLVKDSLGENKIPKNPSKVVILDLGILDTFDAL...   
3  6bh5   All  EFVPPPECPVFEPSWEEFTDPLSFIGRIRPLAEKTGICKIRPPKDW...   
4  1hqh   All  KPIEIIGAPFSKGQPRGGVEKGPAALRKAGLVEKLKETEYNVRDHG...   

   Total_seq_lengths Chain_seq_lengths  \
0                666           333,333   
1                546               546   
2                289               289   
3                292               292   
4                942       314,314,314   

                                                  BS  Affinity_nM      pAff  \
0  81,83,97,100,113,114,115,142,144,221,255,258,2...          2.8  8.552842   
1  264,265,266,267,308,332,333,334,335,336,337,33...          NaN       NaN   
2  57,58

In [None]:
import os
import pandas as pd
from rdkit.Chem import MolFromSmiles, MolToSmiles

def read_file(file):
    """Return every line of an open file object with surrounding whitespace stripped."""
    stripped_lines = []
    for raw_line in file.readlines():
        stripped_lines.append(raw_line.strip())
    return stripped_lines

def convert_smiles(row):
    """Convert the ligand MOL2 file of one PDB entry to a SMILES string.

    The ligand file is piped through two ``obabel`` invocations and the
    result is re-written by RDKit with ``isomericSmiles=False`` and
    ``kekuleSmiles=True``.

    Args:
        row: Object exposing a ``PDB`` attribute (e.g. a DataFrame row) that
            names the entry directory under the PDBbind general set.

    Returns:
        The SMILES string, or ``None`` if any step fails. The failure is
        printed together with the PDB id.
    """
    import subprocess
    import tempfile

    pdb = row.PDB
    pdb_path = "/workspace/datasets/PDBbind/general-set"
    mol = f"{pdb_path}/{pdb}/{pdb}_ligand.mol2"

    try:
        # Each call writes into its own scratch directory. A shared output file
        # would still hold the previous ligand's SMILES whenever obabel fails,
        # and that stale value would be returned for the wrong entry.
        with tempfile.TemporaryDirectory() as workdir:
            smi_path = os.path.join(workdir, "ligand.smi")

            # Argument lists (no shell) so the path is never interpreted by a shell.
            # NOTE(review): Open Babel documents lowercase `-xc` as the canonical
            # SMILES option; confirm the uppercase `-xC` used here is intended.
            first_pass = subprocess.run(
                ["obabel", "-imol2", mol, "-osmi", "-xC"],
                stdout=subprocess.PIPE,
            )
            subprocess.run(
                ["obabel", "-ismi", "-osmi", "-xk", "-O", smi_path],
                input=first_pass.stdout,
            )

            with open(smi_path) as f:
                # The first tab-separated field of the first line is the SMILES.
                smiles = f.readlines()[0].split('\t')[0].strip()

        parsed = MolFromSmiles(smiles)
        if parsed is None:
            raise ValueError(f"RDKit could not parse SMILES {smiles!r}")
        return MolToSmiles(parsed, isomericSmiles=False, kekuleSmiles=True)
    except Exception as e:
        print(f"{pdb}: {e}")
        return None

# Input and output locations for the SMILES-annotation step.
in_tsv = "/workspace/binding_affinity/datasets/train2020/merged_protein_ligand.tsv"
out_tsv = "/workspace/binding_affinity/datasets/train2020/merged_protein_ligand_withSMILES.tsv"

# Re-read the merged table from disk.
df = pd.read_csv(in_tsv, sep="\t")

# Run the ligand conversion once per row; rows that fail get None.
smiles_column = df.apply(convert_smiles, axis=1)
df["SMILES"] = smiles_column

df.to_csv(out_tsv, index=False, sep="\t")

print(f"✅ Saved TSV with SMILES to {out_tsv}")
print(df.head())


  Failed to kekulize aromatic bonds in MOL2 file (title is 5svl_ligand)

1 molecule converted
  Failed to kekulize aromatic SMILES (title is 5svl_ligand)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in MOL2 file (title is 1hqh_ligand)

1 molecule converted
  Failed to kekulize aromatic SMILES (title is 1hqh_ligand)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in MOL2 file (title is 5w8v_ligand)

1 molecule converted
  Failed to kekulize aromatic SMILES (title is 5w8v_ligand)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed t

In [49]:
import pandas as pd

# Re-read the SMILES-annotated table from disk.
in_tsv = "/workspace/binding_affinity/datasets/train2020/merged_protein_ligand_withSMILES.tsv"
df = pd.read_csv(in_tsv, sep="\t")

# Keep rows whose SMILES is a string of at most 150 characters ...
smiles_ok = df["SMILES"].apply(lambda smi: isinstance(smi, str) and not len(smi) > 150)
# ... and that also have a pAff value.
paff_ok = df["pAff"].notna()

df_filtered = df.loc[smiles_ok & paff_ok].reset_index(drop=True)

# Columns written to the training file, in output order.
training_columns = ["PDB", "Chain", "Sequence", "BS", "Affinity_nM", "pAff", "SMILES"]
final_df = df_filtered[training_columns]

out_tsv = "/workspace/binding_affinity/datasets/train2020/training.tsv"
final_df.to_csv(out_tsv, index=False, sep="\t")

print(f"✅ Saved final filtered TSV to {out_tsv}")
print(final_df.head())


✅ Saved final filtered TSV to /workspace/binding_affinity/datasets/train/merged_final.tsv
    PDB Chain                                           Sequence  \
0  5svl   All  KSVVVKSWTIGIINRVVQLLIISYFVGWVFLHEKAYQVRDTAIESS...   
1  6bh5   All  EFVPPPECPVFEPSWEEFTDPLSFIGRIRPLAEKTGICKIRPPKDW...   
2  1hqh   All  KPIEIIGAPFSKGQPRGGVEKGPAALRKAGLVEKLKETEYNVRDHG...   
3  3pax   All  KSKLAKPIQDLIKMIFDVESMKKAMVEFEIDLQKMPLGKLSKRQIQ...   
4  4li8   All  GTILLDLAPEDKEYQSVEEEMQSTIREHRDGGNAGGIFNRYNVIRI...   

                                                  BS  Affinity_nM      pAff  \
0  81,83,97,100,113,114,115,142,144,221,255,258,2...          2.8  8.552842   
1  61,63,123,124,125,126,127,173,174,175,176,177,...        250.0  6.602060   
2  12,13,14,15,18,62,94,95,96,116,118,119,120,121...        500.0  6.301030   
3  101,198,199,200,201,202,203,206,207,210,215,22...      10000.0  5.000000   
4  25,27,77,78,79,80,81,82,83,84,85,86,87,88,89,9...          7.5  8.124939   

                          