In [26]:
from glob import glob
import pandas as pd
import numpy as np
from Bio import PDB
import os
os.environ["PATH"] += os.pathsep + "/opt/conda/envs/team05/bin"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from rdkit import Chem
from rdkit.Chem import MolToSmiles, MolFromMol2File
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser

In [27]:
pdb_path = os.path.abspath("/workspace/datasets/PDBbind2016/core")
info_path = os.path.abspath("/workspace/datasets/PDBbind/")

In [28]:
complex_list = [
    f for f in os.listdir(pdb_path)
    if os.path.isdir(os.path.join(pdb_path, f)) and len(f) == 4
]

In [30]:
from Bio.PDB import PDBParser, PDBIO, Select
import os

def remove_HETATM_PDBbind(input_list, pdb_path):

    class NonHetSelect(Select):
        def accept_residue(self, residue):
            return 1 if residue.id[0] == " " else 0
    
    for pdb in input_list:
        src_file = f"{pdb_path}/{pdb}/{pdb}_protein.pdb"
        des_file = f"{pdb_path}/{pdb}/{pdb}_remove_HEATM_protein.pdb"

        if os.path.exists(des_file):
            print(f"[SKIP] {des_file} already exists")
            continue

        if not os.path.exists(src_file):
            print(f"[WARN] source file not found: {src_file}")
            continue

        structure = PDBParser().get_structure(pdb, src_file)
        io = PDBIO()
        io.set_structure(structure)
        io.save(des_file, NonHetSelect())
        print(f"[OK] saved: {des_file}")


In [31]:
# import os
# import glob

# def delete_remove_HETATM_files(pdb_path):
#     pattern = os.path.join(pdb_path, "*", "*_remove_HEATM_protein.pdb")
#     targets = glob.glob(pattern)
#     for file in targets:
#         try:
#             os.remove(file)
#             print(f"삭제됨: {file}")
#         except Exception as e:
#             print(f"삭제 실패: {file} → {e}")

# delete_remove_HETATM_files(pdb_path)


In [32]:
remove_HETATM_PDBbind(complex_list, pdb_path)

[SKIP] /workspace/datasets/PDBbind2016/core/3f3c/3f3c_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/3ge7/3ge7_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/3mss/3mss_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/3gc5/3gc5_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/4cig/4cig_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/3r88/3r88_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/4mrw/4mrw_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/4e5w/4e5w_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/1o5b/1o5b_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/2j78/2j78_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/PDBbind2016/core/5c1w/5c1w_remove

In [33]:
pdb_parser = PDB.PDBParser(QUIET = True)

In [34]:
amino_acids_short = {
    "ALA":"A", "ARG":"R", "ASN":"N", "ASP":"D", "CYS":"C",
    "GLU":"E", "GLN":"Q", "GLY":"G", "HIS":"H", "ILE":"I",
    "LEU":"L", "LYS":"K", "MET":"M", "PHE":"F", "PRO":"P",
    "SER":"S", "THR":"T", "TRP":"W", "TYR":"Y", "VAL":"V",
    "SEC":"U", "PYL":"O"
}

In [35]:
data_df = pd.DataFrame({"PDB":complex_list})

In [36]:
def get_info(pdb):
    try:
        """ Load protein info """
        structure = pdb_parser.get_structure(pdb, f"{pdb_path}/{pdb}/{pdb}_remove_HEATM_protein.pdb")
        chain_name_list, pdb_sequence_list, seq_lengths_list, protein_atom_coords, protein_atom_residue_list, reindex = list(), list(), list(), list(), list(), 0
        
        """ Extract protein info """
        for chain_name in list(structure[0].child_dict.keys()):
            chain = structure[0][chain_name]

            pdb_sequence = ""
            for residue in chain.get_residues():
                if residue.resname in amino_acids_short.keys():
                    pdb_sequence += amino_acids_short[residue.resname]

                    for atom in residue:
                        protein_atom_coords.append(atom.get_coord())
                        protein_atom_residue_list.append(reindex)
                    reindex += 1     

            if len(pdb_sequence) != 0:
                chain_name_list.append(chain_name)
                pdb_sequence_list.append(pdb_sequence)
                seq_lengths_list.append(len(pdb_sequence))

        """ Load pocket info """      
        protein_atom_coords, binding_index = np.array(protein_atom_coords), list()
        pocket_structure = pdb_parser.get_structure(pdb, f"{pdb_path}/{pdb}/{pdb}_pocket.pdb")

        pocket_coordi = list()

        for chain_name in list(pocket_structure[0].child_dict.keys()):
            chain = pocket_structure[0][chain_name]
            for residue in chain.get_residues():
                if residue.resname in amino_acids_short.keys():
                    for atom in residue:
                        pocket_coordi.append(atom.get_coord())
        
        """ Matching pocket info """
        pocket_coordi = np.array(pocket_coordi)
        bi_x, bi_y, bi_z = pocket_coordi[:, 0], pocket_coordi[:, 1], pocket_coordi[:, 2]

        for i, j, k in zip(bi_x, bi_y, bi_z):
            tmp_coordi = np.array([i, j, k], dtype = np.float32)
            ind = np.where((protein_atom_coords == tmp_coordi).all(axis = 1))[0][0]
            binding_index.append(protein_atom_residue_list[ind])

        binding_index = sorted(list(set(binding_index)))
        binding_index = list(map(str, binding_index))

        total_seq_lengths = np.sum(np.array(seq_lengths_list))
        seq_lengths_list = list(map(str, seq_lengths_list))

        return ",".join(chain_name_list), ",".join(pdb_sequence_list), total_seq_lengths, ",".join(seq_lengths_list), ",".join(binding_index)

    except Exception as e:
        print(pdb, e)
        return None

In [37]:
from multiprocessing import Process, Queue, Pool

def parallelize_dataframe(df, func, num_partitions=5):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    results = pool.map(func, df_split)
    pool.close()
    pool.join()
    return results

In [38]:
def get_pdb_info_bulk(df):
    return df.PDB.map(get_info)

In [39]:
info_results = parallelize_dataframe(data_df, get_pdb_info_bulk, 5)

In [40]:
info_results = pd.concat(info_results)

In [41]:
data_df["Chain"] = info_results.map(lambda a: a[0] if a is not None else None)

In [42]:
data_df["Sequence"] = info_results.map(lambda a: a[1] if a is not None else None)

In [43]:
data_df["Total_seq_lengths"] = info_results.map(lambda a: a[2] if a is not None else None)

In [44]:
data_df["Chain_seq_lengths"] = info_results.map(lambda a: a[3] if a is not None else None)

In [45]:
data_df["BS"] = info_results.map(lambda a: a[4] if a is not None else None)

In [46]:
data_df = data_df.loc[data_df.Sequence.isna()==False].reset_index(drop=True)
data_df = data_df.loc[data_df.Chain != " "].reset_index(drop=True)

In [47]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,"13,14,15,16,17,18,19,20,21,22,23,24,25,26,54,9..."
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,"32,33,34,35,36,37,41,57,58,59,60,61,62,63,89,9..."
2,3mss,A,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,"93,102,103,104,105,106,107,108,109,110,111,113..."
3,3gc5,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,736,368368,"32,34,36,57,58,59,60,89,90,91,92,93,94,95,96,9..."
4,4cig,"A,B",SPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,294,147147,"108,109,110,111,112,113,114,115,116,117,118,11..."
...,...,...,...,...,...,...
285,3fv1,A,ANRTLIVTTILEEPYVMYRKSDKPLYGNDRFEGYCLDLLKELSNIL...,256,256,"9,10,11,12,13,14,34,58,59,60,61,68,70,71,72,85..."
286,2zy1,A,MTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAIYAVCRK...,279,279,"14,17,18,20,21,24,25,36,39,40,43,44,46,47,48,1..."
287,4mme,A,REHWATRLGLILAMAGYAVDLGNFLRFPVQAAENGGGAFMIPYIIA...,504,504,"13,14,15,16,17,18,19,20,21,22,25,95,96,97,98,9..."
288,3zdg,"K,L,M,N,O",LDRADILYNIRQTSRPDVIPTQRDRPVAVSVSLKFINILEVNEITN...,994,199199199197200,"84,85,86,88,138,140,141,142,143,144,145,177,17..."


In [48]:
import pandas as pd
import re
from math import log10

core_path = "/workspace/datasets/PDBbind2016/index/INDEX_core_data.2016"
out_csv   = "/workspace/binding_affinity/datasets/Core2016/core2016_Afinity.csv"

def to_nM(value: float, unit: str) -> float:
    u = unit.lower()
    if u == "pm":  return value * 1e-3
    if u == "nm":  return value
    if u == "um":  return value * 1e3
    if u == "mm":  return value * 1e6
    if u == "m":   return value * 1e9
    return None

pat_meas = re.compile(r"(Ki|Kd|IC50)=([0-9]*\.?[0-9]+)(pM|nM|uM|mM|M)", re.IGNORECASE)

rows = []
with open(core_path) as f:
    for line in f:
        if line.startswith("#") or not line.strip():
            continue
        parts = line.split()
        pdb_id = parts[0]
        logKa  = float(parts[3])  
        Ka_str = parts[4]

        affinity_nM, pAff = None, None
        m = pat_meas.search(Ka_str)
        if m:
            val, unit = float(m.group(2)), m.group(3)
            affinity_nM = to_nM(val, unit)
            if affinity_nM:
                pAff = 9 - log10(affinity_nM)

        rows.append((pdb_id, affinity_nM, pAff, logKa))

df_aff = pd.DataFrame(rows, columns=["PDB", "Affinity_nM", "pAff", "loaKa"])

df_aff.to_csv(out_csv, index=False)
print(f"✅ Saved parsed affinities to {out_csv}")
df_aff.head()

df_aff = df_aff.where(pd.notnull(df_aff), None)


✅ Saved parsed affinities to /workspace/binding_affinity/datasets/core2016/core2016_Afinity.csv


In [49]:
data_df = pd.merge(data_df, df_aff, on="PDB", how="inner")

In [50]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Affinity_nM,pAff,loaKa
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,"13,14,15,16,17,18,19,20,21,22,23,24,25,26,54,9...",950.0,6.022276,6.02
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,"32,33,34,35,36,37,41,57,58,59,60,61,62,63,89,9...",2.0,8.698970,8.70
2,3mss,A,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,"93,102,103,104,105,106,107,108,109,110,111,113...",22000.0,4.657577,4.66
3,3gc5,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,736,368368,"32,34,36,57,58,59,60,89,90,91,92,93,94,95,96,9...",55.0,7.259637,7.26
4,4cig,"A,B",SPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,294,147147,"108,109,110,111,112,113,114,115,116,117,118,11...",214000.0,3.669586,3.67
...,...,...,...,...,...,...,...,...,...
285,3fv1,A,ANRTLIVTTILEEPYVMYRKSDKPLYGNDRFEGYCLDLLKELSNIL...,256,256,"9,10,11,12,13,14,34,58,59,60,61,68,70,71,72,85...",0.5,9.301030,9.30
286,2zy1,A,MTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAIYAVCRK...,279,279,"14,17,18,20,21,24,25,36,39,40,43,44,46,47,48,1...",40.0,7.397940,7.40
287,4mme,A,REHWATRLGLILAMAGYAVDLGNFLRFPVQAAENGGGAFMIPYIIA...,504,504,"13,14,15,16,17,18,19,20,21,22,25,95,96,97,98,9...",318.0,6.497573,6.50
288,3zdg,"K,L,M,N,O",LDRADILYNIRQTSRPDVIPTQRDRPVAVSVSLKFINILEVNEITN...,994,199199199197200,"84,85,86,88,138,140,141,142,143,144,145,177,17...",79.0,7.102373,7.10


In [51]:
lengths = data_df.Total_seq_lengths.values

In [52]:
data_df = data_df[lengths <= 1500].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Affinity_nM,pAff,loaKa
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,"13,14,15,16,17,18,19,20,21,22,23,24,25,26,54,9...",950.0,6.022276,6.02
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,"32,33,34,35,36,37,41,57,58,59,60,61,62,63,89,9...",2.0,8.698970,8.70
2,3mss,A,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,"93,102,103,104,105,106,107,108,109,110,111,113...",22000.0,4.657577,4.66
3,3gc5,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,736,368368,"32,34,36,57,58,59,60,89,90,91,92,93,94,95,96,9...",55.0,7.259637,7.26
4,4cig,"A,B",SPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,294,147147,"108,109,110,111,112,113,114,115,116,117,118,11...",214000.0,3.669586,3.67
...,...,...,...,...,...,...,...,...,...
275,3fv1,A,ANRTLIVTTILEEPYVMYRKSDKPLYGNDRFEGYCLDLLKELSNIL...,256,256,"9,10,11,12,13,14,34,58,59,60,61,68,70,71,72,85...",0.5,9.301030,9.30
276,2zy1,A,MTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAIYAVCRK...,279,279,"14,17,18,20,21,24,25,36,39,40,43,44,46,47,48,1...",40.0,7.397940,7.40
277,4mme,A,REHWATRLGLILAMAGYAVDLGNFLRFPVQAAENGGGAFMIPYIIA...,504,504,"13,14,15,16,17,18,19,20,21,22,25,95,96,97,98,9...",318.0,6.497573,6.50
278,3zdg,"K,L,M,N,O",LDRADILYNIRQTSRPDVIPTQRDRPVAVSVSLKFINILEVNEITN...,994,199199199197200,"84,85,86,88,138,140,141,142,143,144,145,177,17...",79.0,7.102373,7.10


In [53]:
data_df = data_df[data_df["BS"] != ""].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Affinity_nM,pAff,loaKa
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,"13,14,15,16,17,18,19,20,21,22,23,24,25,26,54,9...",950.0,6.022276,6.02
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,"32,33,34,35,36,37,41,57,58,59,60,61,62,63,89,9...",2.0,8.698970,8.70
2,3mss,A,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,"93,102,103,104,105,106,107,108,109,110,111,113...",22000.0,4.657577,4.66
3,3gc5,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,736,368368,"32,34,36,57,58,59,60,89,90,91,92,93,94,95,96,9...",55.0,7.259637,7.26
4,4cig,"A,B",SPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,294,147147,"108,109,110,111,112,113,114,115,116,117,118,11...",214000.0,3.669586,3.67
...,...,...,...,...,...,...,...,...,...
275,3fv1,A,ANRTLIVTTILEEPYVMYRKSDKPLYGNDRFEGYCLDLLKELSNIL...,256,256,"9,10,11,12,13,14,34,58,59,60,61,68,70,71,72,85...",0.5,9.301030,9.30
276,2zy1,A,MTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAIYAVCRK...,279,279,"14,17,18,20,21,24,25,36,39,40,43,44,46,47,48,1...",40.0,7.397940,7.40
277,4mme,A,REHWATRLGLILAMAGYAVDLGNFLRFPVQAAENGGGAFMIPYIIA...,504,504,"13,14,15,16,17,18,19,20,21,22,25,95,96,97,98,9...",318.0,6.497573,6.50
278,3zdg,"K,L,M,N,O",LDRADILYNIRQTSRPDVIPTQRDRPVAVSVSLKFINILEVNEITN...,994,199199199197200,"84,85,86,88,138,140,141,142,143,144,145,177,17...",79.0,7.102373,7.10


In [54]:
def convert_smiles(row):
    pdb = row.PDB
    
    mol = f"{pdb_path}/{pdb}/{pdb}_ligand.mol2"
    command = f'obabel -imol2 "{mol}" -osmi -xC | obabel -ismi -osmi -xk -O tmp.smi'
    os.system(command)
    
    smiles = read_file(open("tmp.smi"))[0].split('\t')[0].strip()
    
    try:
        smiles = MolToSmiles(MolFromSmiles(smiles),isomericSmiles = False, kekuleSmiles = True)
        return smiles
    
    except Exception as e:
        print(pdb, e)
        return None

In [55]:
def read_file(file):
    return [i.strip() for i in file.readlines()]

In [56]:
SMILES = data_df.apply(convert_smiles, axis = 1)

1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in MOL2 file (title is 3ge7_ligand)

1 molecule converted
  Failed to kekulize aromatic SMILES (title is 3ge7_ligand)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in MOL2 file (title is 1o5b_ligand)

1 molecule converted
  Failed to kekulize aromatic SMILES (title is 1o5b_ligand)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule convert

4mme Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


In [57]:
data_df["SMILES"] = SMILES

In [58]:
data_df = data_df.loc[data_df.SMILES.isna()==False].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Affinity_nM,pAff,loaKa,SMILES
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,"13,14,15,16,17,18,19,20,21,22,23,24,25,26,54,9...",950.0,6.022276,6.02,[NH3+]C(CC1=CC=C(F)C=C1)C(=O)[O-]
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,"32,33,34,35,36,37,41,57,58,59,60,61,62,63,89,9...",2.0,8.698970,8.70,CNC1=NC2=C(CC[NH2+]CC3CCCC3)C3=C(C=C2N1)C(=O)N...
2,3mss,A,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,"93,102,103,104,105,106,107,108,109,110,111,113...",22000.0,4.657577,4.66,CNC(=O)C([NH3+])CC1=CC=C(OCC2=CC=CC=C2)C=C1
3,3gc5,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,736,368368,"32,34,36,57,58,59,60,89,90,91,92,93,94,95,96,9...",55.0,7.259637,7.26,CNC1=NC2=C(C=C3C(=O)NC(N)=NC3=C2CC[NH3+])N1
4,4cig,"A,B",SPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,294,147147,"108,109,110,111,112,113,114,115,116,117,118,11...",214000.0,3.669586,3.67,COC1=CC=C(CNC(=O)C2=CC3=C(C=CC=C3)C=C2C[NH+](C...
...,...,...,...,...,...,...,...,...,...,...
274,3fcq,A,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316,316,"109,110,111,112,113,114,119,120,121,122,128,12...",1700000.0,2.769551,2.77,CC(=O)OC1=C(C(=O)[O-])C=CC=C1C
275,3fv1,A,ANRTLIVTTILEEPYVMYRKSDKPLYGNDRFEGYCLDLLKELSNIL...,256,256,"9,10,11,12,13,14,34,58,59,60,61,68,70,71,72,85...",0.5,9.301030,9.30,C[NH2+]C1C(O)COC2CC(CC([NH3+])C(=O)[O-])(C(=O)...
276,2zy1,A,MTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAIYAVCRK...,279,279,"14,17,18,20,21,24,25,36,39,40,43,44,46,47,48,1...",40.0,7.397940,7.40,[O]P([O])(=O)CC(=O)NCCCC1=CC=CC(OC2=CC=CC=C2)=C1
277,3zdg,"K,L,M,N,O",LDRADILYNIRQTSRPDVIPTQRDRPVAVSVSLKFINILEVNEITN...,994,199199199197200,"84,85,86,88,138,140,141,142,143,144,145,177,17...",79.0,7.102373,7.10,CC(CCOC(=O)N(C)C)[NH+](C)C


In [59]:
def get_SMILES_length(df):
    index = [True if len(smi) <= 150 else False for smi in df.SMILES.values]
    return index

In [60]:
smiles_index = get_SMILES_length(data_df)

In [61]:
data_df = data_df.loc[smiles_index].reset_index(drop=True)

In [62]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Affinity_nM,pAff,loaKa,SMILES
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,"13,14,15,16,17,18,19,20,21,22,23,24,25,26,54,9...",950.0,6.022276,6.02,[NH3+]C(CC1=CC=C(F)C=C1)C(=O)[O-]
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,"32,33,34,35,36,37,41,57,58,59,60,61,62,63,89,9...",2.0,8.698970,8.70,CNC1=NC2=C(CC[NH2+]CC3CCCC3)C3=C(C=C2N1)C(=O)N...
2,3mss,A,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,"93,102,103,104,105,106,107,108,109,110,111,113...",22000.0,4.657577,4.66,CNC(=O)C([NH3+])CC1=CC=C(OCC2=CC=CC=C2)C=C1
3,3gc5,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,736,368368,"32,34,36,57,58,59,60,89,90,91,92,93,94,95,96,9...",55.0,7.259637,7.26,CNC1=NC2=C(C=C3C(=O)NC(N)=NC3=C2CC[NH3+])N1
4,4cig,"A,B",SPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,294,147147,"108,109,110,111,112,113,114,115,116,117,118,11...",214000.0,3.669586,3.67,COC1=CC=C(CNC(=O)C2=CC3=C(C=CC=C3)C=C2C[NH+](C...
...,...,...,...,...,...,...,...,...,...,...
274,3fcq,A,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316,316,"109,110,111,112,113,114,119,120,121,122,128,12...",1700000.0,2.769551,2.77,CC(=O)OC1=C(C(=O)[O-])C=CC=C1C
275,3fv1,A,ANRTLIVTTILEEPYVMYRKSDKPLYGNDRFEGYCLDLLKELSNIL...,256,256,"9,10,11,12,13,14,34,58,59,60,61,68,70,71,72,85...",0.5,9.301030,9.30,C[NH2+]C1C(O)COC2CC(CC([NH3+])C(=O)[O-])(C(=O)...
276,2zy1,A,MTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAIYAVCRK...,279,279,"14,17,18,20,21,24,25,36,39,40,43,44,46,47,48,1...",40.0,7.397940,7.40,[O]P([O])(=O)CC(=O)NCCCC1=CC=CC(OC2=CC=CC=C2)=C1
277,3zdg,"K,L,M,N,O",LDRADILYNIRQTSRPDVIPTQRDRPVAVSVSLKFINILEVNEITN...,994,199199199197200,"84,85,86,88,138,140,141,142,143,144,145,177,17...",79.0,7.102373,7.10,CC(CCOC(=O)N(C)C)[NH+](C)C


In [63]:
data_dff = data_df.iloc[:, [0, 1, 2, 5, 6, 7, 9]]
data_dff

Unnamed: 0,PDB,Chain,Sequence,BS,Affinity_nM,pAff,SMILES
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,"13,14,15,16,17,18,19,20,21,22,23,24,25,26,54,9...",950.0,6.022276,[NH3+]C(CC1=CC=C(F)C=C1)C(=O)[O-]
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,"32,33,34,35,36,37,41,57,58,59,60,61,62,63,89,9...",2.0,8.698970,CNC1=NC2=C(CC[NH2+]CC3CCCC3)C3=C(C=C2N1)C(=O)N...
2,3mss,A,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,"93,102,103,104,105,106,107,108,109,110,111,113...",22000.0,4.657577,CNC(=O)C([NH3+])CC1=CC=C(OCC2=CC=CC=C2)C=C1
3,3gc5,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,"32,34,36,57,58,59,60,89,90,91,92,93,94,95,96,9...",55.0,7.259637,CNC1=NC2=C(C=C3C(=O)NC(N)=NC3=C2CC[NH3+])N1
4,4cig,"A,B",SPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,"108,109,110,111,112,113,114,115,116,117,118,11...",214000.0,3.669586,COC1=CC=C(CNC(=O)C2=CC3=C(C=CC=C3)C=C2C[NH+](C...
...,...,...,...,...,...,...,...
274,3fcq,A,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,"109,110,111,112,113,114,119,120,121,122,128,12...",1700000.0,2.769551,CC(=O)OC1=C(C(=O)[O-])C=CC=C1C
275,3fv1,A,ANRTLIVTTILEEPYVMYRKSDKPLYGNDRFEGYCLDLLKELSNIL...,"9,10,11,12,13,14,34,58,59,60,61,68,70,71,72,85...",0.5,9.301030,C[NH2+]C1C(O)COC2CC(CC([NH3+])C(=O)[O-])(C(=O)...
276,2zy1,A,MTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAIYAVCRK...,"14,17,18,20,21,24,25,36,39,40,43,44,46,47,48,1...",40.0,7.397940,[O]P([O])(=O)CC(=O)NCCCC1=CC=CC(OC2=CC=CC=C2)=C1
277,3zdg,"K,L,M,N,O",LDRADILYNIRQTSRPDVIPTQRDRPVAVSVSLKFINILEVNEITN...,"84,85,86,88,138,140,141,142,143,144,145,177,17...",79.0,7.102373,CC(CCOC(=O)N(C)C)[NH+](C)C


In [64]:
df = data_dff[data_df["pAff"].notna()].reset_index(drop=True)
df

Unnamed: 0,PDB,Chain,Sequence,BS,Affinity_nM,pAff,SMILES
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,"13,14,15,16,17,18,19,20,21,22,23,24,25,26,54,9...",950.0,6.022276,[NH3+]C(CC1=CC=C(F)C=C1)C(=O)[O-]
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,"32,33,34,35,36,37,41,57,58,59,60,61,62,63,89,9...",2.0,8.698970,CNC1=NC2=C(CC[NH2+]CC3CCCC3)C3=C(C=C2N1)C(=O)N...
2,3mss,A,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,"93,102,103,104,105,106,107,108,109,110,111,113...",22000.0,4.657577,CNC(=O)C([NH3+])CC1=CC=C(OCC2=CC=CC=C2)C=C1
3,3gc5,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,"32,34,36,57,58,59,60,89,90,91,92,93,94,95,96,9...",55.0,7.259637,CNC1=NC2=C(C=C3C(=O)NC(N)=NC3=C2CC[NH3+])N1
4,4cig,"A,B",SPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,"108,109,110,111,112,113,114,115,116,117,118,11...",214000.0,3.669586,COC1=CC=C(CNC(=O)C2=CC3=C(C=CC=C3)C=C2C[NH+](C...
...,...,...,...,...,...,...,...
274,3fcq,A,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,"109,110,111,112,113,114,119,120,121,122,128,12...",1700000.0,2.769551,CC(=O)OC1=C(C(=O)[O-])C=CC=C1C
275,3fv1,A,ANRTLIVTTILEEPYVMYRKSDKPLYGNDRFEGYCLDLLKELSNIL...,"9,10,11,12,13,14,34,58,59,60,61,68,70,71,72,85...",0.5,9.301030,C[NH2+]C1C(O)COC2CC(CC([NH3+])C(=O)[O-])(C(=O)...
276,2zy1,A,MTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAIYAVCRK...,"14,17,18,20,21,24,25,36,39,40,43,44,46,47,48,1...",40.0,7.397940,[O]P([O])(=O)CC(=O)NCCCC1=CC=CC(OC2=CC=CC=C2)=C1
277,3zdg,"K,L,M,N,O",LDRADILYNIRQTSRPDVIPTQRDRPVAVSVSLKFINILEVNEITN...,"84,85,86,88,138,140,141,142,143,144,145,177,17...",79.0,7.102373,CC(CCOC(=O)N(C)C)[NH+](C)C


In [65]:
df.to_csv("/workspace/binding_affinity/datasets/Core2016/Core2016.tsv", sep="\t", index=False)