In [1]:
from glob import glob
import pandas as pd
import numpy as np
from Bio import PDB
import subprocess
import os
os.environ["PATH"] += os.pathsep + "/opt/conda/envs/team05/bin"
os.environ["PATH"] += os.pathsep + "/workspace/chimera/bin"
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles

In [2]:
pdb_path = os.path.abspath("datasets/COACH420/")
info_path = os.path.abspath("datasets/COACH420/")
pdb_paths = os.path.abspath("datasets/COACH420/pdbs/")

In [3]:
complex_list = [
    f.replace("_chimera.pdb", "")
    for f in os.listdir(pdb_paths)
    if f.endswith("_chimera.pdb")
]

In [4]:
# %%bash -s $pdb_path
# export PATH="/opt/conda/envs/team05/bin:/workspace/chimera/bin:$PATH"

# path=$1

# for file in $path/*/*'.pdb';do
#     output_path=${file%.pdb}"_chimera.pdb"
#     echo -e "open $file \n write format pdb 0 $output_path \n stop" | chimera --nogui
# done > chimer.log

In [5]:
def read_file(file):
    return file.readlines()

def preprocessing_lig_code(lines):
    lig_info = dict()
    for i, line in enumerate(lines):
        line = line.strip()
        if not line or line.startswith("HEADER") or line.startswith("#"):
            continue  # 빈 줄, 주석, 헤더 건너뜀
        line_list = line.split()
        if len(line_list) >= 2 and "/" in line_list[0]:
            pdb = line_list[0].split("/")[1].split(".")[0]
            ligand = line_list[1].split(",")
            lig_info[pdb] = ligand
        else:
            print(f"❌ 건너뜀 (index={i}): {line}")
    return lig_info



In [6]:
code_path = os.path.abspath("datasets/COACH420/")

In [7]:
lig_info = preprocessing_lig_code(read_file(open(f"{code_path}/coach420(mlig).ds", "r")))

In [8]:
from scipy.spatial import distance_matrix
from multiprocessing import Process, Queue, Pool

In [9]:
pdb_parser = PDB.PDBParser(QUIET=True)

In [10]:
amino_acids_short = {"ALA":"A", "ARG":"R", "ASN":"N", "ASP":"D", "CYS":"C", "GLU":"E", "GLN":"Q", "GLY":"G", "HIS":"H", "ILE":"I", "LEU":"L", "LYS":"K", "MET":"M", "PHE":"F", "PRO":"P", "SER":"S", "THR":"T", "TRP":"W", "TYR":"Y", "VAL":"V", "SEC":"U", "PYL":"O"}

In [11]:
def parallelize_dataframe(df, func, num_partitions=5):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    results = pool.map(func, df_split)
    pool.close()
    pool.join()
    return results

In [12]:
def get_binding_sites(protein_coords, ligand_coords, protein_atom_residues):
    P_L_distance_matrix = distance_matrix(protein_coords, ligand_coords)
    return sorted(list(set(protein_atom_residues[np.where(P_L_distance_matrix<=4.)[0]])))

In [13]:
def get_info(pdb):
    """ Load protein info """

    if pdb not in lig_info:
        print(f"[SKIP] {pdb} not in lig_info")
        return None

    structure = pdb_parser.get_structure(pdb, f"{pdb_paths}/{pdb}_chimera.pdb")
    
    chain_name_list, pdb_sequence_list, seq_lengths_list, protein_atom_coords, protein_atom_residue_list, reindex = list(), list(), list(), list(), list(), 0
    ligand_total_coords, ligand_chain_list, ligand_chain_number, ligand_chain_code = list(), list(), list(), list()
    
    lig_code = lig_info[pdb]

    """ Exclude complex with 20 amino acids ligand """
    for i in lig_code:
        if i in amino_acids_short.keys():
            return None
        
    """ Extract protein info """
    for chain_name in list(structure[0].child_dict.keys()):
        chain = structure[0][chain_name]

        pdb_sequence, binding_index_list = "", list()
        
        for residue in chain.get_residues():
            if residue.resname in amino_acids_short.keys():
                pdb_sequence += amino_acids_short[residue.resname]
                
                # protein info
                for atom in residue:
                    protein_atom_coords.append(atom.get_coord())
                    protein_atom_residue_list.append(reindex)
                reindex += 1 
                
            elif residue.resname in lig_code:
                ligand_coords = list()
                
                ligand_chain_list.append(chain_name)
                ligand_chain_number.append(str(residue.get_id()[1]))
                ligand_chain_code.append(residue.resname)
                
                # ligand info
                for atom in residue:
                    ligand_coords.append(atom.get_coord())  
                ligand_total_coords.append(ligand_coords)
                
        if len(pdb_sequence) != 0:
            chain_name_list.append(chain_name)
            pdb_sequence_list.append(pdb_sequence)
            seq_lengths_list.append(len(pdb_sequence))                


    """ Get binding sites info """
    for lig_coordi in ligand_total_coords:
        if len(protein_atom_coords) != 0 and len(lig_coordi) !=0:
            binding_index = get_binding_sites(protein_atom_coords, lig_coordi, np.array(protein_atom_residue_list)) 

            binding_index = list(map(str, binding_index))
            binding_index_list.append(",".join(binding_index))

        else:
            print(pdb)
            return None
            
    total_seq_lengths = np.sum(np.array(seq_lengths_list))
    seq_lengths_list = list(map(str, seq_lengths_list))
    
    return ",".join(chain_name_list), ",".join(pdb_sequence_list), total_seq_lengths, ",".join(seq_lengths_list), "|".join(binding_index_list), ",".join(ligand_chain_list), ",".join(ligand_chain_number), ",".join(ligand_chain_code)

In [14]:
def get_raw_protein_info_bulk(df):
    return df.coach_PDB.map(get_info)              

In [15]:
pdb = [coach_PDB[:-1] for coach_PDB in complex_list]

In [16]:
data_df = pd.DataFrame({"coach_PDB":complex_list, "PDB":pdb})
data_df

Unnamed: 0,coach_PDB,PDB
0,1h1sA,1h1s
1,1jcgA,1jcg
2,3hvlA,3hvl
3,1kaqA,1kaq
4,1ltzA,1ltz
...,...,...
384,2qwiA,2qwi
385,1jclB,1jcl
386,3b3fA,3b3f
387,2vflA,2vfl


In [17]:
info_results = parallelize_dataframe(data_df, get_raw_protein_info_bulk, num_partitions = 5)
info_results = pd.concat(info_results)

  return bound(*args, **kwds)


[SKIP] 1fcvA not in lig_info[SKIP] 3e3sA not in lig_info

[SKIP] 3ek5A not in lig_info
[SKIP] 1xz8A not in lig_info
[SKIP] 1jcgA not in lig_info
[SKIP] 1kaqA not in lig_info
[SKIP] 3gh6A not in lig_info
[SKIP] 1nhvA not in lig_info
[SKIP] 3fwrA not in lig_info
[SKIP] 3idoA not in lig_info
[SKIP] 2x60A not in lig_info
[SKIP] 3dwrA not in lig_info
[SKIP] 148lE not in lig_info
[SKIP] 1einA not in lig_info
[SKIP] 1lqdB not in lig_info
[SKIP] 1bq3A not in lig_info
[SKIP] 1sgcA not in lig_info
[SKIP] 3b6aA not in lig_info
[SKIP] 2yw9D not in lig_info
[SKIP] 2dptA not in lig_info
[SKIP] 1a8tA not in lig_info
[SKIP] 1zu0A not in lig_info
[SKIP] 2za1A not in lig_info
[SKIP] 3dsrA not in lig_info
[SKIP] 3gidA not in lig_info
[SKIP] 2zdqA not in lig_info
[SKIP] 2vfcA not in lig_info
[SKIP] 1theA not in lig_info
[SKIP] 2j4eB not in lig_info
[SKIP] 1yfrA not in lig_info
[SKIP] 2bbwA not in lig_info
[SKIP] 2e5mA not in lig_info
[SKIP] 2cx8A not in lig_info
[SKIP] 2i4nB not in lig_info
[SKIP] 1i1hA n

In [18]:
data_df["Chain"] = info_results.map(lambda a: a[0] if a is not None else None)
data_df["Sequence"] = info_results.map(lambda a: a[1] if a is not None else None)
data_df["Total_seq_lengths"] = info_results.map(lambda a: a[2] if a is not None else None)
data_df["Chain_seq_lengths"] = info_results.map(lambda a: a[3] if a is not None else None)
data_df["BS"] = info_results.map(lambda a: a[4] if a is not None else None)
data_df["Ligand_chain"] = info_results.map(lambda a: a[5] if a is not None else None)
data_df["Ligand_chain_number"] = info_results.map(lambda a: a[6] if a is not None else None)
data_df["Ligand_chain_code"] = info_results.map(lambda a: a[7] if a is not None else None)
data_df = data_df.loc[data_df.Sequence.isna()==False].reset_index(drop=True)
data_df = data_df.loc[data_df.Chain != " "].reset_index(drop=True)

In [19]:
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code
0,1h1sA,1h1s,A,SMENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVP...,297.0,297,"10,11,12,13,18,31,64,80,81,82,83,84,85,86,89,1...",A,1298,4SP
1,3hvlA,3hvl,A,GLTEEQRMMIRELMDAQMKTFDTTFSHFKNFRLPGVLSREEAAKWS...,287.0,287,"54,88,89,91,92,96,126,130,133,144,151,162,163,...",A,1,SRL
2,1ltzA,1ltz,A,FVVPDITTRKNVGLSHDANDFTLPQPLDRYSAEDHATWATLYQRQC...,274.0,274,9193949597100152172177,A,500,HBI
3,1k7eA,1k7e,A,MERYENLFAQLNDRREGAFVPFVTLGDPGIEQSLKIIDTLIDAGAD...,261.0,261,2148596399152174182204205224226227,A,401,IAG
4,2q6vA,2q6v,A,RPCYLVLSSHDFRTPRRANIHFITDQLALRGTTRFFSLRYSRLSRM...,370.0,370,"214,215,216,217,239,240,258,259,261,264,278,29...",A,1081,UDP
...,...,...,...,...,...,...,...,...,...,...
266,3nk7A,3nk7,A,MTEPAIITNASDPAVQRIIDVTKHSIKTTLIEDTEPLMECIRAGVQ...,265.0,265,"119,185,186,187,207,208,209,210,213,226,227,22...","A,B",770770,"SAM,SAM"
267,1efyA,1efy,A,KSKLAKPIQDLIKMIFDVESMKKAMVEFEIDLQKMPLGKLSKRQIQ...,350.0,350,101200201227234235236242245326,A,201,BZC
268,2qwiA,2qwi,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388.0,388,"36,37,69,70,74,97,141,143,146,165,195,196,211,...",A,800,G20
269,3b3fA,3b3f,A,TEESSAVQYFQFYGYLSQQQNMMQDYVRTGTYQRAILQNHTDFKDK...,337.0,337,"8,9,12,18,21,27,51,52,53,56,57,73,74,75,99,100...",A,481,SAH


In [20]:
lengths = data_df.Total_seq_lengths.values

In [21]:
data_df = data_df[lengths <= 1500].reset_index(drop=True)
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code
0,1h1sA,1h1s,A,SMENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVP...,297.0,297,"10,11,12,13,18,31,64,80,81,82,83,84,85,86,89,1...",A,1298,4SP
1,3hvlA,3hvl,A,GLTEEQRMMIRELMDAQMKTFDTTFSHFKNFRLPGVLSREEAAKWS...,287.0,287,"54,88,89,91,92,96,126,130,133,144,151,162,163,...",A,1,SRL
2,1ltzA,1ltz,A,FVVPDITTRKNVGLSHDANDFTLPQPLDRYSAEDHATWATLYQRQC...,274.0,274,9193949597100152172177,A,500,HBI
3,1k7eA,1k7e,A,MERYENLFAQLNDRREGAFVPFVTLGDPGIEQSLKIIDTLIDAGAD...,261.0,261,2148596399152174182204205224226227,A,401,IAG
4,2q6vA,2q6v,A,RPCYLVLSSHDFRTPRRANIHFITDQLALRGTTRFFSLRYSRLSRM...,370.0,370,"214,215,216,217,239,240,258,259,261,264,278,29...",A,1081,UDP
...,...,...,...,...,...,...,...,...,...,...
266,3nk7A,3nk7,A,MTEPAIITNASDPAVQRIIDVTKHSIKTTLIEDTEPLMECIRAGVQ...,265.0,265,"119,185,186,187,207,208,209,210,213,226,227,22...","A,B",770770,"SAM,SAM"
267,1efyA,1efy,A,KSKLAKPIQDLIKMIFDVESMKKAMVEFEIDLQKMPLGKLSKRQIQ...,350.0,350,101200201227234235236242245326,A,201,BZC
268,2qwiA,2qwi,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388.0,388,"36,37,69,70,74,97,141,143,146,165,195,196,211,...",A,800,G20
269,3b3fA,3b3f,A,TEESSAVQYFQFYGYLSQQQNMMQDYVRTGTYQRAILQNHTDFKDK...,337.0,337,"8,9,12,18,21,27,51,52,53,56,57,73,74,75,99,100...",A,481,SAH


In [22]:
def fwrite(fw, lines):
    
    for i in lines:
        fw.write("%s\n"%i)
    fw.close()

def extract_ligand_info(lines, chain, number, code):
    return [line.strip() for line in lines if (line[17:20].strip() == code) and (line[21:22].strip() == chain) and (line[22:26].strip() == number)]
    
def add_ligand(row):
    SMILES = list()
    
    coach_PDB, ligand_chain, ligand_chain_number, ligand_chain_code = row.values[0], row.values[7].split(","), row.values[8].split(","), row.values[9].split(",")
    lines = read_file(open(f"{pdb_paths}/{coach_PDB}.pdb", "r"))
    
    for chain, number, code in zip(ligand_chain, ligand_chain_number, ligand_chain_code):
        ligand_files = extract_ligand_info(lines, chain, number, code)

        fwrite(open(f"{pdb_paths}/{coach_PDB}_{chain}_{number}_{code}_ligand.pdb", "w"), ligand_files)
        
        try:
            command = f"obabel -ipdb {pdb_paths}/{coach_PDB}_{chain}_{number}_{code}_ligand.pdb -osmi -xC | obabel -ismi -osmi -xk -O tmp.smi"

            os.system(command)

            smiles = read_file(open("tmp.smi"))[0].split('\t')[0].strip()            

            smiles = MolToSmiles(MolFromSmiles(smiles),isomericSmiles = False, kekuleSmiles = True)
            SMILES.append(smiles)

        except Exception as e:
            print(coach_PDB, e)
            return None  
    
    return ",".join(SMILES) 

In [23]:
SMILES = data_df.apply(add_ligand, axis = 1)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/1k7eA_A_401_IAG_ligand.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/2g25A_A_887_TDK_ligand.pdb)

1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/2g25A_B_887_TDK_ligand.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
0 molecules converted
0 molecules converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted


1goqA list index out of range


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
[01:40:19] Explicit valence for atom # 13 N, 4, is greater than permitted
0 molecules converted
0 molecules converted


2fhjA Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)
2qehA list index out of range


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/1e5qA_A_500_NDP_ligand.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
[01:40:20] Explicit valence for atom # 26 N, 4, is greater than 

1wopA Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/2cwhB_B_1510_NDP_ligand.pdb)

1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/2cwhB_B_1520_PYC_ligand.pdb)

1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/2cwhB_A_2510_NDP_ligand.pdb)

1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/3efvA_A_463_NAD_ligand.pdb)

1 molecule converted
1 mo

1qhiA Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/2ed4A_A_400_FAD_ligand.pdb)

1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/2ed4A_A_401_NAD_ligand.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule

1q8jA Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/1if7A_A_555_SBR_ligand.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/1n07A_A_165_FMN_ligand.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/2b3dA_A_1205_FAD_ligand.pdb)

1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /works

1ettH Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
[01:40:32] Explicit valence for atom # 16 N, 4, is greater than permitted
1 

1qjiA Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/1b8uA_A_334_NAD_ligand.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/2gz3A_A_367_NAP_ligand.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule

1br6A Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/1uvtH_H_1_I48_ligand.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 m

1ex8A Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)
1dcpA Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
[01:40:44] Explicit valence for atom # 17 N, 4, is greater than permitted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted


1jylA Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
[01:40:44] Explicit valence for atom # 17 N, 4, is greater than permitted
1 molecule converted
1 molecule converted
[01:40:45] Explicit valence for atom # 19 N, 4, is greater than permitted


1etrH Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)
7dfrA Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /workspace/datasets/COACH420/pdbs/1u72A_A_187_NDP_ligand.pdb)

1 molecule converted
1

In [24]:
data_df["SMILES"] = SMILES

In [25]:
data_df = data_df.loc[data_df.SMILES.isna()==False].reset_index(drop=True)
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code,SMILES
0,1h1sA,1h1s,A,SMENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVP...,297.0,297,"10,11,12,13,18,31,64,80,81,82,83,84,85,86,89,1...",A,1298,4SP,NS(=O)(=O)C1=CC=C(NC2=NC3=NC=NC3C(OCC3CCCCC3)=...
1,3hvlA,3hvl,A,GLTEEQRMMIRELMDAQMKTFDTTFSHFKNFRLPGVLSREEAAKWS...,287.0,287,"54,88,89,91,92,96,126,130,133,144,151,162,163,...",A,1,SRL,CCOP(=O)(OCC)C(=CC1=CC(C(C)(C)C)=C(O)C(C(C)(C)...
2,1ltzA,1ltz,A,FVVPDITTRKNVGLSHDANDFTLPQPLDRYSAEDHATWATLYQRQC...,274.0,274,9193949597100152172177,A,500,HBI,CC(O)C(O)C1=CN=CC(C(=O)N=CN)=N1
3,1k7eA,1k7e,A,MERYENLFAQLNDRREGAFVPFVTLGDPGIEQSLKIIDTLIDAGAD...,261.0,261,2148596399152174182204205224226227,A,401,IAG,O=C(O)CNC(=O)CC1=C2C=CC=CC2=NC1
4,2q6vA,2q6v,A,RPCYLVLSSHDFRTPRRANIHFITDQLALRGTTRFFSLRYSRLSRM...,370.0,370,"214,215,216,217,239,240,258,259,261,264,278,29...",A,1081,UDP,O=C1CCN(C2OC(COP(=O)(O)OP(=O)(O)O)C(O)C2O)C(=O)N1
...,...,...,...,...,...,...,...,...,...,...,...
252,3nk7A,3nk7,A,MTEPAIITNASDPAVQRIIDVTKHSIKTTLIEDTEPLMECIRAGVQ...,265.0,265,"119,185,186,187,207,208,209,210,213,226,227,22...","A,B",770770,"SAM,SAM",C[SH](CCC(N)C(=O)O)CC1OC(N2C=NC3=C(N)N=CN=C32)...
253,1efyA,1efy,A,KSKLAKPIQDLIKMIFDVESMKKAMVEFEIDLQKMPLGKLSKRQIQ...,350.0,350,101200201227234235236242245326,A,201,BZC,COC1=CC=CC(C2=NC3=C(C(N)=O)C=CC=C3N2)=C1
254,2qwiA,2qwi,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388.0,388,"36,37,69,70,74,97,141,143,146,165,195,196,211,...",A,800,G20,CCCN(C)C(=O)C1OCCC(N=C(N)N)C1NC(C)=O.O.O
255,3b3fA,3b3f,A,TEESSAVQYFQFYGYLSQQQNMMQDYVRTGTYQRAILQNHTDFKDK...,337.0,337,"8,9,12,18,21,27,51,52,53,56,57,73,74,75,99,100...",A,481,SAH,NC1=C2N=CN(C3OC(CSCCC(N)C(=O)O)C(O)C3O)C2=NC=N1


In [26]:
def get_SMILES_length(df):
    SMILES = df.SMILES.values
    index = []

    for smiles in SMILES:
        too_long = any(len(smi) > 160 for smi in smiles.split(","))
        index.append(False if too_long else True)
    
    return index


In [27]:
smiles_index = get_SMILES_length(data_df)

In [28]:
data_df = data_df.loc[smiles_index].reset_index(drop=True)

In [29]:
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code,SMILES
0,1h1sA,1h1s,A,SMENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVP...,297.0,297,"10,11,12,13,18,31,64,80,81,82,83,84,85,86,89,1...",A,1298,4SP,NS(=O)(=O)C1=CC=C(NC2=NC3=NC=NC3C(OCC3CCCCC3)=...
1,3hvlA,3hvl,A,GLTEEQRMMIRELMDAQMKTFDTTFSHFKNFRLPGVLSREEAAKWS...,287.0,287,"54,88,89,91,92,96,126,130,133,144,151,162,163,...",A,1,SRL,CCOP(=O)(OCC)C(=CC1=CC(C(C)(C)C)=C(O)C(C(C)(C)...
2,1ltzA,1ltz,A,FVVPDITTRKNVGLSHDANDFTLPQPLDRYSAEDHATWATLYQRQC...,274.0,274,9193949597100152172177,A,500,HBI,CC(O)C(O)C1=CN=CC(C(=O)N=CN)=N1
3,1k7eA,1k7e,A,MERYENLFAQLNDRREGAFVPFVTLGDPGIEQSLKIIDTLIDAGAD...,261.0,261,2148596399152174182204205224226227,A,401,IAG,O=C(O)CNC(=O)CC1=C2C=CC=CC2=NC1
4,2q6vA,2q6v,A,RPCYLVLSSHDFRTPRRANIHFITDQLALRGTTRFFSLRYSRLSRM...,370.0,370,"214,215,216,217,239,240,258,259,261,264,278,29...",A,1081,UDP,O=C1CCN(C2OC(COP(=O)(O)OP(=O)(O)O)C(O)C2O)C(=O)N1
...,...,...,...,...,...,...,...,...,...,...,...
251,3nk7A,3nk7,A,MTEPAIITNASDPAVQRIIDVTKHSIKTTLIEDTEPLMECIRAGVQ...,265.0,265,"119,185,186,187,207,208,209,210,213,226,227,22...","A,B",770770,"SAM,SAM",C[SH](CCC(N)C(=O)O)CC1OC(N2C=NC3=C(N)N=CN=C32)...
252,1efyA,1efy,A,KSKLAKPIQDLIKMIFDVESMKKAMVEFEIDLQKMPLGKLSKRQIQ...,350.0,350,101200201227234235236242245326,A,201,BZC,COC1=CC=CC(C2=NC3=C(C(N)=O)C=CC=C3N2)=C1
253,2qwiA,2qwi,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388.0,388,"36,37,69,70,74,97,141,143,146,165,195,196,211,...",A,800,G20,CCCN(C)C(=O)C1OCCC(N=C(N)N)C1NC(C)=O.O.O
254,3b3fA,3b3f,A,TEESSAVQYFQFYGYLSQQQNMMQDYVRTGTYQRAILQNHTDFKDK...,337.0,337,"8,9,12,18,21,27,51,52,53,56,57,73,74,75,99,100...",A,481,SAH,NC1=C2N=CN(C3OC(CSCCC(N)C(=O)O)C(O)C3O)C2=NC=N1


In [30]:
def read_file(file):
    return file.readlines()

def preprocessing_PDBSWS(lines):
    data = dict()
    
    for line in lines:
        line_list = line.strip().split(" ")
        if len(line_list) == 3:
            data[f"{line_list[0]}_{line_list[1]}"] = line_list[-1]
            
    return data

def preprocessing_SIFTS(lines):
    data = dict()
    
    for line in lines[2:]:
        line_list = line.strip().split("\t")
        data[f"{line_list[0]}_{line_list[1]}"] = line_list[2]
    return data

In [31]:
SIFTS_mapping_table = preprocessing_SIFTS(read_file(open(f"{info_path}/SIFTS_chain_mapping_table.txt", "r")))
PDBSWS_mapping_table = preprocessing_PDBSWS(read_file(open(f"{info_path}/PDBSWS_chain_mapping_table.txt", "r")))

In [32]:
def get_uniprot(row):
    try:
        pdb = row.PDB
        chains = row.Chain.split(",")

        uniprot_ids = list()
        for chain in chains:
            name = f"{pdb}_{chain}"
            if name in SIFTS_mapping_table:
                uniprot_ids.append(SIFTS_mapping_table[name])
            else:
                uniprot_ids.append(PDBSWS_mapping_table[name])
        return ",".join(uniprot_ids)    
    
    except Exception as e:
        print(row.PDB, e)
        return None

In [33]:
def get_uniprot_id_bulk(df):
    return df.apply(get_uniprot, axis=1)

In [34]:
uniprot_ids = parallelize_dataframe(data_df, get_uniprot_id_bulk, num_partitions = 5)

  return bound(*args, **kwds)


In [35]:
data_df["Uniprot_ID"] = pd.concat(uniprot_ids)

In [36]:
data_df = data_df.loc[data_df.Uniprot_ID.isna()==False].reset_index(drop=True)
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code,SMILES,Uniprot_ID
0,1h1sA,1h1s,A,SMENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVP...,297.0,297,"10,11,12,13,18,31,64,80,81,82,83,84,85,86,89,1...",A,1298,4SP,NS(=O)(=O)C1=CC=C(NC2=NC3=NC=NC3C(OCC3CCCCC3)=...,P24941
1,3hvlA,3hvl,A,GLTEEQRMMIRELMDAQMKTFDTTFSHFKNFRLPGVLSREEAAKWS...,287.0,287,"54,88,89,91,92,96,126,130,133,144,151,162,163,...",A,1,SRL,CCOP(=O)(OCC)C(=CC1=CC(C(C)(C)C)=C(O)C(C(C)(C)...,Q15788
2,1ltzA,1ltz,A,FVVPDITTRKNVGLSHDANDFTLPQPLDRYSAEDHATWATLYQRQC...,274.0,274,9193949597100152172177,A,500,HBI,CC(O)C(O)C1=CN=CC(C(=O)N=CN)=N1,P30967
3,1k7eA,1k7e,A,MERYENLFAQLNDRREGAFVPFVTLGDPGIEQSLKIIDTLIDAGAD...,261.0,261,2148596399152174182204205224226227,A,401,IAG,O=C(O)CNC(=O)CC1=C2C=CC=CC2=NC1,P00929
4,2q6vA,2q6v,A,RPCYLVLSSHDFRTPRRANIHFITDQLALRGTTRFFSLRYSRLSRM...,370.0,370,"214,215,216,217,239,240,258,259,261,264,278,29...",A,1081,UDP,O=C1CCN(C2OC(COP(=O)(O)OP(=O)(O)O)C(O)C2O)C(=O)N1,Q8GCH2
...,...,...,...,...,...,...,...,...,...,...,...,...
251,3nk7A,3nk7,A,MTEPAIITNASDPAVQRIIDVTKHSIKTTLIEDTEPLMECIRAGVQ...,265.0,265,"119,185,186,187,207,208,209,210,213,226,227,22...","A,B",770770,"SAM,SAM",C[SH](CCC(N)C(=O)O)CC1OC(N2C=NC3=C(N)N=CN=C32)...,P52391
252,1efyA,1efy,A,KSKLAKPIQDLIKMIFDVESMKKAMVEFEIDLQKMPLGKLSKRQIQ...,350.0,350,101200201227234235236242245326,A,201,BZC,COC1=CC=CC(C2=NC3=C(C(N)=O)C=CC=C3N2)=C1,P26446
253,2qwiA,2qwi,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388.0,388,"36,37,69,70,74,97,141,143,146,165,195,196,211,...",A,800,G20,CCCN(C)C(=O)C1OCCC(N=C(N)N)C1NC(C)=O.O.O,P03472
254,3b3fA,3b3f,A,TEESSAVQYFQFYGYLSQQQNMMQDYVRTGTYQRAILQNHTDFKDK...,337.0,337,"8,9,12,18,21,27,51,52,53,56,57,73,74,75,99,100...",A,481,SAH,NC1=C2N=CN(C3OC(CSCCC(N)C(=O)O)C(O)C3O)C2=NC=N1,Q4AE70


In [37]:
data_df = data_df.iloc[:, [1, 3, 6]]

In [None]:
import os
import subprocess
import pandas as pd
from Bio import PDB

# ==========================
# 1. 체인 분리 (복잡한 폴더 구조도 지원)
# ==========================
parser = PDB.PDBParser(QUIET=True)
io = PDB.PDBIO()

def split_chains_flexible(src_dir, out_dir, list_file):
    """
    src_dir 내부를 재귀적으로 뒤져서 모든 .pdb 파일을 찾아
    체인 단위로 분리하여 out_dir에 저장하고 list_file 생성
    """
    os.makedirs(out_dir, exist_ok=True)
    all_files = []

    # 재귀적으로 모든 .pdb 파일 찾기
    pdb_files = []
    for root, dirs, files in os.walk(src_dir):
        for fname in files:
            if fname.endswith(".pdb"):
                pdb_files.append(os.path.join(root, fname))

    for pdb_file in pdb_files:
        pdb_id = os.path.basename(pdb_file).replace(".pdb", "")
        try:
            structure = parser.get_structure(pdb_id, pdb_file)
            for model in structure:
                for chain in model:
                    io.set_structure(chain)
                    out_path = os.path.join(out_dir, f"{pdb_id}_{chain.id}.pdb")
                    io.save(out_path)
                    all_files.append(out_path)
        except Exception as e:
            print(f"[경고] {pdb_file} 처리 실패: {e}")

    # list.txt 생성
    with open(list_file, "w") as f:
        for path in sorted(all_files):
            f.write(os.path.basename(path) + "\n")

    print(f"[완료] {len(all_files)} chains saved in {out_dir}")
    return all_files


# ==========================
# 2. TM-align 실행
# ==========================
def run_tmalign(dir1, list1, dir2, list2, out_file, tmalign_bin="./TMalign"):
    cmd = [tmalign_bin,
           "-dir1", dir1, list1,
           "-dir2", dir2, list2,
           "-outfmt", "2"]
    with open(out_file, "w") as f:
        subprocess.run(cmd, stdout=f)
    print(f"[완료] TM-align 결과 저장: {out_file}")


# ==========================
# 3. Unseen protein filtering
# ==========================
def filter_unseen(tsv_file, cutoff=0.4, out_file="unseen.tsv"):
    df = pd.read_csv(tsv_file, sep="\t")
    # test 단백질(chain)별 최대 TM-score 계산
    max_scores = df.groupby("query")["TMscore"].max().reset_index()
    unseen = max_scores[max_scores["TMscore"] <= cutoff]
    unseen.to_csv(out_file, sep="\t", index=False)
    print(f"[완료] unseen proteins 저장: {out_file} ({len(unseen)} entries)")
    return unseen


# ==========================
# 4. 실행 예시
# ==========================

# (A) Train 세트 (PDBbind + scPDB)
split_chains_flexible("datasets/PDBbind/general-set",
                      "datasets/train_chains",
                      "datasets/train_list.txt")
split_chains_flexible("datasets/scPDB",
                      "datasets/train_chains",
                      "datasets/train_list.txt")

# (B) Test 세트 (COACH420 + HOLO4K)
split_chains_flexible("datasets/COACH420/pdbs",
                      "datasets/test_chains",
                      "datasets/test_list.txt")
split_chains_flexible("datasets/HOLO4K/pdbs",
                      "datasets/test_chains",
                      "datasets/test_list.txt")

# (C) TM-align 실행 (Test vs Train)
run_tmalign("datasets/test_chains", "datasets/test_list.txt",
            "datasets/train_chains", "datasets/train_list.txt",
            out_file="all_pairs.tsv",
            tmalign_bin="./TMalign")  # TM-align 바이너리 경로 맞게 수정

# (D) unseen protein 필터링
unseen = filter_unseen("all_pairs.tsv", cutoff=0.4,
                       out_file="unseen_proteins.tsv")
