In [30]:
from glob import glob
import pandas as pd
import numpy as np
from Bio import PDB
import os
os.environ["PATH"] += os.pathsep + "/opt/conda/envs/team05/bin"
from rdkit import Chem
from rdkit.Chem import MolToSmiles, MolFromMol2File
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser

In [31]:
pdb_path = os.path.abspath("/workspace/datasets/CSAR-HIQ_36")
info_path = os.path.abspath("/workspace/datasets/CSAR-HIQ_36")

In [32]:
complex_list = [
    f for f in os.listdir(pdb_path)
    if os.path.isdir(os.path.join(pdb_path, f)) and len(f) == 4
]

In [33]:
from Bio.PDB import PDBParser, PDBIO, Select
import os

def remove_HETATM_PDBbind(input_list, pdb_path):

    class NonHetSelect(Select):
        def accept_residue(self, residue):
            return 1 if residue.id[0] == " " else 0
    
    for pdb in input_list:
        src_file = f"{pdb_path}/{pdb}/{pdb}_protein.pdb"
        des_file = f"{pdb_path}/{pdb}/{pdb}_remove_HEATM_protein.pdb"

        if os.path.exists(des_file):
            print(f"[SKIP] {des_file} already exists")
            continue

        if not os.path.exists(src_file):
            print(f"[WARN] source file not found: {src_file}")
            continue

        structure = PDBParser().get_structure(pdb, src_file)
        io = PDBIO()
        io.set_structure(structure)
        io.save(des_file, NonHetSelect())
        print(f"[OK] saved: {des_file}")

In [34]:
remove_HETATM_PDBbind(complex_list, pdb_path)

[SKIP] /workspace/datasets/CSAR-HIQ_36/2j78/2j78_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/1uzv/1uzv_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/1p1n/1p1n_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/1ps3/1ps3_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/1syi/1syi_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/1r5y/1r5y_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/1qkt/1qkt_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/2hb1/2hb1_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/1z95/1z95_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/1zhx/1zhx_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CSAR-HIQ_36/1nc1/1nc1_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/dat

In [35]:
pdb_parser = PDB.PDBParser(QUIET = True)

In [36]:
amino_acids_short = {
    "ALA":"A", "ARG":"R", "ASN":"N", "ASP":"D", "CYS":"C",
    "GLU":"E", "GLN":"Q", "GLY":"G", "HIS":"H", "ILE":"I",
    "LEU":"L", "LYS":"K", "MET":"M", "PHE":"F", "PRO":"P",
    "SER":"S", "THR":"T", "TRP":"W", "TYR":"Y", "VAL":"V",
    "SEC":"U", "PYL":"O"
}

In [37]:
data_df = pd.DataFrame({"PDB":complex_list})

In [38]:
def get_info(pdb):
    try:
        """ Load protein info """
        structure = pdb_parser.get_structure(pdb, f"{pdb_path}/{pdb}/{pdb}_remove_HEATM_protein.pdb")
        chain_name_list, pdb_sequence_list, seq_lengths_list, protein_atom_coords, protein_atom_residue_list, reindex = list(), list(), list(), list(), list(), 0
        
        """ Extract protein info """
        for chain_name in list(structure[0].child_dict.keys()):
            chain = structure[0][chain_name]

            pdb_sequence = ""
            for residue in chain.get_residues():
                if residue.resname in amino_acids_short.keys():
                    pdb_sequence += amino_acids_short[residue.resname]

                    for atom in residue:
                        protein_atom_coords.append(atom.get_coord())
                        protein_atom_residue_list.append(reindex)
                    reindex += 1     

            if len(pdb_sequence) != 0:
                chain_name_list.append(chain_name)
                pdb_sequence_list.append(pdb_sequence)
                seq_lengths_list.append(len(pdb_sequence))

        """ Load pocket info """      
        protein_atom_coords, binding_index = np.array(protein_atom_coords), list()
        pocket_structure = pdb_parser.get_structure(pdb, f"{pdb_path}/{pdb}/{pdb}_pocket.pdb")

        pocket_coordi = list()

        for chain_name in list(pocket_structure[0].child_dict.keys()):
            chain = pocket_structure[0][chain_name]
            for residue in chain.get_residues():
                if residue.resname in amino_acids_short.keys():
                    for atom in residue:
                        pocket_coordi.append(atom.get_coord())
        
        """ Matching pocket info """
        pocket_coordi = np.array(pocket_coordi)
        bi_x, bi_y, bi_z = pocket_coordi[:, 0], pocket_coordi[:, 1], pocket_coordi[:, 2]

        for i, j, k in zip(bi_x, bi_y, bi_z):
            tmp_coordi = np.array([i, j, k], dtype = np.float32)
            ind = np.where((protein_atom_coords == tmp_coordi).all(axis = 1))[0][0]
            binding_index.append(protein_atom_residue_list[ind])

        binding_index = sorted(list(set(binding_index)))
        binding_index = list(map(str, binding_index))

        total_seq_lengths = np.sum(np.array(seq_lengths_list))
        seq_lengths_list = list(map(str, seq_lengths_list))

        return ",".join(chain_name_list), ",".join(pdb_sequence_list), total_seq_lengths, ",".join(seq_lengths_list), ",".join(binding_index)

    except Exception as e:
        print(pdb, e)
        return None

In [39]:
from multiprocessing import Process, Queue, Pool

def parallelize_dataframe(df, func, num_partitions=5):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    results = pool.map(func, df_split)
    pool.close()
    pool.join()
    return results

In [40]:
def get_pdb_info_bulk(df):
    return df.PDB.map(get_info)

In [41]:
info_results = parallelize_dataframe(data_df, get_pdb_info_bulk, 5)

In [42]:
info_results = pd.concat(info_results)

In [43]:
data_df["Chain"] = info_results.map(lambda a: a[0] if a is not None else None)

In [44]:
data_df["Sequence"] = info_results.map(lambda a: a[1] if a is not None else None)

In [45]:
data_df["Total_seq_lengths"] = info_results.map(lambda a: a[2] if a is not None else None)

In [46]:
data_df["Chain_seq_lengths"] = info_results.map(lambda a: a[3] if a is not None else None)

In [47]:
data_df["BS"] = info_results.map(lambda a: a[4] if a is not None else None)

In [48]:
data_df = data_df.loc[data_df.Sequence.isna()==False].reset_index(drop=True)
data_df = data_df.loc[data_df.Chain != " "].reset_index(drop=True)

In [49]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS
0,2j78,A,VKKFPEGFLWGVATASYQIEGSPLADGAGMSIWHTFSHTPGNVKNG...,443,443,"11,12,13,14,15,16,17,18,19,32,42,43,48,51,52,7..."
1,1uzv,A,ATQGVFTLPANTRFGVTAFANSSGTQTVNVLVNNETAATFSGQSTN...,456,456,"12,83,85,86,109,110,111,112,113,132,133,134,13..."
2,1p1n,A,KTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHCGF...,512,512,"5,6,7,8,9,10,11,12,13,14,31,32,52,54,55,56,57,..."
3,1ps3,A,QDVVQDVPNVDVQMLELYDRMSFKDIDGGVWKQGWNIKYDPLKYNA...,1004,1004,"56,57,58,59,60,61,62,63,64,65,66,67,70,74,75,7..."
4,1syi,A,NKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHCG...,514,514,"6,7,8,9,10,11,12,13,14,15,32,33,53,55,56,57,58..."
5,1r5y,A,RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,357,357,"31,32,34,35,36,57,58,59,60,61,62,64,89,90,91,9..."
6,1qkt,A,NSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNL...,495,495,"19,20,21,22,23,24,25,26,27,37,38,39,40,41,42,4..."
7,2hb1,A,EMEKEFEQIDKSGSWAAIYQDIRHEASDFPCRVAKLPKNKNRNRYR...,297,297,"18,19,22,34,42,43,44,45,46,47,48,49,66,81,82,8..."
8,1z95,A,IFLNVLEAIEPGVVCAGHDNNQPDSFAALLSSLNELGERQLVHVVK...,238,238,"9,10,11,12,13,14,15,16,17,24,25,26,27,28,29,30..."
9,1zhx,A,MDPSQYASSSSWTSFLKSIASFNGDLSSLSAPPFILSPISLTEFSQ...,436,436,"5,6,8,10,11,12,13,14,15,16,17,18,19,20,21,22,2..."


In [50]:
data_df.to_csv("CSAR36_nonaffinity.csv", index=False)

print("✅ 저장 완료: CSAR36_nonaffinity.csv")


✅ 저장 완료: CSAR36_nonaffinity.csv


In [51]:
import os
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MolFromSmiles, MolToSmiles

pdb_path = "/workspace/datasets/CSAR-HIQ_36" 

def convert_smiles(pdb):
    """PDB ID에 해당하는 mol2 파일을 SMILES로 변환"""
    mol_file = f"{pdb_path}/{pdb}/{pdb}_ligand.mol2"
    tmp_file = "tmp.smi"

    command = f'obabel -imol2 "{mol_file}" -osmi -xC | obabel -ismi -osmi -xk -O {tmp_file}'
    os.system(command)

    try:
        with open(tmp_file) as f:
            line = f.readline().strip()
            smiles = line.split("\t")[0]

        mol = MolFromSmiles(smiles)
        if mol is None:
            return None
        smiles = MolToSmiles(mol, isomericSmiles=False, kekuleSmiles=True)
        return smiles
    except Exception as e:
        print(f"[Error] {pdb}: {e}")
        return None

df_bs = pd.read_csv("CSAR36_nonaffinity.csv")  
df_smi = pd.read_csv("affinity_data.csv")     

df_smi = df_smi.rename(columns={"pdbid": "PDB", "-logKd/Ki": "pAff"})

df_smi["Affinity_nM"] = (10 ** (-df_smi["pAff"])) * 1e9

df_bs["SMILES"] = df_bs["PDB"].apply(convert_smiles)

merged_df = pd.merge(df_bs, df_smi, on="PDB", how="inner")

merged_df = merged_df.drop_duplicates(subset=["PDB"], keep="first")

merged_df = merged_df[
    (merged_df["Sequence"].str.len() <= 1500) &
    (merged_df["SMILES"].str.len() <= 150)
]

merged_df = merged_df[["PDB", "Chain", "Sequence", "Affinity_nM", "pAff", "BS", "SMILES"]]

merged_df.to_csv("CSAR36.tsv", sep="\t", index=False)
print("✅ 저장 완료: CSAR36.tsv (중복 제거 + 필터링 적용)")


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule co

✅ 저장 완료: CSAR36.tsv (중복 제거 + 필터링 적용)


1 molecule converted
1 molecule converted
