In [122]:
from glob import glob
import pandas as pd
import numpy as np
from Bio import PDB
import os
os.environ["PATH"] += os.pathsep + "/opt/conda/envs/team05/bin"
from rdkit import Chem
from rdkit.Chem import MolToSmiles, MolFromMol2File
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser

In [123]:
pdb_path = os.path.abspath("/workspace/datasets/CASF-2013")
info_path = os.path.abspath("/workspace/datasets/")

In [124]:
complex_list = [
    f for f in os.listdir(pdb_path)
    if os.path.isdir(os.path.join(pdb_path, f)) and len(f) == 4
]

In [125]:
from Bio.PDB import PDBParser, PDBIO, Select
import os

def remove_HETATM_PDBbind(input_list, pdb_path):

    class NonHetSelect(Select):
        def accept_residue(self, residue):
            return 1 if residue.id[0] == " " else 0
    
    for pdb in input_list:
        src_file = f"{pdb_path}/{pdb}/{pdb}_protein.pdb"
        des_file = f"{pdb_path}/{pdb}/{pdb}_remove_HEATM_protein.pdb"

        if os.path.exists(des_file):
            print(f"[SKIP] {des_file} already exists")
            continue

        if not os.path.exists(src_file):
            print(f"[WARN] source file not found: {src_file}")
            continue

        structure = PDBParser().get_structure(pdb, src_file)
        io = PDBIO()
        io.set_structure(structure)
        io.save(des_file, NonHetSelect())
        print(f"[OK] saved: {des_file}")


In [126]:
# import os
# import glob

# def delete_remove_HETATM_files(pdb_path):
#     pattern = os.path.join(pdb_path, "*", "*_remove_HEATM_protein.pdb")
#     targets = glob.glob(pattern)
#     for file in targets:
#         try:
#             os.remove(file)
#             print(f"삭제됨: {file}")
#         except Exception as e:
#             print(f"삭제 실패: {file} → {e}")

# delete_remove_HETATM_files(pdb_path)


In [127]:
remove_HETATM_PDBbind(complex_list, pdb_path)

[SKIP] /workspace/datasets/CASF-2013/3f3c/3f3c_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/3ge7/3ge7_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/3mss/3mss_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/1xd0/1xd0_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/4g8m/4g8m_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/2x8z/2x8z_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/1o5b/1o5b_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/2ole/2ole_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/2j78/2j78_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/2v7a/2v7a_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/1igj/1igj_remove_HEATM_protein.pdb already exists
[SKIP] /workspace/datasets/CASF-2013/1f8b/1

In [128]:
pdb_parser = PDB.PDBParser(QUIET = True)

In [129]:
amino_acids_short = {
    "ALA":"A", "ARG":"R", "ASN":"N", "ASP":"D", "CYS":"C",
    "GLU":"E", "GLN":"Q", "GLY":"G", "HIS":"H", "ILE":"I",
    "LEU":"L", "LYS":"K", "MET":"M", "PHE":"F", "PRO":"P",
    "SER":"S", "THR":"T", "TRP":"W", "TYR":"Y", "VAL":"V",
    "SEC":"U", "PYL":"O"
}

In [130]:
data_df = pd.DataFrame({"PDB":complex_list})
data_df

Unnamed: 0,PDB
0,3f3c
1,3ge7
2,3mss
3,1xd0
4,4g8m
...,...
190,1hnn
191,2xnb
192,3huc
193,3fcq


In [131]:
import os
import numpy as np
import pandas as pd
from multiprocessing import Pool
from Bio.PDB import PDBParser

pdb_path = "/workspace/datasets/CASF-2013"   
pdb_parser = PDBParser(QUIET=True)

amino_acids_short = {
    "ALA":"A","CYS":"C","ASP":"D","GLU":"E","PHE":"F",
    "GLY":"G","HIS":"H","ILE":"I","LYS":"K","LEU":"L",
    "MET":"M","ASN":"N","PRO":"P","GLN":"Q","ARG":"R",
    "SER":"S","THR":"T","VAL":"V","TRP":"W","TYR":"Y"
}

def get_info(pdb):
    try:
        """Load protein info from one PDB ID"""
        structure = pdb_parser.get_structure(
            pdb, f"{pdb_path}/{pdb}/{pdb}_protein.pdb" 
        )
        chain_name_list, pdb_sequence_list, seq_lengths_list = [], [], []
        reindex = 0

        for chain_name in list(structure[0].child_dict.keys()):
            chain = structure[0][chain_name]
            pdb_sequence = ""
            for residue in chain.get_residues():
                if residue.resname in amino_acids_short.keys():
                    pdb_sequence += amino_acids_short[residue.resname]
                    reindex += 1
            if len(pdb_sequence) != 0:
                chain_name_list.append(chain_name)
                pdb_sequence_list.append(pdb_sequence)
                seq_lengths_list.append(len(pdb_sequence))

        total_seq_lengths = np.sum(np.array(seq_lengths_list))
        seq_lengths_list = list(map(str, seq_lengths_list))

        binding_index = "" #미사용

        return (
            ",".join(chain_name_list),     
            ",".join(pdb_sequence_list),   
            total_seq_lengths,             
            ",".join(seq_lengths_list),    
            binding_index                  
        )

    except Exception as e:
        print(f"Error in {pdb}: {e}")
        return None

def process_chunk(df_chunk):
    results = []
    for pdb in df_chunk["PDB"]:
        info = get_info(pdb)
        if info is not None:
            results.append([pdb, *info])

    if len(results) == 0:
        return pd.DataFrame(
            columns=["PDB", "Chain", "Sequence", "Total_seq_lengths", "Chain_seq_lengths", "BS"]
        )

    return pd.DataFrame(
        results,
        columns=["PDB", "Chain", "Sequence", "Total_seq_lengths", "Chain_seq_lengths", "BS"]
    )

def parallelize_dataframe(df, func, num_partitions=5):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    results = pool.map(func, df_split)
    pool.close()
    pool.join()

    return pd.concat(results).reset_index(drop=True)

if __name__ == "__main__":
    pdb_ids = [
        d for d in os.listdir(pdb_path)
        if os.path.isdir(os.path.join(pdb_path, d)) and len(d) == 4
    ]
    df = pd.DataFrame({"PDB": pdb_ids})

    print("총 PDB 개수:", len(df))  

    data_df = parallelize_dataframe(df, process_chunk, num_partitions=8)

    print(data_df.head())
    print("최종 행 수:", len(data_df))

data_df


총 PDB 개수: 195
    PDB Chain                                           Sequence  \
0  3f3c   A,B  REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...   
1  3ge7   A,B  RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...   
2  3mss        MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...   
3  1xd0        YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...   
4  4g8m        ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...   

   Total_seq_lengths Chain_seq_lengths BS  
0               1018           509,509     
1                718           359,359     
2                264               264     
3                495               495     
4                260               260     
최종 행 수: 195


Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,
2,3mss,,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,
3,1xd0,,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,495,495,
4,4g8m,,ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,260,260,
...,...,...,...,...,...,...
190,1hnn,,AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,261,261,
191,2xnb,,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTEGVPSTA...,296,296,
192,3huc,,ERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLR...,337,337,
193,3fcq,,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316,316,


In [132]:
def get_pdb_info_bulk(df):
    return df.PDB.map(get_info)

In [135]:
data_df["Chain"] = info_results.map(lambda a: a[0] if a is not None else None)

In [136]:
data_df["Sequence"] = info_results.map(lambda a: a[1] if a is not None else None)

In [137]:
data_df["Total_seq_lengths"] = info_results.map(lambda a: a[2] if a is not None else None)

In [138]:
data_df["Chain_seq_lengths"] = info_results.map(lambda a: a[3] if a is not None else None)

In [139]:
data_df["BS"] = info_results.map(lambda a: a[4] if a is not None else None)

In [141]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,
2,3mss,,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,
3,1xd0,,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,495,495,
4,4g8m,,ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,260,260,
...,...,...,...,...,...,...
190,1hnn,,AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,261,261,
191,2xnb,,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTEGVPSTA...,296,296,
192,3huc,,ERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLR...,337,337,
193,3fcq,,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316,316,


In [142]:
import pandas as pd
import numpy as np
import re
from math import log10

def to_nM(value: float, unit: str) -> float:
    u = unit.lower()
    if u == "pm":  return value * 1e-3
    if u == "nm":  return value
    if u == "um":  return value * 1e3
    if u == "mm":  return value * 1e6
    if u == "m":   return value * 1e9
    return np.nan

pat_meas = re.compile(
    r"\b(Ki|Kd|IC50)\s*=\s*([<>~]?\s*[0-9]*\.?[0-9]+(?:e[+-]?\d+)?)\s*(pM|nM|uM|mM|M)\b",
    re.IGNORECASE
)
pat_p = re.compile(r"\b(pKi|pKd)\s*=\s*([0-9]*\.?[0-9]+)\b", re.IGNORECASE)
pat_mlog = re.compile(r"-\s*log\s*(Ki|Kd)\s*=\s*([0-9]*\.?[0-9]+)\b", re.IGNORECASE)

def parse_affinity_field(field: str):
    cand = []
    for m in pat_meas.finditer(str(field)):
        kind, num_str, unit = m.group(1), m.group(2), m.group(3)
        num_str = re.sub(r"[<>~\s]", "", num_str)
        try:
            val = float(num_str)
        except ValueError:
            continue
        nM = to_nM(val, unit)
        if not np.isnan(nM):
            cand.append((kind.upper(), nM))

    selected_nM = None
    if cand:
        kdki = [n for k, n in cand if k in ("KI", "KD")]
        if kdki:
            selected_nM = min(kdki)
        else:
            ic50s = [n for k, n in cand if k == "IC50"]
            if ic50s:
                selected_nM = min(ic50s)

    if selected_nM is None:
        best_p = None
        for m in pat_p.finditer(str(field)):
            try:
                p = float(m.group(2))
            except ValueError:
                continue
            best_p = p if (best_p is None or p > best_p) else best_p
        for m in pat_mlog.finditer(str(field)):
            try:
                p = float(m.group(2))
            except ValueError:
                continue
            best_p = p if (best_p is None or p > best_p) else best_p
        if best_p is not None:
            selected_nM = 10 ** (9 - best_p)

    if selected_nM is None:
        return None

    pAff = 9.0 - log10(selected_nM)
    return float(selected_nM), float(pAff)


colnames = ["PDB", "Resolution", "Year", "logAff", "Original", "ClusterID"]
df = pd.read_csv("/workspace/datasets/CASF-2013.dat", 
                 sep=r"\s+", comment="#", names=colnames)

affinity_vals = df["Original"].apply(parse_affinity_field)
df["Affinity_nM"] = [x[0] if x else None for x in affinity_vals]
df["pAff"]        = [x[1] if x else None for x in affinity_vals]

df.to_csv("CASF-2013_affinity.tsv", sep="\t", index=False)


In [143]:
import pandas as pd

aff_df = pd.read_csv("CASF-2013_affinity.tsv", sep="\t")

data_df = data_df.merge(
    aff_df[["PDB", "Affinity_nM", "pAff"]],
    on="PDB",
    how="left"   
)

data_df.to_csv("CASF-2013_affinity.tsv", sep="\t", index=False)


In [144]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Affinity_nM,pAff
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,,950.0,6.022276
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,,2.0,8.698970
2,3mss,,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,,22000.0,4.657577
3,1xd0,,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,495,495,,75.0,7.124939
4,4g8m,,ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,260,260,,12.8,7.892790
...,...,...,...,...,...,...,...,...
190,1hnn,,AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,261,261,,580.0,6.236572
191,2xnb,,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTEGVPSTA...,296,296,,149.0,6.826814
192,3huc,,ERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLR...,337,337,,1030.0,5.987163
193,3fcq,,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316,316,,1700000.0,2.769551


In [145]:
lengths = data_df.Total_seq_lengths.values

In [146]:
data_df = data_df[lengths <= 1500].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Affinity_nM,pAff
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,,950.0,6.022276
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,,2.0,8.698970
2,3mss,,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,,22000.0,4.657577
3,1xd0,,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,495,495,,75.0,7.124939
4,4g8m,,ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,260,260,,12.8,7.892790
...,...,...,...,...,...,...,...,...
175,1hnn,,AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,261,261,,580.0,6.236572
176,2xnb,,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTEGVPSTA...,296,296,,149.0,6.826814
177,3huc,,ERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLR...,337,337,,1030.0,5.987163
178,3fcq,,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316,316,,1700000.0,2.769551


In [147]:
def convert_smiles(row):
    pdb = row.PDB
    
    mol = f"{pdb_path}/{pdb}/{pdb}_ligand.mol2"
    command = f'obabel -imol2 "{mol}" -osmi -xC | obabel -ismi -osmi -xk -O tmp.smi'
    os.system(command)
    
    smiles = read_file(open("tmp.smi"))[0].split('\t')[0].strip()
    
    try:
        smiles = MolToSmiles(MolFromSmiles(smiles),isomericSmiles = False, kekuleSmiles = True)
        return smiles
    
    except Exception as e:
        print(pdb, e)
        return None

In [148]:
def read_file(file):
    return [i.strip() for i in file.readlines()]

In [149]:
SMILES = data_df.apply(convert_smiles, axis = 1)

1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in MOL2 file (title is 3ge7_ligand)

1 molecule converted
  Failed to kekulize aromatic SMILES (title is 3ge7_ligand)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in MOL2 file (title is 1o5b_ligand)

1 molecule converted
  Failed to kekulize aromatic SMILES (title is 1o5b_ligand)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in MOL2 file (title is 3gy4_ligand)

1 molecule converted
  Failed

In [150]:
data_df["SMILES"] = SMILES

In [151]:
data_df = data_df.loc[data_df.SMILES.isna()==False].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Affinity_nM,pAff,SMILES
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,,950.0,6.022276,[NH3+]C(CC1=CC=C(F)C=C1)C(=O)[O-]
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,,2.0,8.698970,CNC1=NC2=C(CC[NH2+]CC3CCCC3)C3=C(C=C2N1)C(=O)N...
2,3mss,,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,,22000.0,4.657577,CNC(=O)C([NH3+])CC1=CC=C(OCC2=CC=CC=C2)C=C1
3,1xd0,,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,495,495,,75.0,7.124939,CC1OC(OC2C(CO)OC(O)C(O)C2O)C(O)C(O)C1[NH2+]C1C...
4,4g8m,,ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,260,260,,12.8,7.892790,[NH3+]C(C(=O)[O-])C1CCC1C(=O)[O-]
...,...,...,...,...,...,...,...,...,...
175,1hnn,,AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,261,261,,580.0,6.236572,NS(=O)(=O)C1=CC=C2CC[NH2+]CC2=C1
176,2xnb,,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTEGVPSTA...,296,296,,149.0,6.826814,CC1=C(C2=CC=NC(NC3=CC=C(N4CC[NH2+]CC4)C=C3)=N2...
177,3huc,,ERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLR...,337,337,,1030.0,5.987163,C=CC(=O)NC1=CC2=C(C=C1)C(NC1=CC=NN1)=NC(C1=CC=...
178,3fcq,,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316,316,,1700000.0,2.769551,CC(=O)OC1=C(C(=O)[O-])C=CC=C1C


In [152]:
def get_SMILES_length(df):
    index = [True if len(smi) <= 150 else False for smi in df.SMILES.values]
    return index

In [153]:
smiles_index = get_SMILES_length(data_df)

In [154]:
data_df = data_df.loc[smiles_index].reset_index(drop=True)

In [155]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Affinity_nM,pAff,SMILES
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,1018,509509,,950.0,6.022276,[NH3+]C(CC1=CC=C(F)C=C1)C(=O)[O-]
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,718,359359,,2.0,8.698970,CNC1=NC2=C(CC[NH2+]CC3CCCC3)C3=C(C=C2N1)C(=O)N...
2,3mss,,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,264,264,,22000.0,4.657577,CNC(=O)C([NH3+])CC1=CC=C(OCC2=CC=CC=C2)C=C1
3,1xd0,,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,495,495,,75.0,7.124939,CC1OC(OC2C(CO)OC(O)C(O)C2O)C(O)C(O)C1[NH2+]C1C...
4,4g8m,,ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,260,260,,12.8,7.892790,[NH3+]C(C(=O)[O-])C1CCC1C(=O)[O-]
...,...,...,...,...,...,...,...,...,...
175,1hnn,,AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,261,261,,580.0,6.236572,NS(=O)(=O)C1=CC=C2CC[NH2+]CC2=C1
176,2xnb,,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTEGVPSTA...,296,296,,149.0,6.826814,CC1=C(C2=CC=NC(NC3=CC=C(N4CC[NH2+]CC4)C=C3)=N2...
177,3huc,,ERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLR...,337,337,,1030.0,5.987163,C=CC(=O)NC1=CC2=C(C=C1)C(NC1=CC=NN1)=NC(C1=CC=...
178,3fcq,,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316,316,,1700000.0,2.769551,CC(=O)OC1=C(C(=O)[O-])C=CC=C1C


In [156]:
data_dff = data_df.iloc[:, [0, 1, 2, 5, 6, 7, 8]]
data_dff

Unnamed: 0,PDB,Chain,Sequence,BS,Affinity_nM,pAff,SMILES
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,,950.0,6.022276,[NH3+]C(CC1=CC=C(F)C=C1)C(=O)[O-]
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,,2.0,8.698970,CNC1=NC2=C(CC[NH2+]CC3CCCC3)C3=C(C=C2N1)C(=O)N...
2,3mss,,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,,22000.0,4.657577,CNC(=O)C([NH3+])CC1=CC=C(OCC2=CC=CC=C2)C=C1
3,1xd0,,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,,75.0,7.124939,CC1OC(OC2C(CO)OC(O)C(O)C2O)C(O)C(O)C1[NH2+]C1C...
4,4g8m,,ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,,12.8,7.892790,[NH3+]C(C(=O)[O-])C1CCC1C(=O)[O-]
...,...,...,...,...,...,...,...
175,1hnn,,AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,,580.0,6.236572,NS(=O)(=O)C1=CC=C2CC[NH2+]CC2=C1
176,2xnb,,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTEGVPSTA...,,149.0,6.826814,CC1=C(C2=CC=NC(NC3=CC=C(N4CC[NH2+]CC4)C=C3)=N2...
177,3huc,,ERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLR...,,1030.0,5.987163,C=CC(=O)NC1=CC2=C(C=C1)C(NC1=CC=NN1)=NC(C1=CC=...
178,3fcq,,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,,1700000.0,2.769551,CC(=O)OC1=C(C(=O)[O-])C=CC=C1C


In [157]:
df = data_dff[data_df["pAff"].notna()].reset_index(drop=True)
df

Unnamed: 0,PDB,Chain,Sequence,BS,Affinity_nM,pAff,SMILES
0,3f3c,"A,B",REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,,950.0,6.022276,[NH3+]C(CC1=CC=C(F)C=C1)C(=O)[O-]
1,3ge7,"A,B",RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKP...,,2.0,8.698970,CNC1=NC2=C(CC[NH2+]CC3CCCC3)C3=C(C=C2N1)C(=O)N...
2,3mss,,MDPSYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL...,,22000.0,4.657577,CNC(=O)C([NH3+])CC1=CC=C(OCC2=CC=CC=C2)C=C1
3,1xd0,,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,,75.0,7.124939,CC1OC(OC2C(CO)OC(O)C(O)C2O)C(O)C(O)C1[NH2+]C1C...
4,4g8m,,ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,,12.8,7.892790,[NH3+]C(C(=O)[O-])C1CCC1C(=O)[O-]
...,...,...,...,...,...,...,...
175,1hnn,,AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,,580.0,6.236572,NS(=O)(=O)C1=CC=C2CC[NH2+]CC2=C1
176,2xnb,,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTEGVPSTA...,,149.0,6.826814,CC1=C(C2=CC=NC(NC3=CC=C(N4CC[NH2+]CC4)C=C3)=N2...
177,3huc,,ERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLR...,,1030.0,5.987163,C=CC(=O)NC1=CC2=C(C=C1)C(NC1=CC=NN1)=NC(C1=CC=...
178,3fcq,,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,,1700000.0,2.769551,CC(=O)OC1=C(C(=O)[O-])C=CC=C1C


In [158]:
df.to_csv("/workspace/binding_affinity/datasets/CASF-2013/CASF-2013.tsv", sep="\t", index=False)

In [191]:
import pandas as pd

test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2016/CASF-2016.tsv", sep="\t")
validation_df = pd.read_csv("/workspace/binding_affinity/datasets/validation2020/validation2020.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
validation_ids = set(validation_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(validation_ids)
print("겹치는 개수:", len(overlap))
filtered_validation_df = validation_df[~validation_df["PDB"].astype(str).isin(overlap)]

print("제거 후 validation 개수:", len(filtered_validation_df))

# 저장
filtered_validation_df.to_csv("/workspace/binding_affinity/datasets/validation2020/validation_filtered1.tsv", sep="\t", index=False)
print("저장 완료: validation_filtered1.tsv")


겹치는 개수: 258
제거 후 validation 개수: 4862
저장 완료: validation_filtered1.tsv


In [192]:
import pandas as pd

# 파일 불러오기
test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2013/CASF-2013.tsv", sep="\t")
validation_df = pd.read_csv("/workspace/binding_affinity/datasets/validation2020/validation_filtered1.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
validation_ids = set(validation_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(validation_ids)
print("겹치는 개수:", len(overlap))
filtered_validation_df = validation_df[~validation_df["PDB"].astype(str).isin(overlap)]

print("제거 후 validation 개수:", len(filtered_validation_df))

# 저장
filtered_validation_df.to_csv("/workspace/binding_affinity/datasets/validation2020/validation_filtered2.tsv", sep="\t", index=False)
print("저장 완료: validation_filtered2.tsv")


겹치는 개수: 67
제거 후 validation 개수: 4795
저장 완료: validation_filtered2.tsv


In [193]:
import pandas as pd

# 파일 불러오기
val_df = pd.read_csv("/workspace/binding_affinity/datasets/validation2020/validation_filtered2.tsv", sep="\t")

print("원본 개수:", len(val_df))

# 2000개 샘플링 (랜덤 시드 고정 → 재현성 보장)
val_sampled = val_df.sample(n=2000, random_state=42)

print("샘플링 개수:", len(val_sampled))

# 저장
val_sampled.to_csv("/workspace/binding_affinity/datasets/validation2020/validation_2000.tsv", sep="\t", index=False)

print("저장 완료: validation_2000.tsv")


원본 개수: 4795
샘플링 개수: 2000
저장 완료: validation_2000.tsv


In [None]:
# train 중복 제거

In [196]:
import pandas as pd

# 파일 불러오기
test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2013/CASF-2013.tsv", sep="\t")
training_df = pd.read_csv("/workspace/binding_affinity/datasets/train2020/training.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
training_ids = set(training_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(training_ids)
print("겹치는 개수:", len(overlap))
filtered_training_df = training_df[~training_df["PDB"].astype(str).isin(overlap)]

print("제거 후 training 개수:", len(filtered_training_df))

# 저장
filtered_training_df.to_csv("/workspace/binding_affinity/datasets/train/training_filtered1.tsv", sep="\t", index=False)
print("저장 완료: training_filtered1.tsv")


겹치는 개수: 18
제거 후 training 개수: 11841
저장 완료: training_filtered1.tsv


In [197]:
import pandas as pd

# 파일 불러오기
test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2016/CASF-2016.tsv", sep="\t")
training_df = pd.read_csv("/workspace/binding_affinity/datasets/train2020/training.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
training_ids = set(training_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(training_ids)
print("겹치는 개수:", len(overlap))
filtered_training_df = training_df[~training_df["PDB"].astype(str).isin(overlap)]

print("제거 후 training 개수:", len(filtered_training_df))

# 저장
filtered_training_df.to_csv("/workspace/binding_affinity/datasets/train2020/training_filtered2.tsv", sep="\t", index=False)
print("저장 완료: training_filtered2.tsv")


겹치는 개수: 16
제거 후 training 개수: 11843
저장 완료: training_filtered2.tsv


In [198]:
import pandas as pd

# 파일 불러오기
test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2013/CASF-2013.tsv", sep="\t")
training_df = pd.read_csv("/workspace/binding_affinity/datasets/train2020/training2.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
training_ids = set(training_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(training_ids)
print("겹치는 개수:", len(overlap))
filtered_training_df = training_df[~training_df["PDB"].astype(str).isin(overlap)]

print("제거 후 training 개수:", len(filtered_training_df))

# 저장
filtered_training_df.to_csv("/workspace/binding_affinity/datasets/train2020/training2_filtered1.tsv", sep="\t", index=False)
print("저장 완료: training2_filtered1.tsv")


겹치는 개수: 0
제거 후 training 개수: 2795
저장 완료: training2_filtered1.tsv


In [199]:
import pandas as pd

# 파일 불러오기
test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2016/CASF-2016.tsv", sep="\t")
training_df = pd.read_csv("/workspace/binding_affinity/datasets/train/training2.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
training_ids = set(training_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(training_ids)
print("겹치는 개수:", len(overlap))
filtered_training_df = training_df[~training_df["PDB"].astype(str).isin(overlap)]

print("제거 후 training 개수:", len(filtered_training_df))

# 저장
filtered_training_df.to_csv("/workspace/binding_affinity/datasets/train/training2_filtered2.tsv", sep="\t", index=False)
print("저장 완료: training2_filtered2.tsv")


겹치는 개수: 0
제거 후 training 개수: 2795
저장 완료: training2_filtered2.tsv


In [None]:
# 중복 확인

In [201]:
import pandas as pd

# 파일 불러오기
validation_df = pd.read_csv("/workspace/binding_affinity/datasets/validation2020/validation_2000.tsv", sep="\t")
test2_df = pd.read_csv("/workspace/binding_affinity/datasets/train2020/training_filtered2.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
test2_ids = set(test2_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(test2_ids)
print("겹치는 개수:", len(overlap))


겹치는 개수: 0


In [202]:
import pandas as pd

# 파일 불러오기
validation_df = pd.read_csv("/workspace/binding_affinity/datasets/validation2020/validation_2000.tsv", sep="\t")
test2_df = pd.read_csv("/workspace/binding_affinity/datasets/train2020/training2_filtered2.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
test2_ids = set(test2_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(test2_ids)
print("겹치는 개수:", len(overlap))


겹치는 개수: 0


In [203]:
import pandas as pd

# 파일 불러오기
test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2016/CASF-2016.tsv", sep="\t")
test2_df = pd.read_csv("/workspace/binding_affinity/datasets/train2020/training2_filtered2.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
test2_ids = set(test2_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(test2_ids)
print("겹치는 개수:", len(overlap))


겹치는 개수: 0


In [204]:
import pandas as pd

# 파일 불러오기
test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2013/CASF-2013.tsv", sep="\t")
test2_df = pd.read_csv("/workspace/binding_affinity/datasets/train2020/training2_filtered2.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
test2_ids = set(test2_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(test2_ids)
print("겹치는 개수:", len(overlap))


겹치는 개수: 0


In [205]:
import pandas as pd

# 파일 불러오기
test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2013/CASF-2013.tsv", sep="\t")
validation_df = pd.read_csv("/workspace/binding_affinity/datasets/validation/validation_2000.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
test2_ids = set(test2_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(test2_ids)
print("겹치는 개수:", len(overlap))


겹치는 개수: 0


In [206]:
import pandas as pd

# 파일 불러오기
test_df = pd.read_csv("/workspace/binding_affinity/datasets/CASF-2016/CASF-2016.tsv", sep="\t")
validation_df = pd.read_csv("/workspace/binding_affinity/datasets/validation2020/validation_2000.tsv", sep="\t")

# PDB ID 집합
test_ids = set(test_df["PDB"].astype(str))
test2_ids = set(test2_df["PDB"].astype(str))

# 교집합 (겹치는 PDB ID)
overlap = test_ids.intersection(test2_ids)
print("겹치는 개수:", len(overlap))


겹치는 개수: 0
