In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 33.1/33.1 MB 33.9 MB/s eta 0:00:00
Installing collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split

In [9]:
# Read the competition's train and test tables from the Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/DACON/제2회 신약개발 AI 경진대회/open/train.csv',
        '/content/drive/MyDrive/DACON/제2회 신약개발 AI 경진대회/open/test.csv',
    )
)

In [10]:
# Count the distinct values in the assay-ID column and in the SMILES column
# of the training table, then report both.
num_chembl, num_smiles = (
    len(train[column].unique())
    for column in ('Assay ChEMBL ID', 'Smiles')
)

print('Number of Assay ChEMBL IDs', num_chembl)
print('Number of Smiles', num_smiles)

Number of Assay ChEMBL IDs 72
Number of Smiles 1952


In [11]:
def generate_ecfp(smiles, radius=2, nBits=2048):
    """Build a Morgan (ECFP) bit-vector fingerprint from a SMILES string.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.
        radius: Morgan radius passed to the fingerprint call (default 2).
        nBits: Length of the bit vector (default 2048).

    Returns:
        The bit vector produced by ``AllChem.GetMorganFingerprintAsBitVect``.

    Raises:
        ValueError: If RDKit returns ``None`` for the SMILES (unparseable).
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        return AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=nBits)
    raise ValueError("Invalid SMILES string")

In [None]:
# Convert every training SMILES into its ECFP fingerprint.
train_data = train[['Assay ChEMBL ID', 'IC50_nM', 'Smiles']].copy()
train_data.loc[:, 'ECFP'] = train_data['Smiles'].apply(generate_ecfp)
ecfp_array = np.array(train_data['ECFP'].tolist())

# Convert the Assay ChEMBL ID to an integer by keeping its first run of digits.
# The pattern is a raw string so that \d reaches the regex engine as written
# instead of being parsed as an (invalid) string escape sequence.
train_data['Assay ChEMBL ID'] = train_data['Assay ChEMBL ID'].str.extract(r'(\d+)', expand=False).astype(int)

In [None]:
# Build the test-side frame: the SMILES column plus one fingerprint per row.
test_data = test.loc[:, ['Smiles']].copy()
test_data['ECFP'] = test_data['Smiles'].apply(generate_ecfp)

In [14]:
# Stack the per-molecule fingerprints of each frame into a NumPy array
# (presumably one row per molecule, one column per fingerprint bit).
train_ecfp_array = np.array(list(train_data['ECFP']))
test_ecfp_array = np.array(list(test_data['ECFP']))

# K-Means

In [16]:
from sklearn.cluster import KMeans

In [17]:
# Seed k-means with one initial center per test fingerprint, then cluster the
# training fingerprints around those seeds. A single initialisation is used
# because the starting centers are given explicitly.
test_centers = test_ecfp_array

kmeans = KMeans(
    n_clusters=test_centers.shape[0],
    init=test_centers,
    n_init=1,
    random_state=42,
).fit(train_ecfp_array)
labels, cluster_centers = kmeans.labels_, kmeans.cluster_centers_

In [22]:
def find_top_n_closest(data, centers, labels, n=5):
    """Select, for every cluster, the rows of ``data`` closest to its center.

    Args:
        data: 2-D array-like with one sample per row.
        centers: Iterable of center vectors; the center at position ``i`` is
            compared against the samples whose label equals ``i``.
        labels: 1-D array-like holding the cluster label of each row of
            ``data``.
        n: Maximum number of rows to keep per cluster (default 5).

    Returns:
        A sorted list of unique Python ``int`` row positions into ``data``.
        Clusters with no members contribute nothing.

    Raises:
        ValueError: If ``n`` is negative (a negative slice bound would
            silently select the wrong rows).
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    # Coerce once so plain lists behave like arrays; comparing a Python list
    # with ``== i`` would yield a single bool instead of an element-wise mask.
    data = np.asarray(data)
    labels = np.asarray(labels)

    selected = set()
    for i, center in enumerate(centers):
        cluster_indices = np.flatnonzero(labels == i)
        if cluster_indices.size == 0:
            continue
        # Euclidean distance from every member of the cluster to its center.
        distances = np.linalg.norm(data[cluster_indices] - center, axis=1)
        nearest = distances.argsort()[:n]
        selected.update(cluster_indices[nearest].tolist())

    # Deduplicate once, after the loop, and return in a deterministic order.
    return sorted(selected)

# Keep the single closest training row for each cluster and report how many
# rows that leaves compared with the full training frame.
top_n = 1
top_n_indices = find_top_n_closest(
    data=train_ecfp_array,
    centers=cluster_centers,
    labels=labels,
    n=top_n,
)

selected_test_km = train_data.iloc[top_n_indices]
print(f"Original training data size: {train_data.shape}")
print(f"Selected training data size: {selected_test_km.shape}")

Original training data size: (1952, 4)
Selected training data size: (113, 4)


In [23]:
# Every training row whose SMILES was not picked into selected_test_km.
selected_train_km = train_data.loc[
    lambda frame: ~frame['Smiles'].isin(selected_test_km['Smiles'])
]

In [25]:
# Write the selected rows to CSV without the index column.
selected_test_km.to_csv(
    '/content/drive/MyDrive/DACON/제2회 신약개발 AI 경진대회/open/selected_test_km.csv',
    index=False,
)

In [26]:
# Write the remaining training rows to CSV without the index column.
selected_train_km.to_csv(
    '/content/drive/MyDrive/DACON/제2회 신약개발 AI 경진대회/open/selected_train_km.csv',
    index=False,
)