In [None]:
# Mount Google Drive at /content/drive; the data paths used by later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit, rdkit.Chem.rdDepictor, rdkit.Chem.Draw
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error
from math import log10, sqrt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Notebook-wide settings: 'SEED' feeds seed_everything and the model/search
# random_state; 'NBITS' is not referenced by the visible cells.
CFG = dict(
    NBITS=2048,
    SEED=42,
)

In [None]:
def seed_everything(seed):
    """Apply one seed to Python's `random`, NumPy's global RNG and PYTHONHASHSEED.

    Args:
        seed: Seed value; it is stringified for the environment variable and
            passed unchanged to `random.seed` and `np.random.seed`.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
seed_everything(CFG['SEED']) # Fix the random seed

In [None]:
# Function to generate ECFP fingerprint from SMILES
def generate_ecfp(smiles, radius=2, nBits=2048):
    """Return the Morgan (ECFP-style) bit-vector fingerprint of a SMILES string.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Morgan radius handed to the fingerprint generator.
        nBits: Length of the folded bit vector.

    Returns:
        The RDKit bit vector produced by the Morgan generator for the molecule.

    Raises:
        ValueError: If RDKit cannot parse `smiles` into a molecule.
    """
    # Imported locally so this cell does not depend on an edit to the import cell.
    from rdkit.Chem import rdFingerprintGenerator

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Name the offending input so a bad row can be located in the dataframe.
        raise ValueError(f"Invalid SMILES string: {smiles!r}")
    # The generator API replaces the deprecated AllChem.GetMorganFingerprintAsBitVect.
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
    return generator.GetFingerprint(mol)

In [None]:
# Load the preprocessed training table from Drive and preview its first rows.
chembl_data = pd.read_csv('/content/drive/MyDrive/DACON/제2회 신약개발 AI 경진대회/open/preprocessed_train.csv')
chembl_data.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL3968119,IC50,'=',675.9,nM,6.17,CHEMBL3887888,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL3886488,675.9,6.17,CC(C)Nc1cc(Nc2ccc3ncsc3c2)ncc1-c1nnc(-c2ccc(F)...
1,CHEMBL3966110,IC50,'=',5.1,nM,8.29,CHEMBL3887118,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL3886172,5.1,8.29,CC(C)Nc1cc(-n2ncc3cc(C#N)cnc32)ncc1C(=O)NC1CCN...
2,CHEMBL4109180,IC50,'=',25.0,nM,7.6,CHEMBL3887893,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL3886492,25.0,7.6,O=c1[nH]c(N2CCN(c3cccc(Cl)n3)CC2)nc(N[C@@H]2CC...
3,CHEMBL4107475,IC50,'=',14.0,nM,7.85,CHEMBL3887118,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL3886172,14.0,7.85,CC(C)Nc1cc(-n2ccc3cc(CNC(=O)CC4CCOCC4)cnc32)nc...
4,CHEMBL4114259,IC50,'=',145.0,nM,6.84,CHEMBL3887893,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL3886492,145.0,6.84,CC1(C)CN(c2nc(N[C@@H]3CCCNC3)c(-c3nc4ccccc4s3)...


In [None]:
# Count distinct assays and distinct molecules; dropna=False keeps the count
# equal to len(series.unique()).
num_chembl = chembl_data['Assay ChEMBL ID'].nunique(dropna=False)
num_smiles = chembl_data['Smiles'].nunique(dropna=False)

print('Number of Assay ChEMBL IDs', num_chembl)
print('Number of Smiles', num_smiles)

Number of Assay ChEMBL IDs 69
Number of Smiles 1839


In [None]:
# Store one fingerprint per row in a new 'ECFP' column, built from the 'Smiles' column.
# generate_ecfp raises ValueError on a SMILES that RDKit cannot parse.
chembl_data.loc[:, 'ECFP'] = chembl_data['Smiles'].apply(generate_ecfp)

In [None]:
# Positional hold-out split (no shuffling): validation is the first 390 rows, training is the rest.
val_data = chembl_data.iloc[:390]  # select the first 390 rows
train_data = chembl_data.iloc[390:]  # select the remaining rows

In [None]:
# Features are the fingerprint column, targets the pIC50 column, for each split.
val_x, val_y = val_data['ECFP'], val_data['pIC50']
train_x, train_y = train_data['ECFP'], train_data['pIC50']

In [None]:
# Turn each Series of fingerprints into a single 2-D array (one row per molecule).
train_x = np.stack(train_x.to_numpy())
val_x = np.stack(val_x.to_numpy())

In [None]:
# Sanity-check the feature matrix shapes of both splits.
print(f"Validation X Shape: {val_x.shape}")
print(f"Training X Shape: {train_x.shape}")

Validation X Shape: (390, 2048)
Training X Shape: (1449, 2048)


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [None]:
# Hyperparameter search space
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'gamma': uniform(0, 0.5)
}

# XGBoost base model
xgb_model = XGBRegressor(random_state=CFG['SEED'], n_jobs=-1)

# Hyperparameter optimisation with RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    verbose=2,
    n_jobs=1,
    random_state=CFG['SEED']
)

# Fit on the training split only (train_x / train_y derived from train_data).
# Fitting on all of chembl_data here would put the validation rows into the
# training set and make the validation RMSE computed afterwards meaningless.
random_search.fit(train_x, train_y)

# Best candidate found by the search.
best_xgb_model = random_search.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END colsample_bytree=0.8123620356542087, gamma=0.4753571532049581, learning_rate=0.22959818254342154, max_depth=7, n_estimators=120, subsample=0.7468055921327309; total time=   0.9s
[CV] END colsample_bytree=0.8123620356542087, gamma=0.4753571532049581, learning_rate=0.22959818254342154, max_depth=7, n_estimators=120, subsample=0.7468055921327309; total time=   3.9s
[CV] END colsample_bytree=0.8123620356542087, gamma=0.4753571532049581, learning_rate=0.22959818254342154, max_depth=7, n_estimators=120, subsample=0.7468055921327309; total time=   3.0s
[CV] END colsample_bytree=0.8123620356542087, gamma=0.4753571532049581, learning_rate=0.22959818254342154, max_depth=7, n_estimators=120, subsample=0.7468055921327309; total time=   0.9s
[CV] END colsample_bytree=0.8123620356542087, gamma=0.4753571532049581, learning_rate=0.22959818254342154, max_depth=7, n_estimators=120, subsample=0.7468055921327309; total time=   1.0s
[CV] 

In [None]:
# Compute predictions
train_y_pred = best_xgb_model.predict(train_x)
val_y_pred = best_xgb_model.predict(val_x)

# Compute RMSE
# NOTE(review): val_rmse is only an honest hold-out estimate if the rows behind
# val_x were not part of the data best_xgb_model was fitted on.
train_rmse = np.sqrt(mean_squared_error(train_y, train_y_pred))
val_rmse = np.sqrt(mean_squared_error(val_y, val_y_pred))

# Print the results
print(f"Training RMSE: {train_rmse:.4f}")
print(f"Validation RMSE: {val_rmse:.4f}")

Training RMSE: 0.1960
Validation RMSE: 0.1935


In [None]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 to IC50 using IC50 = 10 ** (9 - pIC50).

    Args:
        pic50_values: A scalar, or any object supporting `9 - x` followed by
            `10 ** x` (the notebook passes NumPy arrays and pandas Series).

    Returns:
        The converted value(s), in the same container form as the input.
    """
    exponent = 9 - pic50_values
    return 10 ** exponent

In [None]:
# Load the preprocessed test table; later cells read its 'Smiles' and 'pIC50' columns.
test = pd.read_csv('/content/drive/MyDrive/DACON/제2회 신약개발 AI 경진대회/open/preprocessed_test.csv')

In [None]:
# Fingerprint every test molecule with the same generate_ecfp defaults used for training.
test.loc[:, 'ECFP'] = test['Smiles'].apply(generate_ecfp)

In [None]:
# Stack the fingerprints into a 2-D feature matrix and take pIC50 as the target.
test_x = np.stack(test['ECFP'].values)
test_y = test['pIC50']

In [None]:
# Predict pIC50 for the labelled test set (the model was trained on pIC50 targets).
test_y_pred = best_xgb_model.predict(test_x)

In [None]:
# Convert predictions and labels from pIC50 to IC50. Both names are overwritten,
# so re-running this cell without re-running the previous ones converts twice.
test_y_pred, test_y = pIC50_to_IC50(test_y_pred), pIC50_to_IC50(test_y)

In [None]:
# RMSE on the IC50 scale (test_y / test_y_pred hold IC50 values at this point)
rmse = sqrt(mean_squared_error(test_y, test_y_pred))

# Normalized RMSE: divide by the range of the TRUE values only. Mixing the true
# maximum with the predicted minimum makes the normaliser depend on the model.
normalized_rmse = rmse / (np.max(test_y) - np.min(test_y))

# Absolute error in pIC50 units. Since pIC50 = 9 - log10(IC50), the pIC50
# difference equals the difference of log10(IC50), so the 9 cancels out.
# NOTE(review): the 0.5 tolerance is taken to be a pIC50 (log-scale) tolerance;
# confirm against the competition's metric definition.
absolute_error = np.abs(np.log10(test_y) - np.log10(test_y_pred))

# Correct Ratio: fraction of samples whose absolute pIC50 error is at most 0.5
correct_ratio = np.mean(absolute_error <= 0.5)

# Overall score
score = 0.5 * (1 - min(normalized_rmse, 1)) + 0.5 * correct_ratio

print(f"Normalized RMSE (A): {normalized_rmse}")
print(f"Correct Ratio (B): {correct_ratio}")
print(f"Score: {score}")

Normalized RMSE (A): 0.09926023167810934
Correct Ratio (B): 0.061946902654867256
Score: 0.48134333548837893


In [None]:
# Load the submission test table; its 'Smiles' column is fingerprinted in the next cell.
submit_test = pd.read_csv('/content/drive/MyDrive/DACON/제2회 신약개발 AI 경진대회/open/test.csv')


In [None]:
# Fingerprint the submission molecules and stack them into a 2-D feature matrix.
submit_test.loc[:, 'ECFP'] = submit_test['Smiles'].apply(generate_ecfp)
stest_x = np.stack(submit_test['ECFP'].values)

In [None]:
# Predict pIC50 for the submission molecules.
stest_y_pred = best_xgb_model.predict(stest_x)

In [None]:
# Fill the sample submission's 'IC50_nM' column with the predictions converted
# from pIC50 to IC50, then preview the first rows.
submit = pd.read_csv('/content/drive/MyDrive/DACON/제2회 신약개발 AI 경진대회/open/sample_submission.csv')
submit['IC50_nM'] =pIC50_to_IC50(stest_y_pred)
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,529.472717
1,TEST_001,63.975063
2,TEST_002,31.72826
3,TEST_003,41.16008
4,TEST_004,29.83737


In [None]:
# Write the submission file to Drive without the dataframe index column.
submit.to_csv('/content/drive/MyDrive/DACON/제2회 신약개발 AI 경진대회/submit_v4.csv', index=False)