In [None]:
import pandas as pd
import numpy as np

import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem

import sklearn.metrics as metrics

from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_predict, KFold, StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

random_state = 23

In [None]:
df = pd.read_csv('cmpd.csv')
df

In [None]:
df['mol'] = df.smiles.apply(Chem.MolFromSmiles)
df['mol'] = df.mol.apply(Chem.AddHs)
df

In [None]:
for idx in range(len(df)):
    temp_list = []
    temp_list.append(df.iloc[idx, 0].split("-"))
    df['num_of_atoms'] = df['mol'][idx].GetNumAtoms()
    df['num_of_heavy_atoms'] = df['mol'][idx].GetNumHeavyAtoms()
    df['inchikey_1'] = temp_list[0][0]
    df['inchikey_2'] = temp_list[0][1]

In [None]:
df = df[['inchikey', 'inchikey_1', 'inchikey_2', 'smiles', 'group', 'activity', 'mol', 'num_of_atoms', 'num_of_heavy_atoms']]
df

In [None]:
def get_Xy(df):
    X = np.vstack(df.mol.apply(lambda m: list(AllChem.GetMorganFingerprintAsBitVect(m, 9, nBits=2048))))
    y = df.activity.eq('active').astype(float).to_numpy()
    return X, y

X_train, y_train = get_Xy(df[df.group.eq('train')])
X_test, y_test = get_Xy(df[df.group.eq('test')])

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [54]:

kf = KFold(n_splits=3, shuffle=True)

for train_idx, test_idx in kf.split(X_train, y_train):
    model = XGBClassifier(
        n_jobs=-1,
        max_depth=9
    )
    
    x_train_fold = X_train[train_idx]
    y_train_fold = y_train[train_idx]
    x_test_fold = X_train[test_idx]
    y_test_fold = y_train[test_idx]

    model.fit(x_train_fold, y_train_fold)
    y_pred = model.predict_proba(x_test_fold)[:, 1]

    log_loss = metrics.log_loss(y_test_fold, y_pred, labels=[0, 1])
    precision, recall, _ = metrics.precision_recall_curve(y_test_fold, y_pred, pos_label=1)
    fpr_roc, tpr_roc, _ = metrics.roc_curve(y_test_fold, y_pred, pos_label=1)

    auc1 = metrics.auc(recall, precision)
    auc2 = metrics.auc(fpr_roc, tpr_roc)

    print(y_pred)
    print(log_loss)
    print(auc1)
    print(auc2)

[0.4707037  0.9999472  0.5685813  ... 0.29856095 0.88801277 0.00948397]
0.3588355018121219
0.9432622927996529
0.927719284654802
[0.08804414 0.99882585 0.11920207 ... 0.10668977 0.03281366 0.00708776]
0.3891157072641492
0.9411702280551644
0.9192928938997327
[0.48621848 0.09781464 0.98054516 ... 0.3471839  0.00666342 0.00250121]
0.39458953151594545
0.9455254751720938
0.9183344883344883


In [None]:
# log_loss = metrics.log_loss(y_test, y_pred, labels=[0, 1])

# precision, recall, _ = metrics.precision_recall_curve(y_test, y_pred, pos_label=1)
# fpr_roc, tpr_roc, _ = metrics.roc_curve(y_test, y_pred, pos_label=1)

# auc1 = metrics.auc(recall, precision)
# auc2 = metrics.auc(fpr_roc, tpr_roc)

In [None]:
print(f"score : {score}")
print(f"log_loss : {log_loss}")
print(f"auc1 : {auc1}")
print(f"auc2 : {auc2}")

# score : 0.8229233741146169
# log_liss : 0.41878918961055056
# auc1 : 0.8563724080878554
# auc2 : 0.8807145044217504