In [21]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score
from imblearn.over_sampling import SMOTE

In [22]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return np.array(fp)
    else:
        return np.zeros(n_bits)
    
def evaluate_model(model, X_val, y_val, label):
    y_pred = (model.predict(X_val) > 0.5).astype(int).flatten()
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
    
    print(f"{label} Model Performance:")
    print(f"Precision: {precision * 100:.4f}%, Recall: {recall * 100:.4f}")
    print(f"Accuracy: {accuracy * 100:.4f}%")
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print("-" * 30)

In [23]:
train_data = pd.read_csv('data/tested_molecules.csv')

X = np.array([smiles_to_fingerprint(smiles) for smiles in train_data['SMILES']])
y_pkm2 = train_data['PKM2_inhibition'].values
y_erk2 = train_data['ERK2_inhibition'].values

X_train, X_val, y_train_pkm2, y_val_pkm2 = train_test_split(X, y_pkm2, test_size=0.2, random_state=42)
_, _, y_train_erk2, y_val_erk2 = train_test_split(X, y_erk2, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote_pkm2, y_train_smote_pkm2 = smote.fit_resample(X_train, y_train_pkm2)
X_train_smote_erk2, y_train_smote_erk2 = smote.fit_resample(X_train, y_train_erk2)

In [24]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'scale_pos_weight': [1, 10, 25, 50]
}

In [25]:
xgb_pkm2 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

grid_search_pkm2 = GridSearchCV(estimator=xgb_pkm2, param_grid=param_grid, scoring='precision', cv=5, verbose=1, n_jobs=-1)
grid_search_pkm2.fit(X_train_smote_pkm2, y_train_smote_pkm2)

best_model_pkm2 = grid_search_pkm2.best_estimator_
evaluate_model(best_model_pkm2, X_val, y_val_pkm2, "Best XGBoost for PKM2")

Fitting 5 folds for each of 108 candidates, totalling 540 fits




Best XGBoost for PKM2 Model Performance:
Precision: 100.0000%, Recall: 16.6667
Accuracy: 97.7679%
TP: 1, TN: 218, FP: 0, FN: 5
------------------------------


In [26]:
xgb_erk2 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

grid_search_erk2 = GridSearchCV(estimator=xgb_erk2, param_grid=param_grid, scoring='precision', cv=5, verbose=1, n_jobs=-1)
grid_search_erk2.fit(X_train_smote_erk2, y_train_smote_erk2)

best_model_erk2 = grid_search_erk2.best_estimator_
evaluate_model(best_model_erk2, X_val, y_val_erk2, "Best XGBoost for ERK2")

Fitting 5 folds for each of 108 candidates, totalling 540 fits




Best XGBoost for ERK2 Model Performance:
Precision: 0.0000%, Recall: 0.0000
Accuracy: 90.6250%
TP: 0, TN: 203, FP: 5, FN: 16
------------------------------


In [27]:
test_data = pd.read_csv('data/untested_molecules-3.csv')
X_test = np.array([smiles_to_fingerprint(smiles) for smiles in test_data['SMILES']])

test_data['PKM2_inhibition'] = best_model_pkm2.predict(X_test)
test_data['ERK2_inhibition'] = best_model_erk2.predict(X_test)

In [28]:
test_data[test_data['PKM2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
111,CCCc1ccc(S(=O)(=O)NCCc2ccncc2)cc1,1,0
326,Cc1cc(-n2c(C)cc(C(=O)CSc3ncnc4ccccc34)c2C)no1,1,0
334,O=C(OCCN1C(=O)CCC1=O)c1ccc(S(=O)(=O)N2CCCCC2)cc1,1,0
413,O=C([O-])C[C@@H](NS(=O)(=O)c1ccc(Br)cc1)c1ccco1,1,0
455,C[NH+](C)CCC(=O)c1ccc(Cl)cc1,1,0
...,...,...,...
4261,Nc1cc(-c2ccc(Br)cc2)nn1S(=O)(=O)c1ccccc1,1,1
4275,O=C(Cc1cccs1)Nc1nncs1,1,0
4365,O=C(COC(=O)C1CCN(S(=O)(=O)c2cccs2)CC1)c1ccc2c(...,1,0
4423,Cc1occc1C(=O)NCc1cccs1,1,0


In [29]:
test_data[test_data['ERK2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
22,O=P([O-])(CP(=O)(c1ccccc1)c1ccccc1)c1ccccc1,0,1
32,O=c1nc[nH]c2c1ncn2[C@H]1CC[C@@H](CO)O1,0,1
56,C[NH+](C)CCC=C1c2ccccc2CCc2ccccc21,0,1
62,CC[NH+](CC)CCn1c(OC)cc(=O)[nH]c1=O,0,1
290,O=c1nc(N(c2ccccc2)c2ccccc2)[nH]c(=O)[nH]1,0,1
...,...,...,...
4287,O=C(Nc1ccc2c(c1)OCO2)c1c(F)c(F)c(F)c(F)c1F,0,1
4328,Fc1cc2nccnc2cc1N1CCSCC1,0,1
4344,COc1c(C(C)C)oc2cc3oc(=O)ccc3cc12,0,1
4362,COc1ccc(OC)c(NS(=O)(=O)c2ccc(SC)cc2)c1,0,1
