In [79]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score
from imblearn.over_sampling import SMOTE
from tqdm import tqdm

In [80]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return np.array(fp)
    else:
        return np.zeros(n_bits)
    
def evaluate_model(model, X_val, y_val, label, threshold=0.5, print_info=False):
    y_proba = model.predict_proba(X_val)[:, 1]
    y_pred = (y_proba > threshold).astype(int)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
    
    if print_info:
        print(f"{label} Model Performance:")
        print(f"Precision: {precision * 100:.4f}%, Recall: {recall * 100:.4f}%")
        print(f"Accuracy: {accuracy * 100:.4f}%")
        print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
        print("-" * 30)
    return precision, recall, accuracy, tn, fp, fn, tp

In [81]:
train_data = pd.read_csv('data/tested_molecules.csv')

X = np.array([smiles_to_fingerprint(smiles) for smiles in train_data['SMILES']])
y_pkm2 = train_data['PKM2_inhibition'].values
y_erk2 = train_data['ERK2_inhibition'].values

X_train, X_val, y_train_pkm2, y_val_pkm2 = train_test_split(X, y_pkm2, test_size=0.2, random_state=42)
_, _, y_train_erk2, y_val_erk2 = train_test_split(X, y_erk2, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote_pkm2, y_train_smote_pkm2 = smote.fit_resample(X_train, y_train_pkm2)
X_train_smote_erk2, y_train_smote_erk2 = smote.fit_resample(X_train, y_train_erk2)

In [82]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'scale_pos_weight': [1, 10, 25, 50]
}

In [83]:
xgb_pkm2 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

grid_search_pkm2 = GridSearchCV(estimator=xgb_pkm2, param_grid=param_grid, scoring='precision', cv=5, verbose=1, n_jobs=-1)
grid_search_pkm2.fit(X_train_smote_pkm2, y_train_smote_pkm2)

Fitting 5 folds for each of 108 candidates, totalling 540 fits




In [84]:
xgb_erk2 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

grid_search_erk2 = GridSearchCV(estimator=xgb_erk2, param_grid=param_grid, scoring='precision', cv=5, verbose=1, n_jobs=-1)
grid_search_erk2.fit(X_train_smote_erk2, y_train_smote_erk2)

Fitting 5 folds for each of 108 candidates, totalling 540 fits




In [85]:
print(f"number of 1s in y_val_PKM2 {np.count_nonzero(y_val_pkm2 == 1)}")
print(f"number of 1s in y_val_ERK2 {np.count_nonzero(y_val_erk2 == 1)}")

number of 1s in y_val_PKM2 6
number of 1s in y_val_ERK2 16


In [94]:
best_model_pkm2 = grid_search_pkm2.best_estimator_
best_model_erk2 = grid_search_erk2.best_estimator_

thresholds = np.linspace(0.01, 1.0, 1000)
best_threshold_pkm2, best_recall_pkm2, best_precision_pkm2, best_accuracy_pkm2 = 0.5, 0, 0, 0
best_threshold_erk2, best_recall_erk2, best_precision_erk2, best_accuracy_erk2 = 0.5, 0, 0, 0
fn_pkm2, fn_erk2 = float('inf'), float('inf')

print("Finding best values")

for threshold in tqdm(thresholds, desc="Evaluating thresholds"):
    precision, recall, accuracy, _, _, fn, _ = evaluate_model(best_model_pkm2, X_val, y_val_pkm2, f"PKM2 (Threshold: {threshold:.2f})", threshold)
    if recall >= best_recall_pkm2 and precision >= best_precision_pkm2 and accuracy >= best_accuracy_pkm2 and fn <= fn_pkm2:
        best_recall_pkm2 = recall
        best_threshold_pkm2 = threshold
        best_precision_pkm2 = precision
        best_accuracy_pkm2 = accuracy
        fn_pkm2 = fn
    
    precision, recall, accuracy, _, _, fn, _ = evaluate_model(best_model_erk2, X_val, y_val_erk2, f"ERK2 (Threshold: {threshold:.2f})", threshold)
    if recall >= best_recall_erk2 and precision >= best_precision_erk2 and accuracy >= best_accuracy_erk2 and fn <= fn_erk2:
        best_recall_erk2 = recall
        best_threshold_erk2 = threshold
        best_precision_erk2 = precision
        best_accuracy_erk2 = accuracy
        fn_erk2 = fn

print(f"PKM2: {'-' * 30}\n"
      f"Best Threshold: {best_threshold_pkm2}\n"
      f"Best Recall: {best_recall_pkm2}\n"
      f"Best Accuracy: {best_accuracy_pkm2}\n"
      f"Best Precision: {best_precision_pkm2}\n"
      f"Best FN: {fn_pkm2}\n")
print(f"ERK2: {'-' * 30}\n"
      f"Best Threshold: {best_accuracy_erk2}\n"
      f"Best Recall: {best_recall_erk2}\n"
      f"Best Accuracy: {best_accuracy_erk2}\n"
      f"Best Precision: {best_precision_erk2}\n"
      f"Best FN: {fn_erk2}\n\n")

evaluate_model(best_model_pkm2, X_val, y_val_pkm2, "Final PKM2", best_threshold_pkm2, True)
evaluate_model(best_model_erk2, X_val, y_val_erk2, "Final ERK2", best_threshold_erk2, True)

Finding best values


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

PKM2: ------------------------------
Best Threshold: 0.04666666666666667
Best Recall: 0.3333333333333333
Best Accuracy: 0.9419642857142857
Best Precision: 0.18181818181818182
Best FN: 4

ERK2: ------------------------------
Best Threshold: 0.8169642857142857
Best Recall: 0.1875
Best Accuracy: 0.8169642857142857
Best Precision: 0.0967741935483871
Best FN: 13


Final PKM2 Model Performance:
Precision: 18.1818%, Recall: 33.3333%
Accuracy: 94.1964%
TP: 2, TN: 209, FP: 9, FN: 4
------------------------------
Final ERK2 Model Performance:
Precision: 9.6774%, Recall: 18.7500%
Accuracy: 81.6964%
TP: 3, TN: 180, FP: 28, FN: 13
------------------------------


(0.0967741935483871, 0.1875, 0.8169642857142857, 180, 28, 13, 3)

In [95]:
test_data = pd.read_csv('data/untested_molecules-3.csv')
X_test = np.array([smiles_to_fingerprint(smiles) for smiles in test_data['SMILES']])

test_data['PKM2_inhibition'] = (best_model_pkm2.predict_proba(X_test)[:, 1] > best_threshold_pkm2).astype(int)
test_data['ERK2_inhibition'] = (best_model_erk2.predict_proba(X_test)[:, 1] > best_threshold_erk2).astype(int)

In [96]:
test_data[test_data['PKM2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
4,C=CCNC(=O)c1cc(-c2ccccc2O)on1,1,0
10,O=C([O-])Cc1ccc2[nH]c3ccc(CC(=O)[O-])cc3c2c1,1,1
15,CCN1/C(=C/C(=O)c2ccco2)Sc2ccccc21,1,0
22,O=P([O-])(CP(=O)(c1ccccc1)c1ccccc1)c1ccccc1,1,1
23,Cc1cc(C)c2c(CC(=O)[O-])coc2c1,1,0
...,...,...,...
4423,Cc1occc1C(=O)NCc1cccs1,1,0
4435,Cc1cc(C)c(C#N)c(SC[S@@](=O)c2ccc(C(C)(C)C)cc2)n1,1,0
4440,c1csc(-c2nn[n-]n2)c1,1,1
4447,C[C@@H]1CCCN(C(=O)COC(=O)c2n[nH]c3ccccc23)C1,1,0


In [97]:
test_data[test_data['ERK2_inhibition'] == 1] 

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
6,CSc1ccc(/C=N/n2c(C)cc(C)cc2=O)cc1,0,1
9,CC[NH+]1CCN(Cc2ccc(C)cc2)CC1,0,1
10,O=C([O-])Cc1ccc2[nH]c3ccc(CC(=O)[O-])cc3c2c1,1,1
16,CN1C(=O)N2C[NH+](C3CCCCC3)CN3C(=O)N(C)C1C23,0,1
18,COc1ccccc1NC(=O)N1CCN(c2cc(-c3ccccc3)nc3ncnn23...,0,1
...,...,...,...
4403,C/C(Cl)=C/Cn1c(SC(C)C)nc2c1c(=O)[nH]c(=O)n2C,0,1
4418,Cn1cnc2nc3n(c2c1=O)CCC3,0,1
4437,C[NH+]1CCC2(CC1)NC(=O)NC2=O,0,1
4440,c1csc(-c2nn[n-]n2)c1,1,1
