In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score
from imblearn.over_sampling import SMOTE

In [2]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return np.array(fp)
    else:
        return np.zeros(n_bits)
    
def evaluate_model(model, X_val, y_val, label):
    y_pred = (model.predict(X_val) > 0.5).astype(int).flatten()
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
    
    print(f"{label} Model Performance:")
    print(f"Precision: {precision * 100:.4f}%, Recall: {recall * 100:.4f}")
    print(f"Accuracy: {accuracy * 100:.4f}%")
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print("-" * 30)

In [3]:
train_data = pd.read_csv('data/tested_molecules.csv')

X = np.array([smiles_to_fingerprint(smiles) for smiles in train_data['SMILES']])
y_pkm2 = train_data['PKM2_inhibition'].values
y_erk2 = train_data['ERK2_inhibition'].values

X_train, X_val, y_train_pkm2, y_val_pkm2 = train_test_split(X, y_pkm2, test_size=0.2, random_state=42)
_, _, y_train_erk2, y_val_erk2 = train_test_split(X, y_erk2, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote_pkm2, y_train_smote_pkm2 = smote.fit_resample(X_train, y_train_pkm2)
X_train_smote_erk2, y_train_smote_erk2 = smote.fit_resample(X_train, y_train_erk2)

In [4]:
scale_pos_weight_pkm2 = np.sum(y_train_pkm2 == 0) / np.sum(y_train_pkm2 == 1)
scale_pos_weight_erk2 = np.sum(y_train_erk2 == 0) / np.sum(y_train_erk2 == 1)

model_xgb_pkm2 = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight_pkm2)
model_xgb_erk2 = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight_erk2)

model_xgb_pkm2.fit(X_train, y_train_pkm2)
model_xgb_erk2.fit(X_train, y_train_erk2)

evaluate_model(model_xgb_pkm2, X_val, y_val_pkm2, "XGBoost scale_pos for PKM2")
evaluate_model(model_xgb_erk2, X_val, y_val_erk2, "XGBoost scale_pos for ERK2")

XGBoost scale_pos for PKM2 Model Performance:
Precision: 33.3333%, Recall: 16.6667
Accuracy: 96.8750%
TP: 1, TN: 216, FP: 2, FN: 5
------------------------------
XGBoost scale_pos for ERK2 Model Performance:
Precision: 0.0000%, Recall: 0.0000
Accuracy: 91.9643%
TP: 0, TN: 206, FP: 2, FN: 16
------------------------------


In [5]:
model_xgb_pkm2_smote = xgb.XGBClassifier()
model_xgb_erk2_smote = xgb.XGBClassifier()

model_xgb_pkm2_smote.fit(X_train_smote_pkm2, y_train_smote_pkm2)
model_xgb_erk2_smote.fit(X_train_smote_erk2, y_train_smote_erk2)

model_xgb_pkm2_smote.fit(X_train, y_train_pkm2)
model_xgb_pkm2_smote.fit(X_train, y_train_erk2)

evaluate_model(model_xgb_pkm2_smote, X_val, y_val_pkm2, "XGBoost for PKM2")
evaluate_model(model_xgb_pkm2_smote, X_val, y_val_erk2, "XGBoost for ERK2")

XGBoost for PKM2 Model Performance:
Precision: 0.0000%, Recall: 0.0000
Accuracy: 97.3214%
TP: 0, TN: 218, FP: 0, FN: 6
------------------------------
XGBoost for ERK2 Model Performance:
Precision: 0.0000%, Recall: 0.0000
Accuracy: 92.8571%
TP: 0, TN: 208, FP: 0, FN: 16
------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
test_data = pd.read_csv('data/untested_molecules-3.csv')
X_test = np.array([smiles_to_fingerprint(smiles) for smiles in test_data['SMILES']])

In [7]:
test_data['PKM2_inhibition'] = model_xgb_pkm2.predict(X_test)
test_data['ERK2_inhibition'] = model_xgb_erk2.predict(X_test)

In [8]:
test_data[test_data['PKM2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
20,CCN1C(=O)[C@]2(C(C#N)=C([NH3+])Oc3cc(C)oc(=O)c...,1,0
36,O=C([O-])c1ccc(CN(c2ccccn2)S(=O)(=O)c2ccccc2)cc1,1,0
210,Cc1cc(NC(=O)CSc2nc3ccsc3c(=O)n2CCCCC(=O)[O-])no1,1,0
326,Cc1cc(-n2c(C)cc(C(=O)CSc3ncnc4ccccc34)c2C)no1,1,0
519,O=C(c1ccc([N-]S(=O)(=O)c2ccccc2)cc1)N1CC[NH+](...,1,0
...,...,...,...
4080,CN(C(=O)COc1ccccc1)c1nnc(-c2cccnc2)s1,1,0
4122,CCOC(=O)c1sc(NC(=O)c2ccc(C)o2)nc1C,1,0
4130,CC[C@@H](OC(=O)[C@H](NC(=O)c1ccccc1)c1ccccc1)C...,1,0
4358,CCCNC(=O)COC(=O)c1ccc(S(=O)(=O)N2CCCCCC2)cc1,1,0


In [9]:
test_data[test_data['ERK2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
240,N#CCCN1CC[NH+](Cc2cc(-c3ccccc3)ccc2O)CC1,0,1
419,COc1cc2[nH]c(=O)n(CCC(=O)NC3CCCC3)c(=O)c2cc1OC,0,1
468,Cn1c(SCC(=O)N2CCCCC2)nc2cccnc21,0,1
531,Nc1ccc(-c2n[nH]c(=O)c3ccccc23)cc1N,0,1
641,O=c1c2ccccc2nc2n1CCS2,0,1
898,N#Cc1nc(-c2ccc(F)cc2)oc1NCc1cccc2c1OCO2,0,1
1201,C(=N/c1c(-c2ccccc2)nc2n1CCS2)\c1ccccc1,0,1
1243,OCC[NH+]1CCN(Cc2nc(O)c3ccccc3n2)CC1,0,1
1291,Oc1c([C@@H](c2ccccn2)[NH+]2CCOCC2)ccc2cccnc12,0,1
1569,O=c1nc(Cc2cccs2)[nH]c2ccccc12,0,1


In [10]:
test_data = pd.read_csv('data/untested_molecules-3.csv')
X_test = np.array([smiles_to_fingerprint(smiles) for smiles in test_data['SMILES']])

In [11]:
test_data['PKM2_inhibition'] = model_xgb_pkm2_smote.predict(X_test)
test_data['ERK2_inhibition'] = model_xgb_erk2_smote.predict(X_test)

In [12]:
test_data[test_data['PKM2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
294,c1ccc(CCCc2nc(-c3ccccn3)no2)cc1,1,1
1201,C(=N/c1c(-c2ccccc2)nc2n1CCS2)\c1ccccc1,1,1
1243,OCC[NH+]1CCN(Cc2nc(O)c3ccccc3n2)CC1,1,1
1291,Oc1c([C@@H](c2ccccn2)[NH+]2CCOCC2)ccc2cccnc12,1,0
1900,Nc1nc(CSc2n[nH]c(-c3ccccc3)n2)nc(Nc2ccccc2)n1,1,1
1943,c1ccc(-c2oc3ncnc(NCCCn4ccnc4)c3c2-c2ccccc2)cc1,1,0
2211,O[C@H](c1ccc2c(c1)OCO2)c1nccc2ccccc12,1,1
2455,O[C@@H](COc1ccc(Cl)c2ccccc12)C[NH+]1CCN(c2cccc...,1,0
2503,c1ccc(OCc2nc(-c3ccccc3)no2)cc1,1,1
2773,O=c1cc(-c2ccccc2)nc2nc(CCc3ccccc3)[nH]n12,1,1


In [13]:
test_data[test_data['ERK2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
22,O=P([O-])(CP(=O)(c1ccccc1)c1ccccc1)c1ccccc1,0,1
32,O=c1nc[nH]c2c1ncn2[C@H]1CC[C@@H](CO)O1,0,1
56,C[NH+](C)CCC=C1c2ccccc2CCc2ccccc21,0,1
62,CC[NH+](CC)CCn1c(OC)cc(=O)[nH]c1=O,0,1
176,Fc1ccc(Nc2nc(Oc3ccc4c(c3)OCO4)nc(N3CCOCC3)n2)c...,0,1
...,...,...,...
4270,CN(C)S(=O)(=O)N1CCN(S(=O)(=O)N(C)C)CC1,0,1
4287,O=C(Nc1ccc2c(c1)OCO2)c1c(F)c(F)c(F)c(F)c1F,0,1
4328,Fc1cc2nccnc2cc1N1CCSCC1,0,1
4344,COc1c(C(C)C)oc2cc3oc(=O)ccc3cc12,0,1


In [14]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'scale_pos_weight': [1, 10, 25, 50]
}

In [15]:
xgb_pkm2 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

grid_search_pkm2 = GridSearchCV(estimator=xgb_pkm2, param_grid=param_grid, scoring='precision', cv=5, verbose=1, n_jobs=-1)
grid_search_pkm2.fit(X_train_smote_pkm2, y_train_smote_pkm2)

best_model_pkm2 = grid_search_pkm2.best_estimator_
evaluate_model(best_model_pkm2, X_val, y_val_pkm2, "Best XGBoost for PKM2")

Fitting 5 folds for each of 108 candidates, totalling 540 fits




Best XGBoost for PKM2 Model Performance:
Precision: 100.0000%, Recall: 16.6667
Accuracy: 97.7679%
TP: 1, TN: 218, FP: 0, FN: 5
------------------------------


In [16]:
xgb_erk2 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

grid_search_erk2 = GridSearchCV(estimator=xgb_erk2, param_grid=param_grid, scoring='precision', cv=5, verbose=1, n_jobs=-1)
grid_search_erk2.fit(X_train_smote_erk2, y_train_smote_erk2)

best_model_erk2 = grid_search_erk2.best_estimator_
evaluate_model(best_model_erk2, X_val, y_val_erk2, "Best XGBoost for ERK2")

Fitting 5 folds for each of 108 candidates, totalling 540 fits




Best XGBoost for ERK2 Model Performance:
Precision: 0.0000%, Recall: 0.0000
Accuracy: 90.6250%
TP: 0, TN: 203, FP: 5, FN: 16
------------------------------


In [18]:
test_data = pd.read_csv('data/untested_molecules-3.csv')
X_test = np.array([smiles_to_fingerprint(smiles) for smiles in test_data['SMILES']])

test_data['PKM2_inhibition'] = best_model_pkm2.predict(X_test)
test_data['ERK2_inhibition'] = best_model_erk2.predict(X_test)

In [19]:
test_data[test_data['PKM2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
111,CCCc1ccc(S(=O)(=O)NCCc2ccncc2)cc1,1,0
326,Cc1cc(-n2c(C)cc(C(=O)CSc3ncnc4ccccc34)c2C)no1,1,0
334,O=C(OCCN1C(=O)CCC1=O)c1ccc(S(=O)(=O)N2CCCCC2)cc1,1,0
413,O=C([O-])C[C@@H](NS(=O)(=O)c1ccc(Br)cc1)c1ccco1,1,0
455,C[NH+](C)CCC(=O)c1ccc(Cl)cc1,1,0
...,...,...,...
4261,Nc1cc(-c2ccc(Br)cc2)nn1S(=O)(=O)c1ccccc1,1,1
4275,O=C(Cc1cccs1)Nc1nncs1,1,0
4365,O=C(COC(=O)C1CCN(S(=O)(=O)c2cccs2)CC1)c1ccc2c(...,1,0
4423,Cc1occc1C(=O)NCc1cccs1,1,0


In [20]:
test_data[test_data['ERK2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
22,O=P([O-])(CP(=O)(c1ccccc1)c1ccccc1)c1ccccc1,0,1
32,O=c1nc[nH]c2c1ncn2[C@H]1CC[C@@H](CO)O1,0,1
56,C[NH+](C)CCC=C1c2ccccc2CCc2ccccc21,0,1
62,CC[NH+](CC)CCn1c(OC)cc(=O)[nH]c1=O,0,1
290,O=c1nc(N(c2ccccc2)c2ccccc2)[nH]c(=O)[nH]1,0,1
...,...,...,...
4287,O=C(Nc1ccc2c(c1)OCO2)c1c(F)c(F)c(F)c(F)c1F,0,1
4328,Fc1cc2nccnc2cc1N1CCSCC1,0,1
4344,COc1c(C(C)C)oc2cc3oc(=O)ccc3cc12,0,1
4362,COc1ccc(OC)c(NS(=O)(=O)c2ccc(SC)cc2)c1,0,1
