In [13]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb

In [14]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return np.array(fp)
    else:
        return np.zeros(n_bits)

train_data = pd.read_csv('data/tested_molecules.csv')

X = np.array([smiles_to_fingerprint(smiles) for smiles in train_data['SMILES']])
y_pkm2 = train_data['PKM2_inhibition'].values
y_erk2 = train_data['ERK2_inhibition'].values

X_train, X_val, y_train_pkm2, y_val_pkm2 = train_test_split(X, y_pkm2, test_size=0.2, random_state=42)
_, _, y_train_erk2, y_val_erk2 = train_test_split(X, y_erk2, test_size=0.2, random_state=42)

In [15]:
model_xgb_pkm2 = xgb.XGBClassifier()
model_xgb_erk2 = xgb.XGBClassifier()

model_xgb_pkm2.fit(X_train, y_train_pkm2)
model_xgb_erk2.fit(X_train, y_train_erk2)

def evaluate_model(model, X_val, y_val, label):
    y_pred = (model.predict(X_val) > 0.5).astype(int).flatten()
    accuracy = accuracy_score(y_val, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
    
    print(f"{label} Model Performance:")
    print(f"Accuracy: {accuracy * 100:.4f}%")
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print("-" * 30)

evaluate_model(model_xgb_pkm2, X_val, y_val_pkm2, "XGBoost for PKM2")
evaluate_model(model_xgb_erk2, X_val, y_val_erk2, "XGBoost for ERK2")

XGBoost for PKM2 Model Performance:
Accuracy: 97.3214%
TP: 1, TN: 217, FP: 1, FN: 5
------------------------------
XGBoost for ERK2 Model Performance:
Accuracy: 92.8571%
TP: 0, TN: 208, FP: 0, FN: 16
------------------------------


In [16]:
test_data = pd.read_csv('data/untested_molecules-3.csv')

test_data.head()

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
0,C[C@@H](Sc1nc(=O)cc(N)[nH]1)C(=O)NC1CCCCC1,,
1,O=C(CCN1C(=O)COc2ccccc21)NCc1cccs1,,
2,Cn1nnnc1SCC(=O)N1CC[NH+](Cc2ccccc2)CC1,,
3,CCOC(=O)CCP(=O)([O-])[C@@H](O)c1ccc(OC)cc1,,
4,C=CCNC(=O)c1cc(-c2ccccc2O)on1,,


In [17]:
X_test = np.array([smiles_to_fingerprint(smiles) for smiles in test_data['SMILES']])

In [18]:
test_data['PKM2_inhibition'] = model_xgb_pkm2.predict(X_test)
test_data['ERK2_inhibition'] = model_xgb_erk2.predict(X_test)

In [19]:
test_data[test_data['PKM2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
279,Cc1ccnc2nc(C(=O)N3CCN(S(=O)(=O)c4ccccc4)CC3)nn12,1,0
519,O=C(c1ccc([N-]S(=O)(=O)c2ccccc2)cc1)N1CC[NH+](...,1,0
524,Cc1cc(-n2c(C)cc(C(=O)CSc3nc4ccccc4c(=O)n3Cc3cc...,1,0
1178,COc1ccc(OCc2nn3c(Cn4cnc5ccccc54)nnc3s2)cc1,1,0
1310,O=C(c1ccc(S(=O)(=O)N2CCCC2)cc1)N1CC[NH+](Cc2cc...,1,0
1362,C=CCn1c(=O)/c(=C/c2ccc(C)o2)s/c1=C(/C#N)C(=O)c...,1,0
1427,Cc1cc(NC(=O)CSc2nnc3c(C)cc4c(C)ccc(C)c4n23)no1,1,0
1475,O=c1c2ccccc2ncn1Cc1nc2ccccc2s1,1,0
1514,c1ccc(OCc2nn3c(Cn4cnc5ccccc54)nnc3s2)cc1,1,0
1630,CCOC(=O)c1c(-c2ccc(C)o2)csc1NC(=O)COC(=O)c1cccs1,1,0


In [20]:
test_data[test_data['ERK2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
294,c1ccc(CCCc2nc(-c3ccccn3)no2)cc1,0,1
1201,C(=N/c1c(-c2ccccc2)nc2n1CCS2)\c1ccccc1,0,1
1243,OCC[NH+]1CCN(Cc2nc(O)c3ccccc3n2)CC1,0,1
1291,Oc1c([C@@H](c2ccccn2)[NH+]2CCOCC2)ccc2cccnc12,0,1
1900,Nc1nc(CSc2n[nH]c(-c3ccccc3)n2)nc(Nc2ccccc2)n1,0,1
1943,c1ccc(-c2oc3ncnc(NCCCn4ccnc4)c3c2-c2ccccc2)cc1,0,1
2211,O[C@H](c1ccc2c(c1)OCO2)c1nccc2ccccc12,0,1
2455,O[C@@H](COc1ccc(Cl)c2ccccc12)C[NH+]1CCN(c2cccc...,0,1
2503,c1ccc(OCc2nc(-c3ccccc3)no2)cc1,0,1
2773,O=c1cc(-c2ccccc2)nc2nc(CCc3ccccc3)[nH]n12,0,1
