In [35]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from tensorflow.keras.layers import Conv1D, Dense, Flatten, MaxPooling1D
from tensorflow.keras.optimizers import Adam

In [36]:
train_data = pd.read_csv('data/tested_molecules.csv')

train_data.head()

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
0,C=C(C)c1nc(N)nc(N)n1,0,0
1,C=C(Cl)COc1ccc2c(C)cc(=O)oc2c1,0,0
2,C=CCNC(=O)CCCC(=O)NCC=C,0,0
3,C=CCOn1c(=O)c(C)[n+]([O-])c2ccccc21,0,0
4,C=CCn1cc(Cl)c(=O)n(CC=C)c1=O,0,0


In [37]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return np.array(fp)
    else:
        return np.zeros(n_bits)

In [38]:
X = np.array([smiles_to_fingerprint(smiles) for smiles in train_data['SMILES']])
y_pkm2 = train_data['PKM2_inhibition'].values
y_erk2 = train_data['ERK2_inhibition'].values

In [39]:
X_train, X_val, y_pkm2_train, y_pkm2_val, y_erk2_train, y_erk2_val = train_test_split(
    X, y_pkm2, y_erk2, test_size=0.2, random_state=42
)

In [40]:
def build_cnn_model(input_shape):
    model = Sequential([
        Conv1D(32, 3, activation='relu', input_shape=input_shape),
        MaxPooling1D(2),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

input_shape = (X_train.shape[1], 1)

In [41]:
def print_model_performance(model, X_val, y_val, label):
    y_pred = (model.predict(X_val) > 0.5).astype(int).flatten()
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
    print(f"{label} Model Performance:")
    print(f"Accuracy: {accuracy * 100:.4f}%")
    print(f"Precision: {precision * 100:.4f}%, Recall: {recall * 100:.4f}")
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print("-" * 30)

In [42]:
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_val_cnn = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))

In [43]:
model_pkm2 = build_cnn_model(input_shape)
model_pkm2.fit(X_train_cnn, y_pkm2_train, epochs=10, batch_size=32, validation_data=(X_val_cnn, y_pkm2_val))

print_model_performance(model_pkm2, X_val_cnn, y_pkm2_val, "PKM2")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9054 - loss: 0.2470 - val_accuracy: 0.9732 - val_loss: 0.1103
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9780 - loss: 0.0714 - val_accuracy: 0.9732 - val_loss: 0.0944
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9751 - loss: 0.0475 - val_accuracy: 0.9732 - val_loss: 0.1127
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9674 - loss: 0.0347 - val_accuracy: 0.9732 - val_loss: 0.1329
Epoch 5/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9754 - loss: 0.0253 - val_accuracy: 0.9732 - val_loss: 0.1225
Epoch 6/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9859 - loss: 0.0249 - val_accuracy: 0.9732 - val_loss: 0.1533
Epoch 7/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
model_erk2 = build_cnn_model(input_shape)
model_erk2.fit(X_train_cnn, y_erk2_train, epochs=10, batch_size=32, validation_data=(X_val_cnn, y_erk2_val))

print_model_performance(model_erk2, X_val_cnn, y_erk2_val, "ERK2")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8555 - loss: 0.2902 - val_accuracy: 0.9286 - val_loss: 0.3028
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9562 - loss: 0.1351 - val_accuracy: 0.9286 - val_loss: 0.3683
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9636 - loss: 0.0676 - val_accuracy: 0.9286 - val_loss: 0.4223
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9654 - loss: 0.0395 - val_accuracy: 0.9286 - val_loss: 0.4617
Epoch 5/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9777 - loss: 0.0343 - val_accuracy: 0.9286 - val_loss: 0.5662
Epoch 6/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9741 - loss: 0.0363 - val_accuracy: 0.9286 - val_loss: 0.6508
Epoch 7/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
test_data = pd.read_csv('data/untested_molecules-3.csv')

test_data.head()

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
0,C[C@@H](Sc1nc(=O)cc(N)[nH]1)C(=O)NC1CCCCC1,,
1,O=C(CCN1C(=O)COc2ccccc21)NCc1cccs1,,
2,Cn1nnnc1SCC(=O)N1CC[NH+](Cc2ccccc2)CC1,,
3,CCOC(=O)CCP(=O)([O-])[C@@H](O)c1ccc(OC)cc1,,
4,C=CCNC(=O)c1cc(-c2ccccc2O)on1,,


In [46]:
X_test = np.array([smiles_to_fingerprint(smiles) for smiles in test_data['SMILES']])
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [47]:
threshold = 0.5
test_data['PKM2_inhibition'] = (model_pkm2.predict(X_test_cnn) > threshold).astype(int)
test_data['ERK2_inhibition'] = (model_erk2.predict(X_test_cnn) > threshold).astype(int)

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [48]:
test_data[test_data['PKM2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
1939,Clc1ccccc1-c1nn2c(Cn3cnc4ccccc43)nnc2s1,1,0
2221,O=C(NCCc1ccccn1)c1ccc(S(=O)(=O)N2CCCCCC2)cc1,1,0


In [49]:
test_data[test_data['ERK2_inhibition'] == 1]

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
1439,Nc1ccc(/N=N/c2ccccc2)c(N)n1,0,1
2050,CC(C)CCn1c(SCCc2ccccc2)nc2c1c(=O)[nH]c(=O)n2C,0,1
2773,O=c1cc(-c2ccccc2)nc2nc(CCc3ccccc3)[nH]n12,0,1
4012,CC(C)CSc1nc2c(c(=O)[nH]c(=O)n2C)n1CCc1ccccc1,0,1
