In [16]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score, confusion_matrix
from tensorflow.keras.layers import Conv1D, Dense, Flatten, MaxPooling1D
from tensorflow.keras.optimizers import Adam

In [17]:
train_data = pd.read_csv('data/tested_molecules.csv')

train_data.head()

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
0,C=C(C)c1nc(N)nc(N)n1,0,0
1,C=C(Cl)COc1ccc2c(C)cc(=O)oc2c1,0,0
2,C=CCNC(=O)CCCC(=O)NCC=C,0,0
3,C=CCOn1c(=O)c(C)[n+]([O-])c2ccccc21,0,0
4,C=CCn1cc(Cl)c(=O)n(CC=C)c1=O,0,0


In [18]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return np.array(fp)
    else:
        return np.zeros(n_bits)

In [19]:
X = np.array([smiles_to_fingerprint(smiles) for smiles in train_data['SMILES']])
y_pkm2 = train_data['PKM2_inhibition'].values
y_erk2 = train_data['ERK2_inhibition'].values

In [20]:
X_train, X_val, y_pkm2_train, y_pkm2_val, y_erk2_train, y_erk2_val = train_test_split(
    X, y_pkm2, y_erk2, test_size=0.2, random_state=42
)

In [21]:
def build_cnn_model(input_shape):
    model = Sequential([
        Conv1D(32, 3, activation='relu', input_shape=input_shape),
        MaxPooling1D(2),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

input_shape = (X_train.shape[1], 1)

In [22]:
def print_model_performance(model, X_val, y_val, label):
    y_pred = (model.predict(X_val) > 0.5).astype(int).flatten()
    accuracy = accuracy_score(y_val, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
    print(f"{label} Model Performance:")
    print(f"Accuracy: {accuracy * 100:.4f}%")
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print("-" * 30)

In [23]:
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_val_cnn = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))

In [24]:
model_pkm2 = build_cnn_model(input_shape)
model_pkm2.fit(X_train_cnn, y_pkm2_train, epochs=10, batch_size=32, validation_data=(X_val_cnn, y_pkm2_val))

print_model_performance(model_pkm2, X_val_cnn, y_pkm2_val, "PKM2")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8875 - loss: 0.2418 - val_accuracy: 0.9732 - val_loss: 0.1128
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9743 - loss: 0.0952 - val_accuracy: 0.9732 - val_loss: 0.0939
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9793 - loss: 0.0502 - val_accuracy: 0.9732 - val_loss: 0.0826
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9780 - loss: 0.0364 - val_accuracy: 0.9732 - val_loss: 0.1192
Epoch 5/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9789 - loss: 0.0208 - val_accuracy: 0.9732 - val_loss: 0.1287
Epoch 6/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9875 - loss: 0.0178 - val_accuracy: 0.9732 - val_loss: 0.1348
Epoch 7/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━

In [25]:
model_erk2 = build_cnn_model(input_shape)
model_erk2.fit(X_train_cnn, y_erk2_train, epochs=10, batch_size=32, validation_data=(X_val_cnn, y_erk2_val))

print_model_performance(model_erk2, X_val_cnn, y_erk2_val, "ERK2")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.8605 - loss: 0.2988 - val_accuracy: 0.9286 - val_loss: 0.3124
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9544 - loss: 0.1308 - val_accuracy: 0.9286 - val_loss: 0.2907
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9508 - loss: 0.0847 - val_accuracy: 0.9286 - val_loss: 0.4256
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9598 - loss: 0.0431 - val_accuracy: 0.9286 - val_loss: 0.5768
Epoch 5/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9725 - loss: 0.0339 - val_accuracy: 0.9241 - val_loss: 0.6311
Epoch 6/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9988 - loss: 0.0172 - val_accuracy: 0.9241 - val_loss: 0.7962
Epoch 7/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━

In [26]:
test_data = pd.read_csv('data/untested_molecules-3.csv')

test_data.head()

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
0,C[C@@H](Sc1nc(=O)cc(N)[nH]1)C(=O)NC1CCCCC1,,
1,O=C(CCN1C(=O)COc2ccccc21)NCc1cccs1,,
2,Cn1nnnc1SCC(=O)N1CC[NH+](Cc2ccccc2)CC1,,
3,CCOC(=O)CCP(=O)([O-])[C@@H](O)c1ccc(OC)cc1,,
4,C=CCNC(=O)c1cc(-c2ccccc2O)on1,,


In [27]:
X_test = np.array([smiles_to_fingerprint(smiles) for smiles in test_data['SMILES']])
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [28]:
threshold = 0.5
test_data['PKM2_inhibition'] = (model_pkm2.predict(X_test_cnn) > threshold).astype(int)
test_data['ERK2_inhibition'] = (model_erk2.predict(X_test_cnn) > threshold).astype(int)

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step


In [29]:
test_data.to_csv('pred.csv', index=False)