In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, default 2
        Radius passed to the Morgan fingerprint routine.
    n_bits : int, default 2048
        Length of the returned bit vector.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_bits``. When RDKit cannot parse ``smiles``
        an all-zero integer array of the same length is returned, so the
        caller always gets one row per input molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None. Use an
        # integer placeholder so a single bad SMILES does not silently
        # change the dtype of the stacked feature matrix to float.
        return np.zeros(n_bits, dtype=int)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(fp)
    
def evaluate_model(model, X_val, y_val, label):
    """Print a short classification report for ``model`` on a validation set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_val
        Validation features passed to ``model.predict``.
    y_val
        True labels compared against the predictions.
    label : str
        Text used as the heading of the printed report.

    Returns
    -------
    None
        The report (balanced accuracy, precision, recall, F1 and the
        confusion matrix) is written to standard output only.
    """
    predicted = model.predict(X_val)

    # Score the same predictions with every scalar metric, in report order.
    scalar_metrics = (balanced_accuracy_score, precision_score, recall_score, f1_score)
    bal_acc, prec, rec, f1_val = (metric(y_val, predicted) for metric in scalar_metrics)
    matrix = confusion_matrix(y_val, predicted)

    report_lines = [
        f"{label} Model Performance:",
        f"Balanced Accuracy: {bal_acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-score: {f1_val:.4f}",
        f"Confusion Matrix:\n{matrix}",
        "-" * 50,
    ]
    print("\n".join(report_lines))

In [3]:
# Load the labelled molecules and turn every SMILES string into a fingerprint row.
train_data = pd.read_csv('data/tested_molecules.csv')

X = np.array([smiles_to_fingerprint(smiles) for smiles in train_data['SMILES']])
y_pkm2 = train_data['PKM2_inhibition'].values
y_erk2 = train_data['ERK2_inhibition'].values

# Split the features and both label vectors in ONE call so they are all
# indexed by the same row permutation. Two separate calls only stay aligned
# as long as their seed, test_size and input length happen to match.
(X_train, X_val,
 y_train_pkm2, y_val_pkm2,
 y_train_erk2, y_val_erk2) = train_test_split(
    X, y_pkm2, y_erk2, test_size=0.2, random_state=42
)

# Standardise -> reduce with PCA -> classify with an MLP.
# PCA may select a randomized SVD solver and the MLP draws random initial
# weights, so both are seeded to make repeated runs give the same scores.
pipeline = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('pca', PCA(random_state=42)),
    ('mlp', MLPClassifier(random_state=42))
])

# Hyperparameters that affect every candidate. The pipeline's MLPClassifier
# uses the default solver ('adam'), so the SGD-only settings `learning_rate`
# (schedule) and `momentum` are not searched: adam ignores them and they
# would only multiply the number of identical fits by nine.
_mlp_common_grid = {
    'pca__n_components': [5, 10, 20, 30, 50, 100],
    'mlp__learning_rate_init': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'mlp__max_iter': [100, 250, 500, 1000],
    'mlp__hidden_layer_sizes': [(100,), (1000,), (100, 100), (1000, 1000)],
    'mlp__alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'mlp__beta_1': [1-1e-1, 1-1e-2, 1-1e-3, 1-1e-4, 1-1e-5],
    'mlp__beta_2': [1-1e-1, 1-1e-2, 1-1e-3, 1-1e-4, 1-1e-5],
}

# `validation_fraction` is only used when early stopping is enabled, so it is
# varied in that branch alone. A list of dicts is searched as the union of
# the individual grids (60,000 + 180,000 candidates instead of 3,240,000).
parameters = [
    {
        **_mlp_common_grid,
        'mlp__early_stopping': [False],
    },
    {
        **_mlp_common_grid,
        'mlp__early_stopping': [True],
        'mlp__validation_fraction': [0.1, 0.2, 0.5],
    },
]

In [ ]:
# The search space in `parameters` is far too large to fit exhaustively with
# 5-fold cross-validation, so sample a fixed budget of candidates instead.
# The search is seeded for repeatability and the fits run on all cores.
clf_pkm2 = RandomizedSearchCV(
    pipeline,
    parameters,
    n_iter=50,  # candidates sampled; raise for a more thorough search
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1,
    random_state=42,
)
clf_pkm2.fit(X_train, y_train_pkm2)

evaluate_model(clf_pkm2, X_val, y_val_pkm2, "MLP Pipeline for PKM2")

In [ ]:
# The search space in `parameters` is far too large to fit exhaustively with
# 5-fold cross-validation, so sample a fixed budget of candidates instead.
# The search is seeded for repeatability and the fits run on all cores.
clf_erk2 = RandomizedSearchCV(
    pipeline,
    parameters,
    n_iter=50,  # candidates sampled; raise for a more thorough search
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1,
    random_state=42,
)
clf_erk2.fit(X_train, y_train_erk2)

evaluate_model(clf_erk2, X_val, y_val_erk2, "MLP Pipeline for ERK2")

In [ ]:
# Featurise the unlabelled molecules, then store each fitted search's
# predictions in the column named after its target.
test_data = pd.read_csv('data/untested_molecules.csv')
X_test = np.array(list(map(smiles_to_fingerprint, test_data['SMILES'])))

for target_column, fitted_search in (
    ('PKM2_inhibition', clf_pkm2),
    ('ERK2_inhibition', clf_erk2),
):
    test_data[target_column] = fitted_search.predict(X_test)

In [ ]:
# Rows of the untested set whose ERK2 prediction equals 1.
test_data.loc[test_data['ERK2_inhibition'].eq(1)]

In [ ]:
# Rows of the untested set whose PKM2 prediction equals 1.
test_data.loc[test_data['PKM2_inhibition'].eq(1)]