In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, precision_recall_curve, f1_score

In [25]:
# Function to calculate molecular descriptors
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {}
    descriptors = {desc_name: desc_func(mol) for desc_name, desc_func in Descriptors._descList}
    return descriptors

In [26]:
# Load data
data = pd.read_csv(r'C:\Users\20212049\Documents\Jaar 3\Kwart 4\Advanced Programming\Group Assignment\tested_molecules.csv')
data['descriptors'] = data['SMILES'].apply(calculate_descriptors)
descriptors_df = pd.DataFrame(data['descriptors'].tolist())
descriptors_df['PKM2_inhibition'] = data['PKM2_inhibition']
descriptors_df['ERK2_inhibition'] = data['ERK2_inhibition']

# Drop rows with NaN values
descriptors_df.dropna(inplace=True)

In [27]:
# Split data for PKM2 and ERK2 inhibition modeling
def prepare_data(target):
    X = descriptors_df.drop(['PKM2_inhibition', 'ERK2_inhibition'], axis=1)
    y = descriptors_df[target].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test, X.columns

In [28]:
# Define the model structure
def build_model(input_dim):
    model = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [29]:
# Function to predict classes based on the modified threshold
def predict_classes(model, X, threshold=0.5):
    proba = model.predict(X)
    return (proba > threshold).astype(int)

In [30]:
# Function to calculate optimal threshold based on F1 score
def calculate_optimal_threshold(y_true, probas_pred):
    precision, recall, thresholds = precision_recall_curve(y_true, probas_pred)
    f1_scores = 2 * (precision * recall) / (precision + recall)
    f1_scores = np.nan_to_num(f1_scores)
    optimal_idx = np.argmax(f1_scores)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold, f1_scores[optimal_idx]

In [31]:
# Train and evaluate the model
def train_and_evaluate(target):
    X_train_scaled, X_test_scaled, y_train, y_test, feature_names = prepare_data(target)
    model = build_model(X_train_scaled.shape[1])
    model.fit(X_train_scaled, y_train, epochs=5, batch_size=10, verbose=1, validation_split=0.1)
    proba = model.predict(X_test_scaled).ravel()
    optimal_threshold, max_f1_score = calculate_optimal_threshold(y_test, proba)
    y_pred = predict_classes(model, X_test_scaled, threshold=optimal_threshold)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print(f"Results for {target}:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Maximum F1 Score: {max_f1_score:.3f}")

    # Analyze feature importances
    weights = model.layers[0].get_weights()[0]  # Get weights of the first layer
    mean_weights = np.mean(np.abs(weights), axis=1)  # Average the weights across nodes
    feature_importance = pd.Series(mean_weights, index=feature_names)
    print("Feature Importances for", target)
    print(feature_importance.sort_values(ascending=False))

    cm = confusion_matrix(y_test, y_pred)
    print(cm)

In [32]:
train_and_evaluate('PKM2_inhibition')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  f1_scores = 2 * (precision * recall) / (precision + recall)


Results for PKM2_inhibition:
Accuracy: 0.95
Precision: 0.25
Recall: 0.50
Maximum F1 Score: 0.421
Feature Importances for PKM2_inhibition
FractionCSP3    0.078709
PEOE_VSA6       0.075639
PEOE_VSA2       0.075389
MolLogP         0.075292
fr_C_O          0.075068
                  ...   
fr_thiophene    0.060977
SlogP_VSA12     0.060848
SlogP_VSA10     0.060739
fr_hdrzine      0.060166
fr_lactone      0.060045
Length: 208, dtype: float32
[[209   9]
 [  3   3]]


In [37]:
train_and_evaluate('ERK2_inhibition')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Results for ERK2_inhibition:
Accuracy: 0.86
Precision: 0.17
Recall: 0.25
Maximum F1 Score: 0.244
Feature Importances for ERK2_inhibition
fr_amidine        0.077608
BalabanJ          0.076843
BCUT2D_MWHI       0.076739
VSA_EState8       0.075907
EState_VSA10      0.075128
                    ...   
fr_ketone         0.059714
BertzCT           0.058721
EState_VSA2       0.058347
fr_aryl_methyl    0.057369
Chi2v             0.057185
Length: 208, dtype: float32
[[188  20]
 [ 12   4]]


  f1_scores = 2 * (precision * recall) / (precision + recall)
