In [1]:
import torch 

In [2]:
import torch.nn as nn
import torch.optim as optim

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, matthews_corrcoef, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import pandas as pd
import numpy as np
import glob

In [4]:
# Configuration
receptor = 'GCR'

# Set random seeds for reproducibility
RS = 42 # StratifiedKfold and SMOTE seed
nn_seed = 20 # MLPC seed

# Load SMILES strings
def load_smiles(file_path):
    """Read a text file and return its non-blank lines as a list of strings.

    Args:
        file_path: Path of the file to read (one SMILES string per line).

    Returns:
        list[str]: Each line with surrounding whitespace stripped, in file
        order. Blank lines are skipped so that, for example, a trailing
        newline at the end of the file is not passed on as an empty SMILES
        string.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        return [smiles for smiles in stripped_lines if smiles]

# Load data
# NOTE: load_smiles must be defined before this point; calling it ahead of its
# definition raises NameError when the notebook is run top to bottom on a
# fresh kernel.
train_test_smiles = load_smiles(f'{receptor}_TT_SMILES.smi')
validation_smiles = load_smiles(f'Validation/{receptor}_V_SMILES.smi')
train_test_labels_df = pd.read_excel(f'{receptor}_MLinput.xlsx', header=None)
validation_labels_df = pd.read_excel(f'Validation/{receptor}_Validation.xlsx', header=None)

# Labels are taken from the second column of each spreadsheet
labels_train_test = train_test_labels_df.iloc[:, 1].values
labels_validation = validation_labels_df.iloc[:, 1].values

# Fail early, with a readable message, if SMILES and labels are misaligned
for _name, _smiles, _labels in (
    ('train/test', train_test_smiles, labels_train_test),
    ('validation', validation_smiles, labels_validation),
):
    if len(_smiles) != len(_labels):
        raise ValueError(
            f'{_name}: {len(_smiles)} SMILES strings but {len(_labels)} labels'
        )

# Calculate MACCS Keys
def calculate_maccs_keys(smiles_list):
    """Convert SMILES strings into a 2-D array of MACCS key fingerprints.

    Args:
        smiles_list: Iterable of SMILES strings.

    Returns:
        np.ndarray with one row per input string, in input order, and 167
        columns. A SMILES string that RDKit cannot parse produces an all-zero
        row and a printed warning, so row positions stay aligned with the
        input. An empty input returns an array of shape (0, 167).
    """
    # RDKit's MACCS fingerprint is 167 bits long (bit 0 is an unused
    # placeholder so that bit i corresponds to MACCS key i). The zero
    # placeholder must use the same length, otherwise a single unparseable
    # SMILES makes the rows ragged and the result is no longer a 2-D matrix.
    n_bits = 167

    maccs_keys = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            maccs_keys.append(np.array(MACCSkeys.GenMACCSKeys(mol)))
        else:
            maccs_keys.append(np.zeros(n_bits))
            print('SMILES error, check .smi file')

    if not maccs_keys:
        # Keep the column dimension even when there is nothing to featurise
        return np.zeros((0, n_bits))
    return np.array(maccs_keys)

# Define MLPC Architecture
class MLPC(nn.Module):
    """Fully connected classifier: Linear -> ReLU -> Dropout blocks plus an output layer.

    With the default arguments the network is
    input_size -> 300 -> 150 -> 75 -> 2, with dropout 0.3 after each hidden
    ReLU. The forward pass returns raw logits (no softmax).

    Args:
        input_size: Number of input features.
        hidden_sizes: Widths of the hidden layers, in order.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after every hidden activation.
        seed: Seed used for weight initialisation. When None, the
            module-level ``nn_seed`` is used.

    Note:
        Initialisation calls ``torch.manual_seed``, which reseeds torch's
        global random number generator as a side effect.
    """

    def __init__(self, input_size, hidden_sizes=(300, 150, 75), num_classes=2,
                 dropout=0.3, seed=None):
        super().__init__()
        layers = []
        in_features = input_size
        for width in hidden_sizes:
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        layers.append(nn.Linear(in_features, num_classes))
        self.model = nn.Sequential(*layers)
        # No seeding is needed before building the layers: every Linear weight
        # and bias is overwritten in _initialize_weights, which reseeds first.
        self._initialize_weights(seed)

    def forward(self, x):
        """Return the logits for a batch ``x`` of shape (batch, input_size)."""
        return self.model(x)

    def _initialize_weights(self, seed=None):
        """Seed torch, then apply Xavier-uniform weights and zero biases to every Linear layer."""
        if seed is None:
            seed = nn_seed
        torch.manual_seed(seed)
        for layer in self.model:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias)

# Featurise the train/test and validation SMILES lists as MACCS key arrays
maccs_keys_train_test, maccs_keys_validation = map(
    calculate_maccs_keys,
    (train_test_smiles, validation_smiles),
)

# Training and evaluation
def _predict(model, features):
    """Return (class-1 probabilities, predicted labels) from a single forward pass in eval mode."""
    model.eval()
    with torch.no_grad():
        logits = model(features)
        probs = torch.softmax(logits, dim=1).numpy()[:, 1]
        preds = torch.argmax(logits, dim=1).numpy()
    return probs, preds


def _classification_metrics(y_true, preds, probs):
    """Return accuracy, balanced accuracy, MCC and ROC AUC as an ordered dict."""
    return {
        "Accuracy": accuracy_score(y_true, preds),
        "Balanced Accuracy": balanced_accuracy_score(y_true, preds),
        "MCC": matthews_corrcoef(y_true, preds),
        "AUC": roc_auc_score(y_true, probs),
    }


def _roc_records(fold, dataset, y_true, probs):
    """Return one record per ROC curve point for the given fold and dataset."""
    fpr_values, tpr_values, thresholds = roc_curve(y_true, probs)
    return [
        {"Fold": fold, "Dataset": dataset, "FPR": fpr, "TPR": tpr, "Threshold": th}
        for fpr, tpr, th in zip(fpr_values, tpr_values, thresholds)
    ]


def train_and_evaluate(X, y, X_val, y_val, num_folds=10, num_epochs=300,
                       lr=0.001, weight_decay=1e-5):
    """Cross-validate an MLPC on (X, y), score a validation set, and write CSV reports.

    For every stratified fold the training split is oversampled with SMOTE,
    a fresh MLPC is trained full-batch with Adam and cross-entropy loss, and
    the loss on both the training split and the held-out split is recorded
    after every epoch. "Train" metrics are computed on the SMOTE-resampled
    training split.

    Args:
        X: 2-D feature array for cross-validation.
        y: Label array aligned with the rows of X.
        X_val: 2-D feature array of the validation set.
        y_val: Label array aligned with the rows of X_val.
        num_folds: Number of stratified folds.
        num_epochs: Number of full-batch training epochs per fold.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        None. Four CSV files prefixed with the module-level ``receptor`` are
        written to the working directory: per-fold metrics, per-epoch losses,
        ROC curve points, and validation metrics.
    """
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=RS)
    smote = SMOTE(random_state=RS)

    # Containers for metrics, losses, and AUC data
    fold_metrics = []
    epoch_losses = []
    auc_data = []

    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        fold = fold_idx + 1
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Handle class imbalance (only the training split is resampled)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # Convert to PyTorch tensors
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.long)
        y_test_tensor = torch.tensor(y_test, dtype=torch.long)

        # Initialize model, loss function, optimizer
        model = MLPC(input_size=X.shape[1])
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

        # Train the model, tracking the loss on both splits after each step
        train_losses = []
        test_losses = []
        for _ in range(num_epochs):
            model.train()
            optimizer.zero_grad()
            train_loss = criterion(model(X_train_tensor), y_train_tensor)
            train_loss.backward()
            optimizer.step()
            train_losses.append(train_loss.item())

            model.eval()
            with torch.no_grad():
                test_loss = criterion(model(X_test_tensor), y_test_tensor)
                test_losses.append(test_loss.item())

        # All training records for the fold first, then all testing records
        epoch_losses.extend(
            {"Fold": fold, "Epoch": epoch, "Dataset": dataset, "Loss": loss}
            for dataset, losses in (("Training", train_losses), ("Testing", test_losses))
            for epoch, loss in enumerate(losses, start=1)
        )

        # Evaluate on training and testing sets (one forward pass per split)
        train_probs, train_preds = _predict(model, X_train_tensor)
        test_probs, test_preds = _predict(model, X_test_tensor)

        train_scores = _classification_metrics(y_train, train_preds, train_probs)
        test_scores = _classification_metrics(y_test, test_preds, test_probs)
        fold_metrics.append({
            "Fold": fold,
            **{f"Train {name}": value for name, value in train_scores.items()},
            **{f"Test {name}": value for name, value in test_scores.items()},
        })

        # Save AUC data for ROC curves
        auc_data.extend(_roc_records(fold, "Training", y_train, train_probs))
        auc_data.extend(_roc_records(fold, "Testing", y_test, test_probs))

    # Validation metrics
    # NOTE(review): `model` here is whichever model the final fold produced;
    # the models of the earlier folds are discarded. Confirm this is the
    # intended model to score on the validation set.
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
    val_probs, val_preds = _predict(model, X_val_tensor)
    validation_metrics = _classification_metrics(y_val, val_preds, val_probs)

    # Save results to CSV files
    pd.DataFrame(fold_metrics).to_csv(f"{receptor}_MACCS_best_model_fold_metrics.csv", index=False)
    pd.DataFrame(epoch_losses).to_csv(f"{receptor}_MACCS_epoch_losses.csv", index=False)
    pd.DataFrame(auc_data).to_csv(f"{receptor}_MACCS_auc_data.csv", index=False)
    pd.DataFrame([validation_metrics]).to_csv(f"{receptor}_MACCS_validation_metrics.csv", index=False)

    return None

# Cross-validate on the train/test set, score the validation set, and write the CSV reports
train_and_evaluate(
    X=maccs_keys_train_test,
    y=labels_train_test,
    X_val=maccs_keys_validation,
    y_val=labels_validation,
)