In [1]:
import torch

In [2]:
import torch.nn as nn
from torch.optim import Adam
import torch.nn.init as init

In [3]:
import numpy as np
import pandas as pd
import glob
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn.metrics import matthews_corrcoef, accuracy_score, balanced_accuracy_score, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from Element_PI_JC import VariancePersist_JC

Rips(maxdim=1, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)


In [4]:
# File paths and configurations
receptor = 'GCR'

# Structure files and label rows are paired purely by position: the i-th file in the
# sorted glob result is assumed to belong to the i-th row of the spreadsheet.
# NOTE(review): sorted() orders path strings lexicographically (e.g. "GCR_10.xyz" sorts
# before "GCR_2.xyz") -- confirm the spreadsheet rows follow that same order.
train_test_files = sorted(glob.glob(f'{receptor}_*.xyz'))
validation_files = sorted(glob.glob(f'Validation/{receptor}_*.xyz'))
train_test_labels_df = pd.read_excel(f'{receptor}_MLinput.xlsx', header=None)
validation_labels_df = pd.read_excel(f'Validation/{receptor}_Validation.xlsx', header=None)

# Class labels are read from the second spreadsheet column (no header row).
labels_train_test = train_test_labels_df.iloc[:, 1].values
labels_validation = validation_labels_df.iloc[:, 1].values


def _check_alignment(set_name, files, labels):
    """Fail fast when a file list is empty or its length differs from its label column."""
    if len(files) == 0:
        raise FileNotFoundError(f"No .xyz files found for the {set_name} set of receptor {receptor!r}")
    if len(files) != len(labels):
        raise ValueError(
            f"{set_name} set: found {len(files)} .xyz files but {len(labels)} label rows; "
            "features and labels are matched by position and must have equal length"
        )


# Catch a file/label mismatch now rather than after the persistence images have been
# generated (or, for the validation set, after a complete 10-fold training run).
_check_alignment("train/test", train_test_files, labels_train_test)
_check_alignment("validation", validation_files, labels_validation)

# TPI Hyperparameter Ranges for Grid Search
pixelx_range = list(range(20, 41))                 # 20, 21, ..., 40 (pixely is set equal to pixelx)
spread_range = [k / 100 for k in range(1, 11)]     # 0.01, 0.02, ..., 0.10
myspecs = {"maxBD": 2.5, "minBD": -0.1}            # forwarded unchanged to VariancePersist_JC

# MLPC hyperparameters
RS = 42 # Random seed for StratifiedKFold and SMOTE
nn_seed = 20 # MLPC weight-initialisation seed, for reproducibility
epochs = 300 # MLPC epochs
learning = 0.001 # MLPC learning rate (Adam)
decay = 1e-5 # MLPC L2 regularization (Adam weight_decay)

# Track the best model (highest validation MCC seen so far across the grid)
best_mcc = -np.inf
best_model_info = {
    "fold_metrics": [],
    "epoch_losses": [],
    "validation_results": {},
    "auc_data": [],
    "hyperparameters": {}
}

# Define MLPC
class MLPC(nn.Module):
    """Multi-layer perceptron classifier: input -> 300 -> 150 -> 75 -> 2.

    Every hidden Linear layer is followed by ReLU and Dropout(0.3); the last
    Linear layer returns two raw logits (no softmax is applied here).
    Constructing an instance re-seeds torch's global RNG with ``seed``.
    """

    def __init__(self, input_size, seed=nn_seed):
        super().__init__()
        torch.manual_seed(seed)

        # Consecutive pairs of widths define the hidden Linear layers.
        widths = (input_size, 300, 150, 75)
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.3)])
        stack.append(nn.Linear(75, 2))

        self.model = nn.Sequential(*stack)
        self._initialize_weights(seed)

    def forward(self, x):
        """Return the logits produced by the sequential stack for ``x``."""
        return self.model(x)

    def _initialize_weights(self, seed):
        """Re-seed torch, then Xavier-uniform every Linear weight and zero its bias."""
        torch.manual_seed(seed)
        linear_layers = [m for m in self.model.children() if isinstance(m, nn.Linear)]
        for linear in linear_layers:
            init.xavier_uniform_(linear.weight)
            if linear.bias is None:
                continue
            init.zeros_(linear.bias)

# Generate Persistence Images
def generate_reduced_persistence_images(xyz_files, pixelx, pixely, spread, myspecs):
    """Compute one VariancePersist_JC result per file and stack them with ``np.array``.

    Parameters
    ----------
    xyz_files : iterable
        Items passed one at a time as the first argument of ``VariancePersist_JC``.
    pixelx, pixely : forwarded as the ``pixelx`` / ``pixely`` keyword arguments.
    spread : forwarded as the ``myspread`` keyword argument.
    myspecs : forwarded as the ``myspecs`` keyword argument.

    Returns
    -------
    numpy.ndarray
        The per-file results in input order (plotting is disabled via ``showplot=False``).
    """
    images = [
        VariancePersist_JC(
            path,
            pixelx=pixelx,
            pixely=pixely,
            myspread=spread,
            myspecs=myspecs,
            showplot=False,
        )
        for path in xyz_files
    ]
    return np.array(images)

# Main Grid Search
#
# For every (pixel size, spread) pair: build persistence-image features, run a 10-fold
# stratified cross-validation of the MLPC, score the external validation set, and keep
# the bookkeeping of the configuration with the highest validation MCC.


def _predict(net, features):
    """Score ``features`` with ``net`` in eval mode and without gradient tracking.

    Returns ``(probs, preds)`` as NumPy arrays taken from a single forward pass:
    the softmax probability of class index 1 and the arg-max class index.
    """
    net.eval()
    with torch.no_grad():
        logits = net(features)
        probs = torch.softmax(logits, dim=1).numpy()[:, 1]
        preds = torch.max(logits, 1)[1].numpy()
    return probs, preds


def _binary_metrics(y_true, y_pred, y_prob):
    """Return accuracy, balanced accuracy, MCC and ROC AUC as a dict."""
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_true, y_pred),
        "MCC": matthews_corrcoef(y_true, y_pred),
        "AUC": roc_auc_score(label_binarize(y_true, classes=[0, 1]), y_prob),
    }


def _roc_rows(fold, dataset, y_true, y_prob):
    """Return one record per ROC operating point for the given fold and dataset."""
    fpr, tpr, thresholds = roc_curve(label_binarize(y_true, classes=[0, 1]), y_prob)
    return [
        {"Fold": fold, "Dataset": dataset, "FPR": f, "TPR": t, "Threshold": th}
        for f, t, th in zip(fpr, tpr, thresholds)
    ]


criterion = nn.CrossEntropyLoss()  # holds no per-fold state, so one instance serves every fold

for pixelx in pixelx_range:
    pixely = pixelx
    for myspread in spread_range:
        train_test_features = generate_reduced_persistence_images(train_test_files, pixelx, pixely, myspread, myspecs)
        validation_features = generate_reduced_persistence_images(validation_files, pixelx, pixely, myspread, myspecs)
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RS)

        fold_metrics = []
        epoch_losses = []
        auc_data = []

        # Split the ORIGINAL samples first and oversample inside each fold. Running SMOTE
        # before the split lets synthetic points interpolated from held-out samples leak
        # into the training data (and puts synthetic points in the test folds), which
        # inflates the cross-validated test metrics.
        for fold_idx, (train_idx, test_idx) in enumerate(skf.split(train_test_features, labels_train_test)):
            fold = fold_idx + 1
            X_test, y_test = train_test_features[test_idx], labels_train_test[test_idx]
            smote = SMOTE(random_state=RS)
            X_train, y_train = smote.fit_resample(train_test_features[train_idx], labels_train_test[train_idx])

            X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
            y_train_tensor = torch.tensor(y_train, dtype=torch.long)
            X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
            y_test_tensor = torch.tensor(y_test, dtype=torch.long)

            model = MLPC(input_size=X_train_tensor.shape[1])
            optimizer = Adam(model.parameters(), lr=learning, weight_decay=decay)

            # Full-batch training: one optimizer step per epoch on the whole training fold,
            # followed by an eval-mode loss on the held-out fold for the learning curve.
            train_losses = []
            test_losses = []
            for epoch in range(epochs):
                model.train()
                optimizer.zero_grad()
                train_loss = criterion(model(X_train_tensor), y_train_tensor)
                train_loss.backward()
                optimizer.step()
                train_losses.append(train_loss.item())

                model.eval()
                with torch.no_grad():
                    test_loss = criterion(model(X_test_tensor), y_test_tensor)
                    test_losses.append(test_loss.item())

            # Training rows first, then testing rows, each ordered by epoch.
            for dataset, losses in (("Training", train_losses), ("Testing", test_losses)):
                epoch_losses.extend(
                    {"Fold": fold, "Epoch": e, "Dataset": dataset, "Loss": loss}
                    for e, loss in enumerate(losses, start=1)
                )

            train_probs, train_pred = _predict(model, X_train_tensor)
            test_probs, test_pred = _predict(model, X_test_tensor)

            # Training metrics are computed on the SMOTE-resampled training fold.
            train_metrics = _binary_metrics(y_train, train_pred, train_probs)
            test_metrics = _binary_metrics(y_test, test_pred, test_probs)

            fold_metrics.append({
                "Fold": fold,
                **{f"Train {k}": v for k, v in train_metrics.items()},
                **{f"Test {k}": v for k, v in test_metrics.items()},
            })

            # Log AUC data for ROC curves
            auc_data.extend(_roc_rows(fold, "Training", y_train, train_probs))
            auc_data.extend(_roc_rows(fold, "Testing", y_test, test_probs))

        # NOTE(review): `model` here is whatever the LAST fold trained; the other nine fold
        # models are discarded. Confirm this is intended rather than, e.g., refitting on
        # the full train/test set before scoring the validation compounds.
        validation_tensor = torch.tensor(validation_features, dtype=torch.float32)
        validation_probs, validation_pred = _predict(model, validation_tensor)
        validation_metrics = _binary_metrics(labels_validation, validation_pred, validation_probs)

        # NOTE(review): hyperparameters are selected on validation MCC, so the stored
        # validation metrics of the winner are optimistically biased -- confirm intent.
        if validation_metrics["MCC"] > best_mcc:
            best_mcc = validation_metrics["MCC"]
            best_model_info["fold_metrics"] = fold_metrics
            best_model_info["epoch_losses"] = epoch_losses
            best_model_info["validation_results"] = validation_metrics
            best_model_info["auc_data"] = auc_data
            best_model_info["hyperparameters"] = {"pixelx": pixelx, "pixely": pixely, "spread": myspread}

# Save results for the best model
# Each entry: (key in best_model_info, output CSV path, whether the payload is a single
# dict that must be wrapped in a list to become a one-row table).
_best_model_exports = [
    ("fold_metrics", f"{receptor}_TPI_best_model_fold_metrics.csv", False),
    ("epoch_losses", f"{receptor}_TPI_best_model_epoch_losses.csv", False),
    ("validation_results", f"{receptor}_TPI_best_model_validation_metrics.csv", True),
    ("auc_data", f"{receptor}_TPI_best_model_auc_data.csv", False),
    ("hyperparameters", f"{receptor}_TPI_best_model_hyperparameters.csv", True),
]

for _key, _csv_path, _single_row in _best_model_exports:
    _payload = best_model_info[_key]
    _frame = pd.DataFrame([_payload]) if _single_row else pd.DataFrame(_payload)
    _frame.to_csv(_csv_path, index=False)

print("Best model results saved to CSV Flipped_PI files.")


Best model results saved to CSV Flipped_PI files.
