# Ensemble MLP Models

Train multiple MLP models with different initializations and ensemble their predictions.

In [None]:
import os
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    brier_score_loss,
    classification_report,
    confusion_matrix
)

# Configuration
# --- Directory layout (relative to the working directory) ---
DATA_DIR = "../data"
MODELS_DIR = "../models"
RESULTS_DIR = "../results"

# --- Input splits ---
TRAIN_FILE = os.path.join(DATA_DIR, "train.csv")
VAL_FILE = os.path.join(DATA_DIR, "val.csv")
TEST_FILE = os.path.join(DATA_DIR, "test.csv")

# --- Output artifacts ---
MODEL_FILE = os.path.join(MODELS_DIR, "mlp_model.pth")
PREDICTIONS_FILE = os.path.join(RESULTS_DIR, "mlp_predictions.csv")
METRICS_FILE = os.path.join(RESULTS_DIR, "mlp_metrics.json")

# Name of the label column in every CSV split.
TARGET_COL = "status"

# --- Optimisation hyper-parameters ---
BATCH_SIZE = 64
MAX_EPOCHS = 200
LEARNING_RATE = 0.0005
PATIENCE = 20  # epochs without validation-AUC improvement before early stopping
WEIGHT_DECAY = 1e-5

# --- Focal loss hyper-parameters ---
FOCAL_ALPHA = 0.3
FOCAL_GAMMA = 2.5

# Number of independently seeded models in the ensemble.
NUM_MODELS = 5

# Use the GPU when one is visible, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)

# Make sure both output directories exist before anything is written to them.
for _output_dir in (MODELS_DIR, RESULTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Define Focal Loss
class FocalLoss(nn.Module):
    """Focal loss computed on probabilities (not logits) for binary targets.

    Each element's binary cross-entropy is scaled by
    ``alpha * (1 - pt) ** gamma`` where ``pt = exp(-bce)``, so elements the
    model already gets right contribute less to the total.

    Note that ``alpha`` is applied uniformly to every element regardless of
    its target value: it rescales the loss but does not re-balance classes.

    Args:
        alpha: Constant multiplier applied to every element's loss.
        gamma: Focusing exponent; larger values down-weight easy examples more.
        reduction: ``'mean'`` (default, the original behaviour), ``'sum'`` or
            ``'none'`` (return the per-element loss tensor).

    Raises:
        ValueError: If ``reduction`` is not one of the supported values.
    """

    _REDUCTIONS = ('mean', 'sum', 'none')

    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super().__init__()
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss between probabilities and targets.

        Args:
            inputs: Predicted probabilities; passed straight to
                ``binary_cross_entropy``, so they must lie in [0, 1].
            targets: Float tensor of the same shape as ``inputs``.

        Returns:
            A scalar tensor for ``'mean'``/``'sum'``, otherwise a tensor with
            the same shape as ``inputs``.
        """
        bce_loss = nn.functional.binary_cross_entropy(inputs, targets, reduction='none')
        # exp(-bce) recovers the probability the model assigned to the true class.
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

# Define Residual Block
class ResidualBlock(nn.Module):
    """Two-layer fully connected block with an identity skip connection.

    The main path is Linear -> BatchNorm -> ReLU -> Dropout -> Linear ->
    BatchNorm. Its result is added to the block input, then passed through
    ReLU and a final Dropout. Input and output widths are both ``dim``.

    Args:
        dim: Feature width of the block.
        dropout: Dropout probability used inside the main path and after the
            skip connection.
    """

    def __init__(self, dim, dropout=0.3):
        super().__init__()
        main_path = [
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
        ]
        self.block = nn.Sequential(*main_path)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``dropout(relu(block(x) + x))``."""
        summed = self.block(x) + x
        activated = self.relu(summed)
        return self.dropout(activated)

# Define Deep Residual MLP
class DeepResidualMLP(nn.Module):
    """Residual MLP that narrows from ``hidden_dim`` to a single probability.

    Layout: an input projection to ``hidden_dim`` followed by three residual
    blocks, a projection to ``hidden_dim // 2`` followed by one residual
    block, a projection to ``hidden_dim // 4`` followed by one residual
    block, and finally a Linear + Sigmoid head producing one value per row.

    Dropout shrinks with depth: ``dropout`` on the input projection, then
    ``0.8``, ``0.6`` and ``0.4`` times ``dropout`` in later stages.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the first (widest) stage.
        dropout: Base dropout probability.
    """

    def __init__(self, input_dim, hidden_dim=512, dropout=0.5):
        super().__init__()

        half_dim = hidden_dim // 2
        quarter_dim = hidden_dim // 4

        # Modules are created in the same order as the data flows through them.
        self.input_layer = self._dense_stage(input_dim, hidden_dim, dropout)
        self.res_blocks = nn.ModuleList(
            [ResidualBlock(hidden_dim, dropout=dropout * 0.8) for _ in range(3)]
        )

        self.down1 = self._dense_stage(hidden_dim, half_dim, dropout * 0.8)
        self.res_blocks2 = nn.ModuleList(
            [ResidualBlock(half_dim, dropout=dropout * 0.6)]
        )

        self.down2 = self._dense_stage(half_dim, quarter_dim, dropout * 0.6)
        self.res_blocks3 = nn.ModuleList(
            [ResidualBlock(quarter_dim, dropout=dropout * 0.4)]
        )

        self.output = nn.Sequential(nn.Linear(quarter_dim, 1), nn.Sigmoid())

    @staticmethod
    def _dense_stage(in_features, out_features, drop_prob):
        """Build a Linear -> BatchNorm -> ReLU -> Dropout projection stage."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(drop_prob),
        )

    def forward(self, x):
        """Run every (projection, residual blocks) stage, then the sigmoid head."""
        stages = (
            (self.input_layer, self.res_blocks),
            (self.down1, self.res_blocks2),
            (self.down2, self.res_blocks3),
        )
        for projection, residual_stack in stages:
            x = projection(x)
            for block in residual_stack:
                x = block(x)
        return self.output(x)

In [None]:
# Load data
# Read the three CSV splits in train / validation / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, VAL_FILE, TEST_FILE)
)

# Every column except TARGET_COL is a feature; TARGET_COL is the label.
X_train, y_train = train_df.drop(columns=[TARGET_COL]).values, train_df[TARGET_COL].values
X_val, y_val = val_df.drop(columns=[TARGET_COL]).values, val_df[TARGET_COL].values
X_test, y_test = test_df.drop(columns=[TARGET_COL]).values, test_df[TARGET_COL].values

# Report the (rows, features) shape of each split.
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Training function
def train_epoch(model, train_loader, criterion, optimizer, max_grad_norm=0.5, device=None):
    """Run one optimisation pass over ``train_loader`` and return the mean loss.

    The returned value is a per-sample average: each batch loss is weighted by
    the number of samples in that batch, so a short final batch does not skew
    the result. This assumes ``criterion`` returns a mean-reduced loss.

    Args:
        model: Module to train; switched to training mode.
        train_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar
            tensor.
        optimizer: Optimizer bound to ``model``'s parameters.
        max_grad_norm: Gradient-norm clipping threshold applied before every
            optimizer step.
        device: Device to move batches to; defaults to the module-level
            ``DEVICE``.

    Returns:
        Mean training loss per sample as a Python float.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    if device is None:
        device = DEVICE

    model.train()
    loss_sum = 0.0
    sample_count = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()

        # Clip after backward and before the step so the update uses clipped gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)

        optimizer.step()

        batch_samples = y_batch.size(0)
        loss_sum += loss.item() * batch_samples
        sample_count += batch_samples

    if sample_count == 0:
        raise ValueError("train_loader yielded no samples")

    return loss_sum / sample_count

def validate(model, val_loader, criterion, device=None):
    """Return the mean loss of ``model`` over ``val_loader`` without training.

    The model is put in evaluation mode and gradients are disabled. The result
    is a per-sample average: each batch loss is weighted by its batch size, so
    a short final batch does not skew the result. This assumes ``criterion``
    returns a mean-reduced loss.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        val_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar
            tensor.
        device: Device to move batches to; defaults to the module-level
            ``DEVICE``.

    Returns:
        Mean validation loss per sample as a Python float.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    if device is None:
        device = DEVICE

    model.eval()
    loss_sum = 0.0
    sample_count = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            batch_samples = y_batch.size(0)
            loss_sum += loss.item() * batch_samples
            sample_count += batch_samples

    if sample_count == 0:
        raise ValueError("val_loader yielded no samples")

    return loss_sum / sample_count

def get_predictions(model, X, batch_size=None, device=None):
    """Return the model's outputs for ``X`` as a flat NumPy array.

    The model is put in evaluation mode and gradients are disabled.

    Args:
        model: Module mapping a 2-D float tensor to one output per row.
        X: Array-like of numeric features; converted to ``float32``.
        batch_size: When given, rows are pushed through the model in chunks of
            this size to bound memory use. ``None`` (default) processes all
            rows in a single forward pass, as before.
        device: Device to run on; defaults to the module-level ``DEVICE``.

    Returns:
        1-D ``numpy.ndarray`` with the flattened model outputs.
    """
    if device is None:
        device = DEVICE

    model.eval()
    # Going through NumPy first also accepts object-dtype arrays of numbers.
    X_tensor = torch.as_tensor(np.asarray(X, dtype=np.float32))

    if batch_size is None:
        chunks = (X_tensor,)
    else:
        chunks = torch.split(X_tensor, batch_size)

    with torch.no_grad():
        outputs = [model(chunk.to(device)).cpu() for chunk in chunks]

    return torch.cat(outputs).numpy().flatten()

In [None]:
# Train ensemble of models
# Each member uses the same architecture and data but a different random seed,
# which changes both the weight initialisation and the shuffling order.
models = []
val_predictions = []
test_predictions = []

# The tensors and datasets do not depend on the seed, so build them once.
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train).unsqueeze(1)
X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

input_dim = X_train.shape[1]

# BatchNorm1d cannot compute batch statistics from a single sample in training
# mode, so drop the trailing batch only when it would hold exactly one sample.
drop_single_sample_batch = (
    len(train_dataset) > BATCH_SIZE and len(train_dataset) % BATCH_SIZE == 1
)

for model_idx in range(NUM_MODELS):
    print(f"\n{'='*60}")
    print(f"Training Model {model_idx + 1}/{NUM_MODELS}")
    print(f"{'='*60}\n")
    
    # Set different seed for each model
    seed = 42 + model_idx
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    
    # Create dataloaders with different shuffle (created after seeding)
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=drop_single_sample_batch,
    )
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    # Initialize model
    model = DeepResidualMLP(input_dim, hidden_dim=512, dropout=0.5).to(DEVICE)
    criterion = FocalLoss(alpha=FOCAL_ALPHA, gamma=FOCAL_GAMMA)
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=15, T_mult=2, eta_min=1e-7)
    
    print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")
    
    # Train with early stopping on validation AUC
    best_val_auc = float("-inf")
    patience_counter = 0
    best_model_state = None
    
    for epoch in range(MAX_EPOCHS):
        train_loss = train_epoch(model, train_loader, criterion, optimizer)
        val_loss = validate(model, val_loader, criterion)
        
        # Calculate validation AUC
        val_proba = get_predictions(model, X_val)
        val_auc = roc_auc_score(y_val, val_proba)
        
        scheduler.step()
        
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{MAX_EPOCHS} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}")
        
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            patience_counter = 0
            # state_dict() hands back references to the live tensors, so a
            # shallow dict copy would keep changing as training continues.
            # Clone every tensor to freeze the snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
        
        if patience_counter >= PATIENCE:
            print(f"Early stopping at epoch {epoch+1}")
            break
    
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    else:
        # No epoch produced a comparable AUC (or MAX_EPOCHS is 0): keep the
        # weights the model currently has rather than crashing.
        print("Warning: no best checkpoint was recorded; keeping final weights")
    print(f"\nBest validation AUC for model {model_idx + 1}: {best_val_auc:.4f}")
    
    # Store model and predictions
    models.append(model)
    val_predictions.append(get_predictions(model, X_val))
    test_predictions.append(get_predictions(model, X_test))

print(f"\n{'='*60}")
print("All models trained!")
print(f"{'='*60}")

In [None]:
# Ensemble predictions (average)
# Soft voting: the ensemble probability is the element-wise mean of the
# per-model probabilities (axis 0 indexes the models).
val_ensemble_proba = np.asarray(val_predictions).mean(axis=0)
test_ensemble_proba = np.asarray(test_predictions).mean(axis=0)

# Hard labels use a fixed 0.5 probability cut-off.
val_ensemble_pred = (val_ensemble_proba >= 0.5).astype(int)
test_ensemble_pred = (test_ensemble_proba >= 0.5).astype(int)

# Evaluate ensemble on validation set
val_brier = brier_score_loss(y_val, val_ensemble_proba)
val_auc_pr = average_precision_score(y_val, val_ensemble_proba)
val_auc_roc = roc_auc_score(y_val, val_ensemble_proba)

val_report = classification_report(y_val, val_ensemble_pred)
val_confusion = confusion_matrix(y_val, val_ensemble_pred)

print("Ensemble Validation Performance:")
print(f"AUC-ROC: {val_auc_roc:.4f}")
print(f"AUC-PR: {val_auc_pr:.4f}")
print(f"Brier Score: {val_brier:.4f}")
print("\nClassification Report:")
print(val_report)
print("\nConfusion Matrix:")
print(val_confusion)

In [None]:
# Evaluate ensemble on test set
# Same metrics as for the validation split, computed on the held-out test split.
test_brier = brier_score_loss(y_test, test_ensemble_proba)
test_auc_pr = average_precision_score(y_test, test_ensemble_proba)
test_auc_roc = roc_auc_score(y_test, test_ensemble_proba)

test_report = classification_report(y_test, test_ensemble_pred)
test_confusion = confusion_matrix(y_test, test_ensemble_pred)

print("Ensemble Test Performance:")
print(f"AUC-ROC: {test_auc_roc:.4f}")
print(f"AUC-PR: {test_auc_pr:.4f}")
print(f"Brier Score: {test_brier:.4f}")
print("\nClassification Report:")
print(test_report)
print("\nConfusion Matrix:")
print(test_confusion)

In [None]:
# Save best model from ensemble
# Rank the members by their individual validation AUC-ROC so that the saved
# checkpoint really is the best single model, not simply the first one trained.
member_val_aucs = [roc_auc_score(y_val, member_proba) for member_proba in val_predictions]
best_model_idx = int(np.argmax(member_val_aucs))

# Store CPU tensors so the checkpoint also loads on machines without a GPU.
best_state_dict = {
    name: tensor.detach().cpu()
    for name, tensor in models[best_model_idx].state_dict().items()
}
torch.save({
    'model_state_dict': best_state_dict,
    'input_dim': input_dim
}, MODEL_FILE)
print(f"Best model saved to {MODEL_FILE}")
print(
    f"Selected ensemble member {best_model_idx + 1}/{NUM_MODELS} "
    f"(validation AUC-ROC: {member_val_aucs[best_model_idx]:.4f})"
)

# Save ensemble predictions
# Only the validation-split predictions are written; every row is tagged
# with dataset='validation'.
predictions_df = pd.DataFrame({
    'true_label': y_val,
    'predicted_probability': val_ensemble_proba,
    'predicted_label': val_ensemble_pred,
    'dataset': 'validation'
})
predictions_df.to_csv(PREDICTIONS_FILE, index=False)
print(f"Ensemble predictions saved to {PREDICTIONS_FILE}")

# Save metrics
# Hyper-parameters plus the ensemble's validation and test scores, as JSON.
all_metrics = {
    'architecture': f'Ensemble of {NUM_MODELS} DeepResidualMLP models',
    'base_architecture': 'Input → 512 (3x ResBlocks) → 256 (ResBlock) → 128 (ResBlock) → 1',
    'features': 'Deep residual connections, BatchNorm, Focal Loss, Model averaging',
    'num_models': NUM_MODELS,
    'dropout': '0.5 → 0.4 → 0.3 → 0.2',
    'batch_size': BATCH_SIZE,
    'learning_rate': LEARNING_RATE,
    'weight_decay': WEIGHT_DECAY,
    'optimizer': 'AdamW',
    'scheduler': 'CosineAnnealingWarmRestarts',
    'loss_function': f'FocalLoss(alpha={FOCAL_ALPHA}, gamma={FOCAL_GAMMA})',
    'max_epochs': MAX_EPOCHS,
    'patience': PATIENCE,
    'gradient_clipping': 0.5,
    'validation_metrics': {
        'auc_roc': float(val_auc_roc),
        'auc_pr': float(val_auc_pr),
        'brier_score': float(val_brier),
        'dataset': 'Validation'
    },
    'test_metrics': {
        'auc_roc': float(test_auc_roc),
        'auc_pr': float(test_auc_pr),
        'brier_score': float(test_brier),
        'dataset': 'Test'
    }
}

# json.dump escapes non-ASCII characters by default; the explicit encoding
# just keeps the file independent of the platform's default.
with open(METRICS_FILE, 'w', encoding='utf-8') as f:
    json.dump(all_metrics, f, indent=4)
print(f"Metrics saved to {METRICS_FILE}")