In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import time
import matplotlib.pyplot as plt


In [None]:
# --- 1. Setup ---
# A single seed value is applied to both random number generators used in
# this notebook (PyTorch and NumPy).
RANDOM_SEED = 42

# torch.manual_seed returns a Generator object; keeping the NumPy call last
# means the cell still ends on an expression that evaluates to None.
torch.manual_seed(seed=RANDOM_SEED)
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Columns of the CSV that are fed to the autoencoder, in tensor column order.
FEATURE_COLUMNS = ['Depression', 'Anxiety', 'Burnout', 'Stress']
swiss_data = pd.read_csv('D1_Swiss_processed.csv')

# Copy the selected columns into a float32 tensor of shape (rows, 4).
feature_tensor = torch.tensor(
    swiss_data.loc[:, FEATURE_COLUMNS].to_numpy(),
    dtype=torch.float32,
)

# The dataset wraps only the features: each batch is a 1-tuple, so the
# training code reads the inputs from position 0.
training_dataset = TensorDataset(feature_tensor)
train_dataloader = DataLoader(dataset=training_dataset, shuffle=True, batch_size=64)
print("Data loaded and preprocessed with shape:", feature_tensor.shape)

Data loaded and preprocessed with shape: torch.Size([886, 4])


In [None]:
# --- 2. Autoencoder Architecture ---
class Autoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side of the bottleneck.

    Encoder: input -> hidden -> latent. Decoder: latent -> hidden -> input.
    The selected activation follows every linear layer except the last decoder
    layer, whose output is left linear (``nn.Identity``). The latent code
    itself is passed through the activation, so with ``'ReLU'`` it is
    non-negative.

    Args:
        hidden_layer_size: Width of the hidden layer in both encoder and decoder.
        latent_dimension: Width of the bottleneck (default 2).
        activation_function: One of ``'ReLU'``, ``'Sigmoid'`` or ``'Tanh'``.
        input_dimension: Number of input features, which is also the number of
            reconstructed outputs. Defaults to 4, the width that used to be
            hard-coded, so existing callers are unaffected.

    Raises:
        KeyError: If ``activation_function`` is not one of the supported names.
    """

    def __init__(self, hidden_layer_size, latent_dimension=2, activation_function='ReLU',
                 input_dimension=4):
        super().__init__()
        activation_dict = {'ReLU': nn.ReLU, 'Sigmoid': nn.Sigmoid, 'Tanh': nn.Tanh}
        try:
            activation_layer = activation_dict[activation_function]
        except KeyError:
            # Same exception type as the plain dict lookup, but with a message
            # that tells the caller which names are accepted.
            raise KeyError(
                f"Unknown activation_function {activation_function!r}; "
                f"expected one of {sorted(activation_dict)}"
            ) from None

        # Layers are created in the same order as before (encoder first, then
        # decoder) so a given seed yields the same initial weights.
        self.encoder = nn.Sequential(
            nn.Linear(input_dimension, hidden_layer_size),
            activation_layer(),
            nn.Linear(hidden_layer_size, latent_dimension),
            activation_layer()
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dimension, hidden_layer_size),
            activation_layer(),
            nn.Linear(hidden_layer_size, input_dimension),
            nn.Identity()
        )

    def forward(self, input_data):
        """Return ``(reconstruction, latent_code)`` for a batch of inputs."""
        encoded_representation = self.encoder(input_data)
        reconstructed_output = self.decoder(encoded_representation)
        return reconstructed_output, encoded_representation

def train_single_configuration(model, optimizer, loss_function, dataloader, num_epochs=20):
    """Train ``model`` to reconstruct its inputs and record the loss of every epoch.

    Each batch produced by ``dataloader`` is indexed with ``[0]`` to obtain the
    inputs, which double as the reconstruction target. ``model`` must return a
    pair whose first element is the reconstruction.

    Args:
        model: Module to optimise; it is switched to training mode.
        optimizer: Optimizer bound to ``model``'s parameters.
        loss_function: Callable ``(reconstruction, inputs) -> scalar tensor``.
        dataloader: Sized iterable of batches.
        num_epochs: Number of full passes over ``dataloader``.

    Returns:
        A list with one float per epoch: the unweighted mean of that epoch's
        batch losses.
    """

    def optimisation_step(inputs):
        # One gradient update on a single batch; returns the loss as a float.
        optimizer.zero_grad()
        outputs, _latent = model(inputs)
        loss = loss_function(outputs, inputs)
        loss.backward()
        optimizer.step()
        return loss.item()

    model.train()
    loss_history = []

    for _epoch in range(num_epochs):
        running_total = 0
        for batch in dataloader:
            running_total += optimisation_step(batch[0])
        # Mean over batches, not over samples.
        loss_history.append(running_total / len(dataloader))

    return loss_history

In [None]:
# --- 3. Hyperparameter Grid Search ---
# Every combination of the values below is trained from scratch once per seed.
hidden_layer_sizes = [3, 4, 5, 6, 8, 10]
latent_dimensions = [2, 3]  # test interpretability vs reconstruction trade-off
# Factories rather than optimizer instances: each run needs an optimizer bound
# to the parameters of its own freshly built model.
optimizer_configs = {
    'Adam': lambda model_params: optim.Adam(model_params, lr=1e-3, betas=(0.9, 0.999)),
    'SGD':  lambda model_params: optim.SGD(model_params, lr=1e-3, momentum=0.9)
}
training_epochs = [20, 50, 100]
random_seeds = [0, 42, 123]

loss_criterion = nn.MSELoss()
# Maps (hidden_size, latent_dim, optimizer_name, epochs) -> list of per-seed run records.
experiment_results = defaultdict(list)

# Flatten the search space into one explicit list so the progress total is
# derived from the same source as the loop itself. Iteration order matches the
# nesting hidden size > latent dim > optimizer > epochs > seed.
experiment_grid = [
    (hidden_size, latent_dim, optimizer_name, optimizer_factory, epochs, seed)
    for hidden_size in hidden_layer_sizes
    for latent_dim in latent_dimensions
    for optimizer_name, optimizer_factory in optimizer_configs.items()
    for epochs in training_epochs
    for seed in random_seeds
]
total_experiments = len(experiment_grid)
current_experiment = 0  # stays 0 if the grid is empty

for current_experiment, (hidden_size, latent_dim, optimizer_name,
                         optimizer_factory, epochs, seed) in enumerate(experiment_grid, start=1):
    print(f"Running experiment {current_experiment}/{total_experiments}: "
          f"hidden={hidden_size}, latent={latent_dim}, opt={optimizer_name}, "
          f"epochs={epochs}, seed={seed}")

    # Re-seed both generators before the model is built so each run starts
    # from a state determined only by its seed.
    torch.manual_seed(seed)
    np.random.seed(seed)

    autoencoder_model = Autoencoder(
        hidden_layer_size=hidden_size,
        latent_dimension=latent_dim,
        activation_function='ReLU'
    )
    model_optimizer = optimizer_factory(autoencoder_model.parameters())

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() follows the wall clock and can jump if the system
    # time is adjusted mid-run.
    experiment_start_time = time.perf_counter()
    training_losses = train_single_configuration(
        autoencoder_model, model_optimizer, loss_criterion,
        train_dataloader, num_epochs=epochs
    )
    experiment_duration = time.perf_counter() - experiment_start_time

    config_key = (hidden_size, latent_dim, optimizer_name, epochs)
    experiment_results[config_key].append({
        'random_seed': seed,
        'final_training_loss': training_losses[-1],
        'minimum_training_loss': min(training_losses),
        'training_time_seconds': experiment_duration
    })

print("\nHyperparameter validation complete!")

In [None]:
# --- 4. Results Aggregation and Analysis ---
# json and pickle are only needed by this cell; importing them first means a
# missing module fails before any output file is written.
import json
import pickle

if not experiment_results:
    raise RuntimeError("No experiment results to aggregate; run the grid search cell first.")

# Output column stem -> key of the per-run record it summarises. The order here
# fixes the column order of the results table and CSV.
summarised_metrics = (
    ('final_loss', 'final_training_loss'),
    ('minimum_loss', 'minimum_training_loss'),
    ('training_time', 'training_time_seconds'),
)

# One row per configuration: mean and population standard deviation (np.std
# default, ddof=0) of each metric across that configuration's seeds.
aggregated_results = []
for configuration_key, experimental_runs in experiment_results.items():
    hidden_size, latent_dim, optimizer_name, epochs = configuration_key
    summary_row = {
        'hidden_layer_size': hidden_size,
        'latent_dimension': latent_dim,
        'optimizer_type': optimizer_name,
        'training_epochs': epochs,
    }
    for column_stem, run_field in summarised_metrics:
        per_seed_values = [run[run_field] for run in experimental_runs]
        summary_row[f'mean_{column_stem}'] = np.mean(per_seed_values)
        summary_row[f'std_{column_stem}'] = np.std(per_seed_values)
    aggregated_results.append(summary_row)

results_dataframe = pd.DataFrame(aggregated_results).sort_values([
    'hidden_layer_size', 'latent_dimension', 'optimizer_type', 'training_epochs'
])

print("=== HYPERPARAMETER VALIDATION RESULTS ===")
print(results_dataframe.to_string(index=False))

# Find best performing configuration (lowest mean final training loss).
# idxmin returns an index label, so it is paired with .loc; the sort above
# does not reset the index.
best_config_idx = results_dataframe['mean_final_loss'].idxmin()
best_configuration = results_dataframe.loc[best_config_idx]

# Formatted once and reused for both the console and the summary file so the
# two can never disagree.
best_configuration_lines = [
    f"Hidden Layer Size: {best_configuration['hidden_layer_size']}",
    f"Latent Dimension: {best_configuration['latent_dimension']}",
    f"Optimizer: {best_configuration['optimizer_type']}",
    f"Training Epochs: {best_configuration['training_epochs']}",
    f"Mean Final Loss: {best_configuration['mean_final_loss']:.6f}",
    f"Mean Training Time: {best_configuration['mean_training_time']:.2f}s",
]

print("\n=== BEST CONFIGURATION ===")
print("\n".join(best_configuration_lines))

# --- SAVE RESULTS FOR LOCAL ANALYSIS ---
# Save all results as CSV
results_dataframe.to_csv('hyperparameter_results.csv', index=False)
print("\n Results saved to 'hyperparameter_results.csv'")

# Save best config as JSON. Values are converted to built-in int/float
# because the json module cannot serialise NumPy scalar types.
best_config_dict = {
    'hidden_layer_size': int(best_configuration['hidden_layer_size']),
    'latent_dimension': int(best_configuration['latent_dimension']),
    'optimizer_type': best_configuration['optimizer_type'],
    'training_epochs': int(best_configuration['training_epochs']),
    'mean_final_loss': float(best_configuration['mean_final_loss']),
    'mean_training_time': float(best_configuration['mean_training_time'])
}

with open('best_hyperparameters.json', 'w') as json_file:
    json.dump(best_config_dict, json_file, indent=2)
print(" Best configuration saved to 'best_hyperparameters.json'")

# Save raw experiment data (converted from defaultdict to a plain dict).
with open('experiment_results.pkl', 'wb') as pickle_file:
    pickle.dump(dict(experiment_results), pickle_file)
print(" Raw experiment data saved to 'experiment_results.pkl'")

# Create summary report. The run count is taken from the records actually
# stored, so an interrupted grid search is reported truthfully instead of
# echoing the planned grid size.
completed_experiments = sum(len(runs) for runs in experiment_results.values())

with open('validation_summary.txt', 'w') as summary_file:
    summary_file.write("=== HYPERPARAMETER VALIDATION SUMMARY ===\n")
    summary_file.write(f"Total experiments run: {completed_experiments}\n")
    summary_file.write("Best configuration:\n")
    summary_file.writelines(f"  {line}\n" for line in best_configuration_lines)

print(" Summary report saved to 'validation_summary.txt'")
print("\n All results saved! Ready to transfer back to local machine.")

In [None]:
# --- 5. Visualization of Results ---
# Four panels on a 2x2 grid, addressed through explicit Axes handles.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
comparison_ax, hidden_ax, epochs_ax, time_ax = axes.flat

# Configuration for comparison plot
comparison_hidden_size = 5
comparison_epochs = 50

# Panel 1: one bar per (optimizer, latent dimension) at the fixed hidden size
# and epoch count chosen above.
fixed_slice = results_dataframe[
    (results_dataframe['hidden_layer_size'] == comparison_hidden_size)
    & (results_dataframe['training_epochs'] == comparison_epochs)
]
for latent_dim in latent_dimensions:
    for optimizer_name in optimizer_configs:
        matching_rows = fixed_slice[
            (fixed_slice['latent_dimension'] == latent_dim)
            & (fixed_slice['optimizer_type'] == optimizer_name)
        ]
        if matching_rows.empty:
            continue
        comparison_ax.bar(f"{optimizer_name}_{latent_dim}D",
                          matching_rows['mean_final_loss'].iloc[0])

comparison_ax.set_ylabel("Mean Final Loss")
comparison_ax.set_title(
    f"Optimizer & Latent Dimension Comparison\nHidden={comparison_hidden_size}, Epochs={comparison_epochs}"
)
comparison_ax.grid(axis='y', alpha=0.3)

# Panel 2: loss by hidden layer size, averaged over all other settings.
hidden_size_losses = results_dataframe.groupby('hidden_layer_size')['mean_final_loss'].mean()
hidden_ax.plot(hidden_size_losses.index, hidden_size_losses.values, marker='o')
hidden_ax.set_xlabel("Hidden Layer Size")
hidden_ax.set_ylabel("Mean Final Loss")
hidden_ax.set_title("Performance vs Hidden Layer Size")
hidden_ax.grid(alpha=0.3)

# Panel 3: loss by training epochs, averaged over all other settings.
epoch_losses = results_dataframe.groupby('training_epochs')['mean_final_loss'].mean()
epochs_ax.plot(epoch_losses.index, epoch_losses.values, marker='s')
epochs_ax.set_xlabel("Training Epochs")
epochs_ax.set_ylabel("Mean Final Loss")
epochs_ax.set_title("Performance vs Training Epochs")
epochs_ax.grid(alpha=0.3)

# Panel 4: training time vs performance, one point per configuration,
# coloured by hidden layer size.
time_points = time_ax.scatter(
    results_dataframe['mean_training_time'],
    results_dataframe['mean_final_loss'],
    alpha=0.6,
    c=results_dataframe['hidden_layer_size'],
    cmap='viridis',
)
time_ax.set_xlabel("Mean Training Time (seconds)")
time_ax.set_ylabel("Mean Final Loss")
time_ax.set_title("Training Time vs Performance")
fig.colorbar(time_points, ax=time_ax, label='Hidden Layer Size')
time_ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()