<center><h1><b>Comprehensive Regularization Analysis to Combat Overfitting</b></h1></center>

# **Introduction**
In this exercise, we focus on time-series forecasting using the Electricity Transformer Dataset (ETT), a non-stationary, seasonally varying dataset for predicting electricity consumption 24–96 hours ahead. The model is a small Transformer encoder that uses a large input window and no dropout or other regularization, which makes it highly prone to **overfitting**. To address this, we analyze in depth why each **regularization** method works, how different regularizers interact, and how to diagnose and correct overfitting in real-world cases.

# **Plan**
1. **Diagnostic Analysis**: Perform an extensive diagnostic analysis on a severely overfitted pre-trained model using training/validation curves, weight distributions, activations, gradients, and saliency maps.  
2. **Implement & Compare Regularizers**: Implement and evaluate at least eight regularization methods (e.g., L1/L2, Dropout variants, Normalization, Early Stopping, Augmentation) using prediction error (MAE), generalization gap, computational cost, and visual analyses.  
3. **Deep Analytical Investigation**: Address key research questions on Dropout failure modes, ablation studies, empirical bias-variance decomposition, and generalization vs. model complexity.

# Libraries

In [None]:
import os
import time, datetime
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import urllib.request
from sklearn.manifold import TSNE
from collections import defaultdict

In [None]:
# Seed NumPy and PyTorch with the same fixed value so repeated runs of the
# notebook draw the same random numbers.
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}\n")

# Data Loading and Normalization Section

In [None]:
# TODO: Load the ETTh1 dataset from `dataset_path` (download it first if the file is missing)
# hint: you can use urllib.request.urlretrieve for downloading if the file doesn't exist

# TODO: Apply proper normalization for time series data
# hint: use z-score normalization (mean=0, std=1) with StandardScaler from sklearn
# hint: fit the scaler on the training portion only and reuse it for the test
#       portion, otherwise test statistics leak into training

# Location of the CSV; the same path is read again later to detect input_dim.
dataset_path = './data/ETTh1.csv'
# Placeholders: all three stay None until the TODOs below are implemented.
train_dataset = None
test_dataset = None
train_subset = None

# TODO: Create train_dataset and test_dataset using your custom dataset class or DataLoader logic
# hint: split the dataset into train (80%) and test (20%)


# Use a fraction of the training set to induce overfitting.
# TODO: Assign train_subset: keep only ~15% of the training data to simulate overfitting
# hint: random subset selection using torch.randperm or random.sample



# Create the DataLoader for the training subset

In [None]:
# Batch iterators over the training subset and the test split; both stay None
# until the DataLoader TODOs are completed.
train_loader = None  #batch_size=32
test_loader = None   #batch_size=32


# TODO: Read feature names from the dataset (remove 'date' column)
feature_names = None

# Compare against None explicitly instead of truth-testing: truth-testing
# raises for array-like values (e.g. a pandas Index of column names) and would
# report an empty dataset as 'N/A' rather than 0.
print(f"Dataset: ETTh1")
print(f"Training samples: {len(train_subset) if train_subset is not None else 'N/A'} (~15% of train dataset)")
print(f"Test samples: {len(test_dataset) if test_dataset is not None else 'N/A'}")
print(f"Features: {', '.join(feature_names) if feature_names is not None else 'N/A'}")
print(f"Window size: 336, Forecast horizon: 24\n")


# Base Transformer Model



In [None]:
class TransformerBaseModel(nn.Module):
    """Unregularized Transformer-encoder baseline producing one value per sample.

    The input is projected to ``d_model`` features, passed through
    ``num_layers`` encoder layers, and the representation at the last
    position of dimension 1 is mapped to a single output.

    Args:
        input_dim: Size of the last input dimension (features per time step).
        d_model: Width of the encoder.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        # nn.TransformerEncoderLayer applies dropout=0.1 by default. The
        # baseline is meant to be regularization-free, so it is disabled
        # explicitly; otherwise every comparison against this model is skewed.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=0.0),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(d_model, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to a (batch, 1) output."""
        x = self.embedding(x)
        # The encoder is sequence-first (batch_first is left at its default),
        # so swap the first two axes on the way in and back on the way out.
        x = self.transformer_encoder(x.transpose(0, 1)).transpose(0, 1)
        # Only the final position along dimension 1 feeds the output head.
        x = self.fc_out(self.relu(x[:, -1, :]))
        return x

# Transformer With Dropout

In [None]:
class TransformerWithDropout(nn.Module):
    """Transformer-encoder forecaster regularized with dropout.

    ``dropout_rate`` is used inside the encoder layers and by an explicit
    ``nn.Dropout`` applied to the embedded input and to the features that
    feed the output head.

    Args:
        input_dim: Size of the last input dimension (features per time step).
        d_model: Width of the encoder.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dropout_rate: Drop probability shared by all dropout sites.
    """

    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=1, dropout_rate=0.5):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout_rate),
            num_layers=num_layers
        )

        # Explicit dropout layer; it holds no parameters, so state dicts stay
        # compatible with the other model variants.
        self.dropout = nn.Dropout(dropout_rate)

        self.fc_out = nn.Linear(d_model, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to a (batch, 1) output."""
        x = self.dropout(self.embedding(x))
        # The encoder is sequence-first (batch_first is left at its default).
        x = self.transformer_encoder(x.transpose(0, 1)).transpose(0, 1)
        x = self.dropout(self.relu(x[:, -1, :]))
        x = self.fc_out(x)
        return x


# Transformer With BatchNorm

In [None]:
class TransformerWithBatchNorm(nn.Module):
    """Transformer-encoder forecaster with batch normalization on the embedding.

    Args:
        input_dim: Size of the last input dimension (features per time step).
        d_model: Width of the encoder and number of normalized channels.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)

        # One set of normalization statistics per embedding channel.
        self.bn = nn.BatchNorm1d(d_model)

        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(d_model, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to a (batch, 1) output."""
        x = self.embedding(x)
        # BatchNorm1d treats dimension 1 as the channel axis, so move the
        # d_model axis there for normalization and move it back afterwards.
        x = self.bn(x.transpose(1, 2)).transpose(1, 2)
        # The encoder is sequence-first (batch_first is left at its default).
        x = self.transformer_encoder(x.transpose(0, 1)).transpose(0, 1)
        x = self.fc_out(self.relu(x[:, -1, :]))
        return x


# Training and Evaluation Functions

In [None]:
def train_epoch(model, loader, criterion, optimizer, device, l1_lambda=0.0):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Network to optimize; switched to training mode.
        loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss taking ``(predictions, targets)``; assumed to return a
            per-batch mean (e.g. ``nn.MSELoss()``) so that weighting by batch
            size gives a per-sample average.
        optimizer: Optimizer bound to the parameters of ``model``.
        device: Device the batches are moved to.
        l1_lambda: Weight of the L1 penalty on all parameters; 0 disables it.

    Returns:
        ``(average_loss, average_mae, epoch_time)``. The reported loss is the
        data term only (without the L1 penalty), so it stays comparable with
        the value returned by evaluation code.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train()
    total_loss, total_mae, total = 0.0, 0.0, 0
    start_time = time.time()
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        batch_size = inputs.size(0)

        optimizer.zero_grad()
        # Bring predictions and targets to (batch, -1) so that a (batch, 1)
        # output and a (batch,) target do not broadcast against each other.
        preds = model(inputs).reshape(batch_size, -1)
        targets = targets.reshape(batch_size, -1)
        loss = criterion(preds, targets)

        objective = loss
        if l1_lambda > 0:
            l1_penalty = sum(p.abs().sum() for p in model.parameters())
            objective = loss + l1_lambda * l1_penalty

        objective.backward()
        optimizer.step()

        total_loss += loss.item() * batch_size
        total_mae += (preds.detach() - targets).abs().mean().item() * batch_size
        total += batch_size
    epoch_time = time.time() - start_time

    if total == 0:
        raise ValueError("train_epoch received a loader that yielded no samples")
    return total_loss / total, total_mae / total, epoch_time


def evaluate(model, loader, criterion, device):
    """Compute the average loss and MAE of ``model`` over ``loader``.

    Args:
        model: Network to evaluate; switched to eval mode.
        loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss taking ``(predictions, targets)``; assumed to return a
            per-batch mean (e.g. ``nn.MSELoss()``).
        device: Device the batches are moved to.

    Returns:
        ``(average_loss, average_mae)`` as per-sample averages.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    total_loss, total_mae, total = 0.0, 0.0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            batch_size = inputs.size(0)
            # (batch, -1) on both sides avoids accidental broadcasting between
            # a (batch, 1) output and a (batch,) target.
            preds = model(inputs).reshape(batch_size, -1)
            targets = targets.reshape(batch_size, -1)

            total_loss += criterion(preds, targets).item() * batch_size
            total_mae += (preds - targets).abs().mean().item() * batch_size
            total += batch_size

    if total == 0:
        raise ValueError("evaluate received a loader that yielded no samples")
    return total_loss / total, total_mae / total


def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=10, l1_lambda=0.0):
    """Train ``model`` for ``epochs`` epochs, evaluating after each one.

    The model is moved to the module-level ``device``. A progress report is
    printed every fourth epoch.

    Args:
        model: Network to train.
        train_loader: Batches used for optimization.
        test_loader: Batches used for the per-epoch evaluation.
        criterion: Loss function forwarded to ``train_epoch`` / ``evaluate``.
        optimizer: Optimizer bound to the parameters of ``model``.
        epochs: Number of passes over ``train_loader``.
        l1_lambda: L1 penalty weight forwarded to ``train_epoch``.

    Returns:
        ``(train_losses, test_losses, train_maes, test_maes, total_time)``:
        four per-epoch lists followed by the summed training time in seconds
        (evaluation time is not included).
    """
    model = model.to(device)
    train_losses, test_losses, train_maes, test_maes = [], [], [], []
    train_times = []
    for epoch in range(epochs):
        train_loss, train_mae, epoch_time = train_epoch(
            model, train_loader, criterion, optimizer, device, l1_lambda
        )
        test_loss, test_mae = evaluate(model, test_loader, criterion, device)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_maes.append(train_mae)
        test_maes.append(test_mae)
        train_times.append(epoch_time)
        if (epoch + 1) % 4 == 0:
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"  Train Loss: {train_loss:.4f}, Train MAE: {train_mae:.4f}")
            print(f"  Test Loss: {test_loss:.4f}, Test MAE: {test_mae:.4f}")
            print(f"  Generalization Gap (MAE): {test_mae - train_mae:.4f}")
    total_time = sum(train_times)
    # Callers read the training time as element 4 of this tuple.
    return train_losses, test_losses, train_maes, test_maes, total_time


# Experiments

In [None]:
# Dictionary to store all results, keyed by method name
# (each experiment below adds one entry).
results = {}

# Check input dimensions and data shapes
# input_dim is the number of CSV columns left after dropping 'date'.
data = pd.read_csv(dataset_path).drop(columns=['date'])
input_dim = data.shape[1]
print(f"Detected input dimension: {input_dim}")
# Print the shapes of the first training batch only, then stop iterating.
for batch_x, batch_y in train_loader:
    print(f"Input shape: {batch_x.shape}, Target shape: {batch_y.shape}")
    break


# Experiment 1: Base Model (No Regularization)
print("=" * 70)
print("EXPERIMENT 1: Base Model (No Regularization)")
print("=" * 70)
model1 = TransformerBaseModel(input_dim=input_dim)
criterion1 = nn.MSELoss()
optimizer1 = torch.optim.Adam(model1.parameters(), lr=0.001)
# train_model() runs with its defaults here (epochs=10, l1_lambda=0.0).
results['Base Model'] = train_model(model1, train_loader, test_loader, criterion1, optimizer1)
# Element 4 of the stored result is read as the total training time.
base_time = results['Base Model'][4]


# Experiment 2: L1 Regularization
print("=" * 70)
print("EXPERIMENT 2: L1 Regularization (λ=0.0001)")
print("=" * 70)
# TODO: Initialize model, criterion, optimizer
# TODO: Train model with l1_lambda argument and store the result under
#       results['L1 Regularization']
# Expected afterwards: l1_time = results['L1 Regularization'][4]



# Experiment 3: L2 Regularization (Weight Decay)
print("=" * 70)
print("EXPERIMENT 3: L2 Regularization (λ=0.01)")
print("=" * 70)
# TODO: Initialize model, criterion, optimizer (with weight_decay)
# TODO: Train model using train_model() and store the result under
#       results['L2 Regularization']
# Expected afterwards: l2_time = results['L2 Regularization'][4]


# Experiment 4: Elastic Net (L1 + L2)
print("=" * 70)
print("EXPERIMENT 4: Elastic Net (L1=0.0001, L2=0.01)")
print("=" * 70)
# TODO: Initialize model, criterion, optimizer (with both L1 and L2 regularization)
# TODO: Train and save results in 'Elastic Net'
# Expected afterwards: elastic_time = results['Elastic Net'][4]



# Experiment 5: Dropout
print("=" * 70)
print("EXPERIMENT 5: Dropout (rate=0.5)")
print("=" * 70)
# TODO: Initialize TransformerWithDropout model
# TODO: Define loss, optimizer and train; store the result under results['Dropout']
# Expected afterwards: dropout_time = results['Dropout'][4]


# Experiment 6: Batch Normalization
print("=" * 70)
print("EXPERIMENT 6: Batch Normalization")
print("=" * 70)
# TODO: Initialize TransformerWithBatchNorm model
# TODO: Define loss, optimizer and train; store the result under results['BatchNorm']
# Expected afterwards: bn_time = results['BatchNorm'][4]


# Experiment 7: Early Stopping
print("=" * 70)
print("EXPERIMENT 7: Early Stopping (patience=3)")
print("=" * 70)
# TODO: Initialize model, criterion, optimizer
# TODO: Implement training loop with early stopping logic
# TODO: Save final results in results['Early Stopping']
#       (keep the same layout as the other entries so element 4 is the time)
# Expected afterwards: early_time = total_time


# Experiment 8: Gaussian Noise (Data Augmentation)
print("=" * 70)
print("EXPERIMENT 8: Gaussian Noise (σ=0.1)")
print("=" * 70)
# TODO: Initialize model, criterion, optimizer
# TODO: Implement training loop that adds Gaussian noise to inputs
#       (noise belongs on the training inputs only; evaluate on clean data)

## Plot Training and Validation Loss/MAE Curves



In [None]:
# Diagnostic 1: learning curves of the base model.
print("=" * 70)
print("DIAGNOSTIC: Training and Validation Curves")
print("=" * 70)

# TODO: Retrieve training results for the Base Model from the 'results' dictionary
#       (Experiment 1 stores them under results['Base Model'])
# TODO: Extract train_losses, test_losses, train_maes, and test_maes from 'data'

# TODO: Create two subplots (1 row, 2 columns)
# TODO: Plot training and validation loss curves on ax1
# TODO: Set axis labels, title, legend, and grid for ax1

# TODO: Plot training and validation MAE curves on ax2
# TODO: Set axis labels, title, legend, and grid for ax2

# TODO: Adjust layout and save the figure as 'loss_mae_curves.png'
# NOTE(review): the message below is printed unconditionally; it is only
# accurate once the save step above has been implemented.
print("\n✓ Loss and MAE curves saved as 'loss_mae_curves.png'")

# TODO: Compute final_train_mae, final_test_mae, and overfitting_gap
#       (gap = final test MAE - final train MAE)
# TODO: Print the overfitting analysis summary
print("  Interpretation: Large gap indicates overfitting; model memorizes training data.")


# Weight Distribution Analysis

In [None]:
def get_weights(model):
    """Return the weight values of ``model`` as one flat numpy array.

    Only parameters with more than one dimension (weight matrices) are
    collected. One-dimensional parameters such as biases and normalization
    scales are skipped so they do not distort the weight histogram.

    Args:
        model: Any ``nn.Module``.

    Returns:
        1-D ``numpy.ndarray`` with all collected values; empty if the model
        has no multi-dimensional parameters.
    """
    chunks = [
        p.detach().cpu().numpy().ravel()
        for p in model.parameters()
        if p.dim() > 1
    ]
    if not chunks:
        return np.array([])
    return np.concatenate(chunks)

# Diagnostic 2: histogram of the trained base model's weights.
print("=" * 70)
print("DIAGNOSTIC: Weight Distribution")
print("=" * 70)

# TODO: Extract weights from the trained base model using get_weights()

# TODO: Plot a histogram of weight values

# TODO: Save the figure as 'weight_distribution.png'
# NOTE(review): printed unconditionally; only accurate once the figure is
# actually saved above.
print("\n✓ Weight distribution saved as 'weight_distribution.png'")

# TODO: Compute and print mean and standard deviation of the weights
print("  Interpretation: Large weights or high variance may indicate overfitting.")


# Activation Distribution Analysis

In [None]:
def get_activations(model, loader, device):
    """Collect the outputs of every ``nn.ReLU`` module while running ``loader``.

    Forward hooks are attached to all ``nn.ReLU`` submodules, the model is run
    in eval mode without gradients over every batch, and the hooks are removed
    again even if a forward pass fails. Activations applied functionally
    inside a layer are not ``nn.ReLU`` modules and are therefore not captured.

    Args:
        model: Network to inspect.
        loader: Iterable of ``(inputs, targets)`` batches; targets are ignored.
        device: Device the inputs are moved to.

    Returns:
        1-D ``numpy.ndarray`` of all captured activation values; empty if
        nothing was captured.
    """
    model.eval()
    activations = []

    def _capture(_module, _inputs, output):
        activations.append(output.detach().cpu().numpy().ravel())

    hooks = [
        module.register_forward_hook(_capture)
        for module in model.modules()
        if isinstance(module, nn.ReLU)
    ]

    try:
        with torch.no_grad():
            for inputs, _ in loader:
                model(inputs.to(device))
    finally:
        # Always detach the hooks so later forward passes are unaffected.
        for hook in hooks:
            hook.remove()

    if not activations:
        return np.array([])
    return np.concatenate(activations)

# Diagnostic 3: histogram of the trained base model's activations.
print("=" * 70)
print("DIAGNOSTIC: Activation Distribution")
print("=" * 70)

# TODO: Extract activations from the trained base model using get_activations()

# TODO: Plot a histogram of activations

# TODO: Save the figure as 'activation_distribution.png'
# NOTE(review): printed unconditionally; only accurate once the figure is
# actually saved above.
print("\n✓ Activation distribution saved as 'activation_distribution.png'")

# TODO: Compute mean and standard deviation of activations
# TODO: Print activation statistics and interpretation
print("  Interpretation: Large or highly concentrated activations may indicate overfitting.")


## Gradient Flow Analysis

In [None]:
def compute_gradient_flow(model, loader, criterion, device):
    """Measure per-parameter gradient norms on the first batch of ``loader``.

    The model is put in eval mode, one forward/backward pass is run on the
    first batch only, and the L2 norm of every parameter gradient is recorded.
    Gradients are cleared afterwards so the probe does not leak into later
    optimizer steps.

    Args:
        model: Network to inspect.
        loader: Iterable of ``(inputs, targets)`` batches; only the first
            batch is used.
        criterion: Loss taking ``(predictions, targets)``.
        device: Device the batch is moved to.

    Returns:
        List of ``(parameter_name, gradient_norm)`` tuples in
        ``named_parameters()`` order; empty if ``loader`` yields nothing.
    """
    model.eval()
    grad_norms = []

    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        batch_size = inputs.size(0)

        model.zero_grad()
        preds = model(inputs).reshape(batch_size, -1)
        loss = criterion(preds, targets.reshape(batch_size, -1))
        loss.backward()

        for name, param in model.named_parameters():
            # Parameters that did not take part in the loss have no gradient.
            if param.grad is not None:
                grad_norms.append((name, param.grad.norm().item()))
        model.zero_grad()
        break

    return grad_norms


# Diagnostic 4: distribution of per-parameter gradient norms.
print("=" * 70)
print("DIAGNOSTIC: Gradient Flow")
print("=" * 70)

# TODO: Define loss function (criterion)
# TODO: Compute gradient flow using the trained model and test loader
#       (compute_gradient_flow returns a list of (name, norm) tuples)

print("\nGradient Norms:")
# TODO: Loop through grad_norms and print parameter name and gradient norm

# TODO: Extract gradient norm values into a list
# TODO: Plot histogram of gradient norms

# TODO: Save the figure as 'gradient_flow.png'
# NOTE(review): printed unconditionally; only accurate once the figure is
# actually saved above.
print("\n✓ Gradient flow distribution saved as 'gradient_flow.png'")

# TODO: Compute mean and standard deviation of gradient norms
# TODO: Print summary statistics and interpretation
print("  Interpretation: Very small (vanishing) or large (exploding) gradients may indicate training issues or overfitting.")


# Saliency Maps Analysis

In [None]:
# Note: for simplicity, this exercise predicts only the *next* time step.
# The saliency model used in this cell nevertheless feeds *all* time steps
# into its output layer, so gradients reach the whole input sequence and the
# saliency map is easier to read.
print("=" * 70)
print("DIAGNOSTIC: Saliency Map Analysis (Enhanced Model for Clear Patterns)")
print("=" * 70)

def plot_saliency_map(model, loader, device, filename='saliency_map.png', title='Saliency Map'):
    """Plot and save a gradient-based saliency heatmap for one batch.

    Saliency is the absolute gradient of the summed model output with respect
    to the input, summed over the last input axis. Inputs are assumed to be
    (batch, time, features), giving one value per sample and time step.
    TODO confirm against the dataset.

    Args:
        model: Trained network; switched to eval mode.
        loader: Iterable of ``(inputs, targets)`` batches; only the first
            batch is used.
        device: Device the batch is moved to.
        filename: Path the figure is written to.
        title: Title of the heatmap.

    Returns:
        ``numpy.ndarray`` of saliency values for the whole batch.
    """
    model.eval()
    inputs, _ = next(iter(loader))
    # Gradients are taken with respect to the inputs, not the weights.
    inputs = inputs.to(device).clone().detach().requires_grad_(True)

    model.zero_grad()
    model(inputs).sum().backward()
    saliency = inputs.grad.detach().abs().sum(dim=-1).cpu().numpy()
    model.zero_grad()

    # Heatmap of the first few samples: one row per sample.
    n_show = min(5, saliency.shape[0])
    plt.figure(figsize=(12, 4))
    sns.heatmap(saliency[:n_show], cmap='viridis', cbar_kws={'label': 'Saliency'})
    plt.xlabel('Time step')
    plt.ylabel('Sample')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename, dpi=150)
    plt.show()
    print(f"\n✓ Saliency map saved as '{filename}'")

    print(f"  Mean saliency: {saliency.mean():.6f}, Max saliency: {saliency.max():.6f}")
    print("  Interpretation: Higher values = more influential time steps.")
    return saliency


# TODO: Extract sequence length (SEQ_LEN) and input dimension (input_dim) from one batch of data

# TODO: Define Transformer model architecture for saliency analysis
# This version uses *all time steps* in the output layer.
class TransformerForSaliency(nn.Module):
    """Transformer-encoder forecaster whose output head sees every time step.

    Unlike a model that reads only the last position, the encoder output of
    all ``seq_len`` positions is flattened and fed to the output layer, so
    input gradients are spread over the whole sequence.

    Args:
        input_dim: Size of the last input dimension (features per time step).
        seq_len: Number of time steps per sample; fixes the output-layer size.
        d_model: Width of the encoder.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dropout_rate: Drop probability in the encoder and before the head.
    """

    def __init__(self, input_dim, seq_len, d_model=64, nhead=4, num_layers=1, dropout_rate=0.3):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout_rate),
            num_layers=num_layers
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()
        # The head consumes all time steps at once.
        self.fc_out = nn.Linear(seq_len * d_model, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_dim) to a (batch, 1) output."""
        x = self.embedding(x)
        # The encoder is sequence-first (batch_first is left at its default).
        x = self.transformer_encoder(x.transpose(0, 1)).transpose(0, 1)
        x = self.dropout(self.relu(x))
        return self.fc_out(x.flatten(start_dim=1))


# TODO: Initialize device, model, loss function, and optimizer
# TODO: Train the model using the provided train_model() function

# TODO: Generate and visualize the saliency map using the trained model

# TODO: Print the analysis completion time
# Example:
# print(f"\nAnalysis completed at: {datetime.datetime.now().strftime('%I:%M %p CEST, %B %d, %Y')}")


## Plot results

In [None]:
def compute_loss_landscape(model, loader, criterion, device, steps=10, scale=0.01):
    """
    Compute a 2D loss landscape around the model's current parameters.

    Two random Gaussian directions in parameter space are drawn from the
    global torch RNG and scaled by ``scale``. The loss on the first batch of
    ``loader`` is then evaluated on a ``steps`` x ``steps`` grid of
    coefficients in [-1, 1] along those directions. The original parameters
    are restored afterwards, even if an evaluation raises.

    Args:
        model: Network to probe; switched to eval mode.
        loader: Iterable of ``(inputs, targets)`` batches; only the first
            batch is used.
        criterion: Loss taking ``(predictions, targets)``.
        device: Device the batch is moved to.
        steps: Number of grid points per direction.
        scale: Standard deviation of each direction's entries, i.e. the size
            of the per-weight perturbation at the edge of the grid.

    Returns:
        ``numpy.ndarray`` of shape ``(steps, steps)``; entry ``[i, j]`` is the
        loss at ``base + alphas[i] * direction_x + betas[j] * direction_y``.
    """
    model.eval()
    # Get a small batch of data
    x, y = next(iter(loader))
    x, y = x.to(device), y.to(device)

    # Collect parameters
    params = [p for p in model.parameters() if p.requires_grad]
    # Flatten parameters for perturbation
    base_params = torch.cat([p.detach().reshape(-1) for p in params]).clone()
    direction_x = torch.randn_like(base_params) * scale
    direction_y = torch.randn_like(base_params) * scale
    # Prepare loss grid
    alphas = np.linspace(-1.0, 1.0, steps)
    betas = np.linspace(-1.0, 1.0, steps)
    losses = np.zeros((steps, steps))

    def _load_flat(flat):
        # Write a flat parameter vector back into the model, tensor by tensor.
        idx = 0
        with torch.no_grad():
            for p in params:
                numel = p.numel()
                p.copy_(flat[idx:idx + numel].view_as(p))
                idx += numel

    try:
        for i, dx in enumerate(alphas):
            for j, dy in enumerate(betas):
                # float() keeps numpy scalars out of the tensor arithmetic.
                _load_flat(base_params + float(dx) * direction_x + float(dy) * direction_y)
                with torch.no_grad():
                    losses[i, j] = criterion(model(x).squeeze(), y.squeeze()).item()
    finally:
        # Restore original parameters
        _load_flat(base_params)

    return losses


def compute_tsne(model, loader, device, n_samples=100):
    """
    Compute t-SNE projection of model embeddings.

    The features are the encoder output at the last position of each
    sequence, taken from the first batch of ``loader`` (at most ``n_samples``
    rows). ``model`` must expose ``embedding`` and ``transformer_encoder``
    attributes, as the Transformer classes in this file do.

    Args:
        model: Trained network; switched to eval mode.
        loader: Iterable of ``(inputs, targets)`` batches; only the first
            batch is used.
        device: Device the batch is moved to.
        n_samples: Upper bound on the number of rows projected.

    Returns:
        ``numpy.ndarray`` of shape ``(n, 2)`` with the 2-D projection.

    Raises:
        ValueError: If fewer than two samples are available.
    """
    model.eval()
    x, _ = next(iter(loader))
    x = x[:n_samples].to(device)

    with torch.no_grad():
        hidden = model.embedding(x)
        # The encoder is sequence-first, as in the models' forward passes.
        hidden = model.transformer_encoder(hidden.transpose(0, 1)).transpose(0, 1)
        features = hidden[:, -1, :].cpu().numpy()

    n_points = features.shape[0]
    if n_points < 2:
        raise ValueError("compute_tsne needs at least two samples")
    # t-SNE requires perplexity < number of samples; a batch may hold fewer
    # rows than the library default of 30.
    perplexity = min(30, n_points - 1)
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    return tsne.fit_transform(features)


# Plot results
# 3x3 grid: the top row holds three MAE panels and the loss landscape spans
# the first two columns of the middle row; the other cells are not used here.
fig = plt.figure(figsize=(20, 15))
gs = fig.add_gridspec(3, 3)


# Plot 1: Training MAE
ax1 = fig.add_subplot(gs[0, 0])
# TODO: draw one training-MAE curve per method stored in `results`

# Plot 2: Test MAE (Generalization)
ax2 = fig.add_subplot(gs[0, 1])
# TODO: draw one test-MAE curve per method stored in `results`

# Plot 3: Generalization Gap (Test - Train)
ax3 = fig.add_subplot(gs[0, 2])
# TODO: draw test MAE minus train MAE per method

# Plot 4: Loss Landscape (Base Model)
ax4 = fig.add_subplot(gs[1, :2])
# TODO: visualize the grid returned by compute_loss_landscape() for the base model


# Analysis of Overfitting in the Base Model

After plotting and analyzing the above visualizations for the base model, identify where **overfitting** occurs (refer to the results of the plots to highlight signs of overfitting) and provide reasons for this. Expand on this by pinpointing *where* and *why* overfitting occurs, with findings presented through **multiple visualizations** and **mathematically grounded reasoning**.

# BEST MODEL ANALYSIS: Performance vs. Generalization

In [None]:
print("\n" + "=" * 80)
print("FINAL RESULTS SUMMARY (ETTh1)")
print("=" * 80)
print(f"{'Method':<30} | {'Train MAE':<10} | {'Test MAE':<10} | {'Gap':<10} | {'Time (s)'}")
print("-" * 80)


# Each entry of `results` is read as (train_losses, test_losses, train_maes,
# test_maes, total_time); element 4 is the training time, as in Experiment 1.
# NOTE(review): positions 2 and 3 are assumed to be the per-epoch train/test
# MAE lists; confirm every experiment stores its result in this layout.
final_metrics = {}
for method_name, run in results.items():
    final_train_mae = run[2][-1]
    final_test_mae = run[3][-1]
    gap = final_test_mae - final_train_mae
    final_metrics[method_name] = (final_train_mae, final_test_mae, gap)
    print(f"{method_name:<30} | {final_train_mae:<10.4f} | {final_test_mae:<10.4f} | {gap:<10.4f} | {run[4]:.2f}")


print("\n" + "=" * 80)
print("BEST MODEL ANALYSIS")
print("=" * 80)


def generalization_gap(method_name):
    """Return the final test-minus-train MAE of ``method_name``."""
    return final_metrics[method_name][2]


if final_metrics:
    # Lowest final test MAE = best predictive performance.
    best_test_name = min(final_metrics, key=lambda name: final_metrics[name][1])
    best_test_mae = final_metrics[best_test_name][1]

    # Smallest gap in magnitude = best generalization; the signed value is
    # what gets reported.
    best_gap_name = min(final_metrics, key=lambda name: abs(generalization_gap(name)))
    best_gap_value = generalization_gap(best_gap_name)

    print(f"Best Test MAE:      {best_test_name:<30} → {best_test_mae:.4f}")
    print(f" Smallest Gap:      {best_gap_name:<30} → {best_gap_value:.4f}")

    if best_test_name == best_gap_name:
        print("\n This model achieves both the lowest prediction error and the best generalization!")
    else:
        print(f"\n Trade-off detected: The best-performing model ({best_test_name}) is not the most stable.")
        print(f"   Consider your priority: raw accuracy vs. robustness to overfitting.")
else:
    print("No experiment results available; run the experiments first.")

# Use the machine's actual time zone name instead of a hard-coded "CEST",
# which mislabels the timestamp everywhere else.
print(f"\n Analysis completed at: {datetime.datetime.now().astimezone().strftime('%I:%M %p %Z, %B %d, %Y')}")


# Exploring the Limits of Dropout: Does More Regularization Always Help?

In [None]:
print("=" * 70)
print("EXPERIMENT 9: Dropout with Varying Rates")
print("=" * 70)

dropout_rates = [0.1, 0.3, 0.5, 0.7]

# Maps each dropout rate to the result returned by train_model().
results_dropout_vary = {}

# Train one fresh model per rate with the same loss, optimizer settings and
# number of epochs as the base experiment, so only the dropout rate varies.
for rate in dropout_rates:
    print(f"Dropout Rate: {rate}")

    sweep_model = TransformerWithDropout(input_dim=input_dim, dropout_rate=rate)

    sweep_criterion = nn.MSELoss()
    sweep_optimizer = torch.optim.Adam(sweep_model.parameters(), lr=0.001)

    results_dropout_vary[rate] = train_model(
        sweep_model, train_loader, test_loader, sweep_criterion, sweep_optimizer
    )


**Question:**  
You have observed how the model’s performance changes as the dropout rate increases.  
Under what specific conditions might higher dropout rates lead to worse generalization,  
even though dropout is a regularization technique?  
Explain the underlying reasons in terms of **bias**, **variance**, and **model capacity**.

## Questions

1. How does MAE (or loss) change with different dropout rates?



- <span style="color:cyan;">**Answer:** Write your answer here</span>

2. At which point does dropout start to hurt rather than help?

- <span style="color:cyan;">**Answer:** Write your answer here</span>

3. What does this reveal about over-regularization in neural networks?

- <span style="color:cyan;">**Answer:** Write your answer here</span>

# Ablation Study — Which Regularizer Helps Most?

In [None]:
def compute_weight_stats(model):
    """Return the global L1 norm, L2 norm and parameter count of ``model``.

    All parameters (weights and biases) are included.

    Args:
        model: Any ``nn.Module``.

    Returns:
        ``(l1_norm, l2_norm, num_params)``: two floats and an int.
    """
    l1_norm, squared_sum, num_params = 0.0, 0.0, 0
    with torch.no_grad():
        for p in model.parameters():
            l1_norm += p.abs().sum().item()
            squared_sum += p.pow(2).sum().item()
            num_params += p.numel()
    l2_norm = squared_sum ** 0.5
    return l1_norm, l2_norm, num_params


def train_and_evaluate(model, train_loader, test_loader, epochs=10, l1_lambda=0.0, use_noise=False, noise_std=0.1, optimizer=None):
    """Train ``model`` and collect the statistics used by the ablation study.

    The model is moved to the module-level ``device`` and trained with MSE
    loss. Gaussian noise, when enabled, is added to the training inputs only;
    evaluation always uses clean data.

    Args:
        model: Network to train.
        train_loader: Batches used for optimization.
        test_loader: Batches used for the per-epoch evaluation.
        epochs: Number of passes over ``train_loader``.
        l1_lambda: Weight of the L1 penalty on all parameters; 0 disables it.
        use_noise: Whether to perturb training inputs with Gaussian noise.
        noise_std: Standard deviation of that noise.
        optimizer: Optional optimizer bound to ``model`` (e.g. Adam with
            ``weight_decay`` for L2). Defaults to Adam with lr=0.001.

    Returns:
        Dict with per-epoch lists ``train_losses``, ``test_losses``,
        ``train_maes``, ``test_maes`` and scalars ``final_train_mae``,
        ``final_test_mae``, ``gap`` (test - train), ``l1_norm``, ``l2_norm``,
        ``num_params`` and ``total_time`` (training seconds only).
    """
    model = model.to(device)
    criterion = nn.MSELoss()
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    def _run_epoch(loader, training):
        # One pass over `loader`; updates the weights only when `training`.
        model.train(training)
        loss_sum, abs_err_sum, count = 0.0, 0.0, 0
        with torch.set_grad_enabled(training):
            for inputs, targets in loader:
                inputs, targets = inputs.to(device), targets.to(device)
                n = inputs.size(0)
                if training and use_noise:
                    inputs = inputs + noise_std * torch.randn_like(inputs)
                preds = model(inputs).reshape(n, -1)
                targets = targets.reshape(n, -1)
                loss = criterion(preds, targets)
                if training:
                    objective = loss
                    if l1_lambda > 0:
                        objective = loss + l1_lambda * sum(p.abs().sum() for p in model.parameters())
                    optimizer.zero_grad()
                    objective.backward()
                    optimizer.step()
                loss_sum += loss.item() * n
                abs_err_sum += (preds.detach() - targets).abs().mean().item() * n
                count += n
        count = max(count, 1)
        return loss_sum / count, abs_err_sum / count

    train_losses, test_losses, train_maes, test_maes = [], [], [], []
    epoch_times = []

    for epoch in range(epochs):
        started = time.time()
        train_loss, train_mae = _run_epoch(train_loader, True)
        epoch_times.append(time.time() - started)

        test_loss, test_mae = _run_epoch(test_loader, False)

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_maes.append(train_mae)
        test_maes.append(test_mae)

    l1_norm, l2_norm, num_params = compute_weight_stats(model)
    # With epochs=0 there is nothing to report; use NaN rather than failing.
    final_train_mae = train_maes[-1] if train_maes else float('nan')
    final_test_mae = test_maes[-1] if test_maes else float('nan')
    return {
        'train_losses': train_losses,
        'test_losses': test_losses,
        'train_maes': train_maes,
        'test_maes': test_maes,
        'final_train_mae': final_train_mae,
        'final_test_mae': final_test_mae,
        'gap': final_test_mae - final_train_mae,
        'l1_norm': l1_norm,
        'l2_norm': l2_norm,
        'num_params': num_params,
        'total_time': sum(epoch_times),
    }


# Ablation Results Container
# NOTE(review): this rebinds `results`, discarding the entries the earlier
# experiments stored under the same name (e.g. results['Base Model']).
# Re-running the final-summary cell after this one will not see them;
# consider a distinct name for the ablation container.
results = {}

# Base Model
print("Running: Base Model")
# TODO: Initialize base model and train

# L1 Regularization
print("Running: L1 Regularization")
# TODO: Initialize model and apply L1 regularization by setting l1_lambda > 0

# L2 Regularization
print("Running: L2 Regularization")
# TODO: Initialize model
# TODO: Define optimizer with weight_decay (L2 regularization)
# TODO: Train model and store results

# Dropout Regularization
print("Running: Dropout")
# TODO: Initialize model with dropout layers (e.g., dropout_rate=0.5)
# TODO: Train and evaluate model

# Batch Normalization
print("Running: BatchNorm")
# TODO: Initialize model with batch normalization layers
# TODO: Train and evaluate model

# Early Stopping
print("Running: Early Stopping")
# TODO: Implement early stopping mechanism
# Hints:
# - Track validation MAE
# - Stop training when MAE doesn't improve for several epochs (patience)
# - Return final metrics and weight norms


# Results Summary
# TODO: Create a summary DataFrame comparing all models:
#   - Train/Test MAE
#   - Gap (Test - Train)
#   - L1/L2 norms
#   - Total time


# TODO: Save results to CSV file (e.g., "ablation_results.csv")


# Visualization Section
# One wide figure intended to hold the three plots listed below.
plt.figure(figsize=(15, 5))

# TODO: Plot 1 — Train/Test MAE curves for selected models
# TODO: Plot 2 — Bar chart of generalization gaps
# TODO: Plot 3 — L2 Norms of model weights
# TODO: Save and display final plot


**Question:**  
How do different regularization techniques (L1, L2, Dropout, BatchNorm, Early Stopping) affect the Transformer model’s performance in terms of **Mean Absolute Error (MAE)**, **generalization gap**, and **weight magnitudes**?

---

# Empirical Bias–Variance Decomposition: Quantifying the Impact of Each Regularizer

In [None]:
def train_single_model(model, train_loader, test_loader, device,
                      epochs=10, l1_lambda=0.0, weight_decay=0.0):
    """
    Trains a single model instance (used for bias-variance trials).

    Uses MSE loss and Adam (lr=0.001) with the given ``weight_decay``.

    Args:
        model: Network to train; moved to ``device``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Unused here; kept so all training helpers share the
            same call shape.
        device: Device to train on.
        epochs: Number of passes over ``train_loader``.
        l1_lambda: Weight of the L1 penalty on all parameters; 0 disables it.
        weight_decay: L2 coefficient handed to the optimizer.

    Returns:
        The trained model.
    """
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=weight_decay)

    for _ in range(epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            n = inputs.size(0)

            optimizer.zero_grad()
            # (batch, -1) on both sides avoids accidental broadcasting between
            # a (batch, 1) output and a (batch,) target.
            loss = criterion(model(inputs).reshape(n, -1), targets.reshape(n, -1))
            if l1_lambda > 0:
                loss = loss + l1_lambda * sum(p.abs().sum() for p in model.parameters())
            loss.backward()
            optimizer.step()

    return model


def bias_variance_decomposition(
    model_class,
    model_kwargs,
    train_loader,
    test_loader,
    device,
    n_trials=5,
    epochs=10,
    l1_lambda=0.0,
    weight_decay=0.0,
    dropout_rate=None
):
    """
    Performs empirical bias-variance decomposition by training n_trials models.
    Returns average bias^2, variance, and total error.

    Each trial reseeds the global torch and numpy RNGs with the trial index,
    builds a fresh model, trains it with ``train_single_model`` and predicts
    the whole test set. ``test_loader`` must yield samples in the same order
    on every pass (no shuffling), otherwise predictions from different trials
    do not line up.

    Args:
        model_class: Class (or factory) called as ``model_class(**kwargs)``.
        model_kwargs: Keyword arguments for the model; not modified.
        train_loader: Training batches.
        test_loader: Test batches used for the decomposition.
        device: Device to train and predict on.
        n_trials: Number of independently seeded models.
        epochs: Training epochs per trial.
        l1_lambda: L1 penalty weight forwarded to training.
        weight_decay: L2 coefficient forwarded to training.
        dropout_rate: If not None, passed to the model as ``dropout_rate``.

    Returns:
        ``(bias_squared, variance, total_error)`` averaged over the test
        samples. Variance is the population variance across trials, so
        ``total_error == bias_squared + variance`` up to rounding.

    Raises:
        ValueError: If ``n_trials`` is smaller than 1.
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")

    all_predictions = []
    true_targets = None

    for trial in range(n_trials):
        # A distinct seed per trial gives different initializations while
        # keeping the whole procedure repeatable.
        torch.manual_seed(trial)
        np.random.seed(trial)

        kwargs = dict(model_kwargs)
        if dropout_rate is not None:
            kwargs['dropout_rate'] = dropout_rate
        model = model_class(**kwargs)

        model = train_single_model(
            model, train_loader, test_loader, device,
            epochs=epochs, l1_lambda=l1_lambda, weight_decay=weight_decay
        )

        model.eval()
        trial_preds, trial_targets = [], []
        with torch.no_grad():
            for inputs, targets in test_loader:
                outputs = model(inputs.to(device))
                trial_preds.append(outputs.reshape(-1).cpu().numpy())
                trial_targets.append(targets.reshape(-1).cpu().numpy())
        all_predictions.append(np.concatenate(trial_preds))
        if true_targets is None:
            true_targets = np.concatenate(trial_targets)

    predictions = np.stack(all_predictions)  # (n_trials, n_test_values)
    mean_predictions = predictions.mean(axis=0)
    bias_squared = float(np.mean((mean_predictions - true_targets) ** 2))
    variance = float(np.mean(predictions.var(axis=0)))
    total_error = float(np.mean((predictions - true_targets) ** 2))
    return bias_squared, variance, total_error


print("=" * 80)
print("EMPIRICAL BIAS–VARIANCE DECOMPOSITION")
print("=" * 80)

# Architecture shared by every variant. A dropout rate is not part of these
# kwargs; bias_variance_decomposition() takes it as a separate argument.
common_kwargs = {'input_dim': input_dim, 'd_model': 64, 'nhead': 4, 'num_layers': 1}
# Rebinds the module-level `device`, using the same rule as the setup cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_trials = 5
epochs = 10

# Intended to map each method name to its decomposition result.
bv_results = {}

# 1. Base Model
print("Running: Base Model")
# TODO: Run bias_variance_decomposition() for the base model

# 2. L1 Regularization
print("Running: L1 Regularization")
# TODO: Add L1 regularization by passing l1_lambda > 0

# 3. L2 Regularization
print("Running: L2 Regularization")
# TODO: Add L2 regularization by setting weight_decay > 0

# 4. Dropout
print("Running: Dropout")
# TODO: Use a dropout-enabled Transformer model (e.g., dropout_rate=0.5)

# 5. BatchNorm
print("Running: BatchNorm")
# TODO: Use a Transformer model with Batch Normalization layers


# RESULTS SUMMARY
# TODO: Collect bias^2, variance, and total error for each model into a DataFrame

# TODO: Save summary results to CSV

# INSIGHT CALCULATION
# TODO: Compute relative variance reduction and bias change compared to Base Model



**Question:**  
How do different **regularization techniques** affect the **bias, variance, and total error** of a base model?  
Analyze the **bias-variance decomposition** for L1, L2, Dropout, and BatchNorm regularization.

# Empirical Analysis of Generalization Gap Scaling with Model Complexity

In [None]:
def train_model_for_complexity(model_kwargs, train_loader, test_loader, device, epochs=10):
    """
    Trains a base model (no regularization) with the given architecture.
    Returns final train/test MAE and full learning curves.

    Uses MSE loss and Adam (lr=0.001).

    Args:
        model_kwargs: Keyword arguments for ``TransformerBaseModel``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` test batches.
        device: Device to train and evaluate on.
        epochs: Number of passes over ``train_loader``.

    Returns:
        Dict with the per-epoch lists ``'train_maes'`` and ``'test_maes'`` and
        the scalars ``'final_train_mae'``, ``'final_test_mae'`` and ``'gap'``
        (final test MAE minus final train MAE). The scalars are NaN when
        ``epochs`` is 0.
    """
    model = TransformerBaseModel(**model_kwargs).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    def _epoch_mae(loader, training):
        # One pass over `loader`; updates the weights only when `training`.
        model.train(training)
        abs_err_sum, count = 0.0, 0
        with torch.set_grad_enabled(training):
            for inputs, targets in loader:
                inputs, targets = inputs.to(device), targets.to(device)
                n = inputs.size(0)
                preds = model(inputs).reshape(n, -1)
                targets = targets.reshape(n, -1)
                if training:
                    optimizer.zero_grad()
                    criterion(preds, targets).backward()
                    optimizer.step()
                abs_err_sum += (preds.detach() - targets).abs().mean().item() * n
                count += n
        return abs_err_sum / max(count, 1)

    train_maes, test_maes = [], []

    for epoch in range(epochs):
        train_maes.append(_epoch_mae(train_loader, True))
        test_maes.append(_epoch_mae(test_loader, False))

    final_train_mae = train_maes[-1] if train_maes else float('nan')
    final_test_mae = test_maes[-1] if test_maes else float('nan')
    return {
        'train_maes': train_maes,
        'test_maes': test_maes,
        'final_train_mae': final_train_mae,
        'final_test_mae': final_test_mae,
        'gap': final_test_mae - final_train_mae,
    }


# Experiment 1: Vary model depth (number of layers)
print("=" * 80)
print("GENERALIZATION GAP vs MODEL COMPLEXITY (num_layers)")
print("=" * 80)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 10

num_layers_list = [1, 2, 3, 4, 5]
# Maps num_layers -> result dict returned by train_model_for_complexity().
complexity_results = {}

for num_layers in num_layers_list:
    print(f"Training model with num_layers = {num_layers}")
    model_kwargs = {
        'input_dim': input_dim,
        'd_model': 64,
        'nhead': 4,
        'num_layers': num_layers
    }
    # The plotting code below reads 'gap', 'final_train_mae' and
    # 'final_test_mae' from each stored result.
    complexity_results[num_layers] = train_model_for_complexity(
        model_kwargs, train_loader, test_loader, device, epochs=epochs
    )


# Experiment 2: Vary model width (d_model)
print("\n" + "=" * 80)
print("GENERALIZATION GAP vs MODEL COMPLEXITY (d_model)")
print("=" * 80)

d_model_list = [16, 32, 64, 128, 256]
# Maps d_model -> result dict returned by train_model_for_complexity().
complexity_results_d = {}

for d_model in d_model_list:
    print(f"Training model with d_model = {d_model}")
    model_kwargs = {
        'input_dim': input_dim,
        'd_model': d_model,
        # Fewer heads for narrow models: 2 heads at d_model=16, 4 otherwise.
        'nhead': min(4, d_model // 8),
        'num_layers': 1
    }
    complexity_results_d[d_model] = train_model_for_complexity(
        model_kwargs, train_loader, test_loader, device, epochs=epochs
    )


# PLOTTING AND SAVING RESULTS
# Two side-by-side panels: depth sweep on the left, width sweep on the right.
# Every stored result must provide the keys 'gap', 'final_train_mae' and
# 'final_test_mae'.
plt.figure(figsize=(14, 5))

# (a) Generalization gap vs. number of layers
plt.subplot(1, 2, 1)
layers = list(complexity_results.keys())
gaps_layers = [complexity_results[l]['gap'] for l in layers]
train_maes_layers = [complexity_results[l]['final_train_mae'] for l in layers]
test_maes_layers = [complexity_results[l]['final_test_mae'] for l in layers]

plt.plot(layers, train_maes_layers, 'o--', label='Train MAE')
plt.plot(layers, test_maes_layers, 's--', label='Test MAE')
plt.plot(layers, gaps_layers, 'd-', color='red', label='Generalization Gap')
plt.xlabel('Number of Transformer Layers')
plt.ylabel('MAE / Gap')
plt.title('Generalization Gap vs Model Depth')
plt.legend()
plt.grid(True)

# (b) Generalization gap vs. embedding size (d_model)
plt.subplot(1, 2, 2)
d_models = list(complexity_results_d.keys())
gaps_d = [complexity_results_d[d]['gap'] for d in d_models]
train_maes_d = [complexity_results_d[d]['final_train_mae'] for d in d_models]
test_maes_d = [complexity_results_d[d]['final_test_mae'] for d in d_models]

plt.plot(d_models, train_maes_d, 'o--', label='Train MAE')
plt.plot(d_models, test_maes_d, 's--', label='Test MAE')
plt.plot(d_models, gaps_d, 'd-', color='red', label='Generalization Gap')
plt.xlabel('d_model (Embedding Size)')
plt.ylabel('MAE / Gap')
plt.title('Generalization Gap vs Model Width')
plt.legend()
plt.grid(True)
# The swept widths double at every step, so a log axis spaces them evenly.
plt.xscale('log')

plt.tight_layout()
plt.savefig("generalization_gap_vs_complexity.png", dpi=150)
plt.show()

# TODO: Save results as CSVs after computing them

**Question:**  
How does the **generalization gap** (*Test MAE - Train MAE*) change as the **complexity of a Transformer model** varies?  
Specifically, analyze the impact of:

- **Model Depth:** Number of Transformer layers  
- **Model Width:** Embedding size (`d_model`)  

What trends are observed in training performance, test performance, and overfitting risk across different configurations?
