In [None]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np

In [None]:
class FlexibleConvNet(nn.Module):
    """Configurable CNN classifier: a conv stack followed by a fully connected head.

    Every conv layer shares the same ``out_channels``, ``kernel_size`` and
    ``dilation``. An optional max-pool is applied once, right after the first
    conv layer. The head is ``num_fc_layers`` linear layers with ReLU + dropout
    between them (never after the last one).

    Args:
        num_conv_layers: Number of Conv2d + ReLU layers (0 skips the conv stack,
            and with it the pooling step).
        num_fc_layers: Number of linear layers; values below 1 still produce a
            single output layer.
        out_channels: Channel width of every conv layer.
        kernel_size: Conv kernel size.
        dilation: Conv dilation.
        pooling_kernel_size: Kernel (and stride) of the max-pool applied after
            the first conv layer; ``None``/``0`` disables pooling.
        dropout_rate: Dropout probability used between FC layers.
        in_channels: Channels of the input image (1 for MNIST).
        input_size: Height/width of the (square) input image (28 for MNIST).
        hidden_dim: Width of the intermediate FC layers.
        num_classes: Number of output logits.
    """

    def __init__(self, num_conv_layers=1, num_fc_layers=1, out_channels=16,
                 kernel_size=3, dilation=1, pooling_kernel_size=None, dropout_rate=0.0,
                 in_channels=1, input_size=28, hidden_dim=128, num_classes=10):
        super().__init__()

        # Conv2d output size: (input + 2*padding - dilation*(kernel-1) - 1) / stride + 1.
        # This padding keeps the spatial size only when dilation*(kernel-1) is even;
        # otherwise each conv shrinks the feature map by one pixel.
        padding = dilation * (kernel_size - 1) // 2

        self.conv_layers = nn.ModuleList()
        channels = in_channels
        for _ in range(num_conv_layers):
            self.conv_layers.append(
                nn.Conv2d(channels, out_channels,
                          kernel_size=kernel_size,
                          padding=padding,
                          dilation=dilation)
            )
            channels = out_channels

        # Optional pooling (applied once, after the first conv layer)
        self.pooling_kernel_size = pooling_kernel_size
        self.pool = nn.MaxPool2d(kernel_size=pooling_kernel_size) if pooling_kernel_size else None

        # Measure the flattened feature size by pushing a dummy image through the
        # exact path forward() uses, instead of assuming the spatial size is
        # preserved. This stays correct for even kernels, for zero conv layers
        # and for non-MNIST input sizes.
        with torch.no_grad():
            probe = torch.zeros(1, in_channels, input_size, input_size)
            flatten_dim = self._extract_features(probe).numel()

        # Fully connected head: flatten_dim -> hidden_dim x (num_fc_layers - 1) -> num_classes
        fc_dims = [flatten_dim] + [hidden_dim] * (num_fc_layers - 1) + [num_classes]
        self.fc_layers = nn.ModuleList(
            nn.Linear(n_in, n_out) for n_in, n_out in zip(fc_dims[:-1], fc_dims[1:])
        )

        self.dropout = nn.Dropout(dropout_rate)

    def _extract_features(self, x):
        """Apply the conv stack (ReLU after each conv, pooling after the first)."""
        for i, conv in enumerate(self.conv_layers):
            x = F.relu(conv(x))
            if i == 0 and self.pool is not None:
                x = self.pool(x)
        return x

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for an image batch."""
        x = self._extract_features(x)

        # flatten() also copes with non-contiguous feature maps, unlike view()
        x = torch.flatten(x, start_dim=1)

        last = len(self.fc_layers) - 1
        for i, fc in enumerate(self.fc_layers):
            x = fc(x)
            # No ReLU/Dropout on the output layer: it produces raw logits
            if i < last:
                x = self.dropout(F.relu(x))

        return x

In [None]:
def get_data_transforms(trial=None):
    """
    Build the (train, validation) transform pipelines for MNIST.

    Both pipelines end with ToTensor + Normalize((0.1307,), (0.3081,)).
    The training pipeline additionally starts with random augmentations:

    - with a trial: rotation, translation and scaling are each switched
      on/off by the trial, and their strength is sampled from it;
    - without a trial: a fixed rotation (10 degrees) and translation (0.1).

    Returns:
        (train_transform, val_transform) as ``transforms.Compose`` objects.
    """
    # Shared tail of both pipelines
    tensor_and_normalise = [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]

    if not trial:
        # Fixed defaults when no Optuna trial drives the choice
        augment_ops = [
            transforms.RandomRotation(10),
            transforms.RandomAffine(0, translate=(0.1, 0.1)),
        ]
    else:
        augment_ops = []

        # Each augmentation is gated by its own categorical flag; the strength
        # parameter is only sampled when the flag is on.
        rotate = trial.suggest_categorical('use_rotation', [True, False])
        if rotate:
            max_angle = trial.suggest_int('rotation_degrees', 5, 15)
            augment_ops.append(transforms.RandomRotation(max_angle))

        shift_enabled = trial.suggest_categorical('use_translation', [True, False])
        if shift_enabled:
            shift = trial.suggest_float('translate_factor', 0.05, 0.15)
            augment_ops.append(transforms.RandomAffine(0, translate=(shift, shift)))

        rescale = trial.suggest_categorical('use_scale', [True, False])
        if rescale:
            spread = trial.suggest_float('scale_range', 0.05, 0.15)
            augment_ops.append(
                transforms.RandomAffine(0, scale=(1 - spread, 1 + spread))
            )

    # Augmentations run first, then tensor conversion + normalisation
    train_pipeline = transforms.Compose(augment_ops + tensor_and_normalise)
    val_pipeline = transforms.Compose(tensor_and_normalise)
    return train_pipeline, val_pipeline

def training_loop(model, train_loader, val_loader, loss_fn, optimizer,
                  num_epochs=10, device='cuda', trial=None):
    """
    Train ``model`` and evaluate it after every epoch, optionally reporting to Optuna.

    Args:
        model: Module to train; it must already live on ``device``.
        train_loader: Iterable of ``(data, target)`` training batches.
        val_loader: Iterable of ``(data, target)`` validation batches.
        loss_fn: Callable ``loss_fn(output, target)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        trial: Optional Optuna trial. When given, the validation accuracy is
            reported after each epoch and the pruner is consulted.

    Returns:
        Validation accuracy of the last epoch (0 if ``num_epochs`` is 0).

    Raises:
        optuna.TrialPruned: If ``trial`` is given and the pruner stops the trial.
    """
    final_accuracy = 0
    trial_label = trial.number if trial is not None else "N/A"

    for epoch in range(num_epochs):
        # Training
        model.train()
        running_loss = 0.0
        num_batches = 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            loss = loss_fn(model(data), target)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # An empty loader yields NaN instead of a ZeroDivisionError
        avg_loss = running_loss / num_batches if num_batches else float('nan')

        # Validation
        model.eval()
        correct = 0
        seen = 0

        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                pred = model(data).argmax(dim=1)
                correct += pred.eq(target.view_as(pred)).sum().item()
                seen += pred.size(0)

        # Divide by the samples actually evaluated: with drop_last or a sampler
        # the loader may yield fewer samples than len(val_loader.dataset).
        accuracy = correct / seen if seen else 0.0
        final_accuracy = accuracy

        print(f'Trial {trial_label} - Epoch {epoch+1}/{num_epochs}: Loss={avg_loss:.4f}, Val Acc={accuracy:.4f}')

        # Optuna integration: report the intermediate value and let the pruner decide
        if trial is not None:
            trial.report(accuracy, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

    return final_accuracy

def objective(trial):
    """Optuna objective: sample a configuration, train on MNIST, return validation accuracy.

    Samples the learning rate, network depth, kernel size, dilation, dropout,
    pooling and augmentation settings from ``trial``, trains a
    ``FlexibleConvNet`` for a fixed number of epochs and returns the accuracy
    of the last epoch. Pruning is delegated to ``training_loop``.
    """
    # --- Search space (suggestion order matters for reproducibility) ---
    lr = trial.suggest_float('learning_rate', 0.0001, 0.01, log=True)
    n_conv = trial.suggest_int('num_conv_layers', 4, 6)
    n_fc = trial.suggest_int('num_fc_layers', 3, 5)
    ksize = trial.suggest_categorical('kernel_size', [3, 5, 7, 9])
    dil = trial.suggest_categorical('dilation', [1, 2])
    drop_p = trial.suggest_float('dropout_rate', 0.0, 0.1)

    # The pooling size is only sampled when pooling is enabled
    use_pooling = trial.suggest_categorical('use_pooling', [True, False])
    pool_k = (
        trial.suggest_categorical('pooling_kernel_size', [2, 3, 4])
        if use_pooling else None
    )

    # --- Fixed settings ---
    batch_size, out_channels, weight_decay, num_epochs = 128, 16, 0.0001, 10

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Extent of a single dilated kernel: k + (k - 1) * (d - 1)
    effective_rf = ksize + (ksize - 1) * (dil - 1)

    # --- Trial banner ---
    # NOTE(review): the "/5" total is hard-coded and must be kept in sync with
    # the n_trials value passed to study.optimize in main().
    banner = '=' * 50
    header_lines = (
        f"\n{banner}",
        f"TRIAL {trial.number + 1}/5",
        f"{banner}",
        "Hyperparameters:",
        f"  Conv layers: {n_conv}",
        f"  Kernel size: {ksize}×{ksize}",
        f"  Dilation: {dil} (effective RF: {effective_rf}×{effective_rf})",
        f"  FC layers: {n_fc}",
        f"  Pooling: {pool_k if use_pooling else 'None'}",
        f"  Dropout: {drop_p:.3f}",
        f"  Learning rate: {lr:.5f}",
    )
    for line in header_lines:
        print(line)

    # --- Data (augmentation parameters are sampled from the trial here) ---
    train_transform, val_transform = get_data_transforms(trial)

    # NOTE(review): the MNIST test split (train=False) doubles as the validation
    # set, so the reported accuracy is the quantity being tuned on.
    train_dataset = datasets.MNIST('./data', train=True, download=True,
                                   transform=train_transform)
    val_dataset = datasets.MNIST('./data', train=False,
                                 transform=val_transform)

    loader_opts = {'batch_size': batch_size, 'num_workers': 2}
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_opts)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_opts)

    # --- Model ---
    model = FlexibleConvNet(
        num_conv_layers=n_conv,
        num_fc_layers=n_fc,
        out_channels=out_channels,
        kernel_size=ksize,
        dilation=dil,
        pooling_kernel_size=pool_k,
        dropout_rate=drop_p,
    ).to(device)

    total_params = sum(param.numel() for param in model.parameters())
    print(f"  Total parameters: {total_params:,}")

    # Share of the 28-pixel image width spanned by one kernel
    coverage = (effective_rf / 28) * 100
    print(f"  Coverage: {effective_rf}×{effective_rf} pixels = {coverage:.1f}% of image width")

    # --- Optimisation ---
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    accuracy = training_loop(
        model, train_loader, val_loader, loss_fn, optimizer,
        num_epochs=num_epochs, device=device, trial=trial,
    )

    print(f"Trial {trial.number + 1} Final Accuracy: {accuracy:.4f}\n")

    return accuracy

In [None]:
def main():
    """Run the Optuna hyperparameter search and print a summary report.

    Creates a maximising study with a median pruner, runs 5 trials of
    ``objective``, then prints the best configuration, a per-trial summary and
    (when plotly is available and at least two trials completed) the
    parameter-importance and optimisation-history plots.

    Returns:
        The finished ``optuna.study.Study``.
    """
    print("\n" + "="*60)
    print("HYPERPARAMETER TUNING WITH OPTUNA")
    print("Testing: Kernel sizes [3,5,7,9] × Dilation [1,2]")
    print("="*60 + "\n")

    # Show all possible kernel/dilation combinations
    print("Possible kernel configurations:")
    for k in [3, 5, 7, 9]:
        for d in [1, 2]:
            eff_rf = k + (k - 1) * (d - 1)
            print(f"  Kernel {k}×{k}, Dilation {d} → Effective RF: {eff_rf}×{eff_rf}")
    print()

    # Create and run study with 5 trials
    study = optuna.create_study(
        direction='maximize',
        pruner=optuna.pruners.MedianPruner(
            n_startup_trials=2,
            n_warmup_steps=2,
        )
    )

    study.optimize(objective, n_trials=5)

    complete_state = optuna.trial.TrialState.COMPLETE
    completed_trials = [t for t in study.trials if t.state == complete_state]

    # Print results
    print("\n" + "="*60)
    print("FINAL RESULTS")
    print("="*60)

    if completed_trials:
        print(f"\nBest Validation Accuracy: {study.best_value:.4f}")
        print("\nBest Hyperparameters:")
        for key, value in study.best_params.items():
            if isinstance(value, float):
                print(f"  {key}: {value:.5f}")
            else:
                print(f"  {key}: {value}")

        # Calculate effective receptive field for best configuration
        best_kernel = study.best_params.get('kernel_size', 3)
        best_dilation = study.best_params.get('dilation', 1)
        effective_rf = best_kernel + (best_kernel - 1) * (best_dilation - 1)
        print(f"\nBest configuration: {best_kernel}×{best_kernel} kernel, dilation={best_dilation}")
        print(f"Effective receptive field: {effective_rf}×{effective_rf} ({(effective_rf/28)*100:.1f}% of image)")
    else:
        # study.best_value / best_params raise when no trial has completed
        print("\nNo trial completed; best value and hyperparameters are unavailable.")

    # Show all trial results
    print("\nAll Trial Results:")
    for trial in study.trials:
        # Use the real state name for anything that did not complete, so failed
        # or still-running trials are not mislabelled as pruned.
        status = "COMPLETED" if trial.state == complete_state else trial.state.name
        # Compare against None: a legitimate value of 0.0 must still be printed
        if trial.value is not None:
            kernel = trial.params.get('kernel_size', 'N/A')
            dilation = trial.params.get('dilation', 'N/A')
            print(f"  Trial {trial.number + 1}: {trial.value:.4f} (K={kernel}, D={dilation}) - {status}")
        else:
            print(f"  Trial {trial.number + 1}: N/A ({status})")

    # Show importance of each hyperparameter
    try:
        # Imported lazily: plotting is optional and needs plotly
        import optuna.visualization as vis

        if len(completed_trials) > 1:
            fig = vis.plot_param_importances(study)
            fig.show()

            fig2 = vis.plot_optimization_history(study)
            fig2.show()
        else:
            print("\nNeed at least 2 completed trials for visualization")
    except ImportError:
        print("\nInstall plotly for visualizations: pip install plotly")

    return study

# Entry point: run the search when this cell/script is executed directly and
# keep the resulting study in a module-level variable for later inspection.
if __name__ == "__main__":
    study = main()

[I 2025-10-31 21:05:14,566] A new study created in memory with name: no-name-cdcdadf0-247a-4cfe-8d79-84b89a4e3d84



HYPERPARAMETER TUNING WITH OPTUNA
Testing: Kernel sizes [3,5,7,9] × Dilation [1,2]

Possible kernel configurations:
  Kernel 3×3, Dilation 1 → Effective RF: 3×3
  Kernel 3×3, Dilation 2 → Effective RF: 5×5
  Kernel 5×5, Dilation 1 → Effective RF: 5×5
  Kernel 5×5, Dilation 2 → Effective RF: 9×9
  Kernel 7×7, Dilation 1 → Effective RF: 7×7
  Kernel 7×7, Dilation 2 → Effective RF: 13×13
  Kernel 9×9, Dilation 1 → Effective RF: 9×9
  Kernel 9×9, Dilation 2 → Effective RF: 17×17


TRIAL 1/5
Hyperparameters:
  Conv layers: 6
  Kernel size: 5×5
  Dilation: 2 (effective RF: 9×9)
  FC layers: 4
  Pooling: None
  Dropout: 0.044
  Learning rate: 0.00035
  Total parameters: 1,672,570
  Coverage: 9×9 pixels = 32.1% of image width
Trial 0 - Epoch 1/10: Loss=0.6679, Val Acc=0.9613
Trial 0 - Epoch 2/10: Loss=0.1808, Val Acc=0.9703
Trial 0 - Epoch 3/10: Loss=0.1280, Val Acc=0.9799
Trial 0 - Epoch 4/10: Loss=0.1075, Val Acc=0.9828
Trial 0 - Epoch 5/10: Loss=0.0853, Val Acc=0.9837
Trial 0 - Epoch 6/10:

[I 2025-10-31 21:09:04,881] Trial 0 finished with value: 0.9903 and parameters: {'learning_rate': 0.00034913622846060243, 'num_conv_layers': 6, 'num_fc_layers': 4, 'kernel_size': 5, 'dilation': 2, 'dropout_rate': 0.04365708427586706, 'use_pooling': False, 'use_rotation': False, 'use_translation': True, 'translate_factor': 0.06912277281915277, 'use_scale': True, 'scale_range': 0.07285182857724358}. Best is trial 0 with value: 0.9903.


Trial 0 - Epoch 10/10: Loss=0.0515, Val Acc=0.9903
Trial 1 Final Accuracy: 0.9903


TRIAL 2/5
Hyperparameters:
  Conv layers: 5
  Kernel size: 3×3
  Dilation: 1 (effective RF: 3×3)
  FC layers: 5
  Pooling: 4
  Dropout: 0.087
  Learning rate: 0.00029
  Total parameters: 160,746
  Coverage: 3×3 pixels = 10.7% of image width
Trial 1 - Epoch 1/10: Loss=1.2265, Val Acc=0.8586
Trial 1 - Epoch 2/10: Loss=0.4590, Val Acc=0.9445
Trial 1 - Epoch 3/10: Loss=0.2845, Val Acc=0.9583
Trial 1 - Epoch 4/10: Loss=0.2207, Val Acc=0.9707
Trial 1 - Epoch 5/10: Loss=0.1901, Val Acc=0.9736
Trial 1 - Epoch 6/10: Loss=0.1676, Val Acc=0.9763
Trial 1 - Epoch 7/10: Loss=0.1541, Val Acc=0.9802
Trial 1 - Epoch 8/10: Loss=0.1383, Val Acc=0.9801
Trial 1 - Epoch 9/10: Loss=0.1346, Val Acc=0.9802


[I 2025-10-31 21:13:10,630] Trial 1 finished with value: 0.9832 and parameters: {'learning_rate': 0.00028531498008373005, 'num_conv_layers': 5, 'num_fc_layers': 5, 'kernel_size': 3, 'dilation': 1, 'dropout_rate': 0.08679901120327504, 'use_pooling': True, 'pooling_kernel_size': 4, 'use_rotation': True, 'rotation_degrees': 15, 'use_translation': True, 'translate_factor': 0.07114868936708249, 'use_scale': True, 'scale_range': 0.07611444402625725}. Best is trial 0 with value: 0.9903.


Trial 1 - Epoch 10/10: Loss=0.1225, Val Acc=0.9832
Trial 2 Final Accuracy: 0.9832


TRIAL 3/5
Hyperparameters:
  Conv layers: 4
  Kernel size: 7×7
  Dilation: 1 (effective RF: 7×7)
  FC layers: 3
  Pooling: None
  Dropout: 0.084
  Learning rate: 0.00090
  Total parameters: 1,662,042
  Coverage: 7×7 pixels = 25.0% of image width
Trial 2 - Epoch 1/10: Loss=0.3403, Val Acc=0.9835
Trial 2 - Epoch 2/10: Loss=0.0970, Val Acc=0.9870
Trial 2 - Epoch 3/10: Loss=0.0727, Val Acc=0.9899
Trial 2 - Epoch 4/10: Loss=0.0623, Val Acc=0.9872
Trial 2 - Epoch 5/10: Loss=0.0583, Val Acc=0.9893
Trial 2 - Epoch 6/10: Loss=0.0519, Val Acc=0.9915
Trial 2 - Epoch 7/10: Loss=0.0492, Val Acc=0.9908
Trial 2 - Epoch 8/10: Loss=0.0494, Val Acc=0.9924
Trial 2 - Epoch 9/10: Loss=0.0439, Val Acc=0.9900


[I 2025-10-31 21:16:21,398] Trial 2 finished with value: 0.9925 and parameters: {'learning_rate': 0.0008981209753149214, 'num_conv_layers': 4, 'num_fc_layers': 3, 'kernel_size': 7, 'dilation': 1, 'dropout_rate': 0.08426265194651103, 'use_pooling': False, 'use_rotation': False, 'use_translation': True, 'translate_factor': 0.1415892468896479, 'use_scale': False}. Best is trial 2 with value: 0.9925.


Trial 2 - Epoch 10/10: Loss=0.0444, Val Acc=0.9925
Trial 3 Final Accuracy: 0.9925


TRIAL 4/5
Hyperparameters:
  Conv layers: 5
  Kernel size: 3×3
  Dilation: 2 (effective RF: 5×5)
  FC layers: 3
  Pooling: 4
  Dropout: 0.032
  Learning rate: 0.00074
  Total parameters: 127,722
  Coverage: 5×5 pixels = 17.9% of image width
Trial 3 - Epoch 1/10: Loss=0.5956, Val Acc=0.9259
Trial 3 - Epoch 2/10: Loss=0.1837, Val Acc=0.9661


[I 2025-10-31 21:17:13,338] Trial 3 pruned. 


Trial 3 - Epoch 3/10: Loss=0.1282, Val Acc=0.9751

TRIAL 5/5
Hyperparameters:
  Conv layers: 6
  Kernel size: 5×5
  Dilation: 2 (effective RF: 9×9)
  FC layers: 5
  Pooling: None
  Dropout: 0.055
  Learning rate: 0.00010
  Total parameters: 1,689,082
  Coverage: 9×9 pixels = 32.1% of image width
Trial 4 - Epoch 1/10: Loss=1.5823, Val Acc=0.8124
Trial 4 - Epoch 2/10: Loss=0.6205, Val Acc=0.9295


[I 2025-10-31 21:18:29,889] Trial 4 pruned. 


Trial 4 - Epoch 3/10: Loss=0.4138, Val Acc=0.9414

FINAL RESULTS

Best Validation Accuracy: 0.9925

Best Hyperparameters:
  learning_rate: 0.00090
  num_conv_layers: 4
  num_fc_layers: 3
  kernel_size: 7
  dilation: 1
  dropout_rate: 0.08426
  use_pooling: False
  use_rotation: False
  use_translation: True
  translate_factor: 0.14159
  use_scale: False

Best configuration: 7×7 kernel, dilation=1
Effective receptive field: 7×7 (25.0% of image)

All Trial Results:
  Trial 1: 0.9903 (K=5, D=2) - COMPLETED
  Trial 2: 0.9832 (K=3, D=1) - COMPLETED
  Trial 3: 0.9925 (K=7, D=1) - COMPLETED
  Trial 4: 0.9751 (K=3, D=2) - PRUNED
  Trial 5: 0.9414 (K=5, D=2) - PRUNED
