In [None]:
!pip install -q optuna torch torchvision

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader, ConcatDataset

# Download MNIST (training and test splits); ToTensor is the only preprocessing applied
transform = transforms.Compose([transforms.ToTensor()])
dataset = datasets.MNIST('.', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST('.', train=False, download=True, transform=transform)

# Split train into train (90%) and val (10%).
# The split is drawn from a seeded generator so the same samples land in the
# validation set on every run; otherwise validation accuracies from different
# kernel sessions are measured on different held-out data.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

In [None]:
class CNN(nn.Module):
    """Small two-block convolutional classifier.

    Each block is a 3x3 convolution (no padding), batch normalization, ReLU,
    2x2 max pooling and channel-wise dropout. The pooled feature maps are
    flattened and passed through a 128-unit hidden layer with dropout and a
    final linear layer that produces one logit per class.

    The defaults (one input channel, 28x28 images, 10 classes) reproduce the
    original MNIST-only architecture exactly.

    Args:
        num_filters: Output channels of the first convolution; the second
            convolution uses twice as many.
        dropout_rate: Drop probability shared by the convolutional and the
            fully connected dropout layers.
        in_channels: Number of channels in the input images.
        num_classes: Number of output logits.
        input_size: Spatial size of the input images, either a single int for
            square images or a ``(height, width)`` pair.

    Raises:
        ValueError: If ``input_size`` is too small to leave at least one
            pixel after the two convolution/pooling blocks.
    """

    def __init__(self, num_filters, dropout_rate, in_channels=1, num_classes=10, input_size=28):
        super().__init__()
        try:
            height, width = input_size
        except TypeError:
            # A scalar size means a square image.
            height = width = input_size
        feat_h = self._pooled_side(height)
        feat_w = self._pooled_side(width)
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f"input_size {input_size!r} is too small for two 3x3 conv + 2x2 pool blocks"
            )

        # First convolutional layer followed by batch normalization
        self.conv1 = nn.Conv2d(in_channels, num_filters, 3)
        self.bn1 = nn.BatchNorm2d(num_filters)
        # Second convolutional layer followed by batch normalization
        self.conv2 = nn.Conv2d(num_filters, num_filters * 2, 3)
        self.bn2 = nn.BatchNorm2d(num_filters * 2)
        # Dropout layer for convolutional features (reused after both blocks)
        self.dropout2d = nn.Dropout2d(dropout_rate)
        # Fully connected layers with dropout in between; the input width is
        # derived from the image size instead of being fixed to 5 * 5
        self.fc1 = nn.Linear(num_filters * 2 * feat_h * feat_w, 128)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(128, num_classes)

    @staticmethod
    def _pooled_side(side):
        """Return the length of one spatial side after both conv/pool blocks."""
        # Each block: an unpadded 3x3 conv removes 2 pixels, then 2x2 pooling halves (floor).
        return ((side - 2) // 2 - 2) // 2

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for a batch of images."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.max_pool2d(x, 2)
        x = self.dropout2d(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.max_pool2d(x, 2)
        x = self.dropout2d(x)
        # Flatten everything except the batch dimension
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

In [None]:
# Illustrative single training run with hand-picked settings (no tuning involved)
lr, num_filters, batch_size, epochs, dropout_rate = 1e-2, 8, 16, 5, 0.3

# Select the device, then wrap the existing train_dataset / val_dataset splits in loaders
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model on the chosen device, Adam with weight decay, cross-entropy loss
model = CNN(num_filters, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), weight_decay=1e-4, lr=lr)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    # Optimisation pass over the training split
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    # Accuracy on the validation split, with gradients disabled
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            pred = model(inputs).argmax(dim=1)
            correct += (pred == labels).sum().item()
            total += labels.size(0)
    val_acc = correct / total
    print(f"Epoch {epoch+1}/{epochs}, Validation Accuracy: {val_acc:.4f}")

Epoch 1/5, Validation Accuracy: 0.9622
Epoch 2/5, Validation Accuracy: 0.9702
Epoch 3/5, Validation Accuracy: 0.9702
Epoch 4/5, Validation Accuracy: 0.9712
Epoch 5/5, Validation Accuracy: 0.9717


In [None]:
import optuna

# Hyperparameter tuning with Optuna
def objective(trial):
    """Train a CNN for one Optuna trial and return its validation accuracy.

    Samples the learning rate, filter count, batch size, dropout rate and
    weight decay from ``trial``, trains on ``train_dataset`` for a fixed
    number of epochs, and reports the accuracy on ``val_dataset`` after every
    epoch so the study's pruner can stop the trial early.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate values.

    Returns:
        The validation accuracy (``correct / total``) after the final epoch.

    Raises:
        optuna.exceptions.TrialPruned: If ``trial.should_prune()`` is true
            after an intermediate report.
    """
    # Suggest hyperparameters including dropout rate and weight decay.
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform;
    # the parameter names and ranges are unchanged.
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    num_filters = trial.suggest_categorical('num_filters', [16, 24, 32])
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
    epochs = 5

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = CNN(num_filters, dropout_rate).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            loss = criterion(model(data), target)
            loss.backward()
            optimizer.step()
        # Evaluate on validation set
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                pred = model(data).argmax(dim=1)
                correct += pred.eq(target).sum().item()
                total += data.size(0)
        val_acc = correct / total
        # Report the per-epoch accuracy so the pruner can compare trials at the same step
        trial.report(val_acc, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return val_acc

# Maximise validation accuracy over 10 trials; the successive-halving pruner
# acts on the per-epoch values that `objective` reports.
pruner = optuna.pruners.SuccessiveHalvingPruner()
study = optuna.create_study(
    pruner=pruner,
    direction="maximize",
)
study.optimize(objective, n_trials=10)
print("Best trial:", study.best_trial.params)

[I 2025-03-05 20:25:03,643] A new study created in memory with name: no-name-81663264-5e37-443a-bbfc-250ce8b10ed4
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-03-05 20:25:38,469] Trial 0 finished with value: 0.9883333333333333 and parameters: {'lr': 0.002111108171497272, 'num_filters': 32, 'batch_size': 128, 'dropout_rate': 0.40961533341663103, 'weight_decay': 0.00010614944708240787}. Best is trial 0 with value: 0.9883333333333333.
[I 2025-03-05 20:26:19,697] Trial 1 finished with value: 0.9826666666666667 and parameters: {'lr': 0.005522835642490142, 'num_filters': 16, 'batch_size': 64, 'dropout_rate': 0.3189620781678171, 'weight_decay': 0.00023986276078317344}. Best is trial 0 with value: 0.9883333333333333.
[I 2025-03-05 20:26:33,431] Trial 2 pruned. 
[I 2025-03-05 20:26:47,235] Trial 3 pruned. 
[I 2025-03-05 20:27:33,478] Trial 4 finishe

Best trial: {'lr': 0.000636976373301173, 'num_filters': 32, 'batch_size': 64, 'dropout_rate': 0.33792793406041755, 'weight_decay': 2.855124114411267e-06}


In [None]:
# Hyperparameters of the best trial found by the tuning study
best_params = study.best_trial.params
batch_size = best_params['batch_size']
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Retrain on the train and validation splits together; the test set is used
# only for the final score
full_train_dataset = ConcatDataset([train_dataset, val_dataset])
train_loader = DataLoader(full_train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Fresh model and optimizer configured from the tuned values
model = CNN(best_params['num_filters'], best_params['dropout_rate']).to(device)
optimizer = optim.Adam(
    model.parameters(),
    weight_decay=best_params['weight_decay'],
    lr=best_params['lr'],
)
criterion = nn.CrossEntropyLoss()

epochs = 5
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

# Single evaluation pass over the test set
model.eval()
correct = total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        pred = model(inputs).argmax(dim=1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)
print("Test accuracy:", correct / total)


Test accuracy: 0.989
