In [1]:
# Re-import edited modules (e.g. the local `utils`) automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# Exercise 7

<img src="./images/07.png" width=800>

In [4]:
import os
import time
from collections import defaultdict
from typing import DefaultDict, Any, Callable, Optional

import mlflow
import numpy as np
import optuna
import torch
import torch.nn as nn
import torchvision
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, random_split, Subset
from torchinfo import summary
from torchvision import transforms
from tqdm.autonotebook import tqdm

from utils import train_network, accuracy_score_wrapper

In [5]:
# Point MLflow at a local file store; the same location is written to the
# environment variable and to the MLflow client configuration.
tracking_uri = './mlruns05_7'
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri
mlflow.set_tracking_uri(tracking_uri)

In [6]:
# Select the MLflow experiment that the runs started in this notebook are recorded under.
mlflow.set_experiment('Exercise05_7')

2025/06/12 18:34:32 INFO mlflow.tracking.fluent: Experiment with name 'Exercise05_7' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_05/mlruns05_7/500994595681244077', creation_time=1749740672239, experiment_id='500994595681244077', last_update_time=1749740672239, lifecycle_stage='active', name='Exercise05_7', tags={}>

In [7]:
# Seed the global RNGs used for weight initialisation and DataLoader shuffling;
# the cuDNN determinism flag alone does not make two runs repeatable.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Dataset and DataLoader

In [8]:
# The train and test splits share the same preprocessing (image -> tensor).
to_tensor = transforms.ToTensor()
train_data = torchvision.datasets.FashionMNIST("./data", train=True, transform=to_tensor, download=True)
test_data = torchvision.datasets.FashionMNIST("./data", train=False, transform=to_tensor, download=True)

# Hold out 10% of the training indices for validation, stratified on the labels
# so both index sets keep the class balance of the full training set.
all_indices = np.arange(len(train_data))
train_sub_set, valid_sub_set = train_test_split(
    all_indices,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=train_data.targets,
)
train_dataset = Subset(train_data, train_sub_set)
valid_dataset = Subset(train_data, valid_sub_set)

# Only the training loader shuffles; validation and test are read in order.
batch = 256
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch)
test_loader = DataLoader(test_data, batch_size=batch)

In [9]:
# Classification loss; it is applied to the model outputs and the integer labels.
loss_func = nn.CrossEntropyLoss()
# Additional metrics handed to train_network, keyed by metric name.
score_funcs = {"Accuracy": accuracy_score_wrapper}

In [10]:
# Run-level settings; the dict is logged to MLflow with the final training run.
epochs = 20
params = dict(
    device=device,
    loss_func=type(loss_func).__name__,
    epochs=epochs,
    batch_size=batch,
)

## Hyperparameter Tuning with Optuna

In [13]:
W = 28        # input image width
H = 28        # input image height
C = 1         # input channels
classes = 10  # number of output classes
filter = 16   # NOTE(review): not used by build_model and shadows the builtin `filter`; kept so existing references still resolve
K = 3         # convolution kernel size (odd, so padding=K//2 preserves the spatial size)


def build_model(num_conv_layers,
                num_pool_layers,
                num_hidden_layer=2,
                init_hidden_size=512,
                decay_factor=2,
                activation=nn.ReLU()):
    """Build a CNN classifier for ``(C, H, W)`` inputs.

    The network is a stack of ``Conv2d(kernel_size=K, padding=K // 2)`` +
    ``activation`` pairs, interleaved with up to ``num_pool_layers``
    ``MaxPool2d(2)`` layers, followed by a fully connected head.

    Args:
        num_conv_layers: Number of convolution layers. The first one outputs
            32 channels; the channel count doubles for convolutions that
            follow a pooling layer.
        num_pool_layers: Maximum number of 2x2 max-pool layers. They are
            spread evenly across the conv stack; fewer are inserted when
            there are not enough convolutions to host them.
        num_hidden_layer: Number of hidden ``Linear`` layers in the head.
            ``0`` maps the flattened features straight to ``classes``.
        init_hidden_size: Width of the first hidden ``Linear`` layer.
        decay_factor: Each subsequent hidden layer is ``decay_factor`` times
            narrower than the previous one, but never narrower than 10.
        activation: Activation module placed after every conv / hidden
            layer. The same instance is reused at every position.

    Returns:
        ``nn.Sequential`` holding the conv stack followed by the classifier
        head (itself an ``nn.Sequential`` starting with ``nn.Flatten``).

    Raises:
        ValueError: If pooling would shrink the feature map below 1x1.
    """
    conv_layers = []
    in_channels = C
    out_channels = 32
    if num_pool_layers:
        pool_interval = max(1, num_conv_layers // (num_pool_layers + 1))
    else:
        # An interval longer than the stack means "never pool".
        pool_interval = num_conv_layers + 1

    pools_applied = 0
    for i in range(num_conv_layers):
        conv_layers.append(nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=K,
            padding=K // 2))
        conv_layers.append(activation)
        in_channels = out_channels
        if (i + 1) % pool_interval == 0 and pools_applied < num_pool_layers:
            conv_layers.append(nn.MaxPool2d(kernel_size=2))
            pools_applied += 1
            out_channels *= 2

    # Size the head from the pools that were actually inserted: when there are
    # fewer convolutions than requested pools, not every pool gets placed.
    final_w = W // (2 ** pools_applied)
    final_h = H // (2 ** pools_applied)
    if final_w < 1 or final_h < 1:
        raise ValueError(
            f"{pools_applied} pooling layers reduce a {W}x{H} input to nothing")
    in_features = in_channels * final_w * final_h

    fc_layers = [nn.Flatten()]
    if num_hidden_layer == 0:
        # Classify directly from the flattened conv features.
        fc_layers.append(nn.Linear(in_features, classes))
    else:
        fc_layers.append(nn.Linear(in_features, init_hidden_size))
        fc_layers.append(activation)

        current_hidden_size = init_hidden_size
        # Remaining hidden layers shrink by `decay_factor`, floored at 10 units.
        for _ in range(1, num_hidden_layer):
            new_hidden_size = max(10, current_hidden_size // decay_factor)
            fc_layers.append(nn.Linear(current_hidden_size, new_hidden_size))
            fc_layers.append(activation)
            current_hidden_size = new_hidden_size

        fc_layers.append(nn.Linear(current_hidden_size, classes))

    classifier = nn.Sequential(*fc_layers)
    model = nn.Sequential(*conv_layers, classifier)
    return model

In [None]:
# Maps the activation name sampled by Optuna to the module passed to build_model.
activation_functions = {
    module_cls.__name__: module_cls()
    for module_cls in (nn.ReLU, nn.Tanh, nn.LeakyReLU, nn.Sigmoid)
}
# Restrict Optuna's logger to errors; progress is reported by champion_callback instead.
optuna.logging.set_verbosity(optuna.logging.ERROR)
def champion_callback(study, frozen_trial):
    """Optuna callback that reports when the study's best value changes.

    The best value seen so far is remembered in the study's user attribute
    ``'winner'``. The first call announces the initial trial; later calls
    print only when the best value differs from the remembered one, together
    with the relative change in percent of the new best value.

    Args:
        study: Study being optimised; read for ``user_attrs`` and
            ``best_value`` and updated through ``set_user_attr``.
        frozen_trial: Trial that has just finished; its ``number`` and
            ``value`` are printed.
    """
    previous_best = study.user_attrs.get('winner', None)
    if previous_best is None:
        print(f'Initial trial {frozen_trial.number} achived value: {frozen_trial.value}')
    else:
        current_best = study.best_value
        # A best value of zero is skipped to avoid dividing by zero below.
        if previous_best != current_best and current_best:
            change_pct = (abs(previous_best - current_best) / abs(current_best)) * 100
            print(f'Trial {frozen_trial.number} achived value: {frozen_trial.value} with {change_pct:.4f}% improvment')
    study.set_user_attr('winner', study.best_value)


def objective(trial):
    """Optuna objective: train one sampled CNN and return its validation accuracy.

    Samples the learning rate and the architecture, builds the model with
    ``build_model``, trains it with SGD inside its own MLflow run (the
    hyperparameters and a torchinfo summary are logged there), and stores the
    MLflow run id on the trial under the user attribute ``'mlflow_run_id'``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The last entry of the ``'valid Accuracy'`` column of the training results.
    """
    lr = trial.suggest_float('lr', 1e-4, 1, log=True)
    n_conv = trial.suggest_int("num_conv", 2, 10)
    n_pool = trial.suggest_int("num_pool", 0, 2)
    act_name = trial.suggest_categorical("activation", list(activation_functions))
    n_hidden = trial.suggest_int('fc_hidden_layer', 0, 5)
    hidden_size = trial.suggest_categorical('init_hidden_size', [64, 128, 256, 512])

    cnn_model = build_model(
        num_conv_layers=n_conv,
        num_pool_layers=n_pool,
        num_hidden_layer=n_hidden,
        init_hidden_size=hidden_size,
        activation=activation_functions[act_name],
    )

    with mlflow.start_run(nested=True, run_name=f'trial: {trial.number}') as run:
        # Remember which MLflow run belongs to this trial so the best one can be found later.
        trial.set_user_attr('mlflow_run_id', run.info.run_id)

        optimizer = torch.optim.SGD(cnn_model.parameters(), lr=lr)
        mlflow.log_params({
            'learning_rate': lr,
            'num_conv': n_conv,
            'num_pool': n_pool,
            'activation': act_name,
            'fc_hidden_layer': n_hidden,
            'init_hidden_size': hidden_size,
            'optimizer': optimizer.defaults,
        })

        # Keep a textual architecture summary next to the run's parameters.
        with open("model_summary.txt", "w") as summary_file:
            summary_file.write(str(summary(cnn_model, input_size=(batch, C, W, H))))
        mlflow.log_artifact("model_summary.txt")

        cnn_results = train_network(
            model=cnn_model,
            loss_func=loss_func,
            train_loader=train_loader,
            valid_loader=valid_loader,
            epochs=epochs,
            optimizer=optimizer,
            score_funcs=score_funcs,
            device=device,
            disable_tqdm=True,
        )

    final_valid_accuracy = cnn_results['valid Accuracy'].iloc[-1]
    return final_valid_accuracy

# Run the hyperparameter search, then tag the MLflow run of the best trial.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40, callbacks=[champion_callback])
champion_trial = study.best_trial
# `user_attrs` is a dict, not a method, and `objective` stores the run id
# under the key 'mlflow_run_id'.
champion_run_id = champion_trial.user_attrs.get('mlflow_run_id')
if champion_run_id:
    mlflow_client = mlflow.tracking.MlflowClient()
    mlflow_client.set_tag(champion_run_id, "is_champion", "true")
    mlflow_client.set_tag(champion_run_id, "champion_metric_value", str(champion_trial.value))
    mlflow_client.set_tag(champion_run_id, "optuna_trial_number", str(champion_trial.number))
    print(f"Champion trial: {champion_trial.number} with value {champion_trial.value}")
else:
    print("Error: Could not retrieve champion_run_id from champion_trial.user_attrs")

## Training with Selected Hyperparameters

In [None]:
# Retrain the best architecture once, using the hyperparameters that its
# Optuna trial logged to MLflow, and save a checkpoint for the test section.
run_id = champion_trial.user_attrs['mlflow_run_id']
run = mlflow.get_run(run_id)
parameters = run.data.params
# MLflow returns every logged parameter as a string, so cast back before use.
learning_rate = float(parameters['learning_rate'])
num_conv = int(parameters['num_conv'])
num_pool = int(parameters['num_pool'])
activation = parameters['activation']
fc_hidden_layer = int(parameters['fc_hidden_layer'])
init_hidden_size = int(parameters['init_hidden_size'])

cnn_model = build_model(
    num_conv_layers=num_conv,
    num_pool_layers=num_pool,
    num_hidden_layer=fc_hidden_layer,
    init_hidden_size=init_hidden_size,
    activation=activation_functions[activation],
)

# Build a fresh dict instead of mutating `params`, so re-running the cell is safe.
final_params = {
    **params,
    'learning_rate': learning_rate,
    'num_conv': num_conv,
    'num_pool': num_pool,
    'activation': activation,
    'fc_hidden_layer': fc_hidden_layer,
    'init_hidden_size': init_hidden_size,
}
with mlflow.start_run(nested=True, run_name='final_run'):

    optimizer = torch.optim.SGD(cnn_model.parameters(), lr=learning_rate)
    final_params['optimizer'] = optimizer.defaults
    mlflow.log_params(final_params)

    with open("model_summary.txt", "w") as f:
        f.write(str(summary(cnn_model, input_size=(batch, C, W, H))))
    mlflow.log_artifact("model_summary.txt")

    results = train_network(
        model=cnn_model,
        loss_func=loss_func,
        train_loader=train_loader,
        valid_loader=valid_loader,
        epochs=epochs,
        optimizer=optimizer,
        score_funcs=score_funcs,
        device=device,
        checkpoint_file_save='final_model.pth'
        )

## Testing

In [None]:
def load_model_from_mlflow(
    run_id, artifact_path, model, device
):
    """Restore a model from a checkpoint stored as an MLflow run artifact.

    Downloads ``runs:/<run_id>/<artifact_path>``, loads it with ``torch.load``
    and copies the ``'model_state_dict'`` entry into ``model``. The model is
    switched to eval mode and moved to ``device``.

    Args:
        run_id: MLflow run that owns the artifact.
        artifact_path: Path of the checkpoint inside the run's artifacts.
        model: Module whose architecture matches the stored state dict.
        device: Device used for ``map_location`` and for the returned model.

    Returns:
        Tuple ``(model, results, epoch)`` where ``results`` and ``epoch`` are
        the checkpoint's ``'results'`` and ``'epoch'`` entries.
    """
    local_file = mlflow.artifacts.download_artifacts(
        artifact_uri=f'runs:/{run_id}/{artifact_path}'
    )
    saved_state = torch.load(local_file, map_location=device)
    model.load_state_dict(saved_state['model_state_dict'])
    history, last_epoch = saved_state['results'], saved_state['epoch']
    model.eval()
    model.to(device)
    return model, history, last_epoch

In [None]:
# Locate the most recent run named 'final_run'; its checkpoint is the model under test.
final_runs = mlflow.search_runs(
    filter_string="tags.mlflow.runName = 'final_run'",
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if final_runs.empty:
    raise RuntimeError("No MLflow run named 'final_run' found; run the training section first.")
run_id = final_runs.iloc[0]['run_id']

# The architecture hyperparameters are the ones `objective` logged for the
# champion trial. MLflow stores params as strings, so cast before use.
champion_run = mlflow.get_run(champion_trial.user_attrs['mlflow_run_id'])
parameters = champion_run.data.params
cnn_model = build_model(
    num_conv_layers=int(parameters['num_conv']),
    num_pool_layers=int(parameters['num_pool']),
    num_hidden_layer=int(parameters['fc_hidden_layer']),
    init_hidden_size=int(parameters['init_hidden_size']),
    activation=activation_functions[parameters['activation']],
)

# NOTE(review): assumes train_network logs its checkpoint to the run under the
# file name passed as checkpoint_file_save — confirm against utils.train_network.
artifact_path = 'final_model.pth'
model, results, epoch = load_model_from_mlflow(
        run_id=run_id,
        artifact_path=artifact_path,
        model=cnn_model,
        device=device
    )

In [None]:
# Evaluate the restored model on the held-out test set.
score_funcs = {"Accuracy": accuracy_score_wrapper}
results = defaultdict(list)
model.to(device)
model.eval()
with torch.no_grad():  # inference only, no gradients needed
    running_loss = []
    y_true = []
    y_pred = []
    for inputs, labels in tqdm(test_loader, desc='tetsing', leave=False):
        inputs = inputs.to(device)
        labels = labels.to(device)

        y_hat = model(inputs)
        loss = loss_func(y_hat, labels)
        running_loss.append(loss.item())

        if score_funcs is not None:
            # Collect per-sample labels and raw model outputs for the metrics below.
            y_true.extend(labels.detach().cpu().numpy())
            y_pred.extend(y_hat.detach().cpu().numpy())

y_pred = np.asarray(y_pred)
y_true = np.asarray(y_true)

# Mean of the per-batch losses.
results['test loss'].append(np.mean(running_loss))
print(f"Loss = {results['test loss'][-1]}")

if score_funcs is not None and len(score_funcs) > 0:
    for score_name, score_func in score_funcs.items():
        # NOTE(review): argument order (y_pred, y_true) kept from the original cell —
        # confirm it matches the signature of utils.accuracy_score_wrapper.
        score = score_func(y_pred, y_true)
        results[f'test {score_name}'].append(score)
        print(f'{score_name} = {score}')