In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (e.g. the local `utils` helpers) are re-imported before each cell executes
# and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from IPython.display import set_matplotlib_formats
# Ask the inline backend to emit both PNG and PDF representations of figures.
# NOTE(review): the recorded cell output echoes this call, which looks like a
# deprecation warning. Newer IPython versions presumably expect
# `matplotlib_inline.backend_inline.set_matplotlib_formats` instead - confirm
# against the installed IPython version before switching.
set_matplotlib_formats('png', 'pdf')

  set_matplotlib_formats('png', 'pdf')


# Exercise 4

<img src='./images/04.png' width=800>

To address the original problem, I modified the `train_simple_network` function to save a version of the model every x epochs. Instead of saving models with different filenames locally, which could potentially fill the hard drive, I store the models in MLflow. By leveraging MLflow, I can keep track of models efficiently without worrying about storage limitations. This approach makes it easy to revisit specific versions of the model and analyze their performance.

In [None]:
import os
import mlflow
# Store MLflow tracking data in ./mlruns (relative to the working directory).
# The URI is exported through the environment and also set on the mlflow
# module directly.
tracking_uri = './mlruns'
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri
mlflow.set_tracking_uri(tracking_uri)

In [None]:
# Select the MLflow experiment used by the runs below; per the recorded log
# output it is created on first use. The returned Experiment is the cell output.
mlflow.set_experiment('Exercise_4')

2025/04/07 10:31:47 INFO mlflow.tracking.fluent: Experiment with name 'Exercise_4' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_02/mlruns/203838178005775626', creation_time=1744009307431, experiment_id='203838178005775626', last_update_time=1744009307431, lifecycle_stage='active', name='Exercise_4', tags={}>

In [None]:
from sklearn.datasets import make_moons
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import optuna
import torch.nn as nn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from utils import (train_network, accuracy_score_wrapper, 
                f1_score_wrapper, roc_auc_score_micro_wrapper, 
                weight_reset, set_seed)
from torchinfo import summary
from mlflow import MlflowClient
from mlflow.types import Schema, TensorSpec
from mlflow.models import ModelSignature


# One seed drives both the helper from `utils` (presumably seeding the RNGs
# used during training - confirm in utils.set_seed) and the data generation.
random_state = 42
set_seed(random_state)

# Noisy two-moons data: a large training split and a small validation split,
# generated with identical noise level and seed.
moons_options = {'noise': 0.4, 'random_state': random_state}
X_train, Y_train = make_moons(n_samples=8000, **moons_options)
X_valid, Y_valid = make_moons(n_samples=200, **moons_options)

# Features become float32 tensors and labels int64 class indices, the target
# format used with nn.CrossEntropyLoss.
train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(Y_train, dtype=torch.long),
)
valid_dataset = TensorDataset(
    torch.tensor(X_valid, dtype=torch.float32),
    torch.tensor(Y_valid, dtype=torch.long),
)


def plot_results(data_df, close=True):
    """Plot validation AUC against epoch and return the figure.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Training history containing the columns ``'epoch'`` and ``'valid AUC'``.
    close : bool, default True
        When true, the figure is closed before it is returned. The returned
        Figure object can still be passed on (e.g. to ``mlflow.log_figure``).

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the line plot.
    """
    # Draw on a dedicated figure/axes instead of whatever figure happens to be
    # current, so leftovers from earlier plotting calls cannot leak into this
    # plot and only this figure is closed afterwards.
    fig, ax = plt.subplots()
    sns.lineplot(data=data_df, x='epoch', y='valid AUC', label='valid AUC', ax=ax)
    ax.set_xlabel('epoch')
    ax.set_ylabel('valid AUC')
    ax.set_title('valid AUC')
    if close:
        plt.close(fig)
    return fig
    
# MLflow run whose logged hyper-parameters are reused here
# (best validation AUC from the previous exercise).
run_id = '219aed50bb7f4f8cbe70429ade80c1a5'

client = MlflowClient()
run_data = client.get_run(run_id).data

# Logged parameters come back from MLflow as strings, hence the int()/float()
# conversions wherever they are consumed below.
params = run_data.params
epochs = 30
in_features = 2
out_features = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
params['device'] = device
params['epochs'] = epochs
loss_func = nn.CrossEntropyLoss()

# Lookup from the activation name stored in the run to a module instance.
activation_functions = {
    'ReLU': nn.ReLU(),
    'Tanh': nn.Tanh(),
    'LeakyReLU': nn.LeakyReLU(),
    'Sigmoid': nn.Sigmoid(),
}

hidden_neurons = int(params['hidden_neurons'])
activation = activation_functions[params['activation']]

# Input layer, then `layers` hidden blocks, then the output layer. The same
# activation instance is reused after every linear layer. Linear layers are
# created in network order so seeded weight initialisation stays reproducible.
sequential_layer = [nn.Linear(in_features, hidden_neurons), activation]
for _ in range(int(params['layers'])):
    sequential_layer += [nn.Linear(hidden_neurons, hidden_neurons), activation]
sequential_layer.append(nn.Linear(hidden_neurons, out_features))

model = nn.Sequential(*sequential_layer)
optimizer = torch.optim.SGD(model.parameters(), lr=float(params['learning_rate']))

batch_size = int(params['batch_size'])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)

# Train inside a single MLflow run so that the parameters, the architecture
# summary, the trained model and the validation curve are stored together.
with mlflow.start_run(nested=True, run_name='save_every_x_epochs'):
    mlflow.log_params(params)

    # The torchinfo summary may contain non-ASCII table characters, so the
    # encoding is pinned rather than left to the platform default.
    with open("model_summary.txt", "w", encoding="utf-8") as f:
        f.write(str(summary(model)))
    mlflow.log_artifact("model_summary.txt")

    # `train_network` lives in utils; the two checkpoint arguments presumably
    # make it write a checkpoint every 10 epochs - confirm in utils.
    fc_results = train_network(
        model=model,
        loss_func=loss_func,
        train_loader=train_dataloader,
        valid_loader=valid_dataloader,
        epochs=epochs,
        optimizer=optimizer,
        score_funcs={'Acc': accuracy_score_wrapper, 'F1': f1_score_wrapper, 'AUC': roc_auc_score_micro_wrapper},
        device=device,
        checkpoint_file_save='model.pth',
        checkpoint_every_x=10
    )

    # Model signature: float32 batches of shape (batch, in_features) in and
    # (batch, out_features) out. The sizes are taken from the variables that
    # built the network so the signature cannot drift from the architecture.
    input_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, in_features))])
    output_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, out_features))])
    signature = ModelSignature(inputs=input_schema, outputs=output_schema)
    mlflow.pytorch.log_model(model, "model", signature=signature)
    mlflow.log_figure(plot_results(fc_results), "valid_AUC.png")

Epoch: 100%|██████████| 30/30 [00:47<00:00,  1.57s/it]


### Without MLflow:

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn.functional as F
import torch.optim as optim
from tqdm.autonotebook import tqdm
import time
from collections import defaultdict
import pandas as pd
from utils import run_epoch

def resum_better_network(model,
                        optimizer,
                        loss_func,
                        train_loader,
                        val_loader=None,
                        epochs=50,
                        device='cpu',
                        score_funcs=None,
                        checkpoint_file_load=None,
                        checkpoint_file_save=None,
                        lr=0.001,
                        checkpoint_every_x=None,
                        ):
    """Train ``model``, optionally resuming from and writing checkpoints.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; moved to ``device`` first.
    optimizer : callable
        Optimizer factory (e.g. ``torch.optim.SGD``), called as
        ``optimizer(model.parameters(), lr=lr)``.
    loss_func : callable
        Loss function, forwarded to ``run_epoch``.
    train_loader : iterable
        Batches used for the training pass of every epoch.
    val_loader : iterable, optional
        Batches used for the validation pass; skipped when ``None``.
    epochs : int
        Exclusive upper bound of the epoch index. When resuming, training
        continues from the checkpoint's epoch + 1 up to this value.
    device : str or torch.device
        Device the model is trained on.
    score_funcs : optional
        Forwarded unchanged to ``run_epoch``.
    checkpoint_file_load : str, optional
        Checkpoint to resume from. It must contain the keys written by this
        function (``results``, ``epoch``, ``model_state_dict``,
        ``optimizer_state_dict``).
    checkpoint_file_save : str, optional
        Path whose extension is stripped to build checkpoint names:
        ``<stem>_<epoch>.pth`` for periodic checkpoints and
        ``<stem>_final_(<epoch>).pth`` for the final one.
    lr : float
        Learning rate handed to the optimizer factory.
    checkpoint_every_x : int, optional
        Write a periodic checkpoint after every ``x`` completed epochs.

    Returns
    -------
    pandas.DataFrame
        The accumulated ``results`` dictionary as a data frame.

    Raises
    ------
    ValueError
        If ``checkpoint_every_x`` is set without ``checkpoint_file_save``.
    """
    import os  # local import: only needed to derive checkpoint file names

    if checkpoint_every_x and checkpoint_file_save is None:
        # Fail before training rather than at the first checkpoint epoch.
        raise ValueError(
            "checkpoint_every_x requires checkpoint_file_save to be set")

    model.to(device)
    optimizer = optimizer(model.parameters(), lr=lr)

    # splitext strips only the extension, so paths with dots in directory
    # names (e.g. './model.pth') keep their stem intact.
    save_stem = None
    if checkpoint_file_save is not None:
        save_stem = os.path.splitext(checkpoint_file_save)[0]

    if checkpoint_file_load:
        # NOTE: weights_only=False unpickles arbitrary objects (needed for the
        # stored `results` dict); only load checkpoints from trusted sources.
        checkpoint = torch.load(checkpoint_file_load,
                                map_location=device,
                                weights_only=False)

        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        results = checkpoint['results']
        total_train_time = results['total time'][-1]
    else:
        results = defaultdict(list)
        start_epoch = 0
        total_train_time = 0

    def _save_checkpoint(path, finished_epoch):
        # Persist everything required to resume training later.
        torch.save(
            {
                'results': results,
                'epoch': finished_epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            },
            path)

    # Index of the last completed epoch; stays valid even if the loop below
    # does not run (e.g. the checkpoint already reached `epochs`).
    last_epoch = start_epoch - 1

    for epoch in tqdm(range(start_epoch, epochs), desc='Epoch'):
        model.train()

        total_train_time += run_epoch(model,
                                    optimizer,
                                    train_loader,
                                    loss_func,
                                    device,
                                    results,
                                    score_funcs,
                                    prefix='train',
                                    desc='training',
                                    )

        results['total time'].append(total_train_time)
        results['epoch'].append(epoch)

        if val_loader is not None:
            model.eval()
            with torch.no_grad():
                # Validation metrics must come from the validation data.
                run_epoch(model,
                        optimizer,
                        val_loader,
                        loss_func,
                        device,
                        results,
                        score_funcs,
                        prefix='valid',
                        desc='validating',
                        )

        last_epoch = epoch

        if checkpoint_every_x and (epoch + 1) % checkpoint_every_x == 0:
            _save_checkpoint(f'{save_stem}_{epoch + 1}.pth', epoch)

    if save_stem is not None:
        _save_checkpoint(f'{save_stem}_final_({last_epoch + 1}).pth', last_epoch)
    return pd.DataFrame.from_dict(results)