<a href="https://colab.research.google.com/github/shaymaa-nabil/AML/blob/main/notebooks/02c-model-evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Evaluation



In [None]:
%%bash

pip install --upgrade torchmetrics

In [2]:
import numpy as np
from sklearn import compose, datasets, model_selection, pipeline, preprocessing

import torch
from torch import nn, optim, utils
import torchmetrics

## Verifying availability of GPU(s)

In [3]:
# check that torch version has support for cuda
print(torch.__version__)

2.8.0+cu126


In [4]:
%%bash

# check that GPUs are physically available
nvidia-smi

Wed Nov 19 13:10:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
# check that PyTorch can find the GPUs
print(torch.cuda.is_available())

True


In [6]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
print(DEVICE)

cuda


## Loading the data

In [None]:
covtype_dataset = datasets.fetch_covtype(
    as_frame=True
)

In [None]:
print(covtype_dataset["DESCR"])

In [None]:
# Feature columns as fetched.
covtype_features_df = covtype_dataset["data"]

# Wrap the target in a one-column frame, matching the two-dimensional input
# shape the target pipeline's encoder is fitted on later.
covtype_target_df = covtype_dataset["target"].to_frame()

## Preparing the data

### Train/Val split

In [None]:
# One shared RandomState instance: it is consumed by the split below and later
# by the QuantileTransformer, so downstream results depend on cell run order.
RANDOM_STATE = np.random.RandomState(42)


# Stratified 80/20 split: the target is used for stratification so both
# subsets keep the class proportions of the full dataset.
(
    train_features_df,
    val_features_df,
    train_target_df,
    val_target_df,
) = model_selection.train_test_split(
    covtype_features_df,
    covtype_target_df,
    random_state=RANDOM_STATE,
    stratify=covtype_target_df,
    shuffle=True,
    test_size=0.20,
)


### Features and target preparation

In [None]:
def array_to_tensor(arr, dtype=torch.float32):
    """Copy array-like data into a new ``torch.Tensor`` of the requested dtype.

    Args:
        arr: Data accepted by ``torch.tensor`` (e.g. a NumPy array or nested list).
        dtype: Dtype of the resulting tensor; defaults to ``torch.float32``.

    Returns:
        A tensor holding the values of ``arr`` converted to ``dtype``.
    """
    converted = torch.tensor(arr, dtype=dtype)
    return converted


# Feature pipeline:
#   1. Columns whose names start with "Wilderness_Area_" or "Soil_Type_" are
#      passed through unchanged (presumably binary indicator columns -- confirm
#      against the dataset description).
#   2. Every remaining column is mapped to a normal distribution with a
#      QuantileTransformer.
#   3. The resulting array is converted to a float32 tensor (the default dtype
#      of ``array_to_tensor``).
prepare_covtype_features = pipeline.make_pipeline(
    compose.make_column_transformer(
        (
            "passthrough",
            compose.make_column_selector(
                pattern="^Wilderness_Area_|^Soil_Type_"
            )
        ),
        force_int_remainder_cols=False,
        n_jobs=-1,
        remainder=preprocessing.QuantileTransformer(
            output_distribution="normal",
            # same RandomState instance that was already used for the split
            random_state=RANDOM_STATE,
        )
    ),
    preprocessing.FunctionTransformer(
        func=array_to_tensor,
    )
)

# Target pipeline:
#   1. OrdinalEncoder with the explicit category list 1..7 encodes each label
#      as its position in that list (0..6).
#   2. The encoded column is converted to an int64 tensor.
#   3. ``torch.squeeze`` removes the size-1 dimensions, turning the single
#      column into a 1-D tensor of class indices.
prepare_covtype_target = pipeline.make_pipeline(
    preprocessing.OrdinalEncoder(
        categories=[
            [1, 2, 3, 4, 5, 6, 7]
        ],
    ),
    preprocessing.FunctionTransformer(
        func=array_to_tensor,
        kw_args={
            "dtype": torch.int64
        }
    ),
    preprocessing.FunctionTransformer(
        func=torch.squeeze,
    )
)



In [None]:
# Fit the feature pipeline on the training split only, then reuse the fitted
# transformers (without refitting) on the validation split.
X_train = prepare_covtype_features.fit_transform(train_features_df)
X_val = prepare_covtype_features.transform(val_features_df)


In [None]:
print(X_train.shape)
print(X_val.shape)

In [None]:
# Same pattern for the target: fit on the training labels, then apply the
# fitted pipeline to the validation labels.
y_train = prepare_covtype_target.fit_transform(train_target_df)
y_val = prepare_covtype_target.transform(val_target_df)


In [None]:
print(y_train.shape)
print(y_val.shape)

### Datasets

In [None]:
# Pair each row of the feature tensor with the label at the same index.
train_dataset = utils.data.TensorDataset(X_train, y_train)
val_dataset = utils.data.TensorDataset(X_val, y_val)

### DataLoaders

In [None]:
# Training loader: reshuffle every epoch and drop the trailing partial batch so
# that every optimisation step sees exactly `batch_size` samples.
train_data_loader = (
    utils.data
         .DataLoader(
             train_dataset,
             num_workers=2,
             batch_size=128,
             shuffle=True,
             persistent_workers=True,
             pin_memory=True,
             prefetch_factor=2,
             drop_last=True,
         )
)

# Validation loader: evaluation should see every validation sample, in the
# same order, on every pass. Shuffling combined with `drop_last=True` would
# silently discard a *different* random handful of samples on each pass and
# make the reported validation numbers non-deterministic, so both are disabled.
val_data_loader = (
    utils.data
         .DataLoader(
             val_dataset,
             num_workers=2,
             batch_size=128,
             shuffle=False,
             persistent_workers=True,
             pin_memory=True,
             prefetch_factor=2,
             drop_last=False,
         )
)

## Defining a training loop


In [None]:
def train(
    model_fn,
    criterion,
    optimizer,
    train_data_loader,
    n_epochs,
    log_epochs=1,
    ):
    """Run a plain training loop and print the mean batch loss per epoch.

    Args:
        model_fn: Module to optimise; it is switched to training mode once
            before the first epoch.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.
        optimizer: Optimizer that updates the parameters of ``model_fn``.
        train_data_loader: Sized iterable yielding ``(features, targets)``
            batches.
        n_epochs: Number of passes over ``train_data_loader``.
        log_epochs: Print the training loss every ``log_epochs`` epochs.

    Returns:
        None. Progress is reported via ``print``.
    """
    model_fn.train()
    for epoch in range(n_epochs):
        running_loss = 0.0
        for features, targets in train_data_loader:
            # batches are moved to the module-level DEVICE
            features = features.to(DEVICE, non_blocking=True)
            targets = targets.to(DEVICE, non_blocking=True)

            # forward pass
            batch_loss = criterion(model_fn(features), targets)
            running_loss += batch_loss.item()

            # backward pass, parameter update, then clear the gradients
            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # mean of the per-batch losses, not a per-sample mean
        average_loss = running_loss / len(train_data_loader)

        if (epoch + 1) % log_epochs != 0:
            continue
        print(f"Epoch {epoch + 1}/{n_epochs}, Training Loss: {average_loss: .4f}")


### Defining a model

In [None]:
# Seed PyTorch's global RNG so the layer initialisation below is repeatable.
_ = torch.manual_seed(42)

# Input width comes from the feature tensor, output width from the number of
# distinct labels in the training target.
n_features = X_train.size(1)
n_classes = y_train.unique().size(0)

# Fully connected classifier: n_features -> 200 -> 100 -> 50 -> n_classes with
# a ReLU after every hidden layer (nn.Linear uses a bias term by default).
# The model is moved to DEVICE right away: do this before defining the
# optimizer.
covtype_model = nn.Sequential(
    nn.Linear(n_features, 200),
    nn.ReLU(),
    nn.Linear(200, 100),
    nn.ReLU(),
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, n_classes),
).to(DEVICE)

### Defining a loss function and optimizer

In [None]:
# Cross-entropy between the model's raw class scores and the integer labels.
cross_entropy_loss = nn.CrossEntropyLoss()

# Plain SGD; created only after the model has been moved to DEVICE.
sgd = optim.SGD(covtype_model.parameters(), lr=1e-3)

### Training the model

In [None]:
# Ten epochs of SGD, logging the mean training loss after every epoch.
train(
    model_fn=covtype_model,
    criterion=cross_entropy_loss,
    optimizer=sgd,
    train_data_loader=train_data_loader,
    n_epochs=10,
    log_epochs=1,
)

## Evaluating trained model performance

In [None]:
def evaluate(model_fn, data_loader, metric_fn, aggregate_fn=torch.mean):
    """Score ``model_fn`` batch by batch and aggregate the per-batch results.

    Args:
        model_fn: Module to evaluate; it is switched to evaluation mode.
        data_loader: Iterable yielding ``(features, targets)`` batches.
        metric_fn: Callable ``metric_fn(predictions, targets)`` returning one
            tensor per batch.
        aggregate_fn: Reduction applied to the stacked per-batch tensors;
            defaults to ``torch.mean`` (a mean over batches, not over samples).

    Returns:
        The result of ``aggregate_fn`` applied to the stacked batch metrics.
    """
    model_fn.eval()
    with torch.no_grad():
        # batches are moved to the module-level DEVICE before the forward pass
        batch_metrics = [
            metric_fn(model_fn(features.to(DEVICE)), targets.to(DEVICE))
            for features, targets in data_loader
        ]
    stacked = torch.stack(batch_metrics)
    return aggregate_fn(stacked)


In [None]:
# Mean of the per-batch cross-entropy losses over the validation loader.
average_loss = evaluate(
    model_fn=covtype_model,
    data_loader=val_data_loader,
    metric_fn=cross_entropy_loss,
    aggregate_fn=torch.mean,
)
print(f"Validation Loss {average_loss: .4f}")

## Using Torchmetrics

[TorchMetrics](https://lightning.ai/docs/torchmetrics/stable/) is an open-source library designed to provide a comprehensive and standardized collection of machine learning metrics for PyTorch. It is developed by [Lightning AI](https://lightning.ai/docs/pytorch/stable/) and offers a wide range of functional and module-based metrics for evaluating model performance.

### Key features and benefits of TorchMetrics:

* **Extensive Metric Collection:** It offers over 100 pre-built metric implementations covering various domains like classification, regression, object detection, segmentation, and NLP. Examples include Accuracy, Precision, Recall, F1-Score, AUROC, RMSE, R², BLEU, and more.
* **Standardized Interface:** Provides a consistent API for metric computation, reducing boilerplate code and enhancing reproducibility across different projects and models.
* **Distributed Training Compatibility:** Metrics are designed to work seamlessly with distributed training setups, including PyTorch's DistributedDataParallel (DDP), ensuring correct and efficient metric aggregation across multiple devices.
* **Incremental Computation:** Metrics can be updated incrementally with new batches of data, which is crucial for handling large datasets that might not fit into memory and for efficient computation within training loops.
* **Custom Metric Creation:** Offers an easy-to-use API for creating custom metrics tailored to specific needs, allowing users to extend the library's functionality.
* **Integration with PyTorch Lightning:** While usable with native PyTorch, TorchMetrics has full integration with PyTorch Lightning, simplifying metric logging and management within Lightning's training and validation loops.
* **Performance Optimization:** Designed with performance in mind, minimizing synchronization points between CPU and GPU during metric collection to avoid performance bottlenecks.
* **Visualization Capabilities:** Includes features for quickly visualizing metric performance, aiding in model analysis and debugging.

In [None]:
def evaluate_tm(model_fn, data_loader, metric):
    """Evaluate ``model_fn`` with a stateful reset/update/compute metric.

    Args:
        model_fn: Module to evaluate; it is switched to evaluation mode.
        data_loader: Iterable yielding ``(features, targets)`` batches.
        metric: Object exposing ``reset()``, ``update(preds, target)`` and
            ``compute()``. Its state is cleared before the first batch.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were seen.
    """
    model_fn.eval()
    # start from a clean state so earlier updates do not leak into this result
    metric.reset()
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(DEVICE)
            targets = targets.to(DEVICE)
            # accumulate the batch into the metric's running state
            metric.update(model_fn(features), targets)
    result = metric.compute()
    return result

In [None]:
torchmetrics.Accuracy?

In [None]:
# Multiclass accuracy metric, placed on the same device as the model outputs.
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=n_classes).to(DEVICE)

# Accumulate the metric over the validation loader and report the result.
average_accuracy = evaluate_tm(
    model_fn=covtype_model,
    data_loader=val_data_loader,
    metric=accuracy,
)
print(f"Validation Accuracy {average_accuracy: .4f}")

## Modifying our training loop

In [None]:
def train(
    model_fn,
    criterion,
    optimizer,
    metric,
    train_data_loader,
    val_data_loader,
    n_epochs,
    log_epochs=1,
    ):
    """Train ``model_fn`` and record loss and ``metric`` on both splits per epoch.

    This definition replaces the simpler ``train`` defined earlier in the
    notebook.

    Args:
        model_fn: Module to optimise.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.
        optimizer: Optimizer that updates the parameters of ``model_fn``.
        metric: Stateful metric exposing ``reset()``, ``update(preds, target)``
            and ``compute()``; ``compute()`` must return an object with
            ``.item()``. The same instance is reused for the training and the
            validation pass (it is reset in between).
        train_data_loader: Sized iterable of ``(features, targets)`` batches
            used for optimisation.
        val_data_loader: Sized iterable of ``(features, targets)`` batches used
            for validation.
        n_epochs: Number of passes over ``train_data_loader``.
        log_epochs: Print a progress line every ``log_epochs`` epochs.

    Returns:
        dict with the keys ``"train_losses"``, ``"val_losses"``,
        ``"train_metrics"`` and ``"val_metrics"``; each maps to a list holding
        one Python float per epoch.
    """

    history = {
        "train_losses": [],
        "val_losses": [],
        "train_metrics": [],
        "val_metrics": [],
    }

    for epoch in range(n_epochs):
        # ---- training pass ----
        # The validation pass below switches to eval mode, so training mode is
        # re-enabled once at the start of every epoch (not once per batch).
        model_fn.train()
        metric.reset()
        total_train_loss = 0.0
        for X_batch, y_batch in train_data_loader:

            # move batches to device
            X_batch = X_batch.to(DEVICE, non_blocking=True)
            y_batch = y_batch.to(DEVICE, non_blocking=True)

            # forward pass
            y_pred = model_fn(X_batch)
            train_loss = criterion(y_pred, y_batch)
            total_train_loss += train_loss.item()

            # backward pass
            train_loss.backward()

            # gradient descent step
            optimizer.step()
            optimizer.zero_grad()

            # update our metric; the predictions are detached because the
            # metric only needs their values, not the autograd graph
            metric.update(y_pred.detach(), y_batch)

        # compute the average (across batches!) training loss
        average_train_loss = total_train_loss / len(train_data_loader)
        history["train_losses"].append(average_train_loss)

        # read the training metric now, before the metric is reset and reused
        # for the validation pass
        history["train_metrics"].append(metric.compute().item())

        # ---- validation pass ----
        # A single sweep over the validation data yields both the loss and the
        # metric, instead of iterating the loader twice per epoch.
        model_fn.eval()
        metric.reset()
        total_val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_data_loader:
                X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
                y_pred = model_fn(X_batch)
                total_val_loss += criterion(y_pred, y_batch).item()
                metric.update(y_pred, y_batch)

        # compute the average (across batches!) validation loss
        average_val_loss = total_val_loss / len(val_data_loader)
        history["val_losses"].append(average_val_loss)

        # stored as a plain float, like the training metric, so the history
        # does not hold on to device tensors
        history["val_metrics"].append(metric.compute().item())

        if (epoch + 1) % log_epochs == 0:
            print(f"Epoch {epoch + 1}/{n_epochs}, "
                  f"train loss: {history['train_losses'][-1]:.4f}, "
                  f"val loss: {history['val_losses'][-1]:.4f}, "
                  f"train metric: {history['train_metrics'][-1]:.4f}, "
                  f"val metric: {history['val_metrics'][-1]:.4f}"
            )

    return history



## Combining training and evaluation

In [None]:
# Re-seed so this fresh model starts from a repeatable initialisation.
_ = torch.manual_seed(42)

n_features = X_train.size(1)
n_classes = y_train.unique().size(0)

# Fully connected classifier: n_features -> 200 -> 100 -> 50 -> n_classes with
# a ReLU after every hidden layer (nn.Linear uses a bias term by default).
# move model to the GPU before defining your optimizer!
covtype_model = nn.Sequential(
    nn.Linear(n_features, 200),
    nn.ReLU(),
    nn.Linear(200, 100),
    nn.ReLU(),
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, n_classes),
).to(DEVICE)

cross_entropy_loss = nn.CrossEntropyLoss()

# optimizer should be defined after moving model to GPU
sgd = optim.SGD(covtype_model.parameters(), lr=1e-3)

# define metric (kept on the same device as the model outputs)
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=n_classes).to(DEVICE)

# Train for ten epochs; the returned history holds the per-epoch loss and
# metric values for the training and validation loaders.
history = train(
    model_fn=covtype_model,
    criterion=cross_entropy_loss,
    optimizer=sgd,
    metric=accuracy,
    train_data_loader=train_data_loader,
    val_data_loader=val_data_loader,
    n_epochs=10,
    log_epochs=1,
)