# Introduction

This notebook demonstrates how to train and tune a [convolutional neural network](https://en.wikipedia.org/wiki/Convolutional_neural_network) using [PyTorch](https://pytorch.org/) to classify images as dogs or cats.

# Imports

In [None]:
from __future__ import annotations

import csv
import functools
import glob
import itertools
import os
from collections import defaultdict
from typing import Any

import albumentations as A
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from PIL import Image
import torch
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Optimizer
from torch.utils import data
from tqdm import tqdm
from rich import print


import warnings
warnings.filterwarnings("ignore")

# Load rich extension for pretty output
%load_ext rich

# Configuration

These are some basic configuration parameters that will be used throughout the notebook.

In [None]:
# Root directory holding the extracted images and the generated CSV files.
DATA_DIR = "./data"
# Target image size; also the input size the model uses to probe its layer shapes.
IMG_HEIGHT = IMG_WIDTH = 200
# Maximum number of epochs per training run.
EPOCHS = 20
# Worker-process count and pinned-memory flag handed to every DataLoader.
NUM_WORKERS = 4
PIN_MEMORY = True
# Integer class ids written to the training CSV.
LABEL_MAP = {"cat": 0, "dog": 1}
# NOTE(review): not referenced anywhere else in the visible notebook.
FOLDS = 5
# T_0 argument for the CosineAnnealingWarmRestarts scheduler.
COSINE_ANNEALING_T0 = 10
# Presumably the early-stopping patience in epochs — confirm against train().
EARLY_STOPPING_ROUNDS = 5
# Output directories: saved weights / loss curve, and prediction CSVs.
MODEL_DIR = "./models"
INFERENCE_DIR = "./inferences"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Create the output directories for saved models and for inference results.
# `exist_ok=True` makes the cell idempotent and avoids the check-then-create
# race; `makedirs` also creates any missing parent directories.
for directory in (MODEL_DIR, INFERENCE_DIR):
    os.makedirs(directory, exist_ok=True)

# Unzip Data and Resize Images

In [None]:
import os
from zipfile import ZipFile

# One-time data preparation: everything below runs only when DATA_DIR does not
# exist yet, so re-running the cell neither extracts nor resizes again.
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

    # Extract both competition archives into DATA_DIR.
    for archive in (
        "../input/dogs-vs-cats/test1.zip",
        "../input/dogs-vs-cats/train.zip",
    ):
        with ZipFile(archive) as zipf:
            zipf.extractall(DATA_DIR)

    train_path = os.path.join(DATA_DIR, "train", "*.jpg")
    test_path = os.path.join(DATA_DIR, "test1", "*.jpg")
    for img_path in itertools.chain(glob.glob(train_path), glob.glob(test_path)):
        # The context manager releases the file handle for every image instead
        # of leaving thousands of handles to the garbage collector.
        with Image.open(img_path) as img:
            if img.height == IMG_HEIGHT and img.width == IMG_WIDTH:
                continue
            resized = img.resize((IMG_WIDTH, IMG_HEIGHT), resample=Image.BILINEAR)

        # Overwrite the original file only after its handle has been closed.
        resized.save(img_path)


# Dataset And Data Functions

This section defines the `Dataset` that will be used by the model and some additional data functions.

The `CatsDogsDataset` class implements a PyTorch Dataset to handle data loading.

In [None]:
class CatsDogsDataset(data.Dataset):
    """Dataset that reads image paths (and optionally labels) from a CSV file.

    The first CSV column is used as the image path and, when ``labels`` is
    true, the second column as the numeric label.

    Args:
        csv: Path of the CSV file to load.
        transform: Optional albumentations pipeline; it is called as
            ``transform(image=img)["image"]`` on the ``(H, W, C)`` array.
        labels: Whether each sample should also contain a ``"label"`` entry.
    """

    def __init__(
        self,
        csv: str,
        transform: "A.Compose | None" = None,
        labels: bool = True,
    ):
        self.df: pd.DataFrame = pd.read_csv(csv)
        self.transform = transform
        self.labels = labels

    def __len__(self) -> int:
        """Return the number of rows in the CSV."""
        return len(self.df)

    @staticmethod
    def _to_channels_first(img: np.ndarray) -> torch.Tensor:
        """Convert an ``(H, W, C)`` array into a float32 ``(C, H, W)`` tensor."""
        # moveaxis only relocates the channel axis. The previous
        # swapaxes(-1, 0) produced (C, W, H), i.e. a transposed image.
        chw = np.ascontiguousarray(np.moveaxis(img, -1, 0))
        return torch.tensor(chw, dtype=torch.float32)

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        """Load one sample.

        Returns:
            A dict with ``"image"`` (float32, channels first) and, when labels
            are enabled, ``"label"`` (float32 scalar).

        Raises:
            TypeError: If ``idx`` is a slice.
        """
        if isinstance(idx, slice):
            raise TypeError(f"{self.__class__.__name__} doesn't support slicing.")

        img_path = self.df.iloc[idx, 0]

        # Close the file as soon as the pixels are read, and force three
        # channels so grayscale / CMYK / RGBA files do not break the pipeline.
        with Image.open(img_path) as pil_img:
            img = np.array(pil_img.convert("RGB"))

        # Apply transformations
        if self.transform is not None:
            img = self.transform(image=img)["image"]

        sample = {"image": self._to_channels_first(img)}

        # Add label to sample if required
        if self.labels is True:
            label = self.df.iloc[idx, 1]
            sample["label"] = torch.tensor(label, dtype=torch.float32)

        return sample

The `dir_to_csv()` function takes either the train or test directories and makes a CSV file out of them.

In [None]:
def dir_to_csv(dir_name: str, dest: str, has_labels: bool = True) -> None:
    """Write a CSV index of the ``*.jpg`` files in ``DATA_DIR/dir_name``.

    The CSV is written to ``DATA_DIR/dest`` with a ``filename`` column and,
    when ``has_labels`` is true, a ``label`` column taken from ``LABEL_MAP``.

    Args:
        dir_name: Sub-directory of ``DATA_DIR`` that contains the images.
        dest: Name of the CSV file to create inside ``DATA_DIR``.
        has_labels: Whether to derive a label from each file name.
    """
    pattern = os.path.join(DATA_DIR, dir_name, "*.jpg")
    target = os.path.join(DATA_DIR, dest)

    # glob returns files in arbitrary order; sorting keeps the CSV (and any
    # seeded split built on top of it) reproducible.
    filenames = sorted(glob.glob(pattern))

    fieldnames = ["filename"]

    if has_labels is True:
        fieldnames.append("label")
        # Inspect only the base name: a directory whose name contains "cat"
        # must not turn every image into a cat.
        rows = (
            {
                "filename": filename,
                "label": LABEL_MAP[
                    "cat" if "cat" in os.path.basename(filename) else "dog"
                ],
            }
            for filename in filenames
        )
    else:
        rows = ({"filename": filename} for filename in filenames)

    # newline="" is what the csv module requires to avoid blank rows on
    # platforms that translate line endings.
    with open(target, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

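The `order_test_data()` function orders the rows of the test CSV by the numeric ID in each image's filename (for example, `12.jpg` has ID `12`) and overwrites the file.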

In [None]:
def order_test_data(csv_path: str) -> None:
    """Sort ``DATA_DIR/csv_path`` by the numeric ID in each file name.

    Every entry of the ``filename`` column must have an integer base name
    (such as ``12.jpg``). The sorted table overwrites the original CSV.

    Args:
        csv_path: Name of the CSV file inside ``DATA_DIR``.
    """
    full_path = os.path.join(DATA_DIR, csv_path)
    frame: pd.DataFrame = pd.read_csv(full_path)

    # "some/dir/12.jpg" -> "12.jpg" -> "12" -> 12
    numeric_ids = [
        int(os.path.splitext(os.path.basename(entry))[0])
        for entry in frame["filename"]
    ]

    # Sort on a temporary helper column, then drop it before saving.
    ordered = (
        frame.assign(target_name=numeric_ids)
        .sort_values(by=["target_name"])
        .drop("target_name", axis=1)
    )

    # Overwrite existing file
    ordered.to_csv(full_path, index=False)

The `get_transforms()` function returns an `albumentations.Compose` object with the transformations that should be applied on the images.

In [None]:
def get_transforms() -> A.Compose:
    """Build the albumentations pipeline used for the training dataset.

    Returns:
        A pipeline of a horizontal flip (p=0.4), a brightness/contrast change
        (p=0.2) and a normalization step that is always applied.
    """
    flip = A.HorizontalFlip(p=0.4)
    brightness_contrast = A.RandomBrightnessContrast(p=0.2)
    normalize = A.Normalize(always_apply=True)

    return A.Compose([flip, brightness_contrast, normalize])

# Model Class And Functions

The `CatsDogsModel` class implements a PyTorch model.

It takes the following inputs:

* `in_channels`: Number of channels in the input images.
* `conv_out_channels`: List of output channels for the convolutional layers. Thus, the model automatically adds as many convolutional layers as the length of the list with the specified number of output channels. Each convolutional layer is also followed by batch normalization, `ReLU` activation and max pooling of pool size `(2, 2)`.
* `kernel_sizes`: List of kernel sizes for the convolutional layers. This should have the same length as the above list.
* `linear_out_features`: Similar to `conv_out_channels` but for the linear layers, except the output layer. The model will have a total of one more linear layer than the length of this list. Each linear layer is followed by a `ReLU` activation.
* `n_targets`: Number of output features for the output linear layer.
* `dropout`: Dropout probability for the optional dropout layer between the convolutional and linear parts. When `0.0`, no dropout layer is added to the model.
* `op_activation`: Activation function for output layer. Can be `sigmoid` or `softmax`.

Defining the model this way allows for convenient tuning of these values using some hyperparameter tuning library.

In [None]:
class CatsDogsModel(nn.Module):
    """Configurable CNN: conv blocks -> flatten -> dropout -> linear blocks -> output.

    Args:
        conv_out_channels: Output channels of each convolutional block.
        kernel_sizes: Square kernel size of each convolutional block; must
            have the same length as ``conv_out_channels``.
        linear_out_features: Output features of each hidden linear block.
        n_targets: Number of output units.
        in_channels: Number of channels of the input images.
        dropout: Dropout probability after flattening; no dropout layer is
            added when it is ``0.0``.
        op_activation: Key of ``op_activs`` selecting the output activation.

    Raises:
        ValueError: If ``conv_out_channels`` and ``kernel_sizes`` differ in length.
        KeyError: If ``op_activation`` is not a key of ``op_activs``.
    """

    op_activs = {
        "sigmoid": nn.Sigmoid,
        "softmax": nn.Softmax,
    }

    def __init__(
        self,
        conv_out_channels: list[int],
        kernel_sizes: list[int],
        linear_out_features: list[int],
        n_targets: int = 1,
        in_channels: int = 3,
        dropout: float = 0.0,
        op_activation: str = "sigmoid",
    ) -> None:
        super().__init__()

        # zip() would silently drop the surplus entries of the longer list.
        if len(conv_out_channels) != len(kernel_sizes):
            raise ValueError(
                "conv_out_channels and kernel_sizes must have the same length, "
                f"got {len(conv_out_channels)} and {len(kernel_sizes)}."
            )

        layers = []
        in_c = in_channels

        for out_channels, kernel_size in zip(conv_out_channels, kernel_sizes):
            conv_block = self._make_conv_block(
                in_channels=in_c,
                out_channels=out_channels,
                kernel_size=(kernel_size, kernel_size),
            )
            layers.append(conv_block)
            in_c = out_channels

        layers.append(nn.Flatten())

        if dropout > 0.0:
            layers.append(nn.Dropout(p=dropout))

        in_features = self._get_linear_in_features(layers, in_channels)

        for out_features in linear_out_features:
            linear_block = self._make_linear_block(
                in_features=in_features, out_features=out_features
            )
            layers.append(linear_block)
            in_features = out_features

        layers.append(nn.Linear(in_features, n_targets))
        layers.append(self._make_output_activation(op_activation))
        self.layers = nn.Sequential(*layers)

    @classmethod
    def _make_output_activation(cls, name: str) -> nn.Module:
        """Instantiate the output activation registered under ``name``."""
        activation_cls = cls.op_activs[name]
        if activation_cls is nn.Softmax:
            # Normalize over the class dimension explicitly; relying on the
            # implicit dimension is deprecated and emits a warning.
            return nn.Softmax(dim=1)
        return activation_cls()

    def _make_conv_block(
        self, in_channels: int, out_channels: int, kernel_size: tuple[int, int]
    ) -> nn.Sequential:
        """Return a Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2, 2) block."""
        return nn.Sequential(
            # No conv bias: the following batch norm has its own shift term.
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2)),
        )

    def _get_linear_in_features(self, layers: list[nn.Module], in_channels: int = 3) -> int:
        """
        Automatically calculate the in_features for the linear layer
        Immediately after the final convolutional layer.

        A dummy ``IMG_HEIGHT`` x ``IMG_WIDTH`` batch is pushed through
        ``layers`` and the size of the flattened output is returned.
        """
        probe = nn.Sequential(*layers)
        probe.to(DEVICE)

        # Probe in eval mode and without autograd. In training mode the random
        # input would leak into the BatchNorm running statistics.
        probe.eval()
        with torch.no_grad():
            x = torch.rand(
                1, in_channels, IMG_HEIGHT, IMG_WIDTH, dtype=torch.float32, device=DEVICE
            )
            n_features = probe(x).size(-1)

        # The layers are shared with the final model: restore training mode.
        probe.train()

        return n_features

    def _make_linear_block(self, in_features: int, out_features: int) -> nn.Sequential:
        """Return a Linear -> ReLU block."""
        return nn.Sequential(nn.Linear(in_features, out_features), nn.ReLU())

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """Run the network and squeeze dimension 1 of the output."""
        return self.layers(image).squeeze(1)

For example, to define a model with:

* RGB input.
* Four convolutional layers with output channels `16`, `32`, `32` and `64` and kernel sizes `3`, `3`, `5` and `5`.
* Three linear layers with output features `64`, `32` and `16`.
* A single Sigmoid output unit.
* Dropout of `0.2`.

You can do the following:

In [None]:
# Build the example network described above; `op_activation` is left at its
# default ("sigmoid").
m = CatsDogsModel(
    conv_out_channels=[16, 32, 32, 64],
    kernel_sizes=[3, 3, 5, 5],
    linear_out_features=[64, 32, 16],
    n_targets=1,
    in_channels=3,
    dropout=0.2,
)
# A bare name as the last expression makes the notebook display the module.
m

In [None]:
# Push one random image-sized batch through the example model as a sanity check.
x = torch.rand(1, 3, IMG_HEIGHT, IMG_WIDTH, dtype=torch.float32, device=DEVICE)
m.to(DEVICE)
# NOTE(review): the model was never switched to eval mode, so dropout is
# active here and the displayed output varies between runs.
m(x)

The `get_model_params()` function takes a dictionary of parameters and extracts all parameters that are relevant to create a model, storing them in the appropriate format.

In [None]:
def get_model_params(params: dict[str, Any]) -> dict[str, Any]:
    """Translate a flat parameter dict into ``CatsDogsModel`` keyword arguments.

    Args:
        params: Flat mapping containing ``conv1_out`` to ``conv4_out``,
            ``kernel_size1``, ``kernel_size2``, ``linear1_out`` to
            ``linear3_out`` and ``dropout``. Other keys are ignored.

    Returns:
        A dict with ``conv_out_channels``, ``kernel_sizes``,
        ``linear_out_features`` and ``dropout``.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    conv_keys = ("conv1_out", "conv2_out", "conv3_out", "conv4_out")
    linear_keys = ("linear1_out", "linear2_out", "linear3_out")

    conv_channels = [params[key] for key in conv_keys]

    # The first kernel size serves conv blocks 1-2, the second blocks 3-4.
    first_kernel, second_kernel = params["kernel_size1"], params["kernel_size2"]

    linear_features = [params[key] for key in linear_keys]

    return {
        "conv_out_channels": conv_channels,
        "kernel_sizes": [first_kernel, first_kernel, second_kernel, second_kernel],
        "linear_out_features": linear_features,
        "dropout": params["dropout"],
    }

# Engine Class And Training Functions

The `Engine` class handles training the model.

In [None]:
class Engine:
    """Bundle a model, an optimizer and an optional LR scheduler.

    The per-epoch average losses are appended to ``self.history`` under the
    keys ``"train_loss"`` and ``"val_loss"``.

    Args:
        model: Network to train or evaluate; it is called as ``model(image=...)``.
        optimizer: Optimizer that updates the model parameters.
        scheduler: Optional scheduler that is stepped after every batch.
    """

    def __init__(
        self, model: nn.Module, optimizer: Optimizer, scheduler: _LRScheduler = None
    ) -> None:
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.history = defaultdict(list)

    @staticmethod
    def loss_fn(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
        """Return the mean binary cross-entropy of probabilities vs. targets."""
        # The functional form avoids building a new nn.BCELoss module per batch.
        return nn.functional.binary_cross_entropy(y_pred, y_true)

    @staticmethod
    def correct_and_accuracy(y_pred: torch.Tensor, y_true: torch.Tensor) -> tuple[int, float]:
        """Count predictions that match the targets after thresholding at 0.5.

        Returns:
            ``(correct, accuracy)`` as plain Python numbers; the accuracy of
            an empty batch is ``0.0``.
        """
        n_samples = len(y_pred)
        hits = (y_true == (y_pred > 0.5).float()).sum()
        correct = int(hits.item())
        accuracy = correct / n_samples if n_samples else 0.0
        return correct, accuracy

    def save_loss_curve(self, filename: str, best_iter: int = None) -> None:
        """Plot the recorded losses and save the figure in ``MODEL_DIR``.

        Args:
            filename: File name of the image inside ``MODEL_DIR``.
            best_iter: Optional epoch index marked with a dashed vertical line.
        """
        train_loss = self.history["train_loss"]
        val_loss = self.history["val_loss"]

        print(train_loss, val_loss)

        fig = plt.figure()
        plt.plot(train_loss, label="Training")
        plt.plot(val_loss, label="Validation")

        if best_iter is not None:
            plt.axvline(best_iter, color="black", linestyle="--")

        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Loss Curve")
        plt.legend()

        path = os.path.join(MODEL_DIR, filename)
        plt.savefig(path)
        # Close the figure: clearing it alone keeps it registered with pyplot,
        # so figures would pile up across calls.
        plt.close(fig)

    def save_model(self, filename: str) -> None:
        """Save the model's ``state_dict`` to ``MODEL_DIR/filename``."""
        path = os.path.join(MODEL_DIR, filename)
        torch.save(self.model.state_dict(), path)

    def _train_one_step(
        self, data: dict[str, torch.Tensor]
    ) -> tuple[torch.Tensor, int, float]:
        """Run forward, backward and optimizer step for one batch."""
        batch = {key: value.to(DEVICE) for key, value in data.items()}

        preds = self.model(image=batch["image"])

        true = batch["label"]
        loss = self.loss_fn(preds, true)

        loss.backward()
        self.optimizer.step()

        correct, accuracy = self.correct_and_accuracy(preds, true)

        return loss, correct, accuracy

    def train(
        self, data_loader: data.DataLoader, epoch_num: int = 0
    ) -> tuple[float, int]:
        """Train the model for one epoch.

        Args:
            data_loader: Loader that yields dicts with ``"image"`` and ``"label"``.
            epoch_num: Zero-based epoch index, used to compute the fractional
                epoch passed to the scheduler.

        Returns:
            ``(average batch loss, number of correct predictions)``.

        Raises:
            ValueError: If ``data_loader`` yields no batches.
        """
        self.model.train()

        optimizer = self.optimizer
        scheduler = self.scheduler

        total_loss = 0.0
        total_correct = 0
        n_batches = 0
        iters = len(data_loader)

        with tqdm(data_loader, unit="batch", desc="Training") as p_loader:
            for batch_index, batch in enumerate(p_loader):
                optimizer.zero_grad(set_to_none=True)

                loss, correct, accuracy = self._train_one_step(data=batch)

                total_correct += correct
                loss = loss.item()
                accuracy_per = accuracy * 100

                postfix = {"loss": loss, "batch acc": f"{accuracy_per: .2f}%"}

                postfix["lr"] = (
                    scheduler.get_last_lr()[0]
                    if scheduler is not None
                    else optimizer.param_groups[0]["lr"]
                )

                p_loader.set_postfix(postfix)

                total_loss += loss
                n_batches += 1

                if scheduler is not None:
                    # Fractional epoch so the schedule advances within an epoch.
                    # NOTE(review): assumes the scheduler accepts an epoch
                    # argument, as CosineAnnealingWarmRestarts does.
                    scheduler.step(epoch_num + batch_index / iters)

        if n_batches == 0:
            raise ValueError("data_loader yielded no batches; cannot train.")

        avg_loss = total_loss / n_batches
        self.history["train_loss"].append(avg_loss)
        return avg_loss, total_correct

    def _evaluate_one_step(
        self, data: dict[str, torch.Tensor]
    ) -> tuple[torch.Tensor, int, float]:
        """Run the forward pass and compute the metrics for one batch."""
        batch = {key: value.to(DEVICE) for key, value in data.items()}

        preds = self.model(image=batch["image"])

        true = batch["label"]
        loss = self.loss_fn(preds, true)
        correct, accuracy = self.correct_and_accuracy(preds, true)

        return loss, correct, accuracy

    def evaluate(self, data_loader: data.DataLoader) -> tuple[float, int]:
        """Evaluate the model on ``data_loader`` without tracking gradients.

        Returns:
            ``(average batch loss, number of correct predictions)``.

        Raises:
            ValueError: If ``data_loader`` yields no batches.
        """
        self.model.eval()

        total_loss = 0.0
        total_correct = 0
        n_batches = 0

        with tqdm(data_loader, unit="batch", desc="Validation") as p_loader:
            for batch in p_loader:
                with torch.no_grad():
                    loss, correct, accuracy = self._evaluate_one_step(data=batch)

                total_correct += correct
                loss = loss.item()
                accuracy_per = accuracy * 100

                postfix = {"loss": loss, "batch acc": f"{accuracy_per: .2f}%"}

                p_loader.set_postfix(postfix)

                total_loss += loss
                n_batches += 1

        if n_batches == 0:
            raise ValueError("data_loader yielded no batches; cannot evaluate.")

        avg_loss = total_loss / n_batches
        self.history["val_loss"].append(avg_loss)
        return avg_loss, total_correct

The `train()` function runs the training loop. Optionally, it also saves the model and the loss curve of the model.

In [None]:
def init_weights(layer: nn.Module) -> None:
    """Re-initialise the weights of conv and linear layers with Xavier-normal.

    Meant to be passed to ``nn.Module.apply``. Any other layer type, and the
    biases of matching layers, are left unchanged.
    """
    if not isinstance(layer, (nn.Conv2d, nn.Linear)):
        return

    nn.init.xavier_normal_(layer.weight)

def train(
    dataset: data.Dataset,
    params: dict[str, Any],
    *,
    val_split: float = 0.2,
    save_model: bool = False,
) -> np.float32:
    """Train a ``CatsDogsModel`` on a train/validation split of ``dataset``.

    Args:
        dataset: Dataset whose samples contain ``"image"`` and ``"label"``.
        params: Hyperparameters: the keys read by ``get_model_params`` plus
            ``batch_size``, ``lr`` and optionally ``early_stopping_rounds``.
        val_split: Fraction of the dataset held out for validation.
        save_model: When true, save the weights after every epoch that
            improves the validation loss, and save the loss curve at the end.

    Returns:
        The mean validation loss over all epochs that were run.

    Raises:
        ValueError: If the split leaves the training or validation set empty.
    """
    length = len(dataset)
    val_len = int(val_split * length)
    t_len = length - val_len

    if t_len <= 0 or val_len <= 0:
        raise ValueError(
            f"val_split={val_split} on {length} samples leaves an empty "
            "training or validation set."
        )

    # Fixed seed so every call (and every Optuna trial) sees the same split.
    generator = torch.Generator().manual_seed(42)
    # NOTE(review): both subsets share `dataset`'s transform, so any random
    # augmentation it contains is applied to validation samples as well.
    split = data.random_split(dataset, [t_len, val_len], generator=generator)

    batch_size = params["batch_size"]

    train_loader = data.DataLoader(
        split[0],
        shuffle=True,
        batch_size=batch_size,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
    )

    # Validation metrics do not depend on sample order, so do not shuffle.
    val_loader = data.DataLoader(
        split[1],
        shuffle=False,
        batch_size=batch_size,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
    )

    model = CatsDogsModel(n_targets=1, **get_model_params(params))
    model.apply(init_weights)
    model.to(DEVICE)

    optimizer = optim.Adam(model.parameters(), lr=params["lr"])
    scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer=optimizer, T_0=COSINE_ANNEALING_T0
    )

    engine = Engine(model=model, optimizer=optimizer, scheduler=scheduler)

    esr = params.get("early_stopping_rounds", EARLY_STOPPING_ROUNDS)
    best_loss = np.inf
    best_iter = 0
    stale_epochs = 0  # consecutive epochs without a new best validation loss
    losses = []

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}/{EPOCHS}")

        t_loss, t_correct = engine.train(data_loader=train_loader, epoch_num=epoch)
        val_loss, val_correct = engine.evaluate(data_loader=val_loader)

        losses.append(val_loss)

        t_acc = t_correct / t_len
        val_acc = val_correct / val_len

        print(f"Avg. train loss={t_loss:.4f}, Train acc={t_acc:.4f}")
        print(f"Avg. val loss={val_loss:.4f}, Val acc={val_acc:.4f}")

        # Early stopping: the patience counter restarts on every improvement,
        # and training stops after `esr` consecutive epochs without one.
        if val_loss < best_loss:
            best_loss, best_iter = val_loss, epoch
            stale_epochs = 0
            if save_model is True:
                engine.save_model(f"model-{epoch + 1}.pth")
        else:
            stale_epochs += 1
            if stale_epochs >= esr:
                break

    if save_model is True:
        engine.save_loss_curve("loss_curve.png", best_iter)

    return np.mean(losses, dtype=np.float32)

# Optuna Objective

Optuna requires an objective function which returns the metric to be optimized.

In [None]:
def objective(trial: optuna.trial.Trial, dataset: data.Dataset) -> np.float32:
    """Optuna objective: sample hyperparameters and train once with them.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        dataset: Dataset forwarded to ``train``.

    Returns:
        The value returned by ``train`` for the sampled parameters.
    """
    params = {
        "conv1_out": trial.suggest_int("conv1_out", 16, 64, step=8),
        "conv2_out": trial.suggest_int("conv2_out", 16, 64, step=8),
        "conv3_out": trial.suggest_int("conv3_out", 16, 64, step=8),
        "conv4_out": trial.suggest_int("conv4_out", 16, 64, step=8),
        "linear1_out": trial.suggest_int("linear1_out", 16, 64, step=8),
        "linear2_out": trial.suggest_int("linear2_out", 16, 64, step=8),
        "linear3_out": trial.suggest_int("linear3_out", 16, 64, step=8),
        "kernel_size1": trial.suggest_categorical("kernel_size1", [3, 5, 7]),
        "kernel_size2": trial.suggest_categorical("kernel_size2", [3, 5, 7]),
        # Learning rate is sampled on a log scale.
        "lr": trial.suggest_float("lr", 1e-4, 1e-1, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
        # suggest_float replaces the deprecated suggest_uniform and samples
        # the same uniform range.
        "dropout": trial.suggest_float("dropout", 0.1, 0.7),
        "early_stopping_rounds": trial.suggest_categorical(
            "early_stopping_rounds", [5, 10, 15, 20]
        ),
    }

    return train(dataset, params)

# Training

# Convert Directories To CSV

In [None]:
# Index the labelled training images and the unlabelled test images as CSVs.
dir_to_csv("train", "train_data.csv")
dir_to_csv("test1", "test_data.csv", has_labels=False)
# Sort the test rows by numeric file ID; inference numbers its rows 1..N in
# CSV order.
order_test_data("test_data.csv")

# Seed

In [None]:
# Seed PyTorch's random number generator.
torch.manual_seed(42)
# Let cuDNN benchmark and pick its convolution algorithms.
# NOTE(review): benchmark mode may pick different algorithms between runs,
# which can reduce reproducibility despite the seed — confirm this is intended.
torch.backends.cudnn.benchmark = True

# Read and Create Dataset

In [None]:
# Build the labelled dataset from the generated CSV, with the augmentation
# pipeline attached.
dataset_path = os.path.join(DATA_DIR, "train_data.csv")
transform = get_transforms()
dataset = CatsDogsDataset(csv=dataset_path, transform=transform)

# Tune Model Using Optuna

In [None]:
# Bind the dataset so Optuna can call the objective with only a trial.
objective_ = functools.partial(objective, dataset=dataset)

# NOTE(review): `objective` never calls `trial.report()` or
# `trial.should_prune()`, so this pruner presumably never stops a trial early
# — confirm.
pruner = optuna.pruners.SuccessiveHalvingPruner()
# Seeded TPE sampler.
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)
# Minimise the value returned by the objective.
study = optuna.create_study(
    direction="minimize",
    pruner=pruner,
    sampler=sampler
)
# Run 20 trials, with `gc_after_trial` enabled.
study.optimize(objective_, n_trials=20, gc_after_trial=True)

# Train Final Model Using Best Parameters

In [None]:
# Show the hyperparameters of the best trial found by the study.
print(study.best_trial.params)

In [None]:
# Retrain with the best hyperparameters; `save_model=True` also writes the
# weights and the loss curve to MODEL_DIR.
best_trial = study.best_trial
val_loss = train(dataset=dataset, params=best_trial.params, save_model=True)
print(f"Avg. validation loss:{val_loss:.4f}")

## Loss curve

In [None]:
# Load the loss-curve image saved by the final training run and show it inline.
fig, ax = plt.subplots(figsize=(10, 10))

path = os.path.join(MODEL_DIR, "loss_curve.png")
im = plt.imread(path)
ax.imshow(im)
# Hide the outer axes; the saved image already contains the plot's own axes.
ax.set_axis_off()
plt.show()

# Prediction

In [None]:
def make_inference(csv_file: str, params: dict[str, Any]) -> None:
    """Predict labels for the test set with every saved model.

    For each ``*.pth`` file in ``MODEL_DIR`` a ``<name>_submission.csv`` with
    the columns ``id`` (1-based row number of the test CSV) and ``label``
    (0 or 1) is written to ``INFERENCE_DIR``.

    Args:
        csv_file: Name of the test CSV inside ``DATA_DIR``.
        params: Hyperparameters the saved models were trained with; must
            include ``batch_size`` and the keys read by ``get_model_params``.
    """
    # Inference uses normalization only, without random augmentation.
    transform = A.Compose([A.Normalize(always_apply=True)])

    test_data_path = os.path.join(DATA_DIR, csv_file)

    test_data = CatsDogsDataset(
        csv=test_data_path,
        transform=transform,
        labels=False,
    )

    # The loader and the architecture arguments are the same for every
    # checkpoint, so build them once.
    data_loader = data.DataLoader(
        test_data,
        batch_size=params["batch_size"],
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
    )
    model_params = get_model_params(params)

    models_path = os.path.join(MODEL_DIR, "*.pth")

    for model_path in glob.glob(models_path):
        model = CatsDogsModel(**model_params, n_targets=1)

        # map_location lets checkpoints saved on a GPU load on CPU-only hosts.
        state_dict = torch.load(model_path, map_location=DEVICE)
        model.load_state_dict(state_dict)
        model.to(DEVICE)
        model.eval()

        labels = []

        with torch.no_grad():
            for batch in data_loader:
                preds = model(image=batch["image"].to(DEVICE))
                # Threshold the predicted probability at 0.5.
                labels.extend((preds > 0.5).int().tolist())

        basename = os.path.basename(model_path)
        filename, _ = os.path.splitext(basename)

        df = pd.DataFrame({"id": range(1, len(test_data) + 1), "label": labels})
        path = os.path.join(INFERENCE_DIR, f"{filename}_submission.csv")
        df.to_csv(path, index=False)

In [None]:
# Write one submission CSV per saved checkpoint, using the best trial's
# hyperparameters to rebuild the architecture.
make_inference("test_data.csv", best_trial.params)