<a href="https://colab.research.google.com/github/rahiakela/deep-learning-research-and-practice/blob/main/deep-learning-fundamentals/unit06-deep-learning-tips-and-tricks/03_debugging_deep_neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Debugging Deep Neural Networks

**Reference**

[Debugging Deep Neural Networks](https://lightning.ai/pages/courses/deep-learning-fundamentals/unit-6-overview-essential-deep-learning-tips-tricks/6.8-debugging-deep-neural-networks/)

## 1) Setup

In [None]:
!pip install torch torchvision torchaudio
!pip install lightning
!pip install torchmetrics
!pip install mlxtend==0.21.0

In [2]:
!lightning --version

lightning, version 2.0.2


In [None]:
!wget https://github.com/Lightning-AI/dl-fundamentals/raw/main/unit06-dl-tips/6.8-debugging/shared_utilities.py

In [15]:
import torch
import torchvision
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
import torch.nn.functional as F

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.utilities.model_summary import ModelSummary
import torchmetrics as tm

import os
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import StandardScaler

from shared_utilities import CustomDataModule

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [7]:
def show_metrics(trainer):
  metrics = pd.read_csv(f"{trainer.logger.log_dir}/metrics.csv")

  aggreg_metrics = []
  agg_col = "epoch"
  for i, dfg in metrics.groupby(agg_col):
      agg = dict(dfg.mean())
      agg[agg_col] = i
      aggreg_metrics.append(agg)

  df_metrics = pd.DataFrame(aggreg_metrics)
  df_metrics[["train_loss", "val_loss"]].plot(
      grid=True, legend=True, xlabel="Epoch", ylabel="Loss"
  )

  plt.savefig("suggest_loss.pdf")

  df_metrics[["train_acc", "val_acc"]].plot(
      grid=True, legend=True, xlabel="Epoch", ylabel="ACC"
  )

  plt.show()

In [8]:
!mkdir -p my_logs

##2) Fast Dev Runs

In [9]:
class PyTorchMLP(torch.nn.Module):
  def __init__(self, num_features, num_classes):
    super().__init__()

    self.all_layers = torch.nn.Sequential(
        # 1st hidden layer
        torch.nn.Linear(num_features, 100),
        torch.nn.BatchNorm1d(100),
        torch.nn.ReLU(),
        
        # 2nd hidden layer
        torch.nn.Linear(100, 50),
        torch.nn.BatchNorm1d(50),
        torch.nn.ReLU(),
        
        # output layer
        torch.nn.Linear(50, num_classes),
    )

  def forward(self, x):
    x = torch.flatten(x, start_dim=1)
    logits = self.all_layers(x)
    return logits

In [10]:
class LightningModel(L.LightningModule):
    def __init__(self, model, learning_rate):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        self.save_hyperparameters(ignore=["model"])

        self.train_acc = tm.Accuracy(task="multiclass", num_classes=2)
        self.val_acc = tm.Accuracy(task="multiclass", num_classes=2)
        self.test_acc = tm.Accuracy(task="multiclass", num_classes=2)

    def forward(self, x):
        return self.model(x)

    def _shared_step(self, batch):
        features, true_labels = batch
        logits = self(features)

        loss = F.cross_entropy(logits, true_labels)
        predicted_labels = torch.argmax(logits, dim=1)
        return loss, true_labels, predicted_labels

    def training_step(self, batch, batch_idx):
        loss, true_labels, predicted_labels = self._shared_step(batch)

        self.log("train_loss", loss)
        self.train_acc(predicted_labels, true_labels)
        self.log(
            "train_acc", self.train_acc, prog_bar=True, on_epoch=True, on_step=False
        )
        return loss

    def validation_step(self, batch, batch_idx):
        loss, true_labels, predicted_labels = self._shared_step(batch)

        self.log("val_loss", loss, prog_bar=True)
        self.val_acc(predicted_labels, true_labels)
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        loss, true_labels, predicted_labels = self._shared_step(batch)
        self.test_acc(predicted_labels, true_labels)
        self.log("test_acc", self.test_acc)

    def configure_optimizers(self):
        optimizers = torch.optim.SGD(self.parameters(), lr=self.learning_rate)

        return optimizers

In [None]:
%%capture --no-display

L.seed_everything(123)

dm = CustomDataModule()

pytorch_model = PyTorchMLP(num_features=100, num_classes=2)
lightning_model = LightningModel(model=pytorch_model, learning_rate=0.15)

trainer = L.Trainer(
  fast_dev_run=5,
  max_epochs=100,
  accelerator="auto", # set to "auto" or "gpu" to use GPUs if available
  devices="auto",      # Uses all available GPUs if applicable
  deterministic=True,
  #logger=CSVLogger(save_dir="logs_v1", name="my-model")
)

trainer.fit(
  model=lightning_model,
  datamodule=dm
)

In [12]:
show_metrics(trainer)

FileNotFoundError: ignored

In [13]:
trainer.test(model=lightning_model, dataloaders=dm.val_dataloader())

Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.625}]

##3) Model Summaries

In [14]:
class LightningModel(L.LightningModule):
    def __init__(self, model, learning_rate):
        super().__init__()

        self.example_input_array = torch.Tensor(5, 100) # example batch

        self.learning_rate = learning_rate
        self.model = model

        self.save_hyperparameters(ignore=["model"])

        self.train_acc = tm.Accuracy(task="multiclass", num_classes=2)
        self.val_acc = tm.Accuracy(task="multiclass", num_classes=2)
        self.test_acc = tm.Accuracy(task="multiclass", num_classes=2)

    def forward(self, x):
        return self.model(x)

    def _shared_step(self, batch):
        features, true_labels = batch
        logits = self(features)

        loss = F.cross_entropy(logits, true_labels)
        predicted_labels = torch.argmax(logits, dim=1)
        return loss, true_labels, predicted_labels

    def training_step(self, batch, batch_idx):
        loss, true_labels, predicted_labels = self._shared_step(batch)

        self.log("train_loss", loss)
        self.train_acc(predicted_labels, true_labels)
        self.log(
            "train_acc", self.train_acc, prog_bar=True, on_epoch=True, on_step=False
        )
        return loss

    def validation_step(self, batch, batch_idx):
        loss, true_labels, predicted_labels = self._shared_step(batch)

        self.log("val_loss", loss, prog_bar=True)
        self.val_acc(predicted_labels, true_labels)
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        loss, true_labels, predicted_labels = self._shared_step(batch)
        self.test_acc(predicted_labels, true_labels)
        self.log("test_acc", self.test_acc)

    def configure_optimizers(self):
        optimizers = torch.optim.SGD(self.parameters(), lr=self.learning_rate)
        return optimizers

In [None]:
L.seed_everything(123)

dm = CustomDataModule()

pytorch_model = PyTorchMLP(num_features=100, num_classes=2)
lightning_model = LightningModel(model=pytorch_model, learning_rate=0.15)

In [17]:
summary = ModelSummary(lightning_model, max_depth=-1)
print(summary)

   | Name               | Type               | Params | In sizes | Out sizes
----------------------------------------------------------------------------------
0  | model              | PyTorchMLP         | 15.6 K | [5, 100] | [5, 2]   
1  | model.all_layers   | Sequential         | 15.6 K | [5, 100] | [5, 2]   
2  | model.all_layers.0 | Linear             | 10.1 K | [5, 100] | [5, 100] 
3  | model.all_layers.1 | BatchNorm1d        | 200    | [5, 100] | [5, 100] 
4  | model.all_layers.2 | ReLU               | 0      | [5, 100] | [5, 100] 
5  | model.all_layers.3 | Linear             | 5.0 K  | [5, 100] | [5, 50]  
6  | model.all_layers.4 | BatchNorm1d        | 100    | [5, 50]  | [5, 50]  
7  | model.all_layers.5 | ReLU               | 0      | [5, 50]  | [5, 50]  
8  | model.all_layers.6 | Linear             | 102    | [5, 50]  | [5, 2]   
9  | train_acc          | MulticlassAccuracy | 0      | ?        | ?        
10 | val_acc            | MulticlassAccuracy | 0      | ?        | ?  

In [None]:
%%capture --no-display

trainer = L.Trainer(
  max_epochs=5,
  accelerator="cpu", # set to "auto" or "gpu" to use GPUs if available
  devices="auto",      # Uses all available GPUs if applicable
  deterministic=True
)

trainer.fit(
  model=lightning_model,
  datamodule=dm
)

In [19]:
trainer.test(model=lightning_model, dataloaders=dm.val_dataloader())

Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.8581249713897705}]

##4) Batch Overfitting

In [20]:
L.seed_everything(123)

dm = CustomDataModule()

pytorch_model = PyTorchMLP(num_features=100, num_classes=2)
lightning_model = LightningModel(model=pytorch_model, learning_rate=0.15)

INFO: Global seed set to 123
INFO:lightning.fabric.utilities.seed:Global seed set to 123


In [21]:
summary = ModelSummary(lightning_model, max_depth=-1)
print(summary)

   | Name               | Type               | Params | In sizes | Out sizes
----------------------------------------------------------------------------------
0  | model              | PyTorchMLP         | 15.6 K | [5, 100] | [5, 2]   
1  | model.all_layers   | Sequential         | 15.6 K | [5, 100] | [5, 2]   
2  | model.all_layers.0 | Linear             | 10.1 K | [5, 100] | [5, 100] 
3  | model.all_layers.1 | BatchNorm1d        | 200    | [5, 100] | [5, 100] 
4  | model.all_layers.2 | ReLU               | 0      | [5, 100] | [5, 100] 
5  | model.all_layers.3 | Linear             | 5.0 K  | [5, 100] | [5, 50]  
6  | model.all_layers.4 | BatchNorm1d        | 100    | [5, 50]  | [5, 50]  
7  | model.all_layers.5 | ReLU               | 0      | [5, 50]  | [5, 50]  
8  | model.all_layers.6 | Linear             | 102    | [5, 50]  | [5, 2]   
9  | train_acc          | MulticlassAccuracy | 0      | ?        | ?        
10 | val_acc            | MulticlassAccuracy | 0      | ?        | ?  

In [None]:
%%capture --no-display

trainer = L.Trainer(
  overfit_batches=1,
  max_epochs=10,
  accelerator="cpu", # set to "auto" or "gpu" to use GPUs if available
  devices="auto",      # Uses all available GPUs if applicable
  deterministic=True
)

trainer.fit(
  model=lightning_model,
  datamodule=dm
)

In [24]:
trainer.test(model=lightning_model, datamodule=dm)

Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.8245000243186951}]