# Image classification on the CIFAR10 dataset 

## Importing modules and setup

In [1]:
# importing the standard library modules
import os

# importing the pytorch modules
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

# importing the pytorch lightning modules
from pl_bolts.datamodules import CIFAR10DataModule
from pl_bolts.transforms.dataset_normalizations import cifar10_normalization
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import LearningRateMonitor, EarlyStopping, TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger

# importing the modules for the optimizer
from torch.optim.lr_scheduler import OneCycleLR
from torch.optim.swa_utils import AveragedModel, update_bn
from torchmetrics.functional import accuracy

# importing the model summary helper
from torchsummary import summary

# seeding everything to ensure reproducible results
seed_everything(1)

# Set up dataset and GPU, batch size and number of workers (for data loading, based on # cpu cores)

# Dataset location: read from the PATH_DATASETS environment variable,
# falling back to the current working directory.
PATH_DATASETS = os.environ.get("PATH_DATASETS", ".")
# `Path` holds the same value; it is kept only as a backward-compatible alias.
Path = PATH_DATASETS

# Use at most one GPU, and a larger batch when a GPU is present.
AVAIL_GPUS = min(1, torch.cuda.device_count())
BATCH_SIZE = 256 if AVAIL_GPUS else 64
# os.cpu_count() returns None when the core count cannot be determined;
# treat that case as a single core instead of failing on `None / 2`.
NUM_WORKERS = (os.cpu_count() or 1) // 2

print("Available GPUS:", AVAIL_GPUS)
print("Batch size:", BATCH_SIZE)
print("Num Workers:", NUM_WORKERS)

# set gpu as device, if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Global seed set to 1


Available GPUS: 1
Batch size: 256
Num Workers: 6


## Cifar10 Dataset

The CIFAR10 dataset consists of 60000 32x32 images.

In order to increase the variety of samples in the training dataset, I set up some transformations.

Transformations for the training dataset:
 - pad the image by 4 pixels on each side and crop a random 32x32 part of it
 - randomly flip the image horizontally
 - convert data to tensor
 - normalize data

In [2]:
# Augmentation pipeline for the training split; the steps run in list order.
train_transforms = torchvision.transforms.Compose([
    # pad 4 pixels on every side, then take a random 32x32 crop
    torchvision.transforms.RandomCrop(32, padding=4),
    # random left-right mirroring
    torchvision.transforms.RandomHorizontalFlip(),
    # convert the image to a tensor
    torchvision.transforms.ToTensor(),
    # CIFAR10 normalization transform provided by pl_bolts
    cifar10_normalization(),
])

Transformations for the test and validation dataset:
 - convert data to tensor
 - normalize data in the same way as the training data

In [3]:
# Deterministic pipeline for validation and test data: no augmentation,
# only tensor conversion followed by the CIFAR10 normalization.
test_transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    cifar10_normalization(),
])

Loading the dataset:

In [4]:
# Build the CIFAR10 data module from the settings defined above.
cifar10_data_module = CIFAR10DataModule(
    # augmented pipeline for training, plain pipeline for validation and test
    train_transforms=train_transforms,
    val_transforms=test_transforms,
    test_transforms=test_transforms,
    # storage location and loader settings
    data_dir=PATH_DATASETS,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

  rank_zero_deprecation(
  rank_zero_deprecation(
  rank_zero_deprecation(


## Network Architecture

The network consists of four composite convolutional layers followed by three linear layers.
The composite layers are constructed as:
1. Convolutional layer
1. MaxPool layer
1. ReLU layer
1. BatchNorm layer

In [5]:
def _conv_stage(in_channels, out_channels):
    """Return the modules of one composite layer: Conv2d -> MaxPool2d -> ReLU -> BatchNorm2d."""
    return [
        nn.Conv2d(in_channels, out_channels, kernel_size=2),
        nn.MaxPool2d(2, 2),
        nn.ReLU(),
        nn.BatchNorm2d(out_channels),
    ]


# (input channels, output channels) of the four composite layers, in order
_stage_channels = [(3, 32), (32, 64), (64, 128), (128, 256)]

network = nn.Sequential(
    # Composite layers 1-4, flattened so that every module keeps its own
    # index in the Sequential container
    *[layer for c_in, c_out in _stage_channels for layer in _conv_stage(c_in, c_out)],

    # Linear layers
    nn.Flatten(),
    nn.Linear(256 * 1 * 1, 120),
    nn.ReLU(),
    nn.Linear(120, 84),
    nn.ReLU(),
    nn.Linear(84, 10),
)

# move the network to the selected device (GPU if available, otherwise CPU);
# as the last expression of the cell, the returned module is displayed as output
network.to(device)

Sequential(
  (0): Conv2d(3, 32, kernel_size=(2, 2), stride=(1, 1))
  (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (2): ReLU()
  (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (4): Conv2d(32, 64, kernel_size=(2, 2), stride=(1, 1))
  (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (6): ReLU()
  (7): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Conv2d(64, 128, kernel_size=(2, 2), stride=(1, 1))
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): ReLU()
  (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (12): Conv2d(128, 256, kernel_size=(2, 2), stride=(1, 1))
  (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (14): ReLU()
  (15): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (16): Flatten(st

Check the input and output dimensions of the layers.

In [6]:
# Print per-layer output shapes and parameter counts for a 3x32x32 input.
# `summary` is imported together with the other modules in the first cell.
summary(network, (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 31, 31]             416
         MaxPool2d-2           [-1, 32, 15, 15]               0
              ReLU-3           [-1, 32, 15, 15]               0
       BatchNorm2d-4           [-1, 32, 15, 15]              64
            Conv2d-5           [-1, 64, 14, 14]           8,256
         MaxPool2d-6             [-1, 64, 7, 7]               0
              ReLU-7             [-1, 64, 7, 7]               0
       BatchNorm2d-8             [-1, 64, 7, 7]             128
            Conv2d-9            [-1, 128, 6, 6]          32,896
        MaxPool2d-10            [-1, 128, 3, 3]               0
             ReLU-11            [-1, 128, 3, 3]               0
      BatchNorm2d-12            [-1, 128, 3, 3]             256
           Conv2d-13            [-1, 256, 2, 2]         131,328
        MaxPool2d-14            [-1, 25

## Defining the model

Next, I define the loss function, the accuracy function, and the optimizer.

As loss function I chose the negative log-likelihood loss function. It is useful in classification tasks.  
I use the accuracy function from torchmetrics.  
I use a stochastic gradient descent optimizer with a learning rate of 0.05.  
The learning rate is adapted during training with a one-cycle learning rate scheduler.

In [7]:
class NeuralNetwork(LightningModule):
    """LightningModule that trains a classification network with NLL loss.

    Args:
        model: module mapping an input batch to per-class scores; `forward`
            applies a log-softmax over dimension 1 of its output.
        lr: learning rate handed to the SGD optimizer.
    """

    def __init__(self, model, lr=0.05):
        super().__init__()

        self.save_hyperparameters()
        self.model = model

    def forward(self, x):
        """Return the log-probabilities (log-softmax over dim 1) for batch `x`."""
        out = self.model(x)
        return F.log_softmax(out, dim=1)

    def training_step(self, batch, batch_idx):
        """Compute, log (as `train_loss`) and return the NLL loss of one batch."""
        x, y = batch
        log_probs = self(x)
        loss = F.nll_loss(log_probs, y)
        self.log("train_loss", loss)
        return loss

    def evaluate(self, batch, stage=None):
        """Compute loss and accuracy of one batch.

        When `stage` is truthy the values are logged as `{stage}_loss` and
        `{stage}_acc`; nothing is returned.
        """
        x, y = batch
        log_probs = self(x)

        loss = F.nll_loss(log_probs, y)
        preds = torch.argmax(log_probs, dim=1)
        acc = accuracy(preds, y)

        if stage:
            self.log(f"{stage}_loss", loss, prog_bar=True)
            self.log(f"{stage}_acc", acc, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch; metrics are logged with the `val` prefix."""
        self.evaluate(batch, "val")

    def test_step(self, batch, batch_idx):
        """Evaluate one test batch; metrics are logged with the `test` prefix."""
        self.evaluate(batch, "test")

    def configure_optimizers(self):
        """Return the SGD optimizer together with a per-step OneCycleLR schedule."""
        optimizer = torch.optim.SGD(
            self.parameters(),
            lr=self.hparams.lr,
        )

        # NOTE(review): 45000 is presumably the size of the training split
        # produced by the data module - confirm if the split is changed.
        num_train_samples = 45000
        # Round up: a trailing, partially filled batch is an optimizer step
        # too (unless the loader drops it). Rounding down makes the schedule
        # too short, and OneCycleLR raises a ValueError once it is stepped
        # more often than its total number of steps.
        steps_per_epoch = (num_train_samples + BATCH_SIZE - 1) // BATCH_SIZE

        # NOTE: OneCycleLR derives the learning rate of every step from its
        # max_lr argument (0.1 here), so the schedule overrides the `lr`
        # given to the optimizer above.
        scheduler_dict = {
            "scheduler": OneCycleLR(
                optimizer,
                0.1,
                epochs=self.trainer.max_epochs,
                steps_per_epoch=steps_per_epoch,
            ),
            # advance the schedule after every optimizer step, not every epoch
            "interval": "step",
        }
        return {"optimizer": optimizer, "lr_scheduler": scheduler_dict}

Initializing the model and setting a data module

In [8]:
# wrap the network in the LightningModule, naming both arguments explicitly
model = NeuralNetwork(model=network, lr=0.05)
# attach the CIFAR10 data module to the model as an attribute
model.datamodule = cifar10_data_module

## Define the trainer

I use the Trainer class from PyTorch Lightning with a progress bar, a TensorBoard logger, and an early stopping mechanism that monitors the validation loss.

In [9]:
# Trainer configuration: up to 100 epochs on the available GPU (if any),
# logging to TensorBoard under lightning_logs/.
trainer = Trainer(
    max_epochs=100,
    gpus=AVAIL_GPUS,
    logger=TensorBoardLogger("lightning_logs/"),
    callbacks=[
        # refresh the progress bar every 10 batches; this callback replaces
        # the deprecated `progress_bar_refresh_rate` Trainer argument
        TQDMProgressBar(refresh_rate=10),
        # record the learning rate after every step
        LearningRateMonitor(logging_interval="step"),
        # stop once `val_loss` has not improved for 3 validation runs
        EarlyStopping(monitor="val_loss", patience=3),
    ],
)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [10]:
os.environ['TENSORBOARD_BINARY'] = '/home/paul/anaconda3/bin/tensorboard'
# Start tensorboard.
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/

## Train the network

In [11]:
# run the training loop; the data module is passed explicitly by keyword
trainer.fit(model, datamodule=cifar10_data_module)

Files already downloaded and verified
Files already downloaded and verified


  rank_zero_deprecation(
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
2022-02-10 13:33:15.499367: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-10 13:33:15.499385: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 215 K 
-------------------------------------
215 K     Trainable params
0         Non-trainable params
215 K     Total params
0.863     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 1


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

## Test the network

In [12]:
# Evaluation on the test split is currently disabled; uncomment the call
# below to run it after training has finished.
#trainer.test(model, datamodule=cifar10_data_module)