# PyTorch Lightning Sample Code

In [1]:
# Jupyter Notebook setup:

import os

# Switch the working directory to the notebook's location, then echo the
# directory that is now current.
os.chdir('/workspace')
path = os.getcwd()
print(path)

/workspace


## Hyperparameters

Define hyperparameters with a command-line `ArgumentParser` and, as a best practice, group them into:
- Trainer args (accelerator, devices, num_nodes, etc…)
- Model specific arguments (layer_dim, num_layers, learning_rate, etc…)
- System arguments (data_path, cluster_email, etc…)

In [2]:
import argparse
from pathlib import Path


# Command-line hyperparameters, grouped by purpose. The groups only affect
# the `--help` layout; every value still ends up on the same `args` namespace.
parser = argparse.ArgumentParser()

# Training-loop arguments
train_group = parser.add_argument_group('train')
train_group.add_argument('--batch_size', type=int, default=32)
train_group.add_argument('--max_epochs', type=int, default=5)

# Model-specific arguments
model_group = parser.add_argument_group('model')
model_group.add_argument('--learning_rate', type=float, default=1e-3)

# System arguments
system_group = parser.add_argument_group('system')
# One or more GPU ids, e.g. `--gpus 0 1 2`. `type=list` would have split a
# single string into its characters ("01" -> ['0', '1']) and rejected a
# second token, so parse each id as an int and collect them with nargs.
system_group.add_argument('--gpus', type=int, nargs='+', default=[0])
system_group.add_argument('--num_workers', type=int, default=32)
system_group.add_argument('--seed', type=int, default=42)
system_group.add_argument('--data_dir', type=Path, default=Path('./data'))

args = parser.parse_args(args=[])  # Set args=[] when running in Jupyter

## Data

Use [`LightningDataModule`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html) to collect all data related logic in one place:
- Dataset loading
- Train/val/test split
- Dataloader setup

In [3]:
from torch.utils.data import random_split, DataLoader
from torch import Generator
from torchvision import datasets
from torchvision.transforms import ToTensor
import lightning as L

class FashionMNISTDataModule(L.LightningDataModule):
    """Collects FashionMNIST download, train/val split and dataloaders.

    Args:
        data_dir: Root directory the dataset is downloaded to and read from
            (a string or a ``Path``).
        batch_size: Batch size used by every dataloader.
        num_workers: Number of worker processes used by every dataloader.
        seed: Seed of the generator that draws the train/validation split.

    All defaults come from the parsed ``args`` namespace, so calling
    ``FashionMNISTDataModule()`` behaves as before.
    """

    def __init__(
        self,
        data_dir: str = args.data_dir,
        batch_size: int = args.batch_size,
        num_workers: int = args.num_workers,
        seed: int = args.seed,
    ):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.seed = seed

    # called only within a single process on CPU
    def prepare_data(self):
        """Download the train and test partitions into ``self.data_dir``."""
        for is_train in (True, False):
            datasets.FashionMNIST(
                root=self.data_dir,
                train=is_train,
                download=True,
            )

    # run on each GPU
    def setup(self, stage: str):
        """Create the datasets needed for ``stage``.

        ``"fit"`` and ``"validate"`` build ``dataset_train``/``dataset_val``;
        ``"test"`` builds ``dataset_test``.
        """
        # Assign train/val datasets for use in dataloaders
        if stage in ("fit", "validate"):
            dataset = datasets.FashionMNIST(
                root=self.data_dir,
                train=True,
                transform=ToTensor(),
            )
            # A dedicated, seeded generator makes the 50000/10000 split
            # reproducible and identical every time setup() runs, instead of
            # depending on the state of the global RNG.
            split_generator = Generator().manual_seed(self.seed)
            self.dataset_train, self.dataset_val = random_split(
                dataset, [50000, 10000], generator=split_generator
            )

        # Assign test dataset for use in dataloader(s)
        if stage == "test":
            self.dataset_test = datasets.FashionMNIST(
                root=self.data_dir,
                train=False,
                transform=ToTensor(),
            )

    def _make_dataloader(self, dataset):
        """Wrap ``dataset`` in a DataLoader using the module's settings."""
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def train_dataloader(self):
        """Return the dataloader over the training split."""
        return self._make_dataloader(self.dataset_train)

    def val_dataloader(self):
        """Return the dataloader over the validation split."""
        return self._make_dataloader(self.dataset_val)

    def test_dataloader(self):
        """Return the dataloader over the test set."""
        return self._make_dataloader(self.dataset_test)

datamodule = FashionMNISTDataModule()

## Model

Use [`LightningModule`](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html) to organise all model related logic:
- Model definition (`__init__()`)
- Train Loop (`training_step()`)
- Validation Loop (`validation_step()`)
- Test Loop (`test_step()`)
- Configure optimizers (`configure_optimizers()`)

In [4]:
import torch.nn as nn
import torch.optim as optim
import lightning as L
import torchmetrics

class NeuralNetwork(L.LightningModule):
    """Fully connected classifier: 28*28 inputs -> 512 -> 512 -> 10 logits.

    The parsed ``args`` namespace is stored via ``save_hyperparameters`` and
    the optimizer reads its learning rate back from ``self.hparams``.
    """

    def __init__(self):
        super().__init__()
        # save hyperparameters
        self.save_hyperparameters(args)
        # One metric object per stage, so validation and test never share
        # accumulated state.
        self.val_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10)
        self.test_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10)
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        """Flatten the input and return the raw class logits."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

    def _shared_step(self, batch):
        """Run the forward pass on ``(x, y)`` and return ``(logits, y, loss)``."""
        x, y = batch
        logits = self(x)
        loss = nn.functional.cross_entropy(logits, y)
        return logits, y, loss

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        _, _, loss = self._shared_step(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log loss and accuracy for one validation batch."""
        logits, y, loss = self._shared_step(batch)
        self.log('val_loss', loss)
        # log step metric
        self.val_accuracy(logits, y)
        self.log("val_acc", self.val_accuracy, on_epoch=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Log loss and accuracy for one test batch."""
        logits, y, loss = self._shared_step(batch)
        self.log('test_loss', loss)
        self.test_accuracy(logits, y)
        self.log("test_acc", self.test_accuracy, on_epoch=True)
        return loss

    def configure_optimizers(self):
        """Return plain SGD using the learning rate saved in ``self.hparams``."""
        # Read from hparams rather than the global namespace so the optimizer
        # always matches the hyperparameters stored with this module.
        return optim.SGD(self.parameters(), lr=self.hparams.learning_rate)

# Seed the global RNGs before the weights are created; otherwise `--seed`
# is parsed but never influences the initialisation.
L.seed_everything(args.seed, workers=True)
model = NeuralNetwork()


## Train Neural Network

The [`Trainer`](https://lightning.ai/docs/pytorch/stable/common/trainer.html) automates training.

In [5]:
from lightning import Trainer


# `devices` takes the list of GPU ids to train on.
trainer = L.Trainer(
    accelerator='gpu',
    devices=args.gpus,
    max_epochs=args.max_epochs,
)
# Kick off the fit loop; the datamodule supplies the dataloaders.
trainer.fit(model, datamodule=datamodule)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/3
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/3
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/3
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 3 processes
----------------------------------------------------------------------------------------------------

You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEV

Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


## Test Neural Network

In [6]:
# Fresh trainer pinned to GPU 0 for the test run.
trainer = L.Trainer(
    accelerator='gpu',
    devices=[0],
    max_epochs=args.max_epochs,
)
trainer.test(model, datamodule=datamodule)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Testing: 0it [00:00, ?it/s]

[{'test_loss': 1.6185829639434814, 'test_acc': 0.5909000039100647}]

## Inference

In [7]:
import torch

# Human-readable label for each of the 10 class indices.
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

# Get the first sample of the test set. Indexing the dataset directly avoids
# starting a DataLoader (and its worker processes) just to read one item.
x, y = datamodule.dataset_test[0]

# Get the prediction: switch to eval mode and skip gradient tracking, since
# this is inference only. unsqueeze(0) adds the batch dimension.
model.eval()
with torch.no_grad():
    pred = model(x.unsqueeze(0))

predicted, actual = classes[pred[0].argmax(0).item()], classes[int(y)]
print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"
