# PyTorch Lightning Sample Code

In [1]:
# Jupyter Notebook setup:

# change to directory of this file
import os
os.chdir('/workspace')
path = os.getcwd()
print(path)

/workspace


## Hyperparameters

Define hyperparameters with a command-line `ArgumentParser` and, as a best practice, group them by purpose:
- Trainer args (accelerator, devices, num_nodes, etc…)
- Model specific arguments (layer_dim, num_layers, learning_rate, etc…)
- System arguments (data_path, cluster_email, etc…)

In [10]:
import argparse
from pathlib import Path


# Command-line style hyperparameters, grouped by purpose.
parser = argparse.ArgumentParser()

# --- Training ---
parser.add_argument('--batch_size', type=int, default=32,
                    help='samples per batch for every dataloader')
parser.add_argument('--max_epochs', type=int, default=5,
                    help='number of training epochs')

# --- Model ---
# Use a real float default instead of the string '1e-3'.
parser.add_argument('--learning_rate', type=float, default=1e-3,
                    help='optimizer learning rate')

# --- System ---
# `type=list` would split the raw string into characters (e.g. "01" -> ['0', '1'])
# and cannot accept several ids; take one or more integers instead: --gpus 0 1
parser.add_argument('--gpus', type=int, nargs='+', default=[0],
                    help='GPU ids to train on, e.g. --gpus 0 1')
parser.add_argument('--num_workers', type=int, default=32,
                    help='worker processes per dataloader')
parser.add_argument('--seed', type=int, default=42,
                    help='random seed')
parser.add_argument('--data_dir', type=Path, default=Path('./data'),
                    help='directory where the dataset is stored')

# Pass args=[] inside Jupyter so argparse ignores the kernel's own argv.
args = parser.parse_args(args=[])

In [8]:
import torch, torch.nn as nn
import lightning as L

class NeuralNetwork(L.LightningModule):
    """Three-layer fully connected classifier for 28x28 inputs with 10 output classes.

    Args:
        learning_rate: Learning rate for the SGD optimizer. When ``None``
            (the default) the value is read from the notebook-level
            ``args.learning_rate`` at optimizer-configuration time.
    """

    def __init__(self, learning_rate=None):
        super().__init__()
        self.learning_rate = learning_rate
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        """Flatten the input and return the raw class logits."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

    def _compute_loss(self, batch):
        """Return the cross-entropy loss for one ``(inputs, targets)`` batch."""
        x, y = batch
        logits = self(x)
        return nn.functional.cross_entropy(logits, y)

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the training loss."""
        loss = self._compute_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as ``val_loss``) and return the validation loss."""
        # Do not route through training_step: that would also record the
        # validation batches under 'train_loss'.
        loss = self._compute_loss(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return a plain SGD optimizer over all model parameters."""
        lr = self.learning_rate if self.learning_rate is not None else args.learning_rate
        return torch.optim.SGD(self.parameters(), lr=lr)


In [4]:
from torchvision import datasets
from torchvision.transforms import ToTensor
# Fetch both FashionMNIST splits (downloaded into ./data on first use);
# the training split is built first, then the test split.
training_data, test_data = (
    datasets.FashionMNIST(
        root="data",
        train=is_train,
        download=True,
        transform=ToTensor(),
    )
    for is_train in (True, False)
)

In [31]:
from torch.utils.data import random_split, DataLoader
from torch import Generator

class FashionMNISTDataModule(L.LightningDataModule):
    """LightningDataModule that serves FashionMNIST train/val/test dataloaders.

    Args:
        data_dir: Directory holding (or receiving) the dataset files; a
            ``str`` or path-like value. Defaults to ``args.data_dir``.

    Batch size, worker count and the split seed are read from the
    notebook-level ``args`` namespace.
    """

    def __init__(self, data_dir: str = args.data_dir):
        super().__init__()
        self.data_dir = data_dir

    # called only within a single process on CPU
    def prepare_data(self):
        """Download both dataset splits into ``self.data_dir`` if missing."""
        for is_train in (True, False):
            datasets.FashionMNIST(
                root=self.data_dir,
                train=is_train,
                download=True,
            )

    # run on each GPU
    def setup(self, stage: str):
        """Build the datasets needed for ``stage``.

        ``"fit"`` and ``"validate"`` create the train/validation split;
        ``"test"`` creates the test dataset.
        """
        # Assign train/val datasets for use in dataloaders
        if stage in ("fit", "validate"):
            dataset = datasets.FashionMNIST(
                root=self.data_dir,
                train=True,
                transform=ToTensor(),
            )
            num_val = 10000
            # Seed the split explicitly: setup() runs once per process (and
            # again for validate), and every run must produce the same
            # partition or validation samples leak into training.
            split_rng = Generator().manual_seed(args.seed)
            self.dataset_train, self.dataset_val = random_split(
                dataset,
                [len(dataset) - num_val, num_val],
                generator=split_rng,
            )

        # Assign test dataset for use in dataloader(s)
        if stage == "test":
            self.dataset_test = datasets.FashionMNIST(
                root=self.data_dir,
                train=False,
                transform=ToTensor(),
            )

    def train_dataloader(self):
        """Return the training loader; samples are reshuffled every epoch."""
        return DataLoader(
            self.dataset_train,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            shuffle=True,
        )

    def val_dataloader(self):
        """Return the validation loader (fixed order)."""
        return DataLoader(self.dataset_val, batch_size=args.batch_size, num_workers=args.num_workers)

    def test_dataloader(self):
        """Return the test loader (fixed order)."""
        return DataLoader(self.dataset_test, batch_size=args.batch_size, num_workers=args.num_workers)


In [29]:
# Open the already-downloaded training split (no transform) and show its class.
dataset = datasets.FashionMNIST(args.data_dir, train=True)
print(type(dataset))

<class 'torchvision.datasets.mnist.FashionMNIST'>


In [32]:
from lightning import Trainer
from torch.utils.data import DataLoader
# NOTE: this variable is not used by the training run below; the dataloaders
# take their batch size from args.batch_size.
batch_size = 64

# Seed Python, NumPy and PyTorch (and dataloader workers) before the model is
# built so weight initialisation and training are reproducible across runs.
L.seed_everything(args.seed, workers=True)

model = NeuralNetwork()
data = FashionMNISTDataModule()
# `devices` takes a list of GPU ids to train on
trainer = L.Trainer(max_epochs=args.max_epochs, accelerator='gpu', devices=args.gpus)
# start training
trainer.fit(model, datamodule=data)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name              | Type       | Params
-------------------------------------------------
0 | flatten           | Flatten    | 0     
1 | linear_relu_stack | Sequential | 669 K 
-------------------------------------------------
669 K     Trainable params
0         Non-trainable params
669 K     Total params
2.679     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/_utils/collate.py", line 175, in default_collate
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/_utils/collate.py", line 175, in <listcomp>
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/_utils/collate.py", line 183, in default_collate
    raise TypeError(default_collate_err_msg_format.format(elem_type))
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>


In [5]:
# Take a single sample from the held-out test set
x, y = next(iter(DataLoader(test_data, batch_size=1)))

# Inference only: switch to eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    # Move the input to wherever the model's weights currently live.
    pred = model(x.to(model.device))

# Human-readable label for each class index
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]
# Convert the tensors to plain ints before using them as list indices.
predicted_idx = pred[0].argmax(0).item()
actual_idx = y.item()
predicted, actual = classes[predicted_idx], classes[actual_idx]
print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"
