**Multi-GPU working**  
https://www.kaggle.com/code/rasmus01610/notebook-multi-gpu-training-with-pytorch-lightning/notebook

In [1]:
#!pip install monai
#!pip install scikit-learn

In [1]:
import os
import PIL
import torch
import numpy as np

from monai.data import DataLoader, ArrayDataset
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR,  CyclicLR
from torchvision import models
from monai.transforms import (
    EnsureChannelFirst,
    AsDiscrete,
    Compose,
    LoadImage,
    ScaleIntensity,
)
import glob
import os

from torch.utils.data import random_split
import pytorch_lightning as pl
from torch import optim
from sklearn.metrics import accuracy_score

In [2]:
#!wget https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz
#!tar -zxf MedNIST.tar.gz

In [3]:
path = "./PCBData"
images = glob.glob(path + '/*/*.jpg')
labels = [image.split('/')[-2] for image in images]
unique_labels = list(set(labels))
labels = [unique_labels.index(i) for i in labels]

In [4]:
num_classes = len(unique_labels)

train_transforms = Compose(
    [
        LoadImage(image_only=True),
        EnsureChannelFirst(),
        ScaleIntensity(),
    ]
)

y_trans = Compose([AsDiscrete(to_onehot=num_classes)])

In [5]:
ds = ArrayDataset(images, labels=labels, img_transform=train_transforms, label_transform=y_trans)
n_data = len(ds)
n_train = int(n_data*0.9)
train_ds, val_ds = random_split(ds, [n_train, n_data-n_train], generator=torch.Generator().manual_seed(42))

batch_size = 64
n_gpu = 4

In [6]:
len(ds)

12758

In [7]:
#train_dl = DataLoader(train_ds, shuffle=True, pin_memory=True, num_workers=2, batch_size=256)
#val_dl = DataLoader(val_ds, shuffle=False, pin_memory=True, num_workers=2, batch_size=256)

In [8]:
class PCBModel(pl.LightningModule):
    def __init__(self, net, lr, loss):
        super().__init__()
        self.net = net
        self.lr = lr
        self.loss = loss 
        
    def forward(self, x):
        return torch.nn.functional.softmax(self.net(x), dim=1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss(y_hat, y.float())
        self.log("train_loss", loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss(y_hat, y.float())
        val_acc = (y.argmax(dim=1) == y_hat.argmax(dim=1)).float().sum() / y.shape[0]
        self.log("val_acc", val_acc, prog_bar=True, sync_dist=True)
        self.log("val_loss", loss, prog_bar=True, sync_dist=True)
        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        #scheduler = CosineAnnealingLR(optimizer, self.trainer.max_epochs * 200, 0)
        #scheduler = CyclicLR(optimizer, base_lr=self.lr/100, max_lr=self.lr, step_size_up = 100)
        scheduler = OneCycleLR(optimizer, max_lr=self.lr, epochs=self.trainer.max_epochs,
                               steps_per_epoch=n_train//batch_size//n_gpu)
        return [optimizer], [scheduler]

    def setup(self, stage=None):
        path = "./PCBData"
        images = glob.glob(path + '/*/*.jpg')
        labels = [image.split('/')[-2] for image in images]
        unique_labels = list(set(labels))
        labels = [unique_labels.index(i) for i in labels]

        num_classes = len(unique_labels)

        train_transforms = Compose(
            [
                LoadImage(image_only=True),
                EnsureChannelFirst(),
                ScaleIntensity(),
            ]
        )
        
        y_trans = Compose([AsDiscrete(to_onehot=num_classes)])
        
        ds = ArrayDataset(images, labels=labels, img_transform=train_transforms, label_transform=y_trans)
        n_data = len(ds)
        n_train = int(n_data*0.9)
        self.train_ds, self.val_ds = random_split(ds, [n_train, n_data-n_train], generator=torch.Generator().manual_seed(42))

    def train_dataloader(self):
        return DataLoader(self.train_ds, shuffle=True, pin_memory=True, num_workers=2, batch_size=batch_size)
        
    def val_dataloader(self):
        return DataLoader(self.val_ds, shuffle=False, pin_memory=True, num_workers=2, batch_size=batch_size)
        

In [9]:
resnet_pretrained = models.resnet152(pretrained=False)

/home/kotech/venv-lightning/lib/python3.8/site-packages/torchvision/models/_utils.py:208: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.
/home/kotech/venv-lightning/lib/python3.8/site-packages/torchvision/models/_utils.py:223: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=None`.


In [10]:
resnet_pretrained

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [11]:
num_ftrs = resnet_pretrained.fc.in_features
resnet_pretrained.fc = torch.nn.Linear(num_ftrs, num_classes)

In [12]:
#net = DenseNet121(spatial_dims=2, in_channels=3, out_channels=num_classes)
net = resnet_pretrained
lr = 1e-3
loss = torch.nn.CrossEntropyLoss()
model = PCBModel(net, lr, loss)

In [13]:
trainer = pl.Trainer(accelerator="gpu",devices=n_gpu,strategy="ddp_notebook", max_epochs=20, log_every_n_steps=40)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/kotech/venv-lightning/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [14]:
#trainer.fit(model, train_dl, val_dl)
trainer.fit(model)

Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 4 processes
----------------------------------------------------------------------------------------------------

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEV

Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

/home/kotech/venv-lightning/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
