In [3]:
import os
import torch
import torch.nn.functional as F
from torch import nn
from torchvision import transforms
import torchmetrics
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from PIL import Image

In [4]:
class Dataset():
    def __init__(self, data_list, data_dir , transform = None):
        self.data_dir = data_dir
        self.data_list = data_list
        self.transform = transform
    def __len__(self):
        return int(len(self.data_list))

    def __getitem__(self, index):
        img_path = os.path.join(self.data_dir, self.data_list[index])
        img = Image.open(img_path)

        if "dog" in img_path:
            label = 1
        else:
            label = 0
        if self.transform is not None:
            img = self.transform(img)

        return (img, label)


train_dir = 'Data/train'
test_dir = 'Data/test1'

train_files = os.listdir(train_dir)
test_files = os.listdir(test_dir)

transformations = transforms.Compose([transforms.Resize((60,60)),transforms.ToTensor()])

train = Dataset(train_files, train_dir, transformations)
val = Dataset(test_files, test_dir, transformations)


train, val = torch.utils.data.random_split(train,[20000,5000]) 


In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
from torch.utils.data import DataLoader
import pytorch_lightning as pl

class LitModel(pl.LightningModule):
    def __init__(self, learning_rate, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.train_acc = torchmetrics.Accuracy(task="binary")
        self.valid_acc = torchmetrics.Accuracy(task="binary")

        # Define layers
        self.conv1 = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU(), nn.MaxPool2d(2, 2)) 
        self.conv2 = nn.Sequential(nn.Conv2d(16, 32, 3), nn.ReLU(), nn.MaxPool2d(2, 2)) 
        self.conv3 = nn.Sequential(nn.Conv2d(32, 64, 3), nn.ReLU(), nn.MaxPool2d(2, 2)) 
        self.fc1 = nn.Sequential(nn.Flatten(), nn.Linear(64 * 5 * 5, 256), nn.ReLU(), nn.Linear(256, 128), nn.ReLU())
        self.fc2 = nn.Linear(128, 1)  # Output a single logit for binary classification

    def train_dataloader(self):
        return DataLoader(dataset=train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(dataset=val, batch_size=self.batch_size, shuffle=False)

    def training_step(self, batch, batch_idx):
        data, label = batch
        output = self.forward(data).squeeze(1)
        loss = nn.BCEWithLogitsLoss()(output, label.float())
        acc = self.train_acc(torch.sigmoid(output), label.int())

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def on_train_epoch_end(self):
        self.log('train_acc_epoch', self.train_acc.compute())
        self.train_acc.reset()

    def validation_step(self, batch, batch_idx):
        val_data, val_label = batch
        val_output = self.forward(val_data).squeeze(1)
        val_loss = nn.BCEWithLogitsLoss()(val_output, val_label.float())
        acc = self.valid_acc(torch.sigmoid(val_output), val_label.int())

        self.log('val_loss', val_loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        return val_loss

    def on_validation_epoch_end(self):
        self.log('val_acc_epoch', self.valid_acc.compute())
        self.valid_acc.reset()

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x  # No softmax; use raw logits for BCEWithLogitsLoss


## **Automatic Batch Selection**

In [9]:
from pytorch_lightning.tuner import Tuner
model = LitModel(batch_size=32)
trainer = pl.Trainer(max_epochs=10)
tuner = Tuner(trainer)
tuner.scale_batch_size(model, mode='binsearch')


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: x:\Programing\Training\computerVision\ComputerVisionCourses\pytorchLightening\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\saeid\anaconda3\envs\pytorch\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performanc

32768

In [27]:
# Initialize the model with the desired batch size and learning rate
model = LitModel(batch_size=32, learning_rate=0.001)

# Set up the Trainer with GPU support, progress bar, and a specific number of epochs
trainer = pl.Trainer(
    accelerator="gpu",  # Use "cpu" if you don't have a GPU available
    devices=1,          # Number of GPUs to use
    max_epochs=10,      # Adjust the number of epochs as needed
    enable_progress_bar=True
)

# Start training
trainer.fit(model)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | train_acc | BinaryAccuracy | 0     
1 | valid_acc | BinaryAccuracy | 0     
2 | conv1     | Sequential     | 448   
3 | conv2     | Sequential     | 4.6 K 
4 | conv3     | Sequential     | 18.5 K
5 | fc1       | Sequential     | 442 K 
6 | fc2       | Linear         | 129   
---------------------------------------------
466 K     Trainable params
0         Non-trainable params
466 K     Total params
1.866     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\saeid\anaconda3\envs\pytorch\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


## **6. Using Callbacks - Early Stopping & Checkpointing**

**Early Stopping** -  early stopping is a form of regularization used to avoid overfitting when training a learner with an iterative method, such as gradient descent.

![](https://cdn-images-1.medium.com/max/920/1*iAK5uMoOlX1gZu-cSh1nZw.png)

**Model Checkpoint** - ModelCheckpoint callback is used to save a model or weights (in a checkpoint file) at some interval, so the model or weights can be loaded later to continue the training from the state saved.

In [28]:
# Setup Early Stopping
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

early_stop = EarlyStopping(
    monitor = 'val_loss',
    patience = 3,
    strict = False,
    verbose = False,
    mode = 'min'
)

In [30]:
# Setup Model Checkpoint
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='models/',
    filename='sample-catsvsdogs-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,# We save the top 3 models
    mode='min',
)

In [32]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

# Initialize EarlyStopping and ModelCheckpoint callbacks
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=3)
checkpoint_callback = ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min")

# Initialize model
model = LitModel(batch_size=32, learning_rate=0.001)

# Initialize a trainer
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=10,
    enable_progress_bar=True,
    callbacks=[early_stopping, checkpoint_callback]
)

# Start training
trainer.fit(model)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | train_acc | BinaryAccuracy | 0     
1 | valid_acc | BinaryAccuracy | 0     
2 | conv1     | Sequential     | 448   
3 | conv2     | Sequential     | 4.6 K 
4 | conv3     | Sequential     | 18.5 K
5 | fc1       | Sequential     | 442 K 
6 | fc2       | Linear         | 129   
---------------------------------------------
466 K     Trainable params
0         Non-trainable params
466 K     Total params
1.866     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\saeid\anaconda3\envs\pytorch\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
c:\Users\saeid\anaconda3\envs\pytorch\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [33]:
# Start tensorboard.
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

## **Restore from Checkpoints**

In [34]:
# Get path of best model
checkpoint_callback.best_model_path

'x:\\Programing\\Training\\computerVision\\ComputerVisionCourses\\pytorchLightening\\lightning_logs\\version_4\\checkpoints\\epoch=5-step=3750.ckpt'

### **Load and run inference using the best checkpoint model**

In [35]:
#loading the best checkpoints to model
pretrained_model = LitModel.load_from_checkpoint(batch_size = 32, learning_rate=0.001, checkpoint_path = checkpoint_callback.best_model_path)
pretrained_model = pretrained_model.to("cuda")
pretrained_model.eval()
pretrained_model.freeze()

## **Save our Model for Production Deployments**

**Exporting to TorchScript**

TorchScript allows you to serialize your models in a way that it can be loaded in non-Python environments. The LightningModule has a handy method to_torchscript() that returns a scripted module which you can save or directly use.

In [36]:
model = LitModel.load_from_checkpoint(batch_size = 32, learning_rate=0.001, checkpoint_path = checkpoint_callback.best_model_path)

script = model.to_torchscript()

# save for use in production environment
torch.jit.save(script, "model.pt")