In [2]:
import os
import sys
from google.colab import files

repo = 'https://github.com/pol-ucd/SegmenterProject.git'
project = os.path.basename(repo).split('.')[0]

# Check CUDA GPUs are enabled
!nvidia-smi

if not os.path.exists(project):
    !git clone https://github.com/pol-ucd/SegmenterProject.git
else:
    os.chdir(project)
    !git pull https://github.com/pol-ucd/SegmenterProject.git


ROOT_DIR = os.path.abspath("")

os.chdir(project)

# To find local version of the library
sys.path.append(os.path.join(ROOT_DIR, 'SegmenterProject'))


Thu Aug  7 11:53:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   67C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
try:
  import torchinfo
  import torchmetrics
  import segmentation_models_pytorch
except:
  !pip install torchinfo
  !pip install torchmetrics
  !pip install segmentation-models-pytorch

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Collecting torchmetrics
  Downloading torchmetrics-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-c

In [4]:
import torch
from torch import GradScaler

from nn.data import data_load
from nn.models import SegformerBinarySegmentation4
from nn.modules import CombinedLoss
from utils.torch_utils import TrainingManager, get_default_device




In [7]:
# Dataset locations (relative paths).
train_path, val_path = (
    'data/Polyp Segmentation/train',
    'data/Polyp Segmentation/valid',
)

# Run settings.
n_epochs, n_batch = 100, 16
test_split = 0.3  # not used by the data_load calls in this cell (they pass 0.0 / 1.0)

# Pick the device via the project helper (intended: best available GPU,
# falling back to CPU when there is none).
device = get_default_device()
print(f"Using {device} device for model training.")

# data_load can generate a train/test split if needed, but for now 100% of
# the training folder is used to train and 100% of the validation folder is
# used to validate. Each call returns a pair; only one side is kept.
train_loader, _ = data_load(
    train_path,
    test_split=0.0,  # Use 100% for training
    batch_size=n_batch,
    verbose=True,
)
_, val_loader = data_load(
    val_path,
    test_split=1.0,  # Use 100% for testing/validation
    batch_size=n_batch,
    verbose=True,
)

# Normalisers used when printing per-epoch metrics.
# NOTE(review): batches * batch size over-counts when the last batch is
# partial, and assumes the loaders really use n_batch -- TODO confirm against
# data_load and against what TrainingManager returns.
n_train = len(train_loader) * n_batch
n_val = len(val_loader) * n_batch

print(f"Training batches: {len(train_loader)}")
print(f"Test batches: {len(val_loader)}")

# Checkpoint identifier handed to the model wrapper as `pretrained_model`.
pretained_model = 'nvidia/segformer-b4-finetuned-ade-512-512'

# Single output class; the model is moved to the selected device right away.
model = SegformerBinarySegmentation4(
    pretrained_model=pretained_model,
    num_classes=1,
).to(device)

loss_fn = CombinedLoss()

# AdamW over all model parameters, with a cosine warm-restart schedule
# (first cycle length T_0=10, each following cycle twice as long).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,
    T_mult=2,
)

# Only use GradScaler if we have CUDA; otherwise the trainer receives None.
scaler = GradScaler() if torch.cuda.is_available() else None

# Bundle model, optimizer, loss, optional AMP scaler and both loaders into
# the project's training helper. Prediction dumping is switched off.
trainer = TrainingManager(model,
                          optimizer,
                          criterion=loss_fn,
                          scaler=scaler,
                          train_loader=train_loader,
                          eval_loader=val_loader,
                          save_preds=False,
                          save_preds_path=""
                          )

# Extra keyword arguments forwarded to trainer.train() / trainer.evaluate();
# empty means the helper's own defaults are used.
train_params = {}
eval_params = {}

# Best raw (un-normalised) validation dice seen so far; drives checkpointing.
best_dice_score = 0.0
for epoch in range(n_epochs):
    print(f"Epoch {epoch + 1}/{n_epochs}")

    # NOTE(review): the values returned by the trainer are divided by
    # n_train / n_val for display, which presumes they are totals over the
    # epoch -- confirm against TrainingManager.
    train_loss, train_dice = trainer.train(**train_params)
    print(f"Train Loss: {train_loss / n_train:.4f}, Train Dice: {train_dice / n_train:.4f}")

    val_loss, val_metrics = trainer.evaluate(**eval_params)
    print(
        f"Total evaluation Loss: {val_loss / n_val:.4f} | Dice: {val_metrics['dice'] / n_val:.4f} | IOU: {val_metrics['iou'] / n_val:.4f}")

    # Keep only the weights of the best-scoring epoch on the validation set.
    if val_metrics['dice'] > best_dice_score:
        best_dice_score = val_metrics['dice']
        torch.save(model.state_dict(), "best_segformer.pth")
        print(f"Model saved for dice score: {val_metrics['dice'] / n_val:.4f}")

    # Advance the learning-rate schedule once per epoch. The scheduler is not
    # handed to TrainingManager, so without this call the cosine warm-restart
    # schedule never runs and the learning rate stays at its initial value.
    scheduler.step()


Using cuda device for model training.
Found 630 training samples and 0 test samples
Found 0 training samples and 157 test samples
Training batches: 158
Test batches: 40


  original_init(self, **validated_kwargs)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/257M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/257M [00:00<?, ?B/s]

Epoch 1/100



  0%|          | 0/158 [00:00<?, ?it/s][A
  1%|          | 1/158 [00:02<06:06,  2.33s/it][A
  1%|▏         | 2/158 [00:02<03:08,  1.21s/it][A
  2%|▏         | 3/158 [00:03<02:11,  1.18it/s][A
  3%|▎         | 4/158 [00:03<01:44,  1.47it/s][A
  3%|▎         | 5/158 [00:04<01:29,  1.71it/s][A
  4%|▍         | 6/158 [00:04<01:20,  1.88it/s][A
  4%|▍         | 7/158 [00:04<01:14,  2.03it/s][A
  5%|▌         | 8/158 [00:05<01:10,  2.12it/s][A
  6%|▌         | 9/158 [00:05<01:07,  2.20it/s][A
  6%|▋         | 10/158 [00:06<01:05,  2.26it/s][A
  7%|▋         | 11/158 [00:06<01:04,  2.27it/s][A
  8%|▊         | 12/158 [00:06<01:03,  2.31it/s][A
  8%|▊         | 13/158 [00:07<01:02,  2.33it/s][A
  9%|▉         | 14/158 [00:07<01:01,  2.35it/s][A
  9%|▉         | 15/158 [00:08<01:00,  2.36it/s][A
 10%|█         | 16/158 [00:08<01:00,  2.33it/s][A
 11%|█         | 17/158 [00:09<01:00,  2.34it/s][A
 11%|█▏        | 18/158 [00:09<00:59,  2.35it/s][A
 12%|█▏        | 19/158 [00:0

Train Loss: 0.0655, Train Dice: 0.1302





Total evaluation Loss: 0.0643 | Dice: 0.1144 | IOU: 0.0801
Model saved for dice score: 0.1144
Epoch 2/100


100%|██████████| 158/158 [01:13<00:00,  2.16it/s]

Train Loss: 0.0434, Train Dice: 0.1360





Total evaluation Loss: 0.0382 | Dice: 0.1196 | IOU: 0.0856
Model saved for dice score: 0.1196
Epoch 3/100


100%|██████████| 158/158 [01:12<00:00,  2.17it/s]

Train Loss: 0.0328, Train Dice: 0.1354





Total evaluation Loss: 0.0445 | Dice: 0.1144 | IOU: 0.0799
Epoch 4/100


 56%|█████▋    | 89/158 [00:41<00:32,  2.14it/s]


KeyboardInterrupt: 