In [1]:
# Environment bootstrap for the session (paths below are hard-coded to /kaggle/working).
# Clone the ADIS project repository into the current working directory.
# NOTE(review): not re-runnable as-is - a second `git clone` into the existing,
# non-empty SSD_MobileNetV3_ADIS directory errors out.
!git clone https://github.com/sathishkumar67/SSD_MobileNetV3_ADIS.git
# Move the repository's (non-hidden) contents up into /kaggle/working so that
# requirements.txt is found by the install step below; presumably this is also
# what makes the `ssdlite_mobnetv3_adis` package importable - confirm.
!mv /kaggle/working/SSD_MobileNetV3_ADIS/* /kaggle/working/
# Upgrade pip itself before installing the project requirements.
!pip install --upgrade pip
# Install the packages listed in requirements.txt; `--upgrade-strategy eager`
# also upgrades their dependencies, not only the listed packages.
!pip install  -r requirements.txt --upgrade --upgrade-strategy eager

Cloning into 'SSD_MobileNetV3_ADIS'...
remote: Enumerating objects: 294, done.[K
remote: Counting objects: 100% (62/62), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 294 (delta 32), reused 40 (delta 13), pack-reused 232 (from 2)[K
Receiving objects: 100% (294/294), 99.67 MiB | 41.17 MiB/s, done.
Resolving deltas: 100% (166/166), done.
Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting optuna==4.2.1 (from -r requirements.txt (line 2))
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting huggingface-hub==0.30.

In [2]:
# necessary imports
import os
import optuna
import joblib
from typing import Tuple
from tqdm import tqdm
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from huggingface_hub import hf_hub_download
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, RandomSampler
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR
from torchvision.ops import box_iou
from collections import defaultdict
from torchmetrics.detection import MeanAveragePrecision
from ssdlite_mobnetv3_adis.utils import unzip_file, replace_activation_function
from ssdlite_mobnetv3_adis.dataset import collate_fn, SSDLITEOBJDET_DATASET, CachedSSDLITEOBJDET_DATASET
from ssdlite_mobnetv3_adis.model import SSDLITE_MOBILENET_V3_Large
from ssdlite_mobnetv3_adis.epu import EPU


# Reproducibility: switch cuDNN to deterministic mode with autotuning off,
# then push the same seed into every random number generator used here.
RANDOM_SEED = 42
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(RANDOM_SEED)
del seed_rng  # keep the notebook namespace free of the loop variable

In [3]:
# --- where the dataset lives on the Hugging Face Hub ---
REPO_ID = "pt-sk/ADIS"
DATASET_NAME = "balanced_dataset"
REPO_TYPE = "dataset"
FILENAME_IN_REPO = f"{DATASET_NAME}.zip"

# --- local paths (everything is placed in the current working directory) ---
LOCAL_DIR = os.getcwd()
DATASET_PATH = f"{LOCAL_DIR}/{FILENAME_IN_REPO}"          # downloaded archive
DATASET_FOLDER_PATH = f"{LOCAL_DIR}/{DATASET_NAME}"       # extracted dataset root

# --- label set ---
CLASSES = ['Cat', 'Cattle', 'Chicken', 'Deer', 'Dog', 'Squirrel', 'Eagle', 'Goat', 'Rodents', 'Snake']
NUM_CLASSES = len(CLASSES)
NUM_CLASSES_WITH_BG = NUM_CLASSES + 1    # 1 for background class

# Download and unzip the dataset only when the extracted folder is missing.
# The archive is several GB and is removed after extraction (see the run log),
# so without this guard every re-run of the cell would download it again.
# NOTE: delete DATASET_FOLDER_PATH to force a fresh download, e.g. after an
# interrupted extraction left the folder incomplete.
if not os.path.isdir(DATASET_FOLDER_PATH):
    hf_hub_download(repo_id=REPO_ID, filename=FILENAME_IN_REPO, repo_type=REPO_TYPE, local_dir=LOCAL_DIR)
    unzip_file(DATASET_PATH, LOCAL_DIR)

balanced_dataset.zip:   0%|          | 0.00/7.04G [00:00<?, ?B/s]

Unzipping: 100%|██████████| 7.07G/7.07G [00:43<00:00, 161MB/s]


Unzipped /kaggle/working/balanced_dataset.zip to /kaggle/working
Removed zip file: /kaggle/working/balanced_dataset.zip


In [4]:
# shared loader settings
PIN_MEMORY_DEVICE = "cuda:0"
NUM_CORES = os.cpu_count()
BATCH_SIZE = 128


def _build_split(split_name):
    """Return the cached SSDLite detection dataset for the given split name."""
    return CachedSSDLITEOBJDET_DATASET(
        dataset_class=SSDLITEOBJDET_DATASET,
        root_dir=DATASET_FOLDER_PATH,
        split=split_name,
        num_classes=NUM_CLASSES_WITH_BG,
    )


def _build_sampler(dataset):
    """Return a RandomSampler driven by its own generator seeded with RANDOM_SEED."""
    seeded_generator = torch.Generator().manual_seed(RANDOM_SEED)
    return RandomSampler(dataset, generator=seeded_generator)


def _build_loader(dataset, sampler):
    """Wrap a dataset/sampler pair in a DataLoader using the shared loader settings."""
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        sampler=sampler,
        num_workers=NUM_CORES,
        collate_fn=collate_fn,
        pin_memory=True,
        persistent_workers=True,
        prefetch_factor=2,
        pin_memory_device=PIN_MEMORY_DEVICE,
    )


# datasets first, in train -> val -> test order
train_dataset = _build_split("train")
val_dataset = _build_split("val")
test_dataset = _build_split("test")

# one independently seeded sampler per split, for reproducibility
train_sampler = _build_sampler(train_dataset)
val_sampler = _build_sampler(val_dataset)
test_sampler = _build_sampler(test_dataset)

# finally the dataloaders
train_loader = _build_loader(train_dataset, train_sampler)
val_loader = _build_loader(val_dataset, val_sampler)
test_loader = _build_loader(test_dataset, test_sampler)

Preprocessing dataset and caching to /kaggle/working/balanced_dataset/train_cache...


 69%|██████▊   | 12445/18139 [02:52<01:18, 72.35it/s] 


KeyboardInterrupt: 

In [None]:
def bohb_tunner(
    args: dict,
    model: nn.Module,
    optimizer: optim.Optimizer,
    dataloaders: dict[str, torch.utils.data.DataLoader],
    callback
) -> float:
    """
    Train a detection model for one tuning trial and return its best validation loss.

    The learning rate follows a linear warmup followed by cosine annealing (stepped
    once per epoch). After every epoch the average validation loss is passed to
    ``callback`` and used for early stopping.

    Args:
        args (dict): Dictionary containing training parameters:
            - device (torch.device): Device to train on (e.g., 'cuda' or 'cpu').
            - warmup_epochs (int): Number of epochs for linear warmup.
            - num_epochs (int): Total number of epochs for training.
            - patience (int): Early stopping patience in epochs (val loss based).
            - initial_lr (float): Initial learning rate.
            - lr_factor (float): The cosine schedule decays towards
              ``initial_lr * lr_factor``.
            - start_factor (float): Start factor for linear warmup.
            - end_factor (float): End factor for linear warmup.
        model (nn.Module): The detection model. It is called as
            ``model(images, targets)`` and must return a dict of loss tensors.
        optimizer (optim.Optimizer): Optimizer instance.
        dataloaders (dict): Dict with 'train' and 'val' DataLoader.
        callback (Callable): Called as ``callback(avg_val_loss, epoch)`` after each
            epoch. Any exception it raises (e.g. to prune the trial) propagates to
            the caller.

    Returns:
        float: The lowest average validation loss seen over the epochs that ran.

    Raises:
        ValueError: If the 'train' or 'val' dataloader yields no batches.
    """
    # Unpack dataloaders
    train_loader, val_loader = dataloaders['train'], dataloaders['val']

    # Fail fast: the per-epoch averages below divide by the number of batches, and an
    # empty val loader would otherwise only surface after a full training epoch.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("Both the 'train' and 'val' dataloaders must yield at least one batch.")

    device = args["device"]
    num_epochs = args["num_epochs"]
    warmup_epochs = args["warmup_epochs"]

    # Set up LR schedulers: linear warmup then cosine annealing
    scheduler_warmup = LinearLR(optimizer, start_factor=args["start_factor"], end_factor=args["end_factor"], total_iters=warmup_epochs)
    # T_max is clamped to >= 1 so that warmup_epochs == num_epochs cannot produce a
    # zero-length cosine phase.
    scheduler_cosine = CosineAnnealingLR(optimizer, T_max=max(1, num_epochs - warmup_epochs), eta_min=args["initial_lr"] * args["lr_factor"])
    # SequentialLR to combine warmup and cosine annealing
    scheduler = SequentialLR(optimizer, schedulers=[scheduler_warmup, scheduler_cosine], milestones=[warmup_epochs])

    # Initialize best validation loss and patience counter
    best_val_loss = float('inf')
    patience_counter = 0

    # Training loop
    for epoch in range(1, num_epochs + 1):
        # Training
        model.train()
        # Initialize total loss for the epoch
        total_loss = 0.0
        # Create a tqdm progress bar for training
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch}/{args['num_epochs']}", unit="batch")
        for images, targets in train_bar:
            # Move images and per-image target dicts to the device
            images = images.to(device)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Forward pass; the total loss is the sum of all entries of the loss dict
            loss_dict = model(images, targets)
            loss = sum(loss_dict.values())

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate loss
            batch_loss = loss.item()
            total_loss += batch_loss
            train_bar.set_postfix(loss=batch_loss, lr=optimizer.param_groups[0]['lr'])

        # Calculate average loss for the epoch
        avg_train_loss = total_loss / len(train_loader)

        # Step scheduler (once per epoch)
        scheduler.step()

        # Validation
        # NOTE(review): the model is deliberately NOT switched to eval() here, so it is
        # still in training mode while validation batches pass through it - presumably
        # because the loss dict is only returned in training mode; confirm. Any layer
        # that behaves differently in training mode is affected.
        total_val_loss = 0.0
        # no gradient calculation for validation
        with torch.no_grad():
            # Create a tqdm progress bar for validation
            for images, targets in tqdm(val_loader, desc="Validating", unit="batch"):
                images = images.to(device)
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                # Forward pass and compute loss
                loss_dict = model(images, targets)
                # Accumulate validation loss
                total_val_loss += sum(loss_dict.values()).item()

        # Calculate average validation loss
        avg_val_loss = total_val_loss / len(val_loader)

        # report the average validation loss to the BOHB callback
        callback(avg_val_loss, epoch)

        # Print training and validation loss
        print(f"Epoch {epoch}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}")

        # Early stopping on val loss
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= args["patience"]:
                print(f"Early stopping at epoch {epoch} (no improvement for {args['patience']} epochs)")
                break

    # return the best validation loss
    return best_val_loss

In [None]:
# constants shared by every trial
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
WARMUP_EPOCHS = 5
NUM_EPOCHS = 50
PATIENCE = 5
END_FACTOR = 1.0

# define the objective function
def objective(trial):
    """
    Optuna objective: build a fresh model and optimizer from the trial's suggested
    hyperparameters, train it with ``bohb_tunner`` and return the best validation loss.

    Args:
        trial (optuna.trial.Trial): Trial used to suggest hyperparameters and to
            report the per-epoch validation loss.

    Returns:
        The best validation loss returned by ``bohb_tunner``.

    Raises:
        optuna.TrialPruned: Raised from the per-epoch callback when the study's
            pruner decides the trial should stop.
    """
    # define callback to report intermediate results
    def on_train_epoch_end(score, epoch):
        trial.report(score, step=epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # suggest hyperparameters for the model
    INITIAL_LR = trial.suggest_float("INITIAL_LR", 1e-5, 1e-2, log=True)
    LR_FACTOR = trial.suggest_float("LR_FACTOR", 1e-3, 1.0, log=True)
    START_FACTOR = trial.suggest_float("START_FACTOR", 1e-5, 1e-1, log=True)
    WEIGHT_DECAY = trial.suggest_float("WEIGHT_DECAY", 1e-5, 1e-2, log=True)
    # used below as the first AdamW beta coefficient
    MOMENTUM = trial.suggest_float("MOMENTUM", 0.7, 0.99)

    # create the model
    model = SSDLITE_MOBILENET_V3_Large(num_classes_with_bg=NUM_CLASSES_WITH_BG)
    # replace the activation function with epu
    epu_activation_fn = EPU()
    replace_activation_function(model, epu_activation_fn)
    # move the model to device
    model.to(DEVICE)

    # create the optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=INITIAL_LR,
        betas=(MOMENTUM, 0.999),
        weight_decay=WEIGHT_DECAY,
        eps=1e-8,
        # DEVICE may fall back to CPU; the fused kernel is only requested on CUDA
        # because older PyTorch releases reject fused=True for CPU parameters.
        fused=(DEVICE.type == "cuda")
    )

    # tune the model
    best_val_loss = bohb_tunner(
        args={
            "device": DEVICE,
            "warmup_epochs": WARMUP_EPOCHS,
            "num_epochs": NUM_EPOCHS,
            "patience": PATIENCE,
            "initial_lr": INITIAL_LR,
            "lr_factor": LR_FACTOR,
            "start_factor": START_FACTOR,
            "end_factor": END_FACTOR
        },
        model=model,
        optimizer=optimizer,
        dataloaders={"train":train_loader, "val":val_loader},
        callback=on_train_epoch_end
    )
    # return the best validation loss
    return best_val_loss

In [None]:
# define the number of trials
NUM_TRIALS = 1

# Create the study: TPE sampler + Hyperband pruner, minimizing the validation loss.
# The sampler is seeded so the suggested hyperparameters are repeatable, in line
# with the seeding done for the other RNGs in this notebook.
# NOTE(review): no `storage` is passed, so the study is created in memory (see the
# run log) and `load_if_exists=True` has no stored study to load - confirm whether a
# persistent storage was intended.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
                            pruner=optuna.pruners.HyperbandPruner(),
                            study_name="ssd_mobnetv3_adis_epu_bohbtune",
                            load_if_exists=True)

try:
    # Run NUM_TRIALS trials; garbage-collect after each one so the previous trial's
    # model and optimizer are released before the next trial allocates its own.
    study.optimize(objective, n_trials=NUM_TRIALS, gc_after_trial=True)
finally:
    # save the study even when a trial raises (e.g. CUDA out of memory), so the
    # trials recorded so far are not lost
    joblib.dump(study, f"{LOCAL_DIR}/ssd_mobnetv3_adis_epu_bohbtune_study1.pkl")

[I 2025-05-14 16:24:58,117] A new study created in memory with name: ssd_mobnetv3_adis_epu_bohbtune
Epoch 1/50:   0%|          | 0/142 [00:05<?, ?batch/s]
[W 2025-05-14 16:25:04,060] Trial 0 failed with parameters: {'INITIAL_LR': 1.3537834182970224e-05, 'LR_FACTOR': 0.007588706700051255, 'START_FACTOR': 0.02697134300947967, 'WEIGHT_DECAY': 0.00013763924621956587, 'MOMENTUM': 0.8023358486147819} because of the following error: OutOfMemoryError('CUDA out of memory. Tried to allocate 132.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 141.12 MiB is free. Process 15281 has 15.74 GiB memory in use. Of the allocated memory 15.44 GiB is allocated by PyTorch, and 23.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)').
Traceback (most recent call las

OutOfMemoryError: CUDA out of memory. Tried to allocate 132.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 141.12 MiB is free. Process 15281 has 15.74 GiB memory in use. Of the allocated memory 15.44 GiB is allocated by PyTorch, and 23.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)