In [None]:
# This cell downloads repository utilities and installs dependencies.
# It's intended for ephemeral notebook environments (Colab / Kaggle).
# If running locally you can skip the git clone/move steps and use the local files directly.
# Clone the ADIS helper repository (contains model & training code).
!git clone https://github.com/sathishkumar67/SSD_MobileNetV3_ADIS.git
# On some hosted runtimes the repo contents may need moving to the working directory.
# The following mv command was used for Kaggle examples; remove or adapt it on other platforms.
!mv /kaggle/working/SSD_MobileNetV3_ADIS/* /kaggle/working/  # adapt to your environment if needed
# Ensure pip is up-to-date then install required python packages from requirements.txt.
!pip install --upgrade pip
!pip install -r requirements.txt

In [None]:
# Core imports and reproducibility setup
import os
import optuna
import joblib
from tqdm import tqdm
import random
import numpy as np
from huggingface_hub import hf_hub_download
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, RandomSampler
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR
# Utilities and model components from the ADIS helper package
from ssdlite_mobnetv3_adis.utils import unzip_file, replace_activation_function
from ssdlite_mobnetv3_adis.dataset import collate_fn, SSDLITEOBJDET_DATASET, CachedSSDLITEOBJDET_DATASET
from ssdlite_mobnetv3_adis.model import SSDLITE_MOBILENET_V3_Large
from ssdlite_mobnetv3_adis.epu import EPU

# Reproducibility: seed every random number generator used in this notebook with one value.
# NOTE: Complete determinism in CUDA may not be achievable across different drivers/hardware.
RANDOM_SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    # Same order as listing them one by one: Python, NumPy, torch (CPU), torch (all CUDA devices).
    _seed_rng(RANDOM_SEED)

# Ask cuDNN for deterministic algorithms and switch its benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Dataset location and label metadata.
# REPO_ID names a Hugging Face dataset repo that holds the data as one archive, balanced_dataset.zip.
REPO_ID, REPO_TYPE = "pt-sk/ADIS", "dataset"
DATASET_NAME = "balanced_dataset"
FILENAME_IN_REPO = DATASET_NAME + ".zip"
LOCAL_DIR = os.getcwd()
# Both paths live directly under the current working directory.
DATASET_PATH = "/".join((LOCAL_DIR, FILENAME_IN_REPO))
DATASET_FOLDER_PATH = "/".join((LOCAL_DIR, DATASET_NAME))

# Foreground class names only (the background class is not listed here).
CLASSES = [
    'Cat', 'Cattle', 'Chicken', 'Deer', 'Dog',
    'Squirrel', 'Eagle', 'Goat', 'Rodents', 'Snake',
]
NUM_CLASSES = len(CLASSES)
NUM_CLASSES_WITH_BG = 1 + NUM_CLASSES  # one extra slot for the background class used by SSD

# Fetch the archive into LOCAL_DIR, then unpack it in the same directory.
# If the archive is already present, hf_hub_download will reuse the file.
hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME_IN_REPO,
    repo_type=REPO_TYPE,
    local_dir=LOCAL_DIR,
)
unzip_file(DATASET_PATH, LOCAL_DIR)

In [None]:
# DataLoader setup: build the cached dataset, a seeded sampler and a DataLoader for each split.
# PIN_MEMORY_DEVICE is forwarded to DataLoader(pin_memory_device=...) when pinning is enabled.
PIN_MEMORY_DEVICE = "cuda:0"
# os.cpu_count() may return None; fall back to a single worker so num_workers is always a
# positive int (persistent_workers and prefetch_factor both require at least one worker).
NUM_CORES = os.cpu_count() or 1
BATCH_SIZE = 64


def _build_split(split: str):
    """Create the cached dataset, seeded RandomSampler and DataLoader for one split.

    Args:
        split: name of the dataset split to load ("train", "val" or "test").

    Returns:
        tuple: (dataset, sampler, loader) for the requested split.
    """
    # Cached dataset wrapper for faster access during tuning/training
    dataset = CachedSSDLITEOBJDET_DATASET(
        dataset_class=SSDLITEOBJDET_DATASET,
        root_dir=DATASET_FOLDER_PATH,
        split=split,
        num_classes=NUM_CLASSES_WITH_BG)

    # RandomSampler with a fixed generator seed gives a reproducible sampling order across runs
    sampler = RandomSampler(dataset, generator=torch.Generator().manual_seed(RANDOM_SEED))

    # Only pin host memory when a CUDA device exists; the rest of the notebook falls back
    # to CPU in that case, and pinning to "cuda:0" would be pointless there.
    use_pinned_memory = torch.cuda.is_available()

    # persistent_workers=True can improve throughput but keeps NUM_CORES worker processes
    # alive per loader, so watch worker resource usage.
    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        sampler=sampler,
        num_workers=NUM_CORES,
        collate_fn=collate_fn,
        pin_memory=use_pinned_memory,
        persistent_workers=True,
        prefetch_factor=2,
        pin_memory_device=PIN_MEMORY_DEVICE if use_pinned_memory else "")
    return dataset, sampler, loader


train_dataset, train_sampler, train_loader = _build_split("train")
val_dataset, val_sampler, val_loader = _build_split("val")
test_dataset, test_sampler, test_loader = _build_split("test")

In [None]:
def _freeze_stateful_layers(model: nn.Module) -> list:
    """Put every training-mode BatchNorm/Dropout submodule into eval mode and return them."""
    stateful_types = (nn.modules.batchnorm._BatchNorm, nn.modules.dropout._DropoutNd)
    frozen = [
        layer for layer in model.modules()
        # Never flip the top-level module itself: its training flag must stay as it is.
        if layer is not model and layer.training and isinstance(layer, stateful_types)
    ]
    for layer in frozen:
        layer.eval()
    return frozen


def bohb_tunner(
    args: dict,
    model: nn.Module,
    optimizer: optim.Optimizer,
    dataloaders: dict[str, torch.utils.data.DataLoader],
    callback,
) -> float:
    """Train helper used by the Optuna/BOHB objective.

    Runs the training loop for a single trial with a two-stage, per-epoch LR schedule:
    1) Linear warmup for `warmup_epochs` epochs (LinearLR)
    2) Cosine annealing for the remaining epochs (CosineAnnealingLR)

    After every epoch the mean validation loss is passed to `callback`, and training stops
    early once the validation loss has not improved for `patience` consecutive epochs.

    Args:
        args: configuration dict with keys `device`, `warmup_epochs`, `num_epochs`, `patience`,
              `initial_lr`, `lr_factor`, `start_factor`, and `end_factor`.
        model: a PyTorch nn.Module implementing the SSD-style forward(images, targets) -> loss_dict API.
        optimizer: optimizer instance (e.g., AdamW) already constructed for the model.
        dataloaders: dict containing the 'train' and 'val' DataLoader objects.
        callback: callable(callback_score, epoch) used to report intermediate results (Optuna).
                  Any exception it raises (e.g. a pruning signal) propagates to the caller.

    Returns:
        float: best validation loss observed during training (lower is better).
    """
    train_loader, val_loader = dataloaders['train'], dataloaders['val']
    device = args["device"]

    # Two-phase scheduler: linear warmup -> cosine annealing.
    # T_max is clamped to at least 1 so that warmup_epochs == num_epochs cannot produce a
    # zero-length cosine phase (which divides by zero inside CosineAnnealingLR).
    cosine_epochs = max(1, args["num_epochs"] - args["warmup_epochs"])
    scheduler_warmup = LinearLR(optimizer, start_factor=args["start_factor"], end_factor=args["end_factor"], total_iters=args["warmup_epochs"])
    scheduler_cosine = CosineAnnealingLR(optimizer, T_max=cosine_epochs, eta_min=args["initial_lr"] * args["lr_factor"])
    scheduler = SequentialLR(optimizer, schedulers=[scheduler_warmup, scheduler_cosine], milestones=[args["warmup_epochs"]])

    # Early stopping bookkeeping
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(1, args["num_epochs"] + 1):
        model.train()
        total_loss = 0.0
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch}/{args['num_epochs']}", unit="batch")
        for images, targets in train_bar:
            # Move the batch to the configured device
            images = images.to(device)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Forward: model returns a dict of losses (SSD-style). Sum to scalar loss for backward.
            loss_dict = model(images, targets)
            loss = sum(loss_dict.values())

            # Backpropagation step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once; every .item() call forces a device synchronisation.
            step_loss = loss.item()
            total_loss += step_loss
            train_bar.set_postfix(loss=step_loss, lr=optimizer.param_groups[0]['lr'])

        avg_train_loss = total_loss / max(1, len(train_loader))

        # Advance the LR scheduler by one epoch
        scheduler.step()

        # Validation. The model itself stays in training mode because this loop needs the
        # loss dict from forward(images, targets) (presumably only returned in training mode,
        # as with torchvision detection models - confirm against the model implementation).
        # Only BatchNorm/Dropout layers are switched to eval so that validation batches do not
        # update running statistics and are not perturbed by dropout.
        total_val_loss = 0.0
        frozen_layers = _freeze_stateful_layers(model)
        try:
            with torch.no_grad():
                for images, targets in tqdm(val_loader, desc="Validating", unit="batch"):
                    images = images.to(device)
                    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                    loss_dict = model(images, targets)
                    total_val_loss += sum(loss_dict.values()).item()
        finally:
            # Restore the layers so the model leaves this function in its usual training state.
            for layer in frozen_layers:
                layer.train()

        avg_val_loss = total_val_loss / max(1, len(val_loader))

        # Report the validation loss back to the hyperparameter tuner
        callback(avg_val_loss, epoch)

        print(f"Epoch {epoch}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}")

        # Early stopping on validation loss
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= args["patience"]:
                print(f"Early stopping at epoch {epoch} (no improvement for {args['patience']} epochs)")
                break

    return best_val_loss

In [None]:
# Shared settings for the Optuna/BOHB tuning run; the objective function forwards them to the trainer.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
NUM_EPOCHS = 100     # upper bound on epochs per trial
WARMUP_EPOCHS = 10   # length of the linear LR warmup
PATIENCE = 10        # early-stopping patience, in epochs
END_FACTOR = 1.0     # LR factor reached at the end of warmup

def objective(trial):
    """Objective function for Optuna tuning.

    Executed once per trial: samples hyperparameters, builds a fresh model and optimizer,
    trains via `bohb_tunner`, and returns the scalar to minimize.

    Args:
        trial: the Optuna trial used to sample hyperparameters and report intermediate scores.

    Returns:
        float: best validation loss observed during the trial.

    Raises:
        optuna.TrialPruned: when the study's pruner decides to stop the trial early.
    """
    def on_train_epoch_end(score, epoch):
        # Report the intermediate score to the trial (used by pruning handlers)
        trial.report(score, step=epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Suggest hyperparameters to search over. Use log sampling for scale-sensitive params.
    initial_lr = trial.suggest_float("INITIAL_LR", 1e-4, 1e-1, log=True)
    lr_factor = trial.suggest_float("LR_FACTOR", 1e-4, 1e-1, log=True)
    start_factor = trial.suggest_float("START_FACTOR", 1e-4, 1e-1, log=True)
    weight_decay = trial.suggest_float("WEIGHT_DECAY", 1e-6, 1e-1, log=True)
    momentum = trial.suggest_float("MOMENTUM", 0.7, 0.99)  # used as AdamW beta1

    # Build the model, swap its activation functions for EPU and move it to the device
    model = SSDLITE_MOBILENET_V3_Large(num_classes_with_bg=NUM_CLASSES_WITH_BG)
    epu_activation_fn = EPU()
    replace_activation_function(model, epu_activation_fn)
    model.to(DEVICE)

    # Create the optimizer for this trial's hyperparameters.
    # The fused implementation is only requested on CUDA, so the CPU fallback of DEVICE
    # does not fail on PyTorch builds whose fused AdamW needs CUDA parameters.
    optimizer = optim.AdamW(
        model.parameters(),
        lr=initial_lr,
        betas=(momentum, 0.999),
        weight_decay=weight_decay,
        eps=1e-8,
        fused=(DEVICE.type == "cuda")
    )

    # Run the tuning loop and return the best validation loss observed
    best_val_loss = bohb_tunner(
        args={
            "device": DEVICE,
            "warmup_epochs": WARMUP_EPOCHS,
            "num_epochs": NUM_EPOCHS,
            "patience": PATIENCE,
            "initial_lr": initial_lr,
            "lr_factor": lr_factor,
            "start_factor": start_factor,
            "end_factor": END_FACTOR
        },
        model=model,
        optimizer=optimizer,
        dataloaders={"train": train_loader, "val": val_loader},
        callback=on_train_epoch_end
    )
    return best_val_loss

In [None]:
# Run hyperparameter optimization
# Configure number of trials and create or load an existing Optuna study.
NUM_TRIALS = 30  # adjust to your compute/time budget

# If you want to resume a previous study, provide a storage URL or load the joblib dump instead.
# study = joblib.load("/path/to/study.pkl")
# Direction should be 'minimize' for validation loss or 'maximize' for metrics like mAP.
# A seeded TPE sampler makes the suggested hyperparameters reproducible, and pairing it with
# the Hyperband pruner is the BOHB-style setup that the study name refers to. The pruner's
# resource is the epoch index reported by the objective (1..NUM_EPOCHS).
study = optuna.create_study(
    direction="minimize",
    study_name="ssd_mobnetv3_adis_epu_bohbtune_study",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    pruner=optuna.pruners.HyperbandPruner(min_resource=1, max_resource=NUM_EPOCHS, reduction_factor=3),
)

try:
    # Execute the optimization loop. This will run `objective` NUM_TRIALS times
    # (may be parallelized with RDB storage).
    study.optimize(objective, n_trials=NUM_TRIALS)
finally:
    # Persist the study even if the run is interrupted or a trial crashes, so finished
    # trials can still be inspected or resumed later.
    joblib.dump(study, os.path.join(LOCAL_DIR, 'optuna_study.pkl'))

In [None]:
# Reload the persisted Optuna study from LOCAL_DIR
_study_file = os.path.join(LOCAL_DIR, 'optuna_study.pkl')
study = joblib.load(_study_file)
# Last expression of the cell: displays the best trial (value, params, etc.)
study.best_trial

In [None]:
# Train final model using the best hyperparameters found by the study
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
WARMUP_EPOCHS = 10
NUM_EPOCHS = 100
PATIENCE = 10
END_FACTOR = 1.0
# Load best hyperparameters from the Optuna study. Ensure `study` is defined or loaded above.
INITIAL_LR = study.best_params["INITIAL_LR"]
LR_FACTOR = study.best_params["LR_FACTOR"]
START_FACTOR = study.best_params["START_FACTOR"]
WEIGHT_DECAY = study.best_params["WEIGHT_DECAY"]
MOMENTUM = study.best_params["MOMENTUM"]  # used as AdamW beta1 below

# Instantiate and prepare model for final training
model = SSDLITE_MOBILENET_V3_Large(num_classes_with_bg=NUM_CLASSES_WITH_BG)
epu_activation_fn = EPU()
replace_activation_function(model, epu_activation_fn)
model.to(DEVICE)

# Final optimizer using the selected hyperparameters.
# The fused implementation is only requested on CUDA, so the CPU fallback of DEVICE
# does not fail on PyTorch builds whose fused AdamW needs CUDA parameters.
optimizer = optim.AdamW(
    model.parameters(),
    lr=INITIAL_LR,
    betas=(MOMENTUM, 0.999),
    weight_decay=WEIGHT_DECAY,
    eps=1e-8,
    fused=(DEVICE.type == "cuda")
)

# Use the project's train function to run a full training session. The train function should accept
# the optimizer, model, dataloaders and other configs - consult ssdlite_mobnetv3_adis.trainer for details.
from ssdlite_mobnetv3_adis.trainer import train
# NOTE(review): train() is still a placeholder call without arguments. Its signature is not
# visible in this notebook; pass the model, optimizer, dataloaders and schedule settings
# defined above as that signature requires before running this cell.
train()