In [None]:
# clone the ADIS repository
!git clone https://github.com/sathishkumar67/SSD_MobileNetV3_ADIS.git
!mv /kaggle/working/SSD_MobileNetV3_ADIS/* /kaggle/working/
!pip install --upgrade pip
# install the required packages
!pip install  -r requirements.txt --upgrade --upgrade-strategy eager
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Cloning into 'SSD_MobileNetV3_ADIS'...
remote: Enumerating objects: 173, done.[K
remote: Counting objects: 100% (173/173), done.[K
remote: Compressing objects: 100% (127/127), done.[K
remote: Total 173 (delta 104), reused 107 (delta 46), pack-reused 0 (from 0)[K
Receiving objects: 100% (173/173), 74.96 KiB | 5.35 MiB/s, done.
Resolving deltas: 100% (104/104), done.
Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Collecting ultralytics (from -r requirements.txt (line 1))
  Downloading ultralytics-8.3.109-py3-none-any.whl.metadata (37 kB)
Collecting albumentation

In [None]:
# necessary imports
import os
import torch
import optuna
import joblib
from typing import Tuple
import torch.optim as optim
from tqdm import tqdm
from ssd_mobnetv3_adis import unzip_file
from huggingface_hub import hf_hub_download
from torch.utils.data import DataLoader
from ssd_mobnetv3_adis import collate_fn, SSDLITEOBJDET_DATASET, CachedSSDLITEOBJDET_DATASET, SSD_MOBILENET_V3_Large

In [3]:
# ---- configuration constants ----

# where the zipped dataset lives on the Hugging Face Hub
REPO_ID = "pt-sk/ADIS"
REPO_TYPE = "dataset"
DATASET_NAME = "balanced_dataset"
FILENAME_IN_REPO = DATASET_NAME + ".zip"

# local locations, all rooted at the current working directory
LOCAL_DIR = os.getcwd()
DATASET_PATH = "/".join((LOCAL_DIR, FILENAME_IN_REPO))
DATASET_FOLDER_PATH = "/".join((LOCAL_DIR, DATASET_NAME))

# object categories
CLASSES = [
    'Cat',
    'Cattle',
    'Chicken',
    'Deer',
    'Dog',
    'Squirrel',
    'Eagle',
    'Goat',
    'Rodents',
    'Snake',
]
NUM_CLASSES = len(CLASSES)
BACKGROUND_CLASS_ID = 0
# the model head also needs one extra slot for the background class
MODEL_NUM_CLASSES = 1 + NUM_CLASSES

# fetch the zipped dataset from the Hub into LOCAL_DIR, then extract it in place
hf_hub_download(
    repo_id=REPO_ID,
    repo_type=REPO_TYPE,
    filename=FILENAME_IN_REPO,
    local_dir=LOCAL_DIR,
)
unzip_file(DATASET_PATH, LOCAL_DIR)

# CPU core count of this machine (used later as the DataLoader worker count)
num_cores = os.cpu_count()
print(f"Number of CPU cores: {num_cores}")

balanced_dataset.zip:   0%|          | 0.00/7.04G [00:00<?, ?B/s]

Unzipping: 100%|██████████| 7.07G/7.07G [00:42<00:00, 167MB/s]


Unzipped /kaggle/working/balanced_dataset.zip to /kaggle/working
Removed zip file: /kaggle/working/balanced_dataset.zip
Number of CPU cores: 4


In [4]:
# build the cached train / val / test datasets, in that order, with identical
# settings apart from the split name
train_dataset, val_dataset, test_dataset = (
    CachedSSDLITEOBJDET_DATASET(
        dataset_class=SSDLITEOBJDET_DATASET,
        root_dir=DATASET_FOLDER_PATH,
        split=split_name,
        num_classes=MODEL_NUM_CLASSES,
    )
    for split_name in ("train", "val", "test")
)

Preprocessing dataset and caching to /kaggle/working/balanced_dataset/train_cache...


100%|██████████| 18139/18139 [03:39<00:00, 82.50it/s] 


Preprocessing dataset and caching to /kaggle/working/balanced_dataset/val_cache...


100%|██████████| 2390/2390 [00:26<00:00, 88.65it/s] 


Preprocessing dataset and caching to /kaggle/working/balanced_dataset/test_cache...


100%|██████████| 2390/2390 [00:28<00:00, 84.03it/s] 


In [5]:
# settings shared by all three loaders
_common_loader_args = dict(
    batch_size=128,
    num_workers=num_cores,
    collate_fn=collate_fn,
    pin_memory=True,
    pin_memory_device="cuda:0",
    persistent_workers=True,
    prefetch_factor=2,
)

# only the training split is shuffled; val and test keep dataset order
train_loader = DataLoader(train_dataset, shuffle=True, **_common_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **_common_loader_args)
test_loader = DataLoader(test_dataset, shuffle=False, **_common_loader_args)

In [None]:
def train(warmup_epochs: int, num_epochs: int, patience: int, initial_lr: float, betas: Tuple[float, float], weight_decay: float, dataloaders: dict[str, torch.utils.data.DataLoader], callback) -> float:
    """Train a fresh SSD_MOBILENET_V3_Large and return its best validation mAP.

    Each epoch runs one pass over the training loader, then evaluates mAP on
    the validation loader. The learning rate is ramped up linearly during the
    first ``warmup_epochs`` epochs and afterwards decayed by a StepLR
    scheduler. Training stops early once the validation mAP has not improved
    for ``patience`` consecutive epochs.

    Args:
        warmup_epochs: Number of initial epochs with linear LR warmup
            (0 disables warmup).
        num_epochs: Maximum number of epochs to train.
        patience: Number of consecutive non-improving epochs tolerated before
            early stopping.
        initial_lr: Learning rate reached at the end of warmup.
        betas: Passed through to ``model.configure_optimizers``.
        weight_decay: Passed through to ``model.configure_optimizers``.
        dataloaders: Mapping with the keys ``'train'`` and ``'val'``.
        callback: Called as ``callback(val_map, epoch_number)`` after every
            validation pass, with a 1-based epoch number. Any exception it
            raises propagates to the caller.

    Returns:
        The best validation mAP observed, as a Python float
        (``-inf`` if no epoch was run).
    """
    # The metric class is used below but is not imported anywhere else in the
    # notebook; import it up front so a missing dependency fails before any
    # training time is spent rather than at the first validation pass.
    from torchmetrics.detection.mean_ap import MeanAveragePrecision

    # early stopping parameters
    best_map = float('-inf')
    patience_counter = 0

    # get the dataloaders
    train_loader, val_loader = dataloaders['train'], dataloaders['val']

    # Set device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load the model
    model = SSD_MOBILENET_V3_Large(num_classes_with_bg=MODEL_NUM_CLASSES)
    model.to(device)

    # Optimizer and scheduler
    # NOTE(review): fused=True is passed even when the CPU fallback above is
    # selected — confirm configure_optimizers copes with that.
    optimizer = model.configure_optimizers(lr=initial_lr, betas=betas, weight_decay=weight_decay, eps=1e-08, fused=True)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    for epoch in range(num_epochs):
        # Warmup phase: linearly increase the learning rate over the first
        # `warmup_epochs` epochs
        if epoch < warmup_epochs:
            warmup_lr = initial_lr * (epoch + 1) / warmup_epochs
            for param_group in optimizer.param_groups:
                param_group['lr'] = warmup_lr

        # Read the rate actually in effect from the optimizer so the log is
        # correct after warmup (scheduler-controlled) and when warmup is
        # disabled. Reports the first param group only.
        current_lr = optimizer.param_groups[0]['lr']

        # Training phase
        model.train()
        total_loss = 0.0
        num_batches = len(train_loader)

        # Progress bar
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")

        for images, targets in train_bar:
            # Move data to device
            images = images.to(device)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            # Forward pass: in train mode the model returns a dict of losses
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # Backward pass and optimization
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            batch_loss = losses.detach().item()
            total_loss += batch_loss

            # Update progress bar
            train_bar.set_postfix(loss=batch_loss)

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{num_epochs} | Learning Rate: {current_lr:.6f} | Avg Train Loss: {avg_loss:.4f}")

        # Validation phase
        model.eval()
        metric = MeanAveragePrecision()
        eval_bar = tqdm(val_loader, desc="Validating...", unit="batch")
        with torch.no_grad():
            for images, targets in eval_bar:
                # Move data to device
                images = images.to(device)
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                # Forward pass: without targets the model returns predictions
                predictions = model(images)
                metric.update(predictions, targets)

        # Convert to a plain float so the value reported, compared and
        # returned does not depend on the metric's return type or device
        val_map = float(metric.compute()['map'])
        print(f"Epoch {epoch+1} | Val mAP: {val_map:.4f}")

        # Report the validation mAP
        callback(val_map, epoch+1)

        # Early stopping logic
        if val_map > best_map:
            best_map = val_map
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered at epoch", epoch + 1)
                break

        # Step the learning rate scheduler after warmup
        if epoch >= warmup_epochs:
            lr_scheduler.step()

    print("Training complete.")
    print(f"Best mAP: {best_map:.4f}")
    return best_map

In [None]:
# loaders shared by every trial
dataloaders = dict(train=train_loader, val=val_loader)

def objective(trial):
    """Optuna objective: sample hyperparameters, run `train`, return its result."""

    def _report_to_optuna(score, epoch):
        # hand the intermediate result to Optuna and abort the trial
        # if the pruner asks for it
        trial.report(score, step=epoch)
        if not trial.should_prune():
            return
        raise optuna.TrialPruned()

    # search space (sampled in this fixed order)
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    decay = trial.suggest_float("weight_decay", 1e-5, 1e-2, log=True)
    beta1 = trial.suggest_float("momentum", 0.7, 0.99)  # used as the first beta

    return train(
        warmup_epochs=4,
        num_epochs=10,
        patience=4,
        initial_lr=learning_rate,
        betas=(beta1, 0.999),
        weight_decay=decay,
        dataloaders=dataloaders,
        callback=_report_to_optuna,
    )

In [None]:
NUM_TRIALS = 5

# create the study: maximize the objective with TPE sampling and Hyperband pruning
tpe_sampler = optuna.samplers.TPESampler()
hyperband_pruner = optuna.pruners.HyperbandPruner()
study = optuna.create_study(
    study_name="ssd_mobnetv3_adis_tuning",
    direction='maximize',
    sampler=tpe_sampler,
    pruner=hyperband_pruner,
    load_if_exists=True,
)
# NOTE(review): no storage is passed and the recorded run logged
# "A new study created in memory", so load_if_exists presumably has no
# effect here — confirm.

# run NUM_TRIALS trials of the objective
study.optimize(objective, n_trials=NUM_TRIALS)

[I 2025-04-17 04:26:54,295] A new study created in memory with name: ssd_mobnetv3_adis_tuning


In [None]:
# pickle the study object into LOCAL_DIR
study_pickle_path = f"{LOCAL_DIR}/optuna_study.pkl"
joblib.dump(study, study_pickle_path)

Using device: cuda:0


Epoch 1/10:  87%|████████▋ | 124/142 [01:51<00:16,  1.11batch/s, loss=5.95]
[W 2025-04-17 04:29:02,629] Trial 1 failed with parameters: {'lr': 0.0003258226447882577, 'weight_decay': 7.351008945883096e-05, 'momentum': 0.8069944975912936} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_71/1982897661.py", line 14, in objective
    best_map = train(warmup_epochs=4, num_epochs=10, patience=4, initial_lr=lr, betas=(momentum, 0.999), weight_decay=weight_decay,
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_71/27229307.py", line 40, in train
    loss_dict = model(images, targets)
                ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-pack

KeyboardInterrupt: 

In [None]:
# print("Saving the model...")

# # Save the model
# torch.save({
#     'model_state_dict': model.state_dict(),
#     'optimizer_state_dict': optimizer.state_dict()
# }, 'ssd_mobilenet_v3_finetuned.pth')