In [1]:
# clone the ADIS repository
!git clone https://github.com/sathishkumar67/SSD_MobileNetV3_ADIS.git
# move the files to the current directory
!mv /kaggle/working/SSD_MobileNetV3_ADIS/* /kaggle/working/
# upgrade pip
!pip install --upgrade pip
# install the required packages
!pip install  -r requirements.txt --upgrade --upgrade-strategy eager

Cloning into 'SSD_MobileNetV3_ADIS'...
remote: Enumerating objects: 306, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 306 (delta 40), reused 49 (delta 18), pack-reused 232 (from 2)[K
Receiving objects: 100% (306/306), 99.69 MiB | 20.69 MiB/s, done.
Resolving deltas: 100% (174/174), done.
Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting optuna==4.2.1 (from -r requirements.txt (line 2))
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting huggingface-hub==0.30.

In [None]:
# necessary imports
import os
import optuna
import joblib
from tqdm import tqdm
import random
import numpy as np
from tqdm import tqdm
from huggingface_hub import hf_hub_download
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, RandomSampler
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR
from ssdlite_mobnetv3_adis.utils import unzip_file, replace_activation_function
from ssdlite_mobnetv3_adis.dataset import collate_fn, SSDLITEOBJDET_DATASET, CachedSSDLITEOBJDET_DATASET
from ssdlite_mobnetv3_adis.model import SSDLITE_MOBILENET_V3_Large
from ssdlite_mobnetv3_adis.epu import EPU
from ssdlite_mobnetv3_adis.trainer import bohb_tunner, train


# set random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# set constants
REPO_ID = "pt-sk/ADIS" 
DATASET_NAME = "balanced_dataset"
REPO_TYPE = "dataset"
FILENAME_IN_REPO = f"{DATASET_NAME}.zip"
LOCAL_DIR = os.getcwd()
DATASET_PATH = f"{LOCAL_DIR}/{FILENAME_IN_REPO}"
DATASET_FOLDER_PATH = f"{LOCAL_DIR}/{DATASET_NAME}"                       
CLASSES = ['Cat', 'Cattle', 'Chicken', 'Deer', 'Dog', 'Squirrel', 'Eagle', 'Goat', 'Rodents', 'Snake'] 
NUM_CLASSES = len(CLASSES)
NUM_CLASSES_WITH_BG = NUM_CLASSES + 1    # 1 for background class

# download the dataset and unzip it
hf_hub_download(repo_id=REPO_ID, filename=FILENAME_IN_REPO, repo_type=REPO_TYPE, local_dir=LOCAL_DIR)
unzip_file(DATASET_PATH, LOCAL_DIR)

balanced_dataset.zip:   0%|          | 0.00/7.04G [00:00<?, ?B/s]

Unzipping:  46%|████▋     | 3.27G/7.07G [00:20<00:23, 160MB/s]


KeyboardInterrupt: 

In [None]:
# set pin memory device
PIN_MEMORY_DEVICE = "cuda:0"
NUM_CORES = os.cpu_count()
BATCH_SIZE = 64

# prepare the dataset
train_dataset = CachedSSDLITEOBJDET_DATASET(
    dataset_class=SSDLITEOBJDET_DATASET,
    root_dir=DATASET_FOLDER_PATH,
    split="train",
    num_classes=NUM_CLASSES_WITH_BG)

val_dataset = CachedSSDLITEOBJDET_DATASET(
    dataset_class=SSDLITEOBJDET_DATASET,
    root_dir=DATASET_FOLDER_PATH,
    split="val",
    num_classes=NUM_CLASSES_WITH_BG)

test_dataset = CachedSSDLITEOBJDET_DATASET(
    dataset_class=SSDLITEOBJDET_DATASET,
    root_dir=DATASET_FOLDER_PATH,
    split="test",
    num_classes=NUM_CLASSES_WITH_BG)


# samplers for reproducibility
train_sampler = RandomSampler(train_dataset, generator=torch.Generator().manual_seed(RANDOM_SEED))
val_sampler = RandomSampler(val_dataset, generator=torch.Generator().manual_seed(RANDOM_SEED))
test_sampler = RandomSampler(test_dataset, generator=torch.Generator().manual_seed(RANDOM_SEED))


# prepare the dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
    num_workers=NUM_CORES,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=2,
    pin_memory_device=PIN_MEMORY_DEVICE)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    sampler=val_sampler,
    num_workers=NUM_CORES,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=2,
    pin_memory_device=PIN_MEMORY_DEVICE)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    sampler=test_sampler,
    num_workers=NUM_CORES,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=2,
    pin_memory_device=PIN_MEMORY_DEVICE)

Preprocessing dataset and caching to /kaggle/working/balanced_dataset/train_cache...


100%|██████████| 18139/18139 [03:36<00:00, 83.78it/s] 


Preprocessing dataset and caching to /kaggle/working/balanced_dataset/val_cache...


100%|██████████| 2390/2390 [00:27<00:00, 86.82it/s] 


Preprocessing dataset and caching to /kaggle/working/balanced_dataset/test_cache...


100%|██████████| 2390/2390 [00:30<00:00, 78.10it/s] 


In [None]:
# constants
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
WARMUP_EPOCHS = 10
NUM_EPOCHS = 100
PATIENCE = 10
END_FACTOR = 1.0

# define the objective function
def objective(trial):
    # define callback to report intermidiate results
    def on_train_epoch_end(score, epoch):
        trial.report(score, step=epoch)  
        if trial.should_prune():
            raise optuna.TrialPruned()
        
    # suggest hyperparameters for the model
    INITIAL_LR = trial.suggest_float("INITIAL_LR", 1e-4, 1e-1, log=True)
    LR_FACTOR = trial.suggest_float("LR_FACTOR", 1e-4, 1e-1, log=True)
    START_FACTOR = trial.suggest_float("START_FACTOR", 1e-4, 1e-1, log=True)
    WEIGHT_DECAY = trial.suggest_float("WEIGHT_DECAY", 1e-4, 1e-1, log=True)
    MOMENTUM = trial.suggest_float("MOMENTUM", 0.7, 0.99)
    
    # create the model
    model = SSDLITE_MOBILENET_V3_Large(num_classes_with_bg=NUM_CLASSES_WITH_BG)
    # replace the activation function with epu
    epu_activation_fn = EPU()
    replace_activation_function(model, epu_activation_fn)
    # move the model to device
    model.to(DEVICE)
    
    # create the optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=INITIAL_LR,
        betas=(MOMENTUM, 0.999),
        weight_decay=WEIGHT_DECAY,
        eps=1e-8,
        fused=True
    )
    
    # tune the model
    best_val_loss = bohb_tunner(
        args={
            "device": DEVICE,
            "warmup_epochs": WARMUP_EPOCHS,
            "num_epochs": NUM_EPOCHS,
            "patience": PATIENCE,
            "initial_lr": INITIAL_LR,
            "lr_factor": LR_FACTOR,
            "start_factor": START_FACTOR,
            "end_factor": END_FACTOR
        },
        model=model,
        optimizer=optimizer,
        dataloaders={"train":train_loader, "val":val_loader},
        callback=on_train_epoch_end
    )
    # return the best validation loss
    return best_val_loss

In [None]:
# define the number of trials
NUM_TRIALS = 2

# load the study
study = optuna.create_study(direction='minimize', 
                            sampler=optuna.samplers.TPESampler(), 
                            pruner=optuna.pruners.HyperbandPruner(),
                            study_name="ssd_mobnetv3_adis_epu_bohbtune",
                            load_if_exists=True)

# Optimize with a callback to stop after NUM_TRIALS complete trials
study.optimize(objective, n_trials=NUM_TRIALS)

# save the study
joblib.dump(study, f"{LOCAL_DIR}/ssd_mobnetv3_adis_epu_bohbtune_study1.pkl")

[I 2025-05-15 08:01:37,501] A new study created in memory with name: ssd_mobnetv3_adis_epu_bohbtune
Downloading: "https://download.pytorch.org/models/ssdlite320_mobilenet_v3_large_coco-a79551df.pth" to /root/.cache/torch/hub/checkpoints/ssdlite320_mobilenet_v3_large_coco-a79551df.pth
100%|██████████| 13.4M/13.4M [00:00<00:00, 201MB/s]
Epoch 1/50: 100%|██████████| 284/284 [02:23<00:00,  1.98batch/s, loss=18.8, lr=9.48e-8]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.52batch/s]


Epoch 1: Train Loss=18.8128, Val Loss=18.7341


Epoch 2/50: 100%|██████████| 284/284 [02:20<00:00,  2.03batch/s, loss=5.02, lr=0.000117]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.57batch/s]


Epoch 2: Train Loss=6.5411, Val Loss=5.3463


Epoch 3/50: 100%|██████████| 284/284 [02:19<00:00,  2.04batch/s, loss=5.04, lr=0.000233]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.49batch/s]


Epoch 3: Train Loss=4.9229, Val Loss=4.6310


Epoch 4/50: 100%|██████████| 284/284 [02:19<00:00,  2.03batch/s, loss=3.64, lr=0.000349]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.60batch/s]


Epoch 4: Train Loss=4.1921, Val Loss=4.0116


Epoch 5/50: 100%|██████████| 284/284 [02:20<00:00,  2.02batch/s, loss=3.43, lr=0.000466]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.64batch/s]


Epoch 5: Train Loss=3.6240, Val Loss=3.6525


Epoch 6/50: 100%|██████████| 284/284 [02:19<00:00,  2.03batch/s, loss=2.68, lr=0.000582]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.59batch/s]


Epoch 6: Train Loss=3.2531, Val Loss=3.4761


Epoch 7/50: 100%|██████████| 284/284 [02:19<00:00,  2.04batch/s, loss=3.27, lr=0.000582]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.60batch/s]


Epoch 7: Train Loss=2.9106, Val Loss=3.3584


Epoch 8/50: 100%|██████████| 284/284 [02:18<00:00,  2.05batch/s, loss=2.67, lr=0.00058]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.63batch/s]


Epoch 8: Train Loss=2.6422, Val Loss=3.3122


Epoch 9/50: 100%|██████████| 284/284 [02:19<00:00,  2.04batch/s, loss=1.88, lr=0.000576]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.60batch/s]


Epoch 9: Train Loss=2.4640, Val Loss=3.2487


Epoch 10/50: 100%|██████████| 284/284 [02:19<00:00,  2.04batch/s, loss=2.53, lr=0.000571]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.61batch/s]


Epoch 10: Train Loss=2.2659, Val Loss=3.2604


Epoch 11/50: 100%|██████████| 284/284 [02:19<00:00,  2.04batch/s, loss=1.81, lr=0.000565]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.69batch/s]


Epoch 11: Train Loss=2.0962, Val Loss=3.2481


Epoch 12/50: 100%|██████████| 284/284 [02:19<00:00,  2.04batch/s, loss=1.87, lr=0.000557]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.57batch/s]


Epoch 12: Train Loss=1.9652, Val Loss=3.3271


Epoch 13/50: 100%|██████████| 284/284 [02:19<00:00,  2.04batch/s, loss=2.02, lr=0.000549]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.58batch/s]


Epoch 13: Train Loss=1.8452, Val Loss=3.3426


Epoch 14/50: 100%|██████████| 284/284 [02:19<00:00,  2.03batch/s, loss=1.7, lr=0.000539] 
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.67batch/s]


Epoch 14: Train Loss=1.7274, Val Loss=3.4523


Epoch 15/50: 100%|██████████| 284/284 [02:18<00:00,  2.05batch/s, loss=1.98, lr=0.000527]
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.46batch/s]


Epoch 15: Train Loss=1.5984, Val Loss=3.5069


Epoch 16/50: 100%|██████████| 284/284 [02:19<00:00,  2.04batch/s, loss=1.8, lr=0.000515] 
Validating: 100%|██████████| 38/38 [00:10<00:00,  3.65batch/s]
[I 2025-05-15 08:41:43,746] Trial 0 finished with value: 3.2481229869942916 and parameters: {'INITIAL_LR': 0.0005824283135055773, 'LR_FACTOR': 0.00940428043364665, 'START_FACTOR': 0.0001627105524931517, 'WEIGHT_DECAY': 0.001021068178450286, 'MOMENTUM': 0.78397802624776}. Best is trial 0 with value: 3.2481229869942916.


Epoch 16: Train Loss=1.5051, Val Loss=3.6535
Early stopping at epoch 16 (no improvement for 5 epochs)


Epoch 1/50:  38%|███▊      | 107/284 [00:54<01:29,  1.98batch/s, loss=6.39, lr=0.000823]
[W 2025-05-15 08:42:38,025] Trial 1 failed with parameters: {'INITIAL_LR': 0.08286585374783254, 'LR_FACTOR': 0.07949185602592693, 'START_FACTOR': 0.00993766831573112, 'WEIGHT_DECAY': 0.049146756073796136, 'MOMENTUM': 0.976817165212368} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_75/1394800244.py", line 42, in objective
    best_val_loss = bohb_tunner(
                    ^^^^^^^^^^^^
  File "/tmp/ipykernel_75/2640974943.py", line 63, in bohb_tunner
    total_loss += loss.item()
                  ^^^^^^^^^^^
KeyboardInterrupt
[W 2025-05-15 08:42:38,027] Trial 1 failed with value None.


KeyboardInterrupt: 