In [1]:
# clone the ADIS repository
!git clone https://github.com/sathishkumar67/SSD_MobileNetV3_ADIS.git
!mv /kaggle/working/SSD_MobileNetV3_ADIS/* /kaggle/working/
!pip install --upgrade pip
# install the required packages
!pip install  -r requirements.txt --upgrade --upgrade-strategy eager
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Cloning into 'SSD_MobileNetV3_ADIS'...
remote: Enumerating objects: 146, done.[K
remote: Counting objects: 100% (146/146), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 146 (delta 88), reused 87 (delta 37), pack-reused 0 (from 0)[K
Receiving objects: 100% (146/146), 62.62 KiB | 6.26 MiB/s, done.
Resolving deltas: 100% (88/88), done.
Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Collecting ultralytics (from -r requirements.txt (line 1))
  Downloading ultralytics-8.3.109-py3-none-any.whl.metadata (37 kB)
Collecting albumentations==2

In [2]:
# necessary imports
import os
from tqdm import tqdm
import torch
import torch.optim as optim
from ssd_mobnetv3_adis import unzip_file
from huggingface_hub import hf_hub_download
from torch.utils.data import DataLoader
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from ssd_mobnetv3_adis import collate_fn, SSDLITEOBJDET_DATASET, CachedSSDLITEOBJDET_DATASET, SSD_MOBILENET_V3_Large_Config, SSD_MOBILENET_V3_Large

In [3]:
# set constants
REPO_ID = "pt-sk/ADIS" 
DATASET_NAME = "balanced_dataset"
REPO_TYPE = "dataset"
FILENAME_IN_REPO = f"{DATASET_NAME}.zip"
LOCAL_DIR = os.getcwd()
DATASET_PATH = f"{LOCAL_DIR}/{FILENAME_IN_REPO}"
DATASET_FOLDER_PATH = f"{LOCAL_DIR}/{DATASET_NAME}"                       
CLASSES = ['Cat', 'Cattle', 'Chicken', 'Deer', 'Dog', 'Squirrel', 'Eagle', 'Goat', 'Rodents', 'Snake'] 
NUM_CLASSES = len(CLASSES)                        
BACKGROUND_CLASS_ID = 0
MODEL_NUM_CLASSES = NUM_CLASSES + 1     # 1 for background class

# download the dataset and unzip it
hf_hub_download(repo_id=REPO_ID, filename=FILENAME_IN_REPO, repo_type=REPO_TYPE, local_dir=LOCAL_DIR)
unzip_file(DATASET_PATH, LOCAL_DIR)

# number of cores
num_cores = os.cpu_count()
print(f"Number of CPU cores: {num_cores}")

balanced_dataset.zip:   0%|          | 0.00/7.04G [00:00<?, ?B/s]

Unzipping: 100%|██████████| 7.07G/7.07G [00:45<00:00, 155MB/s]


Unzipped /kaggle/working/balanced_dataset.zip to /kaggle/working
Removed zip file: /kaggle/working/balanced_dataset.zip
Number of CPU cores: 4


In [4]:
# prepare the dataset
train_dataset = CachedSSDLITEOBJDET_DATASET(
    dataset_class=SSDLITEOBJDET_DATASET,
    root_dir=DATASET_FOLDER_PATH,
    split="train",
    num_classes=MODEL_NUM_CLASSES)

val_dataset = CachedSSDLITEOBJDET_DATASET(
    dataset_class=SSDLITEOBJDET_DATASET,
    root_dir=DATASET_FOLDER_PATH,
    split="val",
    num_classes=MODEL_NUM_CLASSES)

test_dataset = CachedSSDLITEOBJDET_DATASET(
    dataset_class=SSDLITEOBJDET_DATASET,
    root_dir=DATASET_FOLDER_PATH,
    split="test",
    num_classes=MODEL_NUM_CLASSES)

Preprocessing dataset and caching to /kaggle/working/balanced_dataset/train_cache...


100%|██████████| 18139/18139 [04:23<00:00, 68.84it/s] 


Preprocessing dataset and caching to /kaggle/working/balanced_dataset/val_cache...


100%|██████████| 2390/2390 [00:30<00:00, 77.69it/s] 


Preprocessing dataset and caching to /kaggle/working/balanced_dataset/test_cache...


100%|██████████| 2390/2390 [00:30<00:00, 77.16it/s] 


In [7]:
# prepare the dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=num_cores,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=2)

val_loader = DataLoader(
    val_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=num_cores,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=2)

test_loader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=num_cores,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=2)

In [17]:
def train():
    # Set device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Load the model
    model = SSD_MOBILENET_V3_Large(num_classes_with_bg=MODEL_NUM_CLASSES)
    model.to(device)
    
    # Optimizer and scheduler
    optimizer = model.configure_optimizers(lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001, eps=1e-08, fused=True)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
    
    # Training loop parameters
    num_epochs = 50
    warmup_epochs = 4
    patience = 5
    best_map = float('-inf')
    patience_counter = 0
    initial_lr = 0.0001  # Matches the optimizer's initial learning rate
    
    for epoch in range(num_epochs):
        # Warmup phase: linearly increase learning rate for the first 4 epochs
        if epoch < warmup_epochs:
            lr = initial_lr * (epoch + 1) / warmup_epochs
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        # Training phase
        model.train()
        total_loss = 0.0
        num_batches = len(train_loader)
        
        # Progress bar
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        
        for _, (images, targets) in enumerate(train_bar):
            # Move data to device
            images = images.to(device)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            # Forward pass
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            
            # Backward pass and optimization
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            
            batch_loss = losses.detach().item()
            total_loss += batch_loss
            
            # Update progress bar
            train_bar.set_postfix(loss=batch_loss)
        
        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{num_epochs} | Learning Rate: {lr:.6f} | Avg Train Loss: {avg_loss:.4f}")
        
        # Validation phase
        model.eval()
        metric = MeanAveragePrecision()
        eval_bar = tqdm(val_loader, desc=f"Validating...", unit="batch")
        with torch.no_grad():
            for images, targets in eval_bar:
                # Move data to device
                images = images.to(device)
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                # Forward pass    
                predictions = model(images)
                metric.update(predictions, targets)
        
        map_result = metric.compute()
        print(f"Epoch {epoch+1} | Val mAP: {map_result['map']:.4f}")
        
        # Early stopping logic
        if map_result['map'] > best_map:
            best_map = map_result['map']
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered at epoch", epoch + 1)
                break
        
        # Step the learning rate scheduler after warmup
        if epoch >= warmup_epochs:
            lr_scheduler.step()
    
    print("Training complete.")
    print(f"Best mAP: {best_map:.4f}")
    print("Saving the model...")
    
    # Save the model
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, 'ssd_mobilenet_v3_finetuned.pth')

In [18]:
train()

Using device: cuda:0


Downloading: "https://download.pytorch.org/models/ssdlite320_mobilenet_v3_large_coco-a79551df.pth" to /root/.cache/torch/hub/checkpoints/ssdlite320_mobilenet_v3_large_coco-a79551df.pth
100%|██████████| 13.4M/13.4M [00:00<00:00, 163MB/s]
Epoch 1/50: 100%|██████████| 142/142 [02:07<00:00,  1.12batch/s, loss=9.9] 


Epoch 1/50 | Learning Rate: 0.000025 | Avg Train Loss: 12.1161


Validating...: 100%|██████████| 19/19 [00:19<00:00,  1.01s/batch]


Epoch 1 | Val mAP: 0.0021


Epoch 2/50: 100%|██████████| 142/142 [02:02<00:00,  1.16batch/s, loss=5.88]


Epoch 2/50 | Learning Rate: 0.000050 | Avg Train Loss: 7.5462


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.16batch/s]


Epoch 2 | Val mAP: 0.0508


Epoch 3/50: 100%|██████████| 142/142 [02:03<00:00,  1.15batch/s, loss=4.41]


Epoch 3/50 | Learning Rate: 0.000075 | Avg Train Loss: 4.9300


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.17batch/s]


Epoch 3 | Val mAP: 0.2642


Epoch 4/50: 100%|██████████| 142/142 [02:02<00:00,  1.16batch/s, loss=3.83]


Epoch 4/50 | Learning Rate: 0.000100 | Avg Train Loss: 3.9407


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.15batch/s]


Epoch 4 | Val mAP: 0.3452


Epoch 5/50: 100%|██████████| 142/142 [02:01<00:00,  1.17batch/s, loss=3.19]


Epoch 5/50 | Learning Rate: 0.000100 | Avg Train Loss: 3.3614


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.17batch/s]


Epoch 5 | Val mAP: 0.3855


Epoch 6/50: 100%|██████████| 142/142 [02:00<00:00,  1.17batch/s, loss=2.82]


Epoch 6/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.9761


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.12batch/s]


Epoch 6 | Val mAP: 0.4215


Epoch 7/50: 100%|██████████| 142/142 [02:01<00:00,  1.17batch/s, loss=2.71]


Epoch 7/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.6730


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.17batch/s]


Epoch 7 | Val mAP: 0.4477


Epoch 8/50: 100%|██████████| 142/142 [02:00<00:00,  1.18batch/s, loss=2.26]


Epoch 8/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.4396


Validating...: 100%|██████████| 19/19 [00:15<00:00,  1.23batch/s]


Epoch 8 | Val mAP: 0.4529


Epoch 9/50: 100%|██████████| 142/142 [02:01<00:00,  1.16batch/s, loss=2.48]


Epoch 9/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.4047


Validating...: 100%|██████████| 19/19 [00:17<00:00,  1.11batch/s]


Epoch 9 | Val mAP: 0.4543


Epoch 10/50: 100%|██████████| 142/142 [02:01<00:00,  1.16batch/s, loss=2.29]


Epoch 10/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.3741


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.16batch/s]


Epoch 10 | Val mAP: 0.4557


Epoch 11/50: 100%|██████████| 142/142 [02:02<00:00,  1.16batch/s, loss=2.1] 


Epoch 11/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.3465


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.16batch/s]


Epoch 11 | Val mAP: 0.4564


Epoch 12/50: 100%|██████████| 142/142 [01:59<00:00,  1.19batch/s, loss=2.46]


Epoch 12/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.3468


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.13batch/s]


Epoch 12 | Val mAP: 0.4565


Epoch 13/50: 100%|██████████| 142/142 [02:00<00:00,  1.18batch/s, loss=2.39]


Epoch 13/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.3378


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.15batch/s]


Epoch 13 | Val mAP: 0.4567


Epoch 14/50: 100%|██████████| 142/142 [02:01<00:00,  1.17batch/s, loss=2.05]


Epoch 14/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.3357


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.14batch/s]


Epoch 14 | Val mAP: 0.4564


Epoch 15/50: 100%|██████████| 142/142 [02:01<00:00,  1.17batch/s, loss=2.55]


Epoch 15/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.3428


Validating...: 100%|██████████| 19/19 [00:15<00:00,  1.22batch/s]


Epoch 15 | Val mAP: 0.4561


Epoch 16/50: 100%|██████████| 142/142 [02:01<00:00,  1.17batch/s, loss=2.6] 


Epoch 16/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.3358


Validating...: 100%|██████████| 19/19 [00:16<00:00,  1.14batch/s]


Epoch 16 | Val mAP: 0.4563


Epoch 17/50: 100%|██████████| 142/142 [02:00<00:00,  1.18batch/s, loss=2.33]


Epoch 17/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.3336


Validating...: 100%|██████████| 19/19 [00:15<00:00,  1.24batch/s]


Epoch 17 | Val mAP: 0.4562


Epoch 18/50: 100%|██████████| 142/142 [02:02<00:00,  1.16batch/s, loss=2.54]


Epoch 18/50 | Learning Rate: 0.000100 | Avg Train Loss: 2.3385


Validating...: 100%|██████████| 19/19 [00:17<00:00,  1.11batch/s]


Epoch 18 | Val mAP: 0.4561
Early stopping triggered at epoch 18
Training complete.
Best mAP: 0.4567
Saving the model...


In [None]:
# # Define the objective function
# def objective(trial):
    
#     # Define callback to report intermediate results
#     def on_train_epoch_end(score, epoch):
#         trial.report(score, step=epoch)  
#         if trial.should_prune():
#             raise optuna.TrialPruned()

#     callbacks = {
#         "on_train_epoch_end" : on_train_epoch_end
#     }
    
#     # Define hyperparameters using Optuna suggestions
#     lr0 = trial.suggest_float("lr0", 1e-5, 1e-3, log=True)
#     lrf = trial.suggest_float("lrf", 0.1, 1, log=True)
#     weight_decay = trial.suggest_float("weight_decay", 0.0001, 0.01, log=True)
#     warmup_momentum = trial.suggest_float("warmup_momentum", 0.5, 0.9)
#     momentum = trial.suggest_float("momentum", 0.8, 0.99)

In [None]:
# import joblib
# NUM_TRIALS = 3

# study = optuna.create_study(direction='maximize', 
#                             sampler=optuna.samplers.TPESampler(), 
#                             pruner=optuna.pruners.HyperbandPruner(),
#                             study_name="yolo11_tuning",
#                             load_if_exists=True)

# # Optimize with a callback to stop after NUM_TRIALS complete trials
# study.optimize(
#     objective,
#     n_trials=NUM_TRIALS)

# joblib.dump(study, "/kaggle/working/optuna_study.pkl")