In [1]:
!git clone https://github.com/sathishkumar67/ADIS.git
!mv /kaggle/working/ADIS/* /kaggle/working/
!pip install --upgrade pip
!pip install  -r requirements.txt --upgrade --upgrade-strategy eager
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Cloning into 'ADIS'...


remote: Enumerating objects: 1083, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 1083 (delta 57), reused 73 (delta 27), pack-reused 978 (from 1)[K
Receiving objects: 100% (1083/1083), 39.97 MiB | 33.57 MiB/s, done.
Resolving deltas: 100% (553/553), done.
Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Collecting ultralytics (from -r requirements.txt (line 1))
  Downloading ultralytics-8.3.107-py3-none-any.whl.metadata (37 kB)
Collecting albumentations==2.0.5 (from -r requirements.txt (

In [2]:
# necessary imports
import os
from utils import unzip_file
from huggingface_hub import hf_hub_download

REPO_ID = "pt-sk/ADIS" 
DATASET_NAME = "balanced_dataset"
REPO_TYPE = "dataset"
FILENAME_IN_REPO = f"{DATASET_NAME}.zip"
LOCAL_DIR = os.getcwd()
DATASET_PATH = f"{LOCAL_DIR}/{FILENAME_IN_REPO}"
DATASET_FOLDER_PATH = f"{LOCAL_DIR}/{DATASET_NAME}"
NUM_CLASSES = 10                                               
CLASSES = ['Cat', 'Cattle', 'Chicken', 'Deer', 'Dog', 'Squirrel', 'Eagle', 'Goat', 'Rodents', 'Snake'] 
BACKGROUND_CLASS_ID = 0
MODEL_NUM_CLASSES = NUM_CLASSES + 1     # 1 for background class

# download the dataset and unzip it
hf_hub_download(repo_id=REPO_ID, filename=FILENAME_IN_REPO, repo_type=REPO_TYPE, local_dir=LOCAL_DIR)
unzip_file(DATASET_PATH, LOCAL_DIR)

# remove dataset.zip
os.remove(DATASET_PATH)

# number of cores
num_cores = os.cpu_count()
print(f"Number of CPU cores: {num_cores}")

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


balanced_dataset.zip:   0%|          | 0.00/7.04G [00:00<?, ?B/s]

Unzipping: 100%|██████████| 7.07G/7.07G [00:43<00:00, 162MB/s]


Number of CPU cores: 4


In [None]:
from typing import List, Tuple, Dict, Any
from tqdm import tqdm
import time
import os
import cv2
import lmdb
import numpy as np
import shutil
from functools import partial
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection import ssdlite320_mobilenet_v3_large
from torchvision.models.detection.ssdlite import SSDLiteClassificationHead
from torchvision.models.detection import _utils as det_utils
from torchmetrics.detection import MeanAveragePrecision 

In [4]:
class SSDLITEOBJDET_DATASET(Dataset):
    def __init__(self, root_dir: str, split: str, num_classes: int, img_size: int=320, mode: str="train", dtype=np.float32) -> None:
        super().__init__()
        self.root_dir, self.split, self.img_size, self.num_classes = root_dir, split.lower(), img_size, num_classes
        self.current_dir = os.path.join(self.root_dir, self.split)
        self.mode =  mode
        self.dtype = dtype
        
        # check if model is train or eval
        if self.mode not in ["train", "eval"]:
            raise ValueError(f"Invalid mode: {self.mode}. Expected 'train' or 'eval'.")
        
        # set interpolation method for resizing
        self.interpolation = cv2.INTER_LANCZOS4 if self.mode == "train" else cv2.INTER_LINEAR

        # Validate current directory
        if not os.path.exists(self.current_dir):
            raise FileNotFoundError(f"{self.current_dir} does not exist.")
        elif not os.path.isdir(self.current_dir):
            raise NotADirectoryError(f"{self.current_dir} is not a directory.")
        
        # check if the split directory is empty
        if len(os.listdir(self.current_dir)) == 0:
            raise ValueError(f"The directory {self.current_dir} is empty.")
        
        # get image and label files
        self.image_files = sorted(
            [os.path.join(self.current_dir, f) for f in os.listdir(self.current_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))],
            key=lambda x: os.path.splitext(x)[0]
        )
        self.label_files = [os.path.join(self.current_dir, os.path.splitext(f)[0] + '.txt') for f in self.image_files]

        # Validate existence for ALL label files
        for img_file, lbl_file in zip(self.image_files, self.label_files):
            if not os.path.exists(lbl_file):
                raise FileNotFoundError(f"Label file missing for {img_file}")

    def __len__(self) -> int:
        return len(self.image_files)

    def __getitem__(self, idx) -> Tuple[torch.Tensor, dict]:
        img_path, label_path = self.image_files[idx], self.label_files[idx]

        # Read image and convert to RGB format
        image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
        orig_height, orig_width, _ = image.shape
        # tensor with uint8 datatype 
        image = cv2.resize(image, (self.img_size, self.img_size), interpolation=self.interpolation)
        
        # Read label file and parse the bounding boxes and labels
        data = np.loadtxt(label_path, dtype=self.dtype, delimiter=' ', ndmin=2)
        
        if data.size == 0:
            return image, {
                'boxes': np.array([[0.0, 0.0, 1.0, 1.0]], dtype=self.dtype),
                'labels': np.array(0, dtype=np.uint8)
            }
        else:
            # Convert normalized box coordinates into absolute coordinates, where orig_width and orig_height are your original dimensions.
            cx, cy, w, h = data[:, 1], data[:, 2], data[:, 3], data[:, 4]
            xmin = np.maximum(0, (cx - w/2) * orig_width)
            ymin = np.maximum(0, (cy - h/2) * orig_height)
            xmax = np.minimum(orig_width, (cx + w/2) * orig_width)
            ymax = np.minimum(orig_height, (cy + h/2) * orig_height)
            
            # Filter degenerate boxes (width or height less than 1)
            valid_mask = ((xmax - xmin) >= 1) & ((ymax - ymin) >= 1)
            valid_boxes = np.stack([xmin[valid_mask], ymin[valid_mask],
                                    xmax[valid_mask], ymax[valid_mask]], axis=1)

            # Adjust class IDs (cid from first column)
            valid_labels = data[valid_mask, 0].astype(np.uint8) 
            np.add(valid_labels, 1, out=valid_labels)  # Increment class IDs by 1 for background class

            # scale boxes to new image size
            scale_factors = np.array([self.img_size / orig_width, self.img_size / orig_height,
                                    self.img_size / orig_width, self.img_size / orig_height], dtype=valid_boxes.dtype)
            np.multiply(valid_boxes, scale_factors, out=valid_boxes)
            
            # Validate class IDs
            if np.all((valid_labels < 0) & (valid_labels >= self.num_classes)):
                raise ValueError(f"Invalid class ID in {label_path}")

            if self.mode == "train":
                np.divide(valid_boxes, self.img_size, out=valid_boxes) # Normalize boxes to [0, 1]

            return image, {
                'boxes': valid_boxes,
                'labels': valid_labels}
        
    def denormalize_bbox(self, boxes: torch.Tensor|np.ndarray) -> torch.Tensor|np.ndarray:
        # Denormalize boxes to original size
        return boxes * self.img_size
    
    def normalize_bbox(self, boxes: torch.Tensor|np.ndarray) -> torch.Tensor|np.ndarray:
        # Normalize boxes to [0, 1]
        return boxes / self.img_size
    
    def denormalize_image(self, image: torch.Tensor|np.ndarray) -> torch.Tensor|np.ndarray:
        # Denormalize image to [0, 255]
        return image * 255.0
    
    def normalize_image(self, image: torch.Tensor|np.ndarray) -> torch.Tensor|np.ndarray:
        # Normalize image to [0, 1]
        return image / 255.0
    

def collate_fn(batch):
    """
    Collate function to process a batch of samples from SSDLITEOBJDET_DATASET.
    
    Args:
        batch: List of tuples containing (image, target_dict)
    
    Returns:
        Tuple of (images, targets) where:
        - images: Tensor of shape (B, C, H, W) with normalized images
        - targets: List of dicts with 'boxes' and 'labels' tensors for each image
    """
    images = []
    targets = []

    # Process each sample in the batch
    for img, tgt in batch:
        # Convert HWC numpy array to CHW tensor and normalize to [0, 1]
        img_tensor = torch.from_numpy(img).permute(2, 0, 1).float()
        img_tensor /= 255.0
        images.append(img_tensor)
        
        # Convert annotations to tensors
        boxes = torch.as_tensor(tgt['boxes'], dtype=torch.float32)
        labels = torch.as_tensor(tgt['labels'], dtype=torch.int64)
        
        targets.append({
            'boxes': boxes,
            'labels': labels
        })
    
    return torch.stack(images, dim=0), targets

In [18]:
class CachedSSDLITEOBJDET_DATASET(Dataset):
    def __init__(self, dataset_class :SSDLITEOBJDET_DATASET, 
                root_dir: str, 
                split: str, 
                num_classes: int, 
                img_size: int=320, 
                dtype: np.dtype=np.float32, 
                mode: str="train",
                lmdb_path: str = None,
                map_size: int=1099511627776) -> None:
        super().__init__()
        
        self.root_dir, self.split, self.img_size, self.num_classes = root_dir, split.lower(), img_size, num_classes
        self.dtype = dtype
        self.mode = mode.lower()
        self.dataset_class = dataset_class
        self.map_size = map_size
        self.lmdb_path = lmdb_path if lmdb_path else os.path.join(self.root_dir, f"{self.split}_cache")
        
        # preprocess the dataset and cache it in lmdb
        self.preprocess_dataset()
        
        self.env = lmdb.open(self.lmdb_path, readonly=True, lock=False)
        with self.env.begin() as txn:
            self.length = txn.stat()['entries']

    
    def __len__(self):
        return self.length

    
    def __getitem__(self, idx):
        with self.env.begin() as txn:
            data = txn.get(str(idx).encode())
        return pickle.loads(data)
    
    
    def preprocess_dataset(self) -> None:
        dataset = self.dataset_class(root_dir=self.root_dir,
                                    split=self.split, 
                                    num_classes=self.num_classes, 
                                    img_size=self.img_size, 
                                    dtype=self.dtype, 
                                    mode=self.mode)
        # Create LMDB environment
        env = lmdb.open(self.lmdb_path, map_size=self.map_size)  # 1TB
        
        with env.begin(write=True) as txn:
            for idx in tqdm(range(len(dataset))):
                image, target = dataset[idx]
                
                # Serialize and store
                txn.put(
                    str(idx).encode(),
                    pickle.dumps((image, target), protocol=pickle.HIGHEST_PROTOCOL)
                )

        shutil.rmtree(os.path.join(self.root_dir, self.split))
        del dataset
    
    
    def denormalize_bbox(self, boxes: torch.Tensor|np.ndarray) -> torch.Tensor|np.ndarray:
        # Denormalize boxes to original size
        return boxes * self.img_size
    
    
    def normalize_bbox(self, boxes: torch.Tensor|np.ndarray) -> torch.Tensor|np.ndarray:
        # Normalize boxes to [0, 1]
        return boxes / self.img_size
    
    
    def denormalize_image(self, image: torch.Tensor|np.ndarray) -> torch.Tensor|np.ndarray:
        # Denormalize image to [0, 255]
        return image * 255.0
    
    
    def normalize_image(self, image: torch.Tensor|np.ndarray) -> torch.Tensor|np.ndarray:
        # Normalize image to [0, 1]
        return image / 255.0
    
def collate_fn(batch):
    """
    Collate function to process a batch of samples from SSDLITEOBJDET_DATASET.
    
    Args:
        batch: List of tuples containing (image, target_dict)
    
    Returns:
        Tuple of (images, targets) where:
        - images: Tensor of shape (B, C, H, W) with normalized images
        - targets: List of dicts with 'boxes' and 'labels' tensors for each image
    """
    images = []
    targets = []
    
    # Process each sample in the batch
    for img, tgt in batch:
        images.append(img)
        targets.append(tgt)
    
    return np.stack(images, axis=0), targets

In [6]:
# train_dataset = CachedSSDLITEOBJDET_DATASET(SSDLITEOBJDET_DATASET,
#                                         root_dir=DATASET_FOLDER_PATH, 
#                                         split='train', 
#                                         num_classes=MODEL_NUM_CLASSES, 
#                                         img_size=320, 
#                                         dtype=np.float32, 
#                                         mode='train')

val_dataset = CachedSSDLITEOBJDET_DATASET(SSDLITEOBJDET_DATASET,
                                        root_dir=DATASET_FOLDER_PATH, 
                                        split='val', 
                                        num_classes=MODEL_NUM_CLASSES, 
                                        img_size=320, 
                                        dtype=np.float32, 
                                        mode='train')

# test_dataset = CachedSSDLITEOBJDET_DATASET(SSDLITEOBJDET_DATASET,
#                                         root_dir=DATASET_FOLDER_PATH, 
#                                         split='test', 
#                                         num_classes=MODEL_NUM_CLASSES, 
#                                         img_size=320, 
#                                         dtype=np.float32, 
#                                         mode='train')

100%|██████████| 2390/2390 [00:32<00:00, 72.77it/s] 


In [19]:
val_loader = DataLoader(val_dataset, 
                        batch_size=64, 
                        shuffle=True, 
                        collate_fn=collate_fn, 
                        num_workers=num_cores,
                        pin_memory=True,
                        persistent_workers=True,
                        prefetch_factor=2)

In [None]:
# prepare the data for training
imgs, targets = next(iter(val_loader))

image = torch.as_tensor(imgs, dtype=torch.float32, device="cuda:0").permute(0, 3, 1, 2)
image.div_(255.0)

for target in targets:
    target["boxes"] = torch.as_tensor(target["boxes"], dtype=torch.float32, device="cuda:0")
    target["labels"] = torch.as_tensor(target["labels"], dtype=torch.int64, device="cuda:0")

In [None]:
class SSD_MOBILENET_V3_Large(nn.Module):
    def __init__(self, num_classes_with_bg:int, img_size: int=320) -> None:
        super(SSD_MOBILENET_V3_Large, self).__init__()
        self.num_classes_with_bg = num_classes_with_bg
        self.img_size = img_size
        self.model = ssdlite320_mobilenet_v3_large(weights='COCO_V1', weights_backbone="IMAGENET1K_V2") 
        self.model.head.classification_head = SSDLiteClassificationHead(
            in_channels=det_utils.retrieve_out_channels(self.model.backbone, (self.img_size, self.img_size)),
            num_anchors=self.model.anchor_generator.num_anchors_per_location(),
            num_classes=self.num_classes_with_bg,
            norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.03)
        )
        self.model.detections_per_img = 100
    
    def configure_optimizers(self, lr: float = 0.0001, betas: Tuple[float, float] = (0.9, 0.999), weight_decay: float = 0.0001, eps: float = 1e-08, fused: bool = True) -> torch.optim.Optimizer:        
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for _, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for _, p in param_dict.items() if p.dim() < 2]

        # Create AdamW optimizer and use the fused version if available 
        return torch.optim.AdamW([{'params': decay_params, 'weight_decay': weight_decay},
                                    {'params': nodecay_params, 'weight_decay': 0.0}], 
                                    lr=lr, 
                                    betas=betas, 
                                    eps=eps, 
                                    fused=fused)
    
    def forward(self, images: torch.Tensor, targets: dict=None) :
        return self.model(images, targets)
    
    def load(self, checkpoint_path: dict, key_name: str = "model_state_dict", map_location: str = "cpu") -> None:
        """
        Load the model state dict from a checkpoint file.

        Args:
            checkpoint_path (str): Path to the checkpoint file.
            key_name (str): Key name in the checkpoint file to load the model state dict.
            map_location (str): Map location for loading the checkpoint.
        """
        start_time = time.time()
        print(f"Loading checkpoint from {checkpoint_path}...")
        self.load_state_dict(torch.load(checkpoint_path, map_location=map_location)[key_name])
        print(f"Checkpoint loaded in {time.time() - start_time:.2f} seconds.")

In [57]:
# model = SSD_MOBILENET_V3_Large(num_classes_with_bg=MODEL_NUM_CLASSES)
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# model.load("ssd_checkpoint/checkpoint_1.pth")
# model.to(device)

In [58]:
# results = model.evaluate(DATASET_FOLDER_PATH, device)

In [59]:
def train():
    # Set device
    # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # print(f"Using device: {device}")
    # # Load the model
    # model = SSD_MOBILENET_V3_Large(num_classes_with_bg=MODEL_NUM_CLASSES)
    # model.to(device)
    
    # train_dataset = SSDLITEOBJDET_DATASET(DATASET_FOLDER_PATH, 'train')
    # val_dataset = SSDLITEOBJDET_DATASET(DATASET_FOLDER_PATH, 'val')

    # train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn, num_workers=num_cores, pin_memory=True, pin_memory_device="cuda:0")
    # val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn, num_workers=num_cores, pin_memory=True, pin_memory_device="cuda:0")

    # Optimizer and scheduler
    optimizer = model.configure_optimizers(lr=0.0001, betas=(0.9, 0.999), weight_decay=0.001, eps=1e-08, fused=True)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    # Training loop
    num_epochs = 50
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = len(val_loader)
        
        # Import tqdm for progress bar
        train_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        
        for _, (images, targets) in enumerate(train_bar):
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            batch_loss = losses.detach().item()
            total_loss += batch_loss
            
            # Update progress bar with current batch loss
            train_bar.set_postfix(loss=batch_loss)

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{num_epochs} | Avg Train Loss: {avg_loss:.4f}")

        lr_scheduler.step()

        # Validation
        model.eval()
        metric = MeanAveragePrecision()
        with torch.no_grad():
            for images, targets in val_loader:
                images = list(img.to(device) for img in images)
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                
                predictions = model(images)
                metric.update(predictions, targets)
        
        map_result = metric.compute()
        print(f"Epoch {epoch+1} | Val mAP: {map_result['map']:.4f}")

    # Save model
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()}, 'ssd_mobilenet_v3_finetuned.pth')

In [None]:
# Run the training function
# train()

Epoch 1/50:   0%|          | 0/38 [00:00<?, ?batch/s]


AttributeError: 'numpy.ndarray' object has no attribute 'to'

In [None]:

        elapsed = time.time() - start_time
        
        results.append({
            'split': split,
            **split_metrics,
            'time': f"{elapsed:.1f}s"
        })
        
        print(f"\nCompleted {split} split in {elapsed:.1f} seconds")
        print(f"Split Metrics - mAP: {split_metrics['mAP']:.4f}, Precision: {split_metrics['Precision']:.4f}")

    # Create DataFrame
    df = pd.DataFrame(results).set_index('split')
    numeric_cols = ['mAP', 'mAP_50', 'mAP_75', 'mAP_small', 'mAP_medium', 
                   'mAP_large', 'Recall', 'Precision', 'F1']
    df[numeric_cols] = df[numeric_cols].applymap(lambda x: f"{float(x):.4f}")
    
    return df

def evaluate_split(model, dataloader, device, metric):
    """Evaluate with batch-level progress"""
    model.eval()
    metric.reset()
    
    # Batch progress bar
    batch_progress = tqdm(dataloader, 
                        desc="Processing batches",
                        leave=False,
                        position=1)
    
    with torch.no_grad():
        for images, targets in batch_progress:
            # Move data to device
            images = list(img.to(device) for img in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
            # Inference
            predictions = model(images)
            metric.update(predictions, targets)
            
            # Update progress description
            batch_progress.set_postfix({
                'current_mAP': f"{metric.compute()['map'].item():.3f}",
                'batch_size': len(images)
            })

    # Final metrics
    metrics = metric.compute()
    
    return {
        'mAP': metrics['map'].item(),
        'mAP_50': metrics['map_50'].item(),
        'mAP_75': metrics['map_75'].item(),
        'mAP_small': metrics['map_small'].item(),
        'mAP_medium': metrics['map_medium'].item(),
        'mAP_large': metrics['map_large'].item(),
        'Recall': metrics['mar_100'].item(),
        'Class_APs': metrics['classes'].cpu().numpy().round(4),
        'Precision': metrics['precision'].cpu().numpy().mean().round(4),
        'Recall': metrics['recall'].cpu().numpy().mean().round(4),
        'F1': (2 * (metrics['precision'] * metrics['recall']) / 
              (metrics['precision'] + metrics['recall'] + 1e-16)).cpu().numpy().mean().round(4)
    }

def evaluate():
    print("\n🚀 Starting Comprehensive Evaluation")
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"🔧 Using device: {device}")
    
    # Model loading
    print("\n🔄 Loading model weights...")
    start_load = time.time()
    model = SSD_MOBILENET_V3_Large(num_classes_with_bg=MODEL_NUM_CLASSES)

    print(f"✅ Model loaded in {time.time()-start_load:.1f}s")
    
    # Evaluation
    print("\n📊 Starting evaluation on all splits...")
    metrics_df = evaluate_model(model, DATASET_FOLDER_PATH, device)
    
    # Results display
    print("\n🎯 Final Metrics Summary:")
    print(metrics_df[['mAP', 'mAP_50', 'mAP_75', 'Recall', 'Precision', 'F1', 'time']])
    
    print("\n📈 Class-wise Performance:")
    class_df = pd.DataFrame(metrics_df['Class_APs'].tolist(), 
                          index=metrics_df.index).T
    class_df.columns = metrics_df.index
    print(class_df.round(4))
    
    print("\n🏁 Evaluation complete!")

if __name__ == '__main__':
    evaluate()