<a href="https://colab.research.google.com/github/shubhamt2897/DL_BB_YOLO/blob/main/DL_BB_YOLO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchvision torchaudio --upgrade
!pip install matplotlib opencv-python albumentations onnx onnxruntime


Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cubla

In [2]:
!gdown "https://drive.google.com/uc?id=11s-GLb6LZ0SCAVW6aikqImuuQEEbT_Fb" -O dl_challenge.tar.xz
!tar -xvf dl_challenge.tar.xz

Downloading...
From (original): https://drive.google.com/uc?id=11s-GLb6LZ0SCAVW6aikqImuuQEEbT_Fb
From (redirected): https://drive.google.com/uc?id=11s-GLb6LZ0SCAVW6aikqImuuQEEbT_Fb&confirm=t&uuid=64f80f10-c100-42a3-b5f5-9168cb092763
To: /content/dl_challenge.tar.xz
100% 2.44G/2.44G [00:24<00:00, 101MB/s]
dl_challenge/
dl_challenge/889a9fb3-9915-11ee-9103-bbb8eae05561/
dl_challenge/889a9fb3-9915-11ee-9103-bbb8eae05561/rgb.jpg
dl_challenge/889a9fb3-9915-11ee-9103-bbb8eae05561/pc.npy
dl_challenge/889a9fb3-9915-11ee-9103-bbb8eae05561/bbox3d.npy
dl_challenge/889a9fb3-9915-11ee-9103-bbb8eae05561/mask.npy
dl_challenge/911224fa-9915-11ee-9103-bbb8eae05561/
dl_challenge/911224fa-9915-11ee-9103-bbb8eae05561/rgb.jpg
dl_challenge/911224fa-9915-11ee-9103-bbb8eae05561/pc.npy
dl_challenge/911224fa-9915-11ee-9103-bbb8eae05561/bbox3d.npy
dl_challenge/911224fa-9915-11ee-9103-bbb8eae05561/mask.npy
dl_challenge/9f50f3c2-9915-11ee-9103-bbb8eae05561/
dl_challenge/9f50f3c2-9915-11ee-9103-bbb8eae05561/rgb.jpg

In [7]:
import torch
from torch.utils.data import Dataset
import cv2
import numpy as np
import albumentations as A
import os

def corners_to_7params(corners):
    """
    Converts 8 corners (shape (8,3)) into a 7-parameter representation:
      [center_x, center_y, center_z, dx, dy, dz, heading].
    For simplicity, we assume the box is axis-aligned so heading is set to 0.
    """
    center = np.mean(corners, axis=0)
    min_vals = np.min(corners, axis=0)
    max_vals = np.max(corners, axis=0)
    dims = max_vals - min_vals
    heading = 0.0
    return np.concatenate([center, dims, [heading]])

class Sereact3DDataset(Dataset):
    def __init__(self, folder_list, transform=None):
        self.folder_list = folder_list
        self.transform = transform
        self.folder_list = [
            f for f in self.folder_list
            if all(os.path.isfile(os.path.join(f, fname))
                   for fname in ['rgb.jpg', 'bbox3d.npy', 'mask.npy', 'pc.npy'])
        ]

    def __len__(self):
        return len(self.folder_list)

    def __getitem__(self, idx):
        folder_path = self.folder_list[idx]
        rgb_path   = os.path.join(folder_path, 'rgb.jpg')
        bbox_path  = os.path.join(folder_path, 'bbox3d.npy')
        mask_path  = os.path.join(folder_path, 'mask.npy')
        pc_path    = os.path.join(folder_path, 'pc.npy')

        rgb = cv2.imread(rgb_path)[:, :, ::-1]  # Convert BGR to RGB
        bbox3d_data = np.load(bbox_path)        # Could be of different shapes
        mask_data = np.load(mask_path)
        pc_data = np.load(pc_path)

        # Process bounding box data:
        # If it is 1D with 7 elements, use it directly.
        # Otherwise, if it is not of size 7 but is a multiple of 24,
        # we assume it stores boxes as 8 corners (24 numbers) and we select the first box.
        bbox3d_data = np.squeeze(bbox3d_data)
        if bbox3d_data.size == 7:
            bbox_final = bbox3d_data
        elif bbox3d_data.size % 24 == 0:
            # Assume shape is (num_boxes, 24) when reshaped
            num_boxes = bbox3d_data.size // 24
            bbox3d_data = bbox3d_data.reshape(num_boxes, 24)
            # Select the first bounding box (or choose another strategy, e.g. largest area)
            first_box = bbox3d_data[0]
            # Convert the 24-element vector (8 corners) to 7 parameters.
            corners = first_box.reshape(8, 3)
            bbox_final = corners_to_7params(corners)
        else:
            raise ValueError(f"Unexpected bounding box size: {bbox3d_data.size}")

        # Process point cloud: if shape is [3, H, W], reshape it to [H*W, 3]
        if pc_data.ndim == 3 and pc_data.shape[0] == 3:
            pc_data = np.transpose(pc_data, (1,2,0))
            pc_data = pc_data.reshape(-1, 3)

        if self.transform:
            augmented = self.transform(image=rgb)
            rgb = augmented['image']

        rgb_t = torch.from_numpy(rgb).permute(2, 0, 1).float()
        bbox3d_t = torch.from_numpy(bbox_final).float()  # Now shape (7,)
        mask_t = torch.from_numpy(mask_data).float()
        pc_t = torch.from_numpy(pc_data).float()

        return {'rgb': rgb_t, 'bbox3d': bbox3d_t, 'mask': mask_t, 'point_cloud': pc_t}


In [8]:
import glob
from torch.utils.data import DataLoader
import albumentations as A

def custom_collate(batch):
    collated = {}
    for key in batch[0]:
        try:
            collated[key] = torch.stack([item[key] for item in batch], dim=0)
        except RuntimeError:
            collated[key] = [item[key] for item in batch]
    return collated

# Define augmentations.
train_transform = A.Compose([
    A.Resize(224, 224),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, p=0.5),
])
val_transform = A.Compose([A.Resize(224, 224)])

data_dir = 'dl_challenge'
folders = sorted(glob.glob(os.path.join(data_dir, '*')))

train_folders = folders[:150]
val_folders   = folders[150:180]
test_folders  = folders[180:]

train_dataset = Sereact3DDataset(train_folders, transform=train_transform)
val_dataset = Sereact3DDataset(val_folders, transform=val_transform)
test_dataset = Sereact3DDataset(test_folders, transform=val_transform)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=custom_collate)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=custom_collate)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=custom_collate)


  original_init(self, **validated_kwargs)


In [9]:
import torch.nn as nn
import torch.nn.functional as F

class YOLO3DModel(nn.Module):
    def __init__(self, grid_size=7, num_anchors=1):
        super(YOLO3DModel, self).__init__()
        self.grid_size = grid_size
        self.num_anchors = num_anchors

        # Backbone: a small CNN similar to YOLO's initial layers.
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),  # [B,16,224,224]
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),  # [B,16,112,112]

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),  # [B,32,112,112]
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),  # [B,32,56,56]

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),   # [B,64,56,56]
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),  # [B,64,28,28]

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),  # [B,128,28,28]
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),  # [B,128,14,14]

            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),  # [B,256,14,14]
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2)   # [B,256,7,7] --> matches our grid size 7x7
        )

        # Detection head: predict 7 parameters per grid cell, per anchor.
        # Here we assume only one class (or no class info needed) and no objectness score,
        # but in practice you might add these.
        self.detector = nn.Sequential(
            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1),
            nn.Conv2d(128, num_anchors * 7, kernel_size=1)  # Each grid cell predicts 7 values.
        )

    def forward(self, x):
        # x: [B, 3, 224, 224]
        features = self.features(x)  # [B,256,7,7]
        detections = self.detector(features)  # [B, num_anchors*7, 7, 7]
        B, C, H, W = detections.shape
        # Reshape to [B, H, W, num_anchors, 7]
        detections = detections.view(B, self.num_anchors, 7, H, W)
        detections = detections.permute(0, 3, 4, 1, 2).contiguous()  # [B,7,7,num_anchors,7]
        # Flatten grid cells and anchors: [B, 7*7*num_anchors, 7]
        detections = detections.view(B, -1, 7)
        return detections


In [10]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import logging
import csv

def bbox3d_loss(pred, target):
    return F.smooth_l1_loss(pred, target)

def train_model_yolo(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device='cuda', patience=3):
    model.to(device)
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    epochs_no_improve = 0

    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1, verbose=True)

    logging.basicConfig(filename='training.log', level=logging.INFO, format='%(asctime)s %(message)s')
    csv_filename = 'training_metrics.csv'
    with open(csv_filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['epoch', 'train_loss', 'val_loss'])

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0.0
        for batch in train_loader:
            rgb = batch['rgb'].to(device)
            # For YOLO model, we assume ground truth is provided per sample as a 7-element vector.
            # In practice, you might have multiple detections per image.
            gt_bbox = batch['bbox3d'].to(device)  # shape: [B, 7] if one bbox per image.

            optimizer.zero_grad()
            pred_bbox = model(rgb)  # output shape: [B, 7*7*num_anchors, 7]
            # For simplicity, assume we only care about one predicted bbox per image
            # (e.g., use the prediction from the center cell). In practice, you'd match anchors.
            # Here we just take the first prediction for each sample.
            pred_bbox = pred_bbox[:, 0, :]  # shape: [B, 7]

            loss = criterion(pred_bbox, gt_bbox)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                rgb = batch['rgb'].to(device)
                gt_bbox = batch['bbox3d'].to(device)
                pred_bbox = model(rgb)
                pred_bbox = pred_bbox[:, 0, :]
                val_loss = criterion(pred_bbox, gt_bbox)
                total_val_loss += val_loss.item()
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)

        scheduler.step(avg_val_loss)
        log_str = f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}"
        print(log_str)
        logging.info(log_str)

        with open(csv_filename, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([epoch+1, avg_train_loss, avg_val_loss])

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            print(f"No improvement for {epochs_no_improve} epochs.")
            logging.info(f"No improvement for {epochs_no_improve} epochs.")

        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            logging.info("Early stopping triggered.")
            break

    return train_losses, val_losses

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

model = YOLO3DModel(grid_size=7, num_anchors=1)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = bbox3d_loss

train_losses, val_losses = train_model_yolo(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    num_epochs=10,
    device=device,
    patience=3
)


Using device: cuda
Epoch 1/10, Train Loss: 0.0178, Val Loss: 0.0056
Epoch 2/10, Train Loss: 0.0068, Val Loss: 0.0056
No improvement for 1 epochs.
Epoch 3/10, Train Loss: 0.0053, Val Loss: 0.0049
Epoch 4/10, Train Loss: 0.0043, Val Loss: 0.0037
Epoch 5/10, Train Loss: 0.0033, Val Loss: 0.0034
Epoch 6/10, Train Loss: 0.0032, Val Loss: 0.0035
No improvement for 1 epochs.
Epoch 7/10, Train Loss: 0.0031, Val Loss: 0.0033
Epoch 8/10, Train Loss: 0.0028, Val Loss: 0.0032
Epoch 9/10, Train Loss: 0.0027, Val Loss: 0.0032
No improvement for 1 epochs.
Epoch 10/10, Train Loss: 0.0025, Val Loss: 0.0032
No improvement for 2 epochs.


In [12]:
def project_bbox_2d_from_yolo(bbox, image_shape=(224,224), coords_normalized=True):
    # bbox: 7-element vector: [x, y, z, dx, dy, dz, heading]
    img_h, img_w = image_shape
    x, y, z, dx, dy, dz, heading = bbox
    if coords_normalized:
        x *= img_w
        y *= img_h
        dx *= img_w
        dy *= img_h
    xmin = x - dx/2
    xmax = x + dx/2
    ymin = y - dy/2
    ymax = y + dy/2
    xmin = max(0, xmin)
    ymin = max(0, ymin)
    xmax = min(img_w, xmax)
    ymax = min(img_h, ymax)
    return xmin, ymin, xmax, ymax

def visualize_2d_bbox_yolo(folder_path, pred_bbox, gt_bbox, image_shape=(224,224)):
    rgb = cv2.imread(os.path.join(folder_path, 'rgb.jpg'))[:, :, ::-1]
    transform = A.Compose([A.Resize(image_shape[0], image_shape[1])])
    rgb_aug = transform(image=rgb)['image']

    pred_box = project_bbox_2d_from_yolo(pred_bbox, image_shape=image_shape, coords_normalized=True)
    gt_box = project_bbox_2d_from_yolo(gt_bbox, image_shape=image_shape, coords_normalized=True)

    fig, ax = plt.subplots(1, 1, figsize=(6,6))
    ax.imshow(rgb_aug)
    rect_pred = plt.Rectangle((pred_box[0], pred_box[1]),
                              pred_box[2]-pred_box[0],
                              pred_box[3]-pred_box[1],
                              fill=False, edgecolor='red', linewidth=3, label='Prediction')
    rect_gt = plt.Rectangle((gt_box[0], gt_box[1]),
                              gt_box[2]-gt_box[0],
                              gt_box[3]-gt_box[1],
                              fill=False, edgecolor='green', linewidth=3, label='Ground Truth')
    ax.add_patch(rect_pred)
    ax.add_patch(rect_gt)
    ax.legend()
    ax.set_title("2D Projection of YOLO 3D BBox")
    plt.axis("off")
    plt.show()

def visualize_3d_bbox_yolo(folder_path, pred_bbox, gt_bbox):
    # For 3D visualization, we will plot the centers.
    # In a complete system, you would reconstruct the full 3D box.
    pc = np.load(os.path.join(folder_path, 'pc.npy'))
    if pc.ndim == 3 and pc.shape[0] == 3:
        pc = np.transpose(pc, (1,2,0)).reshape(-1, 3)
    if pc.shape[0] > 5000:
        idx = np.random.choice(pc.shape[0], 5000, replace=False)
        pc = pc[idx, :]

    pred_center = pred_bbox[:3]
    gt_center = gt_bbox[:3]

    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(pc[:,0], pc[:,1], pc[:,2], s=1, c='gray', alpha=0.5, label='Point Cloud')
    ax.scatter(pred_center[0], pred_center[1], pred_center[2], c='red', s=50, label='Prediction Center')
    ax.scatter(gt_center[0], gt_center[1], gt_center[2], c='green', s=50, label='GT Center')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.legend()
    ax.set_title("3D Visualization: BBox Centers")
    plt.show()

# Evaluate and visualize:
model.eval()
with torch.no_grad():
    for idx, batch in enumerate(test_loader):
        rgb = batch['rgb'].to(device)
        gt_bbox = batch['bbox3d'].to(device)
        pred_bbox = model(rgb)  # YOLO model outputs [B, 7]

        pred_bbox_sample = pred_bbox[0].cpu().numpy()
        gt_bbox_sample = gt_bbox[0].cpu().numpy()
        sample_folder = test_folders[idx]

        print("Predicted 3D BBox:", pred_bbox_sample)
        print("Ground Truth 3D BBox:", gt_bbox_sample)

        visualize_2d_bbox_yolo(sample_folder, pred_bbox_sample, gt_bbox_sample, image_shape=(224,224))
        visualize_3d_bbox_yolo(sample_folder, pred_bbox_sample, gt_bbox_sample)
        break


Predicted 3D BBox: [[-3.53548825e-02 -3.38993827e-03  1.12962115e+00  1.85610473e-01
   1.85809016e-01  1.00645833e-01 -4.52358276e-03]
 [-1.08058006e-01 -2.21356954e-02  6.39100254e-01  2.59341337e-02
   1.15205914e-01  2.95251548e-01  9.49093625e-02]
 [-1.27711147e-02 -3.92076336e-02  7.40729988e-01  2.64516830e-01
   2.34324008e-01  1.81283176e-01 -5.05844131e-03]
 [-1.99563801e-01 -1.04054678e-02  5.45952499e-01  7.36568868e-02
   1.93361342e-01  1.57665759e-01 -2.62960717e-02]
 [-2.31507003e-01  1.55084226e-02  4.63510603e-01  2.00940557e-02
   2.27932632e-01  1.80549338e-01  5.50751314e-02]
 [-1.35064870e-01 -8.11409764e-03  7.68112242e-01  1.59620136e-01
   1.80514246e-01  1.31509155e-01 -9.16884094e-02]
 [-2.24104375e-01 -1.46498919e-01  5.54047227e-01 -2.70931840e-01
   1.82016611e-01  1.81429237e-01 -3.20196673e-02]
 [ 1.16348714e-02 -4.96539399e-02  8.24663162e-01  1.75475910e-01
   1.70860216e-01  9.64743122e-02 -2.71716639e-02]
 [ 2.85898805e-01 -7.74158314e-02 -1.12013392

ValueError: too many values to unpack (expected 7)

In [None]:
# ONNX Export: Export the model to ONNX format for deployment.
dummy_input = torch.randn(1, 3, 224, 224).to(device)
torch.onnx.export(model, dummy_input, "yolo3d_model.onnx", opset_version=11)

# Git Logging: Push key files to your GitHub repository.
!git config --global user.email "sp.tangadpalliwar@stud.fh-sm.de"
!git config --global user.name "https://github.com/shubhamt2897"
!git init
!git branch -M main
!echo "dl_challenge/" >> .gitignore
!echo "dl_challenge.tar.xz" >> .gitignore
!echo "sample_data/" >> .gitignore
!git add DL_Bounding_Box.ipynb training_metrics.csv training.log yolo3d_model.onnx
!git commit -m "Final commit: YOLO3D pipeline and logs"
!git remote add origin https://github.com/shubhamt2897/DL_BB_YOLO.git
!git push -u origin main
