In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BoundingBox3DNet(nn.Module):
    def __init__(self, num_bins=8, feature_extractor='resnet50'):
        super(BoundingBox3DNet, self).__init__()
        
        # Feature extractor (pre-trained backbone)
        self.backbone, feature_dim = self.get_backbone(feature_extractor)

        # Dimension regression branch
        self.dim_branch = nn.Sequential(
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 3)  # Output: dx, dy, dz
        )

        # Orientation estimation (MultiBin) branch
        self.num_bins = num_bins
        self.orient_conf_branch = nn.Sequential(
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.Linear(256, num_bins)  # Confidence scores for bins
        )
        self.orient_res_branch = nn.Sequential(
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 2 * num_bins)  # sin(\u0394\u03b8), cos(\u0394\u03b8) for each bin
        )

        # Translation regression branch
        self.trans_branch = nn.Sequential(
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 3)  # Output: tx, ty, tz
        )

        # Optional: Corner prediction branch
        self.corner_branch = nn.Sequential(
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 24)  # Output: 8 corners (normalized positions)
        )

    def forward(self, x):
        # Extract features
        features = self.backbone(x)
        features = features.view(features.size(0), -1)  # Flatten

        # Dimensions
        dims = self.dim_branch(features)

        # Orientation (confidence and residuals)
        orient_conf = self.orient_conf_branch(features)
        orient_res = self.orient_res_branch(features)
        orient_res = orient_res.view(-1, self.num_bins, 2)  # Reshape to (batch, bins, [sin, cos])

        # Translation
        translation = self.trans_branch(features)

        # Corners (optional)
        corners = self.corner_branch(features)

        return dims, orient_conf, orient_res, translation, corners

    def get_backbone(self, model_name):
        if model_name == 'resnet50':
            from torchvision.models import resnet50
            backbone = resnet50(pretrained=True)
            # Remove the fully connected layer
            backbone = nn.Sequential(*list(backbone.children())[:-1])
            feature_dim = 2048
        elif model_name == 'vgg16':
            from torchvision.models import vgg16
            backbone = vgg16(pretrained=True)
            backbone = nn.Sequential(*list(backbone.features.children()))
            feature_dim = 512
        else:
            raise ValueError(f"Unsupported model: {model_name}")
        return backbone, feature_dim

# Instantiate the model
model = BoundingBox3DNet(num_bins=8)
print(model)


BoundingBox3DNet(
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): C

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import pandas as pd
import numpy as np
import os

class BoundingBox3DDataset(Dataset):
    def __init__(self, image_dir, data_2d_dir, data_3d_dir, camera_intrinsics, transform=None):
        """
        Args:
            image_dir (str): Directory containing images.
            data_2d_dir (str): Directory containing 2D data CSV files.
            data_3d_dir (str): Directory containing 3D data CSV files.
            camera_intrinsics (np.ndarray): Intrinsic camera matrix.
            transform (callable, optional): Transform to be applied on a sample.
        """
        self.image_dir = image_dir
        self.data_2d_dir = data_2d_dir
        self.data_3d_dir = data_3d_dir
        self.camera_intrinsics = camera_intrinsics
        self.transform = transform

        # Match files across all directories
        self.file_indices = [
            os.path.splitext(f)[0]
            for f in os.listdir(self.image_dir)
            if f.endswith(".jpg") and
            os.path.exists(os.path.join(self.data_2d_dir, f"{os.path.splitext(f)[0]}.csv")) and
            os.path.exists(os.path.join(self.data_3d_dir, f"{os.path.splitext(f)[0]}.csv"))
        ]
        print(f"Found {len(self.file_indices)} matching files.")

    def __len__(self):
        return len(self.file_indices)

    def __getitem__(self, idx):
        file_index = self.file_indices[idx]

        # Load image
        image_path = os.path.join(self.image_dir, f"{file_index}.png")
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Load 2D data
        data_2d_path = os.path.join(self.data_2d_dir, f"{file_index}.csv")
        data_2d = pd.read_csv(data_2d_path, header=None).values.flatten()
        bb_center = torch.tensor(data_2d[1:3], dtype=torch.float32)
        bb_size = torch.tensor(data_2d[3:5], dtype=torch.float32)
        corners_2d = torch.tensor(data_2d[5:], dtype=torch.float32)

        # Load 3D data
        data_3d_path = os.path.join(self.data_3d_dir, f"{file_index}.csv")
        data_3d = pd.read_csv(data_3d_path, header=None).values.flatten()
        position = torch.tensor(data_3d[1:4], dtype=torch.float32)
        rotation = torch.tensor(data_3d[4:7], dtype=torch.float32)
        corners_3d = torch.tensor(data_3d[7:], dtype=torch.float32)

        # Camera intrinsics
        camera_intrinsics = torch.tensor(self.camera_intrinsics, dtype=torch.float32)

        sample = {
            "image": image,
            "bb_center": bb_center,
            "bb_size": bb_size,
            "corners_2d": corners_2d,
            "position": position,
            "rotation": rotation,
            "corners_3d": corners_3d,
            "camera_intrinsics": camera_intrinsics,
        }
        return sample

# Camera intrinsics matrix
camera_intrinsics = np.array([
    [1108.513, 0, 640, 0],
    [0, 623.5383, 360, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1]
])

# Paths to directories
image_dir = r"C:\Users\sakar\OneDrive\mt-datas\synthetic_data\8_correct_relative\images\train"
data_2d_dir = r"C:\Users\sakar\OneDrive\mt-datas\synthetic_data\8_correct_relative\2d_data"
data_3d_dir = r"C:\Users\sakar\OneDrive\mt-datas\synthetic_data\8_correct_relative\3d_data"

# Define transformations for images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Initialize dataset and dataloader
dataset = BoundingBox3DDataset(image_dir, data_2d_dir, data_3d_dir, camera_intrinsics, transform=transform)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


Found 0 matching files.


ValueError: num_samples should be a positive integer value, but got num_samples=0