<h1><b>SLAM INTEGRATION</b></h1>

In [1]:
!pip install ultralytics 



In [2]:
from ultralytics import YOLO
from pathlib import Path
from collections import OrderedDict, defaultdict
from typing import Dict, List
from tqdm import tqdm
import json
import numpy as np

<h2>Path Declaration</h2>

In [3]:
# Paths
# Every dataset location is derived from these two values.
DATA_ROOT = Path('../data')
ODOMETRY_SEQUENCE = '08'
_sequence_root = DATA_ROOT / 'data_odometry_gray' / 'sequences' / ODOMETRY_SEQUENCE

# Detector weights (kept as a plain string, exactly as before)
model_weights = '../results/yolov8s/kitti/best.pt'

# Inputs
calib_file = _sequence_root / 'calib.txt'
seq_dir = _sequence_root / 'image_0'
pose_dir = DATA_ROOT / 'data_odometry_poses' / 'poses' / f'{ODOMETRY_SEQUENCE}.txt'  # a file, despite the name

# Outputs
bbox_output_dir = DATA_ROOT / 'bbox_outputs_2'
cuboid_output_dir = DATA_ROOT / 'cuboid_outputs_2'
# Alternative output sets (uncomment one pair to switch):
# bbox_output_dir = Path('../data/bbox_outputs')
# cuboid_output_dir = Path('../data/cuboid_outputs')
# bbox_output_dir = Path('../data/bbox_outputs_adelaide')
# cuboid_output_dir = Path('../data/cuboid_outputs_adelaide')

<h2>1. ByteTrack Tracker Implementation</h2>

**Multi-object tracker to maintain object identity across frames** 

```
KITTI Images → YOLOv8 → [NEW: ByteTrack] → 3D Cuboids (cached) → ORB-SLAM3
```

In [4]:
class SimpleByteTracker:
    """Simplified ByteTrack-style tracker that keeps object IDs stable across frames.

    Each call to :meth:`update` consumes the detections of one frame, given as an
    ``(N, 6)`` array of ``[x1, y1, x2, y2, score, class_id]`` rows, and goes through
    three steps:

    1. detections scoring at least ``track_thresh`` are greedily IoU-matched to the
       active tracks; active tracks without a match are moved to the lost pool;
    2. detections scoring below ``track_thresh`` are greedily IoU-matched to the lost
       pool, reviving matched tracks with state ``'re-identified'``;
    3. unmatched high-score detections start new tracks.

    Lost tracks that have not been updated for more than ``track_buffer`` frames are
    moved to ``removed_tracks``.

    Args:
        track_thresh: Score that separates high- from low-confidence detections.
        track_buffer: Number of frames a lost track is kept before it is removed.
        match_thresh: Minimum IoU for a track/detection pair to be accepted.
        min_box_area: Detections with a smaller box area (in pixels) are discarded.
    """

    def __init__(self, track_thresh=0.5, track_buffer=30, match_thresh=0.8, min_box_area=100):
        self.track_thresh = track_thresh
        self.track_buffer = track_buffer
        self.match_thresh = match_thresh
        self.min_box_area = min_box_area
        # All three pools are keyed by track_id.
        self.tracked_tracks = OrderedDict()
        self.lost_tracks = OrderedDict()
        self.removed_tracks = OrderedDict()
        self.frame_id = 0
        self.track_id_count = 0

    def update(self, detections: np.ndarray) -> List[Dict]:
        """Advance the tracker by one frame.

        Args:
            detections: ``(N, 6)`` array of ``[x1, y1, x2, y2, score, class_id]``;
                an empty sequence/array means "no detections in this frame".

        Returns:
            Copies of every track updated or created in this frame, ordered as
            matched active tracks, re-identified tracks, then new tracks. Each is a
            dict with ``track_id``, ``bbox`` (list of 4 floats), ``score``,
            ``class_id``, ``frame_id`` and ``state``.
        """
        self.frame_id += 1
        detections = np.asarray(detections, dtype=float)
        if detections.size == 0:
            detections = np.empty((0, 6))
        valid_detections = self._filter_detections(detections)
        scores = valid_detections[:, 4]
        high_det = valid_detections[scores >= self.track_thresh]
        low_det = valid_detections[scores < self.track_thresh]
        tracks_output = []

        # --- Step 1: high-score detections vs. active tracks -------------------
        # _associate works with positions, so snapshot the key order to translate
        # each position back into the track_id that keys the pool.
        active_ids = list(self.tracked_tracks)
        matched, unmatched_tracks, unmatched_dets = self._associate(self.tracked_tracks, high_det)
        for track_pos, det_idx in matched:
            track = self.tracked_tracks[active_ids[track_pos]]
            self._apply_detection(track, high_det[det_idx], 'tracked')
            tracks_output.append(track.copy())
        for track_pos in unmatched_tracks:
            track = self.tracked_tracks.pop(active_ids[track_pos])
            track['state'] = 'lost'
            self.lost_tracks[track['track_id']] = track

        # --- Step 2: low-score detections vs. lost tracks ----------------------
        # Runs even when no track is currently active, so a scene whose tracks
        # were all lost can still be recovered.
        if len(low_det) > 0 and len(self.lost_tracks) > 0:
            lost_ids = list(self.lost_tracks)
            matched_lost, _, _ = self._associate(self.lost_tracks, low_det)
            for track_pos, det_idx in matched_lost:
                track_id = lost_ids[track_pos]
                track = self.lost_tracks.pop(track_id)
                self._apply_detection(track, low_det[det_idx], 're-identified')
                self.tracked_tracks[track_id] = track
                tracks_output.append(track.copy())

        # --- Step 3: unmatched high-score detections start new tracks ----------
        for det_idx in unmatched_dets:
            new_track = self._init_track(high_det[det_idx])
            self.tracked_tracks[new_track['track_id']] = new_track
            tracks_output.append(new_track.copy())

        # --- Housekeeping: retire lost tracks that have been stale too long ----
        expired = [
            track_id for track_id, track in self.lost_tracks.items()
            if self.frame_id - track['frame_id'] > self.track_buffer
        ]
        for track_id in expired:
            self.removed_tracks[track_id] = self.lost_tracks.pop(track_id)
        return tracks_output

    def _apply_detection(self, track, detection, state):
        """Overwrite a track's box/score with ``detection`` using plain Python types.

        Storing lists/floats (not NumPy objects) keeps the track JSON-serialisable.
        """
        track.update({
            'bbox': detection[:4].tolist(),
            'score': float(detection[4]),
            'frame_id': self.frame_id,
            'state': state,
        })

    def _filter_detections(self, detections):
        """Drop detections whose box area is below ``min_box_area``."""
        if len(detections) == 0:
            return detections
        areas = (detections[:, 2] - detections[:, 0]) * (detections[:, 3] - detections[:, 1])
        return detections[areas >= self.min_box_area]

    def _init_track(self, detection):
        """Create a new track dict from one detection row and assign the next ID."""
        self.track_id_count += 1
        return {
            'track_id': self.track_id_count,
            'bbox': detection[:4].tolist(),
            'score': float(detection[4]),
            'class_id': int(detection[5]),
            'frame_id': self.frame_id,
            'state': 'new'
        }

    def _associate(self, tracks, detections):
        """Greedily pair tracks with detections by descending IoU.

        Args:
            tracks: Mapping of track_id -> track dict (iteration order matters).
            detections: ``(M, 6)`` detection array.

        Returns:
            ``(matched, unmatched_tracks, unmatched_dets)``. All entries are
            POSITIONS: a track position indexes ``list(tracks)`` and a detection
            position indexes ``detections``. ``matched`` is a list of
            ``(track_pos, det_pos)`` pairs whose IoU is at least ``match_thresh``.
        """
        if len(tracks) == 0 or len(detections) == 0:
            return [], list(range(len(tracks))), list(range(len(detections)))
        track_boxes = np.array([t['bbox'] for t in tracks.values()])
        det_boxes = detections[:, :4]
        iou_matrix = self._compute_iou_matrix(track_boxes, det_boxes)
        matched = []
        unmatched_tracks = list(range(len(tracks)))
        unmatched_dets = list(range(len(detections)))
        while unmatched_tracks and unmatched_dets:
            # Best remaining pair; argmax returns the first maximum in row-major
            # order, i.e. the lowest track position, then lowest detection position.
            remaining = iou_matrix[np.ix_(unmatched_tracks, unmatched_dets)]
            row, col = divmod(int(np.argmax(remaining)), remaining.shape[1])
            if remaining[row, col] < self.match_thresh:
                break
            matched.append((unmatched_tracks.pop(row), unmatched_dets.pop(col)))
        return matched, unmatched_tracks, unmatched_dets

    @staticmethod
    def _compute_iou_matrix(boxes1, boxes2):
        """Return the ``(len(boxes1), len(boxes2))`` matrix of pairwise IoU values.

        Both inputs are ``(K, 4)`` arrays of ``[x1, y1, x2, y2]`` boxes.
        """
        area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
        area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
        # Column slices (N, 1) broadcast against rows (M,) to give (N, M).
        x1 = np.maximum(boxes1[:, 0:1], boxes2[:, 0])
        y1 = np.maximum(boxes1[:, 1:2], boxes2[:, 1])
        x2 = np.minimum(boxes1[:, 2:3], boxes2[:, 2])
        y2 = np.minimum(boxes1[:, 3:4], boxes2[:, 3])
        intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
        union = area1[:, np.newaxis] + area2 - intersection
        return intersection / (union + 1e-6)  # epsilon avoids division by zero

    def get_track_count(self):
        """Return the number of track IDs handed out so far."""
        return self.track_id_count

class TrackManager:
    """Keeps a per-track history and decides which tracks need a 3D cuboid.

    A track is reported once with reason ``'new_stable'`` as soon as it has been
    seen ``min_track_length`` times, and again with reason ``'updated'`` every time
    it shows up in state ``'re-identified'`` after that.
    """

    _HISTORY_FIELDS = ('bbox', 'score', 'class_id', 'state')

    def __init__(self, min_track_length=3):
        self.min_track_length = min_track_length
        self.track_history = defaultdict(list)
        self.cuboid_generated = set()

    def update(self, tracked_objects):
        """Record this frame's tracks and return the ones that need a cuboid.

        Args:
            tracked_objects: Iterable of track dicts carrying at least
                ``track_id``, ``bbox``, ``score``, ``class_id`` and ``state``.

        Returns:
            A list of copies of the selected track dicts, each extended with a
            ``'reason'`` key (``'new_stable'`` or ``'updated'``).
        """
        selected = []
        for obj in tracked_objects:
            tid = obj['track_id']
            history = self.track_history[tid]
            history.append({field: obj[field] for field in self._HISTORY_FIELDS})

            if tid in self.cuboid_generated:
                # Already has a cuboid: only refresh it after a re-identification.
                if obj['state'] == 're-identified':
                    selected.append({**obj, 'reason': 'updated'})
                continue

            if len(history) >= self.min_track_length:
                selected.append({**obj, 'reason': 'new_stable'})
                self.cuboid_generated.add(tid)
        return selected

def yolo_to_bytetrack_format(boxes, scores, classes):
    """Pack per-detection boxes, scores and class ids into one array.

    Returns an array whose rows are ``[x1, y1, x2, y2, score, class_id]`` when
    ``boxes`` holds four coordinates per detection, or an empty ``(0, 6)`` array
    when ``boxes`` is empty.
    """
    has_detections = len(boxes) != 0
    columns = (boxes, scores, classes)
    return np.column_stack(columns) if has_detections else np.empty((0, 6))

# Confirmation printed once the tracker definitions in this cell have been executed.
print('✅ ByteTrack tracker classes loaded!')

✅ ByteTrack tracker classes loaded!


<h2>2. Extracting 2D Bounding Box Coordinates using YOLOv8x weights (Exp 2)</h2>

In [5]:
# Set up the tracker, the track manager and the bookkeeping they share
tracker = SimpleByteTracker(
    track_thresh=0.5,
    track_buffer=30,
    match_thresh=0.8,
)
track_manager = TrackManager(min_track_length=3)

# Per-frame tracking results are written here
tracking_output_dir = Path('../data/tracking_outputs')
tracking_output_dir.mkdir(exist_ok=True)

# Running counters, all starting at zero
tracking_stats = dict.fromkeys(('total_detections', 'total_tracks', 'cuboids_generated'), 0)
cuboid_cache = dict()  # Cache cuboids by track_id
print('✅ Tracker initialized!')

✅ Tracker initialized!


In [None]:
# Load the detector from the weights configured in the paths cell
model = YOLO(model_weights)

# The JSON writers below do not create directories themselves
bbox_output_dir.mkdir(parents=True, exist_ok=True)
tracking_output_dir.mkdir(parents=True, exist_ok=True)


def _json_default(value):
    """Let json.dump handle NumPy scalars/arrays by converting them to Python types."""
    if isinstance(value, (np.ndarray, np.generic)):
        return value.tolist()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


# Get all image files
image_files = sorted(seq_dir.glob('*.png'))
print(f'Processing {len(image_files)} images...')

# Inference on KITTI images: detect, save boxes, then track -- once PER FRAME
for frame_idx, img_file in enumerate(tqdm(image_files, desc='Detecting & Tracking')):
    # NOTE(review): conf=0.6 is above the tracker's track_thresh=0.5, so every
    # detection is "high score" and the tracker's low-score association never fires.
    results = model(str(img_file), conf=0.6, iou=0.45)  # Adjust thresholds as per evaluation

    # Extract bounding boxes
    frame_data = []
    boxes_list = []
    scores_list = []
    classes_list = []
    for result in results:
        boxes = result.boxes.xyxy.cpu().numpy()  # [x_min, y_min, x_max, y_max]
        classes = result.boxes.cls.cpu().numpy()
        confs = result.boxes.conf.cpu().numpy()

        for i in range(len(boxes)):
            frame_data.append({
                'class': int(classes[i]),
                'confidence': float(confs[i]),
                'bbox': boxes[i].tolist()
            })

            boxes_list.append(boxes[i])
            scores_list.append(confs[i])
            classes_list.append(classes[i])

    # Save as JSON
    json_path = bbox_output_dir / f'{img_file.stem}.json'
    with open(json_path, 'w') as f:
        json.dump(frame_data, f, indent=4)

    # =================================================================
    # ByteTrack Tracking
    # =================================================================
    # Convert to tracker format
    if len(boxes_list) > 0:
        detections = yolo_to_bytetrack_format(
            np.array(boxes_list),
            np.array(scores_list),
            np.array(classes_list)
        )
    else:
        detections = np.empty((0, 6))

    # Update tracker (must also run for empty frames so lost tracks age correctly)
    tracked_objects = tracker.update(detections)
    tracking_stats['total_detections'] += len(frame_data)

    # Save tracking results
    tracking_data = {
        'frame_idx': frame_idx,
        'frame_name': img_file.stem,
        'tracked_objects': tracked_objects
    }
    tracking_json = tracking_output_dir / f'{img_file.stem}.json'
    with open(tracking_json, 'w') as ft:
        json.dump(tracking_data, ft, indent=4, default=_json_default)

    # Track management - determine which need cuboids
    tracks_for_cuboid = track_manager.update(tracked_objects)

    # Generate cuboids only for new/stable tracks (cached by track_id)
    for track in tracked_objects:
        track_id = track['track_id']
        if track_id not in cuboid_cache:
            # Placeholder: 3D cuboid estimation is not wired in yet.
            # cuboid = estimate_cuboid(track['bbox'], track['class_id'])
            # cuboid_cache[track_id] = cuboid
            pass

# Number of distinct track IDs handed out over the whole sequence
tracking_stats['total_tracks'] = tracker.get_track_count()

print(f'Bounding boxes extracted and saved to {bbox_output_dir}')

<h2>3. Converting 2D Bounding Boxes to 3D Cuboids</h2>

In [18]:
# 1. Read the camera intrinsics from calib.txt (the row labelled 'P2:')
# NOTE(review): the images come from image_0 while the matrix read here is P2 --
# confirm that both share the same intrinsics for this dataset.
with open(calib_file, 'r') as calib_fh:
    calib_lines = list(calib_fh)

# First row whose label is 'P2:'; the label itself is dropped from the values
P2_line = next(filter(lambda row: row.startswith('P2:'), calib_lines))
P2_vals = P2_line.strip().split()[1:]
P2 = np.array([float(token) for token in P2_vals]).reshape(3, 4)  # 3x4 projection matrix

# The intrinsic matrix K is the left 3x3 part of P2
K = P2[:, :3]
fx, fy = K[0, 0], K[1, 1]
cx, cy = K[0, 2], K[1, 2]
print(f'Loaded camera intrinsics fx={fx:.2f}, fy={fy:.2f}, cx={cx:.2f}, cy={cy:.2f}')

Loaded camera intrinsics fx=707.09, fy=707.09, cx=601.89, cy=183.11


In [19]:
# 2. (Optional) Load camera poses for each frame (SLAM trajectory or KITTI ground truth)

# poses[line_index] = (Rwc, twc): camera-to-world rotation (3x3) and camera position
# in the world frame (3,).
poses = {}
if pose_dir.exists():
    # Each line is a flattened 3x4 matrix [R | t]. In the KITTI odometry format this
    # matrix maps points from the i-th camera frame into the 0-th camera (world)
    # frame, i.e. it is ALREADY camera-to-world and must not be inverted.
    # If a trajectory with the opposite convention (world-to-camera) is supplied,
    # invert it here instead: Rwc = R.T, twc = -R.T @ t.
    with open(pose_dir, 'r') as f:
        for idx, line in enumerate(f):
            vals = list(map(float, line.split()))
            if len(vals) == 12:  # valid pose line
                Twc = np.array(vals).reshape(3, 4)  # camera -> world
                Rwc = Twc[:, :3]                    # camera axes expressed in world coordinates
                twc = Twc[:, 3]                     # camera origin in world coordinates
                poses[idx] = (Rwc, twc)
    print(f'Loaded {len(poses)} camera poses for SLAM integration.')
else:
    print('Camera pose file not found. Results will be in camera coordinates.')

Loaded 4071 camera poses for SLAM integration.


In [20]:
# 3. Average object dimensions per class: (width, height, length) in meters

# Car/Pedestrian/Cyclist use KITTI dataset statistics as examples; the remaining
# rows are typical values.
_CLASS_PRIORS = (
    # id, name,            (width, height, length)
    (0,  'Car',           (1.6, 1.5, 3.9)),
    (1,  'Pedestrian',    (0.6, 1.7, 0.6)),    # human body
    (2,  'Cyclist',       (0.6, 1.7, 1.5)),    # person + bicycle
    (3,  'Lane',          (3.5, 0.1, 50.0)),   # typical lane width ~3.5m, very flat & long
    (4,  'Traffic Sign',  (0.8, 2.0, 0.2)),    # pole-mounted sign (width ~0.8m, height ~2m)
    (5,  'Traffic Light', (0.4, 1.0, 0.4)),    # pole-mounted light cluster
    (6,  'Drivable Area', (6.0, 0.1, 50.0)),   # approximate: wide, flat road patch
    (7,  'Truck',         (2.5, 3.5, 12.0)),   # semi-truck
    (8,  'Bus',           (2.5, 3.0, 12.0)),   # city bus
    (9,  'Bike',          (0.6, 1.2, 1.8)),    # standalone bicycle
    (10, 'Motor',         (0.8, 1.4, 2.2)),    # motorcycle + rider
    (11, 'Train',         (3.2, 4.5, 30.0)),   # single carriage segment
)

class_dims = {
    class_id: {'name': label, 'dims': size}
    for class_id, label, size in _CLASS_PRIORS
}

# Note: adjust/add classes as per the YOLO model training.
# (These are typical values; CubeSLAM can also optimize dimensions if needed.)

# Helper function: project 3D cuboid corners given center (X,Y,Z) and yaw in camera frame
def project_cuboid(X, Y, Z, W, H, L, yaw, intrinsics=None, min_depth=0.1):
    """
    Project a yawed cuboid into the image and return its 2D extent.

    Args:
        X, Y, Z: Cuboid center in camera coordinates.
        W, H, L: Cuboid size along its local x, y and z axes.
        yaw: Rotation about the camera Y-axis (assumed vertical), in radians.
        intrinsics: Optional ``(fx, fy, cx, cy)`` tuple. When omitted, the
            notebook-level ``fx, fy, cx, cy`` globals are used.
        min_depth: Corner depths are clamped to at least this value before the
            perspective division. Corners at or behind the image plane would
            otherwise project with a flipped sign (or divide by zero) and corrupt
            the min/max extent. This is a clamp, not a true near-plane clip.

    Returns:
        ``(u_min, u_max, v_min, v_max)`` -- the pixel bounds of the 8 projected corners.
    """
    if intrinsics is None:
        intrinsics = (fx, fy, cx, cy)
    f_x, f_y, c_x, c_y = intrinsics

    # Rotation about the camera Y-axis: object local coords -> camera coords
    c, s = np.cos(yaw), np.sin(yaw)
    R_yaw = np.array([
        [ c, 0, s],
        [ 0, 1, 0],
        [-s, 0, c]
    ])
    # Eight corners of the cuboid in object local coordinates (centered at origin)
    corners_local = np.array([
        [dx, dy, dz]
        for dx in (-W/2, W/2)
        for dy in (-H/2, H/2)
        for dz in (-L/2, L/2)
    ])
    # Rows are points, so right-multiplying by R^T applies R to every corner
    corners_cam = corners_local.dot(R_yaw.T) + np.array([X, Y, Z])
    # Keep every corner in front of the camera before dividing by depth
    depth = np.maximum(corners_cam[:, 2], min_depth)
    # Pinhole projection to pixels
    us = f_x * (corners_cam[:, 0] / depth) + c_x
    vs = f_y * (corners_cam[:, 1] / depth) + c_y
    return us.min(), us.max(), vs.min(), vs.max()

In [21]:
# 4. Function to estimate 3D cuboid from a single 2D bounding box

def estimate_cuboid_from_bbox(bbox, class_id):
    """
    Given a 2D bounding box [x_min, y_min, x_max, y_max] and object class,
    estimate the 3D cuboid's center (camera coords), orientation (yaw), and dimensions.

    The size comes from the class prior in ``class_dims`` (1 m cube for unknown
    classes), depth from the box height, and yaw from a search that matches the
    projected cuboid width to the box width.

    Returns (X, Y, Z, yaw, width, height, length).
    """
    x1, y1, x2, y2 = bbox
    W, H, L = class_dims.get(class_id, {'dims': (1.0, 1.0, 1.0)})['dims']  # default to 1m cube if class unknown
    # **Depth estimation**: an object of height H that spans pixel_height rows sits at Z
    pixel_height = y2 - y1
    Z = (fy * H) / (pixel_height + 1e-6)   # epsilon guards against a zero-height box
    # **Horizontal position**: assume bounding box center corresponds to object center horizontally
    u_center = 0.5 * (x1 + x2)
    X = (u_center - cx) / fx * Z           # lateral position in camera coords (X axis)
    # **Vertical position**: compute Y so that bottom of cuboid aligns with detected bottom
    v_bottom = max(y1, y2)
    Y = (v_bottom - cy) / fy * Z - 0.5 * H  # vertical position (camera Y-axis, positive downwards)
    # **Orientation (yaw) estimation**: pick the yaw whose projected width fits the 2D box width
    pixel_width = x2 - x1

    def width_error(yaw_value):
        """Absolute difference between projected and detected box width, in pixels."""
        u_min, u_max, _, _ = project_cuboid(X, Y, Z, W, H, L, yaw_value)
        return abs((u_max - u_min) - pixel_width)

    # Coarse search over [0, 90] degrees in 5-degree steps
    best_yaw = 0.0
    min_err = float('inf')
    for deg in range(0, 91, 5):
        yaw_cand = np.deg2rad(deg)
        err = width_error(yaw_cand)
        if err < min_err:
            min_err = err
            best_yaw = yaw_cand

    # Local refinement: up to three 2-degree steps towards the smaller error
    yaw = best_yaw
    delta = np.deg2rad(2)
    for _ in range(3):
        err0 = width_error(yaw)
        err_plus = width_error(yaw + delta)
        err_minus = width_error(yaw - delta)
        if err_plus < err0 or err_minus < err0:
            if err_plus < err_minus:
                yaw += delta
            else:
                yaw -= delta
        else:
            break  # no improvement

    # **Center adjustment**: re-align X so the projected box is centered on the detection
    u_min, u_max, _, _ = project_cuboid(X, Y, Z, W, H, L, yaw)
    proj_center = 0.5 * (u_min + u_max)
    X += ((u_center - proj_center) / fx) * Z  # small tweak to center alignment

    return X, Y, Z, yaw, W, H, L

<b>CubeSLAM technique</b>

1. Single-view 3D cuboid proposal:
- What CubeSLAM does: Given a 2D bounding box + class prior (average dimensions per category), estimate a plausible 3D cuboid (center, yaw, dimensions) in camera frame.
- Corresponding code snippet: `estimate_cuboid_from_bbox()` and `project_cuboid()`.

2. Multi-view optimization/SLAM fusion:
- What CubeSLAM does: Refines cuboids jointly with camera poses using bundle adjustment constraints.
- Corresponding code: not implemented in this notebook. This step belongs in ORB-SLAM3, which does not support cuboid objects by default, so they have to be added manually.

In [22]:
# Count the PNG frames of the sequence
num_frames = sum(1 for png in seq_dir.glob("*.png"))
# Timestamps are read from times.txt next to the image folder
# (alternative: frame_times = np.arange(num_frames) / fps)
times_path = seq_dir.parent / 'times.txt'
frame_times = np.loadtxt(str(times_path))
print(f"Total frames in sequence: {num_frames}")

Total frames in sequence: 4071


In [23]:
# 5. Process each frame's detections and save results
cuboid_output_dir.mkdir(parents=True, exist_ok=True)  # open() below does not create it

bbox_files = sorted(bbox_output_dir.glob('*.json'), key=lambda x: int(x.stem))

# Index the detection files by frame number once, instead of scanning the whole
# list for every frame. setdefault keeps the first file for a given frame number.
bbox_by_frame = {}
for bbox_path in bbox_files:
    bbox_by_frame.setdefault(int(bbox_path.stem), bbox_path)

frames_to_process = range(num_frames)

for frame_id in frames_to_process:
    bbox_file = bbox_by_frame.get(frame_id)
    if bbox_file is None:
        print(f'Warning: No bbox data for frame {frame_id}')
        continue

    with open(bbox_file, 'r') as bbox_fh:
        detections = json.load(bbox_fh)

    output_data = {'frame': frame_id, 'objects': []}

    for det in detections:
        cls_id = det['class']
        conf   = det.get('confidence', 1.0)
        bbox   = det['bbox']

        # Estimate 3D cuboid in camera coordinates
        Xc, Yc, Zc, yaw, W, H, L = estimate_cuboid_from_bbox(bbox, cls_id)

        # Cuboids are deliberately kept in CAMERA coordinates (no world transform);
        # per the original author's note, the ORB-SLAM3 side expects them that way.
        x_cam, y_cam, z_cam = float(Xc), float(Yc), float(Zc)

        # Quaternion [w, x, y, z] for a rotation of `yaw` about the camera Y-axis
        half_yaw = yaw / 2.0
        rot = {
            'w': float(np.cos(half_yaw)),
            'x': 0.0,
            'y': float(np.sin(half_yaw)),
            'z': 0.0
        }

        # Prepare object record IN CAMERA COORDINATES
        obj_record = {
            'class': class_dims.get(cls_id, {'name': str(cls_id)})['name'],
            'class_id': int(cls_id),
            'confidence': float(conf),
            'center': [x_cam, y_cam, z_cam],  # Camera coordinates
            'rotation': rot,                   # Rotation in camera frame
            'dimensions': [float(W), float(H), float(L)]
        }
        output_data['objects'].append(obj_record)

    # Save to JSON
    out_path = cuboid_output_dir / f'{frame_id:06d}.json'
    with open(out_path, 'w') as out_fh:
        json.dump(output_data, out_fh, indent=4)
    print(f'Saved 3D cuboids (camera coords) for frame {frame_id}')

Saved 3D cuboids (camera coords) for frame 0
Saved 3D cuboids (camera coords) for frame 1
Saved 3D cuboids (camera coords) for frame 2
Saved 3D cuboids (camera coords) for frame 3
Saved 3D cuboids (camera coords) for frame 4
Saved 3D cuboids (camera coords) for frame 5
Saved 3D cuboids (camera coords) for frame 6
Saved 3D cuboids (camera coords) for frame 7
Saved 3D cuboids (camera coords) for frame 8
Saved 3D cuboids (camera coords) for frame 9
Saved 3D cuboids (camera coords) for frame 10
Saved 3D cuboids (camera coords) for frame 11
Saved 3D cuboids (camera coords) for frame 12
Saved 3D cuboids (camera coords) for frame 13
Saved 3D cuboids (camera coords) for frame 14
Saved 3D cuboids (camera coords) for frame 15
Saved 3D cuboids (camera coords) for frame 16
Saved 3D cuboids (camera coords) for frame 17
Saved 3D cuboids (camera coords) for frame 18
Saved 3D cuboids (camera coords) for frame 19
Saved 3D cuboids (camera coords) for frame 20
Saved 3D cuboids (camera coords) for frame 2

In [1]:
# Compare frame coverage (pose counts are hard-coded figures)
total_frames = 4071
for stage, pose_count in (("Before", 2189), ("After", 1806)):
    coverage_pct = pose_count / total_frames * 100
    print(f"{stage}: {pose_count} poses across {total_frames} frames = {coverage_pct:.1f}% coverage")

Before: 2189 poses across 4071 frames = 53.8% coverage
After: 1806 poses across 4071 frames = 44.4% coverage
