The code includes:

1. Custom dataset class for COCO format
2. Model initialization with pretrained weights
3. Training and validation loops
4. Learning rate scheduling
5. Model checkpointing

Key features:

1. Uses Faster R-CNN with ResNet50 backbone
2. Automatically maps your categories to model labels
3. Saves the best model based on validation mAP
4. Shows progress bars during training
5. Includes learning rate scheduling for better convergence

# Screenshots


In [None]:
import time
from datetime import datetime

import cv2
import keyboard
import numpy as np
import pyautogui  # was missing although pyautogui.screenshot() is called below

# Interactive screenshot recorder:
#   's'   -> start capturing one screenshot per second
#   'q'   -> stop capturing
#   'esc' -> leave the loop

# True while screenshots are being taken.
capture = False

while True:
    # Check if the 's' key is pressed to start capturing
    if keyboard.is_pressed('s'):
        capture = True
        print("Screenshot capture started!")
        time.sleep(0.2)  # Small delay to avoid multiple starts with one press

    # Check if the 'q' key is pressed to stop capturing
    if keyboard.is_pressed('q'):
        capture = False
        print("Screenshot capture stopped!")
        time.sleep(0.2)  # Small delay to avoid multiple stops with one press

    if capture:
        # Grab the screen and convert it to the BGR layout cv2.imwrite expects
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        screenshot = pyautogui.screenshot()
        screenshot_cv = cv2.cvtColor(np.array(screenshot), cv2.COLOR_RGB2BGR)

        # Save the screenshot under a timestamped name
        screenshot_path = f"D:/Thinkin in programming/Metopen/Traffic Signs/{timestamp}.png"
        # cv2.imwrite reports failure through its return value, not an exception
        if not cv2.imwrite(screenshot_path, screenshot_cv):
            print(f"Failed to save screenshot: {screenshot_path}")

        # Take a screenshot every 1 second (adjust as needed)
        time.sleep(1)
    else:
        # Short nap while idle so the polling loop does not spin a CPU core
        time.sleep(0.01)

    # Exit the loop if the ESC key is pressed
    if keyboard.is_pressed('esc'):
        print("Exiting program.")
        break


Screenshot capture started!
Exiting program.


: 

# Load Dataset

https://github.com/harshatejas/pytorch_custom_object_detection

https://github.com/trzy/FasterRCNN

https://colab.research.google.com/drive/1YSpk-PiAyH9DAja_Okv-gM2mqbMsHWgb?authuser=1#scrollTo=Pm0QGw219EVU  Sandi Groot

https://colab.research.google.com/drive/18ZEyu2oVV_iEpBTaSXL7Xvkf_-_jTfRC?usp=sharing sandijamlu

In [19]:
import os
import cv2 as cv
import json
from tqdm import tqdm
from pycocotools.coco import COCO
from PIL import Image
import numpy as np
import traceback
import matplotlib.pyplot as plt
import PIL as Image

# torch
import torch
import torchvision 
from torchvision import transforms
from torchvision.ops import box_iou

from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.amp import GradScaler, autocast

# Train Model

In [None]:
class TrafficSignDataset(Dataset):
    """COCO-annotated detection dataset that yields resized images with box targets.

    Each item is ``(image, target)``: ``image`` is the output of
    ``transforms.ToTensor()`` applied to the resized RGB image, and ``target``
    is a dict with ``boxes`` (x1, y1, x2, y2), ``labels``, ``image_id``,
    ``area`` and ``iscrowd`` tensors.

    Args:
        root_dir: Directory containing the image files.
        annotation_file: Path to the COCO annotation JSON.
        image_size: ``(width, height)`` every image is resized to. Bounding
            boxes are scaled by the same factors so they stay aligned with
            the resized image.
    """

    def __init__(self, root_dir, annotation_file, image_size=(400, 400)):
        self.root_dir = root_dir
        self.image_size = image_size
        self.to_tensor = transforms.ToTensor()

        # Load COCO annotation
        self.coco = COCO(annotation_file)

        # All image ids listed in the annotation file
        self.image_ids = list(sorted(self.coco.imgs.keys()))

        # Category ids as they appear in the annotation file
        self.category_ids = sorted(self.coco.getCatIds())
        print("Original category IDs:", self.category_ids)

        # Continuous labels starting from 1 (0 is background)
        self.category_id_to_label = {
            cat_id: idx + 1 for idx, cat_id in enumerate(self.category_ids)
        }
        print("Category mapping:", self.category_id_to_label)

        # Keep only images whose file is actually present on disk
        self.valid_image_ids = [
            img_id for img_id in self.image_ids
            if os.path.exists(self._image_path(img_id))
        ]

        print(f"Found {len(self.valid_image_ids)} valid images out of {len(self.image_ids)} total")

    def __len__(self):
        return len(self.valid_image_ids)

    def _image_path(self, image_id):
        """Return the on-disk path of the image with the given COCO id."""
        image_info = self.coco.loadImgs(image_id)[0]
        return os.path.join(self.root_dir, image_info['file_name'])

    def _build_boxes(self, anns, scale_x, scale_y):
        """Convert COCO ``[x, y, w, h]`` annotations into box and label tensors.

        Boxes are scaled by ``scale_x`` / ``scale_y``, clipped to
        ``self.image_size`` and dropped when they end up with no area.

        Returns:
            ``(boxes, labels)`` with shapes ``(N, 4)`` float32 and ``(N,)`` int64;
            ``N`` may be 0.
        """
        width, height = self.image_size
        boxes = []
        labels = []
        for ann in anns:
            x, y, w, h = ann['bbox']
            x1 = max(x * scale_x, 0)
            y1 = max(y * scale_y, 0)
            x2 = min((x + w) * scale_x, width)
            y2 = min((y + h) * scale_y, height)

            # Skip boxes that are empty or inverted after clipping
            if x2 <= x1 or y2 <= y1:
                continue

            boxes.append([x1, y1, x2, y2])
            labels.append(self.category_id_to_label[ann['category_id']])

        if not boxes:
            return (torch.zeros((0, 4), dtype=torch.float32),
                    torch.zeros((0,), dtype=torch.int64))
        return (torch.as_tensor(boxes, dtype=torch.float32),
                torch.as_tensor(labels, dtype=torch.int64))

    def _load_sample(self, image_id, image_path):
        """Read one image, resize it and build its target dict.

        Raises:
            ValueError: If OpenCV cannot read ``image_path``.
        """
        image = cv.imread(image_path)
        if image is None:
            raise ValueError(f"Failed to load image: {image_path}")

        # Remember the original size so the boxes can be scaled with the image
        orig_height, orig_width = image.shape[:2]
        width, height = self.image_size

        image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        image = cv.resize(image, (width, height))
        image = self.to_tensor(image)

        anns = self.coco.loadAnns(self.coco.getAnnIds(imgIds=image_id))
        boxes, labels = self._build_boxes(
            anns, width / orig_width, height / orig_height
        )

        target = {
            'boxes': boxes,
            'labels': labels,
            'image_id': torch.tensor([image_id]),
            # Also valid for an empty (0, 4) tensor: yields an empty area tensor
            'area': (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            'iscrowd': torch.zeros((len(boxes),), dtype=torch.int64)
        }
        return image, target

    def __getitem__(self, idx):
        """Return the sample at ``idx``, falling back to later samples on failure.

        A sample that cannot be loaded is reported and the next index is tried
        (wrapping around), at most once per dataset entry.

        Raises:
            IndexError: If ``idx`` is outside the dataset.
            RuntimeError: If no sample in the dataset could be loaded.
        """
        num_items = len(self)
        if not -num_items <= idx < num_items:
            raise IndexError(f"index {idx} out of range for dataset of size {num_items}")

        for offset in range(num_items):
            current = (idx + offset) % num_items
            # Set before the try block so the error message can always use it
            image_path = "<unknown>"
            try:
                image_id = self.valid_image_ids[current]
                image_path = self._image_path(image_id)
                return self._load_sample(image_id, image_path)
            except Exception as e:
                print(f"Error processing image {image_path}: {str(e)}")

        raise RuntimeError("No image in the dataset could be loaded")

# TODO: move model construction into its own class/module for readability
def get_model(num_classes):
    """Build a pre-trained Faster R-CNN (ResNet-50 FPN) with a new box head.

    Args:
        num_classes: Number of output classes of the replacement predictor
            (the caller in this file passes the category count plus one for
            background).

    Returns:
        The model with ``roi_heads.box_predictor`` replaced.
    """
    # `weights="DEFAULT"` is the current spelling of the deprecated
    # `pretrained=True` flag.
    model = fasterrcnn_resnet50_fpn(weights="DEFAULT")

    # Input width of the existing classification layer
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    # Replace the pre-trained head with one sized for our classes
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    return model

def compute_ap(recalls, precisions):
    """Compute 11-point interpolated average precision.

    For each recall threshold in ``0.0, 0.1, ..., 1.0`` the highest precision
    among points whose recall reaches that threshold is taken (0 if none
    does), and the eleven values are averaged.

    Args:
        recalls: Sequence of recall values.
        precisions: Sequence of precision values, aligned with ``recalls``.

    Returns:
        The average precision as a float in ``[0, 1]``.
    """
    recalls = np.asarray(recalls, dtype=float)
    precisions = np.asarray(precisions, dtype=float)

    # Eleven thresholds, matching the 1/11 weight applied to each of them
    thresholds = np.linspace(0.0, 1.0, 11)

    ap = 0.0
    for t in thresholds:
        reached = recalls >= t
        if reached.any():
            ap += precisions[reached].max() / 11.0

    return float(ap)


def validate(model, data_loader, device, iou_threshold=0.5):
    """Evaluate ``model`` on ``data_loader`` and return the mean average precision.

    Detections and ground truths are collected per class. Within each class,
    detections are visited in descending score order and matched to the
    ground-truth box of the same image with the highest IoU; a detection is a
    true positive when that IoU reaches ``iou_threshold`` and the box has not
    been matched yet. AP is the area under the precision-recall curve.

    Args:
        model: Detection model; called with a list of image tensors.
        data_loader: Iterable yielding ``(images, targets)`` batches.
        device: Device the images are moved to before inference.
        iou_threshold: Minimum IoU for a detection to count as a match.

    Returns:
        Mean of the per-class AP values as a float (0.0 if there is no
        ground truth at all).
    """
    model.eval()

    all_detections = defaultdict(list)  # class_id -> list of detections
    all_ground_truths = defaultdict(list)  # class_id -> list of ground truths

    # Running index across ALL batches. The position inside a batch is not
    # unique and would merge unrelated images during matching.
    next_image_index = 0

    with torch.no_grad():
        for images, targets in tqdm(data_loader, desc="Validation"):
            images = [image.to(device) for image in images]
            predictions = model(images)

            for pred, target in zip(predictions, targets):
                img_idx = next_image_index
                next_image_index += 1

                boxes = pred['boxes'].cpu()
                scores = pred['scores'].cpu()
                labels = pred['labels'].cpu()
                for box, score, label in zip(boxes, scores, labels):
                    all_detections[label.item()].append({
                        'box': box,
                        'score': score.item(),
                        'img_idx': img_idx
                    })

                gt_boxes = target['boxes'].cpu()
                gt_labels = target['labels'].cpu()
                for box, label in zip(gt_boxes, gt_labels):
                    all_ground_truths[label.item()].append({
                        'box': box,
                        'img_idx': img_idx
                    })

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists
    integrate = getattr(np, "trapezoid", None) or np.trapz

    aps = []

    print("\nCalculating AP for each class:")
    for class_id, ground_truths in all_ground_truths.items():
        # Skip if no ground truths for this class
        if not ground_truths:
            continue

        # Highest-scoring detections get the first chance to claim a box
        detections = sorted(
            all_detections.get(class_id, []),
            key=lambda det: det['score'],
            reverse=True,
        )

        tp = np.zeros(len(detections))
        fp = np.zeros(len(detections))

        # Stack the ground-truth boxes of each image once, with a parallel
        # list of "already matched" flags
        boxes_per_img = defaultdict(list)
        for gt in ground_truths:
            boxes_per_img[gt['img_idx']].append(gt['box'])
        gt_tensors = {key: torch.stack(value) for key, value in boxes_per_img.items()}
        gt_matched = {key: [False] * len(value) for key, value in boxes_per_img.items()}

        for det_idx, detection in enumerate(detections):
            img_key = detection['img_idx']

            # No ground truth of this class in the image: false positive
            if img_key not in gt_tensors:
                fp[det_idx] = 1
                continue

            iou = box_iou(detection['box'].unsqueeze(0), gt_tensors[img_key])
            max_iou = iou.max().item()
            max_idx = iou.argmax().item()

            if max_iou >= iou_threshold and not gt_matched[img_key][max_idx]:
                tp[det_idx] = 1
                gt_matched[img_key][max_idx] = True
            else:
                # Too little overlap, or the best box was already claimed
                fp[det_idx] = 1

        # Calculate precision and recall
        tp_cumsum = np.cumsum(tp)
        fp_cumsum = np.cumsum(fp)

        recalls = tp_cumsum / len(ground_truths)
        precisions = tp_cumsum / (tp_cumsum + fp_cumsum)

        # Add endpoints for AP calculation
        precisions = np.concatenate([[1], precisions])
        recalls = np.concatenate([[0], recalls])

        # Area under the precision-recall curve (trapezoidal rule)
        ap = integrate(precisions, recalls)
        aps.append(ap)

        print(f"Class {class_id}: AP = {ap:.4f}")

    # Plain float so the value can be stored in checkpoints without NumPy types
    mAP = float(np.mean(aps)) if aps else 0.0
    print(f"\nMean Average Precision (mAP): {mAP:.4f}")

    return mAP

def train_one_epoch(model, optimizer, data_loader, device, scaler=None):
    """Run one training pass over ``data_loader`` and return the mean loss.

    Batches containing an image without boxes, batches with a non-finite loss
    and batches that raise are skipped; a raised error is printed and training
    continues with the next batch.

    Args:
        model: Detection model returning a dict of losses in training mode.
        optimizer: Optimizer updating the model parameters.
        data_loader: Iterable yielding ``(images, targets)`` batches.
        device: ``torch.device`` the batch is moved to.
        scaler: Optional gradient scaler. When given, the forward pass runs
            under autocast and the step goes through the scaler; when omitted,
            a plain full-precision step is performed.

    Returns:
        Average summed loss over the batches that were actually trained on, or
        ``float('inf')`` if no batch was.
    """
    model.train()
    total_loss = 0.0
    valid_batches = 0
    use_amp = scaler is not None

    for images, targets in tqdm(data_loader, desc="Training"):
        try:
            # Move images and targets to device
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Skip empty targets
            if any(len(target['boxes']) == 0 for target in targets):
                continue

            # Forward pass (mixed precision only when a scaler is supplied)
            with autocast(device_type=device.type, enabled=use_amp):
                loss_dict = model(images, targets)
                losses = sum(loss for loss in loss_dict.values())

            # Skip NaN as well as infinite losses
            if not torch.isfinite(losses):
                continue

            optimizer.zero_grad()
            if use_amp:
                scaler.scale(losses).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                losses.backward()
                optimizer.step()

            total_loss += losses.item()
            valid_batches += 1

        except Exception as e:
            # Best effort: report the failing batch and carry on
            print(f"Error in training batch: {str(e)}")
            continue

    return total_loss / valid_batches if valid_batches > 0 else float('inf')

def main():
    """Train the traffic-sign detector and save periodic and best checkpoints.

    Each epoch trains once, writes ``checkpoint_epoch_<n>.pth`` every second
    epoch, steps the learning-rate schedule and then validates. A validation
    failure is reported and does not stop training; ``best_model.pth`` is
    written whenever the validation mAP improves.
    """
    # Set device and enable blocking CUDA calls for better error messages
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Dataset paths
    root_dir = "D:/Thinkin in programming/R-CNN/train"
    train_annotation = "D:/Thinkin in programming/R-CNN/train/_annotations.coco.json"
    val_root_dir = "D:/Thinkin in programming/R-CNN/valid"
    val_annotation = "D:/Thinkin in programming/R-CNN/valid/_annotations.coco.json"

    # Create datasets
    train_dataset = TrafficSignDataset(root_dir, train_annotation)
    val_dataset = TrafficSignDataset(val_root_dir, val_annotation)

    # Get number of classes
    num_classes = len(train_dataset.category_ids) + 1
    print(f"Number of classes (including background): {num_classes}")

    # initialize scaler
    scaler = GradScaler()

    # Create data loaders with smaller batch size
    train_loader = DataLoader(
        train_dataset,
        batch_size=1,
        shuffle=True,
        collate_fn=lambda x: tuple(zip(*x)),
        num_workers=0
    )

    # num_workers=0: the lambda collate_fn cannot be pickled for worker
    # processes on platforms that spawn them (e.g. Windows)
    val_loader = DataLoader(
        val_dataset,
        batch_size=2,
        shuffle=False,
        collate_fn=lambda x: tuple(zip(*x)),
        num_workers=0
    )

    # Initialize model
    model = get_model(num_classes)
    model.to(device)

    # Initialize optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0005, momentum=0.9, weight_decay=0.0005)

    # Initialize learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    # Training loop
    num_epochs = 10
    best_map = 0.0

    # Histories stored alongside the best checkpoint
    train_losses = []
    val_maps = []

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Train exactly once per epoch, always with the scaler
        train_loss = train_one_epoch(model, optimizer, train_loader, device, scaler)
        print(f"Train Loss: {train_loss:.4f}")
        train_losses.append(train_loss)

        # Periodic checkpoint, written before validation so it exists even if
        # validation fails
        if (epoch + 1) % 2 == 0:  # Save every 2 epochs
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
            }, f'checkpoint_epoch_{epoch+1}.pth')

        # Update learning rate regardless of the validation outcome
        lr_scheduler.step()

        try:
            # Plain float keeps NumPy scalar types out of the saved checkpoint
            val_map = float(validate(model, val_loader, device))
        except Exception as e:
            print(f"Error in epoch {epoch+1}: {e}")
            traceback.print_exc()
            continue

        print(f"Validation mAP: {val_map:.4f}")
        val_maps.append(val_map)

        # Save best model (based on mAP)
        if val_map > best_map:
            best_map = val_map
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_map': best_map,
                'train_losses': train_losses,
                'val_maps': val_maps
            }, 'best_model.pth')
            print("Saved best model checkpoint")


if __name__ == "__main__":
    main()

Using device: cuda
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
Original category IDs: [0, 1, 2, 3]
Category mapping: {0: 1, 1: 2, 2: 3, 3: 4}
Found 834 valid images out of 834 total
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Original category IDs: [0, 1, 2, 3]
Category mapping: {0: 1, 1: 2, 2: 3, 3: 4}
Found 50 valid images out of 50 total
Number of classes (including background): 5


Training:   0%|          | 1/834 [00:00<02:28,  5.59it/s]

Final tensor shape: torch.Size([3, 400, 400])
Error in batch: Could not run 'torchvision::nms' with arguments from the 'CUDA' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'torchvision::nms' is only available for these backends: [CPU, Meta, QuantizedCPU, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMeta, Tracer, AutocastCPU, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

CPU: registered at 

Training:   0%|          | 2/834 [00:00<02:14,  6.18it/s]


Error in batch: Could not run 'torchvision::nms' with arguments from the 'CUDA' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'torchvision::nms' is only available for these backends: [CPU, Meta, QuantizedCPU, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMeta, Tracer, AutocastCPU, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

CPU: registered at C:\actions-runner\_work\vision\vision\pytorch\

IndexError: too many indices for tensor of dimension 1

In [2]:
import torch

# Report CUDA status. The device queries raise when no CUDA device is usable,
# so they are only issued after is_available() confirms one.
cuda_ok = torch.cuda.is_available()
print(cuda_ok)
print(torch.cuda.device_count())
if cuda_ok:
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available")

True
1
0
NVIDIA GeForce RTX 2050


In [29]:
def load_model(model_path, num_classes):
  """Rebuild the detector and load its weights from a saved checkpoint.

  Args:
    model_path: Path to a checkpoint containing a ``'model_state_dict'`` entry.
    num_classes: Number of classes of the box predictor the checkpoint was
      trained with.

  Returns:
    The model with the checkpoint weights loaded. It is left on the CPU; the
    caller moves it to the target device.
  """
  # `weights=None` is the current spelling of the deprecated `pretrained=False`
  model = fasterrcnn_resnet50_fpn(weights=None)
  in_features = model.roi_heads.box_predictor.cls_score.in_features
  model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

  # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
  checkpoint = torch.load(model_path, map_location="cpu")
  model.load_state_dict(checkpoint['model_state_dict'])

  return model


def predict_image(model, image_path, device, confidence_threshold = 0.5):
  """Run the detector on an image file and keep confident detections.

  Args:
    model: Detection model; moved to ``device`` and put in eval mode.
    image_path: Path of the image to read.
    device: Device used for inference.
    confidence_threshold: Detections scoring below this value are dropped.

  Returns:
    ``(image, boxes, scores, labels)``: the RGB image as read from disk and
    the filtered predictions as NumPy arrays.

  Raises:
    FileNotFoundError: If the image cannot be read from ``image_path``.
  """
  # cv.imread signals failure by returning None instead of raising
  image = cv.imread(image_path)
  if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
  image = cv.cvtColor(image, cv.COLOR_BGR2RGB)

  # Scale by 1/255 and reorder to a 1 x C x H x W batch
  image_tensor = torch.from_numpy(image).float() / 255.0
  image_tensor = image_tensor.permute(2, 0, 1).unsqueeze(0)

  # Move to device and set model to evaluation mode
  model = model.to(device)
  model.eval()
  image_tensor = image_tensor.to(device)

  # Get prediction
  with torch.no_grad():
      predictions = model(image_tensor)

  # The first entry holds the predictions for the single input image
  boxes = predictions[0]['boxes'].cpu().numpy()
  scores = predictions[0]['scores'].cpu().numpy()
  labels = predictions[0]['labels'].cpu().numpy()

  # Filter predictions based on confidence threshold
  mask = scores >= confidence_threshold
  boxes = boxes[mask]
  scores = scores[mask]
  labels = labels[mask]

  return image, boxes, scores, labels

def draw_predictions(image, boxes, scores, labels, class_names=None):
    """Draw boxes, class labels and a summary line on a copy of ``image``.

    Args:
        image: Image array to annotate; it is copied, not modified.
        boxes: Iterable of ``[x1, y1, x2, y2]`` boxes.
        scores: Confidence score per box.
        labels: Integer class label per box.
        class_names: Optional sequence indexed by label; labels outside it are
            shown as ``Class <label>``.

    Returns:
        The annotated copy of ``image``.
    """
    image_draw = image.copy()

    # Colors cycled through by class label
    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]

    # NOTE(review): the "Accuracy" figure is only the share of detections with
    # score >= 0.5; it is not measured against ground truth.
    total_predictions = len(scores)
    correct_predictions = sum(1 for score in scores if score >= 0.5)

    for box, score, label in zip(boxes, scores, labels):
        # Plain Python ints: OpenCV drawing calls can reject NumPy scalars
        x1, y1, x2, y2 = (int(v) for v in box)
        label = int(label)

        color = colors[label % len(colors)]

        cv.rectangle(image_draw, (x1, y1), (x2, y2), color, 2)

        if class_names and label < len(class_names):
            label_text = f"{class_names[label]}: {score:.2f}"
        else:
            label_text = f"Class {label}: {score:.2f}"

        # Put the label above the box, or just inside it when the box touches
        # the top edge and the text would otherwise fall outside the image
        text_y = y1 - 10 if y1 - 10 > 10 else y1 + 15
        cv.putText(image_draw, label_text, (x1, text_y),
                   cv.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    precision = (correct_predictions / total_predictions) * 100 if total_predictions > 0 else 0
    accuracy_text = f"Accuracy: {precision:.2f}%"

    # Summary line in the top-left corner
    cv.putText(image_draw, accuracy_text, (10, 30), cv.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

    return image_draw

def main():
    """Load the saved detector, annotate one test image, then show and save it."""
    # Pick the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Class names (adjust these to match your classes); the model is built with
    # 5 classes including background
    names = ['background', 'other', 'caution', 'information', 'Warning']

    # Restore the trained model and place it on the chosen device
    detector = load_model('/content/best_model.pth', 5)
    detector.to(device)

    # Run inference on the test image
    rgb, det_boxes, det_scores, det_labels = predict_image(
        detector,
        '/content/drive/MyDrive/R-CNN/Untitled.png',
        device,
        confidence_threshold=0.5,
    )

    # Overlay the detections
    annotated = draw_predictions(rgb, det_boxes, det_scores, det_labels, names)

    # Show the annotated image
    plt.figure(figsize=(12, 8))
    plt.imshow(annotated)
    plt.axis('off')
    plt.show()

    # Convert from RGB before writing with OpenCV
    cv.imwrite('prediction_result.jpg', cv.cvtColor(annotated, cv.COLOR_RGB2BGR))
    print("Results saved to 'prediction_result.jpg'")

if __name__ == "__main__":
    main()


Using device: cuda


  checkpoint = torch.load(model_path)


FileNotFoundError: [Errno 2] No such file or directory: '/content/best_model.pth'

REAL-TIME

In [2]:
def load_model(model_path, num_classes):
    """Rebuild the detector and load its weights from a saved checkpoint.

    Args:
        model_path: Path to a checkpoint containing a ``'model_state_dict'``
            entry.
        num_classes: Number of classes of the box predictor the checkpoint was
            trained with.

    Returns:
        The model with the checkpoint weights loaded. It is left on the CPU;
        the caller moves it to the target device.
    """
    # `weights=None` is the current spelling of the deprecated `pretrained=False`
    model = fasterrcnn_resnet50_fpn(weights=None)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
    checkpoint = torch.load(model_path, map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])

    return model

def predict_image(model, image, device, confidence_threshold=0.5):
    """Run the detector on one in-memory image and keep confident detections.

    Args:
        model: Detection model; moved to ``device`` and switched to eval mode.
        image: NumPy image array (presumably H x W x C, given the permute
            below - confirm against callers).
        device: Device used for inference.
        confidence_threshold: Detections scoring below this value are dropped.

    Returns:
        ``(boxes, scores, labels)`` as NumPy arrays for the kept detections.
    """
    # Scale by 1/255, move the last axis first and add a batch dimension
    batch = (torch.from_numpy(image).float() / 255.0).permute(2, 0, 1).unsqueeze(0)

    # Put model and input on the same device; inference runs in eval mode
    model = model.to(device)
    model.eval()
    batch = batch.to(device)

    with torch.no_grad():
        first = model(batch)[0]

    # Pull the three prediction fields back to NumPy in one go
    boxes, scores, labels = (
        first[key].cpu().numpy() for key in ('boxes', 'scores', 'labels')
    )

    # Keep only detections at or above the threshold
    keep = scores >= confidence_threshold
    return boxes[keep], scores[keep], labels[keep]

def draw_predictions(image, boxes, scores, labels, class_names=None):
    """Draw boxes and class labels on a copy of ``image``.

    Args:
        image: Image array to annotate; it is copied, not modified.
        boxes: Iterable of ``[x1, y1, x2, y2]`` boxes.
        scores: Confidence score per box.
        labels: Integer class label per box.
        class_names: Optional sequence indexed by label; labels outside it are
            shown as ``Class <label>``.

    Returns:
        The annotated copy of ``image``.
    """
    image_draw = image.copy()

    # Colors cycled through by class label
    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]

    for box, score, label in zip(boxes, scores, labels):
        # Plain Python ints: OpenCV drawing calls can reject NumPy scalars
        x1, y1, x2, y2 = (int(v) for v in box)
        label = int(label)

        color = colors[label % len(colors)]

        cv.rectangle(image_draw, (x1, y1), (x2, y2), color, 2)

        if class_names and label < len(class_names):
            label_text = f"{class_names[label]}: {score:.2f}"
        else:
            label_text = f"Class {label}: {score:.2f}"

        # Put the label above the box, or just inside it when the box touches
        # the top edge and the text would otherwise fall outside the image
        text_y = y1 - 10 if y1 - 10 > 10 else y1 + 15
        # OpenCV is imported as `cv` here; the original `cv2.` prefix was not
        # a name this cell defines
        cv.putText(image_draw, label_text, (x1, text_y),
                   cv.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return image_draw


def display_frame(frame):
  """Show a BGR frame with matplotlib (converted to RGB, axes hidden)."""
  plt.figure(figsize=(10, 10))
  # OpenCV is imported as `cv` in this file, not `cv2`
  plt.imshow(cv.cvtColor(frame, cv.COLOR_BGR2RGB))
  plt.axis('off')  # Hide axis
  plt.show()

def main():
    """Run the detector on webcam frames until 'q' is pressed.

    Each frame is converted to RGB for inference, the detections are drawn on
    the original frame, and the result is shown in an OpenCV window. The
    capture device and the window are released even if processing fails.
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Model path and number of classes (including background)
    model_path = '/content/best_model.pth'
    num_classes = 5  # Adjust this based on your model

    # Class names (adjust these to match your classes)
    class_names = ['background', 'warning', 'caution', 'information', 'other']

    # Load model
    model = load_model(model_path, num_classes)
    model.to(device)

    # Open webcam
    cap = cv.VideoCapture(0)  # Change to a video file path for testing with videos
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        cap.release()
        return

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Failed to grab frame.")
                break

            # Convert BGR to RGB
            frame_rgb = cv.cvtColor(frame, cv.COLOR_BGR2RGB)

            # Get predictions
            boxes, scores, labels = predict_image(model, frame_rgb, device, confidence_threshold=0.5)

            # Draw predictions on the frame
            result_frame = draw_predictions(frame, boxes, scores, labels, class_names)

            # Display the frame with predictions
            cv.imshow("Real-Time Object Detection", result_frame)

            # Break the loop when 'q' is pressed (`cv`, not `cv2`, is the
            # OpenCV alias imported here)
            if cv.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always release the webcam and close windows, even after an error
        cap.release()
        cv.destroyAllWindows()

if __name__ == "__main__":
    main()


Validation folder exists: True
Validation annotation exists: True

Validation annotation structure:
Number of images: 50
Number of annotations: 45
