In [None]:
import cv2
import numpy as np
import torch
import torchvision.transforms as transforms
from yolov5.models.common import DetectMultiBackend
from yolov5.utils.general import non_max_suppression
from yolov5.utils.torch_utils import select_device

# Define scale_coords manually
def scale_coords(img1_shape, coords, img0_shape):
    """Rescale xyxy boxes from one image size to another, in place.

    The frame is resized straight to the network input size by
    ``preprocess_frame`` (a plain stretch, no letterbox padding), so the
    inverse mapping is an independent scale factor per axis with no padding
    offset.

    Args:
        img1_shape: ``(height, width)`` of the image the boxes currently
            refer to (the network input).
        coords: Tensor whose first four columns are ``x1, y1, x2, y2``.
            It is modified in place.
        img0_shape: ``(height, width)`` of the image the boxes should be
            mapped onto (the original frame).

    Returns:
        The same ``coords`` tensor, rescaled and clamped to the bounds of
        the target image.
    """
    target_h, target_w = img0_shape[0], img0_shape[1]
    x_scale = target_w / img1_shape[1]
    y_scale = target_h / img1_shape[0]

    coords[:, [0, 2]] *= x_scale
    coords[:, [1, 3]] *= y_scale

    # Keep boxes inside the target image so drawing never leaves the frame.
    coords[:, [0, 2]] = coords[:, [0, 2]].clamp(min=0, max=target_w)
    coords[:, [1, 3]] = coords[:, [1, 3]].clamp(min=0, max=target_h)
    return coords

# Load YOLOv5 model
def load_model(weights='yolov5s.pt', device='cpu'):
    """Build a YOLOv5 detector and put it in evaluation mode.

    Args:
        weights: Weights path or name handed to ``DetectMultiBackend``.
        device: Device specifier handed to ``select_device``.

    Returns:
        A ``(model, device)`` tuple, where ``device`` is the value returned
        by ``select_device``.
    """
    resolved_device = select_device(device)
    detector = DetectMultiBackend(weights, device=resolved_device)
    detector.eval()
    return detector, resolved_device

# Preprocess frame
def preprocess_frame(frame, img_size=640):
    """Convert a captured frame into a batched network input tensor.

    Args:
        frame: Image to preprocess. A 3-channel NumPy array is assumed to be
            in OpenCV's BGR channel order (as produced by
            ``cv2.VideoCapture.read``) and is converted to RGB first.
        img_size: Side length of the square the frame is resized to. The
            resize is a plain stretch; the aspect ratio is not preserved.

    Returns:
        Tensor of shape ``(1, C, img_size, img_size)`` produced by
        ``ToTensor`` with a leading batch dimension added.
    """
    # ToPILImage interprets an H x W x 3 array as RGB, while OpenCV delivers
    # BGR; without this swap the model sees red and blue exchanged.
    if isinstance(frame, np.ndarray) and frame.ndim == 3 and frame.shape[2] == 3:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
    ])
    return transform(frame).unsqueeze(0)

# Perform object detection
def detect_objects(model, device, frame, conf_thres=0.4, iou_thres=0.5):
    """Run the detector on one frame and return the NMS-filtered result.

    Args:
        model: Callable detector, invoked on the preprocessed batch.
        device: Device the input tensor is moved to before inference.
        frame: Raw frame passed to ``preprocess_frame``.
        conf_thres: Confidence threshold forwarded to ``non_max_suppression``.
        iou_thres: IoU threshold forwarded to ``non_max_suppression``.

    Returns:
        Whatever ``non_max_suppression`` returns for the model output.
    """
    batch = preprocess_frame(frame).to(device)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        raw_output = model(batch)

    return non_max_suppression(raw_output, conf_thres, iou_thres)

# Draw bounding boxes
def draw_boxes(frame, pred, img_size=640):
    """Draw every detection in ``pred`` onto ``frame`` and return the frame.

    Args:
        frame: Image array to draw on; it is modified in place.
        pred: Iterable of per-image detections. Each entry is ``None``, empty,
            or a tensor whose rows start with ``x1, y1, x2, y2`` and end with
            ``conf, cls``. The box columns are rescaled in place.
        img_size: Side length of the square input the boxes refer to.

    Returns:
        The same ``frame`` object, with boxes and confidence labels drawn.
    """
    frame_h, frame_w = frame.shape[:2]
    green = (0, 255, 0)
    for det in pred:
        # Nothing to draw for a missing or empty detection set.
        if det is None or not len(det):
            continue
        det[:, :4] = scale_coords((img_size, img_size), det[:, :4], (frame_h, frame_w)).round()
        for *xyxy, conf, _cls in det:
            left, top = int(xyxy[0]), int(xyxy[1])
            right, bottom = int(xyxy[2]), int(xyxy[3])
            cv2.rectangle(frame, (left, top), (right, bottom), green, 2)
            cv2.putText(frame, f'Conf: {conf:.2f}', (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)
    return frame

# Real-time object detection
def object_detection_realtime():
    """Run detection on the default camera until 'q' is pressed.

    Loads the model, opens capture device 0, then repeatedly reads a frame,
    runs detection, draws the boxes and shows the result in a window. The
    loop ends when the capture is closed, a frame cannot be read, or the
    user presses 'q'. If the camera cannot be opened the loop body never
    runs and the function returns after cleanup.
    """
    model, device = load_model()
    cap = cv2.VideoCapture(0)

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            pred = detect_objects(model, device, frame)
            frame = draw_boxes(frame, pred)

            cv2.imshow('Object Detection', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the window, even if detection or
        # drawing raises or the user interrupts with Ctrl-C.
        cap.release()
        cv2.destroyAllWindows()

# Start the real-time detection loop only when this file is executed directly
# (not when it is imported as a module).
if __name__ == '__main__':
    object_detection_realtime()


YOLOv5 🚀 2025-3-2 Python-3.11.7 torch-2.6.0 CPU

Fusing layers... 
YOLOv5s summary: 270 layers, 7235389 parameters, 0 gradients, 16.6 GFLOPs
2025-03-02 17:58:21.365 python[2187:56722] +[IMKClient subclass]: chose IMKClient_Modern
2025-03-02 17:58:21.365 python[2187:56722] +[IMKInputSession subclass]: chose IMKInputSession_Modern
