In [None]:
import cv2
from ultralytics import YOLO
import math

# Initialize YOLO model (downloads automatically on first run)
model = YOLO("yolov8n.pt")  # You can use 'yolov8n.pt', 'yolov8s.pt', etc.

# Open video capture (0 for webcam, or video file path)
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Fail loudly here; otherwise the first cap.read() just returns False and
    # the script ends without ever showing a window or an error.
    cap.release()
    raise RuntimeError("Could not open video source 0 (webcam)")

# Class names indexed by class id, taken from the loaded weights rather than
# a hand-maintained list, so labels always match the ids the model emits
# (including when different or custom-trained weights are loaded above).
classNames = [model.names[i] for i in range(len(model.names))]

# Main loop: grab a frame, run detection, draw boxes/labels and a per-class
# tally, then show the annotated frame. The try/finally guarantees the capture
# device and the display window are released even when the loop is left via an
# exception (e.g. interrupting the notebook kernel), not only via 'q' or a
# failed read.
try:
    while True:
        success, img = cap.read()
        if not success:
            # No frame could be read from the capture source
            break

        # Run object detection on the current frame
        results = model(img, stream=True)

        # Per-frame detection tally: class name -> number of boxes
        detection_counts = {}

        # Process detection results
        for r in results:
            for box in r.boxes:
                # Bounding box corners as integer pixel coordinates
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])

                # Confidence rounded up to two decimal places
                confidence = math.ceil(box.conf[0] * 100) / 100

                # Map the predicted class id to its name
                class_name = classNames[int(box.cls[0])]

                # Draw bounding box
                cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 2)

                # Draw label with confidence. Clamp the text baseline so the
                # label stays visible when the box touches the top of the frame.
                label = f'{class_name} {confidence:.2f}'
                label_y = max(y1 - 10, 15)
                cv2.putText(img, label, (x1, label_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 255), 2)

                # Update count for detected class
                detection_counts[class_name] = detection_counts.get(class_name, 0) + 1

        # Display counts on image, one line per class down the left edge
        y_offset = 30
        for obj_class, count in detection_counts.items():
            count_text = f'{obj_class}: {count}'
            cv2.putText(img, count_text, (10, y_offset),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
            y_offset += 30

        # Show result
        cv2.imshow('Object Detection and Counting', img)

        # Exit on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Cleanup: always free the camera and close the window
    cap.release()
    cv2.destroyAllWindows()


0: 480x640 1 person, 155.2ms
Speed: 32.5ms preprocess, 155.2ms inference, 16.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.8ms
Speed: 3.2ms preprocess, 65.8ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.9ms
Speed: 1.1ms preprocess, 57.9ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.1ms
Speed: 1.7ms preprocess, 57.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.7ms
Speed: 1.5ms preprocess, 58.7ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 50.9ms
Speed: 1.0ms preprocess, 50.9ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 54.2ms
Speed: 1.0ms preprocess, 54.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 55.4ms
Speed: 1.3ms preprocess, 55.4ms inference, 0.8ms postprocess per image at shape (1, 3