In [1]:

import cv2
import numpy as np
from IPython.display import display, clear_output
from ultralytics import YOLO
import ipywidgets as widgets
import os


In [2]:

# Cell 3: Load YOLOv8 model
model = YOLO('yolov8m.pt')  # fast + lightweight


In [3]:

# Get COCO class names
COCO_CLASSES = model.names

# Cell 4: User selects classes
class_selector = widgets.SelectMultiple(
    options=list(COCO_CLASSES.values()),
    value=['person', 'cat', 'dog'],
    description='Select classes to detect:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)
display(class_selector)


SelectMultiple(description='Select classes to detect:', index=(0, 15, 16), layout=Layout(width='50%'), options…

In [4]:

# Cell 5: File selector widget
video_path_widget = widgets.Text(
    value='dog_and_cat.mp4',
    description='Video File Path:',
    layout=widgets.Layout(width='75%')
)
display(video_path_widget)


Text(value='dog_and_cat.mp4', description='Video File Path:', layout=Layout(width='75%'))

In [5]:

# Cell 6: Helper function to compute relative position
def get_relative_position(box_center, frame_center):
    x_rel = 2 * (box_center[0] - frame_center[0]) / frame_center[0]
    y_rel = 2 * (box_center[1] - frame_center[1]) / frame_center[1]
    return round(x_rel, 2), round(y_rel, 2)


In [None]:
# Cell 7: Load video file and process
video_path = video_path_widget.value

if not os.path.exists(video_path):
    print(f"❌ File not found: {video_path}")
else:
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("❌ Unable to open video.")
    else:
        ret, frame = cap.read()
        if not ret:
            print("❌ Unable to read video.")
        else:
            h, w = frame.shape[:2]
            frame_center = np.array([w / 2, h / 2])
            print("🎥 Processing video...")

            # Create a named window for displaying images
            cv2.namedWindow('Annotated Frame', cv2.WINDOW_NORMAL)

            try:
                frame_count = 0
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        print("✅ End of video stream.")
                        break

                    results = model(frame)[0]
                    annotated_frame = frame.copy()
                    output_data = []

                    for i, det in enumerate(results.boxes):
                        cls_id = int(det.cls[0])
                        cls_name = COCO_CLASSES[cls_id]

                        if cls_name not in class_selector.value:
                            continue

                        conf = float(det.conf[0])
                        x1, y1, x2, y2 = map(int, det.xyxy[0])
                        box_center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
                        rel_x, rel_y = get_relative_position(box_center, frame_center)

                        label = f"{cls_name} {conf:.2f} [{rel_x}, {rel_y}]"
                        cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                        cv2.putText(annotated_frame, label, (x1, y1 - 10),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

                        output_data.append({
                            "frame": frame_count,
                            "id": i,
                            "class": cls_name,
                            "confidence": round(conf, 2),
                            "rel_position": {"x": rel_x, "y": rel_y}
                        })

                    for obj in output_data:
                        print(obj)

                    # Display the annotated frame using OpenCV in the named window
                    cv2.imshow('Annotated Frame', annotated_frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):  # Press 'q' to exit
                        break

                    frame_count += 1

            finally:
                cap.release()
                cv2.destroyAllWindows()  # Close the OpenCV window
                print("✅ Video processing complete.")


🎥 Processing video...

0: 384x640 1 dog, 252.7ms
Speed: 7.2ms preprocess, 252.7ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)
{'frame': 0, 'id': 0, 'class': 'dog', 'confidence': 0.82, 'rel_position': {'x': np.float64(0.88), 'y': np.float64(0.47)}}

0: 384x640 1 dog, 255.8ms
Speed: 2.2ms preprocess, 255.8ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)
{'frame': 1, 'id': 0, 'class': 'dog', 'confidence': 0.82, 'rel_position': {'x': np.float64(0.88), 'y': np.float64(0.47)}}

0: 384x640 1 bus, 1 dog, 247.3ms
Speed: 2.8ms preprocess, 247.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)
{'frame': 2, 'id': 0, 'class': 'dog', 'confidence': 0.72, 'rel_position': {'x': np.float64(0.88), 'y': np.float64(0.43)}}

0: 384x640 1 bus, 1 dog, 220.5ms
Speed: 1.3ms preprocess, 220.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
{'frame': 3, 'id': 0, 'class': 'dog', 'confidence': 0.66, 'rel_position': {'x': np.float64(0.88), 

KeyboardInterrupt: 