In [143]:
from ultralytics import YOLO
import cv2

from sort.sort import Sort

import numpy as np
import pandas as pd

In [144]:
log_df = pd.DataFrame(columns=["id", "x1", "y1", "x2", "y2", "class", "confidence"])

In [145]:
model = YOLO("yolov8n.pt")

In [146]:
tracker = Sort(max_age=30, min_hits=3, iou_threshold=0.3)

In [147]:
cap = cv2.VideoCapture(0)

In [148]:
ret, frame = cap.read()

print(frame.shape)
xbuffer = 10
ybuffer = 8
roi = [xbuffer, ybuffer, frame.shape[1]-xbuffer, frame.shape[0]-ybuffer]


(480, 640, 3)


In [149]:
while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Run YOLOv8 inference
    results = model.predict(source=frame, show=False, conf=0.5)


    # Get first result (single frame)
    result = results[0]

    detections = np.empty((0, 5))

    # Extract bounding boxes
    for box in result.boxes:
        x1, y1, x2, y2 = box.xyxy[0]  
        x1, y1, x2, y2 = int(x1), int(x2), int(y1), int(y2)

        conf = box.conf[0]           
        cls = int(box.cls[0])

        CurrentArray = np.array([x1, y1, x2, y2, conf])
        detections = np.vstack((detections, CurrentArray))

    TrackResults = tracker.update(detections)
    for tracking in TrackResults:
        x1, y1, x2, y2, id = tracking
        x1, y1, x2, y2 = int(x1), int(x2), int(y1), int(y2)

        cx, cy = (x2-x1)/2, (y2-y1)/2

        if roi[0]< cx < roi[2] and roi[1] < cy < roi[3] and int(id) not in log_df['id'].values:
            log_df.loc[len(log_df)] = [id, x1, y1, x2, y2, model.names[cls], float(conf)]

        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 3)
        cv2.putText(frame, f'{id}', (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 1)
        # print(track)
    
    # Display result
    cv2.imshow("YOLOv8 Webcam with Boxes", frame)

    # Exit on ' '
    if cv2.waitKey(1) & 0xFF == ord(' '):
        break

cap.release()
cv2.destroyAllWindows()


0: 480x640 1 person, 63.1ms
Speed: 2.4ms preprocess, 63.1ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 77.6ms
Speed: 1.6ms preprocess, 77.6ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.4ms
Speed: 1.3ms preprocess, 60.4ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.1ms
Speed: 2.0ms preprocess, 61.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.1ms
Speed: 1.5ms preprocess, 66.1ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.7ms
Speed: 1.3ms preprocess, 59.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.2ms
Speed: 1.0ms preprocess, 57.2ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.3ms
Speed: 1.0ms preprocess, 61.3ms inference, 1.2ms postprocess per image at shape (1, 3, 48

  o = wh / ((bb_test[..., 2] - bb_test[..., 0]) * (bb_test[..., 3] - bb_test[..., 1])


Speed: 1.1ms preprocess, 54.8ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.4ms
Speed: 1.1ms preprocess, 59.4ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.9ms
Speed: 1.0ms preprocess, 60.9ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 55.0ms
Speed: 1.2ms preprocess, 55.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.9ms
Speed: 1.0ms preprocess, 59.9ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.5ms
Speed: 0.9ms preprocess, 66.5ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 70.3ms
Speed: 1.4ms preprocess, 70.3ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 56.8ms
Speed: 1.0ms preprocess, 56.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,

  h = x[2] / w


Speed: 1.0ms preprocess, 55.7ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.9ms
Speed: 1.0ms preprocess, 60.9ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.5ms
Speed: 1.4ms preprocess, 57.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.2ms
Speed: 1.0ms preprocess, 59.2ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.7ms
Speed: 1.1ms preprocess, 58.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.3ms
Speed: 0.9ms preprocess, 57.3ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.9ms
Speed: 1.1ms preprocess, 61.9ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.1ms
Speed: 1.0ms preprocess, 58.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,

  h = x[2] / w


0: 480x640 2 persons, 80.2ms
Speed: 1.8ms preprocess, 80.2ms inference, 2.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 102.9ms
Speed: 2.4ms preprocess, 102.9ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 cat, 87.1ms
Speed: 2.3ms preprocess, 87.1ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 91.2ms
Speed: 2.4ms preprocess, 91.2ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 92.8ms
Speed: 2.1ms preprocess, 92.8ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 121.4ms
Speed: 2.2ms preprocess, 121.4ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 128.9ms
Speed: 2.7ms preprocess, 128.9ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 99.7ms
Speed: 2.6ms preprocess, 99.7ms inference, 1.2ms po

  h = x[2] / w


0: 480x640 1 person, 102.4ms
Speed: 1.9ms preprocess, 102.4ms inference, 1.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 105.7ms
Speed: 2.5ms preprocess, 105.7ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 92.5ms
Speed: 2.0ms preprocess, 92.5ms inference, 2.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 124.2ms
Speed: 3.2ms preprocess, 124.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 107.7ms
Speed: 2.5ms preprocess, 107.7ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 99.4ms
Speed: 2.5ms preprocess, 99.4ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 97.8ms
Speed: 2.0ms preprocess, 97.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 105.1ms
Speed: 2.3ms preprocess, 105.1ms inference, 1.6ms pos

  h = x[2] / w


Speed: 1.7ms preprocess, 73.5ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 85.9ms
Speed: 2.5ms preprocess, 85.9ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 76.5ms
Speed: 1.7ms preprocess, 76.5ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 80.8ms
Speed: 1.8ms preprocess, 80.8ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 105.5ms
Speed: 1.9ms preprocess, 105.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)


In [6]:
print(model.names)

{0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microw

In [150]:
print(log_df)

       id  x1   y1   x2   y2         class  confidence
0  4768.0  23   77  537  479        person    0.852019
1  4769.0  27   71  537  479        person    0.859774
2  4770.0  23   68  543  479        person    0.869325
3  4919.0   1   25  270  475        person    0.691459
4  5039.0   3  271   87  487        person    0.899150
5  5093.0  30  276  314  474        person    0.568963
6  5204.0   0  132   80  183  dining table    0.520930
