In [2]:
# libraries 
import cv2 # manipulate frames and video (display)
from deep_sort_realtime.deepsort_tracker import DeepSort # to use the model of track 
from ultralytics import YOLO # to use yolo to track 


# Set environment variable to avoid duplicate library errors
# os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


# --- Tracker -----------------------------------------------------------------
# DeepSORT combines a Kalman filter (motion prediction), the Hungarian
# algorithm (detection-to-track association) and an appearance embedding
# network (re-identification). Each tracked object receives a unique ID that
# stays stable even if the object briefly leaves the view or the camera moves
# slightly.
object_tracker = DeepSort()

# --- Detector ----------------------------------------------------------------
# Load the nano YOLOv5 weights through the ultralytics `YOLO` wrapper; the
# nano variant is the smallest/fastest, which matters for live webcam use.
# NOTE: at load time ultralytics prints a notice suggesting the 'u' variant
# ('yolov5nu.pt') of this checkpoint.
#
# YOLO(model, task=None):
#   model : path to the weights file. Pretrained families come in sizes
#           n (nano), s (small), m (medium), l (large), x (extra large).
#   task  : optional, one of "detect", "segment", "classify", ...
#           NOTE(review): when omitted it is presumably inferred from the
#           weights (e.g. '-seg' / '-cls' suffixes) - confirm in the docs.
# Training, validation, prediction and export are selected by calling the
# corresponding method on the model object rather than by a constructor
# argument; calling `model(frame)` runs prediction.
model = YOLO("yolo-Weights/yolov5n.pt")

# --- Video source ------------------------------------------------------------
# cv2.VideoCapture(source, apiPreference) reads a video frame by frame.
#   source        : int camera index (0 = default camera, 1 = second camera,
#                   ...) or a string path to a video file.
#   apiPreference : optional backend selector, e.g. cv2.CAP_ANY (auto, the
#                   default), cv2.CAP_DSHOW (Windows DirectShow),
#                   cv2.CAP_AVFOUNDATION (macOS).
cap = cv2.VideoCapture(0)

# Request a 640x480 capture size. The named constants are equal to the
# numeric property ids 3 (width) and 4 (height) but are self-describing.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# --- Classes -----------------------------------------------------------------
# model.names maps class index -> class name. For COCO-trained weights:
#   {0: "person", 1: "bicycle", 2: "car", ..., 79: "toothbrush"}
classNames = model.names

# Index of the class we want to detect and track. It is looked up by name so
# the script keeps working with weights whose class order differs from COCO;
# 67 (the COCO index of "cell phone") is used as a fallback when the name is
# not present.
PHONE_CLASS_NAME = "cell phone"
phone_class_index = next(
    (idx for idx, name in classNames.items() if name == PHONE_CLASS_NAME),
    67,
)

# Main capture loop: detect cell phones with YOLO, track them with DeepSORT
# and draw the results on every frame until the stream ends or 'q' is pressed.
#
# The try/finally guarantees that the camera and the display window are
# released even if an exception is raised inside the loop. In a notebook the
# kernel stays alive after an error, so without this the webcam would remain
# locked until the kernel is restarted.
try:
    # Only loop while the capture device is actually open.
    while cap.isOpened():
        # cap.read() returns:
        #   success : bool, whether a frame was read
        #   img     : the frame itself (only meaningful when success is True)
        success, img = cap.read()

        # Frame could not be read (camera unplugged / end of stream): stop.
        if not success:
            break

        # Run YOLO on the frame (preprocessing, inference, postprocessing).
        # stream=True makes the call return a generator of result objects
        # instead of a fully materialised list, which saves memory.
        results = model(img, stream=True)

        # Detections handed to DeepSORT, and the matching corner-format boxes
        # kept for drawing once the tracker has seen the clean frame.
        detections = []
        phone_boxes = []

        # Each result `r` exposes, per detected box:
        #   box.xyxy : corner coordinates (x1, y1, x2, y2)
        #   box.cls  : class index
        #   box.conf : confidence score
        for r in results:
            for box in r.boxes:
                cls = int(box.cls[0])

                # Keep only 'cell phone' detections.
                if cls != phone_class_index:
                    continue

                # Pixel coordinates are integer indices, so convert from the
                # float values returned by the model.
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])

                # DeepSORT expects ([left, top, width, height], confidence,
                # class) - NOT corner coordinates - and a plain float
                # confidence rather than a tensor element.
                detections.append(
                    ([x1, y1, x2 - x1, y2 - y1], float(box.conf[0]), cls)
                )
                phone_boxes.append((x1, y1, x2, y2))

        # Update the tracker BEFORE drawing anything: the frame is passed so
        # the tracker can crop each detection for its appearance embedding,
        # and those crops should not contain our own overlays.
        #
        # update_tracks matches the new detections to existing tracks, creates
        # tracks for unmatched detections and drops tracks that have not been
        # updated for a while. Each returned track offers is_confirmed(),
        # to_ltrb() ([left, top, right, bottom]) and track_id.
        tracks = object_tracker.update_tracks(detections, frame=img)

        # Draw the raw detections.
        font = cv2.FONT_HERSHEY_SIMPLEX
        fontScale = 0.7
        color = (0, 255, 0)
        thickness = 2
        for x1, y1, x2, y2 in phone_boxes:
            center_x = (x1 + x2) // 2
            center_y = (y1 + y2) // 2

            # cv2.rectangle(frame, top-left, bottom-right, colour, thickness)
            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)
            # cv2.circle(frame, centre, radius, colour, -1 => filled)
            cv2.circle(img, (center_x, center_y), 5, (0, 255, 0), -1)

            # Show the centre position just above the bounding box.
            # cv2.putText(frame, text, origin, font, scale, colour, thickness)
            center_text = f"Center: ({center_x}, {center_y})"
            org = (x1, y1 - 10)
            cv2.putText(img, center_text, org, font, fontScale, color, thickness)

        # Draw a red dot at the centre of every confirmed track.
        for track in tracks:
            if not track.is_confirmed():
                continue
            ltrb = track.to_ltrb()

            center_x = int((ltrb[0] + ltrb[2]) // 2)
            center_y = int((ltrb[1] + ltrb[3]) // 2)

            cv2.circle(img, (center_x, center_y), 5, (0, 0, 255), -1)

        # Show the annotated frame.
        cv2.imshow('Webcam', img)

        # Press 'q' to quit. Masking with 0xFF keeps only the low byte of the
        # key code, which some platforms pad with modifier bits.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and close all windows, whatever happened above.
    cap.release()
    cv2.destroyAllWindows()


PRO TIP  Replace 'model=yolo-Weights/yolov5n.pt' with new 'model=yolo-Weights/yolov5nu.pt'.
YOLOv5 'u' models are trained with https://github.com/ultralytics/ultralytics and feature improved performance vs standard YOLOv5 models trained with https://github.com/ultralytics/yolov5.

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov5nu.pt to 'yolo-Weights\yolov5nu.pt'...


100%|██████████| 5.31M/5.31M [00:06<00:00, 887kB/s] 



0: 480x640 1 person, 5111.1ms
Speed: 99.2ms preprocess, 5111.1ms inference, 53.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 2721.3ms
Speed: 72.4ms preprocess, 2721.3ms inference, 28.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 3268.7ms
Speed: 18.4ms preprocess, 3268.7ms inference, 12.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 2817.5ms
Speed: 43.4ms preprocess, 2817.5ms inference, 14.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 3053.9ms
Speed: 29.8ms preprocess, 3053.9ms inference, 27.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 2727.8ms
Speed: 20.3ms preprocess, 2727.8ms inference, 16.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 2683.1ms
Speed: 21.7ms preprocess, 2683.1ms inference, 15.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 2746.9ms
Spe