In [None]:
# Mount Google Drive into the Colab filesystem; the project directory used
# by the following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch into the project directory on the mounted Drive and remember its
# absolute path in `home`; later cells build every file path from `home`.
import os 
os.chdir("/content/drive/MyDrive/AI projects/1. Car Counter")
home = os.getcwd()
# Bare last expression so the notebook displays the resolved path.
home

'/content/drive/MyDrive/AI projects/1. Car Counter'

In [None]:
!pip install loguru
!pip install lap
!pip install ultralytics


!git clone https://github.com/ifzhang/ByteTrack.git
%cd {home}/ByteTrack
!sed -i 's/onnx==1.8.1/onnx==1.9.0/g' requirements.txt

!pip3 install -q -r requirements.txt
!python3 setup.py -q develop
!pip install -q cython_bbox
!pip install -q onemetric

!pip install supervision==0.1.0

!pip install cvzone

Installing collected packages: cvzone
Successfully installed cvzone-1.5.6


In [None]:
# Checking everything.
import ultralytics 
# checks() prints its own environment report and returns None, so it is called
# bare: wrapping it in print() only appended a stray "None" to the output.
ultralytics.checks()

import supervision
print(f"Supervision version = {supervision.__version__}")
 
import yolox
print(f"yolox verison = {yolox.__version__}")

Ultralytics YOLOv8.0.90 🚀 Python-3.10.11 torch-2.0.0+cu118 CUDA:0 (Tesla T4, 15102MiB)
Setup complete ✅ (2 CPUs, 12.7 GB RAM, 23.5/78.2 GB disk)


None
Supervision version = 0.1.0
yolox verison = 0.1.0


In [None]:
from supervision.video.source import get_video_frames_generator
from supervision.draw.color import ColorPalette,Color
from supervision.notebook.utils import show_frame_in_notebook
from supervision.tools.detections import Detections,BoxAnnotator
from supervision.video.sink import VideoSink # To save the video.
from supervision.video.dataclasses import VideoInfo
from supervision.tools.line_counter import LineCounter, LineCounterAnnotator
from supervision.geometry.dataclasses import Point
from tqdm.notebook import tqdm
import numpy as np
from ultralytics import YOLO

In [None]:
from yolox.tracker.byte_tracker import BYTETracker, STrack
from onemetric.cv.utils.iou import box_iou_batch
from dataclasses import dataclass


@dataclass(frozen=True)
class BYTETrackerArgs:
    """Immutable settings bundle, presumably passed to ``BYTETracker`` as its args object.

    The field names mirror the attributes the yolox tracker reads; their exact
    semantics are defined by that library, not by this notebook — TODO confirm
    against the ByteTrack sources before tuning.
    """
    # presumably the confidence threshold used when associating detections — verify
    track_thresh: float = 0.25
    # presumably how many frames a lost track is kept alive — verify
    track_buffer: int = 30
    # presumably the matching threshold used when associating boxes to tracks — verify
    match_thresh: float = 0.8
    # presumably boxes with a larger aspect ratio are discarded — verify
    aspect_ratio_thresh: float = 3.0
    # presumably boxes with a smaller area are discarded — verify
    min_box_area: float = 1.0
    # presumably toggles MOT20-specific behaviour inside the tracker — verify
    mot20: bool = False

In [None]:
from typing import List

import numpy as np


# converts Detections into format that can be consumed by match_detections_with_tracks function
def detections2boxes(detections: Detections) -> np.ndarray:
    """Pack box coordinates and confidences into one array.

    Each output row is ``[xmin, ymin, xmax, ymax, confidence]``: the four
    coordinates come from ``detections.xyxy`` and the fifth value from
    ``detections.confidence``.
    """
    # column_stack treats the 1-D confidence vector as a single extra column,
    # so it ends up to the right of the four coordinate columns.
    return np.column_stack((detections.xyxy, detections.confidence))


# converts List[STrack] into format that can be consumed by match_detections_with_tracks function
# This will turn the tracks in to xmin,ymin,xmax,ymax
def tracks2boxes(tracks: List[STrack]) -> np.ndarray:
    """Stack the ``tlbr`` box of every track into one float array.

    Args:
        tracks: Tracks exposing a ``tlbr`` attribute holding
            (xmin, ymin, xmax, ymax).

    Returns:
        Float array of shape ``(len(tracks), 4)``. An empty ``tracks`` list
        yields shape ``(0, 4)`` so the result is always two-dimensional.
    """
    boxes = np.array([track.tlbr for track in tracks], dtype=float)
    # Without the reshape an empty list would produce shape (0,), which is
    # not a valid box array for downstream pairwise computations.
    return boxes.reshape(-1, 4)







# Matches our bounding boxes with the tracker's predictions.
def match_detections_with_tracks(
    detections: Detections, 
    tracks: List[STrack]
) -> list:
    """Assign a tracker id to each detection based on bounding-box overlap.

    The tracks are converted to boxes with ``tracks2boxes`` and the
    intersection over union (IoU) between every track box and every
    detection box is computed with ``box_iou_batch``. Each track is then
    paired with the detection it overlaps most (``np.argmax`` per track),
    and its ``track_id`` is stored at that detection's index provided the
    IoU is not zero. When several tracks pick the same detection, the track
    processed last wins.

    Args:
        detections: Detections of the current frame; ``detections.xyxy``
            and ``len(detections)`` are used.
        tracks: Tracks of the current frame, each exposing ``tlbr`` and
            ``track_id``.

    Returns:
        A list with exactly one entry per detection: the matched track id,
        or ``None`` when no track overlaps that detection. The list always
        has ``len(detections)`` entries, including when there are no tracks
        or no usable detections, so it can be used directly to build a
        per-detection mask.
    """
    tracker_ids = [None] * len(detections)

    # Nothing to match: return the all-None list rather than a zero-length
    # array, so the result still lines up with the detections.
    if not np.any(detections.xyxy) or len(tracks) == 0:
        return tracker_ids

    tracks_boxes = tracks2boxes(tracks=tracks)
    iou = box_iou_batch(tracks_boxes, detections.xyxy)
    # For every track (row), the index of its best-overlapping detection.
    track2detection = np.argmax(iou, axis=1)

    for tracker_index, detection_index in enumerate(track2detection):
        # A zero IoU means the "best" detection does not overlap at all.
        if iou[tracker_index, detection_index] != 0:
            tracker_ids[detection_index] = tracks[tracker_index].track_id

    return tracker_ids

In [None]:
# Settings
# Endpoints of a counting line, currently commented out:
# LINE_START = Point(0,550)
# LINE_END = Point(1920,100)

# Path the annotated result video is written to; built from `home`, the project
# directory captured by the os.getcwd() cell near the top of the notebook.
TARGET_VIDEO_PATH = f"{home}/Videos/WIN_20230430_20_05_38_Pro-RESULT101.mp4"

# The Flow
1. Video Source
2. Model
3. model.fuse()
4. class_id (the ones you want to be detected)
5. class_names_dict (the names of the classes)
6. initialize byte tracker
7. video_info(takes in source path)
8. create generator
9. create line counter instance
10. box_annotator instance
11. line annotator instance

12. with VideoSink(target_video, video_info) as sink:
13. loop over frames
14. result
15. detections
16. filtering out detections with unwanted classes

17. tracking detections
18. extracting tracker id
19. filtering out detections without trackers
20. Labels
21. updating line counter
22. make bbox
23. make line
24. sink.write to save the video.

In [None]:
import cv2
import cvzone



# video source
video_source_path = f"{home}/Videos/WIN_20230430_20_05_38_Pro.mp4"

# model
model = YOLO(f"{home}/yolo_weights/best.pt")
model.fuse()

# video_info
video_info = VideoInfo.from_video_path(video_source_path)

# generator
generator = get_video_frames_generator(video_source_path)

# CLASS ID: only detections of these classes are kept.
CLASS_ID = [4,6]

# CLASS_NAMES_DICT: class id -> class name, taken from the loaded model.
CLASS_NAMES_DICT = model.model.names

# Both annotators are built once, outside the frame loop. Each frame picks one
# of them, so `box_annotator` is always bound. Previously a first label other
# than "Mask"/"NO-Mask" left it unassigned (NameError on the first frame) or
# silently reused the annotator of an earlier frame.
mask_annotator = BoxAnnotator(color = Color(0,255,0), thickness = 1, text_thickness = 1, text_scale = 0.5)
default_annotator = BoxAnnotator(color = Color(255,0,0), thickness = 1, text_thickness = 1, text_scale = 0.5)

with VideoSink(TARGET_VIDEO_PATH , video_info) as sink:
  # loop over frames
  for frame in tqdm(generator , total = video_info.total_frames):
    # results
    results = model(frame)
    # detections
    detections = Detections(
        xyxy = results[0].boxes.xyxy.cpu().numpy(),
        confidence = results[0].boxes.conf.cpu().numpy(),
        class_id = results[0].boxes.cls.cpu().numpy().astype(int)
    )

    # filtering: keep only detections whose class is listed in CLASS_ID
    mask = np.isin(detections.class_id, CLASS_ID)
    detections.filter(mask=mask , inplace= True)

    # one "<class name> <confidence>" label per remaining detection
    labels = [
        f"{CLASS_NAMES_DICT[class_id]} {confidence:0.5f}"
        for _, confidence , class_id, tracker_id in detections
    ]

    # The first label decides the colour for the whole frame: "Mask" uses
    # mask_annotator; anything else (including no detections) uses the default.
    if labels and labels[0].split()[0] == "Mask":
      box_annotator = mask_annotator
    else:
      box_annotator = default_annotator

    box_annotator.annotate(frame,detections = detections , labels = labels)
    sink.write_frame(frame)

Model summary (fused): 268 layers, 43625883 parameters, 0 gradients, 164.9 GFLOPs


  0%|          | 0/538 [00:00<?, ?it/s]


0: 384x640 1 NO-Hardhat, 1 NO-Mask, 1 NO-Safety Vest, 1 Person, 42.2ms
Speed: 2.6ms preprocess, 42.2ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 NO-Hardhat, 1 NO-Mask, 1 NO-Safety Vest, 1 Person, 41.0ms
Speed: 3.7ms preprocess, 41.0ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 NO-Hardhat, 1 NO-Mask, 1 NO-Safety Vest, 1 Person, 28.4ms
Speed: 3.4ms preprocess, 28.4ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 NO-Hardhat, 1 NO-Mask, 1 NO-Safety Vest, 1 Person, 28.6ms
Speed: 3.0ms preprocess, 28.6ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 NO-Hardhat, 1 NO-Mask, 1 NO-Safety Vest, 1 Person, 28.8ms
Speed: 3.1ms preprocess, 28.8ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 NO-Hardhat, 1 NO-Mask, 1 NO-Safety Vest, 1 Person, 27.5ms
Speed: 3.0ms preprocess, 27.5ms inference, 1.4ms postprocess per image at shape (1,