In [1]:
# Install the required packages
!pip install opencv-python numpy ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.130-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [2]:
from google.colab import files
uploaded = files.upload()

Saving vidCV.mp4 to vidCV.mp4


In [3]:
import cv2
import numpy as np
import time
from ultralytics import YOLO
import math
from scipy.optimize import linear_sum_assignment

model = YOLO('yolov8n.pt')
model.overrides['classes'] = [0]  # Only detect persons

def get_floor_point(box, frame_height):
    x1, y1, x2, y2 = box
    if y2 > frame_height - 10:
        person_height = y2 - y1
        estimated_floor = min(y2, frame_height - 2)
        return int((x1 + x2) / 2), estimated_floor
    return int((x1 + x2) / 2), y2

def process_video(input_path, output_path, num_persons=3):
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        print(f"Error opening video: {input_path}")
        return False

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

    colors = [(0,0,255), (0,255,0), (255,0,0)]
    person_tracks = []
    path_history = [[] for _ in range(num_persons)]
    person_features = [None] * num_persons
    hist_bins = [8, 8, 8]

    frame_count = 0
    initialization_phase = True

    while True:
        ret, frame = cap.read()
        if not ret: break

        frame_count += 1
        display_frame = frame.copy()
        results = model(frame)

        current_detections = []
        for box in results[0].boxes.cpu().numpy():
            if box.cls[0] == 0 and box.conf[0] > 0.7:
                x1,y1,x2,y2 = box.xyxy[0].astype(int)
                if (x2-x1) < 20 or (y2-y1) < 50 or (y2-y1) < (x2-x1)*1.2:
                    continue

                lower_y1 = int(y1 + (y2-y1)*0.3)
                person_roi = frame[lower_y1:y2, x1:x2]
                feature = cv2.calcHist([person_roi], [0,1,2], None, hist_bins, [0,256]*3)
                feature = cv2.normalize(feature, feature).flatten()

                current_detections.append({
                    'box': (x1,y1,x2,y2),
                    'floor_point': get_floor_point((x1,y1,x2,y2), frame_height),
                    'centroid': ((x1+x2)//2, (y1+y2)//2),
                    'area': (x2-x1)*(y2-y1),
                    'feature': feature,
                    'matched': False
                })

        current_detections.sort(key=lambda x: x['area'], reverse=True)

        if initialization_phase and len(current_detections) >= num_persons:
            for i in range(num_persons):
                person_tracks.append({
                    'id': i,
                    **current_detections[i],
                    'missing_count': 0
                })
                person_features[i] = current_detections[i]['feature']
                path_history[i].append(current_detections[i]['floor_point'])
            initialization_phase = False
            continue

        for track in person_tracks: track['matched'] = False

        cost_matrix = np.zeros((len(person_tracks), len(current_detections)))
        for i, track in enumerate(person_tracks):
            for j, det in enumerate(current_detections):
                app_sim = cv2.compareHist(track['feature'], det['feature'], cv2.HISTCMP_CORREL)
                tx,ty = track['centroid']
                dx,dy = det['centroid']
                spatial_dist = math.sqrt((tx-dx)**2 + (ty-dy)**2)/math.sqrt(frame_width**2 + frame_height**2)
                cost_matrix[i,j] = (0.7*(1-max(0,app_sim))) + (0.3*spatial_dist)

        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        for r,c in zip(row_ind, col_ind):
            if cost_matrix[r,c] < 0.6:
                track = person_tracks[r]
                det = current_detections[c]
                track.update({k: det[k] for k in ['box','floor_point','centroid','area']})
                track['feature'] = 0.7*track['feature'] + 0.3*det['feature']
                track['feature'] /= np.sum(track['feature']) if np.sum(track['feature']) > 0 else 1
                person_features[track['id']] = track['feature']
                track['matched'], det['matched'] = True, True
                track['missing_count'] = 0
                path_history[track['id']].append(track['floor_point'])

        for track in person_tracks:
            if not track['matched']:
                track['missing_count'] += 1
                if track['missing_count'] <= 30:
                    path_history[track['id']].append(track['floor_point'])

        for track in person_tracks:
            if track['missing_count'] > 30: continue
            person_id = track['id']
            color = colors[person_id % len(colors)]

            if track['matched']:
                x1,y1,x2,y2 = track['box']
                cv2.rectangle(display_frame, (x1,y1), (x2,y2), color, 2)
                cv2.putText(display_frame, f"Person {person_id}", (x1,y1-10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
                cv2.circle(display_frame, track['floor_point'], 3, color, -1)

            path = path_history[person_id]
            if len(path) > 1:
                path_array = np.array(path, dtype=np.int32).reshape((-1,1,2))
                cv2.polylines(display_frame, [path_array], False, color, 4, cv2.LINE_AA)

        out.write(display_frame)

    cap.release()
    out.release()
    print(f"Processing complete! Output saved to {output_path}")
    return True

# Process video
process_video("vidCV.mp4", "tracked_output.mp4", num_persons=3)

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 225MB/s]



0: 384x640 3 persons, 315.4ms
Speed: 17.0ms preprocess, 315.4ms inference, 36.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 141.7ms
Speed: 3.9ms preprocess, 141.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 144.5ms
Speed: 3.7ms preprocess, 144.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 139.7ms
Speed: 3.6ms preprocess, 139.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 165.2ms
Speed: 3.5ms preprocess, 165.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 141.1ms
Speed: 3.7ms preprocess, 141.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 158.3ms
Speed: 2.9ms preprocess, 158.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 138.0ms
Speed: 3.6ms preprocess, 138.0ms inference, 1.4ms postprocess pe

True

**Setup**

- Load YOLOv8 model (persons only)

- Open input/output videos

**Per Frame Processing**

- Detect persons using YOLO

- Extract features (color histograms from lower body)

**Tracking Logic**

- Initialize: First N detections become tracked persons

- Match: Compare new detections to existing tracks using:

- 70% appearance similarity (histograms)

- 30% position proximity

- Update: Refresh matched tracks with new positions/features

- Handle occlusions: Keep lost tracks for 30 frames

**Visualization**

- Draw bounding boxes with IDs

- Plot floor points and movement paths

**Output**

- Save processed video with tracking visuals