In [22]:
import cv2
import numpy as np

def process_video(input_video_path, output_video_path):
    # Open the video file
    cap = cv2.VideoCapture(input_video_path)
    
    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec
    
    # Define the codec and create VideoWriter object
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Convert frame to grayscale (if not already)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        
        # Apply a threshold to get binary image
        _, binary = cv2.threshold(gray, 50, 255, cv2.THRESH_BINARY)
        
        # Find contours (objects) in the binary image
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        
        # Draw bounding boxes around detected objects based on size
        for contour in contours:
            area = cv2.contourArea(contour)
            if area > 50:  # Minimum area threshold to filter out small objects/noise
                x, y, w, h = cv2.boundingRect(contour)
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)  # Red bounding box
            
        # Write the frame into the output video
        out.write(frame)
    
    # Release everything
    cap.release()
    out.release()


# Example usage:
input_video_path = '/Users/adil/Desktop/Codes/Image Captioning/depth_estimate_MiDaS/output_depth_maps/output_depth_map_video.mov'
output_video_path = 'path_to_output_video.mp4'
process_video(input_video_path, output_video_path)

In [2]:
import cv2
import torch
import numpy as np

# Load YOLOv5 model (you can choose a different model from yolov5s, yolov5m, yolov5l, etc.)
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Initialize video capture (laptop camera)
cap = cv2.VideoCapture(0)

# Dictionary to store previous positions of detected objects
prev_positions = {}

# Unique ID counter for tracked objects
object_id_counter = 0

# Function to calculate the center of the bounding box
def get_center(bbox):
    x_min, y_min, x_max, y_max = bbox
    return int((x_min + x_max) / 2), int((y_min + y_max) / 2)

# Track object positions and draw trajectories
def track_objects(frame, detections):
    global object_id_counter
    current_positions = {}

    for *box, conf, cls in detections:
        x_min, y_min, x_max, y_max = map(int, box)
        label = model.names[int(cls)]
        center = get_center((x_min, y_min, x_max, y_max))

        # Find the object that is closest to the current center
        min_dist = float('inf')
        min_id = None
        for obj_id, prev_center in prev_positions.items():
            dist = np.linalg.norm(np.array(center) - np.array(prev_center))
            if dist < min_dist:
                min_dist = dist
                min_id = obj_id

        if min_dist < 50:  # Threshold for considering it the same object
            current_positions[min_id] = center
            prev_center = prev_positions[min_id]
            cv2.arrowedLine(frame, prev_center, center, (255, 0, 0), 3, tipLength=0.5)
        else:
            current_positions[object_id_counter] = center
            object_id_counter += 1

        # Draw the bounding box and label
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
        cv2.putText(frame, f'{label} {conf:.2f}', (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return current_positions

# Main loop to process video frames
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # YOLOv5 inference on the frame
    results = model(frame)

    # Extract bounding boxes, confidence scores, and class labels
    detections = results.xyxy[0].cpu().numpy()

    # Track objects and update positions
    prev_positions = track_objects(frame, detections)

    # Display the frame
    cv2.imshow('YOLOv5 Object Tracking', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
cv2.destroyAllWindows()


Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to /Users/adil/.cache/torch/hub/master.zip


[31m[1mrequirements:[0m Ultralytics requirement ['setuptools>=70.0.0'] not found, attempting AutoUpdate...
Collecting setuptools>=70.0.0
  Downloading setuptools-73.0.1-py3-none-any.whl.metadata (6.6 kB)
Downloading setuptools-73.0.1-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 69.0.2
    Uninstalling setuptools-69.0.2:
      Successfully uninstalled setuptools-69.0.2
Successfully installed setuptools-73.0.1

[31m[1mrequirements:[0m AutoUpdate success ✅ 3.0s, installed 1 package: ['setuptools>=70.0.0']
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
YOLOv5 🚀 2024-8-26 Python-3.11.7 torch-2.4.0 CPU

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...
100%|██████████| 14.1M/14.1M [00:04<00:00, 3.29MB/s]

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast

: 

In [1]:
import cv2
import torch
import numpy as np
from collections import deque

# ==========================
# Tracker Class Definition
# ==========================
class Tracker:
    def __init__(self, max_lost=30, distance_threshold=50):
        """
        Initializes the tracker with parameters.

        :param max_lost: Maximum number of frames object was not detected.
        :param distance_threshold: Maximum distance to consider for matching.
        """
        self.next_object_id = 0
        self.objects = dict()  # object_id: (center_x, center_y)
        self.lost = dict()     # object_id: number of consecutive frames lost
        self.max_lost = max_lost
        self.distance_threshold = distance_threshold
        self.trajectories = dict()  # object_id: deque of positions
        self.colors = dict()  # object_id: (B, G, R)

    def assign_colors(self, object_id):
        """
        Assigns a unique color to each object.

        :param object_id: Unique identifier for the object.
        :return: Tuple representing color in BGR.
        """
        np.random.seed(object_id)
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        self.colors[object_id] = color
        return color

    def update(self, detections):
        """
        Updates tracker with new detections.

        :param detections: List of detected object centers [(x, y), ...]
        :return: Dictionary of updated objects.
        """
        updated_objects = dict()

        if len(self.objects) == 0:
            # No existing objects, assign all detections to new objects
            for center in detections:
                self.objects[self.next_object_id] = center
                self.lost[self.next_object_id] = 0
                self.trajectories[self.next_object_id] = deque(maxlen=64)
                self.trajectories[self.next_object_id].append(center)
                self.assign_colors(self.next_object_id)
                self.next_object_id += 1
        else:
            # Match detections to existing objects
            object_ids = list(self.objects.keys())
            object_centers = list(self.objects.values())

            # Compute distance matrix between existing objects and new detections
            D = np.linalg.norm(
                np.array(object_centers)[:, np.newaxis] - np.array(detections), axis=2
            )

            # For each existing object, find the closest detection
            rows = D.min(axis=1).argsort()
            cols = D.argmin(axis=1)[rows]

            assigned_detections = set()
            assigned_objects = set()

            for row, col in zip(rows, cols):
                if row in assigned_objects or col in assigned_detections:
                    continue
                if D[row, col] > self.distance_threshold:
                    continue
                object_id = object_ids[row]
                self.objects[object_id] = detections[col]
                self.lost[object_id] = 0
                self.trajectories[object_id].append(detections[col])
                updated_objects[object_id] = detections[col]
                assigned_objects.add(row)
                assigned_detections.add(col)

            # Increment lost count for unmatched existing objects
            for row, object_id in enumerate(object_ids):
                if row not in assigned_objects:
                    self.lost[object_id] += 1
                    if self.lost[object_id] > self.max_lost:
                        del self.objects[object_id]
                        del self.lost[object_id]
                        del self.trajectories[object_id]
                        del self.colors[object_id]

            # Assign remaining detections to new objects
            for i, center in enumerate(detections):
                if i not in assigned_detections:
                    self.objects[self.next_object_id] = center
                    self.lost[self.next_object_id] = 0
                    self.trajectories[self.next_object_id] = deque(maxlen=64)
                    self.trajectories[self.next_object_id].append(center)
                    self.assign_colors(self.next_object_id)
                    self.next_object_id += 1

        return self.objects

# ==========================
# Helper Functions
# ==========================
def get_center(bbox):
    """
    Calculates the center of a bounding box.

    :param bbox: Tuple or list (x_min, y_min, x_max, y_max)
    :return: Tuple (center_x, center_y)
    """
    x_min, y_min, x_max, y_max = bbox
    center_x = int((x_min + x_max) / 2)
    center_y = int((y_min + y_max) / 2)
    return (center_x, center_y)

def draw_map(map_img, trajectories, colors, map_scale=1.0):
    """
    Draws trajectories on the map image.

    :param map_img: The map image to draw on.
    :param trajectories: Dictionary of object_id to deque of positions.
    :param colors: Dictionary of object_id to color tuples.
    :param map_scale: Scaling factor for positions.
    :return: None. The map_img is modified in place.
    """
    for object_id, points in trajectories.items():
        color = colors[object_id]
        for i in range(1, len(points)):
            pt1 = (int(points[i-1][0] * map_scale), int(points[i-1][1] * map_scale))
            pt2 = (int(points[i][0] * map_scale), int(points[i][1] * map_scale))
            cv2.line(map_img, pt1, pt2, color, 2)
        # Draw the latest position
        if len(points) > 0:
            cv2.circle(map_img, (int(points[-1][0] * map_scale), int(points[-1][1] * map_scale)), 5, color, -1)

# ==========================
# Main Function
# ==========================
def main():
    # Load YOLOv5 model from torch.hub
    print("Loading YOLOv5 model...")
    model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
    model.conf = 0.4  # confidence threshold
    print("Model loaded.")

    # Initialize video capture (0 for default camera)
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open video capture.")
        return

    # Get frame dimensions
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Initialize tracker
    tracker = Tracker(max_lost=30, distance_threshold=80)

    # Initialize a blank map image (same size as frame or scaled)
    map_scale = 0.5  # Scale down for the map if needed
    map_width = int(frame_width * map_scale)
    map_height = int(frame_height * map_scale)
    map_img = np.ones((map_height, map_width, 3), dtype=np.uint8) * 255  # White background

    # Main loop
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame.")
            break

        # Perform detection
        results = model(frame)

        # Parse detection results
        detections = results.xyxy[0].cpu().numpy()  # (x_min, y_min, x_max, y_max, conf, cls)

        # Extract bounding boxes with confidence above threshold
        boxes = []
        for *bbox, conf, cls in detections:
            if conf < model.conf:
                continue
            boxes.append([int(coord) for coord in bbox])

        # Get centers of detected objects
        centers = [get_center(box) for box in boxes]

        # Update tracker with detected centers
        tracked_objects = tracker.update(centers)

        # Draw bounding boxes and labels on the frame
        for i, box in enumerate(boxes):
            center = centers[i]
            # Find which tracked object this detection corresponds to
            matched_id = None
            for object_id, obj_center in tracked_objects.items():
                if obj_center == center:
                    matched_id = object_id
                    break
            if matched_id is not None:
                color = tracker.colors[matched_id]
                cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), color, 2)
                label = f'ID {matched_id}'
                cv2.putText(frame, label, (box[0], box[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
        
        # Draw trajectories on the map
        map_img.fill(255)  # Clear the map
        draw_map(map_img, tracker.trajectories, tracker.colors, map_scale=map_scale)

        # Display the frames
        cv2.imshow('Video Feed', frame)
        cv2.imshow('Trajectory Map', map_img)

        # Exit on pressing 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    # Release resources
    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()


Loading YOLOv5 model...


Using cache found in /Users/adil/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-8-26 Python-3.11.7 torch-2.4.0 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Model loaded.


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

ValueError: operands could not be broadcast together with shapes (1,1,2) (0,) 

: 

In [1]:
import cv2
import torch
import numpy as np

# Load YOLOv5 model
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Initialize video capture (laptop camera)
cap = cv2.VideoCapture(0)

# Initialize a blank trajectory map
map_height, map_width = 500, 500
trajectory_map = np.zeros((map_height, map_width, 3), dtype=np.uint8)

# Dictionary to store previous positions of detected objects
prev_positions = {}

# Unique ID counter for tracked objects
object_id_counter = 0

# Function to calculate the center of the bounding box
def get_center(bbox):
    x_min, y_min, x_max, y_max = bbox
    return int((x_min + x_max) / 2), int((y_min + y_max) / 2)

# Scale center position to fit in the trajectory map
def scale_to_map(center, frame_shape, map_shape):
    frame_height, frame_width = frame_shape[:2]
    map_height, map_width = map_shape[:2]
    scaled_x = int(center[0] * map_width / frame_width)
    scaled_y = int(center[1] * map_height / frame_height)
    return scaled_x, scaled_y

# Track object positions and draw trajectories
def track_objects(frame, detections, trajectory_map):
    global object_id_counter
    current_positions = {}

    for *box, conf, cls in detections:
        x_min, y_min, x_max, y_max = map(int, box)
        label = model.names[int(cls)]
        center = get_center((x_min, y_min, x_max, y_max))

        # Scale the center to the trajectory map
        scaled_center = scale_to_map(center, frame.shape, trajectory_map.shape)

        # Find the object that is closest to the current center
        min_dist = float('inf')
        min_id = None
        for obj_id, prev_center in prev_positions.items():
            dist = np.linalg.norm(np.array(center) - np.array(prev_center))
            if dist < min_dist:
                min_dist = dist
                min_id = obj_id

        if min_dist < 50:  # Threshold for considering it the same object
            current_positions[min_id] = center
            prev_center = prev_positions[min_id]
            scaled_prev_center = scale_to_map(prev_center, frame.shape, trajectory_map.shape)

            # Draw trajectory line on the map
            cv2.line(trajectory_map, scaled_prev_center, scaled_center, (255, 0, 0), 2)
        else:
            current_positions[object_id_counter] = center
            object_id_counter += 1

        # Draw the bounding box and label on the main frame
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
        cv2.putText(frame, f'{label} {conf:.2f}', (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Mark the current object on the trajectory map
        cv2.circle(trajectory_map, scaled_center, 5, (0, 255, 0), -1)

    return current_positions

# Main loop to process video frames
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # YOLOv5 inference on the frame
    results = model(frame)

    # Extract bounding boxes, confidence scores, and class labels
    detections = results.xyxy[0].cpu().numpy()

    # Track objects and update positions
    prev_positions = track_objects(frame, detections, trajectory_map)

    # Resize trajectory map to match the frame's height
    resized_map = cv2.resize(trajectory_map, (frame.shape[1], frame.shape[0]))

    # Display the original frame and the trajectory map side by side
    combined_display = np.hstack((frame, resized_map))
    cv2.imshow('YOLOv5 Object Tracking with Trajectory Map', combined_display)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
cv2.destroyAllWindows()


Using cache found in /Users/adil/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-8-26 Python-3.11.7 torch-2.4.0 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autoca

: 