In [5]:
import torch
import torchvision.transforms as T
from PIL import Image, ImageDraw, ImageFont
import requests
import matplotlib.pyplot as plt

# --- Configuration ---
# Sample picture to run detection on: the "zidane.jpg" demo image hosted in the
# ultralytics/yolov5 GitHub repository.
IMAGE_URL = 'https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/zidane.jpg'
# Name of the pre-trained YOLOv5 variant to fetch from PyTorch Hub
# (the trailing 's' stands for "small", a good starting point).
MODEL = "yolov5s"

def _show_image(image, title):
    """Display an image (PIL image or HxWx3 array) with matplotlib.

    Used instead of PIL's ``Image.show()``, which hands the picture to an
    external viewer program and therefore shows nothing in a headless
    notebook runtime.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(image)
    ax.set_title(title)
    ax.axis("off")
    plt.show()


def main():
    """Run a single-image YOLOv5 detection demo and visualize the results.

    Steps:
      1. Load the pre-trained model named by ``MODEL`` from PyTorch Hub.
      2. Download the image at ``IMAGE_URL`` and convert it to RGB.
      3. Run inference on that one image.
      4. Show the detections as drawn by the model's ``results.render()``.
      5. Print the predictions DataFrame and redraw the boxes manually with
         PIL on a copy of the input image.

    Returns:
        None. A failure while loading the model or the image is reported with
        a printed message and ends the run early instead of raising.
    """
    # Local import: only needed here, to wrap the downloaded bytes for PIL.
    import io

    print("Starting Project 2: 2D Object Detection with YOLO")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # --- 1. Load the Model ---
    # PyTorch Hub makes it easy to load pre-trained models.
    try:
        model = torch.hub.load('ultralytics/yolov5', MODEL, pretrained=True)
        model.to(device)  # Move model to GPU if available
        model.eval()  # Set the model to evaluation mode
        print(f"YOLOv5 model '{MODEL}' loaded successfully.")
    except Exception as e:
        # Deliberately broad: hub loading can fail for network, cache or
        # dependency reasons, and the demo just reports and stops.
        print(f"Error loading model: {e}")
        return

    # --- 2. Load and Prepare the Image ---
    print(f"Loading image from: {IMAGE_URL}")
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(IMAGE_URL, timeout=30)
        response.raise_for_status()
        # Decode from the fully downloaded body: `response.raw` bypasses
        # requests' content decoding, so a compressed reply would hand PIL
        # undecodable bytes.
        input_image = Image.open(io.BytesIO(response.content)).convert("RGB")
    except (requests.exceptions.RequestException, OSError) as e:
        # OSError also covers PIL failing to identify/decode the payload.
        print(f"Error loading image: {e}")
        return

    # --- 3. Perform Inference ---
    # The model expects a list of images.
    results = model([input_image])
    print("Inference complete.")

    # --- 4. Process and Visualize Results (The Easy Way) ---
    # The 'results' object contains the bounding box coordinates, confidence
    # scores, and class labels. Its .render() method returns one annotated
    # NumPy image per input image.
    print("Visualizing results with the .render() method...")
    rendered_image_np = results.render()[0]
    _show_image(rendered_image_np, "YOLOv5 Detection Results (Rendered)")

    # --- BONUS: Manual Visualization ---
    # For more control, you can access the raw prediction data.
    # The predictions are in a pandas DataFrame.
    predictions_df = results.pandas().xyxy[0]
    print("\nRaw Predictions (DataFrame):")
    print(predictions_df)

    # Create a copy of the original image to draw on
    manual_image = input_image.copy()
    draw = ImageDraw.Draw(manual_image)

    # Try to load a font, fall back to default if not found
    try:
        font = ImageFont.truetype("arial.ttf", 15)
    except IOError:
        font = ImageFont.load_default()

    # Walk the detections and draw each box and label with ImageDraw.
    for det in predictions_df.itertuples(index=False):
        xmin, ymin, xmax, ymax = int(det.xmin), int(det.ymin), int(det.xmax), int(det.ymax)
        label = f"{det.name} {det.confidence:.2f}"

        # Draw bounding box
        draw.rectangle([xmin, ymin, xmax, ymax], outline="red", width=2)

        # Measure the label, then place its banner just above the box. Clamp
        # at the top edge so the label of a box touching the top of the image
        # stays visible instead of being drawn off-canvas.
        text_box = draw.textbbox((0, 0), label, font=font)
        text_right, text_bottom = text_box[2], text_box[3]
        banner_height = text_bottom + 5
        banner_top = max(ymin - banner_height, 0)
        draw.rectangle(
            [xmin, banner_top, xmin + text_right, banner_top + banner_height],
            fill="red",
        )
        draw.text((xmin, banner_top), label, fill="white", font=font)

    print("\nVisualizing results with manual drawing...")
    _show_image(manual_image, "YOLOv5 Detection Results (Manual)")


# Entry point: run the detection demo when this cell/file is executed directly
# (as opposed to being imported as a module).
if "__main__" == __name__:
    main()


Starting Project 2: 2D Object Detection with YOLO
Using device: cuda


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-9-20 Python-3.12.11 torch-2.8.0+cu126 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


YOLOv5 model 'yolov5s' loaded successfully.
Loading image from: https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/zidane.jpg


  with amp.autocast(autocast):


Inference complete.
Visualizing results with the .render() method...

Raw Predictions (DataFrame):
         xmin        ymin         xmax        ymax  confidence  class    name
0  743.290527   48.343597  1141.756470  720.000000    0.879861      0  person
1  441.989624  437.336670   496.585083  710.036255    0.675119     27     tie
2  123.051208  193.238007   714.690491  719.771240    0.666694      0  person
3  978.989807  313.579468  1025.302856  415.526184    0.261517     27     tie

Visualizing results with manual drawing...


In [None]:
# In a Colab notebook, run this single command in a cell BEFORE running this script.
# This installs a modern, well-maintained tracking library.
!pip install deep-sort-realtime
!pip install ultralytics

In [5]:
import torch
from PIL import Image, ImageDraw, ImageFont
import requests
import cv2
import numpy as np
import matplotlib.pyplot as plt
from deep_sort_realtime.deepsort_tracker import DeepSort
import os
# Imports for Colab video display
from IPython.display import HTML
from base64 import b64encode
import warnings

# --- Suppress YOLOv5 FutureWarnings ---
# This is a clean way to handle the noise from the older library.
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.cuda.amp.autocast")

# --- Configuration ---
VIDEO_PATH = "test_video.mp4" # Path to your uploaded video
OUTPUT_VIDEO_PATH = "tracked_output.mp4"
YOLO_MODEL = 'yolov5s'

def display_video(video_path):
    """Build an inline HTML5 player for a video file in a Colab notebook.

    Args:
        video_path: Path of the video file to embed.

    Returns:
        An ``HTML`` object whose ``<video>`` tag carries the whole file as a
        base64 ``data:`` URL, or ``None`` (after printing an error) when the
        file is missing or empty. The object is only rendered if the caller
        displays it, e.g. as the last expression of a cell or via
        ``display(...)``; calling this function alone shows nothing.
    """
    if not os.path.exists(video_path) or os.path.getsize(video_path) == 0:
        print(f"Error: Output video file not found or is empty at '{video_path}'.")
        return None

    # Read through a context manager so the file handle is closed promptly.
    with open(video_path, 'rb') as video_file:
        encoded_video = b64encode(video_file.read()).decode()

    data_url = "data:video/mp4;base64," + encoded_video
    return HTML(f"""
    <video width=600 controls>
          <source src="{data_url}" type="video/mp4">
    </video>
    """)

# COCO class ids kept for tracking: person, bicycle, car, motorcycle, bus, truck.
_TRACKED_CLASS_IDS = [0, 1, 2, 3, 5, 7]
# Frame rate used for the output when the input does not report a usable one.
_FALLBACK_FPS = 30.0


def _detections_for_tracker(detections_df):
    """Convert a YOLOv5 xyxy DataFrame into DeepSORT-Realtime detections.

    Each detection becomes ``([left, top, width, height], confidence, name)``,
    with the box coordinates truncated to integers.
    """
    formatted = []
    for det in detections_df.itertuples(index=False):
        x1, y1, x2, y2 = int(det.xmin), int(det.ymin), int(det.xmax), int(det.ymax)
        formatted.append(([x1, y1, x2 - x1, y2 - y1], det.confidence, det.name))
    return formatted


def _show_video_inline(video_path):
    """Render the video in the notebook output, or say why it was skipped."""
    # Local imports: only this helper needs them.
    from IPython import get_ipython
    from IPython.display import display

    if get_ipython() is None:
        print("Not in a Colab/IPython environment, skipping video display.")
        return

    player = display_video(video_path)
    # display_video() only *returns* the HTML player; inside a function it has
    # to be passed to display() explicitly or nothing appears in the output.
    if player is not None:
        display(player)


def main():
    """Track objects in ``VIDEO_PATH`` and write an annotated copy of the video.

    Loads YOLOv5 (``YOLO_MODEL``) and a DeepSORT-Realtime tracker, runs
    detection on every frame, feeds the detections of the tracked classes to
    the tracker, draws YOLO's own annotations plus the confirmed track ids,
    and writes the frames to ``OUTPUT_VIDEO_PATH``. When at least one frame
    was written, the result is then shown inline if an IPython session is
    active.

    Returns:
        None. Missing/unreadable input or an unusable video writer is reported
        with a printed message and ends the run early.
    """
    print("Starting Project 2: 2D Object Tracking with YOLOv5 and DeepSORT-Realtime")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # --- 1. Load Models ---
    yolo_model = torch.hub.load('ultralytics/yolov5', YOLO_MODEL, pretrained=True)
    yolo_model.to(device).eval()
    print(f"YOLOv5 model '{YOLO_MODEL}' loaded successfully.")

    tracker = DeepSort(max_age=30)
    print("Deep SORT Realtime tracker loaded successfully.")

    # --- 2. Prepare Video IO ---
    if not os.path.exists(VIDEO_PATH) or os.path.getsize(VIDEO_PATH) == 0:
        print(f"Error: Video file '{VIDEO_PATH}' not found. Please upload your video and rename it.")
        return

    cap = cv2.VideoCapture(VIDEO_PATH)
    if not cap.isOpened():
        print(f"Error: Cannot open video file: {VIDEO_PATH}")
        cap.release()
        return

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep the fractional rate (e.g. 29.97) so the output plays at the source
    # speed, and fall back when the container reports no usable rate.
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        fps = _FALLBACK_FPS

    # NOTE(review): 'mp4v' output may not play in a browser <video> tag; if the
    # inline player stays blank, re-encode the file to H.264 -- confirm.
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (frame_width, frame_height))

    if not out.isOpened():
        print("Error: Could not open video writer. The 'mp4v' codec may not be supported.")
        cap.release()
        out.release()
        return

    print("Video reader and writer configured successfully.")

    # --- 3. Process Video ---
    print("\nStarting video processing...")
    frames_written = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes frames as BGR, while YOLOv5 expects RGB for NumPy
            # input; convert first so the model sees correct colors and the
            # rendered result really is RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = yolo_model(frame_rgb)
            detections_df = results.pandas().xyxy[0]
            detections_df = detections_df[detections_df['class'].isin(_TRACKED_CLASS_IDS)]

            # Update the tracker with the new detections. It still receives the
            # untouched OpenCV frame, exactly as before.
            tracks = tracker.update_tracks(_detections_for_tracker(detections_df), frame=frame)

            # --- Frame Rendering ---
            # Take YOLO's rendered (RGB) image and convert it back to BGR for
            # cv2 drawing and writing. cvtColor returns a new array, so no
            # explicit copy is needed.
            rendered_frame_np = results.render()[0]
            output_frame_bgr = cv2.cvtColor(rendered_frame_np, cv2.COLOR_RGB2BGR)

            # Draw the track IDs on top of the BGR frame
            for track in tracks:
                if not track.is_confirmed():
                    continue
                ltrb = track.to_ltrb()  # [left, top, right, bottom]
                x1, y1 = int(ltrb[0]), int(ltrb[1])
                # Keep the text baseline inside the frame for boxes that touch
                # the top edge. The label may overlap the YOLO label, which is
                # fine for this project.
                text_y = max(y1 - 10, 20)
                cv2.putText(output_frame_bgr, f"ID: {track.track_id}", (x1, text_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

            out.write(output_frame_bgr)
            frames_written += 1
            if frames_written % 100 == 0:
                print(f"Processed {frames_written} frames...")
    finally:
        # --- 4. Cleanup ---
        # Always release both handles, even if a frame fails mid-way, so the
        # output file is finalized and the input is not left open.
        cap.release()
        out.release()

    if frames_written == 0:
        print("\nWarning: Processing finished, but no frames were written to the output file.")
        return

    print(f"\nProcessing complete. {frames_written} frames written to: {OUTPUT_VIDEO_PATH}")

    # --- 5. Display the final video in the notebook ---
    print("\nDisplaying the final tracked video:")
    _show_video_inline(OUTPUT_VIDEO_PATH)


if __name__ == "__main__":
    # main() displays the result itself when an IPython session is active.
    main()

Starting Project 2: 2D Object Tracking with YOLOv5 and DeepSORT-Realtime
Using device: cuda


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-9-21 Python-3.12.11 torch-2.8.0+cu126 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  with amp.autocast(autocast):


YOLOv5 model 'yolov5s' loaded successfully.
Deep SORT Realtime tracker loaded successfully.
Video reader and writer configured successfully.

Starting video processing...


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

Processed 100 frames...


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

Processed 200 frames...


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):



Processing complete. 208 frames written to: tracked_output.mp4

Displaying the final tracked video:
