In [None]:
!git clone https://github.com/ultralytics/yolov5
%cd yolov5


fatal: destination path 'yolov5' already exists and is not an empty directory.
/content/yolov5


In [None]:
!pip install -r requirements.txt




Step 1. Collect a source video. It may be necessary to divide the video into discrete image frames.

In [None]:
!pip install opencv-python-headless  # Use opencv-python-headless for minimal installation
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt




In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be addressed by path.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import cv2
import os

def video_to_frames(video_path, frames_dir="frames", every_n_frame=1, verbose=True):
    """
    Extract frames from a video file and print the total number of frames extracted.

    Parameters:
    - video_path: Path to the video file.
    - frames_dir: Directory to save the extracted frames (created if it does not exist).
    - every_n_frame: Extract every nth frame. Must be >= 1.
    - verbose: If True (default), print one line for every frame that is saved.

    Raises:
    - ValueError: If every_n_frame is less than 1.
    - OSError: If the video file cannot be opened.
    """
    # Validate before touching the filesystem; a value of 0 would otherwise
    # surface as a ZeroDivisionError in the modulo test below.
    if every_n_frame < 1:
        raise ValueError(f"every_n_frame must be >= 1, got {every_n_frame}")

    # Ensure the output directory exists
    os.makedirs(frames_dir, exist_ok=True)

    # Open the video file and fail loudly instead of reporting "0 frames saved"
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise OSError(f"Could not open video file: {video_path}")

    frame_count = 0
    saved_frame_count = 0  # Counter for frames actually written to disk

    try:
        while True:
            # Read frame
            ret, frame = cap.read()

            # Break the loop if there are no more frames
            if not ret:
                break

            # Save every nth frame
            if frame_count % every_n_frame == 0:
                frame_path = os.path.join(frames_dir, f"frame_{frame_count}.jpg")
                # cv2.imwrite reports failure through its return value, not an exception
                if cv2.imwrite(frame_path, frame):
                    saved_frame_count += 1
                    if verbose:
                        print(f"Frame {frame_count} saved.")
                else:
                    print(f"Warning: could not write {frame_path}")

            frame_count += 1
    finally:
        # Release the video capture object even if reading or writing raised
        cap.release()

    print(f"Done extracting frames. Total frames saved: {saved_frame_count}")

# Source video, addressed through the Google Drive mount under /content/drive.
video_path = '/content/drive/My Drive/video.mp4'
# Use the function defaults: every frame is saved, into the "frames" directory.
video_to_frames(video_path)


Frame 0 saved.
Frame 1 saved.
Frame 2 saved.
Frame 3 saved.
Frame 4 saved.
Frame 5 saved.
Frame 6 saved.
Frame 7 saved.
Frame 8 saved.
Frame 9 saved.
Frame 10 saved.
Frame 11 saved.
Frame 12 saved.
Frame 13 saved.
Frame 14 saved.
Frame 15 saved.
Frame 16 saved.
Frame 17 saved.
Frame 18 saved.
Frame 19 saved.
Frame 20 saved.
Frame 21 saved.
Frame 22 saved.
Frame 23 saved.
Frame 24 saved.
Frame 25 saved.
Frame 26 saved.
Frame 27 saved.
Frame 28 saved.
Frame 29 saved.
Frame 30 saved.
Frame 31 saved.
Frame 32 saved.
Frame 33 saved.
Frame 34 saved.
Frame 35 saved.
Frame 36 saved.
Frame 37 saved.
Frame 38 saved.
Frame 39 saved.
Frame 40 saved.
Frame 41 saved.
Frame 42 saved.
Frame 43 saved.
Frame 44 saved.
Frame 45 saved.
Frame 46 saved.
Frame 47 saved.
Frame 48 saved.
Frame 49 saved.
Frame 50 saved.
Frame 51 saved.
Frame 52 saved.
Frame 53 saved.
Frame 54 saved.
Frame 55 saved.
Frame 56 saved.
Frame 57 saved.
Frame 58 saved.
Frame 59 saved.
Frame 60 saved.
Frame 61 saved.
Frame 62 saved.
Fr

Step 2. Conduct inference on each frame of the video, drawing bounding boxes around detected vehicles (cars, trucks, buses), traffic lights, and people.

In [None]:
import cv2
import numpy as np

def process_and_display_frame(frame, model,
                              desired_classes=('car', 'truck', 'bus', 'traffic light', 'person')):
    """
    Run the model on one frame, draw boxes for the classes of interest, and display the frame.

    Parameters:
    - frame: Image in OpenCV's BGR channel order; it is annotated in place.
    - model: Detection model that is called on an RGB image, exposes `names`
      (class index -> class name) and returns results with an `xyxy` attribute.
    - desired_classes: Class names to draw; all other detections are ignored.

    Returns:
    - The annotated frame (the same array that was passed in).
    """
    # Convert frame to RGB as YOLOv5 expects RGB images
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Perform inference
    results = model(frame_rgb)

    color = (255, 0, 0)  # Box/label color (blue, since the frame is BGR)

    # Each row of results.xyxy[0] holds bbox coords, confidence, class index
    for *xyxy, conf, cls in results.xyxy[0]:
        label = model.names[int(cls)]  # Get the class name using the class index
        if label not in desired_classes:
            continue
        x1, y1, x2, y2 = (int(v) for v in xyxy)
        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
        # Clamp the text baseline so labels of boxes touching the top edge stay visible
        cv2.putText(frame, label, (x1, max(y1 - 10, 15)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

    # Display the frame. cv2.imshow needs a GUI backend, which Colab and headless
    # OpenCV builds do not provide, so prefer Colab's inline replacement when available.
    # Note: inline display emits one image per call, so output grows with every frame.
    try:
        from google.colab.patches import cv2_imshow
    except ImportError:
        cv2.imshow('Frame', frame)
        cv2.waitKey(1)  # Use cv2.waitKey(0) if you want to display each frame until a key is pressed
    else:
        cv2_imshow(frame)

    return frame


In [None]:
import torch

# Load the YOLOv5 model
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Use the absolute Drive path (the same file that Step 1 reads). A bare 'video.mp4'
# would be resolved against the current working directory, which is the yolov5
# checkout after `%cd yolov5`.
video_path = '/content/drive/My Drive/video.mp4'
cap = cv2.VideoCapture(video_path)

# Fail loudly: an unopened capture makes cap.read() return (False, None) immediately,
# so the loop below would otherwise finish without processing a single frame.
if not cap.isOpened():
    cap.release()
    raise OSError(f"Could not open video file: {video_path}")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break  # Break the loop if no frame is returned

        process_and_display_frame(frame, model)
finally:
    # Release the capture even if inference or display raised
    cap.release()

cv2.destroyAllWindows()


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-2-28 Python-3.10.12 torch-2.1.0+cu121 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Step 3. Format the results back into a video.



In [None]:
import cv2
import torch
from google.colab.patches import cv2_imshow  # For displaying frames within Colab

# Load the YOLOv5 model: the 'yolov5s' entry point of the ultralytics/yolov5 hub repo,
# with pretrained weights. This rebinds the global `model` used by the code below.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

def process_and_display_frame(frame, model):
    """
    Process a single frame through YOLOv5 model, draw bounding boxes on detected objects of interest,
    and return the processed frame.

    Parameters:
    - frame: Image in OpenCV's BGR channel order; it is annotated in place.
    - model: Detection model that is called on an RGB image, exposes `names`
      (class index -> class name) and returns results with an `xyxyn` attribute.

    Returns:
    - The annotated frame (the same array that was passed in).
    """
    # Convert frame to RGB (YOLOv5 expects RGB images)
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Perform inference
    results = model(frame_rgb)

    # Scale the normalized boxes by this frame's own size instead of relying on
    # module-level globals that may not exist (or may describe another video).
    frame_height, frame_width = frame.shape[:2]

    # Move to CPU before converting: .numpy() raises for tensors that live on a GPU.
    detections = results.xyxyn[0].cpu().numpy()

    # Each row: normalized x1, y1, x2, y2, then confidence, with the class index last
    for row in detections:
        desired_class = model.names[int(row[-1])]
        # Check if the detected class is one of the specified types
        if desired_class not in ['car', 'truck', 'bus', 'traffic light', 'person']:
            continue
        x1, y1 = int(row[0] * frame_width), int(row[1] * frame_height)
        x2, y2 = int(row[2] * frame_width), int(row[3] * frame_height)
        conf = row[4]
        # Draw rectangle and label
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
        # Clamp the text baseline so labels of boxes touching the top edge stay visible
        cv2.putText(frame, f'{desired_class} {int(conf * 100)}%',
                    (x1, max(y1 - 10, 15)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
    return frame

# Initialize video capture
video_path = '/content/drive/My Drive/video.mp4'
cap = cv2.VideoCapture(video_path)

# Fail loudly: with an unopened capture the size properties read back as 0 and the
# loop below would write no frames at all.
if not cap.isOpened():
    cap.release()
    raise OSError(f"Could not open video file: {video_path}")

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Write at the source frame rate so the output plays at the original speed;
# fall back to 30 fps when the container does not report a usable value.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    fps = 30.0

# Define the codec and create VideoWriter object
out = cv2.VideoWriter('Output_Video_1.avi', cv2.VideoWriter_fourcc('M','J','P','G'), fps, (frame_width, frame_height))

# `cap` and `out` stay open here; they are released in the next cell.
while True:
    ret, frame = cap.read()
    if not ret:
        break  # No more frames, exit the loop

    # Process the frame
    processed_frame = process_and_display_frame(frame, model)

    # Write the processed frame to the output video
    out.write(processed_frame)

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-2-28 Python-3.10.12 torch-2.1.0+cu121 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Step 3 (continued). Release the capture and writer to finalize the output video, then download it.



In [None]:
# Release everything if job is finished
# `cap` is the input capture and `out` is the writer for 'Output_Video_1.avi',
# both created in the previous cell; this cell must run after that one.
cap.release()
out.release()
cv2.destroyAllWindows()

In [None]:
from google.colab import files

# Send the annotated video produced above to the browser as a file download.
files.download('Output_Video_1.avi')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>