In [5]:
import cv2 as cv

In [6]:
input_stream = cv.VideoCapture('in.mp4')

In [15]:
from torch import hub

model = hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

Using cache found in /home/ro/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2022-12-6 Python-3.10.6 torch-1.13.0+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [18]:
model.to('cpu')

AutoShape(
  (model): DetectMultiBackend(
    (model): DetectionModel(
      (model): Sequential(
        (0): Conv(
          (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
          (act): SiLU(inplace=True)
        )
        (1): Conv(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (act): SiLU(inplace=True)
        )
        (2): C3(
          (cv1): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv3): Conv(
            (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (m): Sequential(
            (0): Bottleneck(
              (cv1): Conv(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
  

# Try out the entire process on a single frame

## Get and process frame

In [43]:
ret, frame = input_stream.read()

In [48]:
results = model(frame)

In [56]:
labels = results.xyxyn[0][:, -1].numpy()

In [58]:
cord = results.xyxyn[0][:, :-1].numpy()

## Plot boxes over scored frame

In [60]:
x, y = frame.shape[1], frame.shape[0]

for i in range(len(labels)):
    row = cord[i]
    if row[4] < 0.2:
        continue
    
    x1 = int(row[0]*x)
    x2 = int(row[2]*x)
    y1 = int(row[1]*y)
    y2 = int(row[3]*y)

    box_color = (0, 255, 0)
    classes = model.names
    label_font = cv.FONT_HERSHEY_COMPLEX
    cv.rectangle(frame, (x1, y1), (x2, y2), box_color, 2)
    cv.putText(frame, classes[labels[i]], (x1,y1), label_font, 0.9, box_color, 2)

## View final processed and plotted frame

In [72]:
from PUL import Image
img = Image.fromarray(frame)
img.show()

# Final Function Definitions

### Process frames
This function will take in a frame and generate and return the results of running yolov5 on it

In [80]:
def process_frame(frame):
    # Fetch results of the model
    results = model(frame)

    # Split results
    labels = results.xyxyn[0][:, -1].numpy()
    cord = results.xyxyn[0][:, :-1].numpy()
    
    return labels, cord

# Plot boxes and add labels
This function will take in a frame and the results given by the yolov5 model and plots boxes and adds label accordingly

In [111]:
def plot_frame(results, frame):
    labels, cord = results
    x, y = frame.shape[1], frame.shape[0]
    
    for i in range(len(labels)):
        row = cord[i]
        # Ignore row if confidence too low
        if row[4] < 0.2:
            continue
        
        # Extract 2 points of box around object
        x1 = int(row[0]*x)
        y1 = int(row[1]*y)
        x2 = int(row[2]*x)
        y2 = int(row[3]*y)

        # Graphics settings
        box_color = (200, 0, 0)
        font_color = (0, 0, 200)
        font = cv.FONT_HERSHEY_SIMPLEX

        # Plot box and add label
        cv.rectangle(frame, (x1, y1), (x2, y2), box_color, 2)
        cv.putText(frame, model.names[labels[i]], (x1, y1), label_font, 0.9, font_color, 2)

        return frame

# Final Run

## V1

In [115]:
input_stream = cv.VideoCapture('in-1.mp4')
video_dimensions = tuple(int(input_stream.get(i)) for i in (cv.CAP_PROP_FRAME_WIDTH, cv.CAP_PROP_FRAME_HEIGHT))

fourcc = cv.VideoWriter_fourcc(*'MJPG')
output_stream = cv.VideoWriter('out-1.avi', fourcc, 20, video_dimensions)

In [116]:
ret, frame = input_stream.read()
while ret:
    results = process_frame(frame)
    frame = plot_frame(results, frame)
    output_stream.write(frame)
    ret, frame = input_stream.read()

In [117]:
!mpv "out-1.avi"

=[0m (+) Video --vid=1 (mjpeg 854x480 20.000fps)
[0m[0mVO: [gpu] 854x480 yuv420p
[0m[0mV: 00:00:07 / 00:00:08 (87%)
[0;33m[ffmpeg/demuxer] avi: Packet corrupt (stream = 0, dts = 174).
[0m[0mV: 00:00:08 / 00:00:08 (98%)
[0;31m[ffmpeg/video] mjpeg: overread 8
[0m[0;33m[ffmpeg/video] mjpeg: EOI missing, emulating
[0m[0mV: 00:00:08 / 00:00:08 (100%)
[0m
[0m[0mExiting... (End of file)
[0m>