In [5]:
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import ToTensor
from PIL import Image
import numpy as np
import cv2

# Load the fine-tuned model
model = fasterrcnn_resnet50_fpn(pretrained=False)
num_classes = 2  # 1 class (car) + background
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)
model.load_state_dict(torch.load("faster_rcnn_finetuned.pth"))
model.to("cuda")
model.eval()

# Video input and output paths
input_video_path = "3978613-hd_1920_1080_24fps.mp4"  # Replace with your video path
output_video_path = "output_video_with_boxes.mp4"

# Open the input video
cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

# Get video properties
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# Define the codec and create VideoWriter object to save the output video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

# Process each frame
frame_count = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    frame_count += 1
    print(f"Processing frame {frame_count}/{total_frames}")

    # Convert frame (numpy array) to PIL image
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(frame_rgb)
    img_tensor = ToTensor()(pil_img)
    img_tensor = img_tensor.unsqueeze(0).to("cuda")  # Add batch dimension and move to GPU

    # Perform inference
    with torch.no_grad():
        prediction = model(img_tensor)  # Prediction is already on GPU

    # Process predictions (move tensors to CPU for numpy conversion)
    boxes = prediction[0]['boxes'].cpu().numpy()
    labels = prediction[0]['labels'].cpu().numpy()
    scores = prediction[0]['scores'].cpu().numpy()

    # Draw bounding boxes on the frame
    confidence_threshold = 0.5
    for box, label, score in zip(boxes, labels, scores):
        if score > confidence_threshold:
            x_min, y_min, x_max, y_max = map(int, box)
            # Draw rectangle
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)  # Red box
            # Add label and score
            label_text = f"{'car' if label == 1 else 'background'} ({score:.2f})"
            cv2.putText(frame, label_text, (x_min, y_min - 10), 
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    # Write the frame to the output video
    out.write(frame)

# Release resources
cap.release()
out.release()
cv2.destroyAllWindows()
print(f"Video processing complete. Output saved as {output_video_path}")

  model.load_state_dict(torch.load("faster_rcnn_finetuned.pth"))


Processing frame 1/381
Processing frame 2/381
Processing frame 3/381
Processing frame 4/381
Processing frame 5/381
Processing frame 6/381
Processing frame 7/381
Processing frame 8/381
Processing frame 9/381
Processing frame 10/381
Processing frame 11/381
Processing frame 12/381
Processing frame 13/381
Processing frame 14/381
Processing frame 15/381
Processing frame 16/381
Processing frame 17/381
Processing frame 18/381
Processing frame 19/381
Processing frame 20/381
Processing frame 21/381
Processing frame 22/381
Processing frame 23/381
Processing frame 24/381
Processing frame 25/381
Processing frame 26/381
Processing frame 27/381
Processing frame 28/381
Processing frame 29/381
Processing frame 30/381
Processing frame 31/381
Processing frame 32/381
Processing frame 33/381
Processing frame 34/381
Processing frame 35/381
Processing frame 36/381
Processing frame 37/381
Processing frame 38/381
Processing frame 39/381
Processing frame 40/381
Processing frame 41/381
Processing frame 42/381
P