# Computer Vision Project

Sajeel Nadeem Alam


# Notebook Setup

### Mount Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem. force_remount=True is passed so the
# mount is requested again even if a previous cell already mounted the drive.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

### Define Local Path

In the next cell you should assign to the variable `GOOGLE_DRIVE_PATH_AFTER_MYDRIVE` the relative path of this folder in your Google Drive.

**IMPORTANT:** you have to make sure that **all the files required to test your functions are loaded using this variable** (as was the case for all lab tutorials). In other words, do not use in the notebook any absolute paths. This will ensure that the markers can run your functions. Also, **do not use** the magic command `%cd` to change directory.



In [None]:
import os

# Folder of this coursework, expressed relative to the root of "My Drive".
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Colab Notebooks/ComputerVision/Coursework/CW_Folder_PG'

# Every file used by the notebook is located through this (relative) path.
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

# Listing the folder is a quick check that the drive is mounted and the path is right.
folder_contents = os.listdir(GOOGLE_DRIVE_PATH)
print(folder_contents)

### Load packages

In the next cell you should load all the packages required to test your function.

In [None]:
import matplotlib.pyplot as plt
from matplotlib import patches, rc
import matplotlib.animation as animation
import numpy as np
from joblib import dump, load
import torch
import torch.nn as nn
from torchvision import transforms, models
import cv2
from PIL import Image
from skimage.feature import hog
from skimage.transform import resize

!pip install mtcnn
!pip install lz4

from mtcnn import MTCNN

### Load models

In the next cell you should load your best performing model (this might consist of more than one file). Do not load it inside `MaskDetection`, so that it does not have to be reloaded on every call.

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# This part of the code is adapted from Lab 8 [2]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture that was fine-tuned: a ResNet-18 whose final fully connected
# layer is replaced by a 3-class head. No pretrained ImageNet weights are requested because
# load_state_dict below (strict by default) overwrites every parameter and buffer anyway,
# so downloading them would only cost time and require network access.
model = models.resnet18(weights=None)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 3)
model = model.to(device)

# map_location remaps the stored tensors onto the selected device, so a checkpoint that was
# saved on a GPU can still be loaded on a CPU-only runtime.
model_path = os.path.join(GOOGLE_DRIVE_PATH, 'Models', 'bestCNNClassifier.pth')
model.load_state_dict(torch.load(model_path, map_location=device))

# Test MaskDetection

This section should allow the `MaskDetection` function to be tested. First, add cells with the code needed to load the subroutines that `MaskDetection` requires.

In [None]:
# Configures matplotlib's 'animation' rc group: animations are rendered in the notebook as
# interactive JavaScript/HTML ('jshtml') and the embed limit is raised to 50
# (NOTE(review): matplotlib documents animation.embed_limit in MB - confirm).
rc('animation', html='jshtml', embed_limit=50)

def MaskDetectionAnimation(path, frameStep=5):
    """Create an animation of mask predictions for the face in a video.

    Every ``frameStep``-th frame is read from the video, MTCNN looks for faces in it and the
    first detected face is classified by the module-level CNN ``model`` (on ``device``).
    Sampled frames in which no usable face is found are kept as well (drawn without a box),
    so the animation keeps a constant time step between consecutive frames.

    Args:
        path: location of the video file to process.
        frameStep: distance, in frames, between two sampled frames. Defaults to 5.

    Returns:
        A ``matplotlib.animation.FuncAnimation`` that shows the sampled frames with the
        bounding box and the predicted class drawn on top, or ``None`` when the video cannot
        be opened or no frame can be read from it.

    Raises:
        ValueError: if ``frameStep`` is smaller than 1.
    """
    if frameStep < 1:
        raise ValueError(f"frameStep must be at least 1, got {frameStep}")

    cap = cv2.VideoCapture(path)                            # opens the video which is at the provided path [1]
    if not cap.isOpened():                                  # a wrong path or unsupported file cannot be processed
        cap.release()
        print(f"Error: Could not open video file {path}.")
        return None

    video = []            # stores the sampled video frames (RGB)
    annotations = []      # stores one (bounding box, prediction) pair per sampled frame
    try:
        frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))     # gets the total number of frames in the video [1]
        frameWidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))     # gets the width of each frame [1]
        frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))   # gets the height of each frame [1]
        print(f"Frame count: {frameCount}, Width: {frameWidth}, Height: {frameHeight}")

        mtcnn = MTCNN()      # creates a MTCNN object to detect faces [2]

        model.eval()                            # sets the CNN model to evaluation mode
        dataMeans = [0.485, 0.456, 0.406]       # ImageNet mean values to normalise each channel in the image
        dataStds = [0.229, 0.224, 0.225]        # ImageNet standard deviation values to normalise each channel
        transformTest = transforms.Compose([    # defines transformation to be applied to the image [2]
            transforms.Resize(256),             # resizes shorter side of the image to 256 pixels
            transforms.CenterCrop(224),         # crops a 224*224 pixel square from the centre of the image
            transforms.ToTensor(),              # converts the image to a tensor
            transforms.Normalize(dataMeans, dataStds)  # normalises the image with the means and stds above
        ])

        for i in range(0, frameCount, frameStep):  # loops over the video and selects every frameStep-th frame
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)    # sets the current frame position in the video to the ith position
            ret, bgrFrame = cap.read()             # reads the frame at the current position
            if not ret:                            # ret is False when the frame could not be read: stop sampling
                break

            rgbFrame = cv2.cvtColor(bgrFrame, cv2.COLOR_BGR2RGB)   # converts frame from BGR to RGB as required by MTCNN
            faces = mtcnn.detect_faces(rgbFrame)                   # extracts a list of faces and their properties
            bbox = None                            # stays None when the frame has no usable face
            pred = None                            # stays None when the frame has no usable face

            if faces:                              # only the first face is used (the video contains a single face)
                x, y, w, h = faces[0]['box']       # top left corner, width and height of the bounding box
                imgHeight, imgWidth = rgbFrame.shape[:2]
                # The detector can report boxes that start at negative coordinates or extend past
                # the frame; a negative slice start would index from the opposite edge and give a
                # wrong or empty crop, so the box is clamped to the frame first.
                left, top = max(x, 0), max(y, 0)
                right, bottom = min(x + w, imgWidth), min(y + h, imgHeight)

                if right > left and bottom > top:  # skips degenerate boxes that have no pixels inside the frame
                    bbox = (left, top, right - left, bottom - top)
                    faceCrop = rgbFrame[top:bottom, left:right]    # crops the face from the frame

                    # transforms the face crop, adds a batch dimension and moves the tensor to the device
                    faceTensor = transformTest(Image.fromarray(faceCrop)).unsqueeze(0).to(device)
                    with torch.no_grad():                       # no gradients are needed to get predictions
                        out = model(faceTensor)                 # gets the class scores for the face tensor
                        pred = torch.argmax(out, dim=1).item()  # class with the highest score, as a Python int

            # Appended for every sampled frame (with or without a face) so that the animation
            # has exactly one frame per sampling step.
            video.append(rgbFrame)
            annotations.append((bbox, pred))
    finally:
        cap.release()     # releases the video capture object even if processing fails

    if not video:         # nothing could be read, so there is nothing to animate
        print(f"Error: No frames could be read from {path}.")
        return None

    predDict = {0: 'No mask', 1: 'Mask', 2: 'Mask Incorrect'}   # class index -> class name
    fig, ax = plt.subplots(figsize=(12, 8))  # creates a figure and axes for plotting the video [1]

    def drawFrame(i):       # call back function which is called to display each frame [1]
        ax.clear()          # clears the previous frame's axes so that the current frame is displayed correctly [1]
        ax.axis('off')      # turns the axes off, getting rid of ticks and labels for clearer visualisation [1]
        bbox, pred = annotations[i]   # bounding box and prediction that belong to the ith sampled frame
        ax.imshow(video[i])           # displays the ith sampled frame

        if bbox is not None:          # frames without a face are shown without any overlay
            x, y, w, h = bbox
            ax.add_patch(patches.Rectangle((x, y), w, h, fill=False, edgecolor='red', linewidth=2))  # [2]
            ax.text(x, y - 10, predDict[pred], color='red', fontsize=12, weight='bold')  # class name above the box

    # fig is where the frames are displayed and drawFrame is called once per sampled frame [1]
    anim = animation.FuncAnimation(fig, drawFrame, frames=len(video))
    plt.close(fig)        # closes the static figure so that only the animation is shown in the notebook
    return anim

Then, make a call to the `MaskDetection` function to see what results it produces.

In [None]:
FRAME_STEP = 5     # sampling step assumed for MaskDetectionAnimation (it keeps one frame out of 5)
DEFAULT_FPS = 30   # used when the frame rate of the video cannot be determined

video_path = os.path.join(GOOGLE_DRIVE_PATH, 'Personal_Video/testVideo.mp4')

# Read the frame rate of the original video so the saved animation plays at real-time speed.
cap_info = cv2.VideoCapture(video_path)
if cap_info.isOpened():
    original_fps = cap_info.get(cv2.CAP_PROP_FPS)
else:
    print(f"Error: Could not open video file {video_path} to get FPS.")
    original_fps = DEFAULT_FPS
cap_info.release()

# OpenCV reports 0 for properties it cannot determine; an FPS of 0 would make the
# effective FPS 0 and the save below fail, so fall back to the default instead.
if original_fps <= 0:
    original_fps = DEFAULT_FPS
print(f"Original video FPS: {original_fps}")

path_to_test = video_path
anim = MaskDetectionAnimation(path_to_test)

if anim:
    output_animation_path_sampled = os.path.join(GOOGLE_DRIVE_PATH, 'Personal_Video/maskDetectionOutput_SampledVideo.mp4')
    try:
        # Calculate the effective FPS for the sampled video.
        # If the original video is 30 FPS and every 5th frame is kept,
        # there are effectively 6 frames per second of original content.
        effective_fps = original_fps / FRAME_STEP
        print(f"Calculated effective FPS for sampled video: {effective_fps}")

        # Save with high DPI and the adjusted effective FPS.
        # NOTE(review): matplotlib appears to discard 'bbox_inches' when saving animations
        # (the frame size has to stay constant), so these savefig_kwargs may not remove the
        # borders - confirm, and use fig.subplots_adjust inside the function if they remain.
        anim.save(output_animation_path_sampled, fps=effective_fps, dpi=200,
                  savefig_kwargs={'bbox_inches': 'tight', 'pad_inches': 0})
        print(f"Sampled animation successfully saved to: {output_animation_path_sampled}")
    except Exception as e:
        # Saving is best effort: the animation is still displayed below if the export fails.
        print(f"An error occurred while saving the sampled animation: {e}")
else:
    print("Sampled animation object was not created. Check video path or function errors.")

anim

# References

[1] G. Tarroni, Lab Tutorial 04, Computer Vision - IN3060/INM460, School of Science & Technology, Department of Computer Science, City, University of London, London, U.K., 2025

[2] G. Tarroni, Lab Tutorial 08, Computer Vision - IN3060/INM460, School of Science & Technology, Department of Computer Science, City, University of London, London, U.K., 2025