In [108]:
from pytube import YouTube
import os
import cv2
import time

# Q1: Face detection and association-based tracking

## 1.1 - Data preparation. 
We will implement face detection and tracking on a famous scene from the movie Forrest Gump. To prepare the dataset, please download the video clip from https://www.youtube.com/ watch?v=bSMxl1V8FSg (the mp4 at 480p resolution) and burst the first 30 seconds into frames (you should get about 719-720 frames).

In [109]:
def download_video(video_url, output_dir):
    yt = YouTube(video_url)
    video = yt.streams.filter(file_extension='mp4', resolution='480p').first()
    video_path = video.download(output_dir)

    # Check if the video was downloaded successfully
    if os.path.exists(video_path):
        new_file_path = os.path.join(output_dir, "input_video.mp4")
        os.rename(video_path, new_file_path)
        print("Video downloaded successfully.")
        return new_file_path
    else:
        print("Error downloading video.")
        return None

def extract_frames(video_path, output_dir, duration=30):
    os.makedirs(output_dir, exist_ok=True)

    # Extract frames from the video
    ffmpeg_command = f"ffmpeg -i {video_path} -vf fps=24/1 -t {duration} {output_dir}/frame_%03d.png"
    result = os.system(ffmpeg_command)

    if result == 0:
        print("Frames extracted successfully.")
        return True
    else:
        print("Error extracting frames.")
        return False


In [110]:
video_url = "https://www.youtube.com/watch?v=bSMxl1V8FSg"
video_output_dir = "input"
frames_output_dir = "input_frames"
duration = 30 # Duration of frames to extract (in seconds)

video_path = download_video(video_url, video_output_dir)
if video_path:
    extract_frames(video_path, frames_output_dir, duration)


Video downloaded successfully.


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.1.0.2.5)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/6.1.1_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libso

Frames extracted successfully.


[out#0/image2 @ 0x6000031f4540] video:389493kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
frame=  720 fps=177 q=-0.0 Lsize=N/A time=00:00:29.95 bitrate=N/A speed=7.38x    


## 1.2 - Face detection. 
Use the Viola-Jones Haar cascades based face detector from OpenCV to detect faces in each frame. How long does it take to process each frame? Identify some key factors of the algorithm that could change the time.
Hint: you may need to look within the xml config file.

In [111]:

def detect_faces_and_save_video(frames_folder, output_folder, output_video_name):
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')


    frame_path = f'{frames_folder}/frame_001.png'
    first_frame = cv2.imread(frame_path)
    height, width, _ = first_frame.shape

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_video_path = os.path.join(output_folder, output_video_name)
    out = cv2.VideoWriter(output_video_path, fourcc, 24.0, (width, height))

    total_processing_time = 0
    for i in range(1, 721):  
    
        frame = cv2.imread(f'{frames_folder}/frame_{i:03d}.png')
        
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        
        start_time = time.time()
        faces = face_cascade.detectMultiScale(gray_frame, scaleFactor=1.3, minNeighbors=8, minSize=(15, 15))
        end_time = time.time()
        
        processing_time = end_time - start_time
        total_processing_time += processing_time
        
        for (x, y, w, h) in faces:
            cv2.rectangle(frame, (x, y), (x+w, y+h), (255, 0, 0), 2)
        out.write(frame)
        print(f"Frame {i}: Processing time = {processing_time:.2f} seconds")

    out.release()
    cv2.destroyAllWindows()

    print(f"Total processing time for all frames: {total_processing_time:.2f} seconds")


In [112]:
detect_faces_and_save_video("input_frames", "output", "output_video.mp4")

Frame 1: Processing time = 0.02 seconds
Frame 2: Processing time = 0.01 seconds
Frame 3: Processing time = 0.01 seconds
Frame 4: Processing time = 0.01 seconds
Frame 5: Processing time = 0.01 seconds
Frame 6: Processing time = 0.01 seconds
Frame 7: Processing time = 0.01 seconds
Frame 8: Processing time = 0.01 seconds
Frame 9: Processing time = 0.01 seconds
Frame 10: Processing time = 0.01 seconds
Frame 11: Processing time = 0.01 seconds
Frame 12: Processing time = 0.01 seconds
Frame 13: Processing time = 0.01 seconds
Frame 14: Processing time = 0.01 seconds
Frame 15: Processing time = 0.01 seconds
Frame 16: Processing time = 0.01 seconds
Frame 17: Processing time = 0.01 seconds
Frame 18: Processing time = 0.01 seconds
Frame 19: Processing time = 0.01 seconds
Frame 20: Processing time = 0.01 seconds
Frame 21: Processing time = 0.01 seconds
Frame 22: Processing time = 0.01 seconds
Frame 23: Processing time = 0.01 seconds
Frame 24: Processing time = 0.01 seconds
Frame 25: Processing time

Frame 206: Processing time = 0.01 seconds
Frame 207: Processing time = 0.01 seconds
Frame 208: Processing time = 0.01 seconds
Frame 209: Processing time = 0.01 seconds
Frame 210: Processing time = 0.01 seconds
Frame 211: Processing time = 0.01 seconds
Frame 212: Processing time = 0.01 seconds
Frame 213: Processing time = 0.01 seconds
Frame 214: Processing time = 0.01 seconds
Frame 215: Processing time = 0.01 seconds
Frame 216: Processing time = 0.01 seconds
Frame 217: Processing time = 0.01 seconds
Frame 218: Processing time = 0.01 seconds
Frame 219: Processing time = 0.01 seconds
Frame 220: Processing time = 0.01 seconds
Frame 221: Processing time = 0.01 seconds
Frame 222: Processing time = 0.01 seconds
Frame 223: Processing time = 0.01 seconds
Frame 224: Processing time = 0.01 seconds
Frame 225: Processing time = 0.01 seconds
Frame 226: Processing time = 0.01 seconds
Frame 227: Processing time = 0.01 seconds
Frame 228: Processing time = 0.01 seconds
Frame 229: Processing time = 0.01 

Frame 403: Processing time = 0.01 seconds
Frame 404: Processing time = 0.01 seconds
Frame 405: Processing time = 0.01 seconds
Frame 406: Processing time = 0.01 seconds
Frame 407: Processing time = 0.01 seconds
Frame 408: Processing time = 0.01 seconds
Frame 409: Processing time = 0.01 seconds
Frame 410: Processing time = 0.01 seconds
Frame 411: Processing time = 0.01 seconds
Frame 412: Processing time = 0.01 seconds
Frame 413: Processing time = 0.01 seconds
Frame 414: Processing time = 0.01 seconds
Frame 415: Processing time = 0.01 seconds
Frame 416: Processing time = 0.01 seconds
Frame 417: Processing time = 0.01 seconds
Frame 418: Processing time = 0.01 seconds
Frame 419: Processing time = 0.01 seconds
Frame 420: Processing time = 0.01 seconds
Frame 421: Processing time = 0.01 seconds
Frame 422: Processing time = 0.01 seconds
Frame 423: Processing time = 0.01 seconds
Frame 424: Processing time = 0.01 seconds
Frame 425: Processing time = 0.01 seconds
Frame 426: Processing time = 0.01 

Frame 602: Processing time = 0.01 seconds
Frame 603: Processing time = 0.01 seconds
Frame 604: Processing time = 0.01 seconds
Frame 605: Processing time = 0.01 seconds
Frame 606: Processing time = 0.01 seconds
Frame 607: Processing time = 0.01 seconds
Frame 608: Processing time = 0.01 seconds
Frame 609: Processing time = 0.01 seconds
Frame 610: Processing time = 0.01 seconds
Frame 611: Processing time = 0.01 seconds
Frame 612: Processing time = 0.01 seconds
Frame 613: Processing time = 0.01 seconds
Frame 614: Processing time = 0.01 seconds
Frame 615: Processing time = 0.01 seconds
Frame 616: Processing time = 0.01 seconds
Frame 617: Processing time = 0.01 seconds
Frame 618: Processing time = 0.01 seconds
Frame 619: Processing time = 0.01 seconds
Frame 620: Processing time = 0.01 seconds
Frame 621: Processing time = 0.01 seconds
Frame 622: Processing time = 0.01 seconds
Frame 623: Processing time = 0.01 seconds
Frame 624: Processing time = 0.01 seconds
Frame 625: Processing time = 0.01 

#### After experimenting with different parameter values for the face detection algorithm, I found that adjusting the following parameters led to a reduction in processing time:

- **Scale Factor:** I tried different scale factors, including 1.1, 1.3, 1.8, and 2.2. By increasing the scale factor, I observed a decrease in processing time, with a scale factor of 1.3 providing the best balance between accuracy and speed.

- **MinNeighbors:** I varied the number of neighbors for each candidate rectangle, testing values such as 5, 8, and 10. Increasing the minNeighbors parameter can reduce false positives but may also slow down the detection process. I found that setting minNeighbors to 8 resulted in a significant reduction in processing time while maintaining satisfactory detection accuracy.

- **MinSize:** Adjusting the minimum object size parameter, I experimented with values such as (10, 10), (15, 15), (25, 25), and (30, 30). Decreasing the minSize parameter may increase false positives and processing time, while increasing it may reduce the likelihood of false positives but may also lead to missed detections. I found that a minimum size of (15, 15) provided a good balance between speed and accuracy.

With the optimized parameter values, I observed a reduction in processing time from an average of 0.02 to 0.04 seconds per frame to approximately 0.01 seconds per frame on average. These optimizations allow for efficient face detection while maintaining satisfactory detection accuracy.


## 1.3 - Face detection visualization. 
Visualize the face detections made over the first 30s frames as a new video. Link to the video from your google drive. Watch the video and draw three conclusions about when does the face detector work or fail. Why do you think this is the case?
Hint: You can use cv2.rectangle to draw boxes on the image and then save them back to disk. Then ffmpeg can be used again to stitch together the frames into a new video.

### Observations on Face Detection Video

After watching the video created from the face detection process, I made the following observations:

1. **Successful Face Detection**: In most frames of the 30-second video, the face detector successfully detects and draws rectangles around the faces present in the scene. This indicates that the face detection algorithm is generally effective in identifying faces when they are well-lit, frontal, and of adequate size.

2. **Undetected Faces**: However, there are instances where some faces in certain frames are not detected by the face detector. This could be due to various reasons, such as:
   - **Min Frame Size**: The faces that are not detected may be too small in size relative to the minimum size parameter set for the face detection algorithm. This could cause the detector to miss smaller faces or faces that appear farther away in the scene.
   - **Angles of Faces**: Faces that are not facing directly towards the camera or are at extreme angles may not be detected accurately. The face detector may struggle to identify faces that are tilted or rotated away from the frontal view.
   - **False Positives**: On the other hand, there are instances where false positive detections occur, resulting in rectangles being drawn around non-facial objects or patterns that resemble faces. This could be due to factors such as shadows, reflections, or patterns that resemble facial features.

3. **Overall Performance**: Despite these occasional failures, the face detector performs reasonably well in detecting faces across the video frames. It demonstrates the capability to identify faces in a variety of contexts, lighting conditions, and orientations. However, there is room for improvement, especially in handling smaller faces and reducing false positive detections.

Overall, the effectiveness of the face detector is influenced by factors such as the size of the faces, their orientation relative to the camera, and the presence of other objects that may resemble faces. Fine-tuning the parameters of the face detection algorithm and employing additional techniques such as deep learning-based approaches could further enhance its performance.


## 1.4 - Association-based tracking. 
Tracking can be used to associate face detections across time and understand that it is the same character appearing across multiple frames of the movie. We will explore a simple way to perform tracking.

### 1.4.1 Generate face tracks by comparing face detections in two consecutive frames and associating them based on IoU scores. 
You may want to associate faces only when IoU > 0.5. Do consider what happens when there are multiple face detections in both frames. Start new tracks for faces not seen in the previous frame. End existing tracks when faces are not visible in the next frame. How many unique tracks did you create in the first 30 seconds?

In [121]:
import cv2
import os

def detect_faces(frame):
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray_frame, scaleFactor=1.3, minNeighbors=8, minSize=(15, 15))
    return faces

def compute_iou(box1, box2):
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[0] + box1[2], box2[0] + box2[2])
    y2 = min(box1[1] + box1[3], box2[1] + box2[3])
    intersection_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1)
    box1_area = (box1[2] + 1) * (box1[3] + 1)
    box2_area = (box2[2] + 1) * (box2[3] + 1)
    iou = intersection_area / float(box1_area + box2_area - intersection_area)
    return iou

def associate_face_tracks(frames_folder):
    tracks = {}
    output_folder = "output"
    output_video_name = "tracked_output.mp4"
    output_video_path = os.path.join(output_folder, output_video_name)
    
    # Get information about the first frame
    frame_path = os.path.join(frames_folder, "frame_001.png")
    first_frame = cv2.imread(frame_path)
    height, width, _ = first_frame.shape
    
    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, 24.0, (width, height))
    
    track_id = 1  # Initialize track id
    
    for i in range(1, len(os.listdir(frames_folder))):  # Loop through frames
        frame_path = os.path.join(frames_folder, f"frame_{i:03d}.png")
        frame = cv2.imread(frame_path)

        curr_faces = detect_faces(frame)

        # Get faces in the next frame
        next_frame_path = os.path.join(frames_folder, f"frame_{(i+1):03d}.png")
        next_frame = cv2.imread(next_frame_path)
        next_faces = detect_faces(next_frame)

        # Store matching tracks for current frame
        matched_tracks = {}
        
        # Associate faces with tracks
        for face1 in next_faces :
            for face2 in curr_faces:
                if compute_iou(face1, face2) > 0.5:  
                    if tuple(face2) in tracks.values():
                        for key, value in tracks.items():
                            if value == tuple(face2):
                                tracks[key] = tuple(face1)
                                matched_tracks[tuple(face2)] = key
                    else:
                        matched_tracks[tuple(face2)] = track_id
                        tracks[track_id] = tuple(face1)
                        track_id += 1

        # Draw faces and write track IDs on the frame
        for (x, y, w, h) in curr_faces:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(frame, str(matched_tracks.get(tuple((x, y, w, h)), '')), (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        out.write(frame)

    
    out.release()
    print("Tracked video saved successfully!")


In [122]:
frames_folder = "input_frames"
associate_face_tracks(frames_folder)

Tracked video saved successfully!
