In [1]:
import cv2
import time
import numpy as np
import os

# Q1: Face detection and association-based tracking [4.5 points]

## 1. [0.5 points] Data preparation.

## 2. [1.5 points] Face detection. 

In [2]:
# Load the Haar-cascade frontal-face model from the local XML file.
face_cascade = cv2.CascadeClassifier('./haarcascade_frontalface_default.xml')
# CascadeClassifier does not raise when the file is missing or unreadable; it
# yields an empty classifier that only fails later inside detectMultiScale.
# Fail fast here so the cause is obvious.
if face_cascade.empty():
    raise FileNotFoundError(
        "Could not load cascade file './haarcascade_frontalface_default.xml'"
    )

In [3]:
# Load one sample frame and convert it to grayscale for the cascade detector.
sample_frame_path = "./frames/output_1.jpg"
img = cv2.imread(sample_frame_path)
# cv2.imread reports a missing/unreadable file by returning None, not by raising.
if img is None:
    raise FileNotFoundError(f"Could not read frame: {sample_frame_path}")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Time a single detection pass. perf_counter() is the monotonic, high-resolution
# clock meant for measuring short intervals; time.time() is wall-clock time and
# can be coarse or jump if the system clock is adjusted.
start = time.perf_counter()
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
print(time.perf_counter() - start)

0.031328439712524414


## 3. [1 point] Face detection visualization.

In [4]:
if not os.path.exists('output.mp4'):
    # Draw the detected faces on every extracted frame and save the result as a
    # video. The whole cell is skipped when 'output.mp4' already exists.
    video_path = 'video.mp4'
    out_path = 'output.mp4'
    num_frames = 720  # frames are read from ./frames/output_1.jpg .. output_720.jpg

    # The source video is opened only to copy its frame rate and frame size.
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise FileNotFoundError(f"Could not open source video: {video_path}")
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        cap.release()

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(out_path, fourcc, fps, (frame_width, frame_height))

    completed = False
    try:
        for i in range(num_frames):
            frame_path = f"./frames/output_{i+1}.jpg"
            img = cv2.imread(frame_path)
            # imread returns None for a missing/unreadable file; stop with a
            # clear message instead of an opaque cvtColor error.
            if img is None:
                raise FileNotFoundError(f"Could not read frame: {frame_path}")
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            for (x, y, w, h) in faces:
                # Plain ints, matching how the tracking cell passes coordinates.
                top_left = (int(x), int(y))
                bottom_right = (int(x + w), int(y + h))
                cv2.rectangle(img, top_left, bottom_right, (255, 0, 0), 2)
            video_writer.write(img)
        completed = True
    finally:
        # Always finalize the file handle. If the run did not finish, remove the
        # partial video so the os.path.exists guard above does not treat it as
        # a finished result on the next run.
        video_writer.release()
        if not completed and os.path.exists(out_path):
            os.remove(out_path)
    # No display windows are opened in this cell, so there is nothing to destroy.

## 4. [1.5 points] Association-based tracking

In [5]:
class Tracker:
    """Greedy IoU-based tracker that carries IDs from one frame to the next.

    State lives in ``self.tracker``, a dict with three keys:

    * ``"bbox"``: boxes ``(x, y, w, h)`` of the tracks alive after the last update,
    * ``"id"``: the track ID of each box (parallel to ``"bbox"``),
    * ``"last_uniq_id"``: the highest ID handed out so far (``-1`` before any).

    Args:
        iou_threshold: Minimum IoU between a track's previous box and a new
            detection for that detection to continue the track. Defaults to 0.5.
    """

    def __init__(self, iou_threshold=0.5):
        self.iou_threshold = iou_threshold
        self.tracker = {
            "bbox": [],
            "id": [],
            "last_uniq_id": -1
        }

    def get_IOU(self, bbox1, bbox2):
        """Return the intersection-over-union of two ``(x, y, w, h)`` boxes.

        Returns ``0.0`` when the boxes do not overlap, and also when the union
        area is zero (both boxes degenerate), instead of dividing by zero.
        """
        # Work in Python floats so fixed-width numpy integers cannot overflow
        # when areas are multiplied out.
        x1, y1, w1, h1 = (float(v) for v in bbox1)
        x2, y2, w2, h2 = (float(v) for v in bbox2)

        overlap_w = min(x1 + w1, x2 + w2) - max(x1, x2)
        overlap_h = min(y1 + h1, y2 + h2) - max(y1, y2)
        if overlap_w < 0 or overlap_h < 0:
            return 0.0

        intersection_area = overlap_w * overlap_h
        union_area = w1 * h1 + w2 * h2 - intersection_area
        if union_area <= 0:
            # Both boxes have zero area: IoU is undefined, treat as no overlap.
            return 0.0

        return intersection_area / union_area

    def update(self, cur_bboxes):
        """Associate the current frame's detections with the existing tracks.

        Tracks are visited in their stored order. Each one claims the remaining
        detection with the highest IoU against its previous box, provided that
        IoU reaches ``self.iou_threshold``; otherwise the track is dropped.
        Detections left unclaimed start new tracks with fresh IDs.

        Args:
            cur_bboxes: Sequence or array of ``(x, y, w, h)`` boxes. ``None``
                or an empty sequence means "no detections in this frame".

        Returns:
            The tracker state dict (``self.tracker``), updated in place.
        """
        if cur_bboxes is None or len(cur_bboxes) == 0:
            unmatched = []
        else:
            # Copy into our own array so stored boxes do not alias caller data.
            unmatched = list(np.array(cur_bboxes))

        kept_boxes, kept_ids = [], []
        for track_id, track_box in zip(self.tracker["id"], self.tracker["bbox"]):
            best_iou, best_index = -1.0, -1
            for index, candidate in enumerate(unmatched):
                iou = self.get_IOU(track_box, candidate)
                # Strict '>' keeps the first detection on ties.
                if iou > best_iou:
                    best_iou, best_index = iou, index

            if best_index < 0 or best_iou < self.iou_threshold:
                # No sufficiently overlapping detection: the track ends here.
                continue

            # The matched detection becomes the track's new box and is removed
            # from the pool so no other track can claim it.
            kept_boxes.append(unmatched.pop(best_index))
            kept_ids.append(track_id)

        # Every detection nobody claimed starts a new track.
        for candidate in unmatched:
            self.tracker["last_uniq_id"] += 1
            kept_boxes.append(candidate)
            kept_ids.append(self.tracker["last_uniq_id"])

        # Slice-assign so the list objects inside the state dict stay the same.
        self.tracker["bbox"][:] = kept_boxes
        self.tracker["id"][:] = kept_ids
        return self.tracker

In [6]:
if not os.path.exists('track_id.mp4'):
    # Run detection + tracking over every extracted frame, draw each track's box
    # and ID, save the annotated frames and a video, and report how many unique
    # tracks were created. Skipped entirely when 'track_id.mp4' already exists.
    video_path = 'video.mp4'
    out_path = 'track_id.mp4'
    frames_out_dir = './frames_track'
    num_frames = 720  # frames are read from ./frames/output_1.jpg .. output_720.jpg

    no_uniq_tracks = 0
    track_vid = Tracker()

    # cv2.imwrite does not create missing directories and only signals failure
    # through its return value, so make sure the target folder exists up front.
    os.makedirs(frames_out_dir, exist_ok=True)

    # The source video is opened only to copy its frame rate and frame size.
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise FileNotFoundError(f"Could not open source video: {video_path}")
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        cap.release()

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(out_path, fourcc, fps, (frame_width, frame_height))

    completed = False
    try:
        for i in range(num_frames):
            frame_path = f"./frames/output_{i+1}.jpg"
            img = cv2.imread(frame_path)
            # imread returns None for a missing/unreadable file; stop with a
            # clear message instead of an opaque cvtColor error.
            if img is None:
                raise FileNotFoundError(f"Could not read frame: {frame_path}")
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            tracker = track_vid.update(faces)

            for track_id, bbox in zip(tracker["id"], tracker["bbox"]):
                x, y, w, h = bbox[0], bbox[1], bbox[2], bbox[3]
                top_left = (int(x), int(y))
                bottom_right = (int(x + w), int(y + h))

                cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)
                # Put the track's unique ID just above its box.
                cv2.putText(img, str(track_id), (top_left[0], top_left[1] - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

            cv2.imwrite(f"./frames_track/img_{i}.jpg", img)
            video_writer.write(img)
            # IDs start at 0, so the number of tracks created so far is the
            # highest ID handed out plus one.
            no_uniq_tracks = tracker["last_uniq_id"] + 1

        print(no_uniq_tracks)
        completed = True
    finally:
        # Always finalize the file handle. If the run did not finish, remove the
        # partial video so the os.path.exists guard above does not treat it as
        # a finished result on the next run.
        video_writer.release()
        if not completed and os.path.exists(out_path):
            os.remove(out_path)
    # No display windows are opened in this cell, so there is nothing to destroy.

# Q2: YOLO Object Detection [5.5 points]

## 1. [0.5 point] Data preparation.

**Training and Validation Data:**

1. train/images: This folder contains the training images (e.g., *.jpg files).
2. train/labels: This folder contains the annotation files (e.g., *.txt files) corresponding to the training images.
3. valid/images: This folder contains the validation (or test) images.
4. valid/labels: This folder contains the annotation files corresponding to the validation images.

## 2. [1 point] Understanding YOLO object detector.

The YOLO (You Only Look Once) object detector is a pioneering deep learning-based approach for real-time object detection in images and video. Unlike two-stage methods that first generate region proposals (such as the R-CNN series), YOLO is a single-shot detector. This means it predicts bounding boxes and class probabilities directly from the full image in a single network evaluation, making it extremely fast.

**Difference between YOLO and R-CNN Series:**

1. **Single-shot vs. Two-stage Detection**:
   - YOLO is a single-shot detector, meaning it processes the entire image at once to predict bounding boxes and class probabilities. It doesn't require a separate region proposal step.
   - R-CNN series (like Faster R-CNN) are two-stage detectors. They first propose regions of interest (RoIs) using a region proposal network (RPN) and then use a classifier to classify these proposed regions.

2. **Speed and Simplicity**:
   - YOLO is faster compared to the R-CNN series because it doesn't need multiple passes through the network for region proposals and classification.
   - R-CNN series, although accurate, are slower due to their two-stage process involving region proposals and subsequent classification.

3. **End-to-End Learning**:
   - YOLO learns to detect objects in an end-to-end manner. The loss function considers both localization and classification tasks jointly during training.
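   - Early R-CNN variants train the region-proposal and classification stages separately, which makes the training process more complex. Faster R-CNN can train its RPN and detection head jointly, but it still optimizes two coupled sub-networks rather than a single regression problem.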

**Different Versions of YOLO:**

1. **YOLOv1**:
   - The original YOLO model introduced the concept of dividing the input image into a grid and predicting bounding boxes and class probabilities directly from this grid. It had limitations in handling small objects and suffered from localization errors.

2. **YOLOv3**:
   - YOLOv3 addressed the limitations of YOLOv1 by building on the anchor boxes introduced in YOLOv2 (which handle different aspect ratios) and adding a deeper Darknet-53 backbone, feature pyramid network (FPN)-style predictions at three scales for multi-scale feature extraction, and logistic regression for objectness scores and independent per-class predictions.

3. **YOLOv4**:
   - YOLOv4 further improved speed and accuracy by incorporating advanced techniques like the use of bag-of-freebies (data augmentation, multi-scale training), bag-of-specials (Mish activation, PANet feature pyramid), and optimized architecture design (CSPDarknet53 as backbone).

## 3. [1 point] Hands-on with Ultralytics.