In [1]:
import cv2
import time
import numpy as np
import os

# Q1: Face detection and association-based tracking [4.5 points]

## 1. [0.5 points] Data preparation.

## 2. [1.5 points] Face detection. 

In [2]:
face_cascade = cv2.CascadeClassifier('./haarcascade_frontalface_default.xml')

In [3]:
img = cv2.imread(f"./frames/output_1.jpg")
gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)

start = time.time()
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
print(time.time()-start)

0.026061058044433594


## 3. [1 point] Face detection visualization.

In [4]:
if not os.path.exists('output.mp4'):
    video_path = 'video.mp4'
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out_path = 'output.mp4'
    video_writer = cv2.VideoWriter(out_path, fourcc, fps,(frame_width,frame_height))
    
    for i in range(720):
        img = cv2.imread(f"./frames/output_{i+1}.jpg")
        gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
    
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
        for (x, y, w, h) in faces:
            cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
        video_writer.write(img)
    
        # cv2.imshow('Face Detection', img)
        # key = cv2.waitKey(0) # goes to next image only when we press smtg
        # # key = cv2.waitKey(1) # goes to next image automatically
    
        # if key == ord('q'):  # 'Esc' key
        #     cv2.destroyAllWindows()
        #     break
    
    # Release the capture
    video_writer.release()
    cv2.destroyAllWindows()

## 4. [1.5 point] Association-based tracking

In [5]:
class Tracker:
    
    def __init__(self):
        self.tracker = {
            "bbox" : [],
            "id": [],
            "last_uniq_id": -1
        }
    
    def get_IOU(self,bbox1, bbox2):
        
        x1, y1, w1, h1 = bbox1
        x2, y2, w2, h2 = bbox2
    
        x_left = max(x1, x2)
        y_top = max(y1, y2)
        x_right = min(x1 + w1, x2 + w2)
        y_bottom = min(y1 + h1, y2 + h2)
    
        if x_right < x_left or y_bottom < y_top:
            return 0.0
    
        intersection_area = (x_right - x_left) * (y_bottom - y_top)
    
        # Calculate areas of the bounding boxes
        bbox1_area = w1 * h1
        bbox2_area = w2 * h2
    
        # Calculate union area
        union_area = bbox1_area + bbox2_area - intersection_area
    
        # Calculate IoU
        iou = intersection_area / union_area
    
        return iou

    def update(self,cur_bboxes):
        for id,bbox in enumerate(self.tracker["bbox"]):
            max_iou,max_index = -1,-1
            for index,cur_bbox in enumerate(cur_bboxes):
                iou = self.get_IOU(bbox,cur_bbox)
                if (iou>max_iou):
                    max_iou = iou
                    max_index = index
                
            if max_iou < 0.5:
                self.tracker["bbox"].pop(id)
                self.tracker["id"].pop(id)
            else:
                self.tracker["bbox"][id] = cur_bboxes[max_index]
                np.delete(cur_bboxes,max_index)
        while len(cur_bboxes)!=0:
            self.tracker["last_uniq_id"]+=1
            self.tracker["bbox"].append(cur_bboxes[0])
            self.tracker["id"].append(self.tracker["last_uniq_id"])
            cur_bboxes=np.delete(cur_bboxes,0,axis=0)
        return self.tracker

In [6]:
if not os.path.exists('track_id.mp4'):
    no_uniq_tracks = 0
    track_vid = Tracker()

    # Video Writing Part
    video_path = 'video.mp4'
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out_path = 'track_id.mp4'
    # video_writer = cv2.VideoWriter(out_path, fourcc, fps,(frame_width,frame_height))
    
    for i in range(720):
        img = cv2.imread(f"./frames/output_{i+1}.jpg")
        gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
    
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
        
        # print(faces)
        
        tracker = track_vid.update(faces)
        
        # print(tracker)
        
        for index,bbox in enumerate(tracker["bbox"]):
            
            # print(bbox,type(bbox))
            x, y, w, h = bbox[0],bbox[1],bbox[2],bbox[3]
            x_min,y_min = x,y
            x_max,y_max = x+w,y+h
            
            cv2.rectangle(img, (int(x_min), int(y_min)), (int(x_max), int(y_max)), (0, 255, 0), 2)
            # Put unique ID text
            cv2.putText(img, str(tracker["id"][index]), (int(x_min), int(y_min) - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
        cv2.imwrite(f"./frames_track/img_{i}.jpg",img)
        # video_writer.write(img)
        no_uniq_tracks = tracker["last_uniq_id"]
        if (i)
        
    # Release the capture
    # video_writer.release()
    cv2.destroyAllWindows()

In [7]:
print(no_uniq_tracks)

438


# Q2: YOLO Object Detection [5.5 points]

In [8]:
for i,a in enumerate([]):
    print(i,a)