In [1]:
import cv2
import time
import numpy as np
import os

# Q1: Face detection and association-based tracking [4.5 points]

## 1. [0.5 points] Data preparation.

## 2. [1.5 points] Face detection. 

In [2]:
img = cv2.imread(f"./frames/output_1.jpg")
gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)

start = time.time()
face_cascade = cv2.CascadeClassifier('./haarcascade_frontalface_default.xml')
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=50, minSize=(30, 30))
print("minNeighbors = 50,num_stages=25",time.time()-start)

start = time.time()
face_cascade = cv2.CascadeClassifier('./haarcascade_frontalface_default-Copy1.xml')
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
print("minNeighbors = 5,num_stages = 20",time.time()-start)

minNeighbors = 50,num_stages=25 0.048168182373046875
minNeighbors = 5,num_stages = 20 0.04134011268615723


1. **Number of stages in the cascade:** The XML file contains information about the cascade structure. Each stage has a specific number of features to evaluate. Classifiers with more stages (typically for higher accuracy) take longer to process.

1. **Minimum number of neighbors:** This parameter specifies how many neighboring detections are required to confirm a face. Evaluating more neighbors takes more time.

## 3. [1 point] Face detection visualization.

In [3]:
face_cascade = cv2.CascadeClassifier('./haarcascade_frontalface_alt.xml')
if not os.path.exists('output.mp4'):
    video_path = 'video.mp4'
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out_path = 'output.mp4'
    video_writer = cv2.VideoWriter(out_path, fourcc, fps,(frame_width,frame_height))
    
    for i in range(720):
        img = cv2.imread(f"./frames/output_{i+1}.jpg")
        gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
    
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
        for (x, y, w, h) in faces:
            cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
        video_writer.write(img)
    
        # cv2.imshow('Face Detection', img)
        # key = cv2.waitKey(0) # goes to next image only when we press smtg
        # # key = cv2.waitKey(1) # goes to next image automatically
    
        # if key == ord('q'):  # 'Esc' key
        #     cv2.destroyAllWindows()
        #     break
    
    # Release the capture
    video_writer.release()
    cv2.destroyAllWindows()

#### Face Detection Visualization Analysis

**Video Link:** [Face Detection Video](https://drive.google.com/file/d/1z9JU6s31nj9wGWHrQglsO6SobZXHmJGO/view?usp=sharing)

**Successful Detection Conditions**:
  - Works well with clear, unobstructed views of faces.
  - Effective in identifying faces without occlusion or motion blur.

**Failure Scenarios**:
  - Fails to detect faces from side or other views.
  - Struggles in environments with complex or cluttered backgrounds, leading to false positives.
  - Faces at a distance or in low-resolution frames may be missed entirely.

## 4. [1.5 point] Association-based tracking

In [4]:
class Tracker:
    
    def __init__(self):
        self.tracker = {
            "bbox" : [],
            "id": [],
            "last_uniq_id": -1
        }
    
    def get_IOU(self,bbox1, bbox2):
        
        x1, y1, w1, h1 = bbox1
        x2, y2, w2, h2 = bbox2
    
        x_left = max(x1, x2)
        y_top = max(y1, y2)
        x_right = min(x1 + w1, x2 + w2)
        y_bottom = min(y1 + h1, y2 + h2)
    
        if x_right < x_left or y_bottom < y_top:
            return 0.0
    
        intersection_area = (x_right - x_left) * (y_bottom - y_top)
    
        # Calculate areas of the bounding boxes
        bbox1_area = w1 * h1
        bbox2_area = w2 * h2
    
        # Calculate union area
        union_area = bbox1_area + bbox2_area - intersection_area
    
        # Calculate IoU
        iou = intersection_area / union_area
    
        return iou

    def update(self,cur_bboxes):
        # print(cur_bboxes,self.tracker)
        temp = self.tracker["bbox"].copy()
        # for id,bbox in enumerate(self.tracker["bbox"]):
        a=0
        for id,bbox in enumerate(temp):
            # print(id,bbox)
            max_iou,max_index = -1,-1
            for index,cur_bbox in enumerate(cur_bboxes):
                iou = self.get_IOU(bbox,cur_bbox)
                if (iou>max_iou):
                    max_iou = iou
                    max_index = index
                
            if max_iou < 0.5:
                # print("no max_iou")
                self.tracker["bbox"].pop(id-a)
                self.tracker["id"].pop(id-a)
                a+=1
            else:
                # print("max_iou found",max_index)
                self.tracker["bbox"][id-a] = cur_bboxes[max_index]
                cur_bboxes = np.delete(cur_bboxes,max_index,axis=0)
            # print(cur_bboxes,self.tracker)
        while len(cur_bboxes)!=0:
            self.tracker["last_uniq_id"]+=1
            self.tracker["bbox"].append(cur_bboxes[0])
            self.tracker["id"].append(self.tracker["last_uniq_id"])
            cur_bboxes=np.delete(cur_bboxes,0,axis=0)
        return self.tracker

In [5]:
if not os.path.exists('track_id.mp4'):
    no_uniq_tracks = 0
    track_vid = Tracker()
    
    # Video Writing Part
    video_path = 'video.mp4'
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out_path = 'track_id.mp4'
    video_writer = cv2.VideoWriter(out_path, fourcc, fps,(frame_width,frame_height))
    
    for i in range(720):
        img = cv2.imread(f"./frames/output_{i+1}.jpg")
        gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
    
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
        
        # print(faces)
        
        tracker = track_vid.update(faces)
        
        # print("==============",i)
        
        for index,bbox in enumerate(tracker["bbox"]):
            
            # print(bbox,type(bbox))
            x, y, w, h = bbox[0],bbox[1],bbox[2],bbox[3]
            x_min,y_min = x,y
            x_max,y_max = x+w,y+h
            
            cv2.rectangle(img, (int(x_min), int(y_min)), (int(x_max), int(y_max)), (0, 255, 0), 2)
            # Put unique ID text
            cv2.putText(img, str(tracker["id"][index]), (int(x_min), int(y_min) - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
        cv2.imwrite(f"./frames_track/img_{i}.jpg",img)
        video_writer.write(img)
        no_uniq_tracks = tracker["last_uniq_id"]

    print(no_uniq_tracks)
    # Release the capture
    video_writer.release()
    cv2.destroyAllWindows()

No of unique Trackers = 49

#### Face Tracking Analysis

**Video Link with Tracking IDs:** [Face Tracking Video](https://drive.google.com/file/d/1Jg7cPimln5NpuEXlVrfzlh3xDrlG9ZLK/view?usp=sharing)

- **Distinct Track IDs for Individuals**:
  - Different people generally receive unique track IDs, maintaining separation between individuals throughout the video.

- **Challenges with Occlusion**:
  - Occlusion may lead to multiple faces being associated with a single track ID (IoU > 0.5), especially when faces overlap or are close together.

- **Failure Cases and Recommendations**:
  - **0:00**: Motion blur causes a girl to be undetected in the next frame, resulting in a new track ID for her in subsequent frames.
  - **0:03**: False positive detections from the background create unwanted new track IDs, affecting tracking accuracy.
  - **0:05**: Small or low-resolution faces may not be detected, disrupting tracking continuity.

##### Recommendations for Improvement

- **Adjust Tracker Removal Criteria**:
  - Extend the duration (e.g., to 5 frames) before removing a tracker to accommodate brief occlusions or missed detections.

- **Explore Advanced Matching Techniques**:
  - Implement alternative methods like template matching or color-based matching to improve face association in challenging scenarios such as occlusion or complex backgrounds.


# Q2: YOLO Object Detection [5.5 points]

## 1. [0.5 point] Data preparation.

In [None]:
# !pip install kaggle
# !kaggle datasets download -d haziqasajid5122/yolov8-finetuning-dataset-ducks
# !unzip yolov8-finetuning-dataset-ducks.zip

**Training and Validation Data:**

1. train/images: This folder contains the training images (e.g., *.jpg files).
2. train/labels: This folder contains the annotation files (e.g., *.txt files) corresponding to the training images.
3. valid/images: This folder contains the validation (or test) images.
4. valid/labels: This folder contains the annotation files corresponding to the validation images.

## 2. [1 point] Understanding YOLO object detector.

- Yolo is a single shot detector which predicts both bounding boxes and class probabilities in a single go unlike R-CNN series
- In R-CNN series, it first proposes a region of interest using Region Proposal Network and now runs a classifier on each of these ROI to classify these proposed regions.
- This two-stage process makes R-CNN series much slower than Yolo which requires only one pass through the network.
- In R-CNN series, we have to train the region proposal network and the classification networks separately, while YOLO being end to end considers both localisation and classification tasks in the loss function jointly during training.

**Yolo Series**
1. YOLOv1:
- It divides the input images into a grid and predicts bounding boxes and class probabilities directly from the grid.But using this approach the model could not detect small objects.
  
2. YOLOv3:
- It introduced a Feature Pyramid Network for multi-scale extraction.
- It changed its backbone to Darknet-53 for feature extraction.
- It predicts bounding box using logistic regression and it used anchor boxes to handle different aspect ratios

3. YOLOv5:
- It changed its backbone to a much more complex architechture EfficientDet
- It introduced dynamic anchor boxes.

## 3. [1 points] Hands on with ultralytics.

In [None]:
import random
import os
import glob
import shutil
import yaml
import torch
from ultralytics import YOLO

In [None]:
model = YOLO('yolov8n.yaml')  # build a new model from YAML

# Print the total number of parameters
print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")

# Count the convolutional layers (excluding BatchNorm2d and other non-convolutional layers)
num_conv_layers = 0
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Conv2d):
        num_conv_layers += 1

print(f"Number of Convolutional Layers: {num_conv_layers}")

In [None]:
model = YOLO('yolov8m.yaml')  # build a new model from YAML

# Print the total number of parameters
print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")

# Count the convolutional layers (excluding BatchNorm2d and other non-convolutional layers)
num_conv_layers = 0
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Conv2d):
        num_conv_layers += 1

print(f"Number of Convolutional Layers: {num_conv_layers}")

## 4. [2 points] Training YOLO variants.

#### (i) Create two versions of the training dataset

In [None]:
def is_image_file(filename):
    image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp']
    _, ext = os.path.splitext(filename)
    return ext.lower() in image_extensions
    
def get_image_paths(directory):
    image_paths = []
    
    for file in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, file)):
            if is_image_file(file):
                image_paths.append(os.path.join(directory, file))    
    return image_paths

image_paths = get_image_paths("./datasets/archive/images/train")
print(len(image_paths))

#### (ii) train three variants of the Yolo v8 models

In [None]:
def remove_cache_files(directory):
    # Iterate through all the directories and files
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.cache'):
                # Construct the full path to the file
                file_path = os.path.join(root, file)
                try:
                    # Attempt to remove the file
                    os.remove(file_path)
                    print(f"Removed: {file_path}")
                except OSError as e:
                    print(f"Error: {file_path} - {e}")

In [None]:
def train_model(model_name, config_path, epochs):

    if "train1" in config_path:
        project = "yolov8_100img"
    else:
        project = "yolov8_400img"
        
    pretrained=True
    if "yaml" in model_name:
        name = "8n_scratch"
        pretrained=False
    elif "8n.pt" in model_name:
        name = "8n_pretrained"
    elif "8m.pt" in model_name:
        name = "8m_pretrained"


    destination = f"./{name}_{project[-6:]}.pt"

    if not os.path.exists(destination):
        model = YOLO(model_name)
        results = model.train(data=config_path, epochs=epochs, project=project, name=name, exist_ok=True,pretrained=pretrained,workers=8)
        source = f"./{project}/{name}/weights/best.pt"
        shutil.move(source, destination)
        !rm -rf {project}
        !rm -rf {"wandb/"}
        remove_cache_files("./datasets/")
        return results
    else:
        print("already trained")
        return None

In [None]:
# Train each variant
variants = [
    ("yolov8n.yaml", "./datasets/archive/train1_config.yaml", 20),
    ("yolov8n.pt", "./datasets/archive/train1_config.yaml", 20),
    ("yolov8m.pt", "./datasets/archive/train1_config.yaml", 20),
    ("yolov8n.yaml", "./datasets/archive/config.yaml", 20),
    ("yolov8n.pt", "./datasets/archive/config.yaml", 20),
    ("yolov8m.pt", "./datasets/archive/config.yaml", 20),
]

results=[]
for variant_name, config_path, epochs in variants:
    print(f"Training: {variant_name}")
    results.append(train_model(variant_name, config_path, epochs))

#### (iii) Report and compare the results (AP50)

In [None]:
def is_weights_file(filename):
    # image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp']
    weights_extensions = [".pt"]
    _, ext = os.path.splitext(filename)
    return ext.lower() in weights_extensions
    
def get_weights_paths(directory):
    image_paths = []
    
    for file in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, file)):
            if is_weights_file(file):
                if("img") in file:
                    image_paths.append(os.path.join(directory, file))    
    return image_paths

weight_paths = get_weights_paths("./")
weight_paths.sort()
print(weight_paths)

In [None]:
if not os.path.exists("./results.yaml"):
    results = {}
    
    for weight in weight_paths:
        model = YOLO(weight)
        
        temp = "train1" if "100" in weight else "train"
        
        with open("./datasets/archive/config.yaml", 'r') as file:
            config = yaml.safe_load(file)
            config['val'] = f"images/{temp}"
        
        with open("./datasets/archive/val_config.yaml", 'w') as new_file:
            yaml.dump(config, new_file)
    
        remove_cache_files("./datasets/")
        val_map50 = model.val(data='./datasets/archive/config.yaml',workers=8).box.map50
        remove_cache_files("./datasets/")
        train_map50 = model.val(data='./datasets/archive/val_config.yaml',workers=8).box.map50 
        
        results[weight] = {
            'train_map50': str(train_map50),
            'val_map50': str(val_map50)
        }
    
    with open('results.yaml', 'w') as results_file:
        yaml.dump(results, results_file, default_flow_style=False)
    
    print("Results saved to results.yaml file.")
else:
    with open("./results.yaml", 'r') as file:
        results = yaml.safe_load(file)

In [None]:
results

(a) Increasing dataset size generally improves model performance. For `yolov8m` (larger model), there's a slight decrease in validation mAP@50 (from `0.765` to `0.682`) when moving from 100 to 400 images, potentially due to overfitting. In contrast, `yolov8n` (smaller model) shows significant improvement (`0.620` to `0.745`) with more data, indicating better generalization.

(b) The larger model (`yolov8m`) outperforms the smaller model (`yolov8n`) in most cases. `yolov8m` achieves higher validation mAP@50 (`0.765` for 100 images and `0.682` for 400 images) compared to `yolov8n` (`0.620` for 100 images and `0.745` for 400 images). This is likely due to `yolov8m`'s increased capacity to learn complex patterns, though it may suffer from overfitting with smaller datasets.

#### (iv) Visualize

In [None]:
model = YOLO("./8m_pretrained_100img.pt")

In [None]:
image_paths = get_image_paths("./datasets/archive/images/val")
print(len(image_paths))

In [None]:
train1_images = random.sample(image_paths, 4)
results = model.predict(train1_images)

In [None]:
# import matplotlib.pyplot as plt
for i,result in enumerate(results):
    result.save(f"result{i}.png")

## 5. [1 point] Impact of augmentations.

In [None]:
import numpy.random 
print(numpy.random.__file__) 