In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from torchmetrics.detection import IntersectionOverUnion
from PIL import Image
import os
import cv2
import numpy as np
import json
import shutil
from torchsummary import summary
import csv

In [2]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("icaslab/dataset-face-detection-for-edge-computing-class")
print("Path to dataset files:", path)

# Download latest version
path = kagglehub.dataset_download("reganwillis/testset")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/dataset-face-detection-for-edge-computing-class
Path to dataset files: /kaggle/input/testset


In [4]:
#Data base addresses
base_adress = '/kaggle/input/dataset-face-detection-for-edge-computing-class/Dataset_FDDB/Dataset_FDDB/images'
labels_adr = '/kaggle/input/dataset-face-detection-for-edge-computing-class/Dataset_FDDB/Dataset_FDDB/label.txt'

In [5]:
# Make the labels ready

with open(labels_adr, 'r') as f:
    lines = f.readlines()
annotations = []
bboxes = []
flag = False
for line in lines:
    if line.startswith('#'):
      if flag:
        annotations.append({'image':img_name, 'bboxes': bboxes})
        bboxes = []
      flag = True
      img_name = line[2:]
    else:
      x_min, y_min, x_max, y_max = line.split()
      bboxes.append([int(x_min), int(y_min), int(x_max), int(y_max)])

In [6]:
# Custom Dataset Class for FDDB
class FDDBDataset(Dataset):
    def __init__(self, img_dir, annot_file, target_size=(224, 224), transform=None):
        self.img_dir = img_dir
        self.target_size = target_size
        self.transform = transform
        self.data = self._parse_annotations(annot_file)

    def _parse_annotations(self, annot_file):
        
        data = []
        for el in annot_file:
          img_path = os.path.join(self.img_dir, el['image'][:-1])
          boxes = el['bboxes']
          data.append((img_path, boxes))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, boxes = self.data[idx]
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Image not found: {img_path}")

        # Original dimensions
        h, w, _ = image.shape

        # Resize image
        image_resized = cv2.resize(image, self.target_size)
        target_h, target_w = self.target_size

        # Scale bounding boxes
        scale_x = target_w / w
        scale_y = target_h / h
        boxes_resized = []
        for box in boxes:
            x_min = int(box[0] * scale_x)
            y_min = int(box[1] * scale_y)
            x_max = int(box[2] * scale_x)
            y_max = int(box[3] * scale_y)
            boxes_resized.append([x_min, y_min, x_max, y_max])

        # Convert to tensor
        if self.transform:
            image_resized = self.transform(image_resized)
        else:
            image_resized = transforms.ToTensor()(image_resized)

        return image_resized, torch.tensor(boxes_resized, dtype=torch.float32)

In [7]:
# DataLoader preparation
def get_dataloaders(img_dir, annot_file, batch_size=16, target_size=(224, 224), validation_split=0.2):

    # Transformations
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Dataset
    dataset = FDDBDataset(img_dir, annot_file, target_size, transform)

    # Split dataset
    val_size = int(len(dataset) * validation_split)
    train_size = len(dataset) - val_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    # DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_loader, val_loader

def collate_fn(batch):
    """
    Custom collate function to handle variable-length bounding box arrays.

    :param batch: List of tuples (image, boxes).
    :return: Tuple of images and targets.
    """
    images = torch.stack([item[0] for item in batch])
    targets = [item[1] for item in batch]
    return images, targets

In [8]:
class MobileNetFaceDetector(nn.Module):
    def __init__(self, pretrained=False):
        super().__init__()

        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=32,
            kernel_size=3,
            stride=2,
            padding=1
        )
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU6()

        # bottleneck
        #hidden_dim = 32
        self.bottleneck1 = nn.Sequential(
            #nn.Conv2d(in_channels=32, out_channels=32, kernel_size=1, stride=1, padding=0, bias=False),
            #nn.BatchNorm2d(32),
            #nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1, groups=32, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=32, out_channels=16, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(16),
        )
        #hidden_dim = 16*6
        self.bottleneck2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=16*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(16*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=16*6, out_channels=16*6, kernel_size=3, stride=2, padding=1, groups=16*6, bias=False),
            nn.BatchNorm2d(16*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=16*6, out_channels=24, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(24),
        )
        self.bottleneck21 = nn.Sequential(
            nn.Conv2d(in_channels=24, out_channels=24*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(24*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=24*6, out_channels=24*6, kernel_size=3, stride=1, padding=1, groups=24*6, bias=False),
            nn.BatchNorm2d(24*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=24*6, out_channels=24, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(24),
        )
        #hidden_dim = 24*6
        self.bottleneck3 = nn.Sequential(
            nn.Conv2d(in_channels=24, out_channels=24*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(24*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=24*6, out_channels=24*6, kernel_size=3, stride=2, padding=1, groups=24*6, bias=False),
            nn.BatchNorm2d(24*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=24*6, out_channels=32, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(32),
        )
        self.bottleneck31 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(32*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=32*6, out_channels=32*6, kernel_size=3, stride=1, padding=1, groups=32*6, bias=False),
            nn.BatchNorm2d(32*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=32*6, out_channels=32, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(32),
        )
        self.bottleneck32 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(32*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=32*6, out_channels=32*6, kernel_size=3, stride=1, padding=1, groups=32*6, bias=False),
            nn.BatchNorm2d(32*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=32*6, out_channels=32, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(32),
        )
        #hidden_dim=32*6
        self.bottleneck4 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(32*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=32*6, out_channels=32*6, kernel_size=3, stride=2, padding=1, groups=32*6, bias=False),
            nn.BatchNorm2d(32*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=32*6, out_channels=64, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(64),
        )
        self.bottleneck41 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=64*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(64*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=64*6, out_channels=64*6, kernel_size=3, stride=1, padding=1, groups=64*6, bias=False),
            nn.BatchNorm2d(64*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=64*6, out_channels=64, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(64),
        )
        self.bottleneck42 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=64*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(64*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=64*6, out_channels=64*6, kernel_size=3, stride=1, padding=1, groups=64*6, bias=False),
            nn.BatchNorm2d(64*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=64*6, out_channels=64, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(64),
        )
        self.bottleneck43 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=64*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(64*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=64*6, out_channels=64*6, kernel_size=3, stride=1, padding=1, groups=64*6, bias=False),
            nn.BatchNorm2d(64*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=64*6, out_channels=64, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(64),
        )
        #hidden_dim=64*6
        self.bottleneck5 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=64*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(64*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=64*6, out_channels=64*6, kernel_size=3, stride=1, padding=1, groups=64*6, bias=False),
            nn.BatchNorm2d(64*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=64*6, out_channels=96, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(96),
        )
        self.bottleneck51 = nn.Sequential(
            nn.Conv2d(in_channels=96, out_channels=96*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(96*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=96*6, out_channels=96*6, kernel_size=3, stride=1, padding=1, groups=96*6, bias=False),
            nn.BatchNorm2d(96*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=96*6, out_channels=96, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(96),
        )
        self.bottleneck52 = nn.Sequential(
            nn.Conv2d(in_channels=96, out_channels=96*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(96*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=96*6, out_channels=96*6, kernel_size=3, stride=1, padding=1, groups=96*6, bias=False),
            nn.BatchNorm2d(96*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=96*6, out_channels=96, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(96),
        )
        #hidden_dim=96*6
        self.bottleneck6 = nn.Sequential(
            nn.Conv2d(in_channels=96, out_channels=96*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(96*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=96*6, out_channels=96*6, kernel_size=3, stride=2, padding=1, groups=96*6, bias=False),
            nn.BatchNorm2d(96*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=96*6, out_channels=160, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(160),
        )
        self.bottleneck61 = nn.Sequential(
            nn.Conv2d(in_channels=160, out_channels=160*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(160*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=160*6, out_channels=160*6, kernel_size=3, stride=1, padding=1, groups=160*6, bias=False),
            nn.BatchNorm2d(160*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=160*6, out_channels=160, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(160),
        )
        self.bottleneck62 = nn.Sequential(
            nn.Conv2d(in_channels=160, out_channels=160*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(160*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=160*6, out_channels=160*6, kernel_size=3, stride=1, padding=1, groups=160*6, bias=False),
            nn.BatchNorm2d(160*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=160*6, out_channels=160, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(160),
        )
        #hidden_dim=160*6
        self.bottleneck7 = nn.Sequential(
            nn.Conv2d(in_channels=160, out_channels=160*6, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(160*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=160*6, out_channels=160*6, kernel_size=3, stride=1, padding=1, groups=160*6, bias=False),
            nn.BatchNorm2d(160*6),
            nn.ReLU6(inplace=True),
            nn.Conv2d(in_channels=160*6, out_channels=320, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(320),
        )

        self.conv2 = nn.Conv2d(
            in_channels=320,
            out_channels=1280,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False
        )
        self.bn2 = nn.BatchNorm2d(1280)
        self.relu2 = nn.ReLU6()

        self.pool = nn.AdaptiveAvgPool2d(1)

        # Custom head for bounding box and classification
        self.fc_bbox = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(512, 4),  # Bounding box: [x_min, y_min, x_max, y_max]
        )
        self.fc_label = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.Linear(512, 1),  # Binary classification: face/no face
            nn.Sigmoid(),
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)

        x = self.bottleneck1(x)
        x = self.bottleneck2(x)
        x = self.bottleneck21(x)
        x = self.bottleneck3(x)
        x = self.bottleneck31(x)
        x = self.bottleneck32(x)
        x = self.bottleneck4(x)
        x = self.bottleneck41(x)
        x = self.bottleneck42(x)
        x = self.bottleneck43(x)
        x = self.bottleneck5(x)
        x = self.bottleneck51(x)
        x = self.bottleneck52(x)
        x = self.bottleneck6(x)
        x = self.bottleneck61(x)
        x = self.bottleneck62(x)
        x = self.bottleneck7(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)

        x = self.pool(x).view(x.size(0), -1)
        bbox = self.fc_bbox(x)
        label = self.fc_label(x)
        return bbox, label
mobilenetfacedetector = MobileNetFaceDetector().to(device)
summary(mobilenetfacedetector, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 112, 112]             896
       BatchNorm2d-2         [-1, 32, 112, 112]              64
             ReLU6-3         [-1, 32, 112, 112]               0
            Conv2d-4         [-1, 32, 112, 112]             288
       BatchNorm2d-5         [-1, 32, 112, 112]              64
             ReLU6-6         [-1, 32, 112, 112]               0
            Conv2d-7         [-1, 16, 112, 112]             512
       BatchNorm2d-8         [-1, 16, 112, 112]              32
            Conv2d-9         [-1, 96, 112, 112]           1,536
      BatchNorm2d-10         [-1, 96, 112, 112]             192
            ReLU6-11         [-1, 96, 112, 112]               0
           Conv2d-12           [-1, 96, 56, 56]             864
      BatchNorm2d-13           [-1, 96, 56, 56]             192
            ReLU6-14           [-1, 96,

In [9]:
class YOLOFaceDetector(nn.Module):
    def __init__(self, pretrained=False):
        super().__init__()
        # architecture from original YOLO paper - Fast YOLO
        # architecture improved according to YOLOv2 paper

        # conv layer 1
        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=int(64/2),
            kernel_size=7,
            stride=2,
            padding=1
        )
        self.bn1 = nn.BatchNorm2d(int(64/2))
        self.relu1 = nn.LeakyReLU(negative_slope=0.1)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv2d(
            in_channels=int(64/2),
            out_channels=int(192/2),
            kernel_size=3,
            stride=1,
            padding=1
        )
        self.bn2 = nn.BatchNorm2d(int(192/2))
        self.relu2 = nn.LeakyReLU(negative_slope=0.1)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv2d(
            in_channels=int(192/2),
            out_channels=int(128/2),
            kernel_size=1,
            stride=1,
            padding=1
        )
        self.bn3 = nn.BatchNorm2d(int(128/2))
        self.relu3 = nn.LeakyReLU(negative_slope=0.1)
        self.conv4 = nn.Conv2d(
            in_channels=int(128/2),
            out_channels=int(256/2),
            kernel_size=3,
            stride=1,
            padding=1
        )
        self.bn4 = nn.BatchNorm2d(int(256/2))
        self.relu4 = nn.LeakyReLU(negative_slope=0.1)
        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv7 = nn.Conv2d(
            in_channels=int(256/2),
            out_channels=int(256/2),
            kernel_size=1,
            stride=1,
            padding=1
        )
        self.bn7 = nn.BatchNorm2d(int(256/2))
        self.relu7 = nn.LeakyReLU(negative_slope=0.1)
        self.conv8 = nn.Conv2d(
            in_channels=int(256/2),
            out_channels=int(512/2),
            kernel_size=3,
            stride=1,
            padding=1
        )
        self.bn8 = nn.BatchNorm2d(int(512/2))
        self.relu8 = nn.LeakyReLU(negative_slope=0.1)
        self.maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        self.conv17 = nn.Conv2d(
            in_channels=int(512/2),
            out_channels=int(512/2),
            kernel_size=1,
            stride=1,
            padding=1
        )
        self.bn17 = nn.BatchNorm2d(int(512/2))
        self.relu17 = nn.LeakyReLU(negative_slope=0.1)
        self.conv18 = nn.Conv2d(
            in_channels=int(512/2),
            out_channels=int(1024/2),
            kernel_size=3,
            stride=1,
            padding=1
        )
        self.bn18 = nn.BatchNorm2d(int(1024/2))
        self.relu18 = nn.LeakyReLU(negative_slope=0.1)

        self.conv24 = nn.Conv2d(
            in_channels=int(1024/2),
            out_channels=int(1024/2),
            kernel_size=3,
            stride=1,
            padding=1
        )
        self.bn24 = nn.BatchNorm2d(int(1024/2))
        self.relu24 = nn.LeakyReLU(negative_slope=0.1)

        self.flatten = nn.Flatten()

        # regression head
        self.bbox = nn.Sequential(
            nn.Linear(int(1024/2)*10*10, int(4096/2)),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Dropout(p=0.5),
            nn.Linear(int(4096/2), 4)
        )
        
        # classification head
        self.classify = nn.Sequential(
            nn.Linear(int(1024/2)*10*10, int(4096/2)),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Dropout(p=0.5),
            nn.Linear(int(4096/2), 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        # pass through feature extractor backbone
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.conv4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        x = self.maxpool3(x)

        x = self.conv7(x)
        x = self.bn7(x)
        x = self.relu7(x)
        x = self.conv8(x)
        x = self.bn8(x)
        x = self.relu8(x)
        x = self.maxpool4(x)

        x = self.conv17(x)
        x = self.bn17(x)
        x = self.relu17(x)
        x = self.conv18(x)
        x = self.bn18(x)
        x = self.relu18(x)
        x = self.conv24(x)
        x = self.bn24(x)
        x = self.relu24(x)

        #_, c, h, w = x.shape
        #print(c, h, w)
        
        x = self.flatten(x)
        
        bbox = self.bbox(x)
        label = self.classify(x)

        return bbox, label
yolofacedetector = YOLOFaceDetector().to(device)
summary(yolofacedetector, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 110, 110]           4,736
       BatchNorm2d-2         [-1, 32, 110, 110]              64
         LeakyReLU-3         [-1, 32, 110, 110]               0
         MaxPool2d-4           [-1, 32, 55, 55]               0
            Conv2d-5           [-1, 96, 55, 55]          27,744
       BatchNorm2d-6           [-1, 96, 55, 55]             192
         LeakyReLU-7           [-1, 96, 55, 55]               0
         MaxPool2d-8           [-1, 96, 27, 27]               0
            Conv2d-9           [-1, 64, 29, 29]           6,208
      BatchNorm2d-10           [-1, 64, 29, 29]             128
        LeakyReLU-11           [-1, 64, 29, 29]               0
           Conv2d-12          [-1, 128, 29, 29]          73,856
      BatchNorm2d-13          [-1, 128, 29, 29]             256
        LeakyReLU-14          [-1, 128,

In [10]:
from torch.optim.lr_scheduler import LambdaLR

def step_lr(epoch):
    if epoch < 5:
        return (epoch+1)/5
    elif epoch >= 5 and epoch < 30:
        return 1.0
    elif epoch >= 30 and epoch < 40:
        return 0.1
    else:
        return 0.01

In [11]:
# Loss functions
bbox_loss_fn = nn.SmoothL1Loss()  # For bounding box regression
label_loss_fn = nn.BCELoss()      # For binary classification

In [12]:
# Training loop
def train_model(train_loader, val_loader, num_epochs, scheduler, model_id, model, arch, learning_rate, batch_size):
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    if scheduler:
        scheduler_fn = LambdaLR(optimizer, step_lr)

    best_val_loss = float('inf')  # Initialize best validation loss
    best_model_path = f"{arch}_{model_id}.pth"  # Path to save the best model
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for images, targets in train_loader:
            images = images.to(device)
            bboxes = [torch.tensor(t, dtype=torch.float32).to(device) for t in targets]  # List of bounding boxes
            labels = [int(1) for t in targets]  # List of labels
            labels = torch.tensor(labels, dtype=torch.float32).to(device)
            preds_bbox, preds_label = model(images)
            # Compute losses
            bbox_losses = []
            label_losses = []
            for i in range(len(bboxes)):
              bbox_losses.append(bbox_loss_fn(preds_bbox[i], bboxes[i]))
              label_losses.append(label_loss_fn(preds_label[i], labels[i].unsqueeze(-1)))

            bbox_loss = torch.mean(torch.stack(bbox_losses))
            label_loss = torch.mean(torch.stack(label_losses))
            loss = bbox_loss + label_loss

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        if scheduler:
            scheduler_fn.step()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}")

        # Validate and save the best model
        val_loss = validate_model(model, val_loader)
        if val_loss < best_val_loss:
            print(f"Validation loss improved from {best_val_loss:.4f} to {val_loss:.4f}. Saving model...")
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_path)

    print("Training complete. Best model saved as:", best_model_path)
    return best_val_loss.item()


In [13]:
def normalize_boxes(preds):
    """
    Normalize the 'boxes' in the predictions to ensure they are all tensors of shape [N, 4].
    Args:
        preds: List of dictionaries with 'boxes' and 'labels'.
    Returns:
        Normalized predictions with 'boxes' as tensors of shape [N, 4].
    """
    for pred in preds:
        # If boxes is a list of tensors, stack them into a single tensor
        if isinstance(pred['boxes'], list):
            pred['boxes'] = torch.stack(pred['boxes'])  # Stack into [N, 4]
    return preds

In [14]:
def calculate_iou(pred_box, gt_box):
    """
    Calculate IoU (Intersection over Union) for a single pair of boxes.
    Args:
        pred_box: Tensor of shape (4,), [x_min, y_min, x_max, y_max].
        gt_box: Tensor of shape (4,), [x_min, y_min, x_max, y_max].
    Returns:
        IoU value (float).
    """
    # Determine the (x, y)-coordinates of the intersection rectangle
    x1 = max(pred_box[0], gt_box[0])
    y1 = max(pred_box[1], gt_box[1])
    x2 = min(pred_box[2], gt_box[2])
    y2 = min(pred_box[3], gt_box[3])

    # Compute the area of intersection rectangle
    inter_width = max(0, x2 - x1)
    inter_height = max(0, y2 - y1)
    inter_area = inter_width * inter_height

    # Compute the area of both the predicted and ground-truth rectangles
    pred_area = (pred_box[2] - pred_box[0]) * (pred_box[3] - pred_box[1])
    gt_area = (gt_box[2] - gt_box[0]) * (gt_box[3] - gt_box[1])

    # Compute the area of union
    union_area = pred_area + gt_area - inter_area

    # Compute IoU
    iou = inter_area / union_area if union_area > 0 else 0.0
    return iou

In [15]:
def compute_map(pred_boxes, gt_boxes, threshold=0.5):
    tp = 0
    fp = 0

    # format
    pred_boxes = pred_boxes[0]["boxes"].cpu().detach().numpy().tolist()
    gt_boxes = gt_boxes[0]["boxes"].cpu().detach().numpy().tolist()

    # if there is more than one ground truth box
    # loop over ious for all boxes and pick the
    # closest one to measure mAP
    closest_iou = 0.0
    for i in range(len(gt_boxes)):
        iou = calculate_iou(pred_boxes[0], gt_boxes[i])
        if iou > closest_iou:
            closest_iou = iou
    if closest_iou >= threshold:
        tp += 1
    else:
        fp += 1
    precision = tp / (tp + fp + 1e-6)

    return precision * 100

In [16]:
def validate_model(model, val_loader):
    metric = IntersectionOverUnion().to(device)
    model.eval()
    total_bbox_loss = 0
    total_label_loss = 0
    total_iou = []
    total_map = []
    with torch.no_grad():
        for images, targets in val_loader:
            images = images.to(device)
            bboxes = [torch.tensor(t, dtype=torch.float32).to(device) for t in targets]  # List of bounding boxes
            labels = [int(1) for t in targets]  # List of labels
            labels = torch.tensor(labels, dtype=torch.float32).to(device)
            preds_bbox, preds_label = model(images)

            bbox_losses = []
            label_losses = []

            for i in range(len(bboxes)):
                bbox_losses.append(bbox_loss_fn(preds_bbox[i], bboxes[i]))
                label_losses.append(label_loss_fn(preds_label[i], labels[i].unsqueeze(-1)))
                
                preds = [{"boxes": [preds_bbox[i]], "labels": preds_label[i]}]
                preds = normalize_boxes(preds)
                
                targets_combined = torch.cat([bboxes[i]], dim=0)
                targets = [{"boxes": targets_combined, "labels": torch.ones(len(targets_combined)).to(device)}]

                iou_value = metric(preds, targets)
                total_iou.append(iou_value['iou'].item())
                map_value = compute_map(preds, targets)
                total_map.append(map_value)
            total_bbox_loss += torch.mean(torch.stack(bbox_losses))
            total_label_loss += torch.mean(torch.stack(label_losses))

    # Calculate average validation loss
    avg_bbox_loss = total_bbox_loss / len(val_loader)
    avg_label_loss = total_label_loss / len(val_loader)
    val_loss = avg_bbox_loss + avg_label_loss
    print('IoU = ', sum(total_iou)/len(total_iou))
    print('mAP@50 = ', sum(total_map)/len(total_map))
    print(f"Validation - BBox Loss: {avg_bbox_loss:.4f}, Label Loss: {avg_label_loss:.4f}, Total Loss: {val_loss:.4f}")
    return val_loss

In [17]:
!pip install hyperopt
from hyperopt import fmin, tpe, hp, anneal, Trials



In [18]:
#batch_size = 16
#target_size = (224, 224)
#train_loader, val_loader = get_dataloaders(base_adress, annotations, batch_size, target_size)
#train_model(mobilenetfacedetector, train_loader, val_loader, num_epochs=2)
import time

hyperopt_out = [['model id', 'score', 'arch', 'learning_rate', 'batch_size']]

# ***************** mobilenet architecture
def min_func(params):
    model_id = time.time()
    curr_model = MobileNetFaceDetector().to(device)
    params = {'model_id': model_id, 'model': curr_model, 'arch': params['arch'],
              'learning_rate': float(params['learning_rate']),
              'batch_size': int(params['batch_size'])}
              #'dropout': float(params['dropout'])}
              #weight decay
    target_size = (224, 224)
    train_loader, val_loader = get_dataloaders(base_adress, annotations, params['batch_size'], target_size)
    score = train_model(train_loader, val_loader, num_epochs=20, scheduler=False, **params)
    hyperopt_out.append([model_id, score, params['arch'],
                         params['learning_rate'], params['batch_size']])

    return score

mobilenetv2space={'arch': 'mobilenetv2',
                  'learning_rate': hp.loguniform('learning_rate', np.log(1e-4), np.log(1e-3)),
                  'batch_size': hp.choice('batch_size', [16, 32, 64])}
                  #'dropout' hp.loguniform('dropout', np.log(0.3), np.log(0.5))}

best_params1 = fmin(fn=min_func, space=mobilenetv2space, algo=tpe.suggest, max_evals=10)

# ************* yolo architecture
def min_func(params):
    model_id = time.time()
    curr_model = YOLOFaceDetector().to(device)
    params = {'model_id': model_id, 'model': curr_model, 'arch': params['arch'],
              'learning_rate': float(params['learning_rate']),
              'batch_size': int(params['batch_size'])}
    target_size = (224, 224)
    train_loader, val_loader = get_dataloaders(base_adress, annotations, params['batch_size'], target_size)
    score = train_model(train_loader, val_loader, num_epochs=20, scheduler=True, **params)
    hyperopt_out.append([model_id, score, params['arch'],
                         params['learning_rate'], params['batch_size']])

    return score

yolospace={'arch': 'yolo',
           'learning_rate': hp.loguniform('learning_rate', np.log(1e-5), np.log(1e-2)),
           'batch_size': hp.choice('batch_size', [16, 32, 64])}

best_params2 = fmin(fn=min_func, space=yolospace, algo=tpe.suggest, max_evals=10)

with open('hyperopt_out.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(hyperopt_out)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

  bboxes = [torch.tensor(t, dtype=torch.float32).to(device) for t in targets]  # List of bounding boxes

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)

  return F.smooth_l1_loss(input, target, reduction=self

Epoch 1/20, Loss: 47.1209
  0%|          | 0/10 [00:23<?, ?trial/s, best loss=?]

  bboxes = [torch.tensor(t, dtype=torch.float32).to(device) for t in targets]  # List of bounding boxes



IoU = 
0.2740053621605335
mAP@50 = 
27.20845335691775
Validation - BBox Loss: 29.4250, Label Loss: 0.0000, Total Loss: 29.4250
Validation loss improved from inf to 29.4250. Saving model...
Epoch 2/20, Loss: 28.5950
IoU = 
0.2144517757544827
mAP@50 = 
32.862157950563024
Validation - BBox Loss: 27.3659, Label Loss: 0.0000, Total Loss: 27.3659
Validation loss improved from 29.4250 to 27.3659. Saving model...
Epoch 3/20, Loss: 26.5245
IoU = 
0.31088730337447074
mAP@50 = 
36.749079858694145
Validation - BBox Loss: 26.1865, Label Loss: 0.0000, Total Loss: 26.1865
Validation loss improved from 27.3659 to 26.1865. Saving model...
Epoch 4/20, Loss: 25.1774
IoU = 
0.35687842350222776
mAP@50 = 
46.64306289757337
Validation - BBox Loss: 24.4167, Label Loss: 0.0000, Total Loss: 24.4167
Validation loss improved from 26.1865 to 24.4167. Saving model...
Epoch 5/20, Loss: 24.3755
IoU = 
0.3847892711098658
mAP@50 = 
49.293236925844596
Validation - BBox Loss: 23.4019, Label Loss: 0.0000, Total Loss: 23.4

In [19]:
class FDDBTestDataset(Dataset):
    def __init__(self, img_dir, target_size=(224, 224)):
        self.img_dir = img_dir
        self.target_size = target_size
        self.transform = transforms.Compose([
                                            transforms.ToTensor(),
                                            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                                            ])
        self.image_files = []

        for img in os.listdir(img_dir):
            self.image_files.append(img)

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = self.img_dir + '/' + self.image_files[idx]
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Image not found: {img_path}")
        image_resized = cv2.resize(image, self.target_size)
        image_resized = self.transform(image_resized)

        return image_resized, self.image_files[idx]

# DataLoader preparation
def get_test_dataloader(img_dir, batch_size=1, target_size=(224, 224)):
    dataset = FDDBTestDataset(img_dir, target_size)
    test_loader = DataLoader(dataset, batch_size=batch_size)#, collate_fn=collate_fn)

    return test_loader

In [20]:
def inference(model, test_loader, device):
    predictions = []

    model.eval()
    with torch.no_grad():
        for i, (image, image_name) in enumerate(test_loader):
            image = image.to(device)

            # get prediction
            pbbox, plabel = model(image)
            
            # format prediction
            preds = [{"id": image_name, "boxes": [pbbox], "labels": plabel}]
            preds = normalize_boxes(preds)
            predictions.append(preds)
    return predictions

In [21]:
# evaluate - submission

# test dataloader
testset_file_path = '/kaggle/input/testset/testset'
test_dataloader = get_test_dataloader(testset_file_path)

# model inference
for model_path in os.listdir('/kaggle/working'):
    full_path = model_path.split('.')
    arch = model_path.split('_')[0]
    model_id = ''.join(full_path[0:len(full_path)-1])
    if full_path[-1] == 'pth':
        print('inferencing on model:', model_id)
        if arch == 'mobilenetv2':
            inf_model = MobileNetFaceDetector()
        elif arch == 'yolo':
            inf_model = YOLOFaceDetector()
        else:
            print('ERROR: model architecture not supported:', arch)
        inf_model.load_state_dict(torch.load(model_path))
        inf_model.to(device)
        predictions = inference(inf_model, test_dataloader, device)
        
        # create submission
        data = [['image_id', 'x1', 'y1', 'x2', 'y2']]
        
        for i in range(len(predictions)):
            img_id = predictions[i][0]['id'][0]
            image = cv2.imread(testset_file_path + '/' + img_id)
            pbbox = predictions[i][0]['boxes'].cpu().tolist()[0][0]
        
            # scale predictions
            h, w = (224, 224)
            target_h, target_w, _ = image.shape
            scale_x = target_w / w
            scale_y = target_h / h
            x_min = int(pbbox[0] * scale_x)
            y_min = int(pbbox[1] * scale_y)
            x_max = int(pbbox[2] * scale_x)
            y_max = int(pbbox[3] * scale_y)
            pbbox = [x_min, y_min, x_max, y_max]
            formatted_img_id = "\'" + img_id + "\'"
            data.append([formatted_img_id, pbbox[0], pbbox[1], pbbox[2], pbbox[3]])
        with open(f'od_out_{model_id}.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(data)

inferencing on model: yolo_1744576496831536


  inf_model.load_state_dict(torch.load(model_path))


inferencing on model: mobilenetv2_17445725907177842
inferencing on model: yolo_17445742946865199
inferencing on model: mobilenetv2_17445704774527164
inferencing on model: mobilenetv2_17445721807024539
inferencing on model: mobilenetv2_17445700473371203
inferencing on model: mobilenetv2_1744571778807785
inferencing on model: yolo_17445760870446835
inferencing on model: mobilenetv2_17445687340807354
inferencing on model: yolo_17445747593731036
inferencing on model: mobilenetv2_17445696036144097
inferencing on model: mobilenetv2_17445709267047217
inferencing on model: yolo_17445756250224514
inferencing on model: yolo_17445739025584137
inferencing on model: yolo_17445752184424615
inferencing on model: mobilenetv2_17445691889165504
inferencing on model: mobilenetv2_17445713412417684
inferencing on model: yolo_17445734808197696
inferencing on model: yolo_17445769224919732
inferencing on model: yolo_17445730221955445


In [22]:
!pip install onnx
!pip install onnxscript

Collecting onnxscript
  Downloading onnxscript-0.2.4-py3-none-any.whl.metadata (15 kB)
Downloading onnxscript-0.2.4-py3-none-any.whl (705 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m705.4/705.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnxscript
Successfully installed onnxscript-0.2.4


In [23]:
# Model class must be defined somewhere
for PATH in os.listdir('/kaggle/working'):
    full_path = PATH.split('.')
    arch = PATH.split('_')[0]
    model_id = ''.join(full_path[0:len(full_path)-1])
    if full_path[-1] == 'pth':
        print('deploying to onnx:', model_id)
        if arch == 'mobilenetv2':
            model = MobileNetFaceDetector()
        elif arch == 'yolo':
            model = YOLOFaceDetector()
        else:
            print('ERROR: model architecture not supported:', arch)
        model.load_state_dict(torch.load(PATH))
        model.to(device)
        model.eval()
        dummy_input = torch.randn(1, 3, 224, 224).to(device)  # Adjust shape based on your model's input size
        
        # Export the model to ONNX
        torch.onnx.export(
            model,  # The loaded PyTorch model
            dummy_input,  # Example input tensor
            f"{model_id}.onnx",  # Output ONNX file name
            export_params=True,  # Store trained parameters
            opset_version=13,  # ONNX version (adjust as needed)
            do_constant_folding=True,  # Optimize by folding constants
            input_names=["input"],  # Naming input tensor
            output_names=["output"],  # Naming output tensor
            dynamic_axes=None 
        )
        
        print("...Model successfully exported to ONNX!")

deploying to onnx: yolo_1744576496831536


  model.load_state_dict(torch.load(PATH))


...Model successfully exported to ONNX!
deploying to onnx: mobilenetv2_17445725907177842
...Model successfully exported to ONNX!
deploying to onnx: yolo_17445742946865199
...Model successfully exported to ONNX!
deploying to onnx: mobilenetv2_17445704774527164
...Model successfully exported to ONNX!
deploying to onnx: mobilenetv2_17445721807024539
...Model successfully exported to ONNX!
deploying to onnx: mobilenetv2_17445700473371203
...Model successfully exported to ONNX!
deploying to onnx: mobilenetv2_1744571778807785
...Model successfully exported to ONNX!
deploying to onnx: yolo_17445760870446835
...Model successfully exported to ONNX!
deploying to onnx: mobilenetv2_17445687340807354
...Model successfully exported to ONNX!
deploying to onnx: yolo_17445747593731036
...Model successfully exported to ONNX!
deploying to onnx: mobilenetv2_17445696036144097
...Model successfully exported to ONNX!
deploying to onnx: mobilenetv2_17445709267047217
...Model successfully exported to ONNX!
dep