### Convert WiderFace dataset into a COCO JSON format

Other converters exist, but they are not compatible with the data format that Edge Impulse expects. Therefore, we need to implement our own conversion script.

The dataset was downloaded from [Kaggle](https://www.kaggle.com/datasets/iamprateek/wider-face-a-face-detection-dataset).  
The downloaded folders were then rearranged to match the directory structure shown below.

Directory structure of the WiderFace dataset:

```bash
.
├── wider_face_split
│   ├── readme.txt
│   ├── wider_face_test_filelist.txt
│   ├── wider_face_test.mat
│   ├── wider_face_train_bbx_gt.txt
│   ├── wider_face_train.mat
│   ├── wider_face_val_bbx_gt.txt
│   └── wider_face_val.mat
├── WIDER_test
│   └── images
├── WIDER_train
│   └── images
└── WIDER_val
    └── images
```

In [1]:
# Inputs: the WiderFace ground-truth text file and the folder holding the
# images that the paths inside that file are relative to.
ANNOTATIONS_FILE = "./archive/wider_face_split/wider_face_train_bbx_gt.txt"
IMAGES_DIR = "./archive/WIDER_train/images/"

# Outputs: a single flat folder receiving the copied images together with the
# COCO annotations file describing them.
OUTPUT_DIR = "./archive/output_coco"
OUTPUT_ANNOTATIONS_FILE = f"{OUTPUT_DIR}/annotations.json"

In [2]:
import os

# Fail early, before any conversion work, if the inputs are missing.
# isfile/isdir (rather than exists) also reject a path of the wrong kind,
# e.g. a directory sitting where the annotations file should be.
if not os.path.isfile(ANNOTATIONS_FILE):
    raise FileNotFoundError(f"Annotations file not found: {ANNOTATIONS_FILE}")

if not os.path.isdir(IMAGES_DIR):
    raise FileNotFoundError(f"Images directory not found: {IMAGES_DIR}")

# exist_ok=True makes re-running the cell safe and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [3]:
# Prefixes of the annotation-file image paths to keep; an image is converted
# only when its relative path starts with one of these entries.
SUBSETS = [
    "7--Cheering",
]

In [4]:
# File extensions (lower-case, including the leading dot) that mark a line of
# the annotations file as an image path rather than a count or bounding box.
IMAGE_EXTS = (".jpg", ".jpeg", ".png")


def line_is_image_path(line: str) -> bool:
    """Return True when ``line`` names an image file.

    Surrounding whitespace is ignored and the comparison is case-insensitive.
    The extension must include its dot, so a line that merely ends with the
    letters "jpg" is not mistaken for an image path.
    """
    # str.endswith accepts a tuple of suffixes, so no explicit loop is needed.
    return line.strip().lower().endswith(IMAGE_EXTS)

def get_image_name(image_path: str) -> str:
    """Return the part of ``image_path`` after its last ``/``.

    A path without any ``/`` is returned unchanged.
    """
    _, _, file_name = image_path.rpartition("/")
    return file_name

def image_belongs_to_subsets(image_path: str) -> bool:
    """Return True when ``image_path`` starts with any entry of ``SUBSETS``."""
    for subset in SUBSETS:
        if image_path.startswith(subset):
            return True
    return False

In [5]:
from PIL import Image
from shutil import copy

def get_image_size(image_path: str) -> tuple:
    """Open the image at ``image_path`` and return its ``size`` attribute.

    The caller unpacks the result as ``(width, height)``. The file is closed
    again before returning.
    """
    with Image.open(image_path) as picture:
        dimensions = picture.size
    return dimensions

def copy_image(source_path: str) -> None:
    """Copy the file at ``source_path`` directly into ``OUTPUT_DIR``.

    Only the file name is kept; any folders in ``source_path`` are dropped,
    so the output directory stays flat.
    """
    file_name = get_image_name(source_path)
    destination = f"{OUTPUT_DIR}/{file_name}"
    copy(source_path, destination)


In [6]:
# COCO-style container filled in by the loop below. Only faces are labelled,
# so a single category is declared.
annotations = {
    "categories": [
        {
            "id": 1,
            "name": "face",
        }
    ],
    "annotations": [],
    "images": [],
}

# The ground-truth file is a sequence of records laid out as:
#   <relative image path>
#   <number of bounding boxes>
#   <x1 y1 w h blur expression illumination invalid occlusion pose>  (one per box)
image_id = 0  # id of the most recently accepted image
last_image = None  # file name of the current image; None while skipping a record
with open(ANNOTATIONS_FILE) as f:
    for line in f:
        line = line.strip()
        if line_is_image_path(line):
            if not image_belongs_to_subsets(line):
                # Ignore every following line until the next image path.
                last_image = None
                continue
            source_path = os.path.join(IMAGES_DIR, line)
            last_image = get_image_name(line)
            image_id += 1
            width, height = get_image_size(source_path)
            annotations["images"].append(
                {
                    "id": image_id,
                    "file_name": last_image,
                    "width": width,
                    "height": height,
                }
            )
            # The line right after an image path is the number of bounding
            # boxes; it is not needed, so consume it here. next() with a
            # default also copes with a file that ends on an image path.
            next(f, None)
            # Copy image to output directory
            copy_image(source_path)
        elif last_image is not None:
            try:
                (
                    x1,
                    y1,
                    w,
                    h,
                    blur,
                    expression,
                    illumination,
                    invalid,
                    occlusion,
                    pose,
                ) = line.split()
                x1, y1, w, h = int(x1), int(y1), int(w), int(h)
            except ValueError as e:
                # Malformed line: report it and carry on with the next one.
                print(f"{line=} {e}")
                continue
            if w <= 0 or h <= 0:
                # Images without faces are listed with a single all-zero
                # placeholder row, and a few boxes are degenerate. A box with
                # no area is not a usable annotation, so leave it out.
                continue
            annotations["annotations"].append(
                {
                    "id": len(annotations["annotations"]) + 1,
                    "image_id": image_id,
                    "category_id": 1,
                    "bbox": [x1, y1, w, h],
                    "area": w * h,
                    "iscrowd": 0,
                }
            )

In [7]:
import json

# Serialise the collected COCO dictionary with a 4-space indent and write it
# to the output annotations file, replacing any previous content.
serialized_annotations = json.dumps(annotations, indent=4)
with open(OUTPUT_ANNOTATIONS_FILE, "w") as output_file:
    output_file.write(serialized_annotations)