This notebook will do the following:

1. Download pretrained model (automatically w/ PyTorch):
    * https://huggingface.co/google/vit-base-patch16-224
2. Download data https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/
    * A PyTorch (torchvision) `FGVCAircraft` dataset class also exists, but it doesn't use the supplied bounding boxes.
    * Instead, let's manually pass the images to `ViTImageProcessor` and read the label ("variant") ourselves
3. Process all the images in the dataset
    * Pass each image through the model and collect the output of an intermediate layer
4. Write dataset to disk.
    * labels: JSON file. `ID: label` in `labels_*.json`
    * Cropped images: In `cropped`, filename is `{ID}.jpg`.

Dataset notes:

- The (main) aircraft in each image is annotated with a tight bounding box and a hierarchical airplane model label.
- Annotation level: Variant, e.g. Boeing 737-700. A variant collapses all the models that are visually indistinguishable into one class. The dataset comprises 102 different variants.
- The dataset already comes with a train/test split.
- The top-left pixel of an image has coordinate (1,1), i.e. bounding-box coordinates are 1-indexed (the code below subtracts 1 to make them 0-indexed).

In [4]:
import asyncio
from pathlib import Path
from typing import Dict, Tuple, List
import itertools

from PIL import Image
import torch


# Shared semaphore acquired by `_crop_images` around each `Image.open` call,
# so at most 100 coroutines can be inside that section at the same time.
LIMIT = asyncio.Semaphore(100)

# Type aliases used in the function annotations below.
# BBox: four integers describing a crop box, as returned by `get_boxes` and
# handed to `Image.crop` (presumably xmin, ymin, xmax, ymax -- confirm against
# the layout of images_box.txt).
BBox = Tuple[int, int, int, int]
# Feature: the feature vector of one image, as a flat list of floats.
Feature = List[float]

# DIR: absolute path of the working directory at the time this cell runs.
DIR = Path(".").absolute()
# DS_DIR: the `data` folder inside the `fgvc-aircraft-2013b` dataset directory.
DS_DIR = DIR / "dataset" / "fgvc-aircraft-2013b"  / "data"

In [5]:
def get_labels(f: Path) -> Dict[int, str]:
    """Parse an annotation file into an ``{image_id: label}`` mapping.

    Every non-blank line is expected to look like ``"<image id> <label>"``.
    The text before the first space is converted to ``int``; everything after
    it is kept verbatim as the label, so labels may themselves contain spaces.

    Args:
        f: Path of the annotation text file.

    Returns:
        Mapping from integer image ID to its label string. If an ID occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: if the first token of a non-blank line is not an integer.
    """
    labels: Dict[int, str] = {}
    for line in f.read_text().split("\n"):
        # Skip empty *and* whitespace-only lines; the latter would otherwise
        # reach int("") and abort the whole parse.
        if not line.strip():
            continue
        # partition splits once, at the first space only.
        image_id, _, label = line.partition(" ")
        labels[int(image_id)] = label
    return labels
    
# Build the ID -> label maps for the two predefined splits.
# NOTE(review): these look like the family-level annotation files, whereas the
# notebook introduction describes "variant" labels -- confirm the intended level.
train_labels = get_labels(DS_DIR.joinpath("images_family_trainval.txt"))
test_labels = get_labels(DS_DIR.joinpath("images_family_test.txt"))
# Sanity check: the two splits together must account for 10,000 labelled images.
assert sum(map(len, (test_labels, train_labels))) == 10_000

In [21]:
def get_boxes(f: Path) -> Dict[int, BBox]:
    """Read a bounding-box file and return the boxes keyed by image ID.

    Each non-empty line holds space-separated integers: the image ID followed
    by the box coordinates. Every coordinate is decremented by one before it
    is stored (the dataset counts pixels starting from 1).

    Args:
        f: Path of the bounding-box text file.

    Returns:
        Mapping from integer image ID to a tuple of shifted coordinates.
    """
    boxes = {}
    for line in f.read_text().split("\n"):
        if not line:
            continue
        head, *rest = line.split(" ")
        image_id = int(head)
        boxes[image_id] = tuple(int(value) - 1 for value in rest)
    return boxes

async def _crop_images(files: List[Path], bbox: List[BBox]) -> List[Image.Image]:
    """Open each image file and crop it to its bounding box.

    Args:
        files: Paths of the images to open.
        bbox: One crop box per entry of ``files``, in the same order. Each box
            is passed unchanged to ``Image.crop``.

    Returns:
        The cropped images, in the same order as ``files``.

    Raises:
        ValueError: if ``files`` and ``bbox`` differ in length (``zip`` would
            otherwise drop the surplus entries silently).

    NOTE(review): PIL treats the right/lower crop bounds as exclusive. If the
    dataset's max coordinates are inclusive, boxes that had 1 subtracted from
    all four values lose their last pixel row and column -- confirm.
    """
    if len(files) != len(bbox):
        raise ValueError(
            f"got {len(files)} files but {len(bbox)} bounding boxes"
        )

    async def _open_and_crop(path: Path, box: BBox) -> Image.Image:
        # LIMIT bounds how many coroutines may hold a file open at once.
        # Image.open / crop are blocking calls with no await in between, so
        # in practice the files are still handled one after another.
        async with LIMIT:
            # The context manager closes the source file once the crop has
            # been taken, so no file handle is left open.
            with Image.open(path) as img:
                return img.crop(box)

    cropped = await asyncio.gather(
        *(_open_and_crop(path, box) for path, box in zip(files, bbox))
    )
    return list(cropped)

async def crop_images(
    raw_images: Dict[int, Path],
    bboxes: Dict[int, BBox],
    ids: List[int],
    batch=64,
    save=False,
) -> Dict[int, Feature]:
    """Crop the images selected by ``ids`` in batches, optionally saving them.

    Args:
        raw_images: Image file path for each image ID.
        bboxes: Crop box for each image ID.
        ids: IDs to process, in order. Each one is used to index both
            ``raw_images`` and ``bboxes``.
        batch: Number of images cropped per batch; must be at least 1.
        save: When true, every crop is written to
            ``DIR / "dataset" / "cropped"`` as ``{id}.jpg``.

    Returns:
        A mapping meant to hold one feature vector per image ID. Feature
        extraction is not implemented yet, so it is currently always empty.

    Raises:
        ValueError: if ``batch`` is smaller than 1.
        KeyError: if an ID is missing from ``raw_images`` or ``bboxes``.
    """
    if batch < 1:
        raise ValueError(f"batch must be >= 1, got {batch}")

    out_dir = DIR / "dataset" / "cropped" if save else None
    if out_dir is not None:
        # Image.save does not create missing directories; make sure the
        # target exists so a fresh checkout does not fail on the first write.
        out_dir.mkdir(parents=True, exist_ok=True)

    feats = dict()
    for start in range(0, len(ids), batch):
        batch_ids = ids[start:start + batch]
        batch_files = [raw_images[image_id] for image_id in batch_ids]
        batch_boxes = [bboxes[image_id] for image_id in batch_ids]
        cropped = await _crop_images(batch_files, batch_boxes)

        if out_dir is not None:
            for image_id, img in zip(batch_ids, cropped):
                img.save(out_dir / f"{image_id}.jpg")

        # TODO: process images through model
        # TODO: write to disk
    return feats

In [22]:
raw_images = {int(f.name.replace(".jpg", "")): f for f in (DS_DIR / "images").glob("*.jpg")}
ids = [int(x) for x in sorted(raw_images.keys())]
assert len(ids) == 10_000
assert len(raw_images) == 10_000

bboxes = get_boxes(DS_DIR / "images_box.txt")
assert len(bboxes) == 10_000

await crop_images(raw_images, bboxes, ids, save=True)

In [28]:
import json

# Persist the ID -> label maps, one JSON file per split.
# json.dump serialises the integer IDs as string keys.
for out_name, split_labels in (
    ("labels_test.json", test_labels),
    ("labels_train.json", train_labels),
):
    with open(DIR / "dataset" / out_name, "w") as f:
        json.dump(split_labels, f)