In [3]:
%%capture
%pip install accelerate -U
%pip install transformers[torch]

In [4]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Pretrained starting point: https://huggingface.co/hustvl/yolos-tiny
# Fine-tuning data:          https://huggingface.co/datasets/wider_face
checkpoint = "hustvl/yolos-tiny"

# The image processor and the detection model are both loaded from the same checkpoint id.
teacher_processor = AutoImageProcessor.from_pretrained(checkpoint)
teacher_model = AutoModelForObjectDetection.from_pretrained(checkpoint)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [6]:
%%capture
%pip install datasets

In [10]:
from datasets import load_dataset, load_from_disk
import os

from datasets.arrow_dataset import Dataset, IterableDataset
from datasets.dataset_dict import DatasetDict, IterableDatasetDict

# On-disk cache locations, one directory per split.
# NOTE(review): these are absolute, machine-specific paths; consider deriving them
# from a configurable base directory so the notebook runs on other machines.
train_dir = "C:/source/repos/Models-Inferencing-On-Client-Sandbox/datasets/image-face-dataset/train"
test_dir = "C:/source/repos/Models-Inferencing-On-Client-Sandbox/datasets/image-face-dataset/test"
validation_dir = "C:/source/repos/Models-Inferencing-On-Client-Sandbox/datasets/image-face-dataset/validation"

# The cache is only considered usable when all three split directories exist.
is_data_local: bool = all(os.path.exists(split_dir) for split_dir in (train_dir, test_dir, validation_dir))

# When True, only the first 5% of each split is requested from the Hub.
# This flag is consulted only on the download path: a cache that already exists
# on disk is loaded as-is, whatever size it was saved with.
is_using_subset: bool = True
hub_split_suffix: str = "[:5%]" if is_using_subset else ""

# Declared once so both branches below share the same annotations.
train_dataset: Dataset | DatasetDict | IterableDataset | IterableDatasetDict
test_dataset: Dataset | DatasetDict | IterableDataset | IterableDatasetDict
validation_dataset: Dataset | DatasetDict | IterableDataset | IterableDatasetDict

if is_data_local:
    train_dataset = load_from_disk(train_dir)
    test_dataset = load_from_disk(test_dir)
    validation_dataset = load_from_disk(validation_dir)
else:
    # Download every split first, then persist them, so a failed download
    # does not leave a partially written cache behind.
    train_dataset = load_dataset("wider_face", split="train" + hub_split_suffix)
    test_dataset = load_dataset("wider_face", split="test" + hub_split_suffix)
    validation_dataset = load_dataset("wider_face", split="validation" + hub_split_suffix)
    train_dataset.save_to_disk(train_dir)
    test_dataset.save_to_disk(test_dir)
    validation_dataset.save_to_disk(validation_dir)

In [11]:
from datasets import DatasetDict

# Group the three splits under their split names so they can be addressed
# as dataset['train'], dataset['test'] and dataset['validation'].
splits_by_name = {
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset,
}
dataset = DatasetDict(splits_by_name)

dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'faces'],
        num_rows: 644
    })
    test: Dataset({
        features: ['image', 'faces'],
        num_rows: 805
    })
    validation: Dataset({
        features: ['image', 'faces'],
        num_rows: 161
    })
})

In [12]:
%%capture
# Use the %pip magic (as the other install cells do) so the packages are
# installed into the environment of the running kernel.
%pip install -U albumentations opencv-python

In [14]:
from typing import Any, Dict


# Row of the training split used for the visual sanity checks in the next cells.
sample_index = 2
example: Dict[Any, Any] = dataset["train"][sample_index]

example

<class 'dict'>


In [None]:
import torch
from torchvision.ops import box_convert
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import pil_to_tensor, to_pil_image

# Draw the untouched sample with its face boxes.
# The stored boxes are treated as xywh and converted to xyxy corner coordinates
# before being handed to draw_bounding_boxes.
face_annotations = example['faces']
boxes_xywh = torch.tensor(face_annotations['bbox'])
boxes_xyxy = box_convert(boxes_xywh, 'xywh', 'xyxy')

# Each box is captioned with the string form of its 'expression' value.
labels = list(map(str, face_annotations['expression']))

annotated_image = draw_bounding_boxes(
    pil_to_tensor(example['image']),
    boxes_xyxy,
    colors="red",
    labels=labels,
)
to_pil_image(annotated_image)

In [None]:
import albumentations
import numpy as np

# Boxes are declared in 'coco' format; the 'expression' list travels with the
# boxes as their label field.
bbox_settings = albumentations.BboxParams(format='coco', label_fields=['expression'])

# Every step runs on every call (p=1.0): resize to 480x480, mirror horizontally,
# then jitter brightness and contrast.
augmentation_steps = [
    albumentations.Resize(480, 480),
    albumentations.HorizontalFlip(p=1.0),
    albumentations.RandomBrightnessContrast(p=1.0),
]
transform = albumentations.Compose(augmentation_steps, bbox_params=bbox_settings)

# Try the pipeline on the single sample picked earlier.
image = np.array(example['image'])
sample_faces = example['faces']
out = transform(
    image=image,
    bboxes=sample_faces['bbox'],
    expression=sample_faces['expression'],
)

In [None]:
# Show the augmented sample: reorder the image axes with permute(2, 0, 1)
# so the last axis comes first, which is the layout passed to draw_bounding_boxes.
image = torch.tensor(out['image']).permute(2, 0, 1)

# One tensor per transformed box, stacked into a single tensor and converted to corners.
boxes_xywh = torch.stack(list(map(torch.tensor, out['bboxes'])))
boxes_xyxy = box_convert(boxes_xywh, 'xywh', 'xyxy')

labels = list(map(str, out['expression']))

augmented_preview = draw_bounding_boxes(image, boxes_xyxy, colors='red', labels=labels)
to_pil_image(augmented_preview)

In [None]:
def transforms(examples):
    """Run the module-level albumentations pipeline ``transform`` over a batch.

    Args:
        examples: Batch mapping with an 'image' sequence and a 'faces' sequence
            of equal length. Each image must support ``.convert("RGB")``
            (presumably a PIL image; confirm against the dataset features).
            Each 'faces' entry must provide 'bbox' and 'expression'.

    Returns:
        dict with three lists, one entry per input example:
            'image':      tensor of the transformed image, axes permuted (2, 0, 1).
            'bbox':       float tensor of shape (N, 4) with the transformed boxes;
                          (0, 4) when the example has no boxes.
            'expression': the label list returned alongside the boxes.
    """
    images, bboxes, expressions = [], [], []
    for image, objects in zip(examples['image'], examples['faces']):
        # Force three channels so every image has the same channel layout.
        image = np.array(image.convert("RGB"))
        out = transform(
            image=image,
            bboxes=objects['bbox'],
            expression=objects['expression']
        )
        images.append(torch.tensor(out['image']).permute(2, 0, 1))
        # An empty box list would otherwise become a shape-(0,) tensor, which
        # box_convert cannot unpack; reshape pins the layout to (N, 4).
        bboxes.append(torch.tensor(out['bboxes'], dtype=torch.float32).reshape(-1, 4))
        expressions.append(out['expression'])
    return {'image': images, 'bbox': bboxes, 'expression': expressions}

In [None]:
# Attach the augmentation function to the training split only;
# the test and validation splits are left as loaded.
train_split = dataset['train']
train_split.set_transform(transforms)

In [None]:
# Sanity check: rows read from the training split should now come back in the
# shape produced by `transforms` ('image', 'bbox', 'expression').
example = dataset['train'][400]

corner_boxes = box_convert(example['bbox'], 'xywh', 'xyxy')
expression_labels = list(map(str, example['expression']))

to_pil_image(
    draw_bounding_boxes(
        example['image'],
        corner_boxes,
        colors='red',
        labels=expression_labels,
    )
)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Checkpointing: write under this folder every 200 steps, capped at 2 checkpoints.
    output_dir="finetuned-tiny-yolo-face",
    save_steps=200,
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=1e-5,
    weight_decay=1e-4,
    # Report training metrics every 50 steps.
    logging_steps=50,
    # Keep the dataset columns as they are instead of letting the Trainer drop them.
    remove_unused_columns=False,
)

In [None]:
from transformers import Trainer

def face_detection_collate(batch):
    """Turn a list of transformed examples into model inputs.

    Without a custom collator the Trainer would try to stack the per-example
    'bbox' tensors (whose row counts differ between images) and would forward
    the 'image' / 'bbox' / 'expression' keys to the model, which instead takes
    ``pixel_values`` and ``labels``.

    Args:
        batch: list of examples as produced by ``transforms``; each one has an
            'image' tensor with channels first and a 'bbox' tensor of xywh
            boxes in pixel units.

    Returns:
        dict with 'pixel_values' (stacked, normalised float images) and
        'labels' (one dict per image with 'class_labels' and 'boxes').
    """
    # Per-channel statistics taken from the checkpoint's image processor.
    mean = torch.tensor(teacher_processor.image_mean).view(-1, 1, 1)
    std = torch.tensor(teacher_processor.image_std).view(-1, 1, 1)

    pixel_values, labels = [], []
    for item in batch:
        image = torch.as_tensor(item['image'])
        height, width = image.shape[-2:]
        # assumes pixel intensities are in 0..255 (uint8 image data) — TODO confirm
        pixel_values.append((image.float() / 255.0 - mean) / std)

        # Convert pixel xywh boxes to centre format, scaled to [0, 1] by image size.
        boxes = torch.as_tensor(item['bbox'], dtype=torch.float32).reshape(-1, 4)
        scale = torch.tensor([width, height, width, height], dtype=torch.float32)
        boxes = box_convert(boxes, 'xywh', 'cxcywh') / scale

        labels.append({
            # NOTE(review): every box is assigned class id 0 ("face"). The model
            # still carries the checkpoint's original classification head;
            # confirm this class mapping is what is intended.
            'class_labels': torch.zeros(len(boxes), dtype=torch.long),
            'boxes': boxes,
        })

    # Stacking works because the augmentation pipeline resizes every image to the same size.
    return {'pixel_values': torch.stack(pixel_values), 'labels': labels}


trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=face_detection_collate,
    tokenizer=teacher_processor,
)

trainer.train()