In [1]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "hustvl/yolos-tiny"
teacher_processor = AutoImageProcessor.from_pretrained(checkpoint)
teacher_model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    ignore_mismatched_sizes=True,
)

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [3]:
from datasets import load_dataset, load_from_disk
import os


train_dir = "C:/source/repos/Models-Inferencing-On-Client-Sandbox/datasets/image-face-dataset/train"
test_dir = "C:/source/repos/Models-Inferencing-On-Client-Sandbox/datasets/image-face-dataset/test"
validation_dir = "C:/source/repos/Models-Inferencing-On-Client-Sandbox/datasets/image-face-dataset/validation"

is_data_local: bool = os.path.exists(train_dir) and os.path.exists(test_dir) and os.path.exists(validation_dir)
is_using_subset: bool = True

if is_data_local:
    train_dataset = load_from_disk(train_dir)
    test_dataset = load_from_disk(test_dir)
    validation_dataset = load_from_disk(validation_dir)
else:
    train_dataset = load_dataset("wider_face", split="train[:5%]")
    test_dataset = load_dataset("wider_face", split="test[:5%]")
    validation_dataset = load_dataset("wider_face", split="validation[:5%]")
    train_dataset.save_to_disk(train_dir)
    test_dataset.save_to_disk(test_dir)
    validation_dataset.save_to_disk(validation_dir)

In [4]:
from datasets import DatasetDict

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset,
})

dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'faces'],
        num_rows: 644
    })
    test: Dataset({
        features: ['image', 'faces'],
        num_rows: 805
    })
    validation: Dataset({
        features: ['image', 'faces'],
        num_rows: 161
    })
})

In [5]:

def add_coco_fields(example, idx):
    bboxes = example['faces']['bbox']
    areas = [bbox[2] * bbox[3] for bbox in bboxes]
    
    example['image_id'] = idx
    example['faces']['area'] = areas
    example['faces']['category'] = [0 for _ in example["faces"]["expression"]]
    
    return example

In [6]:
for split in dataset.keys():
    dataset[split] = dataset[split].map(add_coco_fields, with_indices=True)
    
categories = ["face"]

In [7]:
dataset["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1385>,
 'faces': {'area': [18178.0],
  'bbox': [[449.0, 330.0, 122.0, 149.0]],
  'blur': [0],
  'category': [0],
  'expression': [0],
  'illumination': [0],
  'invalid': [False],
  'occlusion': [0],
  'pose': [0]},
 'image_id': 0}

In [8]:
import albumentations
import numpy as np
import torch

transform = albumentations.Compose(
    [
        albumentations.Resize(480, 480),
        albumentations.HorizontalFlip(p=1.0),
        albumentations.RandomBrightnessContrast(p=1.0),
    ],
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)

In [9]:
def formatted_anns(image_id, category, area, bbox):
    annotations = []
    for i in range(0, len(category)):
        new_ann = {
            "image_id": image_id,
            "category_id": category[i],
            "isCrowd": 1 if len(bbox)>1 else 0,
            "area": area[i],
            "bbox": list(bbox[i]),
        }
        annotations.append(new_ann)

    return annotations

In [10]:
def transform_aug_ann(examples):
    processed_images = []  # This will hold the processed 'pixel_values'
    labels = []  # This will hold the 'labels' dicts for each image
    
    for idx, (image, faces) in enumerate(zip(examples["image"], examples["faces"])):
        # Convert PIL image to numpy array and ensure RGB order
        image_np = np.array(image.convert("RGB"))
        
        # Apply transformations
        transformed = transform(image=image_np, bboxes=faces["bbox"], category=faces["category"])
        
        # Process transformed image with teacher_processor to get pixel_values
        pixel_values = teacher_processor(images=transformed["image"], return_tensors="pt").pixel_values
        
        # Construct labels dict
        image_labels = {
            'size': torch.tensor([480, 480], dtype=torch.long),
            'image_id': torch.tensor([examples["image_id"][idx]], dtype=torch.long),
            'class_labels': torch.tensor(faces["category"], dtype=torch.long),
            'boxes': torch.tensor(transformed["bboxes"]).float(),
            'area': torch.tensor(faces["area"]),
            'iscrowd': torch.tensor([1 if len(faces["bbox"]) > 1 else 0]),
            'orig_size': torch.tensor([image.size[1], image.size[0]])  # Height, Width
        }
        
        # Append the processed pixel_values and labels
        pixel_values = pixel_values.squeeze(0) # Remove extra dim
        processed_images.append(pixel_values)
        labels.append(image_labels)
    
    # Convert lists to tensors for batching
    pixel_values_batch = torch.stack(processed_images)
    # Labels need to be a list of dicts, each dict corresponding to an image
    
    return {"pixel_values": pixel_values_batch, "labels": labels}


In [11]:
dataset["train"] = dataset["train"].with_transform(transform_aug_ann)

In [12]:
example = dataset["train"][0]

example

{'pixel_values': tensor([[[ 0.7933, -1.0733, -1.7069,  ..., -1.9467, -1.9467, -1.9467],
          [ 0.8104, -1.3473, -1.9467,  ..., -2.1179, -2.1179, -2.1179],
          [ 0.8104, -1.3815, -1.9809,  ..., -2.1179, -2.1179, -2.1179],
          ...,
          [ 0.8276, -1.7240, -2.1179,  ..., -1.9467, -1.9295, -1.9467],
          [ 0.8276, -1.6042, -2.1008,  ..., -1.9980, -1.9980, -2.0665],
          [ 0.8276, -1.6042, -2.0837,  ..., -2.1179, -2.0152, -1.8782]],
 
         [[ 0.9230, -0.9328, -1.5455,  ..., -1.6506, -1.6506, -1.6506],
          [ 0.9230, -1.2654, -1.8431,  ..., -2.0357, -2.0357, -2.0357],
          [ 0.9055, -1.2829, -1.8782,  ..., -2.0357, -2.0357, -2.0357],
          ...,
          [ 0.9755, -1.6331, -2.0357,  ..., -1.7381, -1.7381, -1.7556],
          [ 0.9755, -1.5105, -1.9832,  ..., -1.8081, -1.8081, -1.8782],
          [ 0.9580, -1.4930, -1.9832,  ..., -1.9657, -1.8431, -1.7031]],
 
         [[ 1.0539, -0.5844, -1.0201,  ..., -0.9330, -0.9330, -0.9330],
          [ 

In [13]:
def collate_fn(batch):
    pixel_values = [item["pixel_values"] for item in batch]
    encoding = teacher_processor.pad(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    batch = {}
    batch["pixel_values"] = encoding["pixel_values"]
    batch["labels"] = labels
    return batch

In [14]:
device = "cuda:0" if torch.cuda.is_available() else "cpu" 

teacher_model = teacher_model.to(device)

In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="finetuned-yolos-tiny",
    per_device_train_batch_size=8,
    num_train_epochs=10,
    fp16=True,
    save_steps=200,
    logging_steps=50,
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    remove_unused_columns=False,
)

In [16]:
from transformers import Trainer

trainer = Trainer(
    model=teacher_model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=dataset["train"], # type: ignore
    tokenizer=teacher_processor,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  6%|▌         | 50/810 [00:15<03:43,  3.40it/s]

{'loss': 2063.6431, 'grad_norm': 42.59532165527344, 'learning_rate': 9.419753086419753e-06, 'epoch': 0.62}


 12%|█▏        | 100/810 [00:30<03:30,  3.37it/s]

{'loss': 1899.5256, 'grad_norm': 39.595062255859375, 'learning_rate': 8.814814814814817e-06, 'epoch': 1.23}


 19%|█▊        | 150/810 [00:45<03:11,  3.46it/s]

{'loss': 1996.6937, 'grad_norm': 35.58168029785156, 'learning_rate': 8.197530864197532e-06, 'epoch': 1.85}


 25%|██▍       | 200/810 [01:00<03:07,  3.25it/s]

{'loss': 2055.2484, 'grad_norm': 28.648643493652344, 'learning_rate': 7.580246913580247e-06, 'epoch': 2.47}


 31%|███       | 250/810 [01:15<02:44,  3.40it/s]

{'loss': 1844.985, 'grad_norm': 18.545175552368164, 'learning_rate': 6.962962962962964e-06, 'epoch': 3.09}


 37%|███▋      | 300/810 [01:31<02:47,  3.05it/s]

{'loss': 1855.3216, 'grad_norm': 8.46521282196045, 'learning_rate': 6.345679012345679e-06, 'epoch': 3.7}


 43%|████▎     | 350/810 [01:46<02:18,  3.32it/s]

{'loss': 2023.9447, 'grad_norm': 6.406212329864502, 'learning_rate': 5.728395061728396e-06, 'epoch': 4.32}


 49%|████▉     | 400/810 [02:01<01:59,  3.43it/s]

{'loss': 2005.1331, 'grad_norm': 6.289460182189941, 'learning_rate': 5.1111111111111115e-06, 'epoch': 4.94}


 56%|█████▌    | 450/810 [02:17<01:51,  3.22it/s]

{'loss': 2087.0244, 'grad_norm': 8.88055419921875, 'learning_rate': 4.493827160493827e-06, 'epoch': 5.56}


 62%|██████▏   | 500/810 [02:33<01:39,  3.13it/s]

{'loss': 1858.4398, 'grad_norm': 10.83547592163086, 'learning_rate': 3.876543209876544e-06, 'epoch': 6.17}


 68%|██████▊   | 550/810 [02:48<01:24,  3.09it/s]

{'loss': 1977.4927, 'grad_norm': 9.50704288482666, 'learning_rate': 3.25925925925926e-06, 'epoch': 6.79}


 74%|███████▍  | 600/810 [03:03<01:04,  3.24it/s]

{'loss': 1975.0391, 'grad_norm': 10.463424682617188, 'learning_rate': 2.6419753086419752e-06, 'epoch': 7.41}


 80%|████████  | 650/810 [03:19<00:44,  3.56it/s]

{'loss': 1999.7152, 'grad_norm': 12.629549980163574, 'learning_rate': 2.0246913580246915e-06, 'epoch': 8.02}


 86%|████████▋ | 700/810 [03:34<00:32,  3.41it/s]

{'loss': 2013.5748, 'grad_norm': 10.011380195617676, 'learning_rate': 1.4074074074074075e-06, 'epoch': 8.64}


 93%|█████████▎| 750/810 [03:49<00:17,  3.41it/s]

{'loss': 1886.3752, 'grad_norm': 10.846943855285645, 'learning_rate': 7.901234567901235e-07, 'epoch': 9.26}


 99%|█████████▉| 800/810 [04:04<00:03,  3.27it/s]

{'loss': 1989.5583, 'grad_norm': 11.371615409851074, 'learning_rate': 1.728395061728395e-07, 'epoch': 9.88}


100%|██████████| 810/810 [04:07<00:00,  3.27it/s]

{'train_runtime': 247.8369, 'train_samples_per_second': 25.985, 'train_steps_per_second': 3.268, 'train_loss': 1970.4979021990741, 'epoch': 10.0}





TrainOutput(global_step=810, training_loss=1970.4979021990741, metrics={'train_runtime': 247.8369, 'train_samples_per_second': 25.985, 'train_steps_per_second': 3.268, 'train_loss': 1970.4979021990741, 'epoch': 10.0})

In [17]:
tuned_model_path = "tuned_yolostiny_model"

teacher_model.save_pretrained(tuned_model_path)
teacher_processor.save_pretrained(tuned_model_path)

['tuned_yolostiny_model\\preprocessor_config.json']