In [1]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "hustvl/yolos-tiny"
teacher_processor = AutoImageProcessor.from_pretrained(checkpoint)
teacher_model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    ignore_mismatched_sizes=True,
)

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [3]:
from datasets import load_dataset, load_from_disk
import os


train_dir = "C:/source/repos/Models-Inferencing-On-Client-Sandbox/datasets/image-face-dataset/train"
test_dir = "C:/source/repos/Models-Inferencing-On-Client-Sandbox/datasets/image-face-dataset/test"
validation_dir = "C:/source/repos/Models-Inferencing-On-Client-Sandbox/datasets/image-face-dataset/validation"

is_data_local: bool = os.path.exists(train_dir) and os.path.exists(test_dir) and os.path.exists(validation_dir)
is_using_subset: bool = True

if is_data_local:
    train_dataset = load_from_disk(train_dir)
    test_dataset = load_from_disk(test_dir)
    validation_dataset = load_from_disk(validation_dir)
else:
    train_dataset = load_dataset("wider_face", split="train[:5%]")
    test_dataset = load_dataset("wider_face", split="test[:5%]")
    validation_dataset = load_dataset("wider_face", split="validation[:5%]")
    train_dataset.save_to_disk(train_dir)
    test_dataset.save_to_disk(test_dir)
    validation_dataset.save_to_disk(validation_dir)

In [4]:
from datasets import DatasetDict

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset,
})

dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'faces'],
        num_rows: 644
    })
    test: Dataset({
        features: ['image', 'faces'],
        num_rows: 805
    })
    validation: Dataset({
        features: ['image', 'faces'],
        num_rows: 161
    })
})

In [5]:

def add_coco_fields(example, idx):
    bboxes = example['faces']['bbox']
    areas = [bbox[2] * bbox[3] for bbox in bboxes]
    
    example['image_id'] = idx
    example['faces']['area'] = areas
    example['faces']['category'] = [0 for _ in example["faces"]["expression"]]
    
    return example

In [6]:
for split in dataset.keys():
    dataset[split] = dataset[split].map(add_coco_fields, with_indices=True)
    
categories = ["face"]

In [7]:
dataset["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1385>,
 'faces': {'area': [18178.0],
  'bbox': [[449.0, 330.0, 122.0, 149.0]],
  'blur': [0],
  'category': [0],
  'expression': [0],
  'illumination': [0],
  'invalid': [False],
  'occlusion': [0],
  'pose': [0]},
 'image_id': 0}

In [8]:
import albumentations
import numpy as np
import torch

transform = albumentations.Compose(
    [
        albumentations.Resize(480, 480),
        albumentations.HorizontalFlip(p=1.0),
        albumentations.RandomBrightnessContrast(p=1.0),
    ],
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)

In [9]:
def formatted_anns(image_id, category, area, bbox):
    annotations = []
    for i in range(0, len(category)):
        new_ann = {
            "image_id": image_id,
            "category_id": category[i],
            "isCrowd": 1 if len(bbox)>1 else 0,
            "area": area[i],
            "bbox": list(bbox[i]),
        }
        annotations.append(new_ann)

    return annotations

In [24]:
def transform_aug_ann(examples):
    processed_images = []  # This will hold the processed 'pixel_values'
    labels = []  # This will hold the 'labels' dicts for each image
    
    for idx, (image, faces) in enumerate(zip(examples["image"], examples["faces"])):
        # Convert PIL image to numpy array and ensure RGB order
        image_np = np.array(image.convert("RGB"))
        
        # Apply transformations
        transformed = transform(image=image_np, bboxes=faces["bbox"], category=faces["category"])
        
        # Process transformed image with teacher_processor to get pixel_values
        pixel_values = teacher_processor(images=transformed["image"], return_tensors="pt").pixel_values
        
        # Construct labels dict
        image_labels = {
            'size': torch.tensor([480, 480], dtype=torch.long),
            'image_id': torch.tensor([examples["image_id"][idx]], dtype=torch.long),
            'class_labels': torch.tensor(faces["category"], dtype=torch.long),
            'boxes': torch.tensor(transformed["bboxes"]).float(),
            'area': torch.tensor(faces["area"]),
            'iscrowd': torch.tensor([1 if len(faces["bbox"]) > 1 else 0]),
            'orig_size': torch.tensor([image.size[1], image.size[0]])  # Height, Width
        }
        
        # Append the processed pixel_values and labels
        pixel_values = pixel_values.squeeze(0) # Remove extra dim
        processed_images.append(pixel_values)
        labels.append(image_labels)
    
    # Convert lists to tensors for batching
    pixel_values_batch = torch.stack(processed_images)
    # Labels need to be a list of dicts, each dict corresponding to an image
    
    return {"pixel_values": pixel_values_batch, "labels": labels}


In [25]:
dataset["train"] = dataset["train"].with_transform(transform_aug_ann)

In [26]:
example = dataset["train"][0]

example

{'pixel_values': tensor([[[ 2.2489,  1.0331,  0.2453,  ..., -0.0458, -0.0458, -0.0458],
          [ 2.2489,  0.7077, -0.0629,  ..., -0.6281, -0.6109, -0.6109],
          [ 2.2489,  0.6392, -0.1143,  ..., -0.6794, -0.6794, -0.6794],
          ...,
          [ 2.2489,  0.2111, -0.5424,  ..., -0.0458, -0.0287, -0.0629],
          [ 2.2489,  0.3652, -0.2513,  ..., -0.1314, -0.1314, -0.2171],
          [ 2.2489,  0.3823, -0.2342,  ..., -0.2856, -0.1486,  0.0227]],
 
         [[ 2.4286,  1.2381,  0.4678,  ...,  0.3277,  0.3277,  0.3277],
          [ 2.4286,  0.8354,  0.1001,  ..., -0.2500, -0.2500, -0.2500],
          [ 2.4286,  0.7829,  0.0476,  ..., -0.2675, -0.2675, -0.2675],
          ...,
          [ 2.4286,  0.3627, -0.3901,  ...,  0.2052,  0.2402,  0.1877],
          [ 2.4286,  0.5028, -0.0924,  ...,  0.1352,  0.1352,  0.0476],
          [ 2.4286,  0.5378, -0.0749,  ..., -0.0574,  0.1001,  0.2752]],
 
         [[ 2.6400,  1.6117,  1.0714,  ...,  1.1585,  1.1585,  1.1585],
          [ 

In [27]:
def collate_fn(batch):
    pixel_values = [item["pixel_values"] for item in batch]
    encoding = teacher_processor.pad(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    batch = {}
    batch["pixel_values"] = encoding["pixel_values"]
    batch["labels"] = labels
    return batch

In [30]:
device = "cuda:0" if torch.cuda.is_available() else "cpu" 

teacher_model = teacher_model.to(device)

In [31]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="finetuned-yolos-tiny",
    per_device_train_batch_size=8,
    num_train_epochs=10,
    fp16=True,
    save_steps=200,
    logging_steps=50,
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    remove_unused_columns=False,
)

In [32]:
from transformers import Trainer

trainer = Trainer(
    model=teacher_model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=dataset["train"], # type: ignore
    tokenizer=teacher_processor,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
                                                
 10%|█         | 85/810 [03:10<03:30,  3.44it/s]

{'loss': 2052.5519, 'grad_norm': 36.325740814208984, 'learning_rate': 9.432098765432099e-06, 'epoch': 0.62}




KeyboardInterrupt: 