## 1 Introduction
 
In this episode we will again use transfer learning with a transformer model. Transformers have revolutionized the field of natural language processing (NLP) and have found their way into various applications, including object detection.
                
Following the object detection tutorial at huggingface, we use the Detection Transformer (DETR) model. DETR is a state-of-the-art object detection model that combines the power of transformers with the task of object detection. It consists of two main components: a CNN backbone and a transformer-based detection head. The backbone CNN extracts image features, which are then passed to the transformer-based detection head.
              
As in the previous section, most of the work here consists of preparing the dataset to fit the model requirements and then following the huggingface transformers [tutorial](https://huggingface.co/docs/transformers/tasks/object_detection) to complete the process. Specifically, we use the pre-trained model facebook/detr-resnet-50 available on the huggingface hub. The model was trained on the COCO 2017 object detection dataset, so we need to convert our data to the COCO format to fine-tune the model.

## 2 Data Preparation

Using the scripts implemented in the first episode we generate a dataset containing 1000 labelled images and display the first 100.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from tqdm import tqdm 
from PIL import Image, ImageDraw


# Catalogue of the object classes that can appear in a generated image.
# Keys are the integer class IDs; each entry holds a display name, the image
# file name (joined with the object image directory when it is loaded) and a
# two-letter code.
objects = {
    0: {'name': 'Sun', 'file': 'sun.png', 'code': 'SU'},
    1: {'name': 'Earth', 'file': 'earth.png', 'code': 'EA'},
    2: {'name': 'Mars', 'file': 'mars.png', 'code': 'MA'},
    3: {'name': 'Venus', 'file': 'venus.png', 'code': 'VE'},
    4: {'name': 'Jupiter', 'file': 'jupiter.png', 'code': 'JU'},
    5: {'name': 'Mercury', 'file': 'mercury.png', 'code': 'ME'},
    6: {'name': 'Saturn', 'file': 'saturn2.png', 'code': 'SA'},
    7: {'name': 'Neptune', 'file': 'neptune.png', 'code': 'NE'},
    8: {'name': 'Uranus', 'file': 'uranus.png', 'code': 'UR'},
    9: {'name': 'Asteroid', 'file': 'asteroid.png', 'code': 'AS'},
    10: {'name': 'Black Hole', 'file': 'black-hole.png', 'code': 'BL'},
    11: {'name': 'Star Destroyer', 'file': 'star-destroyer.png', 'code': 'ST'}
}

# Number of classes; used as the exclusive upper bound when a class ID is
# drawn at random
n_objects = len(objects)

# Background images from which random square crops are taken later on
bg = [
    Image.open(path)
    for path in (
        r"../img/background/bg1.jpg",
        r"../img/background/bg2.jpg",
        r"../img/background/bg3.jpg",
    )
]

# Directory that holds the object images listed in the catalogue
im_dir = '../img/objects'

# Load every object image as RGBA and crop it with the square box
# (0, 0, side, side), where side is the larger of its two dimensions; the
# result is stored back into the catalogue under the 'image' key
for class_id, values in objects.items():
    sprite = Image.open(os.path.join(im_dir, values['file'])).convert('RGBA')
    side = np.max(sprite.size)
    png_file = sprite.crop((0, 0, side, side))
    objects[class_id]['image'] = png_file


def get_bg(bg):
    """Return a random square crop of a randomly chosen background image.

    Args:
        bg: Sequence of PIL images to choose from.

    Returns:
        The result of cropping the chosen image to a ``bg_size`` x ``bg_size``
        box placed at a random position (``bg_size`` is a module-level
        setting).

    Raises:
        ValueError: If the chosen background is smaller than ``bg_size`` in
            either dimension.
    """
    # Draw over the actual number of backgrounds rather than a fixed count
    id_bg = np.random.randint(0, len(bg))
    bg_tmp = bg[id_bg]
    (w, h) = bg_tmp.size
    if w < bg_size or h < bg_size:
        raise ValueError(
            f"background {id_bg} is {w}x{h}, smaller than bg_size={bg_size}"
        )
    # randint's upper bound is exclusive: the +1 makes the last valid offset
    # reachable and keeps the call valid when a side equals bg_size exactly
    x1 = np.random.randint(0, w - bg_size + 1)
    y1 = np.random.randint(0, h - bg_size + 1)
    return bg_tmp.crop((x1, y1, x1 + bg_size, y1 + bg_size))

def put_object(obj, bg_tmp):
    """Paste a randomly scaled copy of ``obj`` at a random spot on ``bg_tmp``.

    Only the width of each image is read, so both are treated as square.
    ``bg_tmp`` is modified in place.

    Args:
        obj: PIL image of the object; it is also used as its own paste mask.
        bg_tmp: PIL background image that receives the object.

    Returns:
        Tuple ``(bg_tmp, x, y, h)``: the modified background, the top-left
        paste position and the side length of the pasted object, all in
        background pixels.
    """
    h = obj.size[0]
    # Objects at least as wide as bg_size get a smaller scale range
    # (0.1-0.5) than the others (0.1-0.8)
    if h >= bg_size:
        scale = 0.4 * np.random.random() + 0.1
    else:
        scale = 0.7 * np.random.random() + 0.1
    h_bg = bg_tmp.size[0]
    # Plain int clamped to [1, h_bg - 1]: a zero-sized resize and an empty
    # randint range (object as large as the background) would both raise
    h = max(1, min(int(scale * h), h_bg - 1))
    p = obj.resize((h, h))
    x = np.random.randint(0, h_bg - h)
    y = np.random.randint(0, h_bg - h)
    bg_tmp.paste(p, (x, y), mask=p)
    return bg_tmp, x, y, h


In [None]:
n_img = 1000   # Number of images to generate
bg_size = 800  # Side length of the square crop taken from a background
im_size = 256  # Side length of the final, resized example image

def create_example():
    """Generate one labelled image containing a single random object.

    Returns:
        Tuple ``(img, class_id, x1, y1, h)``: the composed image resized to
        ``im_size`` x ``im_size``, the class ID of the pasted object, and the
        object's top-left position and side length expressed as fractions of
        ``bg_size``.
    """
    class_id = np.random.randint(0, n_objects)
    background = get_bg(bg)
    sprite = objects[class_id]['image']
    composed, x, y, h = put_object(sprite, background)
    img = composed.resize((im_size, im_size))
    # Position and size become fractions of the background side
    x1, y1, h = (np.float32(value) / bg_size for value in (x, y, h))
    return img, class_id, x1, y1, h

def create_dataset(set_size):
    """Generate ``set_size`` examples, showing a progress bar.

    Args:
        set_size: Number of examples to create.

    Returns:
        A list of ``[image, class_id, x1, y1, h]`` lists, one per example, as
        produced by ``create_example``.
    """
    return [list(create_example()) for _ in tqdm(range(set_size))]


# Build the in-memory list of generated examples
print('Generating training set...')
dataset = create_dataset(n_img)  # Create a dataset of n_img examples



The set is now processed so that it can be loaded as a huggingface `Dataset` instance.

In [None]:
from datasets import Dataset

# Convert every generated example into a record with one annotated object.
# Positions and sizes in `dataset` are fractions of the image side, so they
# are scaled back to pixels of the stored image here.
newset = []

for i, data in enumerate(dataset):
    img, cat_id, rel_x, rel_y, rel_size = data
    width, height = img.size
    box_w = rel_size * width
    box_h = rel_size * height
    newset.append(
        {
            "image_id": i,
            "image": img,
            "width": width,
            "height": height,
            "objects": {
                "id": [1],
                # Box area in pixels (width x height) instead of a constant 0
                "area": [box_w * box_h],
                # COCO box layout: [x_min, y_min, width, height]
                "bbox": [[rel_x * width, rel_y * height, box_w, box_h]],
                "category": [cat_id],
            },
        }
    )

ds = Dataset.from_list(newset)

We now define a transformation pipeline to transform the dataset into the COCO format

In [None]:
import albumentations
import torch
from transformers import AutoImageProcessor

# Pre-trained DETR checkpoint and the image processor that belongs to it
checkpoint = "facebook/detr-resnet-50"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

# Boxes are declared in COCO layout and are paired with the "category" labels
bbox_params = albumentations.BboxParams(format="coco", label_fields=["category"])

# The augmentation list is empty; the commented entries are optional examples
transform = albumentations.Compose(
    [
    #    albumentations.HorizontalFlip(p=1.0),
    #    albumentations.RandomBrightnessContrast(p=1.0),
    ],
    bbox_params=bbox_params,
)

def formatted_anns(image_id, category, area, bbox):
    """Build the COCO-style annotation list for one image.

    Args:
        image_id: Identifier of the image the annotations belong to.
        category: Sequence of category IDs, one per object.
        area: Sequence of areas, indexed in parallel with ``category``.
        bbox: Sequence of bounding boxes, indexed in parallel with
            ``category``; each box is copied into a new list.

    Returns:
        A list with one dict per object holding the keys ``image_id``,
        ``category_id``, ``iscrowd``, ``area`` and ``bbox``.
    """
    return [
        {
            "image_id": image_id,
            "category_id": cat,
            # COCO annotations spell this key in lowercase
            "iscrowd": 0,
            "area": area[i],
            "bbox": list(bbox[i]),
        }
        for i, cat in enumerate(category)
    ]

def transform_aug_ann(examples):
    """Augment a batch of examples and encode it with the image processor.

    Args:
        examples: Batch dict with the columns ``image_id``, ``image`` (PIL
            images) and ``objects`` (dicts with ``bbox``, ``category`` and
            ``area`` lists).

    Returns:
        The output of ``image_processor`` for the augmented images and their
        COCO-style annotations, with tensors returned as PyTorch tensors.
    """
    image_ids = examples["image_id"]
    images, bboxes, area, categories = [], [], [], []
    # Named `objs` so the module-level `objects` catalogue is not shadowed
    for image, objs in zip(examples["image"], examples["objects"]):
        # Keep RGB channel order: the prediction cells hand RGB images to the
        # same processor, so training and inference must agree
        image = np.array(image.convert("RGB"))
        out = transform(image=image, bboxes=objs["bbox"], category=objs["category"])

        area.append(objs["area"])
        images.append(out["image"])
        bboxes.append(out["bboxes"])
        categories.append(out["category"])

    targets = [
        {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
        for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes)
    ]

    return image_processor(images=images, annotations=targets, return_tensors="pt")

# Attach the batch transform to the dataset (see `Dataset.with_transform`)
ds_new = ds.with_transform(transform_aug_ann)

## 3 Training

In [None]:
def collate_fn(batch):
    """Collate per-example processor outputs into one training batch.

    Images are zero-padded at the bottom and right to the largest height and
    width in the batch. The padding is done with plain torch operations, so
    the function does not depend on
    ``image_processor.pad_and_create_pixel_mask``, which is deprecated in
    recent transformers releases.

    Args:
        batch: Non-empty list of dicts, each with a ``pixel_values`` tensor
            whose last two dimensions are height and width, and a ``labels``
            entry.

    Returns:
        Dict with ``pixel_values`` (stacked, padded images), ``pixel_mask``
        (int64, 1 on real pixels and 0 on padding) and ``labels`` (list with
        one entry per example).
    """
    import torch

    pixel_values = [item["pixel_values"] for item in batch]
    max_h = max(pv.shape[-2] for pv in pixel_values)
    max_w = max(pv.shape[-1] for pv in pixel_values)
    lead_shape = tuple(pixel_values[0].shape[:-2])

    # new_zeros keeps the dtype and device of the incoming tensors
    padded = pixel_values[0].new_zeros((len(pixel_values), *lead_shape, max_h, max_w))
    pixel_mask = torch.zeros((len(pixel_values), max_h, max_w), dtype=torch.long)
    for idx, pv in enumerate(pixel_values):
        h, w = pv.shape[-2:]
        padded[idx, ..., :h, :w] = pv
        pixel_mask[idx, :h, :w] = 1

    return {
        "pixel_values": padded,
        "pixel_mask": pixel_mask,
        "labels": [item["labels"] for item in batch],
    }

In [None]:
from transformers import AutoModelForObjectDetection

# Class names in catalogue order; each name's position is its label ID
categories = [entry["name"] for entry in objects.values()]
id2label = dict(enumerate(categories))
label2id = {name: idx for idx, name in id2label.items()}

# Load the pre-trained checkpoint with our own label maps;
# ignore_mismatched_sizes is passed because the label set differs from the
# one the checkpoint was trained with
model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

In [None]:
import torch
from transformers import TrainingArguments

# Hyper-parameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    output_dir="deep-wars",
    per_device_train_batch_size=8,
    num_train_epochs=30,
    # Mixed precision only when a CUDA device is present, so the notebook
    # also runs on CPU-only machines
    fp16=torch.cuda.is_available(),
    save_steps=200,
    logging_steps=50,
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    # The batch transform needs the raw "image" and "objects" columns
    remove_unused_columns=False,
    push_to_hub=False,
)

In [None]:
from transformers import Trainer

# Wire model, arguments, collate function and transformed dataset together.
# NOTE(review): the image processor is passed as `tokenizer`, presumably so
# that it is saved with each checkpoint (the prediction cell loads it from a
# checkpoint directory) - confirm against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=ds_new,
    tokenizer=image_processor,
)

# Run the fine-tuning loop
trainer.train()

## 4 Prediction

In [None]:

# Generate one fresh example to run predictions on
image, class_id, x1, y1, h = create_example()
# Separate copy that receives the drawn boxes of the fine-tuned model
image_1= image.copy()

with transfer learning

In [None]:
from transformers import AutoModelForObjectDetection

# Processor and fine-tuned weights are both read from a training checkpoint
image_processor = AutoImageProcessor.from_pretrained("deep-wars/checkpoint-3600") # adapt to your setting
model = AutoModelForObjectDetection.from_pretrained("deep-wars/checkpoint-3600")  # adapt to your setting

# image.size is reversed before being handed over as the target size
target_sizes = torch.tensor([image.size[::-1]])

with torch.no_grad():
    inputs = image_processor(images=image, return_tensors="pt")
    outputs = model(**inputs)
    detections = image_processor.post_process_object_detection(
        outputs, threshold=0.5, target_sizes=target_sizes
    )

# Only one image was processed, so keep the first (and only) result
results = detections[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )
    

If the object is not detected, adjust the threshold.

In [None]:
# Draw every detection of the fine-tuned model onto the copy of the example
draw = ImageDraw.Draw(image_1)

for label, box in zip(results["labels"], results["boxes"]):
    x, y, x2, y2 = (round(value, 2) for value in box.tolist())
    draw.rectangle((x, y, x2, y2), outline="red", width=1)
    # The class name is written at the lower-left corner of the box
    draw.text((x, y2), model.config.id2label[label.item()], fill="white")

image_1

without transfer learning

In [None]:

from transformers import AutoModelForObjectDetection

# Baseline: the original pre-trained checkpoint without any fine-tuning
image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# image.size is reversed before being handed over as the target size
target_sizes = torch.tensor([image.size[::-1]])

with torch.no_grad():
    inputs = image_processor(images=image, return_tensors="pt")
    outputs = model(**inputs)
    detections = image_processor.post_process_object_detection(
        outputs, threshold=0.5, target_sizes=target_sizes
    )

# Only one image was processed, so keep the first (and only) result
results = detections[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )

In [None]:
# Draw on a copy so that `image` stays untouched and the prediction cells can
# be re-run on the clean example
image_2 = image.copy()
draw = ImageDraw.Draw(image_2)

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    x, y, x2, y2 = tuple(box)
    draw.rectangle((x, y, x2, y2), outline="red", width=1)
    # The class name is written at the lower-left corner of the box
    draw.text((x, y2), model.config.id2label[label.item()], fill="white")

image_2