## DETR PyTorch Lightning Finetuning with COCO-like dataset

This Jupyter notebook was used to fine-tune the facebook/detr-resnet-101 base model.

I used a COCO-like dataset in JSON format, built with Roboflow from proprietary images.

The labels for the dataset are 2:
* words --> As supercategory
* word --> As Category/Label

The structure of the dataset is the following: "image_path", "class_label_id" and "coords" (which is a 4-tuple with the X, Y, Width and Height of the bounding box)

I only used the label "word" in the entire dataset with the goal to only detect and generate the bounding boxes for Handwritten and Cursive text

This script uses PyTorch with GPU support and PyTorch Lightning to use GPU acceleration

This code is mainly based on this [Roboflow Colab Tutorial](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/train-huggingface-detr-on-custom-dataset.ipynb)

Author: Rodrigo Alvarez

In [None]:
import os
import torchvision
from transformers import DetrForObjectDetection, DetrImageProcessor
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torch
import shutil
import numpy as np
from PIL import Image, ImageDraw

In [2]:
# --- Dataset locations ---
# Each directory is used both as the image root and as the folder holding the annotation file.
TRAIN_DATASET_DIR = "../hand-cursive-detr"
TRAIN_ANNOTATION_FILE_NAME = "_train_annotations_coco.json"

VAL_DATASET_DIR = "../hand-cursive-detr"
VAL_ANNOTATION_FILE_NAME = "_val_annotations_coco.json"

# --- Model / device ---
HF_CACHE = "/home/ralvarez22/Documentos/llm_data/llm_cache"  # Hugging Face cache dir, used to avoid re-downloading the model
DEVICE = "cuda"  # torch device string: "cuda", "cpu", or a specific device such as "cuda:0" / "cuda:1"
DETR_CHECKPOINT = os.path.join(
    HF_CACHE,
    "models--facebook--detr-resnet-101/snapshots/7d14702e444d98d0b1764824567fc2b45e1eb218",
)  # If using HF_CACHE, please set the path to the "snapshot" dir of the model

# --- Optimisation hyperparameters ---
# Weight decay handed to the AdamW optimizer
WEIGHT_DECAY = 1e-4
CLIP_GRAD = 0.1  # Passed to the Trainer as gradient_clip_val; 1e-4 and 0.001 look like previously tried values
# Number of samples per DataLoader batch
BATCH_SIZE = 15
# Reused by the Trainer for accumulate_grad_batches, log_every_n_steps and check_val_every_n_epoch
ACC_BATCH =  BATCH_SIZE * 2
MODEL_LR = 1e-4  # Learning rate for all trainable parameters outside the backbone
BB_LR = 1e-5  # Learning rate for the trainable backbone parameters
MAX_EPOCHS = 60  # Use >= 50, but it stops learning near step 70
# Cap on the number of training items; values <= 0 keep the whole training set
MAX_TRAIN_ITEMS = -1

# --- Experiment bookkeeping ---
LAB_NAME = "DETR_LAB"  # For my logger
EXPERIMENT_NAME = "Nous"  # For logger and versioning
EXPERIMENT_VERSION = 4  # Same
# Root directory where the fine-tuned model and image processor are saved
CKP_PATH = "../finetuned/detr"

# Root directory for the TensorBoard logs
LOG_DIR = "../tb_logs_detr"

In [3]:
# Make sure the checkpoint and log directories exist before anything writes to them
for output_dir in (CKP_PATH, LOG_DIR):
    os.makedirs(output_dir, exist_ok=True)

#### Dataset Preparation

In [4]:
# A custom class for loading COCO datasets in JSON format
class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO-format detection dataset whose items are already processed for DETR.

    Each item is a ``(pixel_values, target)`` tuple produced by running the
    raw image and its COCO annotations through ``image_processor``.

    Args:
        image_directory_path: Directory that contains the images and the
            annotation file.
        annotations_filename: Name of the COCO JSON file inside
            ``image_directory_path``.
        image_processor: Callable applied to every sample as
            ``image_processor(images=..., annotations=..., return_tensors="pt")``.
        train: Accepted for interface compatibility; it is currently not used
            by this class.
    """

    def __init__(
        self,
        image_directory_path: str,
        annotations_filename: str,
        image_processor,
        train: bool = True,
    ):
        annotation_file_path = os.path.join(image_directory_path, annotations_filename)
        # Passing both the image root and the full annotation path is intended:
        # the parent class does not join them.
        super().__init__(image_directory_path, annotation_file_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        """Return the processed ``(pixel_values, target)`` pair for sample ``idx``."""
        image, coco_annotations = super().__getitem__(idx)
        # The processor expects the annotations wrapped together with their image id
        target = {'image_id': self.ids[idx], 'annotations': coco_annotations}
        encoding = self.image_processor(images=image, annotations=target, return_tensors="pt")
        # Only the leading (batch-of-one) dimension is dropped. A bare squeeze()
        # would also remove any other dimension that happens to have size 1.
        pixel_values = encoding["pixel_values"].squeeze(0)
        labels = encoding["labels"][0]
        return pixel_values, labels

In [5]:

# Load the image processor first: the datasets and the collate function both use it
image_processor = DetrImageProcessor.from_pretrained(
    DETR_CHECKPOINT,
    cache_dir=HF_CACHE,
)

In [None]:
# Build the training and validation datasets, then derive the "category id -> label name" mapping
TRAIN_DATASET = CocoDetection(
    TRAIN_DATASET_DIR,
    TRAIN_ANNOTATION_FILE_NAME,
    image_processor,
    train=True,
)

VALIDATION_DATASET = CocoDetection(
    VAL_DATASET_DIR,
    VAL_ANNOTATION_FILE_NAME,
    image_processor,
    train=False,
)

# The categories are read from the training annotations only
categories = TRAIN_DATASET.coco.cats
id2label = {
    category_id: category["name"]
    for category_id, category in categories.items()
}

In [None]:
# Optionally cap the number of training images (handy for quick experiments).
# Slicing the dataset object itself (TRAIN_DATASET[:n]) calls __getitem__ with a
# slice and does not produce a dataset, so the list of image ids the dataset
# iterates over is truncated instead. TRAIN_DATASET stays a CocoDetection
# instance, which keeps its `.coco` index available to the cells below.
if MAX_TRAIN_ITEMS > 0:
    TRAIN_DATASET.ids = TRAIN_DATASET.ids[:MAX_TRAIN_ITEMS]
print("Number of training examples:", len(TRAIN_DATASET))

### TRAINING

In [8]:

# Collate function: turns a list of already-processed samples into one padded batch
def collate_fn(batch):
    """Merge ``(pixel_values, target)`` samples into a single model input dict.

    Images in a batch can have different sizes, so they cannot be stacked
    directly. They are padded to the largest resolution in the batch, and the
    processor also returns a ``pixel_mask`` telling real pixels from padding.
    The padded tensors are moved to ``DEVICE``; the targets are returned as a
    plain list, untouched.
    """
    images = []
    targets = []
    for sample in batch:
        images.append(sample[0])
        targets.append(sample[1])

    padded = image_processor.pad(images, return_tensors="pt").to(DEVICE)
    return {
        'pixel_values': padded['pixel_values'],
        'pixel_mask': padded['pixel_mask'],
        'labels': targets,
    }

# Data loaders for training (shuffled) and validation (dataset order); feel free to experiment here.
# In case of a CUDA OUT OF MEMORY error, reduce the batch size or move the image processing into collate_fn.
TRAIN_DATALOADER = DataLoader(
    TRAIN_DATASET,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
VAL_DATALOADER = DataLoader(
    VALIDATION_DATASET,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

In [None]:
# Sanity check: draw the ground-truth boxes of one randomly chosen training image.
# Based on https://github.com/woctezuma/finetune-detr/blob/master/finetune_detr.ipynb
image_ids = TRAIN_DATASET.coco.getImgIds()
image_id = image_ids[np.random.randint(0, len(image_ids))]
print("Image n°{}".format(image_id))

# Label lookup and annotations for the chosen image
cats = TRAIN_DATASET.coco.cats
id2label = {cat_id: cat["name"] for cat_id, cat in cats.items()}
annotations = TRAIN_DATASET.coco.imgToAnns[image_id]

# `image` first holds the COCO image record, then the opened PIL image
image = TRAIN_DATASET.coco.loadImgs(image_id)[0]
image = Image.open(os.path.join(TRAIN_DATASET.root, image["file_name"]))
draw = ImageDraw.Draw(image, "RGBA")

for annotation in annotations:
    # Boxes are stored as (x, y, width, height); PIL wants the two opposite corners
    left, top, box_w, box_h = annotation["bbox"]
    draw.rectangle((left, top, left + box_w, top + box_h), outline="red", width=2)
    draw.text((left, top), id2label[annotation["category_id"]], fill="white")

image.show()


In [10]:
# A PyTorch Lightning wrapper for the DETR model, so it can be trained with the Lightning Trainer
class Detr(pl.LightningModule):
    """LightningModule that fine-tunes a ``DetrForObjectDetection`` model.

    Args:
        model_path: Checkpoint name or local path given to ``from_pretrained``.
        lr: Learning rate for every trainable parameter outside the backbone.
        lr_backbone: Learning rate for the trainable backbone parameters.
        weight_decay: Weight decay given to AdamW.
        training_dataset: Object returned unchanged by ``train_dataloader``
            (despite the name, this notebook passes a ``DataLoader``).
        validation_dataset: Optional object returned unchanged by
            ``val_dataloader``.
    """

    def __init__(
        self,
        model_path,
        lr,
        lr_backbone,
        weight_decay,
        training_dataset,
        validation_dataset=None,
    ):
        super().__init__()
        # num_labels is read from the module-level `id2label` mapping, so that
        # mapping must exist before this class is instantiated.
        # ignore_mismatched_sizes lets the checkpoint load even though the new
        # number of labels does not match the one it was trained with.
        self.model = DetrForObjectDetection.from_pretrained(
            pretrained_model_name_or_path=model_path,
            num_labels=len(id2label),
            ignore_mismatched_sizes=True,
            cache_dir=HF_CACHE,
        )

        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay
        self.training_dataset = training_dataset
        self.validation_dataset = validation_dataset

    def forward(self, pixel_values, pixel_mask):
        """Run the wrapped model without labels (inference path)."""
        return self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    def common_step(self, batch, batch_idx):
        """Compute the loss for one batch.

        Returns:
            A ``(loss, loss_dict)`` tuple taken from the model output.
        """
        pixel_values = batch["pixel_values"]
        pixel_mask = batch["pixel_mask"]
        # The label tensors are moved to the module's device here
        labels = [{k: v.to(self.device) for k, v in t.items()} for t in batch["labels"]]
        outputs = self.model(
            pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels
        )
        return outputs.loss, outputs.loss_dict

    def _log_losses(self, total_name, component_prefix, loss, loss_dict, batch_size):
        """Log the total loss and each of its components under the given names."""
        # The batch is a dict that also holds a list of label tensors of varying
        # length, so the batch size is passed explicitly instead of being inferred.
        self.log(total_name, loss, batch_size=batch_size)
        for name, value in loss_dict.items():
            self.log(component_prefix + name, value.item(), batch_size=batch_size)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        # No on_step/on_epoch override is given, so Lightning's defaults for
        # training_step apply.
        self._log_losses(
            "training_loss", "train_", loss, loss_dict, batch["pixel_values"].shape[0]
        )
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        self._log_losses(
            "validation_loss", "validation_", loss, loss_dict, batch["pixel_values"].shape[0]
        )
        return loss

    def configure_optimizers(self):
        """Build AdamW with a separate learning rate for the backbone."""
        backbone_params = []
        other_params = []
        for name, param in self.named_parameters():
            if not param.requires_grad:
                continue
            if "backbone" in name:
                backbone_params.append(param)
            else:
                other_params.append(param)

        param_dicts = [
            # This group has no "lr" key, so it uses the optimizer-level lr (self.lr)
            {"params": other_params},
            {"params": backbone_params, "lr": self.lr_backbone},
        ]
        # Feel free to change the optimizer; AdamW is the one commonly used with transformer models
        return torch.optim.AdamW(
            param_dicts, lr=self.lr, weight_decay=self.weight_decay
        )

    def train_dataloader(self):
        """Return the training loader given at construction time."""
        return self.training_dataset

    def val_dataloader(self):
        """Return the validation loader given at construction time (may be None)."""
        # Returned as-is. Testing the loader's truthiness would call len() on
        # it, which drops an empty loader and can fail for loaders with no length.
        return self.validation_dataset

In [None]:
# Instantiate the PyTorch Lightning module that wraps DETR.
# Note that the "dataset" arguments receive the DataLoaders built above.
model = Detr(
    model_path=DETR_CHECKPOINT,
    lr=MODEL_LR,
    lr_backbone=BB_LR,
    weight_decay=WEIGHT_DECAY,
    training_dataset=TRAIN_DATALOADER,
    validation_dataset=VAL_DATALOADER,
)

In [12]:
# Hyperparameters recorded by the logger alongside the metrics
hyperparams = {
    # What is being trained
    "model_type": "DETR",
    "model_name": "detr-resnet-101",
    # Experiment identification
    "codename": EXPERIMENT_NAME,
    "version": EXPERIMENT_VERSION,
    # Optimisation settings
    "model_learning_rate": MODEL_LR,
    "backbone_learning_rate": BB_LR,
    "epochs": MAX_EPOCHS,
    "weight_decay": WEIGHT_DECAY,
    "acc_grad_batches": ACC_BATCH,
    "clip_grad": CLIP_GRAD,
    "batch_size": BATCH_SIZE,
}

In [None]:
# Count the total and the trainable parameters, and record both with the hyperparameters
detr_total_params = sum(p.numel() for p in model.parameters())
detr_train_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# Both figures are printed in millions, so both lines carry the "M" unit
print("Total params: {} M\nTrainable params: {} M".format(detr_total_params / 1e6, detr_train_params / 1e6))
# The raw (not scaled) counts are what gets logged
hyperparams["total_params"] = detr_total_params
hyperparams["trainable_params"] = detr_train_params


In [14]:
# Remove any previous logs stored for this experiment name and version
log_path = os.path.join(LOG_DIR, EXPERIMENT_NAME, "version_{}".format(EXPERIMENT_VERSION))
shutil.rmtree(log_path, ignore_errors=True)

# I used the TensorBoard logger. If you do too, make sure a TensorBoard instance is running to view the logs
logger = pl.loggers.TensorBoardLogger(
    save_dir=LOG_DIR,
    name=EXPERIMENT_NAME,
    version=EXPERIMENT_VERSION,
)
logger.log_hyperparams(hyperparams)

In [None]:
# Create the Lightning Trainer.
# I use 1 device (NVIDIA 4090 24GB). My first attempt used 2 GPUs (4090 & 3060), but the process exited with an error code.
# The GPU accelerator speeds up training considerably.
# accumulate_grad_batches accumulates gradients over several batches before each optimizer step,
# which gives a larger effective batch without the memory cost of a larger BATCH_SIZE.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=MAX_EPOCHS,
    gradient_clip_val=CLIP_GRAD,
    accumulate_grad_batches=ACC_BATCH,
    log_every_n_steps=ACC_BATCH,
    # NOTE(review): ACC_BATCH is BATCH_SIZE * 2, so validation only runs every
    # ACC_BATCH epochs - confirm this is intended and not a copy/paste of the value above.
    check_val_every_n_epoch=ACC_BATCH,
    logger=logger,
)
# Run the training cycle and log the metrics; the loaders come from the model's own dataloader hooks
trainer.fit(model)

At the end, the metrics were the following

![DETR Metrics](../metrics/detr_train_metrics.png "Loss Metrics")

In [None]:
# Save the fine-tuned model together with its image processor in one directory
FINAL_CKP_PATH = os.path.join(CKP_PATH, EXPERIMENT_NAME, "V_{}".format(EXPERIMENT_VERSION))
# The model goes first, then the processor; by default the weights are saved as safetensors
for artifact in (model.model, image_processor):
    artifact.save_pretrained(FINAL_CKP_PATH)

In [17]:
# You can also push the model to the HF Hub here, but it is recommended to test it first