## DETR PyTorch Lightning Fine-tuning with a COCO-like dataset

This Jupyter notebook was used to fine-tune the facebook/detr-resnet-101 base model.

I used a COCO-like dataset in JSON format, built with Roboflow from proprietary images.

The labels for the dataset are 2:
* words --> As supercategory
* word --> As Category/Label

The structure of the dataset is the following: "image_path", "class_label_id" and "coords" (which is a 4-tuple with Xmin, Ymin, Xmax, Ymax of the bounding box)

I only used the label "word" in the entire dataset with the goal to only detect and generate the bounding boxes for Handwritten and Cursive text

This script uses PyTorch with GPU support and PyTorch Lightning to take advantage of GPU acceleration

This code is mainly based on this [Roboflow Colab Tutorial](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/train-huggingface-detr-on-custom-dataset.ipynb)

Author: Rodrigo Alvarez

In [None]:
import os
import torchvision
from transformers import DetrForObjectDetection, DetrImageProcessor
import supervision as sv
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torch


In [None]:
DATASET_DIR = "<path_to_coco_dataset>/train"
ANNOTATION_FILE_NAME = "_annotations.coco.json"
# Hugging Face cache dir, used to avoid re-downloading the model.
# "~" is expanded here because this value is handed to `from_pretrained(cache_dir=...)`,
# which is not guaranteed to expand it (a literal "~" directory could be created instead).
HF_CACHE = os.path.expanduser("~/llm_cache")
MODEL_VERSION = 1  # Version passed to the TensorBoard logger
DEVICE = "cuda"  # "cuda", "cpu", or a specific device ("cuda:0", "cuda:1", ...)
CHECKPOINT = 'facebook/detr-resnet-101'  # If using HF_CACHE, please set the path to the "snapshot" dir of the model
CONFIDENCE_TRESHOLD = 0.9  # Confidence score for filtering bounding box predictions at inference
IOU_TRESHOLD = 0.8  # IoU threshold for evaluating the predicted boxes against the ground truth
MODEL_LR = 4e-5  # Some articles use 5e-4, but that did not work in my case; this value works "well"
BB_LR = 4e-5  # Backbone learning rate; same remark as above
MAX_EPOCHS = 100  # Use >= 50, but it stops learning near step 70

LAB_NAME = "DETR_LAB"  # For my logger
EXPERIMENT_NAME = "Akivili"  # For logger and versioning
EXPERIMENT_VERSION = 2  # Same
CKP_PATH = "<path_to_save_ckps>"

In [None]:
# Make sure the checkpoint directory exists; exist_ok=True makes this a no-op when it is already there
os.makedirs(CKP_PATH, exist_ok=True)

#### Dataset Preparation

In [None]:
# A custom class for loading Coco datasets in JSON format
class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO-format detection dataset that runs every sample through an image processor.

    The annotation file is looked up at
    ``os.path.join(image_directory_path, ANNOTATION_FILE_NAME)``.

    Args:
        image_directory_path: Directory that holds the images and the COCO JSON file.
        image_processor: Object called as
            ``image_processor(images=..., annotations=..., return_tensors="pt")``;
            its result must expose ``"pixel_values"`` and ``"labels"``.
        train: Currently unused; kept so existing callers keep working.
    """

    def __init__(
        self,
        image_directory_path: str,
        image_processor,
        train: bool = True
    ):
        annotation_file_path = os.path.join(image_directory_path, ANNOTATION_FILE_NAME)
        # Passing both the image root and the full annotation path is intentional:
        # the parent class receives them as separate arguments and does not join them.
        super().__init__(image_directory_path, annotation_file_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        """Return ``(pixel_values, target)`` for the sample at position ``idx``."""
        image, raw_annotations = super().__getitem__(idx)
        target = {'image_id': self.ids[idx], 'annotations': raw_annotations}
        encoding = self.image_processor(images=image, annotations=target, return_tensors="pt")
        # Remove only the leading axis (assumed to be the size-1 batch axis the
        # processor adds for a single image). A bare squeeze() would also collapse
        # any other axis that happens to have size 1.
        pixel_values = encoding["pixel_values"].squeeze(0)
        return pixel_values, encoding["labels"][0]

In [None]:

# Load the image processor that belongs to the chosen checkpoint
image_processor = DetrImageProcessor.from_pretrained(
    CHECKPOINT,
    cache_dir=HF_CACHE,
)

In [None]:
# Build the training dataset, then map every COCO category id to its name
TRAIN_DATASET = CocoDetection(
    image_directory_path=DATASET_DIR,
    image_processor=image_processor,
    train=True,
)
categories = TRAIN_DATASET.coco.cats
id2label = {cat_id: cat['name'] for cat_id, cat in categories.items()}

In [None]:
# Quick sanity check on the dataset size
print(f"Number of training examples: {len(TRAIN_DATASET)}")

### TRAINING

In [None]:

# This function allow to process the data from a batch, the data must be pre-tokenized or pre-processed to be padded
def collate_fn(batch):
    """Collate ``(pixel_values, target)`` samples into a single padded batch.

    Args:
        batch: Sequence of ``(pixel_values, target)`` pairs as produced by the dataset.

    Returns:
        A dict with ``'pixel_values'`` and ``'pixel_mask'`` taken from
        ``image_processor.pad(...)`` and ``'labels'``, the list of per-sample targets.
    """
    # DETR authors employ various image sizes during training, making it not possible
    # to directly batch together images. Hence they pad the images to the biggest
    # resolution in a given batch, and create a corresponding binary pixel_mask
    # which indicates which pixels are real/which are padding
    pixel_values = [sample[0] for sample in batch]
    labels = [sample[1] for sample in batch]
    # The batch is deliberately left on the CPU: moving tensors to a CUDA device inside
    # a collate function breaks DataLoader worker processes and pinned memory, and the
    # Lightning Trainer already transfers each batch to the training device.
    encoding = image_processor.pad(pixel_values, return_tensors="pt")
    return {
        'pixel_values': encoding['pixel_values'],
        'pixel_mask': encoding['pixel_mask'],
        'labels': labels
    }

# Training DataLoader; feel free to experiment with these settings.
# If you hit a CUDA OUT OF MEMORY error, lower batch_size or move the image processing into collate_fn
TRAIN_DATALOADER = DataLoader(
    dataset=TRAIN_DATASET,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# A Python Lightning wrapper for the DETR Model to be able to use the Accelerator and Trainer
class Detr(pl.LightningModule):
    """LightningModule wrapper around ``DetrForObjectDetection`` for fine-tuning.

    Args:
        model_path: Name or path handed to ``DetrForObjectDetection.from_pretrained``.
        lr: Learning rate for every trainable parameter outside the backbone.
        lr_backbone: Learning rate for trainable parameters whose name contains "backbone".
        weight_decay: Weight decay handed to AdamW.
        training_dataset: Object returned unchanged by ``train_dataloader``.
    """

    def __init__(self, model_path, lr, lr_backbone, weight_decay, training_dataset):
        super().__init__()
        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay
        self.training_dataset = training_dataset

        # The classification head is re-sized to the dataset's label count, hence
        # ignore_mismatched_sizes=True when loading the pretrained weights.
        self.model = DetrForObjectDetection.from_pretrained(
            pretrained_model_name_or_path=model_path,
            num_labels=len(id2label),
            ignore_mismatched_sizes=True,
            cache_dir=HF_CACHE,
        )

    def forward(self, pixel_values, pixel_mask):
        """Run the wrapped model on a padded batch and return its outputs."""
        return self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    def common_step(self, batch, batch_idx):
        """Compute the loss for one batch.

        Returns:
            A ``(loss, loss_dict)`` tuple read from the model outputs.
        """
        # Every tensor of every target is moved to the module's device
        targets = [
            {key: tensor.to(self.device) for key, tensor in target.items()}
            for target in batch["labels"]
        ]
        outputs = self.model(
            pixel_values=batch["pixel_values"],
            pixel_mask=batch["pixel_mask"],
            labels=targets,
        )
        return outputs.loss, outputs.loss_dict

    def training_step(self, batch, batch_idx):
        """Log the total loss and every loss component, then return the total loss."""
        total_loss, components = self.common_step(batch, batch_idx)
        self.log("training_loss", total_loss)
        for name, value in components.items():
            self.log(f"train_{name}", value.item())
        return total_loss

    def configure_optimizers(self):
        """Build an AdamW optimizer with a separate learning rate for the backbone."""
        backbone_params, other_params = [], []
        for name, param in self.named_parameters():
            if not param.requires_grad:
                continue
            bucket = backbone_params if "backbone" in name else other_params
            bucket.append(param)

        param_groups = [
            {"params": other_params},
            {"params": backbone_params, "lr": self.lr_backbone},
        ]
        # Feel free to swap the optimizer; AdamW was chosen because it performed well
        # in other projects and is frequently used with transformer models
        return torch.optim.AdamW(param_groups, lr=self.lr, weight_decay=self.weight_decay)

    def train_dataloader(self):
        """Return the object supplied as ``training_dataset`` at construction time."""
        return self.training_dataset

In [None]:
# Instantiate the PyTorch Lightning model
model = Detr(
    CHECKPOINT,
    lr=MODEL_LR,
    lr_backbone=BB_LR,
    weight_decay=1e-4,
    training_dataset=TRAIN_DATALOADER,
)

In [None]:
# Hyperparameters recorded by the logger
hyperparams = dict(
    model_type="DETR",
    model_name="detr-resnet-101",
    codename=EXPERIMENT_NAME,
    version=EXPERIMENT_VERSION,
    model_learning_rate=MODEL_LR,
    backbone_learning_rate=BB_LR,
    epochs=MAX_EPOCHS,
)

In [None]:
# Count the total and the trainable parameters to get an idea of the model size
detr_total_params = sum(p.numel() for p in model.parameters())
detr_train_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# Both figures are printed in millions, so both carry the "M" unit
print("Total params: {} M\nTrainable params: {} M".format(detr_total_params / 1e6, detr_train_params / 1e6))
# The raw (unscaled) counts are stored alongside the other hyperparameters
hyperparams["total_params"] = detr_total_params
hyperparams["trainable_params"] = detr_train_params


In [None]:
# TensorBoard logger. If you use it too, make sure a TensorBoard instance is running.
# NOTE: save_dir is an absolute, machine-specific path; point it at your own log directory.
logger = pl.loggers.TensorBoardLogger(
    save_dir="/home/ralvarez22/Documentos/trocr_hand/trocr_llm/tb",
    name=LAB_NAME,
    version=MODEL_VERSION,
)

In [None]:
# Record the hyperparameters dict (including the parameter counts added to it) in the project log
logger.log_hyperparams(hyperparams)

In [None]:
# Build the Lightning Trainer.
# A single device is used (NVIDIA 4090 24GB). A first attempt with 2 GPUs (4090 & 3060) ended with an error code.
# The GPU accelerator speeds up training and avoids poor performance.
# accumulate_grad_batches=8 accumulates gradients over 8 batches before each optimizer step,
# giving a larger effective batch without loading more images at once.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=MAX_EPOCHS,
    gradient_clip_val=0.1,
    accumulate_grad_batches=8,
    log_every_n_steps=10,
    logger=logger,
)

In [None]:
# Run the training cycle and log the metrics.
# No dataloader is passed here: the model supplies it through its own train_dataloader().
trainer.fit(model)

At the end, the metrics were the following

![DETR Metrics](./images/detr_metrics.png "Loss Metrics")


In [None]:
# Directory where the final model is saved.
# "~" is expanded explicitly: the path is later handed to save_pretrained(), which is
# not guaranteed to expand it (a literal "~" directory could be created instead).
FINAL_CKP_PATH = os.path.expanduser("~/models/detr/{}/V_{}".format(EXPERIMENT_NAME, EXPERIMENT_VERSION))

In [None]:
# Save the wrapped Hugging Face model (model.model, not the Lightning wrapper) and the
# image processor into the same directory.
# By default it saves the Safetensors type
model.model.save_pretrained(FINAL_CKP_PATH)
image_processor.save_pretrained(FINAL_CKP_PATH)

In [None]:
# You can also push the model to the HF Hub here, but it is recommended to test it first