# Team Name

Sergio Sanz Rodriguez

# Introduction

This code implements the training, validation, and testing pipelines for the Kaggle competition titled "Synthetic to Real Object Detection Challenge - Phase 2."

The proposed model is based on PyTorch's Region-based CNN (R-CNN), specifically the [Faster R-CNN](https://pytorch.org/vision/master/models/faster_rcnn.html) implementation. In this competition, the ``ResNet50_FPN_v2`` backbone has been used.

A key aspect of the proposed method is an ``augmentation-based regularization`` technique to improve generalization. Strong data augmentation techniques, such as horizontal and vertical flip, zooming out, occlusions, color jitter, and resolution scaling, are applied.

Additional highlights of this approach include:

1. ``Synthetic-only training:`` No real-world images were used during training or validation.
2. ``No pseudo-labeling:`` The model was trained solely on the labeled synthetic data provided in the competition.
3. ``Pre-trained model:`` A pre-trained Faster R-CNN model was fine-tuned using the Cheerios dataset.

# Importing Libraries

In [None]:
# Generic libraries
import os
import torch
import cv2
import matplotlib.pyplot as plt
import glob
from torchinfo import summary
from pathlib import Path
from torch.optim.lr_scheduler import CosineAnnealingLR

# Torchvision libraries
from torchvision import tv_tensors
from torchvision.transforms import v2 as T
from torchvision.transforms import InterpolationMode

# Import custom libraries
from modules.obj_detection_utils import collate_fn, display_and_save_predictions, visualize_transformed_data, set_seeds, RandomCircleOcclusion, RandomTextureOcclusion, RandomTextureOcclusionDeterministic, BoostRedColorTransform
from modules.obj_detection import ObjectDetectionEngine
from modules.schedulers import FixedLRSchedulerWrapper
from modules.common import Common
from modules.obj_dect_dataloaders import ProcessDatasetCheerios
from modules.faster_rcnn import StandardFasterRCNN

# Warnings
import warnings
# Export the TORCH_USE_CUDA_DSA flag to the process environment
# (presumably to enable CUDA device-side assertions).
# NOTE(review): this is assigned after `import torch` has already run;
# confirm the flag still has an effect when set at this point.
os.environ['TORCH_USE_CUDA_DSA'] = "1"
# Hide UserWarning / FutureWarning messages raised from these two modules
warnings.filterwarnings("ignore", category=UserWarning, module="torch.autograd.graph")
warnings.filterwarnings("ignore", category=FutureWarning, module="onnxscript.converter")

# Create target model directory (no error if it already exists)
MODEL_DIR = Path("outputs")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Root of the dataset; the train/val folders are resolved below it as
# f"{DATA_DIR}/data/train" and f"{DATA_DIR}/data/val"
DATA_DIR = "FalconEDU/Scenarios/Ex2ChangeHero/Output"
# Version tag embedded in the model file names
version = "8_0"

# Set seeds
set_seeds(42)

# Specifying the Target Device

In [None]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Device: {device}")

if device == "cuda":
    !nvidia-smi

# Data Preprocessing (Augmentation)

A data augmentation pipeline is used in this project to enhance model generalization. It applies several transformations, including occlusions, flipping, and zooming out. Occlusions are simulated by randomly adding colored circles and rectangles using the ``RandomCircleOcclusion`` and ``RandomErasing`` classes.

Additionally, synthetic textures sourced from the Falcon editor's texture dataset are overlaid onto the images using the ``RandomTextureOcclusion`` class to further simulate real-world occlusions.

The synthetic bush texture was exported as a PNG file and integrated into the transformation pipeline as an occlusion method.

In [None]:
# Augmentation pipeline
# Image files handed to the texture-occlusion transforms through their
# `obj_path` argument. Each list feeds its own occlusion stage in
# get_transform_train / get_transform_test.

# Bush texture (used by the "plant" occlusion stages)
image_list1 = ["images/T_Bush_Falcon.png"]

# Remaining texture / crop images, grouped two per occlusion stage
image_list2 = [
    "images/T_CobbleStone_Smooth_D1.png",
    "images/000000016_crop.png",
]

image_list3 = [
    "images/000000042_crop.png",
    "images/000000061_crop.png",
]

image_list4 = [
    "images/000000313_crop.png",
    "images/000000459_crop.png",
]

image_list5 = [
    "images/000000569_crop.png",
    "images/000000856_crop.png",
]

def get_transform_train(train, mean_std_norm=False):
    """Assemble the transform pipeline applied to training samples.

    Args:
        train: If truthy, the random augmentation stages (resize choice,
            flips, rotation, perspective, occlusions, color jitter and
            zoom-out) are placed ahead of the conversion stages.
        mean_std_norm: If truthy, a ``T.Normalize`` stage is inserted
            right after the dtype conversion.

    Returns:
        A ``T.Compose`` that runs the collected stages in order.
    """
    pipeline = []

    if train:
        # Resolution scaling: RandomChoice picks one of the resize targets
        resize_options = [
            T.Resize(target, interpolation=InterpolationMode.BILINEAR)
            for target in range(580, 1080, 250)
        ]
        pipeline.append(T.RandomChoice(resize_options))

        # Geometric augmentations: horizontal flip, vertical flip,
        # rotation and perspective distortion
        pipeline.extend([
            T.RandomHorizontalFlip(p=0.5),
            T.RandomVerticalFlip(p=0.5),
            T.RandomRotation(degrees=(0, 90), interpolation=InterpolationMode.BILINEAR),
            T.RandomPerspective(distortion_scale=0.5, p=0.5),
        ])

        # Occlusions with circles
        pipeline.append(RandomCircleOcclusion(p=0.3, scale=(0.02, 0.2), ratio=(0.3, 3.3), num_elems=3))

        # Occlusions with rectangles
        pipeline.append(T.RandomErasing(p=0.5, scale=(0.02, 0.3), ratio=(0.3, 3.3), value=(0.5, 0.5, 0.5)))

        # Occlusions with textures from the Falcon Editor: the plant
        # texture is staged twice, followed by one stage per remaining list
        texture_plan = (
            (image_list1, (0.2, 0.5)),
            (image_list1, (0.2, 0.5)),
            (image_list2, (0.1, 0.2)),
            (image_list3, (0.1, 0.2)),
            (image_list4, (0.1, 0.2)),
            (image_list5, (0.1, 0.2)),
        )
        for texture_paths, scale_range in texture_plan:
            pipeline.append(
                RandomTextureOcclusion(obj_path=texture_paths, scale=scale_range, transparency=1.0, p=0.5)
            )

        # Color jitter
        pipeline.append(T.ColorJitter(contrast=0.3, saturation=0.3))

        # Zoom out
        zoom_fill = {tv_tensors.Image: (0.5, 0.5, 0.5), "others": 0}
        pipeline.append(T.RandomZoomOut(fill=zoom_fill, side_range=(1.0, 1.3), p=0.5))

    # Image normalization: cast to float with value scaling
    pipeline.append(T.ToDtype(torch.float, scale=True))

    # Optional mean/std normalization
    if mean_std_norm:
        pipeline.append(T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))

    # Strip the tv_tensors wrappers so plain tensors come out
    pipeline.append(T.ToPureTensor())

    # Composition
    return T.Compose(pipeline)

def get_transform_test(train, mean_std_norm=False):
    """Assemble the transform pipeline applied to validation/test samples.

    Args:
        train: If truthy, the ``RandomTextureOcclusionDeterministic``
            stages are placed ahead of the conversion stages.
        mean_std_norm: If truthy, a ``T.Normalize`` stage is inserted
            right after the dtype conversion.

    Returns:
        A ``T.Compose`` that runs the collected stages in order.
    """
    stages = []

    if train:
        # Occlusions with textures from the Falcon Editor: the plant
        # texture first, then one stage per remaining image list
        occlusion_plan = (
            (image_list1, (0.2, 0.5)),
            (image_list2, (0.1, 0.2)),
            (image_list3, (0.1, 0.2)),
            (image_list4, (0.1, 0.2)),
            (image_list5, (0.1, 0.2)),
        )
        stages.extend(
            RandomTextureOcclusionDeterministic(obj_path=texture_paths, scale=scale_range, transparency=1.0, p=0.5)
            for texture_paths, scale_range in occlusion_plan
        )

    # Image normalization: cast to float with value scaling
    stages.append(T.ToDtype(torch.float, scale=True))

    # Optional mean/std normalization
    if mean_std_norm:
        stages.append(T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))

    # Strip the tv_tensors wrappers so plain tensors come out
    stages.append(T.ToPureTensor())

    # Composition
    return T.Compose(stages)

# Preparing Dataloaders

Below is the Python code to create the training and validation dataloaders, which will feed the object detection model.

In [None]:
# The dataset contains two classes (ROI + background)
NUM_CLASSES = 2
# Batch size handed to every DataLoader in this notebook
BATCHES = 4

# Read one training image to obtain the dataset's image dimensions
IMAGE_DIR = f"{DATA_DIR}/data/train/images"
image = cv2.imread(os.path.join(IMAGE_DIR, "000000000.png"))
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, so fail here with an explicit message.
    raise FileNotFoundError(
        f"Could not read the reference image '000000000.png' in '{IMAGE_DIR}'"
    )
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Image dimensions
img_height, img_width = image.shape[:2]
img_size = (img_height, img_width)

# Create the training dataset with transformations
# (get_transform_train(train=True) includes the random augmentation stages)
train_dataset = ProcessDatasetCheerios(
    root=f"{DATA_DIR}/data/train",
    image_path="images",
    label_path="labels",
    transforms=get_transform_train(train=True),
    num_classes=NUM_CLASSES-1) # Background to be removed

# Create the validation dataset with transformations
# (get_transform_test(train=True) adds the RandomTextureOcclusionDeterministic
# stages, so validation images are occluded as well)
val_dataset = ProcessDatasetCheerios(
    root=f"{DATA_DIR}/data/val",
    image_path="images",
    label_path="labels",
    transforms=get_transform_test(train=True),
    num_classes=NUM_CLASSES-1) # Background to be removed

# Create the training data loader (shuffled)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCHES,
    shuffle=True,
    collate_fn=collate_fn
)

# Create the validation data loader (fixed order)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=BATCHES,
    shuffle=False,
    collate_fn=collate_fn
)

# Visualizing Dataloaders: Original and Transformed

Visualizing the dataloaders helps to verify that the augmentation techniques are properly applied.

In [None]:
# Visualize transformations
# Two loaders over the same training folder, both unshuffled so that the
# batches line up sample by sample when they are zipped together.

# Original: train=False skips the augmentation stages of get_transform_train
dataloader_ntr = torch.utils.data.DataLoader(
    ProcessDatasetCheerios(
        root=f"{DATA_DIR}/data/train",
        image_path="images",
        label_path="labels",
        transforms=get_transform_train(train=False),
        num_classes=NUM_CLASSES-1), # Background to be removed
    batch_size=BATCHES,
    shuffle=False,
    collate_fn=collate_fn
    )

# Transformed: train=True applies the full augmentation pipeline
dataloader_tr = torch.utils.data.DataLoader(
    ProcessDatasetCheerios(
        root=f"{DATA_DIR}/data/train",
        image_path="images",
        label_path="labels",
        transforms=get_transform_train(train=True),
        num_classes=NUM_CLASSES-1), # Background to be removed
    batch_size=BATCHES,
    shuffle=False,
    collate_fn=collate_fn
    )

# Display images: original and transformed version of each sample, side by side
for idx, ((img_tr, target_tr), (img_ntr, target_ntr)) in enumerate(zip(dataloader_tr, dataloader_ntr)):
    # Iterate over the samples actually present in the batch instead of
    # assuming BATCHES of them: the loaders do not drop the last batch, so
    # it can be shorter and a fixed range would raise IndexError.
    for i in range(min(len(img_tr), len(img_ntr))):
        visualize_transformed_data(img_ntr[i], target_ntr[i], img_tr[i], target_tr[i])
    # Stop after the batches with idx 0..5 have been shown
    if idx > 4:
        break

# Model Training and Validation

For this challenge, the Faster Region-based Convolutional Neural Network (Faster R-CNN) was selected as the object detection architecture. Faster R-CNN is widely recognized for delivering state-of-the-art performance in various object detection tasks, combining accuracy and efficiency through its two-stage detection pipeline.

The model consists of three stages:

* ``Region Proposal Network (RPN):`` The model analyzes the image to identify regions (bounding boxes) likely to contain an object. This process, known as objectness, represents the probability that a region contains an object rather than background or noise.
* ``Classification & Bounding Box Regression:`` Once the ROIs are identified, the model classifies the objects within those regions (e.g., pedestrian, dog, table, book). Optionally, the model may also generate segmentation masks to describe the exact shape of the objects.
* ``Bounding Box Pruning:`` In this stage, bounding boxes with the lowest confidence are removed to produce cleaner outputs. It helps eliminate redundant detections.

This approach provides a robust foundation for accurate object detection under the challenge's constraints, including varied occlusions and background noise.

In [None]:
# Instantiate the model
# NOTE(review): the meaning of the four `nms` values is defined inside
# StandardFasterRCNN and is not visible here — confirm before changing them.
model = StandardFasterRCNN(
    backbone="resnet50_v2",
    num_classes=NUM_CLASSES,
    device=device,
    nms=[20, 5, 50, 2]
    )

# Print summary for a single 3-channel 384x384 input
summary(model,
        input_size=(1,3,384, 384),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

In [None]:
# Define model name
# NOTE(review): this name starts with "modelA_", while the later rename cell
# globs for f"model_{version}_loss_epoch*.pth" — confirm that the file names
# written by engine.train() actually match that pattern.
model_name = f"modelA_{version}.pth"

# Create AdamW optimizer
LR = 1e-5
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    weight_decay=0.01
)

# Create scheduler: cosine annealing over EPOCHS epochs down to eta_min,
# wrapped by FixedLRSchedulerWrapper (presumably holds the rate at
# `fixed_lr` from `fixed_epoch` onwards — confirm in modules.schedulers)
EPOCHS = 30
scheduler = FixedLRSchedulerWrapper(
    scheduler=CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=1e-7),
    fixed_lr=1e-7,
    fixed_epoch=EPOCHS)

# Instantiate the engine with the created model and the target device
engine = ObjectDetectionEngine(
    model=model,
    log_verbose=True,
    device=device)

# Configure the training method
results = engine.train(
    target_dir=MODEL_DIR,                       # Directory where the model will be saved
    model_name=model_name,                      # Name of the model
    save_best_model=["loss", "last"],           # Save the best models based on different criteria
    keep_best_models_in_memory=False,           # Do not keep the models stored in memory for the sake of training time and memory efficiency
    train_dataloader=train_dataloader,          # Train dataloader
    test_dataloader=val_dataloader,             # Val dataloader
    optimizer=optimizer,                        # Optimizer
    scheduler=scheduler,                        # Scheduler
    epochs=EPOCHS,                              # Total number of epochs
    amp=True,                                   # Enable Automatic Mixed Precision (AMP)
    enable_clipping=False,                      # Disable clipping on gradients, only useful if training becomes unstable
    debug_mode=False,                           # Disable debug mode
    accumulation_steps=1,                       # Accumulation steps: effective batch size = batch_size x accumulation steps
    apply_validation=True                       # Enable validation step
    )

# Making Predictions on Validation

In [None]:
# Find the model file with "model_loss_epoch" prefix and rename it
def rename_model(model_name: "str | list[str]", new_name: str):
    """Rename the first checkpoint listed in ``model_name`` to ``new_name``.

    Args:
        model_name: Candidate checkpoint paths, typically the list returned
            by ``glob.glob``. Only the first entry is renamed. A single path
            given as a string (or ``os.PathLike``) is accepted as well.
        new_name: Destination path of the renamed file.

    Raises:
        FileNotFoundError: If ``model_name`` contains no path, e.g. because
            the glob pattern matched nothing.
    """
    # A bare string must be treated as one path; indexing it would yield
    # only its first character.
    if isinstance(model_name, (str, os.PathLike)):
        candidates = [model_name]
    else:
        candidates = list(model_name)

    if not candidates:
        raise FileNotFoundError(
            f"No model file found to rename to {new_name}; check the search pattern"
        )

    old_name = candidates[0]
    os.rename(old_name, new_name)
    print(f"Renamed {old_name} to {new_name}")

# Collect the checkpoints whose file name matches the "loss" pattern and
# rename the first match to the plain f"model_{version}.pth" name.
# NOTE(review): training was started with model_name f"modelA_{version}.pth";
# confirm the saved checkpoints really match "model_{version}_loss_epoch*.pth",
# otherwise the glob returns an empty list. Also note that `model_name` is
# rebound here from a string to a list of paths.
model_name = glob.glob(str(MODEL_DIR / f"model_{version}_loss_epoch*.pth"))
new_model_name = str(MODEL_DIR / f"model_{version}.pth")
rename_model(model_name, new_model_name)

In [None]:
# Instantiate the trained model with the same configuration used for training
model = StandardFasterRCNN(
    backbone="resnet50_v2",
    num_classes=NUM_CLASSES,
    device=device,
    nms=[20, 5, 50, 2]
    )

# Load the parameters of the best model.
# The directory comes from MODEL_DIR (the same constant used when the
# checkpoint was renamed) instead of a second hard-coded "outputs" string.
model = Common().load_model(model, str(MODEL_DIR), f"model_{version}.pth")

# Make predictions on the validation dataloader with the loaded model
preds = ObjectDetectionEngine(
    model=model,
    device=device).predict(
        dataloader=val_dataloader,
        prune_predictions=True,
        score_threshold=0.8,
        iou_threshold=0.01,
        best_candidate="score"
        )

# Configuration parameters for visualization
BOX_COLOR = "blue"
# Box line width derived from the larger image dimension
WIDTH = round(max(img_height, img_width)/175)
# NOTE(review): Windows-specific font path; this will not exist on other
# operating systems.
FONT_TYPE = r"C:\Windows\Fonts\arial.ttf"
FONT_SIZE = 48
# NOTE(review): PRINT_LABELS is not used below; `print_classes` is passed a
# literal True instead — confirm which one is intended.
PRINT_LABELS = True

# Display predictions
# NOTE(review): the `dataloader` argument receives `val_dataset` (the dataset
# object), not `val_dataloader` — confirm this is what the helper expects.
display_and_save_predictions(
    preds=preds,
    dataloader=val_dataset,
    box_color=BOX_COLOR,
    width=WIDTH,
    font_type=FONT_TYPE,
    font_size=FONT_SIZE,
    print_classes=True,
    print_scores=True,
    label_to_class_dict={1: 'cheerios'}
    )

# Inference on Real Data
1. Open an Anaconda terminal
2. Activate the ``EDU`` environment
3. Download the models from this link and copy them to the ``outputs`` folder
4. Execute ``python predict_v6.py --modelA "outputs/modelA_2_0.pth" --modelA "outputs/modelA_7_0.pth" --modelA "outputs/modelA_8_0.pth" --modelB "outputs/modelB_1_0.pth"``
5. Execute ``python convert_preds_to_csv_v3.py``

To get access to the model files, please contact: sergio.sanz.rodriguez@gmail.com