One-stage detectors suffer from an extreme foreground-background class imbalance problem, and this imbalance is believed to be the central cause of their performance being inferior to that of two-stage detectors.
RetinaNet is a one-stage detector that uses focal loss: “easy” negative samples contribute a lower loss, so the loss focuses on “hard” samples, which improves the prediction accuracy. RetinaNet combines a ResNet+FPN backbone for feature extraction with two task-specific subnetworks, one for classification and one for bounding box regression. It achieves state-of-the-art performance and outperforms Faster R-CNN, the well-known two-stage detector. It is a 2017 ICCV Best Student Paper Award paper with more than 500 citations. (The first author, Tsung-Yi Lin, had become a Research Scientist at Google Brain by the time he presented RetinaNet at ICCV 2017.)

For more info read this [article](https://towardsdatascience.com/review-retinanet-focal-loss-object-detection-38fba6afabe4).

![](https://miro.medium.com/max/1010/1*0-GVAp6WCzPMR6puuaYQTQ.png)

## Load Detectron2

In [None]:
# Install dependencies. Lines starting with `!` are notebook shell commands.
!pip install pyyaml==5.1
import torch, torchvision
# Show the installed torch version and whether CUDA is available.
print(torch.__version__, torch.cuda.is_available())
!gcc --version
import torch
# The wheel index below is the cu102/torch1.7 one, so stop early on any other torch version.
assert torch.__version__.startswith("1.7")
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.7/index.html

## Import Libraries

In [None]:
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()


# import some common libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import  StratifiedShuffleSplit
import os, json, cv2, random

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog 

import copy
import logging
import numpy as np
from typing import Callable, List, Union
import torch

from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T

from detectron2.data import detection_utils as utils
import copy
import detectron2.data.transforms as T
import matplotlib.pyplot as plt
from detectron2.data import DatasetMapper
import torch
import os
import numpy as np

from detectron2.config import configurable

## Load Dataset

In [None]:
# Root folder of the dataset; the training annotations are read from train.csv.
Dataset_Path = '../input/vinbigdata-chest-xray-abnormalities-detection'

df = pd.read_csv(Dataset_Path + '/train.csv')
df.shape

In [None]:
# Derive each box's width, height and area from its corner coordinates.
df['w'] = df['x_max'] - df['x_min']
df['h'] = df['y_max'] - df['y_min']
df['area'] = df['w'] * df['h']
df.head()

In [None]:
# Count the entries in the train folder.
print('Total Images: ', len(os.listdir(f'{Dataset_Path}/train')))

## Splitting Data

In [None]:
# A single stratified 80/20 shuffle split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
# Keep one row per image; that row's class_id is used as the stratification label.
df_dd = df.drop_duplicates('image_id').reset_index()
sss.get_n_splits(df_dd['image_id'], df_dd['class_id'])  # return value is not used
for train_index, test_index in sss.split(df_dd['image_id'], df_dd['class_id']):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Image ids of each side of the split.
    X_train = df_dd.loc[train_index, 'image_id']
    X_test = df_dd.loc[test_index, 'image_id']
    # Matching class labels.
    y_train = df_dd.loc[train_index, 'class_id']
    y_test = df_dd.loc[test_index, 'class_id']

In [None]:
# Class names ordered by class_id; the entry with the highest class_id is dropped.
classes = df.drop_duplicates('class_id').sort_values('class_id')['class_name'].tolist()[:-1]
print(classes)

In [None]:
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

import matplotlib.pyplot as plt
%matplotlib inline


def read_xray(path, voi_lut=True, fix_monochrome=True):
    """Load a DICOM file and return its pixels as an 8-bit grayscale array.

    Args:
        path: Path of the DICOM file to read.
        voi_lut: If True, pass the raw pixels through ``apply_voi_lut``, which
            transforms raw DICOM data to a "human-friendly" view when the
            device provides a VOI LUT.
        fix_monochrome: If True, invert images whose PhotometricInterpretation
            is "MONOCHROME1" so that they do not look inverted.

    Returns:
        ``np.uint8`` array holding the pixel values min-max scaled to 0..255.
        A constant image (max == min) is returned as all zeros.
    """
    # dcmread is the current reader name; read_file is its deprecated alias.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # Depending on this value, the X-ray may look inverted - fix that.
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift so the minimum is 0, then scale the maximum to 1.
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; skip the division so it
    # does not turn into NaNs before the uint8 cast.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [None]:
##### DATASET PREPARATION: CONVERTING THE CSV TO A LIST OF DICTIONARIES CONTAINING ANNOTATIONS, IMAGE PATH, ETC.

from detectron2.structures import BoxMode

def chest_dicts(images, img_dir = '../input/vinbigdata-chest-xray-abnormalities-detection/train'):
    """Build one dataset record per image id from the global ``df`` table.

    Args:
        images: Iterable of image ids (DICOM file names without the extension).
        img_dir: Folder holding the ``<image_id>.dicom`` files.

    Returns:
        List of dicts with the keys ``file_name``, ``image_id``, ``height``,
        ``width`` and ``annotations``. Rows whose ``class_id`` is 14 are left
        out of the annotations.
    """
    records = []
    for position, image_name in enumerate(images):
        dicom_path = os.path.join(img_dir, image_name + '.dicom')
        rows = df[df.image_id == image_name]
        # One XYWH box per annotated row, skipping class 14.
        boxes = [
            {
                "bbox": [int(row.x_min), int(row.y_min), int(row.w), int(row.h)],
                "bbox_mode": BoxMode.XYWH_ABS,
                "category_id": int(row.class_id),
            }
            for _, row in rows.iterrows()
            if row.class_id != 14
        ]
        records.append({
            "file_name": dicom_path,
            "image_id": position,
            "height": 2500,  # placeholder value, not the real image height
            "width": 2500,  # placeholder value, not the real image width
            "annotations": boxes,
        })
    return records
def train():
    """Return the dataset records for the training image ids (``X_train``)."""
    return chest_dicts(X_train)


def val():
    """Return the dataset records for the held-out image ids (``X_test``)."""
    return chest_dicts(X_test)


# Register both splits and attach the class names to their metadata.
for _split_name, _split_loader in (("chest_Train", train), ("chest_Val", val)):
    DatasetCatalog.register(_split_name, _split_loader)
    MetadataCatalog.get(_split_name).set(thing_classes=classes)
Chest_metadata = MetadataCatalog.get("chest_Train")

## Data Visualization

In [None]:
# Draw the ground-truth boxes of a few random validation images.
dataset_dicts = DatasetCatalog.get("chest_Val")
# Never ask for more samples than there are records (random.sample would raise).
for d in random.sample(dataset_dicts, min(3, len(dataset_dicts))):
    img = read_xray(d["file_name"])
    # Visualizer is documented to take an (H, W, C) RGB image, while read_xray
    # gives a 2-D grayscale array: replicate it into three equal channels.
    img_rgb = np.stack([img] * 3, axis=-1)
    visualizer = Visualizer(img_rgb, metadata=Chest_metadata, scale=1.0)
    out = visualizer.draw_dataset_dict(d)
    # The rendered image is RGB already; imshow ignores cmap for RGB input.
    plt.imshow(out.get_image())
    plt.show()

## Dataset Mapper

In [None]:

##### AS THE DATASET CONTAINS DICOM IMAGES, WE NEED A DIFFERENT DATASETMAPPER.

class DatasetMapper:
    """Map a dataset dict that points at a DICOM file to model inputs.

    The image is decoded with ``read_xray``, run through the configured
    augmentations and returned as a ``(1, H, W)`` tensor together with its
    transformed box annotations.
    """

    @configurable
    def __init__(
        self,
        is_train: bool,
        *,
        augmentations: List[Union[T.Augmentation, T.Transform]],
        image_format: str = 'BGR',
        use_instance_mask: bool = False,
        use_keypoint: bool = False,
        instance_mask_format = "polygon",
        keypoint_hflip_indices = None,
        precomputed_proposal_topk= None,
        recompute_boxes: bool = False,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            is_train: whether it's used in training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: stored on the instance; the image itself is always
                decoded by ``read_xray``.
            use_instance_mask, use_keypoint, instance_mask_format,
            keypoint_hflip_indices, precomputed_proposal_topk, recompute_boxes:
                accepted so that the dict built by :meth:`from_config` can be
                passed in unchanged; this mapper does not use them.
        """
        self.is_train = is_train
        self.augmentations = T.AugmentationList(augmentations)
        self.image_format = image_format

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train: bool = True):
        """Translate ``cfg`` into the keyword arguments of ``__init__``.

        The augmentation list comes from ``utils.build_augmentation``; when
        cropping is enabled for training, a ``RandomCrop`` is put in front.
        """
        augs = utils.build_augmentation(cfg, is_train)
        if cfg.INPUT.CROP.ENABLED and is_train:
            augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))
            recompute_boxes = cfg.MODEL.MASK_ON
        else:
            recompute_boxes = False

        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "use_instance_mask": cfg.MODEL.MASK_ON,
            "instance_mask_format": cfg.INPUT.MASK_FORMAT,
            "use_keypoint": cfg.MODEL.KEYPOINT_ON,
            "recompute_boxes": recompute_boxes,
        }

        return ret

    def __call__(self, dataset_dict):
        """Load, augment and tensorize one dataset record.

        Args:
            dataset_dict: record with at least ``file_name``; ``annotations``
                is optional and treated as empty when missing.

        Returns:
            dict with ``image`` (a ``(1, H, W)`` tensor), ``instances`` (the
            transformed annotations), ``height``/``width`` (size of the image
            as read from disk, before augmentation) and, when present in the
            input, ``file_name`` and ``image_id``.
        """
        # Work on a copy: popping "annotations" must not alter the caller's dict.
        dataset_dict = copy.deepcopy(dataset_dict)
        image = read_xray(dataset_dict["file_name"])
        # Real size of the decoded image; the "height"/"width" stored in the
        # record are not used here.
        original_height, original_width = image.shape[:2]

        auginput = T.AugInput(image)
        transform = self.augmentations(auginput)
        # Add a trailing channel axis, then move it first: (H, W) -> (1, H, W).
        image = np.expand_dims(auginput.image, axis=2).copy()
        image = torch.from_numpy(image.transpose(2, 0, 1))
        image_size = image.shape[1:]

        # Records without an "annotations" key yield an empty instance set
        # instead of raising KeyError.
        annos = [
            utils.transform_instance_annotations(annotation, [transform], image_size)
            for annotation in dataset_dict.pop("annotations", [])
        ]

        # create the format that the model expects
        mapped = {
            "image": image,
            "instances": utils.annotations_to_instances(annos, image_size),
            "height": original_height,
            "width": original_width,
        }
        # Keep the identifying fields so outputs can be matched to their image.
        for key in ("file_name", "image_id"):
            if key in dataset_dict:
                mapped[key] = dataset_dict[key]
        return mapped

In [None]:
#### WE ALSO NEED CUSTOM TRAINER TO TELL DETECTRON TO USE OUR CUSTOM DATASETMAPPER
from detectron2.engine import DefaultTrainer


class Trainer(DefaultTrainer):
    """DefaultTrainer whose data loaders use the custom ``DatasetMapper``."""

    @classmethod
    def build_test_loader(cls, cfg, dataset_name):
        """Return the loader for ``dataset_name`` used at evaluation time.

        The mapper is built with ``is_train=False`` so that it is configured
        for inference rather than with the training-time augmentations.
        """
        return build_detection_test_loader(
            cfg, dataset_name, mapper=DatasetMapper(cfg, is_train=False)
        )

    @classmethod
    def build_train_loader(cls, cfg):
        """Return the training loader over the records produced by ``train()``.

        The batch size is taken from ``cfg.SOLVER.IMS_PER_BATCH`` and aspect
        ratio grouping is switched off.
        """
        return build_detection_train_loader(
            dataset=train(),
            mapper=DatasetMapper(cfg, is_train=True),
            aspect_ratio_grouping=False,
            total_batch_size=cfg.SOLVER.IMS_PER_BATCH,
        )


## Change Config

In [None]:
from detectron2.data import MetadataCatalog, build_detection_train_loader,build_detection_test_loader
# Training schedule knobs. Epochs only appears in the commented formula on the `steps` line.
Batch = 10
Epochs = 3
steps = 1000  # increase the steps, e.g. (len(X_train) // Batch) * Epochs
cfg = get_cfg()
# Start from the model-zoo RetinaNet R50-FPN config; its checkpoint is set as the initial weights below.
NAME = "COCO-Detection/retinanet_R_50_FPN_1x.yaml"
cfg.merge_from_file(model_zoo.get_config_file(NAME))
# Dataset names registered in DatasetCatalog.
cfg.DATASETS.TRAIN = ("chest_Train",)
cfg.DATASETS.TEST = ('chest_Val', )
cfg.DATALOADER.NUM_WORKERS = 2
cfg.MODEL.WEIGHTS =  model_zoo.get_checkpoint_url(NAME)
cfg.SOLVER.IMS_PER_BATCH = Batch
cfg.CUDNN_BENCHMARK =  True
# One output class per entry of `classes`.
cfg.MODEL.RETINANET.NUM_CLASSES  = len(classes)
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.LR_SCHEDULER_NAME = "WarmupCosineLR"
cfg.SOLVER.MAX_ITER = steps  
cfg.OUTPUT_DIR = './output'
# Single-entry mean/std: presumably one value per input channel, matching the
# single-channel images produced by the custom mapper - TODO confirm.
cfg.MODEL.PIXEL_MEAN = [103.530]
cfg.MODEL.PIXEL_STD = [1.0]
cfg.SOLVER.CHECKPOINT_PERIOD = 1000
# NOTE(review): presumably only takes effect when
# cfg.SOLVER.CLIP_GRADIENTS.ENABLED is True, which is not set here - confirm.
cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.95
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

trainer = Trainer(cfg) 
# resume=False: presumably starts from cfg.MODEL.WEIGHTS rather than a checkpoint in OUTPUT_DIR - confirm.
trainer.resume_or_load(resume=False)

## Train the Model

In [None]:
# Start training with the trainer built from cfg.
trainer.train()

## Predictions

In [None]:
# Point the config at the weights file in the output folder and build a predictor from it.
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model we just trained
cfg.MODEL.RETINANET.SCORE_THRESH_TEST = 0.1   # set a custom testing threshold
predictor = DefaultPredictor(cfg)

In [None]:
from detectron2.utils.visualizer import ColorMode
# Run the predictor on a few random validation images and draw its detections.
# Never ask for more samples than there are records (random.sample would raise).
for d in random.sample(dataset_dicts, min(3, len(dataset_dicts))):
    im = read_xray(d["file_name"])
    im = np.expand_dims(im, axis=2)  # (H, W, 1) single-channel input for the predictor
    outputs = predictor(im)  # format is documented at https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    # Visualizer is documented to take an (H, W, C) RGB image: replicate the
    # single gray channel three times instead of passing a 2-D array.
    v = Visualizer(
        np.repeat(im, 3, axis=2),
        metadata=Chest_metadata,
        scale=0.5,
    )
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    # get_image() is RGB and so is what plt.imshow expects; reversing the
    # channels here would swap the red and blue of the drawn overlays.
    plt.imshow(out.get_image())
    plt.show()