Original notebook (full credit): https://www.kaggle.com/tanulsingh077/end-to-end-object-detection-with-transformers-detr

Adapted  the notebook from the one above.
BBox predictions work fine.
I couldn't figure out the classification: for some reason the predicted label is always 13 (probably because the input labels are wrong or something similar). If you can figure this out, let me know. I really wanted to test DETR, but I am running out of time.

In [None]:
!git clone https://github.com/facebookresearch/detr.git   #cloning github repo of detr to import its unique loss

In [None]:
import os
import numpy as np 
import pandas as pd 
from datetime import datetime
import time
import random
from tqdm.autonotebook import tqdm


#Torch
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler

#sklearn
from sklearn.model_selection import StratifiedKFold

#CV
import cv2

################# DETR FUNCTIONS FOR LOSS ########################
import sys
sys.path.append('./detr/')

from detr.models.matcher import HungarianMatcher
from detr.models.detr import SetCriterion # loss criterion: called on (outputs, targets) it returns the dict of training losses
#################################################################

#Albumentations
import albumentations as A
import matplotlib.pyplot as plt
from albumentations.pytorch.transforms import ToTensorV2

#Glob
from glob import glob
from sklearn.model_selection import GroupKFold, train_test_split



# Utils

* AverageMeter - class for averaging loss,metric,etc over epochs

In [None]:
class AverageMeter(object):
    """Running tracker for a scalar: latest value, weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted ``n`` times in the mean."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

# Configuration

Basic configuration for this model

In [None]:
n_folds = 5
seed = 42
# The criterion is created as SetCriterion(num_classes - 1, ...) and the
# classification head gets `num_classes` outputs, so this value must include
# the extra "no object" slot: 14 findings (labels 0-13) + 1.
# With 14 here, label 13 shared its index with the no-object class, which is
# what unmatched queries are trained to predict.
num_classes = 15
num_queries = 14
# Passed to SetCriterion as `eos_coef`, i.e. the relative weight of the
# no-object class in the classification loss.
null_class_coef = 0.5
BATCH_SIZE = 8
LR = 2e-5
EPOCHS = 1
size = 512
DIR_TRAIN = '../input/vinbigdata-512-image-dataset/vinbigdata/train'
DIR_TEST = '../input/vinbigdata-512-image-dataset/vinbigdata/test'


# Seed Everything

Seeding everything for reproducible results

In [None]:
def seed_everything(seed):
    """Seed every random number generator used here for reproducible runs.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover multi-GPU runs as well
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the purpose of seeding, so it has to be off.
    torch.backends.cudnn.benchmark = False

In [None]:
seed_everything(seed)

# Preparing the Data

* For preparation of data I use code from Alex's awesome kernel [here](https://www.kaggle.com/shonenkov/training-efficientdet)
* The data can be split into any number of folds; here the split is grouped by `image_id` (GroupKFold), so all boxes of one image fall into the same fold

In [None]:
df = pd.read_csv('../input/vinbigdata-512-image-dataset/vinbigdata/train.csv')
df.head()

In [None]:
# Express the box corners as fractions of the image width / height.
# Whole-column arithmetic gives the same values as a row-wise `apply`
# without running a Python lambda per row.
df['x_min'] = df['x_min'] / df['width']
df['y_min'] = df['y_min'] / df['height']

df['x_max'] = df['x_max'] / df['width']
df['y_max'] = df['y_max'] / df['height']

# Box centre, extent and area, all in the same fractional units.
df['x_mid'] = (df['x_max'] + df['x_min']) / 2
df['y_mid'] = (df['y_max'] + df['y_min']) / 2

df['w'] = df['x_max'] - df['x_min']
df['h'] = df['y_max'] - df['y_min']

df['area'] = df['w']*df['h']
df.head()

In [None]:
# Creating Folds
# NOTE(review): `skf` is constructed but never used in the code visible here;
# the folds below come from GroupKFold only.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

# -1 marks rows that have not been assigned to a fold yet.
df['fold'] = -1
# NOTE(review): 3 splits are hard-coded here although `n_folds` is 5 - confirm which is intended.
group_kfold  = GroupKFold(n_splits = 3)
# Grouping by image_id keeps every annotation row of an image in the same fold.
for fold, (train_index, val_index) in enumerate(group_kfold.split(df, groups=df.image_id.tolist())):
    # `.loc` is label-based; this assumes df still has its default 0..n-1 index - TODO confirm.
    df.loc[val_index, 'fold'] = fold
df.head()

In [None]:
# Remove columns that the code visible in this notebook does not read after this point.
df = df.drop(columns=['rad_id','width', 'height'])
df.head()

# Augmentations

* As suggested by aleksendra in her kernel, augmentations will play a major role, hence I took her advice and use a set of awesome augmentations; cut-mix and others will be included in future versions

In [None]:
def get_train_transforms():
    """Build the training augmentation pipeline.

    Colour jitter (one of HSV shift / brightness-contrast), rare grayscale,
    horizontal and vertical flips, resize to 512x512, cutout and tensor
    conversion. Boxes are declared in 'coco' format with their labels in
    the ``labels`` field.
    """
    colour_jitter = A.OneOf(
        [
            A.HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.9),
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.9),
        ],
        p=0.9,
    )
    steps = [
        colour_jitter,
        A.ToGray(p=0.01),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Resize(height=512, width=512, p=1),
        A.Cutout(num_holes=8, max_h_size=64, max_w_size=64, fill_value=0, p=0.5),
        ToTensorV2(p=1.0),
    ]
    box_settings = A.BboxParams(format='coco', min_area=0, min_visibility=0, label_fields=['labels'])
    return A.Compose(steps, p=1.0, bbox_params=box_settings)

def get_valid_transforms():
    """Build the validation pipeline: resize to 512x512 and convert to a tensor.

    Boxes are declared in 'coco' format with their labels in the ``labels`` field.
    """
    steps = [
        A.Resize(height=512, width=512, p=1.0),
        ToTensorV2(p=1.0),
    ]
    box_settings = A.BboxParams(format='coco', min_area=0, min_visibility=0, label_fields=['labels'])
    return A.Compose(steps, p=1.0, bbox_params=box_settings)

def get_test_transforms():
    """Build the test-time pipeline: resize to 512x512 and convert to a tensor (no boxes)."""
    steps = [
        A.Resize(height=512, width=512, p=1.0),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps, p=1.0)

# Creating Dataset

* I hope you have watched the video by now. DETR accepts data in COCO format, which is (x, y, w, h). For those who do not know, there are two widely used formats: COCO (x, y, w, h) and Pascal VOC (xmin, ymin, xmax, ymax). So now we need to prepare the data in that format

In [None]:
class Dataset(Dataset):
    """Detection dataset returning ``(image, target, image_id)`` per dataframe row.

    Each index maps to one row of ``dataframe``; the sample contains every
    box recorded for that row's image together with one label per box.

    Args:
        image_ids: sequence of image ids, indexed like the dataframe rows.
        class_ids: sequence of class ids, indexed like ``image_ids``. Only used
            as a fallback when the dataframe has no ``class_id`` column.
        dataframe: annotations with ``image_id``, ``x_min``, ``y_min``, ``w``
            and ``h`` columns.
        DIR: directory containing ``<image_id>.png`` files.
        transforms: optional callable taking ``image``, ``bboxes`` and
            ``labels`` keyword arguments and returning a dict with those keys.
    """

    def __init__(self,image_ids,class_ids,dataframe,DIR,transforms=None):
        self.image_ids = image_ids
        self.class_ids = class_ids
        self.df = dataframe
        self.transforms = transforms
        self.DIR = DIR

    def __len__(self) -> int:
        return self.df.shape[0]

    def __getitem__(self,index):
        image_id = self.image_ids[index]
        records = self.df[self.df['image_id'] == image_id]

        image_path = f'{self.DIR}/{image_id}.png'
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        # Boxes as (x, y, w, h), one row per annotation of this image.
        # NOTE(review): these columns are already divided by the image
        # width/height when the dataframe is prepared, yet they are passed
        # through a 'coco'-format pipeline and normalised again below -
        # confirm the intended units.
        boxes = records[['x_min', 'y_min', 'w', 'h']].values

        # Area of each box, computed before any augmentation.
        area = torch.as_tensor(boxes[:, 2] * boxes[:, 3], dtype=torch.float32)

        # One label per box. Previously a single label was emitted for the
        # whole sample, so labels and boxes had different lengths.
        if 'class_id' in records.columns:
            labels = records['class_id'].values
        else:
            labels = np.full(len(boxes), self.class_ids[index])

        if self.transforms:
            sample = self.transforms(image=image, bboxes=boxes, labels=labels)
            image = sample['image']
            boxes = sample['bboxes']
            labels = sample['labels']
            # assumes the pipeline ends with a channels-first tensor conversion - TODO confirm
            _, h, w = image.shape
        else:
            # Raw cv2 array: height and width are the first two axes.
            boxes = [tuple(box) for box in boxes]
            h, w = image.shape[:2]

        # Normalizing BBOXES
        boxes = A.augmentations.bbox_utils.normalize_bboxes(boxes, rows=h, cols=w)

        target = {}
        target['boxes'] = torch.as_tensor(boxes, dtype=torch.float32)
        target['labels'] = torch.as_tensor(labels, dtype=torch.long)
        target['image_id'] = torch.tensor([index])
        target['area'] = area

        return image, target, image_id

# Model

* Initial DETR model is trained on coco dataset , which has 91 classes + 1 background class , hence we need to modify it to take our own number of classes
* Also DETR model takes in 100 queries ie ,it outputs total of 100 bboxes for every image , we can very well change that too

In [None]:
class DETRModel(nn.Module):
    """Pretrained DETR (ResNet-50) with its classification head replaced.

    Args:
        num_classes: number of outputs of the new classification layer.
        num_queries: value stored on the wrapped model as ``num_queries``.
    """

    def __init__(self,num_classes,num_queries):
        super().__init__()
        self.num_classes = num_classes
        self.num_queries = num_queries

        detr = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=True)
        self.model = detr
        self.in_features = detr.class_embed.in_features

        # Swap in a freshly initialised head sized for our classes.
        detr.class_embed = nn.Linear(in_features=self.in_features, out_features=self.num_classes)
        # NOTE(review): this only sets an attribute on the hub model; whether
        # the number of predictions per image actually changes is not visible
        # here - confirm against the DETR implementation.
        detr.num_queries = self.num_queries

    def forward(self,images):
        """Run the wrapped DETR model on ``images`` and return its output unchanged."""
        predictions = self.model(images)
        return predictions

# Matcher and Bipartite Matching Loss

Now we make use of the unique loss that the model uses, and for that we need to define the matcher. DETR calculates three individual losses:
* Classification Loss for labels(its weight can be set by loss_ce)
* Bbox Loss (its weight can be set by loss_bbox)
* Loss for Background class

In [None]:
'''
code taken from github repo detr , 'code present in engine.py'
'''

# Matcher with its default settings; handed to the criterion when it is built.
matcher = HungarianMatcher()

# Weight applied to each entry of the criterion's loss dict when the
# training / evaluation loops sum them into one scalar.
weight_dict = {
    'loss_ce': 1,
    'loss_bbox': 1,
    'loss_giou': 1,
}

# Names of the loss terms the criterion is asked to compute.
losses = ['labels', 'boxes', 'cardinality']

# Training Function

Training of DETR is unique and different from Faster R-CNN and EfficientDet, as we train the criterion as well; the training function can be viewed here: https://github.com/facebookresearch/detr/blob/master/engine.py

In [None]:
def train_fn(data_loader,model,criterion,optimizer,device,scheduler,epoch):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        data_loader: yields ``(images, targets, image_ids)`` batches.
        model: detection model called with a list of image tensors.
        criterion: loss module returning a dict of losses; must expose ``weight_dict``.
        optimizer: optimizer stepped once per batch.
        device: device the images and target tensors are moved to.
        scheduler: optional LR scheduler, stepped once per batch when given.
        epoch: current epoch index (accepted for the caller, not used here).

    Returns:
        AverageMeter holding the running mean of the weighted total loss.
    """
    model.train()
    criterion.train()

    summary_loss = AverageMeter()

    # Iterate the same bar that receives the postfix; looping over a second
    # tqdm wrapper left this one frozen at 0%.
    tk0 = tqdm(data_loader, total=len(data_loader))

    for images, targets, image_ids in tk0:
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        output = model(images)

        loss_dict = criterion(output, targets)
        weight_dict = criterion.weight_dict

        # Only losses that have a configured weight contribute to the total.
        losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        # Weight by the real batch length so a shorter final batch does not
        # skew the average.
        summary_loss.update(losses.item(), len(images))
        tk0.set_postfix(loss=summary_loss.avg)

    return summary_loss

# Eval Function

In [None]:
def eval_fn(data_loader, model,criterion, device):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        data_loader: yields ``(images, targets, image_ids)`` batches.
        model: detection model called with a list of image tensors.
        criterion: loss module returning a dict of losses; must expose ``weight_dict``.
        device: device the images and target tensors are moved to.

    Returns:
        AverageMeter holding the running mean of the weighted total loss.
    """
    model.eval()
    criterion.eval()
    summary_loss = AverageMeter()

    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for images, targets, image_ids in tk0:
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            output = model(images)

            loss_dict = criterion(output, targets)
            weight_dict = criterion.weight_dict

            # Only losses that have a configured weight contribute to the total.
            losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)

            # Weight by the real batch length so a shorter final batch does
            # not skew the average.
            summary_loss.update(losses.item(), len(images))
            tk0.set_postfix(loss=summary_loss.avg)

    return summary_loss

# Engine

In [None]:
def collate_fn(batch):
    """Transpose a list of per-sample tuples into a tuple of per-field tuples."""
    columns = zip(*batch)
    return tuple(columns)

In [None]:
def run(fold):
    """Train and validate the model with ``fold`` held out for validation.

    Rows of the global ``df`` whose ``fold`` column equals ``fold`` form the
    validation set; all other rows are used for training. After every epoch
    the weights are saved to ``detr_best_<fold>.pth`` whenever the validation
    loss improves.

    Args:
        fold: value of the ``fold`` column to hold out.
    """
    df_train = df[df['fold'] != fold]
    df_valid = df[df['fold'] == fold]

    train_dataset = Dataset(
        image_ids=df_train['image_id'].values,
        class_ids=df_train['class_id'].values,
        dataframe=df_train,
        DIR=DIR_TRAIN,
        transforms=get_train_transforms()
    )

    valid_dataset = Dataset(
        image_ids=df_valid['image_id'].values,
        class_ids=df_valid['class_id'].values,
        dataframe=df_valid,
        DIR=DIR_TRAIN,
        transforms=get_valid_transforms()
    )

    # NOTE(review): the training loader is not shuffled - confirm this is intended.
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=4,
        collate_fn=collate_fn
    )

    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=4,
        collate_fn=collate_fn
    )

    # Use the GPU when there is one, otherwise fall back to the CPU instead
    # of failing on the first `.to(device)`.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = DETRModel(num_classes=num_classes,num_queries=num_queries)
    model = model.to(device)
    criterion = SetCriterion(num_classes-1, matcher, weight_dict, eos_coef = null_class_coef, losses=losses)
    criterion = criterion.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    # Any finite validation loss beats the initial value.
    best_loss = float('inf')
    for epoch in range(EPOCHS):
        train_loss = train_fn(train_data_loader, model,criterion, optimizer,device,scheduler=None,epoch=epoch)
        valid_loss = eval_fn(valid_data_loader, model,criterion, device)

        print(f'|EPOCH {epoch+1}| TRAIN_LOSS {train_loss.avg}| VALID_LOSS {valid_loss.avg}|')

        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            print(f'Best model found for Fold {fold} in Epoch {epoch+1}........Saving Model')
            torch.save(model.state_dict(), f'detr_best_{fold}.pth')

In [None]:
run(fold=1)

In [None]:
def generate_prediction_string(out, threshold=None):
    """Turn a batch of DETR outputs into one prediction string per image.

    Each kept detection contributes ``"<class> <confidence> <b0> <b1> <b2> <b3>"``
    and the detections of an image are joined with single spaces. An image
    without any detection above the threshold yields an empty string.

    Args:
        out: dict with ``pred_boxes`` of shape (batch, queries, 4) and
            ``pred_logits`` of shape (batch, queries, classes + 1), where the
            last logit is taken to be the "no object" class (DETR convention).
        threshold: minimum confidence for a detection to be kept. Defaults to
            the module-level ``confidence_thrsh``.

    Returns:
        List of prediction strings, one per image in the batch.
    """
    if threshold is None:
        threshold = confidence_thrsh

    prediction_strings = []
    for bboxes, logits in zip(out['pred_boxes'], out['pred_logits']):
        # Same box arithmetic as before: scale the normalised values to
        # 512 px, truncate, double them and clip to [0, 1023].
        # NOTE(review): the four values are emitted in the order the model
        # predicts them - confirm this matches the expected box layout.
        boxes = (bboxes.detach().cpu().numpy() * 512).astype(np.int32)
        boxes = (boxes * 2).astype(np.int32).clip(min=0, max=1023)

        # Class and confidence are taken per query, ignoring the trailing
        # no-object column. Previously every box was paired with the class
        # probabilities of query 0, so the reported class never changed.
        probs = logits.softmax(-1).detach().cpu().numpy()
        class_probs = probs[:, :-1]
        labels = class_probs.argmax(axis=1)
        scores = class_probs.max(axis=1)

        detections = [
            f"{int(label)} {round(float(score), 4)} " + ' '.join(str(int(v)) for v in box)
            for box, label, score in zip(boxes, labels, scores)
            if score > threshold
        ]
        prediction_strings.append(' '.join(detections))

    return prediction_strings

In [None]:
# Display names for the predicted classes; view_sample looks a name up as
# CLASSES[cls], where cls is the argmax over the first len(CLASSES) logits.
CLASSES = [ 'Aortic enlargement',
            'Atelectasis',
            'Calcification',
            'Cardiomegaly',
            'Consolidation',
            'ILD',
            'Infiltration',
            'Lung Opacity',
            'Nodule/Mass',
            'Other lesion',
            'Pleural effusion',
            'Pleural thickening',
            'Pneumothorax',
            'Pulmonary fibrosis']


In [None]:
from PIL import Image, ImageDraw, ImageFont

In [None]:
def view_sample(df_valid,model,device):
    '''
    Run the model on the first validation batch and print its top boxes.

    Code taken from Peter's Kernel
    https://www.kaggle.com/pestipeti/pytorch-starter-fasterrcnn-train

    Args:
        df_valid: dataframe with the rows to build the validation dataset from.
        model: trained detection model.
        device: device the model and the images are moved to.

    Returns:
        The model's output dict for the batch, with every tensor moved to the CPU.
    '''
    valid_dataset = Dataset(image_ids=df_valid['image_id'].values,
                            class_ids=df_valid['class_id'].values,
                            dataframe=df_valid,
                            transforms=get_valid_transforms(),
                            DIR=DIR_TRAIN)

    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=15,
                                   shuffle=False,
                                   num_workers=4,
                                   collate_fn=collate_fn)

    # A single fetch is enough: calling next(iter(...)) twice restarted the
    # loader and loaded the same first batch again.
    images, targets, image_ids = next(iter(valid_data_loader))
    images = [img.to(device) for img in images]

    model.eval()
    model.to(device)
    with torch.no_grad():
        raw_outputs = model(images)

    output = {k: v.to(torch.device("cpu")) for k, v in raw_outputs.items()}

    # Rank the queries of the first image by their best class probability
    # among the named classes and keep at most 15 of them.
    pred_logits = output['pred_logits'][0][:, :len(CLASSES)]
    pred_boxes = output['pred_boxes'][0]

    best_scores = pred_logits.softmax(-1).max(-1).values
    # Guard k so a model with fewer than 15 queries does not make topk fail.
    topk = best_scores.topk(min(15, best_scores.numel()))
    pred_boxes = pred_boxes[topk.indices]

    # Printed once; the old loop printed this same tensor on every iteration
    # and its remaining per-box computations were never used.
    print(pred_boxes)

    return output
 
    
# Rebuild the architecture and restore the weights that run(fold=1) saved.
# map_location="cpu" lets the checkpoint load on a machine without CUDA;
# the code that uses the model moves it to the target device itself.
model = DETRModel(num_classes=num_classes,num_queries=num_queries)
model.load_state_dict(torch.load("./detr_best_1.pth", map_location="cpu"))

In [None]:
outputs = view_sample(df[df['fold'] == 1],model=model,device=torch.device('cuda'))

In [None]:
class Datasettest(Dataset):
    """Inference dataset returning ``(image, image_id)`` per dataframe row.

    Args:
        image_ids: sequence of image ids, indexed like the dataframe rows.
        dataframe: frame whose row count defines the dataset length.
        DIR: directory containing ``<image_id>.png`` files.
        transforms: optional callable taking an ``image`` keyword argument
            and returning a dict with an ``image`` key.
    """

    def __init__(self,image_ids,dataframe,DIR,transforms=None):
        self.image_ids = image_ids
        self.df = dataframe
        self.transforms = transforms
        self.DIR = DIR

    def __len__(self) -> int:
        return self.df.shape[0]

    def __getitem__(self,index):
        image_id = self.image_ids[index]

        image_path = f'{self.DIR}/{image_id}.png'
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        if self.transforms:
            image = self.transforms(image=image)['image']

        return image, image_id

In [None]:
df_test = pd.read_csv('../input/vinbigdata-512-image-dataset/vinbigdata/test.csv')
test_dataset = Datasettest(image_ids=df_test['image_id'].values,
                           dataframe=df_test,
                           transforms=get_test_transforms(),
                           DIR=DIR_TEST)

test_data_loader = DataLoader(test_dataset,
                              batch_size=8,
                              shuffle=False,
                              num_workers=4,
                              collate_fn=collate_fn)

# Use the GPU when there is one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
confidence_thrsh = 0.5
# Row written for images that end up without any prediction.
DEFAULT_PREDICTION = "14 1 0 0 1 1"
final_predictionString = []
predicted_image_ids = []

# Mode and device do not change between batches, so set them once.
model.eval()
model.to(device)

for images, image_ids in tqdm(test_data_loader, total=len(test_data_loader)):
    images = [img.to(device) for img in images]

    with torch.no_grad():
        outputs = model(images)

    out = {k: v.cpu() for k, v in outputs.items()}
    PredictionString = generate_prediction_string(out)
    print(PredictionString)

    final_predictionString.extend(PredictionString)
    # Keep the ids next to their predictions instead of relying on the
    # loader returning rows in dataframe order.
    predicted_image_ids.extend(image_ids)

pred_df = pd.DataFrame({'image_id': predicted_image_ids, 'PredictionString': final_predictionString})
sub_df = pd.merge(df_test, pred_df, on = 'image_id', how = 'left')
# Images with no detection above the threshold produce an empty string,
# which fillna alone does not catch; give both cases the default row.
sub_df['PredictionString'] = (
    sub_df['PredictionString'].fillna(DEFAULT_PREDICTION).replace('', DEFAULT_PREDICTION)
)
sub_df = sub_df[['image_id', 'PredictionString']]
sub_df.to_csv('/kaggle/working/submission.csv',index = False)
sub_df.tail()

In [None]:
for value in sub_df["PredictionString"].values:
    print(value)