# Fastai v1 : Object detection Tutorial

In [None]:
!pip install fastai -q --upgrade
from fastai.vision.all import *

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import defaultdict
import os
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.image as immg
from sklearn.model_selection import StratifiedKFold,KFold


In [None]:
sns.set_style('darkgrid')

In [None]:
path = Path('/kaggle/input/dsta-brainhack-2021/c1_release/c1_release');path.ls()

# Other dataset
# path_pascal = untar_data(URLs.PASCAL_2007)

In [None]:
imgs, lbl_bbox = get_annotations(path/'train.json')

# testing out the model
# from IPython.display import Image
# Image("../input/dsta-brainhack-2021/c1_release/c1_release/images"+"/"+imgs[0])

lbl_bbox[0],imgs[0]

In [None]:
# from IPython.display import Image
# Image("./input/dsta-brainhack-2021/c1_release/c1_release/images/3e15222c5563afcb.jpg")


In [None]:
img2bbox = dict(zip(imgs, lbl_bbox))

In [None]:
# Let's check the first item
first = {k: img2bbox[k] for k in list(img2bbox)[:1]}; first

In [None]:
getters = [lambda o: path/'images'/o, lambda o: img2bbox[o][0], lambda o: img2bbox[o][1]]

### Transformation

In [None]:
item_tfms = [Resize(128, method='pad'),]
batch_tfms = [Rotate(), Flip(), Dihedral(), Normalize.from_stats(*imagenet_stats)]
# aug_tfms = [RandomFlip(tfm_y=TfmType.COORD),
#         RandomRotate(30, tfm_y=TfmType.COORD),
#         RandomLighting(0.1,0.1, tfm_y=TfmType.COORD)]

def get_train_imgs(noop):
    """Return the annotated image file names, ignoring the argument.

    Serves as the DataBlock `get_items` callable: whatever source is passed
    in is disregarded and the module-level `imgs` list is handed back as-is.
    """
    return imgs



### Building Datablock

In [None]:
# Detection DataBlock with three blocks: the image, its bounding boxes and the
# box labels. Each block is fed by the matching entry of `getters`; `n_inp=1`
# marks only the first block (the image) as model input.
pascal = DataBlock(blocks=(ImageBlock, BBoxBlock, BBoxLblBlock),
                 splitter=RandomSplitter(),
                 get_items=get_train_imgs,
                 getters=getters,
                 item_tfms=item_tfms,
                 batch_tfms=batch_tfms,
                 n_inp=1)

# NOTE(review): `get_train_imgs` ignores its argument, so the path passed here
# does not appear to influence which items are loaded — confirm.
dls = pascal.dataloaders(path/'images')


In [None]:
dls.c = 6
dls.show_batch(max_n=4,nrows=1)


In [None]:
# previous code

# train = pd.read_csv(path/'train.csv')
# tr = train.image_id.value_counts()
# tr = pd.DataFrame({'image_id':tr.index,'wheat_count':tr.values})
# tr = tr.sample(frac=1.,random_state=2020).reset_index(drop=True)
# 

## The Model

In [None]:
!git clone https://github.com/ExtremelySunnyYK/Practical-Deep-Learning-for-Coders-2.0.git
output_cv_path = "Practical-Deep-Learning-for-Coders-2.0/Computer Vision"
os.chdir(f"/kaggle/working/{output_cv_path}")
from imports import *
from object_detection_metrics.BoundingBox import BoundingBox, BBType, BBFormat
from object_detection_metrics.BoundingBoxes import BoundingBoxes
from object_detection_metrics.Evaluator import Evaluator


In [None]:
object_metrics_path = Path(f"/kaggle/working/{output_cv_path}")
object_metrics_path.ls()

In [None]:
encoder = create_body(resnet50, pretrained=True)
NUM_CLASSES = get_c(dls)


### Architecture

In [None]:
# Build the RetinaNet on top of the pretrained encoder. This assignment has to
# be live: `arch` is consumed by `RetinaNetFocalLoss(arch, ...)` and by
# `Learner(dls, arch, ...)` in the cells that follow.
arch = RetinaNet(encoder, get_c(dls), final_bias=-4)

# from torchvision.models.detection import maskrcnn_resnet50_fpn
# learn = mask_rcnn_learner(dls, maskrcnn_resnet50_fpn, 2)



In [None]:
ratios = [1/2,1,2]
scales = [1,2**(-1/3), 2**(-2/3)]
crit = RetinaNetFocalLoss(arch, scales=scales, ratios=ratios)

In [None]:
def _retinanet_split(m):
    """Split the model's parameters into two groups: the encoder and all remaining layers."""
    non_encoder = nn.Sequential(
        m.c5top6, m.p6top7, m.merges, m.smoothers, m.classifier, m.box_regressor
    )
    return L(m.encoder, non_encoder).map(params)


In [None]:
import pdb
import torchvision

class ThresholdingAndNMS(Callback):
    """Post-process the raw detector output outside of training.

    After the loss has been computed, `learn.pred` is replaced with a padded
    `(boxes, scores)` pair that only contains detections whose best class
    score exceeds `threshold` and that survive per-class non-maximum
    suppression.

    Parameters:
        threshold: minimum best-class score (after sigmoid) for a box to be kept.
        iou_threshold: IoU passed to `torchvision.ops.nms`; overlapping boxes of
            the same class above this value are suppressed.
    """
    def __init__(self, threshold=0.3, iou_threshold=0.5):
        self.threshold=threshold
        self.iou_threshold=iou_threshold

    def after_loss(self):
        if self.training: return # only do this expensive computation during validation/show_results
        box_pred, cls_pred = self.learn.pred
        scores = torch.sigmoid(cls_pred)
        anchors = self.learn.loss_func.anchors
        # Decode the box activations against the anchors (`activ_to_bbox`),
        # convert them with `cthw2tlbr` and clamp the result to [-1, 1].
        recovered_boxes = torch.clamp(cthw2tlbr(activ_to_bbox(box_pred, anchors).view(-1,4)).view(*box_pred.shape), min=-1, max=1)

        one_batch_boxes = []
        one_batch_scores = []
        for i in range(cls_pred.shape[0]):
            cur_box_pred = recovered_boxes[i]
            cur_scores = scores[i]
            max_scores, cls_idx = torch.max(cur_scores, dim=1)
            thresh_mask = max_scores > self.threshold

            cur_keep_boxes = cur_box_pred[thresh_mask]
            cur_keep_scores = cur_scores[thresh_mask]
            cur_keep_cls_idx = cls_idx[thresh_mask]

            one_img_boxes = []
            one_img_scores = []
            for c in range(NUM_CLASSES):
                cls_mask   = cur_keep_cls_idx==c
                if cls_mask.sum()==0:
                    continue
                cls_boxes  = cur_keep_boxes[cls_mask]
                # Full score rows of this class only. The indices returned by
                # NMS are relative to this class subset, so they must be applied
                # to the subset and not to `cur_keep_scores`.
                cls_score_rows = cur_keep_scores[cls_mask]
                cls_scores = cls_score_rows.max(dim=1)[0]
                nms_keep_idx = torchvision.ops.nms(cls_boxes,cls_scores, iou_threshold=self.iou_threshold)
                one_img_boxes += [*cls_boxes[nms_keep_idx]]
                one_img_scores += [*cls_score_rows[nms_keep_idx]]

            one_batch_boxes.append(one_img_boxes)
            one_batch_scores.append(one_img_scores)

        # Images keep different numbers of boxes; pad them to a common length
        # so the batch can be stored as a pair of tensors again.
        padded_boxes, padded_scores = pad_and_merge_scores(one_batch_boxes, one_batch_scores)
        self.learn.pred = to_device((padded_boxes, padded_scores), cls_pred.device)
        

def pad_and_merge_scores(boxes_batch, scores_batch):
    """Stack per-image detections into two fixed-size batch tensors.

    Every image is padded up to the largest detection count in the batch.
    Padding rows hold an all-zero box and the value 10 for every class score;
    that sentinel is ugly, but it lets the metric recognise and drop the rows.

    Returns a `(TensorBBox, TensorMultiCategory)` tuple.
    """
    n_images = len(boxes_batch)
    widest = max(len(img_boxes) for img_boxes in boxes_batch)

    box_out = torch.zeros(n_images, widest, 4).float()
    score_out = torch.zeros(n_images, widest, NUM_CLASSES).float()
    score_out[:, :] = 10  # padding sentinel, overwritten below for real rows

    for img_idx, (img_boxes, img_scores) in enumerate(zip(boxes_batch, scores_batch)):
        for slot, (one_box, one_score) in enumerate(zip(img_boxes, img_scores)):
            box_out[img_idx, slot] = one_box
            score_out[img_idx, slot] = one_score
    return (TensorBBox(box_out), TensorMultiCategory(score_out))

def tlbr2xyxy(box, img_size=(224,224)):
    """Convert one box from the [-1, 1] range to integer pixel corners.

    `box` holds four values (singleton dimensions are squeezed away). They are
    rescaled to [0, 1], then entries 0 and 2 are multiplied by the image width
    and entries 1 and 3 by the image height. `img_size` is `(height, width)`.

    Returns `[x1, y1, x2, y2]`, each value truncated with `int`.
    """
    height, width = img_size
    unit = (box.squeeze() + 1) / 2  # -1..1 -> 0..1
    extents = (width, height, width, height)
    return [int(unit[k] * extents[k]) for k in range(4)]



class mAP(Metric):
    """Mean average precision computed with the Pascal VOC evaluator.

    Detections and ground truths are collected as `BoundingBox` objects in a
    `BoundingBoxes` container, using one image id per accumulated image, and
    are scored with `Evaluator().GetPascalVOCMetrics` when `value` is read.
    """
    def __init__(self):
        self.boxes = BoundingBoxes()
        self.count = 0    # running image id, incremented once per image
        self.res = None   # per-class results of the most recent evaluation

    def reset(self):
        self.boxes.removeAllBoundingBoxes()
        self.count = 0

    def accumulate(self, learn):
        """Add the detections and ground truths of the current batch."""
        pred_boxes, pred_scores = learn.pred
        gt_boxes, gt_cls = learn.yb
        for img_box_pred, img_score_pred, img_box_gt, img_cls_gt in zip(pred_boxes, pred_scores, gt_boxes, gt_cls):

            # Padding rows carry the sentinel 10 in every class score. A row is
            # real only when all of its scores are below 5; testing each score
            # (rather than their sum) keeps real rows whose scores add up to 5
            # or more.
            pred_nonzero_idxs = (img_score_pred < 5).all(dim=-1).float().nonzero()
            if pred_nonzero_idxs.numel() != 0:
                img_cls_pred = img_score_pred[pred_nonzero_idxs].argmax(dim=-1)
                # add predictions for this img
                for box_pred, cls_pred, score_pred in zip(img_box_pred[pred_nonzero_idxs], img_cls_pred, img_score_pred[pred_nonzero_idxs]):
                    # NOTE(review): the +1 presumably skips a background entry at
                    # vocab index 0 — confirm against `learn.dls.vocab`.
                    b = BoundingBox(self.count, learn.dls.vocab[cls_pred.item()+1], *tlbr2xyxy(box_pred),
                                bbType=BBType.Detected, format=BBFormat.XYX2Y2, classConfidence=score_pred.squeeze()[cls_pred.item()])
                    self.boxes.addBoundingBox(b)

            # Ground-truth rows with class 0 are skipped.
            gt_nonzero_idxs   = img_cls_gt.nonzero()
            for box_gt, cls_gt in zip(img_box_gt[gt_nonzero_idxs], img_cls_gt[gt_nonzero_idxs]):
                b = BoundingBox(self.count, learn.dls.vocab[cls_gt.item()], *tlbr2xyxy(box_gt),
                            bbType=BBType.GroundTruth, format=BBFormat.XYX2Y2)
                self.boxes.addBoundingBox(b)
            # increment counter
            self.count += 1

    @property
    def value(self):
        """Mean of the per-class "AP" entries, or 0 when no box was collected."""
        if len(self.boxes.getBoundingBoxes()) == 0:
            return 0
        self.res = Evaluator().GetPascalVOCMetrics(self.boxes)
        return np.mean([cat["AP"] for cat in self.res])

    @property
    def name(self):
        return "mAP"

In [None]:
class LookUpMetric(Metric):
    """Report a single entry's AP from the results held by another metric.

    Parameters:
        reference_metric: metric exposing `res` (a sequence of dicts with an
            "AP" key) and a `value` property that fills it.
        metric_name: label for this entry; "AP" is appended to form `name`.
        lookup_idx: position of the wanted entry in `reference_metric.res`.
    """
    def __init__(self, reference_metric, metric_name, lookup_idx):
        # Plain assignments rather than `store_attr(self, "...")`: that
        # positional form depends on the fastcore release installed.
        self.reference_metric = reference_metric
        self.metric_name = metric_name
        self.lookup_idx = lookup_idx

    def reset(self):
        pass
    def accumulate(self, learn):
        pass

    @property
    def value(self):
        """AP of the looked-up entry, or 0 when the reference has no results."""
        if self.reference_metric.res is None:
            # Reading `value` triggers the evaluation that populates `res`.
            _ = self.reference_metric.value
        results = self.reference_metric.res
        if results is None:
            # The reference metric had nothing to evaluate.
            return 0
        return results[self.lookup_idx]["AP"]

    @property
    def name(self):
        return self.metric_name + "AP"

In [None]:
map_metric = mAP()
metrics = [map_metric]

In [None]:


!pip install fastai==2.0.15
!pip install fastai2==0.0.30
!pip install fastcore==1.0.16

In [None]:

# One per-class AP column per class index; the +1 offset skips vocab entry 0.
per_class_metrics = [
    LookUpMetric(map_metric, dls.vocab[class_idx + 1], class_idx)
    for class_idx in range(NUM_CLASSES)
]
metrics += per_class_metrics

In [None]:
learn = Learner(dls, arch, loss_func=crit, splitter=_retinanet_split, 
                cbs=[ThresholdingAndNMS()], metrics=metrics)
learn.to_fp16()


In [None]:
learn.freeze()

In [None]:
learn.fit_one_cycle(1, lr_max=slice(5e-5,5e-4))


In [None]:
learn.save("freeze-1")

In [None]:
learn.unfreeze()


In [None]:
learn.fit_one_cycle(15, lr_max=slice(1e-5, 1e-4))


In [None]:
# create_head(124, 6)

We have a model with a smoother, a classifier, and a box_regressor (to get our points).



### Loss Function

In [None]:
ratios = [1/2,1,2]
scales = [1,2**(-1/3), 2**(-2/3)]
crit = RetinaNetFocalLoss(arch, scales=scales, ratios=ratios)


### Learner

In [None]:
def _retinanet_split(m):
    """Split the model's parameters into two groups: the encoder and all remaining layers."""
    non_encoder = nn.Sequential(
        m.c5top6, m.p6top7, m.merges, m.smoothers, m.classifier, m.box_regressor
    )
    return L(m.encoder, non_encoder).map(params)


learn = Learner(dls, arch, loss_func=crit, splitter=_retinanet_split)
learn.freeze()


### Training

In [None]:
os.chdir("/kaggle")
os.getcwd()

In [None]:
# dls = pascal.dataloaders(path/'images')

In [None]:
# hyper params
n_epochs = 10
lr_max =  slice(1e-5, 1e-4)

# learn.fine_tune(4)

# learn.fit_one_cycle(n_epochs, lr_max)
learn.fit(n_epochs,lr_max)


In [None]:
learn.summary()


# Exporting Model

In [None]:
learn.export()

### What Is an Anchor Box?
* **Anchor boxes are a set of predefined bounding boxes of a certain height and width. These boxes are defined to capture the scale and aspect ratio of specific object classes you want to detect** and are typically chosen based on object sizes in your training datasets. During detection, the predefined anchor boxes are tiled across the image. The network predicts the probability and other attributes, such as background, intersection over union (IoU) and offsets for every tiled anchor box. The predictions are used to refine each individual anchor box. You can define several anchor boxes, each for a different object size. Anchor boxes are fixed initial bounding box guesses.

* **The network does not directly predict bounding boxes, but rather predicts the probabilities and refinements that correspond to the tiled anchor boxes.** The network returns a unique set of predictions for every anchor box defined. The final feature map represents object detections for each class. The use of anchor boxes enables a network to detect multiple objects, objects of different scales, and overlapping objects.

### Advantage of Using Anchor Boxes
* When using anchor boxes, you can evaluate all object predictions at once. Anchor boxes eliminate the need to scan an image with a sliding window that computes a separate prediction at every potential position. 

In [None]:
anchors = create_anchors(sizes=[(32,32),(16,16),(8,8),(4,4)], ratios=[0.5, 1, 2], scales=[0.35, 0.55, 0.75, 1, 1.25, 1.45])

In [None]:
# Draw the first 18 anchors on top of the first validation image.
fig, ax = plt.subplots(figsize=(10, 10))
first_valid_img = data.valid_ds[0][0]
ax.imshow(image2np(first_valid_img.data))

for anchor in anchors[:18]:
    vals = anchor.numpy()
    # The first two values are shifted from [-1, 1] and scaled to pixels;
    # the last two are only scaled.
    draw_rect(ax, [(vals[0] + 1) * size / 2,
                   (vals[1] + 1) * size / 2,
                   vals[2] * size / 2,
                   vals[3] * size / 2])

In [None]:
len(anchors)

## Model Training

In [None]:
n_classes = data.train_ds.c

crit = RetinaNetFocalLoss(anchors)

encoder = create_body(models.resnet18, True, -2)

model = RetinaNet(encoder, n_classes=data.train_ds.c, n_anchors=18, sizes=[32,16,8,4], chs=32, final_bias = -4., n_conv = 2)

* **n_anchors = len(ratios) x len(scales)**

In [None]:
voc = PascalVOCMetric(anchors, size, [i for i in data.train_ds.y.classes[1:]])
learn = Learner(data,
                model, 
                loss_func=crit,
                callback_fns=[BBMetrics],
                metrics=[voc],
                model_dir = '/kaggle/working/')

In [None]:
learn.split([model.encoder[6], model.c5top5]);
learn.freeze_to(-2)
#learn = learn.to_fp16()

In [None]:
#learn.lr_find()
#learn.recorder.plot()

In [None]:
gc.collect()

In [None]:
#learn.unfreeze()
learn.fit_one_cycle(4, 1e-3 ,callbacks = [SaveModelCallback(learn, every ='improvement', monitor ='AP-wheat', name ='best_wheat')])

In [None]:
learn.load('best_wheat');
learn.export('/kaggle/working/gwheat.pkl')

In [None]:
learn.recorder.plot_losses()

In [None]:
show_results_side_by_side(learn, anchors, detect_thresh=0.5, nms_thresh=0.1, image_count=5)

## If you have read this far, please don't forget to upvote.

## Prediction Test images helpful functions

In [None]:
def show_output(item,bboxs_tot,scores_tot):
    """Display `item` with its predicted boxes and scores drawn in red.

    Only the first entry of `bboxs_tot` and `scores_tot` is drawn. Boxes for
    which `bbox[2]*bbox[3]` exceeds `512**2/5` are skipped. When either list
    is empty, the image is shown without any box.
    """
    fig,ax = plt.subplots(figsize=(10,10))
    ax.imshow(image2np(item.data))
    plt.axis('off')
    # No detections: indexing [0] below would raise IndexError.
    if not bboxs_tot or not scores_tot:
        return
    area_max = 512**2/5
    for bbox, c in zip(bboxs_tot[0], scores_tot[0].numpy()):
        txt = 'wheat, {0:.4f}'.format(c)
        if bbox[2]*bbox[3] <= area_max:
            # draw_rect receives the box with each pair of values swapped.
            draw_rect(ax, [bbox[1],bbox[0],bbox[3],bbox[2]], text=txt,text_size=12,color='red')

In [None]:
def process_preds_show(item,clas,bboxs,show_img,cnt,i):
    """Turn the raw model outputs for one image into prediction strings.

    Parameters:
        item: the image, forwarded to `show_output` for display.
        clas, bboxs: class and box outputs, iterated pairwise.
        show_img: whether displaying images is allowed at all.
        cnt: only images with index `i < cnt` are displayed.
        i: index of the current image.

    Returns:
        A list of strings built with "{0:.4f} {1} {2} {3} {4}" from the score
        and the box values (doubled, rounded to int, each pair swapped). Boxes
        for which `bbx[2]*bbx[3]` exceeds `1024**2/5` are left out. The list is
        empty when nothing passes the detection threshold.
    """
    detect_thresh=0.4   # set your own detection threshold
    nms_thresh=0.1
    pred_string = []
    scores_tot = []
    bboxs_tot = []
    # Display only when the caller asked for it and the image is among the first `cnt`.
    show_img = bool(show_img) and i<cnt
    for clas_pred, bbox_pred in zip(clas, bboxs):
        bbox_pred, scores, preds = process_output(clas_pred, bbox_pred, anchors, detect_thresh)
        if bbox_pred is None:
            # Nothing above the detection threshold for this entry.
            continue
        to_keep = nms(bbox_pred, scores, nms_thresh)
        bbox_pred, preds, scores = bbox_pred[to_keep].cpu(), preds[to_keep].cpu(), scores[to_keep].cpu()
        t_sz = torch.Tensor([size])[None].cpu()
        bbox_pred = to_np(rescale_boxes(bbox_pred, t_sz))
        # change from center to top left
        bbox_pred[:, :2] = bbox_pred[:, :2] - bbox_pred[:, 2:] / 2
        bboxs_tot.append(bbox_pred)
        scores_tot.append(scores)
    if not bboxs_tot:
        # No detections at all: there is nothing to draw or to format.
        return pred_string
    if show_img:
        show_output(item,bboxs_tot,scores_tot)
    area_max = (1024**2)/5
    for s,bbx in zip(scores_tot[0].numpy(),bboxs_tot[0]):
        bbx = [int(round(x)) for x in bbx*2]
        if bbx[2]*bbx[3] <= area_max :
            res = "{0:.4f} {1} {2} {3} {4}".format(s,bbx[1],bbx[0],bbx[3],bbx[2])
            pred_string.append(res)
    return pred_string

In [None]:
def get_prediction(show_img=True,cnt=10):
    """Predict on every test image and collect the prediction strings.

    `show_img` and `cnt` are forwarded to `process_preds_show`: set `show_img`
    to True to see images (False for boxes only) and `cnt` to the number of
    images to show.

    Returns a dict mapping `image_id[i]` to the space-joined prediction
    strings of test image `i`.
    """
    results = {}
    for idx in range(len(data.test_ds)):
        image = learn.data.test_ds[idx][0]  # pick one image
        single_batch = learn.data.one_item(image)
        class_out, box_out, _extra = learn.pred_batch(batch=single_batch)
        strings = process_preds_show(image, class_out, box_out, show_img, cnt, idx)
        results[image_id[idx]] = " ".join(strings)
    return results

In [None]:
prediction = get_prediction()
# Pass show_img=False to skip displaying the images.
# The prediction strings are returned either way.

In [None]:
submit = pd.DataFrame.from_dict(prediction,orient='index').reset_index()
submit.columns = ['image_id','PredictionString']
submit.head(10)