# Fastai v1 : Object detection Tutorial

#### If you like, please don't forget to upvote

In [None]:
!pip install object-detection-fastai

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import defaultdict
import os
from fastai import *
from fastai.vision import *
from fastai.callbacks import *
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.image as immg
from sklearn.model_selection import StratifiedKFold,KFold

from object_detection_fastai.helper.object_detection_helper import *
from object_detection_fastai.loss.RetinaNetFocalLoss import RetinaNetFocalLoss
from object_detection_fastai.models.RetinaNet import RetinaNet
from object_detection_fastai.callbacks.callbacks import BBLossMetrics, BBMetrics, PascalVOCMetric

In [None]:
sns.set_style('darkgrid')

In [None]:
path = Path('/kaggle/input/global-wheat-detection');path.ls()

In [None]:
train = pd.read_csv(path/'train.csv')

In [None]:
train.head()

In [None]:
# Count how many rows of train belong to each image_id (stored as wheat_count).
tr = train.image_id.value_counts()
tr = pd.DataFrame({'image_id':tr.index,'wheat_count':tr.values})
# Shuffle all rows with a fixed seed so the order is reproducible, then renumber the index from 0.
tr = tr.sample(frac=1.,random_state=2020).reset_index(drop=True)

In [None]:
tr.head()

In [None]:
fnames = get_files(path/'train')

## DataFrame Format:

* fastai needs a labelling function that returns the box coordinates first and then the labels. 
* Before we preprocess our data, let's quickly look at an example from fastai

In [None]:
# Fetch fastai's COCO_TINY sample dataset to illustrate the expected label format.
coco_source = untar_data(URLs.COCO_TINY)
# Read the annotation file into two parallel sequences: image names and their box/label entries.
images, lbl_bbox = get_annotations(coco_source/'train.json')
# Pair them up so an entry can be looked up by image name.
img2bbox = dict(zip(images, lbl_bbox))

In [None]:
images[0],lbl_bbox[0]

* we have **images** and **labels**
* Then both get **zipped into a dictionary**

## fastai images label format for object detection

* Now we have our actual data frame above, we need to make some adjustments. 

* Currently our coordinates are **x, y , w , h** and we want **x1, y1, x2, y2**

* To convert it, we need to add our width and height to the respective x and y. 

* To get x2 and y2 we simply add the **width to x** and the **height to y**: **x2 = x + w** and **y2 = y + h**



* **dict contains image_names, bbox, label**

**format**

**{ 'dgdsf244.jpg' : [ [ [ bbox1 ], [ bbox2 ] ] , 
                        [ 'label1' , 'label2' ] ] }**


In [None]:
def _with_progress(iterable, total):
    """Wrap ``iterable`` in a notebook progress bar when tqdm can provide one."""
    try:
        from tqdm.notebook import tqdm
        return tqdm(iterable, total=total)
    except ImportError:
        # tqdm (or the widget backend it needs) is unavailable: iterate silently.
        return iterable


def get_lbl_img(train):
    """Build the ``{file name: [boxes, labels]}`` lookup used to label images.

    Args:
        train: DataFrame with an ``image_id`` column and a ``bbox`` column whose
            values are strings of the form ``"[x, y, width, height]"``.
            The frame is only read, never modified.

    Returns:
        dict mapping ``'<image_id>.jpg'`` to ``[boxes, labels]``. ``boxes`` is a
        list of ``[y1, x1, y2, x2]`` float lists (one per row of that image) and
        ``labels`` is a list of ``'wheat'`` strings of the same length. Keys are
        in order of first appearance of each image_id in ``train``.
    """
    wheat2bbox = {}
    # sort=False keeps the images in order of first appearance.
    grouped = train.groupby('image_id', sort=False)
    for image_id, rows in _with_progress(grouped, grouped.ngroups):
        boxes = []
        for raw in rows['bbox'].values:
            # Strip the surrounding brackets, then parse "x, y, width, height".
            x, y, w, h = (float(v) for v in raw[1:-1].split(','))
            # Convert x,y,w,h to corners (x2 = x + w, y2 = y + h) and emit them
            # with y before x, the order used for fastai in this notebook.
            boxes.append([y, x, y + h, x + w])
        wheat2bbox[str(image_id) + '.jpg'] = [boxes, ['wheat'] * len(boxes)]
    return wheat2bbox

In [None]:
wheat2bbox = get_lbl_img(train)

### Let's look at the dict of images and labels

In [None]:
a = wheat2bbox['d1f73158a.jpg'];a

* Given an image path, the function below simply returns its bboxes and labels

In [None]:
def get_y_func(o):
    """Return the ``[boxes, labels]`` entry of ``wheat2bbox`` for image path ``o``.

    Only the file name of ``o`` (for example ``'d1f73158a.jpg'``) is used as the
    lookup key, so the directory part of the path is ignored.

    Raises:
        KeyError: if the file name has no entry in ``wheat2bbox``.
    """
    return wheat2bbox[Path(o).name]

### Now let's create an object detection data block

In [None]:
# Read the sample submission from the same dataset root as train.csv, so the
# cell does not depend on the current working directory.
sample = pd.read_csv(path/'sample_submission.csv')

In [None]:
sample.head()

In [None]:
image_id = sample.image_id.values

## DataBunch

In [None]:
ts = (ObjectItemList.from_df(sample,path, folder = 'test' , suffix = '.jpg',cols='image_id'))

In [None]:
# Assemble the training/validation/test data with the fastai data block API.
        #Where are the images? -> in path/'train', named <image_id>.jpg from the image_id column of tr
data = (ObjectItemList.from_df(tr,path, folder = 'train' , suffix = '.jpg',cols='image_id')
        #How to split in train/valid? -> randomly, with 20% in valid
        .split_by_rand_pct(0.2)                          
        #How to find the labels? -> use get_y_func on the file name of the data
        .label_from_func(get_y_func)
        #Transforms? -> no transform list is passed here; only size=512 is requested
        .transform(size=512)
        #Test set? -> the unlabelled items in ts
        .add_test(ts)
        #How to batch? -> 2 items per batch, collated with bb_pad_collate
        #(presumably needed because images carry different numbers of boxes; see fastai docs)
        .databunch(bs=2, collate_fn=bb_pad_collate))   

In [None]:
data.show_batch(1 , figsize = (8,8) ,ds_type=DatasetType.Valid)

In [None]:
len(data.train_ds),len(data.valid_ds),len(data.test_ds)

In [None]:
data.classes

In [None]:
size = 512

### What Is an Anchor Box?
* **Anchor boxes are a set of predefined bounding boxes of a certain height and width. These boxes are defined to capture the scale and aspect ratio of specific object classes you want to detect** and are typically chosen based on object sizes in your training datasets. During detection, the predefined anchor boxes are tiled across the image. The network predicts the probability and other attributes, such as background, intersection over union (IoU) and offsets for every tiled anchor box. The predictions are used to refine each individual anchor box. You can define several anchor boxes, each for a different object size. Anchor boxes are fixed initial boundary box guesses.

* **The network does not directly predict bounding boxes, but rather predicts the probabilities and refinements that correspond to the tiled anchor boxes.** The network returns a unique set of predictions for every anchor box defined. The final feature map represents object detections for each class. The use of anchor boxes enables a network to detect multiple objects, objects of different scales, and overlapping objects.

### Advantage of Using Anchor Boxes
* When using anchor boxes, you can evaluate all object predictions at once. Anchor boxes eliminate the need to scan an image with a sliding window that computes a separate prediction at every potential position. 

In [None]:
# Four grid sizes, with 3 ratios x 6 scales = 18 anchor shapes; the same grid sizes
# and the count 18 are passed to RetinaNet further down and must stay in sync with this call.
anchors = create_anchors(sizes=[(32,32),(16,16),(8,8),(4,4)], ratios=[0.5, 1, 2], scales=[0.35, 0.55, 0.75, 1, 1.25, 1.45])

In [None]:
# Show the first validation image and overlay the first 18 anchors on it.
fig,ax = plt.subplots(figsize=(10,10))
ax.imshow(image2np(data.valid_ds[0][0].data))

# 18 = len(ratios) * len(scales); presumably these are all anchors of one grid cell (TODO confirm ordering).
for i, bbox in enumerate(anchors[:18]):
    bb = bbox.numpy()
    # The first two values are shifted by +1 and scaled by size/2, i.e. mapped from a
    # [-1, 1] range to [0, size] pixels -- assumes anchors are stored in [-1, 1] (TODO confirm).
    x = (bb[0] + 1) * size / 2 
    y = (bb[1] + 1) * size / 2 
    # The last two values are only scaled (no shift), so they are treated as extents.
    w = bb[2] * size / 2
    h = bb[3] * size / 2
    
    rect = [x,y,w,h]
    draw_rect(ax,rect)

In [None]:
len(anchors)

## Model Training

In [None]:
# Number of classes reported by the training set (not used below; the model reads data.train_ds.c directly).
n_classes = data.train_ds.c

# Loss function, built on the same anchors that are used for decoding predictions later.
crit = RetinaNetFocalLoss(anchors)

# ResNet-18 body; per fastai's create_body(arch, pretrained, cut) the arguments mean pretrained=True, cut=-2.
encoder = create_body(models.resnet18, True, -2)

# n_anchors=18 equals len(ratios) * len(scales) from create_anchors (3 * 6), and
# sizes=[32,16,8,4] lists the same grid sizes that the anchors were created for.
model = RetinaNet(encoder, n_classes=data.train_ds.c, n_anchors=18, sizes=[32,16,8,4], chs=32, final_bias = -4., n_conv = 2)

* **n_anchors = len(ratios) x len(scales)**

In [None]:
# Pascal VOC metric over the class names; classes[1:] drops the first entry of the
# class list (presumably the background class -- check data.classes).
voc = PascalVOCMetric(anchors, size, [i for i in data.train_ds.y.classes[1:]])
# Learner = data + model + focal loss, with BBMetrics as a callback and voc as the metric.
# model_dir is an absolute path under /kaggle/working, so checkpoints are not written
# next to the data (which lives under /kaggle/input).
learn = Learner(data,
                model, 
                loss_func=crit,
                callback_fns=[BBMetrics],
                metrics=[voc],
                model_dir = '/kaggle/working/')

In [None]:
# Cut the model into layer groups at encoder[6] and at c5top5.
learn.split([model.encoder[6], model.c5top5]);
# Freeze up to the second-to-last layer group (per fastai's freeze_to, the last two groups stay trainable).
learn.freeze_to(-2)
#learn = learn.to_fp16()

In [None]:
#learn.lr_find()
#learn.recorder.plot()

In [None]:
gc.collect()

In [None]:
#learn.unfreeze()
# Train for 4 epochs with the one-cycle policy (max learning rate 1e-3).
# SaveModelCallback monitors 'AP-wheat' and saves a checkpoint named 'best_wheat' on improvement.
learn.fit_one_cycle(4, 1e-3 ,callbacks = [SaveModelCallback(learn, every ='improvement', monitor ='AP-wheat', name ='best_wheat')])

In [None]:
# Restore the 'best_wheat' checkpoint written during training before exporting.
learn.load('best_wheat');
# Export the learner to a single file in the working directory.
learn.export('/kaggle/working/gwheat.pkl')

In [None]:
learn.recorder.plot_losses()

In [None]:
show_results_side_by_side(learn, anchors, detect_thresh=0.5, nms_thresh=0.1, image_count=5)

## If you reached till here please don't forget to upvote.

## Helper functions for predicting on test images

In [None]:
def show_output(item,bboxs_tot,scores_tot):
    """Plot ``item`` and overlay the detections stored in the first list entries.

    Args:
        item: image object whose ``.data`` is displayed.
        bboxs_tot: list of box collections; only ``bboxs_tot[0]`` is drawn. Each
            box is indexed 0..3, and positions 2 and 3 are multiplied together as
            the box area (presumably height and width -- confirm with the caller).
        scores_tot: list of score tensors; ``scores_tot[0]`` supplies the
            confidence printed next to each box.

    When either list is empty (nothing detected) the image is shown without
    boxes instead of raising IndexError.
    """
    fig,ax = plt.subplots(figsize=(10,10))
    ax.imshow(image2np(item.data))
    plt.axis('off')
    if not bboxs_tot or not scores_tot:
        return
    # Boxes covering more than a fifth of a 512x512 image are not drawn.
    area_max = 512**2/5
    for bbox, c in zip(bboxs_tot[0], scores_tot[0].numpy()):
        if bbox[2]*bbox[3] > area_max:
            continue
        txt = 'wheat, {0:.4f}'.format(c)
        # draw_rect receives the first two and last two entries swapped pairwise.
        draw_rect(ax, [bbox[1],bbox[0],bbox[3],bbox[2]], text=txt,text_size=12,color='red')

In [None]:
def process_preds_show(item,clas,bboxs,show_img,cnt,i):
    """Decode raw model output for one test image into prediction strings.

    Args:
        item: the image the predictions belong to (only used for plotting).
        clas: per-image class activations, as returned by ``learn.pred_batch``.
        bboxs: per-image box activations, parallel to ``clas``.
        show_img: plot the detections when True.
        cnt: plot at most the first ``cnt`` images (those with ``i < cnt``).
        i: index of this image in the test set.

    Returns:
        list of ``"score v1 v0 v3 v2"`` strings, one per kept box, where ``v0..v3``
        are the doubled, rounded box values in stored order. The list is empty
        when nothing passes the detection threshold; in that case nothing is
        plotted either.
    """
    detect_thresh = 0.4   # set your own detection threshold
    nms_thresh = 0.1
    pred_string = []
    scores_tot = []
    bboxs_tot = []
    # Respect the caller's flag and additionally cap plotting at the first cnt images.
    show_img = bool(show_img) and i < cnt
    for clas_pred, bbox_pred in zip(clas, bboxs):
        bbox_pred, scores, _ = process_output(clas_pred, bbox_pred, anchors, detect_thresh)
        if bbox_pred is None:
            # Nothing above detect_thresh for this entry.
            continue
        to_keep = nms(bbox_pred, scores, nms_thresh)
        bbox_pred, scores = bbox_pred[to_keep].cpu(), scores[to_keep].cpu()
        t_sz = torch.Tensor([size])[None].cpu()
        bbox_pred = to_np(rescale_boxes(bbox_pred, t_sz))
        # change from center to top left
        bbox_pred[:, :2] = bbox_pred[:, :2] - bbox_pred[:, 2:] / 2
        bboxs_tot.append(bbox_pred)
        scores_tot.append(scores)
    if not bboxs_tot:
        # No detections at all: return an empty result instead of indexing [0].
        return pred_string
    if show_img:
        show_output(item,bboxs_tot,scores_tot)
    # Boxes covering more than a fifth of a 1024x1024 image are dropped.
    area_max = (1024**2)/5
    for s,bbx in zip(scores_tot[0].numpy(),bboxs_tot[0]):
        # Values are doubled (hard-coded 512 -> 1024 rescale) and rounded to ints.
        bbx = [int(round(x)) for x in bbx*2]
        if bbx[2]*bbx[3] <= area_max :
            res = "{0:.4f} {1} {2} {3} {4}".format(s,bbx[1],bbx[0],bbx[3],bbx[2])
            pred_string.append(res)
    return pred_string

In [None]:
def get_prediction(show_img=True,cnt=10):
    """Run the learner on every test image and collect the prediction strings.

    Relies on the notebook globals ``learn``, ``data`` and ``image_id``.

    Args:
        show_img: forwarded unchanged to ``process_preds_show``.
        cnt: forwarded unchanged to ``process_preds_show`` (number of images to show).

    Returns:
        dict mapping ``image_id[i]`` to the space-joined prediction strings of
        the i-th test image.
    """
    def _predict_one(idx):
        # Pick one image, wrap it in a single-item batch and run the model on it.
        test_item = learn.data.test_ds[idx][0]
        one_batch = learn.data.one_item(test_item)
        clas, bboxs, _ = learn.pred_batch(batch=one_batch)
        pieces = process_preds_show(test_item, clas, bboxs, show_img, cnt, idx)
        return " ".join(pieces)

    preds_str = {}
    n_test = len(data.test_ds)
    for idx in range(n_test):
        joined = _predict_one(idx)
        preds_str[image_id[idx]] = joined
    return preds_str

In [None]:
# Run inference over the whole test set; the result maps each image id to its prediction string.
prediction = get_prediction()
# get_prediction accepts show_img (default True) and cnt (default 10, number of images to show).
# The prediction strings are built for every test image regardless of plotting.

In [None]:
# Turn the {image_id: prediction string} dict into a two-column frame:
# orient='index' makes the ids the index, reset_index() moves them into a column.
submit = pd.DataFrame.from_dict(prediction,orient='index').reset_index()
submit.columns = ['image_id','PredictionString']
# Preview the first rows.
submit.head(10)