# Object Detection

## First let's look at the dataset

## And try to visualize the bounding box

In [1]:
# my tool box for pytorch
from p3self.matchbox import *
from constant import *
from utils import *

In [2]:
%ls {IMG}|wc -l

118287


In [3]:
# %matplotlib inline
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import matplotlib.text as text
import numpy as np
import pandas as pd
import json
from glob import glob
from PIL import Image
import os

# Make sure the /data/bbsample directory exists (`mkdir -p` does not fail when it already does).
# os.system returns the shell's exit status, which is the 0 shown as this cell's output.
# NOTE(review): the path is absolute and machine-specific; nothing in the visible cells writes to it.
os.system("mkdir -p /data/bbsample")

0

#### Image table: image URL and image id — that's all we need for now

In [4]:
# One row per image, built from the "images" list of the annotation dictionary.
# NOTE(review): `jsdict` is not defined in any visible cell — presumably it is provided by
# one of the star imports in the first cell; confirm where it is loaded.
imgdf=pd.DataFrame(jsdict["images"])
imgdf.head()

Unnamed: 0,coco_url,date_captured,file_name,flickr_url,height,id,license,width
0,http://images.cocodataset.org/train2017/000000...,2013-11-14 11:18:45,000000391895.jpg,http://farm9.staticflickr.com/8186/8119368305_...,360,391895,3,640
1,http://images.cocodataset.org/train2017/000000...,2013-11-14 11:38:44,000000522418.jpg,http://farm1.staticflickr.com/1/127244861_ab0c...,480,522418,4,640
2,http://images.cocodataset.org/train2017/000000...,2013-11-14 12:36:29,000000184613.jpg,http://farm3.staticflickr.com/2169/2118578392_...,336,184613,3,500
3,http://images.cocodataset.org/train2017/000000...,2013-11-14 13:02:53,000000318219.jpg,http://farm5.staticflickr.com/4125/5094763076_...,640,318219,3,556
4,http://images.cocodataset.org/train2017/000000...,2013-11-14 16:03:19,000000554625.jpg,http://farm5.staticflickr.com/4086/5094162993_...,640,554625,3,426


#### An annotation table, in this case, we use "bbox"

In [5]:
# One row per annotation, built from the "annotations" list of the same dictionary.
# The "bbox" and "image_id" columns are the ones used by the cells further down.
# NOTE(review): `jsdict` is not defined in any visible cell — presumably star-imported; confirm.
ann_df=pd.DataFrame(jsdict["annotations"])
ann_df.head()

Unnamed: 0,area,bbox,category_id,id,image_id,iscrowd,segmentation
0,2765.14865,"[199.84, 200.46, 77.71, 70.88]",58,156,558840,0,"[[239.97, 260.24, 222.04, 270.49, 199.84, 253...."
1,1545.4213,"[234.22, 317.11, 149.39, 38.55]",58,509,200365,0,"[[247.71, 354.7, 253.49, 346.99, 276.63, 337.3..."
2,5607.66135,"[239.48, 347.87, 160.0, 57.81]",58,603,200365,0,"[[274.58, 405.68, 298.32, 405.68, 302.45, 402...."
3,0.0,"[296.65, 388.33, 1.03, 0.0]",58,918,200365,0,"[[296.65, 388.33, 296.65, 388.33, 297.68, 388...."
4,800.41325,"[251.87, 333.42, 125.94, 22.71]",58,1072,200365,0,"[[251.87, 356.13, 260.13, 343.74, 300.39, 335...."


#### A category table

In [6]:
len(idx2name)

80

In [7]:
urls = glob(IMG+"/*")

In [8]:
urls[:5]

['/terminus/coco/train2017/000000184383.jpg',
 '/terminus/coco/train2017/000000038938.jpg',
 '/terminus/coco/train2017/000000029019.jpg',
 '/terminus/coco/train2017/000000256529.jpg',
 '/terminus/coco/train2017/000000427301.jpg']

#### Get the image id from image url

In [9]:
# Pick one image path at random for the visualisation below.
# NOTE(review): no random seed is set in the visible cells, so a different image is drawn on every run.
rdimg = np.random.choice(urls)
def get_id(url):
    """Return the integer image id encoded in the file name of ``url``.

    The id is whatever precedes the first "." in the last "/"-separated
    component, e.g. ".../train2017/000000184383.jpg" -> 184383.
    Raises ValueError when that part is not an integer literal.
    """
    file_name = url.rsplit("/", 1)[-1]
    stem, _, _ = file_name.partition(".")
    return int(stem)

#### Get the bounding box data from annotation

In [10]:
def get_bb(rdimg):
    """Look up every annotation of one image.

    rdimg: path of the image; its id is derived from the file name via ``get_id``.

    Returns a pair ``(bboxes, category_ids)`` of equally long lists taken from the
    "bbox" and "category_id" columns of the module-level ``ann_df``; both lists are
    empty when the image has no annotation rows.
    """
    wanted_id = get_id(rdimg)
    rows = ann_df.loc[ann_df["image_id"] == wanted_id, ["bbox", "category_id"]]
    boxes = list(rows["bbox"])
    category_ids = list(rows["category_id"])
    return boxes, category_ids

In [11]:
get_bb(rdimg)

([[323.6, 125.99, 79.47, 301.01],
  [371.09, 74.94, 167.72, 257.36],
  [176.41, 110.22, 107.27, 316.78],
  [0.0, 189.03, 640.0, 172.72],
  [3.95, 345.39, 92.12, 81.61],
  [547.48, 272.28, 92.52, 153.58],
  [429.22, 333.69, 113.63, 92.96],
  [387.96, 390.18, 38.74, 36.82]],
 [1, 1, 1, 42, 62, 62, 62, 62])

#### Draw each bounding box onto the picture in a loop

In [12]:
fig, ax = plt.subplots(1)
ax.imshow(Image.open(rdimg))
bbs, cids = get_bb(rdimg)

# Text style shared by every class label: same red as the box outline.
label_style = {"color": "#ff0000"}

for bb, cid in zip(bbs, cids):
    # format of the bb: x, y, width, height — (x, y) is the corner handed to Rectangle
    x, y, w, h = bb[:4]
    rect = patches.Rectangle((x, y), w, h, linewidth=1, edgecolor='r', facecolor='none')
    ax.add_patch(rect)
    # Write the class name at the same corner the rectangle starts from.
    ax.text(x, y, idx2name[cid], label_style)

In [13]:
fig.savefig("bbtest.png")

## You Only Look Once

### Introduction

From the paper [You Only Look Once: Unified, Real-Time Object Detection](http://arxiv.org/abs/1506.02640)

We divide the image into a grid of $S*S$ cells.

In each grid cell, we predict $B$ bounding boxes 

For each of the $B$ boxes we make 5 predictions: $x, y, w, h$ and a confidence score.

$x, y$ are relative to the grid cell; $w, h$ are relative to the entire picture.

Each grid cell, we predict class probability $Pr(Class_{i}|Object)$.

At test time, the **class-specific** confidence scores are then computed as:

$Pr(Class_{i}|Object)*Pr(Object)* IOU^{truth}_{pred}=Pr(Class_{i})* IOU^{truth}_{pred}$

IOU: **Intersection Over Union**

So the predictions are encoded in a tensor of size $S*S*(B*5+C)$, where $C$ is the number of classes.

### YOLO style with anchor box

#### Get resized bb

In [14]:
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F

#### Prepare bb data

In [15]:
def df_data(imgdf,ann_df,shuffle=True,random_state=None):
    """
    Join every annotation with the metadata of the image it belongs to.

    imgdf:
    A dataframe about images, fields: "id","file_name","height","width"
    ann_df:
    A dataframe about annotation, fields: "image_id","category_id","bbox",
    The field "bbox" is a list of 4 values: x, y, width, height of the bounding box
    (the same order the plotting cell and re_calibrate rely on)
    shuffle:
    When True (default) the rows are returned in random order with a fresh 0..n-1 index
    random_state:
    Optional seed forwarded to DataFrame.sample so the shuffle can be reproduced;
    None (default) keeps the previous unseeded behaviour

    Returns a dataframe with one row per annotation whose image_id exists in imgdf
    (inner join), carrying the columns of both inputs plus "cate_id_oh", the value
    of the module-level mapping idx2id for the row's category_id.
    Raises KeyError if a category_id is missing from idx2id.
    """
    data_df=pd.merge(ann_df[["bbox","category_id","image_id"]],
                     imgdf[["id","file_name","height","width"]],
                     left_on="image_id",right_on="id")

    # idx2id is not defined in this notebook's visible cells; presumably it maps the
    # dataset's category ids to contiguous class indices — confirm against its source.
    data_df["cate_id_oh"] = data_df["category_id"].apply(lambda x:idx2id[x])
    if shuffle:
        data_df = data_df.sample(frac=1,random_state=random_state).reset_index(drop=True)
    print("total data rows",len(data_df))
    return data_df

data_df = df_data(imgdf,ann_df)
data_df.head()

total data rows 860001


Unnamed: 0,bbox,category_id,image_id,id,file_name,height,width,cate_id_oh
0,"[260.4, 69.1, 88.66, 20.23]",9,504321,504321,000000504321.jpg,375,500,8
1,"[164.7, 39.54, 262.3, 560.54]",1,148551,148551,000000148551.jpg,640,427,0
2,"[340.76, 4.43, 137.45, 113.11]",3,113326,113326,000000113326.jpg,640,480,2
3,"[152.3, 332.33, 80.71, 45.97]",8,216319,216319,000000216319.jpg,480,640,7
4,"[98.19, 138.09, 30.55, 102.78]",1,71986,71986,000000071986.jpg,375,500,0


##### Resize the bounding box

In [16]:
# Stack the per-row "bbox" lists into one 2-D array (one row per annotation).
bbox_array = np.array(data_df.bbox.tolist())
# Image size per annotation, in the column order (width, height) that re_calibrate expects.
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same ndarray on old and new pandas alike.
wh_array = data_df[["width","height"]].values

def re_calibrate(bbox_array,wh_array):
    """Rescale bounding boxes from original-image units to the resized image and the grid.

    bbox_array: array whose last axis is (x, y, w, h), with (x, y) the top-left corner.
    wh_array:   array whose last axis is (image width, image height) for the same rows.

    Returns a tuple of three arrays:
      bb_resized: (x, y, w, h) divided by the image size and multiplied by SIZE
                  (corner still top-left);
      true_bb:    bb_resized / SCALE, with x, y moved to the box centre;
      grid_bbxy:  floor of the centre x, y, i.e. the grid cell holding the centre.

    SIZE and SCALE are module-level constants not defined in the visible cells —
    presumably the resized image side and the pixels-per-grid-cell stride; confirm.
    """
    # x and w are normalised by the image width, y and h by its height.
    per_coord_size = np.concatenate((wh_array, wh_array), axis=-1)
    bb_resized = (bbox_array / per_coord_size) * SIZE

    # The division allocates a new array, so the in-place shift below leaves bb_resized untouched.
    true_bb = bb_resized / SCALE
    # switch xy from the top-left corner to the centre point
    true_bb[..., :2] += true_bb[..., 2:] / 2

    # Labels' anchor positions on the grid map
    grid_bbxy = np.floor(true_bb[..., :2])
    return bb_resized, true_bb, grid_bbxy

def find_best_anchors(true_bbwh):
    """Pick, for every box, the anchor whose shape overlaps it best.

    true_bbwh: array of shape (N, 2) holding (width, height) per box, in the same
               units as the anchors in the module-level ANC_ARR.

    The IoU is computed from widths and heights only, i.e. as if box and anchor
    shared the same corner, for the first BOX anchors of ANC_ARR.

    Returns an integer array of length N with the index of the highest-IoU anchor
    (the first one on ties, as with argmax).
    """
    # Independent of the anchor, so computed once instead of on every loop pass.
    true_area = true_bbwh.prod(axis=-1)

    iou_score = []
    for b in range(BOX):
        # Broadcasting the single (w, h) anchor against all boxes replaces the
        # explicit N-row np.tile copy and yields the same values.
        anchor_wh = np.asarray(ANC_ARR[b])
        anc_area = anchor_wh.prod(axis=-1)

        inter_area = np.minimum(anchor_wh, true_bbwh).prod(axis=-1)

        union_area = true_area + anc_area - inter_area
        iou_score.append(inter_area/union_area)

    # Rows become boxes and columns anchors; argmax over the anchors.
    best_anchor_idx = np.array(iou_score).T.argmax(axis=-1)
    return best_anchor_idx

bb_resized, true_bb, grid_bbxy = re_calibrate(bbox_array, wh_array)
true_bbxy = true_bb[..., :2]
true_bbwh = true_bb[..., 2:]
best_anchor_idx = find_best_anchors(true_bbwh)

# Boxes whose width or height (in grid units) is not above this threshold are dropped below.
min_lbl = SCALE * 0.001

# Attach the grid-unit box (centre x, centre y, w, h) and its grid cell to every annotation row.
for col_idx, col_name in enumerate(["true_bb_x", "true_bb_y", "true_bb_w", "true_bb_h"]):
    data_df[col_name] = true_bb[:, col_idx]
for col_idx, col_name in enumerate(["true_grid_x", "true_grid_y"]):
    data_df[col_name] = grid_bbxy[:, col_idx]

# Disabled: would turn the centre into an offset inside its grid cell.
# data_df["true_bb_x"]=data_df["true_bb_x"]-data_df["true_grid_x"]
# data_df["true_bb_y"]=data_df["true_bb_y"]-data_df["true_grid_y"]

data_df["best_anchor"] = best_anchor_idx

# Keep only rows whose width AND height both exceed the threshold.
big_enough = (data_df["true_bb_w"] > min_lbl) & (data_df["true_bb_h"] > min_lbl)
data_df_ = data_df[big_enough]

##### Index to onehot

#### Reverse adjustment function to get the training labels

* t to b

$\large b_{x}=\sigma(t_{x})+c_{x}$

$\large b_{y}=\sigma(t_{y})+c_{y}$

$\large b_{w}=p_{w}e^{t_{w}}$

$\large b_{h}=p_{h}e^{t_{h}}$

* b to t

$\large t_{x}=-ln(\frac{1}{b_{x}-c_{x}}-1)$

$\large t_{y}=-ln(\frac{1}{b_{y}-c_{y}}-1)$

$\large t_{w}=ln(\frac{b_{w}}{p_{w}})$

$\large t_{h}=ln(\frac{b_{h}}{p_{h}})$

#### Feature Extractor

In [17]:
from conv_model import dn121_conv

dn121=dn121_conv(DN121)

In [18]:
class dn_yolo(nn.Module):
    """YOLO-style detection head stacked on a convolutional feature extractor.

    feat_extra: module applied first to the input batch; its output is fed to
                layers built for ``feat_in`` channels.
    feat_in:    channel count of that feature map.

    The head is BatchNorm -> 3x3 conv -> BatchNorm -> 3x3 conv -> BatchNorm -> 1x1 conv,
    the last conv producing VEC_LEN*BOX channels. ``forward`` returns a tensor of shape
    (batch, FEAT_W, FEAT_H, BOX, VEC_LEN); VEC_LEN, BOX, FEAT_W and FEAT_H are
    module-level constants not defined in the visible cells.

    NOTE(review): there is no activation between the layers of this head, so apart
    from the normalisation it is a linear stack — confirm that this is intended.
    """
    def __init__(self,feat_extra,feat_in):
        super().__init__()
        self.feat_in = feat_in
        self.feat_extra = feat_extra

        # Shared settings of the two shape-preserving 3x3 convolutions.
        same_size_3x3 = dict(kernel_size=(3,3), stride=(1,1), padding=1, bias=False)
        self.conv_1 = nn.Conv2d(feat_in, feat_in, **same_size_3x3)
        self.conv_2 = nn.Conv2d(feat_in, feat_in, **same_size_3x3)
        # 1x1 projection to one VEC_LEN-long prediction vector per anchor box.
        self.conv_3 = nn.Conv2d(feat_in, VEC_LEN*BOX, kernel_size=(1,1), stride=(1,1), padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(feat_in)
        self.bn2 = nn.BatchNorm2d(feat_in)
        self.bn3 = nn.BatchNorm2d(feat_in)

    def forward(self,x):
        """Return the raw predictions for image batch ``x`` on the FEAT_W x FEAT_H grid."""
        feats = self.feat_extra(x)

        # Each stage normalises its input and then convolves it.
        stages = ((self.bn1, self.conv_1), (self.bn2, self.conv_2), (self.bn3, self.conv_3))
        for norm, conv in stages:
            feats = conv(norm(feats))

        # from: bs, channel, height, width
        # to:   bs, width, height, channel — the channel axis is then split into (BOX, VEC_LEN)
        channels_last = feats.permute(0, 3, 2, 1).contiguous()
        return channels_last.view(-1, FEAT_W, FEAT_H, BOX, VEC_LEN)

### Data Generator

In [19]:
from torch.utils.data import DataLoader,dataset
from torchvision import transforms
from PIL import Image

In [20]:
# Model input pipeline: resize to (HEIGHT, WIDTH), convert to a tensor, then normalise
# every channel with mean 0.5 and std 0.5.
transform = transforms.Compose([transforms.Resize((HEIGHT,WIDTH)),
                                transforms.ToTensor(),
                                transforms.Normalize([.5,.5,.5],[.5,.5,.5])
                               ])
# Same resize and tensor conversion but without normalisation; the un-normalised image is
# what the training step converts back to PIL for plotting.
trans_origin = transforms.Compose([transforms.Resize((HEIGHT,WIDTH)),
                                transforms.ToTensor(),
                               ])
# Tensor -> RGB PIL image, used on the un-normalised tensor when drawing predictions.
back2PIL = transforms.Compose([transforms.ToPILImage(mode="RGB")])

### Training

In [21]:
from torch.utils.data import DataLoader
from tqdm import trange
from datetime import datetime
import os
from p3self.matchbox import Trainer

In [22]:
from data import Data_Multi

In [23]:
train_set = Data_Multi(data_df=data_df_,
                       transform=transform,
                       trans_origin=trans_origin)

In [24]:
trainer=Trainer(train_set,batch_size=16,print_on=5)
model = dn_yolo(dn121,1024)
from loss_ import yolo3_loss_on_t as yolo3_loss

In [25]:
loss_func = yolo3_loss(lbd_coord=1,
                       lbd_obj=5,
                       lbd_noobj=1,
                       lbd_cls=1,
                       testing=False,train_all=True)

In [26]:
# Use the GPU when one is available; the training step reads `CUDA` as well to decide
# whether each batch has to be moved to the device.
CUDA = torch.cuda.is_available()
if CUDA:
    torch.cuda.empty_cache()
    # The model is moved here, before the optimizer is built from model.parameters() in the next cell.
    model.cuda()
    loss_func.cuda()

In [27]:
from torch.optim import Adam
optimizer = Adam(model.parameters())

In [None]:
def action(*args,**kwargs):
    """
    Run one optimisation step on a single batch and report the individual loss terms.

    args[0]: the batch, a sequence
        (x, original, t_box, conf_, cls_, mask, cls_mask, b_box)
        x is the model input and original[0] is turned back into a PIL image for
        plotting; the remaining tensors are passed unchanged to loss_func as targets.
    kwargs["ite"]: iteration counter; every 30th iteration the prediction for the
        first sample of the batch is drawn with plot_bb.

    Relies on the module-level model, optimizer, loss_func, CUDA and back2PIL.
    NOTE(review): the calling convention (batch as first positional argument, counter
    as keyword "ite") is inferred from how the arguments are unpacked here — confirm
    against p3self.matchbox.Trainer.

    Returns a dict of Python floats: the total "loss" and its components
    "loss_x", "loss_y", "loss_w", "loss_h", "loss_obj", "loss_noobj", "loss_cls".
    """
    x,original, t_box, conf_, cls_, mask, cls_mask, b_box = args[0]
    iteration=kwargs["ite"]
    if CUDA:
        x,t_box, conf_, cls_, mask, cls_mask, b_box = x.cuda(),t_box.cuda(), conf_.cuda(), cls_.cuda(), mask.cuda(), cls_mask.cuda(), b_box.cuda()
    optimizer.zero_grad()

    y_ = model(x)
    # The latest input and raw prediction are kept on the model object
    # (presumably for inspection from other cells).
    model.x=x
    model.y_=y_

    loss,loss_x,loss_y,loss_w,loss_h,loss_obj,loss_noobj,loss_cls = loss_func(y_,t_box, conf_, cls_, mask, cls_mask, b_box)
    loss.backward()

    optimizer.step()

    if iteration%30==0:
        # Visualisation only: decode the first sample without recording an autograd graph.
        with torch.no_grad():
            y_pred = loss_func.t2b(y_)[0:1,...]
        if CUDA:
            y_pred = y_pred.cpu()
        img = back2PIL(original[0])
        plot_bb(img,data_to_df_bmark(y_pred))
    return {"loss":loss.item(),
            "loss_x":loss_x.item(),
            "loss_y":loss_y.item(),
            "loss_w":loss_w.item(),
            "loss_h":loss_h.item(),
            "loss_obj":loss_obj.item(),
            "loss_noobj":loss_noobj.item(),
            "loss_cls":loss_cls.item(),}

# Install the step function on the trainer created above.
trainer.action=action

In [None]:
trainer.train(2)

  0%|          | 3/7330 [00:02<1:25:39,  1.43it/s]

In [29]:
# Restore previously saved weights. map_location="cpu" lets a checkpoint written on a GPU
# machine be opened on a CPU-only one; load_state_dict then copies the values into the
# model's existing parameters, so the model stays on whatever device it already is on.
# NOTE(review): this cell sits before the cell that writes the same file, so the checkpoint
# must already exist on disk for a fresh top-to-bottom run.
# torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load("yolo_v3.0.0.3.pkl", map_location="cpu"))

In [31]:
torch.save(model.state_dict(),"yolo_v3.0.0.3.pkl")

## Debugging

In [32]:
torch.rand(10,10).sum().item()

54.80018615722656

In [50]:
# NOTE(review): in the visible cells `y_pred` is only assigned inside `action` (as a local,
# sliced to a single sample), so this cell depends on a variable left in the kernel by a cell
# that is not shown; it will raise NameError after Restart & Run All.
y_pred.size()

torch.Size([16, 10, 10, 5, 85])