This notebook contains a very basic implementation of object localization (finding an object and putting a bounding box around it) with deep learning, using PyTorch.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import modules 

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as ttf
import PIL.Image as imgs
from PIL import ImageDraw

In [None]:
class RacoonsDataSet(Dataset):
    """Images with one bounding box each, both rescaled to a ``size`` x ``size`` square.

    The label CSV is read with ``pd.read_csv`` and must provide the columns
    ``filename``, ``xmin``, ``ymin``, ``xmax`` and ``ymax``; the box columns are
    interpreted in the pixel coordinates of the original (un-resized) image.

    Args:
        p: Path to the label CSV file.
        image_dir: Directory containing the files named in the ``filename`` column.
        size: Side length, in pixels, that images (and box coordinates) are scaled to.
    """

    def __init__(self,p,image_dir='../input/racoon-detection/Racoon Images/images',size=64):
        super().__init__()
        self.df=pd.read_csv(p)
        self.image_dir=image_dir
        self.size=size
        self.transform=ttf.Compose([
            ttf.Resize([size,size]),
            ttf.ToTensor(),
            ttf.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.df)

    def _open_image(self,i):
        """Open the image for row ``i`` (positional) and convert it to RGB."""
        # .iloc raises IndexError past the end, which is what Python's sequence
        # iteration protocol expects from __getitem__.
        name=self.df['filename'].iloc[i]
        return imgs.open('{}/{}'.format(self.image_dir,name)).convert('RGB')

    def _scaled_box(self,i,width,height):
        """Return ``[[xmin, ymin], [xmax, ymax]]`` for row ``i`` scaled from a
        ``width`` x ``height`` image to the ``size`` x ``size`` square."""
        sx=self.size/width
        sy=self.size/height
        xmin,ymin,xmax,ymax=(self.df[c].iloc[i] for c in ('xmin','ymin','xmax','ymax'))
        return np.array([[sx*xmin,sy*ymin],[sx*xmax,sy*ymax]])

    def __getitem__(self,i):
        """Return ``[image_tensor, box]`` for row ``i``.

        ``box`` is a float tensor ``[xmin, ymin, xmax, ymax]`` in resized-image pixels.

        Raises:
            IndexError: If ``i`` is outside the label table.
            RuntimeError: If the transform pipeline fails on the image.
        """
        image=self._open_image(i)
        width,height=image.size
        try:
            tensor=self.transform(image)
        except Exception as e:
            # Returning None here would only fail later, inside batch collation,
            # with an error that no longer names the offending sample.
            raise RuntimeError(
                'could not transform sample {} ({} x {})'.format(i,width,height)
            ) from e
        box=torch.from_numpy(self._scaled_box(i,width,height).ravel()).float()
        return [tensor,box]

    def drawBox(self,img,box):
        """Draw ``box`` (``x0, y0, x1, y1``) as a red outline on ``img`` in place and return ``img``."""
        x0,y0,x1,y1=[int(v) for v in box]
        draw=ImageDraw.Draw(img)
        # ImageDraw.rectangle expects x1 >= x0 and y1 >= y0; a predicted box may
        # come out inverted, so order the corners before drawing.
        draw.rectangle(
            [min(x0,x1),min(y0,y1),max(x0,x1),max(y0,y1)],
            outline=(255, 0, 0),width=1,
        )
        return img

    def getImage(self,i):
        """Return ``(resized PIL image, 2x2 array [[xmin, ymin], [xmax, ymax]])`` for row ``i``."""
        image=self._open_image(i)
        width,height=image.size
        return image.resize((self.size,self.size)),self._scaled_box(i,width,height)


Create the Custom Dataset

The dataset returns each racoon image together with its bounding box, rescaled to match the resized image.
The class also has a separate method that returns the resized PIL image directly.

Let's test the outputs.

In [None]:
# Build the dataset from the training label CSV, then index sample 3 so the cell
# displays what __getitem__ returns for it.
p=RacoonsDataSet('../input/racoon-detection/train_labels_.csv')
p[3]

In [None]:
# Visual check: fetch the resized PIL image and scaled box for sample 23 and outline the box on it.
k=p.getImage(23)
k[1].ravel()  # result is discarded; only the last expression of a cell is displayed
p.drawBox(k[0],k[1].ravel())
    
    

Bounding box scaling seems fine.

In [None]:
class Network(nn.Module):
    """Truncated pretrained ResNet-18 followed by two linear heads.

    ``forward`` returns ``(class_preds, bound_box)``: a 2-value output from
    ``fc_classifier`` and a 4-value output from ``boundingBox``, both computed
    from the same flattened feature map.

    NOTE(review): both heads end in ReLU, so their outputs are clamped at zero and
    receive no gradient while the pre-activation is negative. Kept as in the
    original so existing checkpoints and results stay comparable.
    """

    # Shape of the dummy batch used to probe layer output sizes.
    _PROBE_SHAPE=(1,3,64,64)
    # Layers are kept while the probed feature map height is at least this value.
    _MIN_SPATIAL=800//80
    # Flattened feature size fed to both heads. Presumably channels*height*width of
    # the truncated backbone for a 64x64 input; the probe prints the actual size
    # so this can be checked.
    _FEATURE_DIM=64*16*16

    def __init__(self):
        super(Network,self).__init__()
        model = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=True)
        self.fc1=nn.Sequential(*self.get_req_features(model))
        self.fc_classifier=nn.Sequential(nn.Linear(Network._FEATURE_DIM,2),nn.ReLU())
        self.boundingBox=nn.Sequential(nn.Linear(Network._FEATURE_DIM,4),nn.ReLU())

    def forward(self,X):
        """Run the backbone and both heads on a batch of images.

        Returns:
            Tuple ``(class_preds, bound_box)`` with one row per input sample.
        """
        X=self.fc1(X)
        # Flatten per sample. Unlike reshape(-1, N), a feature map of unexpected
        # size fails in the linear layers instead of being silently regrouped
        # across the batch dimension.
        X=torch.flatten(X,1)
        class_preds=self.fc_classifier(X)
        bound_box=self.boundingBox(X)

        return class_preds,bound_box

    def get_req_features(self,model):
        """Return the leading child modules of ``model`` to use as feature extractor.

        A zero batch is pushed through the children in order; a child is kept as
        long as the height of its output stays at or above the minimum spatial
        size, and the scan stops at the first child that drops below it.

        The probe runs in eval mode with autograd disabled so that it cannot
        modify BatchNorm running statistics of the pretrained layers; the
        model's previous training flag is restored afterwards.
        """
        req_features=[]
        was_training=model.training
        model.eval()
        try:
            with torch.no_grad():
                k=torch.zeros(Network._PROBE_SHAPE)
                for layer in model.children():
                    k=layer(k)
                    if k.size()[2] < Network._MIN_SPATIAL:
                        break
                    req_features.append(layer)
        finally:
            model.train(was_training)
        print("++++++++++++++++++++Processing To Extract Features++++++++++++++++++++++++")
        print(len(req_features))
        print(k.size())
        return req_features

Let's create the network. We use a pretrained network (ResNet) to extract basic image features.

In [None]:
model2=Network()

In [None]:
model2

In [None]:
p[2][0].shape

The loss for the bounding box is MSE loss, as this is a regression problem.

In [None]:
loss_boundingBox=nn.MSELoss()

In [None]:
# Adam over every parameter of model2 (backbone and both heads), with a small weight decay.
optimizer = torch.optim.Adam(model2.parameters(), lr=1e-4, weight_decay=1e-5)

In [None]:
optimizer.zero_grad()

In [None]:
# c,bb=model2(p[2][0].unsqueeze(0))
# l2=loss_boundingBox(bb,p[2][1])
# print(l2)
# l2.backward()
# optimizer.step()

In [None]:
# Shuffled mini-batches of up to 64 samples drawn from the dataset `p`.
dataloader=DataLoader(p,batch_size=64,shuffle=True)

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU, and move the model there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model2.to(device)

In [None]:
model2

In [None]:
!pip install tensorboard


In [None]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [None]:
from matplotlib.pyplot import imshow,show

We will train in batches of 64.
We also use a custom loss function that additionally tries to minimize the size of the bounding box; the strength of this term is controlled by the tolerance factor.


In [None]:
batch_size=64  # mini-batch size; same value the DataLoader was built with
tolerance=0.01  # weight of the bounding-box size penalty in the training loss

`getSample` can be used to see the results.

In [None]:
def getSample(i):
    """Predict the bounding box for dataset sample ``i``, draw it, and display the image.

    Uses the notebook-level ``p`` (dataset), ``model2`` and ``device``.

    Args:
        i: Index of the sample in ``p``.

    Returns:
        The resized PIL image with the predicted box drawn on it.
    """
    img,_=p.getImage(i)
    # Inference only: eval mode keeps BatchNorm from using (and updating its
    # running statistics with) a single-sample batch, and no_grad skips autograd.
    was_training=model2.training
    model2.eval()
    try:
        with torch.no_grad():
            _,pred_box=model2(p[i][0].unsqueeze(0).to(device))
    finally:
        model2.train(was_training)
    p.drawBox(img,pred_box.cpu().numpy().ravel())
    # pyplot.show() takes no image argument; render with imshow, then show.
    imshow(img)
    show()
    return img

Let the training begin

In [None]:
from tqdm import tqdm

# Train for 1000 epochs. The loss is the MSE between predicted and target boxes
# plus `tolerance` times a penalty on the predicted box size (width^2 + height^2,
# averaged over the batch). Every 100 epochs the last batch loss is printed and
# the prediction for sample 1 is drawn.
global_step=0
model2.train()
for j in tqdm(range(1000)):
    for D,B in dataloader:
        optimizer.zero_grad()
        D,B=D.to(device),B.to(device)

        _cls,bb=model2(D)
        # bb is (batch, 4) = [xmin, ymin, xmax, ymax] per row, so coordinates are
        # selected by column; indexing bb[k] would pick whole samples instead.
        box_w=bb[:,2]-bb[:,0]
        box_h=bb[:,3]-bb[:,1]
        # Normalise by the actual batch size: the last batch may be smaller.
        size_penalty=torch.sum(box_w**2+box_h**2)/(2*bb.size(0))
        l2=loss_boundingBox(bb,B)+tolerance*size_penalty
        l2.backward()
        optimizer.step()
        # Pass an increasing step so TensorBoard plots a curve over iterations.
        writer.add_scalar('Loss/train',l2.item(),global_step)
        global_step+=1

    if j%100==0:
        print(l2.item())
        # Preview in eval mode so a single-sample forward pass does not touch
        # the BatchNorm running statistics, then switch back to training.
        model2.eval()
        with torch.no_grad():
            k=model2(p[1][0].unsqueeze(0).to(device))
            k=k[1].cpu().numpy()
        model2.train()
        img=p.drawBox(p.getImage(1)[0],k.ravel())
        imshow(img)
        show()

In [None]:
p[0][1]

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir=runs

In [None]:
a=ttf.ToPILImage()

Let's test with 10 images (dataset indices 3 to 12)


In [None]:
# Run getSample on dataset indices 3 through 12 and pass each returned image to imshow.
for i in range(10):

    imshow(getSample(i+3))

Seems fine. The next target is to build a racoon detection and localization project.

In [None]:
# Saves the whole module object (pickled), not just its state_dict; loading it later
# needs the Network class to be defined/importable under the same name.
torch.save(model2,'Model.pt')