# Implementation of MobileNetv2+SSD<br>
This is an implementation of the MobileNetv2 + SSD architecture for the relatively simple task of determining bounding boxes for MNIST images embedded in a box. Each box currently contains only one digit (a 28x28 MNIST image embedded in a 224x224 box), but the number of predictions per image can be expanded easily (the training outputs need to be modified). Also, no data augmentation has been used so far (Colab kept crashing when I increased the dataset size beyond 1000, so the initial amount of data present was sufficient. The crashes might have been due to high traffic, but I haven't confirmed it).<p>
In the earlier implementation, the ground truth data contained information about only one bounding box, which meant only one prediction per image (reference in README). For me, this also reduced the training signal, and the model was overfitting. So I changed the outputs to a prediction for each default box (as it should be, from what I understood from the SSD paper). The initial implementation is still good for the purpose of understanding the model, though.

Comments throughout the code note what needs to change if the model inputs or outputs are changed.

Import libraries

In [None]:
import os
import numpy as np
import numpy.matlib
from PIL import Image
import cv2 
import matplotlib.pyplot as plt

In [None]:
# Notebook run-mode switches:
#   TRAIN -- run the training cell.
#   SAVE  -- write the model weights to 'MNIST.pth'.
#   LOAD  -- restore weights from a previously saved checkpoint.
# For inference only, use TRAIN = False, SAVE = False, LOAD = True.
TRAIN, SAVE, LOAD = True, True, False

# Using other model

In [None]:
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms as T
import torchvision.transforms.functional as F
from torch.utils.data import ConcatDataset,DataLoader
from torchvision.models import densenet201

# Side length, in pixels, of the square images produced by the padding
# helpers below (default ``output_size`` of ScaledResizePad and get_padding).
SIZE = 30

In [None]:
# DenseNet-201 with pretrained weights; its classification head is replaced
# by a fresh linear layer with 10 outputs (one per digit class).
model = densenet201(pretrained=True)
classifier_in = model.classifier.in_features
model.classifier = nn.Linear(in_features=classifier_in, out_features=10)

In [None]:
class ScaledResizePad(object):
    """Scale an image so its longer side equals ``scale_size``, then pad it.

    The image is resized with its aspect ratio preserved and afterwards
    padded (via ``get_padding``) up to ``output_size`` x ``output_size``.

    Args:
        output_size (int): Side length of the padded square output.
        scale_size (int): Length of the longer image side after resizing.
        fill: Fill value forwarded to ``F.pad``.
        padding_mode (str): Padding mode forwarded to ``F.pad``.
    """

    def __init__(self, output_size=SIZE, scale_size=18, fill=0, padding_mode='constant'):
        self.fsize = output_size
        self.dsize = scale_size

        # Honour the caller's fill value (it used to be hard-coded to 0).
        self.fill = fill
        self.padding_mode = padding_mode

    def __call__(self, img):
        """
        Args:
            img (PIL Image, Tensor or ndarray): Image to be scaled and padded.

        Returns:
            The scaled image padded to ``output_size`` x ``output_size``.
        """
        # Work out (height, width) for each supported container type:
        # tensors are laid out as (..., H, W), arrays as (H, W[, C]) and
        # PIL reports ``size`` as (W, H).
        if isinstance(img, torch.Tensor):
            h, w = img.shape[-2:]
        elif isinstance(img, np.ndarray):
            h, w = img.shape[:2]
        else:
            w, h = img.size

        scale_factor = self.dsize / max(w, h)
        # F.resize expects (height, width); keep each side at least 1 px so
        # very elongated inputs do not collapse to an empty image.
        size = (max(1, int(h * scale_factor)), max(1, int(w * scale_factor)))

        scaled_img = F.resize(img, size)

        return F.pad(scaled_img, get_padding(scaled_img, self.fsize), self.fill, self.padding_mode)

    def __repr__(self):
        return '{0}(output_size={1}, scale_size={2}, fill={3}, padding_mode={4})'.format(
            self.__class__.__name__, self.fsize, self.dsize, self.fill, self.padding_mode)

In [None]:
def get_padding(image, output_size=SIZE):
    """Compute the padding that centres ``image`` in a square canvas.

    Args:
        image (PIL Image, Tensor or ndarray): Image to be padded. Tensors are
            read as (..., H, W), arrays as (H, W[, C]) and PIL images via
            ``size`` (W, H).
        output_size (int): Side length of the target square.

    Returns:
        tuple: ``(left, top, right, bottom)`` integer padding. When the
        difference along an axis is odd, the extra pixel goes to the
        left/top side. Values are negative when the image is larger than
        ``output_size``.
    """
    if isinstance(image, torch.Tensor):
        h, w = image.shape[-2:]
    elif isinstance(image, np.ndarray):
        h, w = image.shape[:2]
    else:
        w, h = image.size

    h_total = output_size - w
    v_total = output_size - h

    # Floor half on the right/bottom, remainder on the left/top.
    r_pad = h_total // 2
    b_pad = v_total // 2
    l_pad = h_total - r_pad
    t_pad = v_total - b_pad

    return (int(l_pad), int(t_pad), int(r_pad), int(b_pad))

class NewPad(object):
    """Pad an image to the default square size computed by ``get_padding``.

    Args:
        fill: Fill value forwarded to ``F.pad``.
        padding_mode (str): One of 'constant', 'edge', 'reflect' or
            'symmetric'; forwarded to ``F.pad``.
    """

    def __init__(self, fill=0, padding_mode='constant'):
        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']

        self.fill = fill
        self.padding_mode = padding_mode

    def __call__(self, img):
        """
        Args:
            img (PIL Image): Image to be padded.

        Returns:
            PIL Image: Padded image.
        """
        return F.pad(img, get_padding(img), self.fill, self.padding_mode)

    def __repr__(self):
        # The previous format string had three placeholders for two values
        # and raised IndexError whenever the transform was printed.
        return '{0}(fill={1}, padding_mode={2})'.format(
            self.__class__.__name__, self.fill, self.padding_mode)


class RandomResize(object):
    """Resize an image to a random size with a slightly random aspect ratio.

    Args:
        min_size (int): Lower bound (inclusive) for the first resize dimension.
        max_size (int): Upper bound (exclusive) for the first resize dimension.
    """

    def __init__(self, min_size, max_size):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img):
        """
        Args:
            img (PIL Image): Image to be resized.

        Returns:
            PIL Image: Resized image.
        """
        r1 = torch.rand(1)[0].item()
        # Aspect factor in (0.9, 1.1].
        r2 = 1 - torch.rand(1)[0].item() * 0.2 + 0.1
        # The tuple handed to F.resize is read as (height, width): the first
        # value is drawn from [min_size, max_size), the second is that value
        # scaled by the aspect factor.
        first = int(self.min_size + (self.max_size - self.min_size) * r1)
        second = int(first * r2)
        return F.resize(img, (first, second))

    def __repr__(self):
        # The previous implementation referenced attributes this class never
        # defines (fill, padding_mode) and raised AttributeError.
        return '{0}(min_size={1}, max_size={2})'.format(
            self.__class__.__name__, self.min_size, self.max_size)

    
# Training-time augmentation: rotate by up to 35 degrees, resize to a random
# size, pad to the default square size and convert to a tensor.
t = T.Compose(
    [T.RandomRotation(35), RandomResize(15, 30), NewPad(), T.ToTensor()]
)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
if TRAIN:
    # Fine-tune the classifier on the MNIST train and test splits combined,
    # both passed through the augmentation pipeline ``t``.
    batch = 500
    training_data = datasets.MNIST(root='/kaggle/input/mobilenet', train=True, download=True, transform=t)
    test_data = datasets.MNIST(root='/kaggle/input/mobilenet', train=False, download=True, transform=t)
    training_data = ConcatDataset([training_data, test_data])
    # Reshuffle every epoch so the batches differ between epochs and the two
    # concatenated splits are mixed.
    train_loader = DataLoader(training_data, batch_size=batch, shuffle=True)

    criterion = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(model.parameters(), lr=1e-4)

    model.to(device)
    for epoch in range(5):
        epoch_loss = 0
        model.train()
        for index, sample in enumerate(train_loader):
            if index % 500 == 0:
                print(f"Epoch {epoch}, index {index}")
            image, label = sample
            # MNIST is single-channel; replicate it to the three channels
            # the network takes as input.
            image = image.repeat(1, 3, 1, 1)
            image = image.to(device)
            label = label.to(device)
            optimiser.zero_grad()

            output = model(image)
            loss = criterion(output, label)
            loss.backward()
            optimiser.step()

            # Sum of the per-batch losses over the epoch.
            epoch_loss += loss.item()
        print('Epoch:{} Training Loss:{}'.format(epoch+1, epoch_loss))
    # Switch to inference mode for the testing cells.
    model.eval()


In [None]:
# Persist the current weights and/or restore a saved checkpoint, depending
# on the run-mode flags.
if SAVE:
    torch.save(
        model.state_dict(),
        'MNIST.pth',
    )
if LOAD:
    weights = torch.load(
        '../input/densenet/MNIST.pth',
        map_location=device,
    )
    model.load_state_dict(weights)
    # Inference mode for the testing cells.
    model.eval()
    print('Weights Loaded Successfully')

# Testing

In [None]:
# Path of the image that the testing cells below read with cv2.imread.
img_path = "/kaggle/input/temp-dataset-pos/machine3_randomised.png"

In [None]:
# cv2.imread returns None (instead of raising) when the file cannot be read,
# so fail early with a clear message.
full_img = cv2.imread(img_path)
if full_img is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

print(full_img.shape)
# Alternative crops used for other test images:
#   full_img[900:1600, 700:1340, :]  # 1
#   full_img[600:, :400, :]          # 2&3
img = full_img[350:700, :, :]  # 2&3
# OpenCV stores channels as BGR while matplotlib expects RGB; convert for
# display only so the colours are not swapped in the preview.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.show()

In [None]:
# Independent copy of the region of interest (rows 300-699 of the full image).
# Optional downscale, currently disabled: img = cv2.resize(img, (224, 224))
roi = full_img[300:700, :, :]
img = roi.copy()
# Black canvas with the same shape and dtype as the crop; contours and
# predictions are drawn onto it.
result = np.zeros_like(img)

def sort_contours(cnts):
    """Sort contours left to right by the x coordinate of their bounding box.

    Args:
        cnts: Sequence of contours as returned by ``cv2.findContours``.

    Returns:
        tuple: ``(contours, bounding_boxes)`` as two parallel tuples, where
        each bounding box is the ``cv2.boundingRect`` result ``(x, y, w, h)``
        of the contour at the same position. Both tuples are empty when
        ``cnts`` is empty.
    """
    # zip(*...) cannot be unpacked into two names when there is nothing to
    # sort, so handle the empty case explicitly.
    if len(cnts) == 0:
        return (), ()

    # Use the argument, not the module-level ``contours`` variable.
    boxes = [cv2.boundingRect(contour) for contour in cnts]
    ordered = sorted(zip(cnts, boxes), key=lambda pair: pair[1][0])
    cnts, boundingboxes = zip(*ordered)
    return cnts, boundingboxes

# ``img`` comes from cv2.imread and is therefore BGR, so convert with
# COLOR_BGR2GRAY to get the correct channel weighting.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Canny thresholds are derived from the median grey level of the crop.
median = np.median(gray)
edged = cv2.Canny(img, int(0.6 * median), int(1 * median))
# Fixed-threshold alternative: edged = cv2.Canny(gray, 100, 200)
contours, heirarchy = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours, boundingboxes = sort_contours(contours)

# Count the boxes that pass the size/aspect filter and draw their contours
# in white on the black ``result`` canvas.
label = 0
for i, box in enumerate(boundingboxes):
    x, y, w, h = box
    # Keep boxes 5-50 px wide and 20-100 px tall whose aspect ratio is
    # below 5 in either direction. The size checks run first, so the
    # divisions never see a zero.
    if (w >= 5 and w <= 50 and h >= 20 and h <= 100 and h/w < 5 and w/h < 5):
        label += 1
        # Thickness 3 draws the outline; use cv2.FILLED for a filled shape.
        cv2.drawContours(result, [contours[i]], 0, (255, 255, 255), 3)


plt.imshow(result)
plt.show()

In [None]:
def predict(image):
    """Classify a single cropped digit image with the global ``model``.

    Args:
        image (ndarray): Colour crop in OpenCV's BGR channel order, as cut
            from an image loaded with ``cv2.imread``.

    Returns:
        tuple: ``(score, digit, output)`` where ``output`` is the raw model
        output tensor for the crop, ``digit`` the index of its largest
        entry and ``score`` that largest value (an unnormalised output of
        the final linear layer, not a probability).
    """
    img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    transform = T.Compose([
        T.ToPILImage(),
        ScaledResizePad(padding_mode='edge', fill=0),
        T.ToTensor(),
    ])
    # Invert intensities -- presumably so dark-on-light digits match MNIST's
    # light-on-dark convention (TODO confirm).
    img = 1 - transform(img)
    # Replicate the single channel three times and add a batch dimension.
    img = img.repeat(1, 3, 1, 1).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(img)
    score, predicted = torch.max(output, 1)
    return score.item(), predicted.item(), output

# Classify every candidate box and draw the accepted predictions.
digits = []
scores = []
# cv2.rectangle/putText draw on ``result`` in place; start from it so the
# final plot also works when no box is accepted.
predict_img = result
for box in boundingboxes:
    x, y, w, h = box
    # Same size/aspect filter as the contour-drawing cell.
    if (w >= 5 and w <= 50 and h >= 20 and h <= 100 and h/w < 5 and w/h < 5):
        # Crop with a 3 px margin. Clamp the start indices at 0, otherwise a
        # box touching the top/left border yields a negative index that
        # wraps around and produces an empty or wrong crop.
        top = max(y - 3, 0)
        left = max(x - 3, 0)
        number = img[top:y + h + 3, left:x + w + 3]
        score, digit, raw = predict(number)

        # Skip predictions whose top score is below 1.
        if score < 1:
            continue
        if digit == 7:
            print(digit, raw, h/w)
        # A tall, narrow "7" is relabelled as "1" when the score for
        # class 1 is itself above the threshold.
        if digit == 7 and h/w > 2:
            if raw[0][1].item() > 1:
                digit = 1
                score = raw[0][1].item()
        # Bounding box
        predict_img = cv2.rectangle(result, (x, y), (x+w, y+h), (255, 0, 0), 2)
        # Label just above the box
        predict_img = cv2.putText(predict_img, str(digit), (x, y-3), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 1)

        digits.append(digit)
        scores.append(score)
print(digits)
print(scores)

plt.imshow(predict_img)
plt.show()