### Import Libraries

In [16]:
import os
import cv2
import time
import copy
import torch
import json
import numpy as np
import pandas as pd
import torch.nn as nn
from PIL import Image
import torch.optim as optim
# from google.colab import drive
import torch.nn.functional as F
from torchvision.ops import nms
from IPython.display import Image as ig
from tqdm.notebook import tqdm
from torchsummary import summary
from matplotlib import pyplot as plt
from torchvision import datasets, transforms
torch.manual_seed(1)

<torch._C.Generator at 0x223bb165db0>

In [17]:
# drive.mount('/content/drive')
# DRIVE_PATH = './drive/MyDrive/dataset2/'

In [18]:
# Root folder of the dataset; the annotation JSON and the "images/" folder are
# both resolved relative to this path later in the notebook.
DATA_PATH = './dataset/'

In [19]:
# Ask PyTorch's CUDA caching allocator to release the cached memory it is not using.
torch.cuda.empty_cache()



##### Device set to Cuda

In [20]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


#### Read the JSON file which contains the meta data about the dataset.
- Image name
- Normalized bounding box [cx, cy, w, h] and,
- Label

In [21]:
# Path of the annotation file that describes every image in the dataset.
JSON_PATH = DATA_PATH + "finally_finalizedData_sure.json"
# Parse the annotations inside a context manager so the file handle is closed
# as soon as loading finishes (a bare open() would leave it open).
with open(JSON_PATH) as f:
    json_file = json.load(f)
# Number of annotated entries.
len(json_file)

1245

###  Actual Volume Interpretation

In [22]:
def preprocess_grid(x, y, size=1, grid_size=13):
    '''
    Locate the grid cell that contains a bounding-box centre.

    x, y: centre of the bounding box, in the same units as `size`
    size: side length of the image (1 when the coordinates are normalised)
    grid_size: number of cells along each side of the virtual grid
    return: zero-based (row, col) of the matching cell, or None when x or y
            does not fall inside any cell
    '''
    cell = size / grid_size

    def locate(value):
        # 1-based index of the first cell whose closed interval holds `value`;
        # a value sitting exactly on a boundary therefore goes to the lower cell.
        return next(
            (k for k in range(1, grid_size + 1)
             if (k - 1) * cell <= value <= k * cell),
            None,
        )

    col = locate(x)
    row = locate(y)
    if row is None or col is None:
        return None
    return row - 1, col - 1

In [23]:
def ground_truth_volume(labels, bbox):
    '''
    Build the 7 x 13 x 13 target volume for one annotated image.

    labels: str, 'pizza' selects class channel 5; anything else selects channel 6
    bbox: iterable of boxes, each indexable as [x, y, w, h]; x and y are handed
          to preprocess_grid to pick the responsible cell
    return: torch tensor of shape (7, 13, 13) holding, per responsible cell,
            channel 0 = objectness (1), channels 1-4 = the four box values in
            order, channel 5/6 = class flag
    '''
    volume = torch.zeros((7, 13, 13))
    class_channel = 5 if labels == 'pizza' else 6
    for box in bbox:
        r, c = preprocess_grid(box[0], box[1])
        volume[0, r, c] = 1
        for offset in range(4):
            volume[offset + 1, r, c] = box[offset]
        # The flag is the same for every coordinate, so it is written once per box.
        volume[class_channel, r, c] = 1
    return volume

### Prepare Data  

In [24]:
def prepare_data(json_file) -> tuple:
    '''
    Split the annotated images into a training list and a held-out list.

    json_file: sequence of annotation dicts, each with the keys 'file_name',
               'label' and 'yolo_bbox'
    return: (train_data, test_data); both are lists of
            [image path, ground-truth volume] pairs

    An entry goes to the training list while the number of entries already
    taken for its label is <= 490 ('pizza') or <= 500 ('sandwich'), i.e. at
    most 491 pizzas and 501 sandwiches. Every other entry, including any with
    a different label, goes to the held-out list.
    '''
    limits = {'pizza': 490, 'sandwich': 500}
    taken = {'pizza': 0, 'sandwich': 0}
    train_data = []
    test_data = []
    for entry in json_file:
        name = entry['label']
        sample = [
            DATA_PATH + "images/" + entry['file_name'],
            ground_truth_volume(name, entry['yolo_bbox']),
        ]
        if name in limits and taken[name] <= limits[name]:
            train_data.append(sample)
            taken[name] += 1
        else:
            test_data.append(sample)
    return train_data, test_data

In [25]:
# Build the [image path, ground-truth volume] pairs; the second list returned
# by prepare_data is used as the validation set from here on.
train_data, val_data = prepare_data(json_file)


### Data Transform, Datasets and Dataloaders

In [26]:
# Number of samples per mini-batch; passed to both DataLoaders below.
BATCH_SIZE = 4

In [27]:
#custom datasets 
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a list of (image path, target) pairs."""

    def __init__(self, data, transform=None):
        """
        Args:
            data: sequence of (image path, target) pairs.
            transform: optional callable applied to the opened PIL image.
        """
        self.transform = transform
        self.datas = data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.datas)

    def __getitem__(self, idx):
        """
        Args:
            idx (int): position of the sample.

        Returns:
            tuple: (sample, target). The sample is the opened image, passed
            through `transform` when one was supplied; the target is returned
            exactly as stored.
        """
        path, target = self.datas[idx]
        sample = Image.open(path)
        if not self.transform:
            return sample, target
        return self.transform(sample), target

#### Data Transform
- Resize  
- Convert to Tensor  
- Normalization [using standard deviation and mean of Imagenet dataset.]

In [28]:
# Pre-processing applied to every image: resize to 416x416, convert to a
# tensor, then normalise each of the three channels with the given
# mean / standard-deviation values.
data_transform = transforms.Compose([
    transforms.Resize((416, 416)), 
    transforms.ToTensor(), 
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
 ])

#### Dataset

In [29]:
# One Dataset per split; both splits share the same transform.
dataset = {}
dataset['train'] = Dataset(train_data, data_transform)
dataset['val'] = Dataset(val_data, data_transform)

#### Dataset size

In [30]:
# Number of samples in each split; the training loop divides the summed loss
# by these values when printing.
data_size ={x: len(dataset[x]) for x in ['train', 'val']}


#### Dataloaders

In [31]:
# One DataLoader per split. Note that shuffling is enabled for the validation
# split as well as for the training split.
dataloaders = {x: torch.utils.data.DataLoader(dataset[x], batch_size=BATCH_SIZE, shuffle=True) for x in ['train', 'val']}


In [32]:
# next(iter(dataloaders['train']))

### Classifier Model Network

In [33]:
#Classifier Model
class ClassifierModel(nn.Module):
    """Sixteen-convolution image classifier with a two-way softmax head.

    forward() returns softmax probabilities of shape (batch, 2); the two
    classes are presumably pizza and sandwich - confirm against the training
    notebook for this checkpoint.

    NOTE: keep the attribute assignments in __init__ in their current order and
    under their current names. The attribute names are the state_dict keys of
    the saved checkpoint, and the notebook later takes
    ``list(classifier_model.children())[:-5]``, which relies on the last five
    registered children being dropout, fc1, fc2, relu and softmax.
    """

    def __init__(self):
        super(ClassifierModel, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)  
        self.batchNorm1 = nn.BatchNorm2d(num_features=16)
        # Shared 2x2 max-pool and LeakyReLU; forward() reuses these two modules
        # after several different layers.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2) 
        self.leakyRelu = nn.LeakyReLU(0.01)
        
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.batchNorm2 = nn.BatchNorm2d(num_features=32)                                  
        # followed by pool in forward()
        
        # Every 1x1 convolution below uses padding=1, so each one enlarges the
        # feature map by 2 pixels in height and width.
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=16, kernel_size=1, stride=1, padding=1)                             
        self.batchNorm3 = nn.BatchNorm2d(num_features=16)
        # no pool
        
        self.conv4 = nn.Conv2d(in_channels=16, out_channels=128, kernel_size=3, stride=1, padding=1)                               
        self.batchNorm4 = nn.BatchNorm2d(num_features=128)
        # no pool
        
        self.conv5 = nn.Conv2d(in_channels=128, out_channels=16, kernel_size=1, stride=1, padding=1)                              
        self.batchNorm5 = nn.BatchNorm2d(num_features=16)
        # no pool
        
        self.conv6 = nn.Conv2d(in_channels=16, out_channels=128, kernel_size=3, stride=1, padding=1)       
        self.batchNorm6 = nn.BatchNorm2d(num_features=128)                                                 
        # followed by pool in forward()
        
        self.conv7 = nn.Conv2d(in_channels=128, out_channels=32, kernel_size=1, stride=1, padding=1)                             
        self.batchNorm7 = nn.BatchNorm2d(num_features=32)
        # no pool
        
        self.conv8 = nn.Conv2d(in_channels=32, out_channels=256, kernel_size=3, stride=1, padding=1)                             
        self.batchNorm8 = nn.BatchNorm2d(num_features=256)
        # no pool
        
        self.conv9 = nn.Conv2d(in_channels=256, out_channels=32, kernel_size=1, stride=1, padding=1)                               
        self.batchNorm9 = nn.BatchNorm2d(num_features=32)
        # no pool
        
        self.conv10 = nn.Conv2d(in_channels=32, out_channels=256, kernel_size=3, stride=1, padding=1)                               
        self.batchNorm10 = nn.BatchNorm2d(num_features=256)
        # followed by pool in forward()
        
        self.conv11 = nn.Conv2d(in_channels=256, out_channels=64, kernel_size=1, stride=1, padding=1)                               
        self.batchNorm11 = nn.BatchNorm2d(num_features=64)
        # no pool
        
        self.conv12 = nn.Conv2d(in_channels=64, out_channels=512, kernel_size=3, stride=1, padding=1)                               
        self.batchNorm12 = nn.BatchNorm2d(num_features=512)
        # no pool
        
        self.conv13 = nn.Conv2d(in_channels=512, out_channels=64, kernel_size=1, stride=1, padding=1)                               
        self.batchNorm13 = nn.BatchNorm2d(num_features=64)
        # no pool
        
        self.conv14 = nn.Conv2d(in_channels=64, out_channels=512, kernel_size=3, stride=1, padding=1)                               
        self.batchNorm14 = nn.BatchNorm2d(num_features=512)
        # no pool
        
        self.conv15 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1, stride=1, padding=1)                               
        self.batchNorm15 = nn.BatchNorm2d(num_features=128)
        # no pool
        
        # conv16 has no batch norm or activation of its own; forward() feeds it
        # straight into avgPool.
        self.conv16 = nn.Conv2d(in_channels=128, out_channels=1000, kernel_size=1, stride=1, padding=1)                               
        self.avgPool = nn.AvgPool2d(kernel_size=2, stride=1)
        # Classification head. The five modules from here on are the ones
        # removed by the later ``children()[:-5]`` slice.
        self.dropout = nn.Dropout(p=0.3)
        # 36*36*1000 is the flattened size produced by forward() for a 416x416
        # input (the size the notebook's Resize transform produces).
        self.fc1 = nn.Linear(36*36*1000, 64)
        self.fc2 = nn.Linear(64, 2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities of shape (batch, 2) for an image batch x."""
        # Stage 1: two conv -> batch norm -> LeakyReLU -> max-pool blocks.
        x = self.pool(self.leakyRelu(self.batchNorm1(self.conv1(x))))     
        x = self.pool(self.leakyRelu(self.batchNorm2(self.conv2(x))))
        
        # Stage 2: alternating 1x1 / 3x3 convolutions, pooled after conv6.
        x = self.leakyRelu(self.batchNorm3(self.conv3(x)))
        x = self.leakyRelu(self.batchNorm4(self.conv4(x)))
        x = self.leakyRelu(self.batchNorm5(self.conv5(x)))
        x = self.pool(self.leakyRelu(self.batchNorm6(self.conv6(x))))
        
        # Stage 3: same pattern, pooled after conv10.
        x = self.leakyRelu(self.batchNorm7(self.conv7(x)))
        x = self.leakyRelu(self.batchNorm8(self.conv8(x)))
        x = self.leakyRelu(self.batchNorm9(self.conv9(x)))
        x = self.pool(self.leakyRelu(self.batchNorm10(self.conv10(x))))
        
        # Stage 4: five more convolutions without pooling.
        x = self.leakyRelu(self.batchNorm11(self.conv11(x)))
        x = self.leakyRelu(self.batchNorm12(self.conv12(x)))
        x = self.leakyRelu(self.batchNorm13(self.conv13(x)))
        x = self.leakyRelu(self.batchNorm14(self.conv14(x)))
        x = self.leakyRelu(self.batchNorm15(self.conv15(x)))
        x = self.avgPool(self.conv16(x))
        # Flatten every dimension except the batch dimension.
        x = torch.flatten(x,1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.softmax(self.fc2(x))
        return x

# Instantiate the classifier; its weights are loaded from a checkpoint below.
classifier_model = ClassifierModel()

### Load the trained classifier 


In [28]:
# Location of the trained classifier checkpoint (a state_dict).
CLASSIFIER_PATH = './Saved Models/classifier.pth'
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved from a CUDA session also loads on a CPU-only machine.
classifier_model.load_state_dict(torch.load(CLASSIFIER_PATH, map_location=device))

<All keys matched successfully>

### Get the classifier portion without FC Layer

In [34]:
# Keep every registered child of the classifier except the last five
# (dropout, fc1, fc2, relu, softmax - the modules assigned last in
# ClassifierModel.__init__).
# NOTE(review): nn.Sequential runs its children one after another in
# registration order, so `pool` and `leakyRelu` are each applied once, at the
# position where they were registered, rather than after every layer as in
# ClassifierModel.forward. Confirm this is the intended feature extractor
# before changing it; saved detector checkpoints were produced with it.
pretrained_classifier_model = nn.Sequential(*list(classifier_model.children())[:-5])

#### Freeze the parameters of the loaded classifier

In [20]:
# Turn off gradient tracking for every parameter of the extracted classifier
# so that only the layers added on top of it can be updated.
for param in pretrained_classifier_model.parameters():
    param.requires_grad = False

### Object Detection Model

In [35]:
class DetectionModel(nn.Module):
    """Detection head stacked on a feature-extracting classifier.

    The classifier is evaluated with gradient tracking disabled. Its
    1000-channel output then passes through three
    batch norm -> stride-2 convolution -> LeakyReLU stages and a final
    stride-2 convolution that produces 7 output channels.
    """

    def __init__(self, classifier):
        """
        Args:
            classifier: module mapping an image batch to a feature map with
                1000 channels.
        """
        super().__init__()
        self.classifier = classifier

        # Attribute names and assignment order are kept as they are because
        # they define the state_dict keys and the parameter ordering.
        self.conv1 = nn.Conv2d(1000, 64, kernel_size=3, stride=2, padding=0)
        self.batchNorm1 = nn.BatchNorm2d(1000)
        self.leakyRelu = nn.LeakyReLU(0.01)

        self.batchNorm2 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 32, kernel_size=3, stride=2, padding=0)

        self.conv3 = nn.Conv2d(32, 16, kernel_size=1, stride=2, padding=0)
        self.batchNorm3 = nn.BatchNorm2d(32)

        self.conv4 = nn.Conv2d(16, 7, kernel_size=3, stride=2, padding=0)

    def forward(self, x):
        """Return the raw 7-channel prediction volume for an image batch."""
        with torch.no_grad():
            hidden = self.classifier(x)
        # Each stage normalises its input first, then convolves and activates.
        stages = (
            (self.batchNorm1, self.conv1),
            (self.batchNorm2, self.conv2),
            (self.batchNorm3, self.conv3),
        )
        for norm, conv in stages:
            hidden = self.leakyRelu(conv(norm(hidden)))
        # The original author notes a 7 * 13 * 13 output for this pipeline.
        return self.conv4(hidden)

In [36]:
# Build the detector on top of the frozen, headless classifier.
detector_model = DetectionModel(pretrained_classifier_model)

In [23]:
# Print requires_grad for every parameter: the classifier's parameters should
# show False and the detection head's parameters True.
for i in detector_model.parameters():
    print(i.requires_grad)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [24]:
#summarize the network
# summary(detector_model.to(device), (3,416,416))

Test model

#### Load the ObjectDetection model to continue training 

In [25]:
# Checkpoint file for the detector: the training loop saves to it after every
# epoch and the notebook loads from it to resume training.
OBJECT_DETECTION_PATH = './Saved Models/Anchor_OB_Model1.pth'

### Loss Function

#### 1. Sum of Squared loss function

In [78]:
#Squared difference loss
class Squared_loss(nn.Module):
    """Masked sum of squared differences.

    forward() always returns the sum over all elements of
    ``objectness * (predicted - target) ** 2``. The ``reduction`` attribute is
    stored for signature compatibility with the built-in losses but is not
    consulted by forward().
    """

    def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
        super(Squared_loss, self).__init__()
        if size_average is not None or reduce is not None:
            # Imported here because the notebook never brings _Reduction into
            # scope; without it this branch raised NameError.
            from torch.nn import _reduction as _Reduction
            self.reduction: str = _Reduction.legacy_get_string(size_average, reduce)
        else:
            self.reduction = reduction

    def forward(self, predicted, target, objectness):
        """
        Args:
            predicted, target: tensors of identical shape, indexed as
                (batch, channel, row, col).
            objectness: per-cell weight that multiplies the squared error.

        Returns:
            0-dim tensor holding the weighted sum of squared errors.
        """
        temp = (predicted - target) ** 2
        if temp.shape[1] == 2:
            # Collapse a two-channel error (e.g. an x/y or w/h pair) into a
            # single channel before applying the mask.
            temp = torch.add(temp[:, 0:1, :, :], temp[:, 1:, :, :])
        res = torch.sum(objectness * temp)
        return res

#### 2. IOU loss Function

In [79]:
#IOU Loss
class IOULoss(nn.Module):
    """Per-cell IoU between predicted boxes and the ground-truth boxes.

    Despite the name, forward() returns an IoU map rather than a scalar loss.
    For every grid cell it holds the largest IoU between the box predicted at
    that cell and any ground-truth box of the same image. The ``reduction``
    attribute is stored but not used.
    """

    def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
        super(IOULoss, self).__init__()
        if size_average is not None or reduce is not None:
            # Imported here because the notebook never brings _Reduction into
            # scope; without it this branch raised NameError.
            from torch.nn import _reduction as _Reduction
            self.reduction: str = _Reduction.legacy_get_string(size_average, reduce)
        else:
            self.reduction = reduction

    def forward(self, predicted, target, eps=1e-6) -> torch.Tensor:
        """
        Args:
            predicted: tensor (batch, >=5, rows, cols); channels 1-4 are read as
                centre x, centre y, width, height.
            target: tensor laid out the same way; a cell whose channel 0 equals
                1 contributes the ground-truth box stored in its channels 1-4.
            eps: lower bound applied to the union area to avoid dividing by 0.

        Returns:
            Tensor (batch, 1, rows, cols) on predicted's device. Images without
            any ground-truth box yield zeros.
        """
        _, _, rows, cols = target.shape
        out_device = predicted.device
        batch_size = predicted.shape[0]
        if batch_size == 0:
            return torch.zeros(0, 1, rows, cols, dtype=predicted.dtype, device=out_device)

        pred_x, pred_y = predicted[:, 1:2], predicted[:, 2:3]
        pred_w, pred_h = predicted[:, 3:4], predicted[:, 4:5]

        # Running per-image maximum. It is kept in a list and stacked at the
        # end so no tensor that autograd has saved is modified in place.
        best = [
            torch.zeros(1, rows, cols, dtype=predicted.dtype, device=out_device)
            for _ in range(batch_size)
        ]

        # (batch, row, col) of every cell flagged as containing an object.
        object_cells = torch.nonzero(target[:, 0] == 1).tolist()
        for b, r, c in object_cells:
            gt_x, gt_y, gt_w, gt_h = target[b, 1:5, r, c].to(out_device)

            left = torch.max(pred_x[b] - pred_w[b] / 2, gt_x - gt_w / 2)
            top = torch.max(pred_y[b] - pred_h[b] / 2, gt_y - gt_h / 2)
            right = torch.min(pred_x[b] + pred_w[b] / 2, gt_x + gt_w / 2)
            bottom = torch.min(pred_y[b] + pred_h[b] / 2, gt_y + gt_h / 2)

            # Boxes that do not overlap give a negative extent; clamp it to 0.
            area_inter = torch.clamp(right - left, min=0) * torch.clamp(bottom - top, min=0)
            area_union = pred_w[b] * pred_h[b] + gt_w * gt_h - area_inter
            iou = area_inter / torch.clamp(area_union, min=eps)

            best[b] = torch.max(best[b], iou)

        return torch.stack(best, dim=0)

### Training and Testing

#### Training the Network

In [28]:
#creating a class containing accuracy, training and predict function
class TrainingNetwork:
    """Training / validation loop for the detector.

    The loop reads the notebook-level names ``dataloaders``, ``data_size``,
    ``device``, ``tqdm`` and ``OBJECT_DETECTION_PATH``.
    """

    def __init__(self, model, epoch=1, lamda_coord=5, lamda_noobj=0.5, optimizer=None, criterion=None, ious=None):
        """
        Args:
            model: network to train.
            epoch: number of passes over the data.
            lamda_coord: weight of the box-coordinate terms in the total loss.
            lamda_noobj: weight of the confidence term for cells without an object.
            optimizer: optimizer that updates the model.
            criterion: callable ``(predicted, target, mask) -> scalar loss``.
            ious: callable ``(predicted, target) -> per-cell IoU map``.
        """
        self.epsilon = 1e-6
        self.model = model
        self.epochs = epoch
        self.lamda_noobj = lamda_noobj
        self.lamda_coord = lamda_coord
        self.optimizer = optimizer
        self.criterion = criterion
        self.ious = ious
        # Summed (not averaged) loss of every epoch, one entry per epoch.
        self.val_loss = []
        self.train_loss = []
        # Prior box width / height that scales the exponentiated w/h outputs.
        self.Pw = 142
        self.Ph = 110

    def get_row_column(self, objectness):
        '''
        Build the per-cell offsets that are added to the predicted centres.

        objectness: tensor of shape (batch, 1, 13, 13); only its shape and
                    device are used
        return: (offsetx, offsety) with the shape of `objectness`, where
                offsetx[b, 0, row, col] = col * 32 and
                offsety[b, 0, row, col] = row * 32
        '''
        offsetx = torch.zeros(objectness.shape)
        offsety = torch.zeros(objectness.shape)
        steps = torch.arange(13, dtype=offsetx.dtype) * 32
        # Broadcasting fills the whole 13x13 grid at once: the column offset
        # varies along the last axis, the row offset along the one before it.
        offsetx[:, 0, :13, :13] = steps.view(1, 13)
        offsety[:, 0, :13, :13] = steps.view(13, 1)
        # The offsets follow the input tensor's device, which is where the
        # training loop has already placed the labels.
        return offsetx.to(objectness.device), offsety.to(objectness.device)

    def _compute_loss(self, outputs, labels):
        """Decode the raw outputs and return the weighted total loss for one batch."""
        objectness = labels[:, 0:1, :, :]
        cx, cy = self.get_row_column(objectness)

        # Decode the raw outputs: sigmoid for confidence, centres and class
        # scores; prior size times exp() for width and height.
        confidence_score = torch.sigmoid(outputs[:, 0:1, :, :])
        b_x = torch.sigmoid(outputs[:, 1:2, :, :]) + cx
        b_y = torch.sigmoid(outputs[:, 2:3, :, :]) + cy
        b_w = self.Pw * torch.exp(outputs[:, 3:4, :, :])
        b_h = self.Ph * torch.exp(outputs[:, 4:5, :, :])
        c_hat = torch.sigmoid(outputs[:, 5:, :, :])
        anchor_output = torch.cat((confidence_score, b_x, b_y, b_w, b_h, c_hat), dim=1)

        # 1. Bounding-box coordinate loss (w and h are compared through sqrt).
        loss_xy = self.criterion(anchor_output[:, 1:3, :, :], labels[:, 1:3, :, :], objectness)
        loss_wh = self.criterion(torch.sqrt(anchor_output[:, 3:5, :, :]), torch.sqrt(labels[:, 3:5, :, :]), objectness)
        # 2. Classification loss.
        loss_cls = self.criterion(anchor_output[:, 5:, :, :], labels[:, 5:, :, :], objectness)
        # 3. Confidence loss, split into object and no-object cells.
        iou = self.ious(anchor_output[:, 0:5, :, :], labels[:, 0:5, :, :])
        obj_loss = self.criterion(objectness, confidence_score * iou, objectness)
        noobj_loss = self.criterion(objectness, confidence_score * iou, (1 - objectness))

        return loss_cls + self.lamda_noobj * noobj_loss + obj_loss + self.lamda_coord * (loss_xy + loss_wh)

    def train_model(self):
        """Run the train and val phases for every epoch and return the model.

        After each epoch the model's state_dict is written to
        OBJECT_DETECTION_PATH. The printed losses are divided by the split
        size; the values appended to train_loss / val_loss are the raw sums.
        """
        start = time.time()
        for epoch in range(self.epochs):
            print(f'Epoch: {epoch+1}')
            for mode in ['train', 'val']:
                is_train = mode == 'train'
                running_loss = 0.0
                self.model.train(is_train)
                loader = dataloaders[mode]
                # The progress-bar total comes from the loader itself, so it
                # stays correct if the dataset or batch size changes.
                for inputs, labels in tqdm(loader, total=len(loader)):
                    inputs = inputs.float().to(device)
                    labels = labels.to(device)

                    # Ground-truth boxes are stored normalised; scale them to
                    # pixels of the 416x416 network input.
                    labels[:, 1:5, :, :] = labels[:, 1:5, :, :] * 416
                    self.optimizer.zero_grad()
                    # No autograd graph is needed for validation batches.
                    with torch.set_grad_enabled(is_train):
                        outputs = self.model(inputs)
                        loss = self._compute_loss(outputs, labels)

                    if is_train:
                        loss.backward()
                        self.optimizer.step()
                    running_loss += loss.item()
                if is_train:
                    print(f'Total Training Loss: {running_loss/data_size[mode]:.3f}')
                    self.train_loss.append(running_loss)
                else:
                    print(f'Total Validation Loss: {running_loss/data_size[mode]:.3f}')
                    self.val_loss.append(running_loss)
            print(f'----------------------------------------------')
            print(f'----------------------------------------------')
            # Checkpoint after every epoch.
            torch.save(self.model.state_dict(), OBJECT_DETECTION_PATH)
        end = time.time()
        training_time = end - start
        print(f'Training Completed in: {training_time//60} min {training_time%60:.2f} sec')
        print('Finished Training...!!!')
        return self.model

### Hyperparam

In [29]:
# Step size handed to the Adam optimizer below.
LEARNING_RATE = 0.001
# MOMENTUM = 0.9

### Epoch


In [30]:
# Number of epochs passed to TrainingNetwork.
EPOCH = 200

In [64]:
# Move the detector (including the wrapped classifier) to the selected device.
detector_model = detector_model.to(device)

#### Load the object Detection model to continue training


In [26]:
# Resume from the saved detector weights. map_location remaps the stored
# tensors onto the active device, so a checkpoint saved from a CUDA session
# also loads on a CPU-only machine.
detector_model.load_state_dict(torch.load(OBJECT_DETECTION_PATH, map_location=device))

<All keys matched successfully>

### Optimizer and Loss 

In [82]:
# Masked squared-error criterion and the per-cell IoU module used by the
# training loop.
criterion = Squared_loss()
IOU = IOULoss()
# Adam receives every parameter of the detector, including the classifier
# parameters whose requires_grad was set to False earlier.
optimizer = optim.Adam(detector_model.parameters(), lr=LEARNING_RATE)

### Train the network

In [83]:
# Bundle the model, optimizer and loss pieces into the training helper;
# lamda_coord and lamda_noobj keep their default values.
method = TrainingNetwork(detector_model, epoch=EPOCH, optimizer=optimizer, criterion=criterion, ious=IOU)

In [None]:
# Run the full training / validation loop; the same model object is returned.
trained_model = method.train_model()

### Load the model

In [37]:
# Location of the trained detector checkpoint (a state_dict).
OBJECT_MODEL_PATH = './Saved Models/Anchor_OB_Model1.pth'
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved from a CUDA session also loads on a CPU-only machine.
detector_model.load_state_dict(torch.load(OBJECT_MODEL_PATH, map_location=device))

<All keys matched successfully>

### Testing single image

#### Prediction without Non-Maximal Suppression

In [70]:
class WITHOUT_NMS:
    """Single-image inference that draws every box above a confidence threshold.

    Reads the notebook-level names ``device`` and ``val_data``.
    """

    def __init__(self, data_transform):
        """
        Args:
            data_transform: callable applied to the PIL image before inference.
        """
        self.data_transform = data_transform
        self.class_name = {0:"Pizza", 1:"Sandwich"}

    def objectDetection(self, model, PATH=None):
        """Run `model` on the image at PATH and return its raw output tensor."""
        model.eval()
        # The context manager closes the file once the pixels are read.
        # convert("RGB") guarantees the three channels Normalize expects, so
        # greyscale / RGBA / palette images no longer fail.
        with Image.open(PATH) as img:
            image_tensor = self.data_transform(img.convert("RGB"))
        img_tens = torch.unsqueeze(image_tensor, 0)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            pred = model(img_tens.to(device))
        return pred

    @staticmethod
    def _grid_offsets(shape, target_device):
        """Return (offsetx, offsety) for a (batch, 1, 13, 13) shape on target_device."""
        offsetx = torch.zeros(shape)
        offsety = torch.zeros(shape)
        steps = torch.arange(13, dtype=offsetx.dtype) * 32
        # Column offset varies along the last axis, row offset along the one
        # before it.
        offsetx[:, 0, :13, :13] = steps.view(1, 13)
        offsety[:, 0, :13, :13] = steps.view(13, 1)
        return offsetx.to(target_device), offsety.to(target_device)

    def get_offsets(self, shape):
        '''
        shape: 1*1*13*13 shape of one channel
        return: offsetx and offsety on the notebook's `device`, where
                offsetx[b, 0, row, col] = col * 32 and
                offsety[b, 0, row, col] = row * 32
        '''
        return self._grid_offsets(shape, device)

    def transform(self, preds):
        """Decode raw network outputs into confidence, box and class values.

        Channel 0 and channels 5+ go through a sigmoid; channels 1-2 are
        sigmoid plus the cell offset; channels 3-4 are the prior size
        (142 / 110) times exp(). The result stays on the device of `preds`.
        """
        offsetx, offsety = self._grid_offsets(preds[:, 0:1, :, :].shape, preds.device)
        obj = torch.sigmoid(preds[:, 0:1, :, :])
        b_x = torch.sigmoid(preds[:, 1:2, :, :]) + offsetx
        b_y = torch.sigmoid(preds[:, 2:3, :, :]) + offsety
        b_w = 142 * torch.exp(preds[:, 3:4, :, :])
        b_h = 110 * torch.exp(preds[:, 4:5, :, :])
        class_hat = torch.sigmoid(preds[:, 5:, :, :])
        return torch.cat((obj, b_x, b_y, b_w, b_h, class_hat), dim=1)

    def predict_bbox_without_nms(self, model, confidence_threshold: float, index: int):
        '''
        confidence_threshold: minimum confidence score for a cell's box to be drawn.
        index: index of the image in the validation data.
        return: None. When at least one box passes the threshold, the annotated
                image is written to ./image_without_nms/image<index>.jpg
        '''
        IMG_PATH = val_data[index][0]
        preds = self.transform(self.objectDetection(model.to(device), IMG_PATH))
        print(preds[:, 0:1, :, :])
        img = cv2.imread(IMG_PATH)
        img = cv2.resize(img, (416, 416), interpolation = cv2.INTER_AREA)
        drew_any = False
        for x_i in range(13):
            for y_i in range(13):
                if preds[0, 0, x_i, y_i] >= confidence_threshold:
                    x, y, w, h = (preds[0, 1:5, x_i, y_i].cpu().detach().numpy()).astype("int")
                    classes = preds[0, 5:, x_i, y_i].cpu().detach().numpy()
                    cls = int(np.argmax(classes))
                    x1, y1 = int(x-(w/2)), int(y-(h/2))
                    x2, y2 = int(x+(w/2)), int(y+(h/2))
                    # One colour per class, as (B, G, R) tuples.
                    colour = (255, 0, 255) if cls == 0 else (255, 0, 0)
                    cv2.rectangle(img, (x1, y1), (x2, y2), colour, 1)
                    cv2.putText(img, f"{self.class_name[cls]}", (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colour)
                    print(x1, y1, x2, y2)
                    drew_any = True
        # Write the image once, after all boxes are drawn. As before, nothing
        # is written when no cell passes the threshold.
        if drew_any:
            cv2.imwrite('./image_without_nms/image'+str(index)+'.jpg', img)
# Helper instance that shares the notebook's image transform.
without_nms = WITHOUT_NMS(data_transform)

In [71]:
#testing within validation data given index.
# Draw every box with confidence >= 0.3 for validation image 1.
without_nms.predict_bbox_without_nms(detector_model, confidence_threshold=0.3, index=1)

tensor([[[[0.1520, 0.0803, 0.0810, 0.0821, 0.0970, 0.0918, 0.0961, 0.0884,
           0.0713, 0.0695, 0.0832, 0.2043, 0.3215],
          [0.0766, 0.0754, 0.0878, 0.0766, 0.0984, 0.1473, 0.1580, 0.0692,
           0.0646, 0.0366, 0.0293, 0.1061, 0.2769],
          [0.0944, 0.0789, 0.1000, 0.2037, 0.3705, 0.1562, 0.2313, 0.1106,
           0.0761, 0.0625, 0.0613, 0.0324, 0.0661],
          [0.0882, 0.1025, 0.2617, 0.2305, 0.2170, 0.1964, 0.2454, 0.2413,
           0.0884, 0.0700, 0.0697, 0.0664, 0.0684],
          [0.0919, 0.0938, 0.0868, 0.1115, 0.2711, 0.3381, 0.2405, 0.2106,
           0.1576, 0.0964, 0.0551, 0.0796, 0.0642],
          [0.1012, 0.0610, 0.0977, 0.0707, 0.2277, 0.1693, 0.1598, 0.3056,
           0.1622, 0.0823, 0.1504, 0.1660, 0.0479],
          [0.1016, 0.0987, 0.1082, 0.0453, 0.0679, 0.3928, 0.3009, 0.1209,
           0.0991, 0.1063, 0.2099, 0.0622, 0.0369],
          [0.0884, 0.0611, 0.0632, 0.0399, 0.1132, 0.3891, 0.1764, 0.2603,
           0.1230, 0.1109, 0.0580, 0

### Prediction with Non-maximal Suppression

In [86]:
class NMS(WITHOUT_NMS):
    """Single-image inference followed by per-class non-maximum suppression."""

    # Image used when no explicit path is supplied.
    DEFAULT_IMAGE_PATH = './plot image/sandwich.jpg'

    def __init__(self, confidence_threshold: float, iou_threshold: float, val_data: list, index: int):
        """
        Args:
            confidence_threshold: a cell is kept only if its confidence is
                strictly greater than this value.
            iou_threshold: IoU threshold handed to torchvision.ops.nms.
            val_data: validation list (stored; not read by the methods below).
            index: number used in the name of the written output image.
        """
        super().__init__(data_transform)
        self.confidence_threshold = confidence_threshold
        self.iou_threshold = iou_threshold
        self.val_data=val_data
        self.index=index
        self.img = None
        self.nms1 = None
        self.nms2 = None
        self.box1 = []
        self.score1 = []
        self.box2 = []
        self.score2 = []
        self.class_1 = []
        self.class_2 = []

    def predict_bbox_with_nms(self, model, path=None):
        """Detect boxes on one image and run NMS separately for each class.

        Args:
            model: detector to run.
            path: image to process; DEFAULT_IMAGE_PATH is used when None.

        Returns:
            (nms1, nms2): indices kept by NMS for the class-0 (pizza) and
            class-1 (sandwich) candidates; None for a class with no candidate.
        """
        IMG_PATH = self.DEFAULT_IMAGE_PATH if path is None else path
        # Start from empty accumulators so the method can be called repeatedly
        # (the attributes hold tensors after a call, which cannot be appended to).
        box1, score1, class_1 = [], [], []
        box2, score2, class_2 = [], [], []
        self.nms1 = None
        self.nms2 = None

        preds = self.objectDetection(model, IMG_PATH)
        check = self.transform(preds)
        print(check[:,0,:,:])
        self.img = cv2.imread(IMG_PATH)
        self.img = cv2.resize(self.img, (416, 416), interpolation = cv2.INTER_AREA)
        for x_i in range(13):
            for y_i in range(13):
                if check[0, 0, x_i, y_i] > self.confidence_threshold:
                    x, y, w, h = (check[0, 1:5, x_i, y_i].cpu().detach().numpy()).astype("int")
                    print(f"x, w:{x, w}")
                    classes = check[0, 5:, x_i, y_i].cpu().detach().numpy()
                    cls = np.argmax(classes)
                    x1, y1 = int(x-(w/2)), int(y-(h/2))
                    x2, y2 = int(x+(w/2)), int(y+(h/2))
                    score = check[0, 0, x_i, y_i].item()
                    if cls==0:
                        box1.append([x1, y1, x2, y2])
                        score1.append(score)
                        class_1.append(classes[0])
                    else:
                        box2.append([x1, y1, x2, y2])
                        score2.append(score)
                        class_2.append(classes[1])

        self.class_1, self.class_2 = class_1, class_2
        self.box1 = torch.from_numpy(np.array(box1).astype('float'))
        self.box2 = torch.from_numpy(np.array(box2).astype('float'))
        self.score1 = torch.from_numpy(np.array(score1).astype('float'))
        self.score2 = torch.from_numpy(np.array(score2).astype('float'))
        if self.box1.shape[0]!=0:
            self.nms1 = nms(self.box1, self.score1, self.iou_threshold)
        if self.box2.shape[0]!=0:
            self.nms2= nms(self.box2, self.score2, self.iou_threshold)
        return self.nms1, self.nms2

    def _draw_kept(self, kept, boxes, class_scores, label, colour):
        """Draw the boxes selected by NMS together with their class score."""
        for keep_idx in kept.tolist():
            x1, y1, x2, y2 = boxes[keep_idx].cpu().detach().numpy().astype('int')
            cv2.rectangle(self.img, (x1, y1), (x2, y2), colour, 1)
            # class_scores is indexed with the kept candidate index so that the
            # score belongs to the box being drawn.
            cv2.putText(self.img, f"{class_scores[keep_idx]:.2f}{label}", (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colour)

    def predict(self, model, path=None):
        """Run detection + NMS, draw the surviving boxes and save the image.

        Args:
            model: detector to run.
            path: image to process; DEFAULT_IMAGE_PATH is used when None.

        The annotated image is written to ./Test images1/Image_<index>.jpg.
        """
        nms11, nms22 = self.predict_bbox_with_nms(model, path)
        if nms11 is None:
            print('No Pizza')
        else:
            print('Pizza found.')
            self._draw_kept(nms11, self.box1, self.class_1, "     Pizza", (255, 0, 0))
        if nms22 is None:
            print('No Sandwich')
        else:
            print('Sandwich found.') 
            self._draw_kept(nms22, self.box2, self.class_2, "    Sandwich", (0, 0, 255))
        cv2.imwrite('./Test images1/Image_'+ str(self.index)+'.jpg', self.img)
        return 

In [None]:
# IMG_PATH = __

In [90]:
# Keep cells with confidence > 0.6 and suppress overlapping boxes at IoU 0.5.
nms_obj = NMS(confidence_threshold=0.6, iou_threshold=0.5, val_data=val_data, index=1)
# NMS.predict(self, model) takes only the model; the extra `path=None` keyword
# that used to be passed here raised a TypeError.
nms_obj.predict(detector_model)

tensor([[[0.3099, 0.3662, 0.3216, 0.2438, 0.3328, 0.1518, 0.0808, 0.0266,
          0.0891, 0.2273, 0.1189, 0.0795, 0.1129],
         [0.2992, 0.2154, 0.3468, 0.3205, 0.1648, 0.0674, 0.0532, 0.0599,
          0.0088, 0.0405, 0.3477, 0.2331, 0.3475],
         [0.3603, 0.2216, 0.2566, 0.3368, 0.2018, 0.0902, 0.0483, 0.0808,
          0.0921, 0.2538, 0.3724, 0.3118, 0.2938],
         [0.1959, 0.1126, 0.2193, 0.1315, 0.2277, 0.1295, 0.1456, 0.1650,
          0.4618, 0.2117, 0.3566, 0.3044, 0.4911],
         [0.2005, 0.5975, 0.6077, 0.2293, 0.0807, 0.0880, 0.1576, 0.1609,
          0.3416, 0.3145, 0.1752, 0.1314, 0.4006],
         [0.2950, 0.3329, 0.2953, 0.4101, 0.2949, 0.2322, 0.1805, 0.0647,
          0.5681, 0.2153, 0.2214, 0.3280, 0.2209],
         [0.3726, 0.5772, 0.3631, 0.2973, 0.0984, 0.2103, 0.1725, 0.1718,
          0.3894, 0.2324, 0.1852, 0.1192, 0.1071],
         [0.4227, 0.4424, 0.3416, 0.2964, 0.1630, 0.1759, 0.1843, 0.3069,
          0.2714, 0.2353, 0.3370, 0.3305, 0.1759],


## Sanity Check

#### 1. IOU Check

In [None]:
import cv2
# IoU module instance exercised by the sanity check in the next cell.
check_iou = IOULoss()
# check1 = cv2.rectangle(img, (x1, y1), (x2, y2), (255,0,0), 2)
# cv2.imwrite('./Iou Check/image'+str(index)+'.jpg', img) 

In [None]:
# IMG_PATH must already be defined; the placeholder cell above leaves it
# commented out.
img = cv2.imread(IMG_PATH)
img = cv2.resize(img, (416, 416))

# Boxes are (centre x, centre y, width, height) in pixels of the 416x416 image.
x1, y1, w1, h1 = 110, 120, 100, 100   # predicted box, cell (0, 0)
x2, y2, w2, h2 = 55, 120, 100, 100    # ground-truth box, cell (0, 0)
x3, y3, w3, h3 = 310, 240, 100, 100   # predicted box, cell (0, 1)
x4, y4, w4, h4 = 310, 240, 100, 100   # ground-truth box, cell (0, 1)

_boxes_and_colours = [
    ((x1, y1, w1, h1), (255, 0, 0)),
    ((x2, y2, w2, h2), (0, 255, 0)),
    ((x3, y3, w3, h3), (0, 255, 255)),
    ((x4, y4, w4, h4), (0, 255, 255)),
]
for (_bx, _by, _bw, _bh), _colour in _boxes_and_colours:
    img = cv2.rectangle(
        img,
        (int(_bx - _bw / 2), int(_by - _bh / 2)),
        (int(_bx + _bw / 2), int(_by + _bh / 2)),
        _colour,
        1,
    )

# IOULoss reads channel 0 of the target as objectness and channels 1-4 of both
# tensors as x, y, w, h. The tensors therefore need 5 channels; with the
# earlier 4-channel layout no target cell was ever equal to 1 and the printed
# result was all zeros.
tensor1 = torch.zeros(1, 5, 13, 13)   # predictions
tensor2 = torch.zeros(1, 5, 13, 13)   # ground truth
for _cell, _box in enumerate([(x1, y1, w1, h1), (x3, y3, w3, h3)]):
    tensor1[0, 1:5, 0, _cell] = torch.tensor(_box, dtype=tensor1.dtype)
for _cell, _box in enumerate([(x2, y2, w2, h2), (x4, y4, w4, h4)]):
    tensor2[0, 0, 0, _cell] = 1
    tensor2[0, 1:5, 0, _cell] = torch.tensor(_box, dtype=tensor2.dtype)

# Both tensors go to `device` so they live on the same device as the module's
# intermediate tensors.
iou_value = check_iou(tensor1.to(device), tensor2.to(device))
# Expected: about 0.29 at [0, 0, 0, 0] (4500 / 15500, the overlap of boxes 1
# and 2) and 1.0 at [0, 0, 0, 1] (boxes 3 and 4 are identical).
print(iou_value)
cv2.imshow('IOUtest',img)
cv2.waitKey(0)
# Close the preview window once a key has been pressed.
cv2.destroyAllWindows()