In [1]:
## https://github.com/experiencor/keras-yolo3/blob/master/yolo3_one_file_to_detect_them_all.py

In [1]:
import numpy as np
import math
import pandas as pd
import cv2
import os
import tqdm
from scipy.io import loadmat

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from PIL import Image
# import pytesseract

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras import backend as K

# from utils import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam, SGD
from keras.layers import *

from keras.applications import MobileNetV2
from keras.applications import InceptionResNetV2

from keras.models import Model
from keras.models import model_from_json

from keras.initializers import he_normal


In [17]:
target_size = [608, 608]
target_w = 608 # target sizes of image in model input
target_h = 608 #target sizes of image in model input

grid_size_1 = [19, 19]
grid_y_axis_1 = 19  # each image is to be segmented to 17 x 17 grid
grid_x_axis_1 = 19  # # each image is to be segmented to 17 x 17 grid
grid_stride_1 = target_w / grid_x_axis_1  # grid cell width
grid_stride_1 = target_h / grid_y_axis_1  # grid cell height

grid_size_2 = [38, 38]
grid_y_axis_2 = 38  # each image is to be segmented to 17 x 17 grid
grid_x_axis_2 = 38  # # each image is to be segmented to 17 x 17 grid
grid_stride_2 = target_w / grid_x_axis_2  # grid cell width
grid_stride_2 = target_h / grid_y_axis_2  # grid cell height

grid_size_3 = [76, 76]
grid_y_axis_3 = 76  # each image is to be segmented to 17 x 17 grid
grid_x_axis_3 = 76  # # each image is to be segmented to 17 x 17 grid
grid_stride_3 = target_w / grid_x_axis_3  # grid cell width
grid_stride_3 = target_h / grid_y_axis_3  # grid cell height


channels = 3
num_anchors = 3
class_num = 1 # vendor, invoice, inv_date, po, buyer
info = 5 + class_num    # pc, x, y, h, w, and class probabilities

categories = ['item'] # details of classes

In [2]:
def _conv_block(inp, convs, skip=True):
    x = inp
    count = 0
    
    for conv in convs:
        if count == (len(convs) - 2) and skip:
            skip_connection = x
        count += 1
        
        if conv['stride'] > 1: x = ZeroPadding2D(((1,0),(1,0)))(x) # peculiar padding as darknet prefer left and top
        x = Conv2D(conv['filter'], 
                   conv['kernel'], 
                   strides=conv['stride'], 
                   padding='valid' if conv['stride'] > 1 else 'same', # peculiar padding as darknet prefer left and top
                   name='conv_' + str(conv['layer_idx']), 
                   use_bias=False if conv['bnorm'] else True)(x)
        if conv['bnorm']: x = BatchNormalization(epsilon=0.001, name='bnorm_' + str(conv['layer_idx']))(x)
        if conv['leaky']: x = LeakyReLU(alpha=0.1, name='leaky_' + str(conv['layer_idx']))(x)

    return add([skip_connection, x]) if skip else x


In [6]:
# As per original model of 'experiencor'

def make_yolov3_model():
    input_image = Input(shape=(608, 608, 3))

    # Layer  0 => 4
    x = _conv_block(input_image, [{'filter': 32, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 0},
                                  {'filter': 64, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 1},
                                  {'filter': 32, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 2},
                                  {'filter': 64, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 3}])

    # Layer  5 => 8
    x = _conv_block(x, [{'filter': 128, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 5},
                        {'filter':  64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 6},
                        {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 7}])

    # Layer  9 => 11
    x = _conv_block(x, [{'filter':  64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 9},
                        {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 10}])

    # Layer 12 => 15
    x = _conv_block(x, [{'filter': 256, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 12},
                        {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 13},
                        {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 14}])

    # Layer 16 => 36
    for i in range(7):
        x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 16+i*3},
                            {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 17+i*3}])
        
    skip_36 = x
        
    # Layer 37 => 40
    x = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 37},
                        {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 38},
                        {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 39}])

    # Layer 41 => 61
    for i in range(7):
        x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 41+i*3},
                            {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 42+i*3}])
        
    skip_61 = x
        
    # Layer 62 => 65
    x = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 62},
                        {'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 63},
                        {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 64}])

    # Layer 66 => 74
    for i in range(3):
        x = _conv_block(x, [{'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 66+i*3},
                            {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 67+i*3}])
        
    # Layer 75 => 79
    x = _conv_block(x, [{'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 75},
                        {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 76},
                        {'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 77},
                        {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 78},
                        {'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 79}], skip=False)

    # Layer 80 => 82
    yolo_82 = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 80},
                              {'filter':  255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 81}], skip=False)

    # Layer 83 => 86
    x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 84}], skip=False)
    x = UpSampling2D(2)(x)
    x = concatenate([x, skip_61])

    # Layer 87 => 91
    x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 87},
                        {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 88},
                        {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 89},
                        {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 90},
                        {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 91}], skip=False)

    # Layer 92 => 94
    yolo_94 = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 92},
                              {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 93}], skip=False)

    # Layer 95 => 98
    x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True,   'layer_idx': 96}], skip=False)
    x = UpSampling2D(2)(x)
    x = concatenate([x, skip_36])

    # Layer 99 => 106
    yolo_106 = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 99},
                               {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 100},
                               {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 101},
                               {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 102},
                               {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 103},
                               {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 104},
                               {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 105}], skip=False)

#    model = Model(input_image, [yolo_82, yolo_94, yolo_106]) 
    
    model_1 = Model(input_image, yolo_82)
    model_2 = Model(input_image, yolo_94)
    model_3 = Model(input_image, yolo_106)
    
#    return model
    return model_1, model_2, model_3

In [10]:
# 5/8 dropout
input_size = (608, 608, 3)


my_model_1, _, _ = make_yolov3_model()
_, my_model_2, _ = make_yolov3_model()
_, _, my_model_3 = make_yolov3_model()
# my_model_4 = make_yolov3_model()

print(my_model_3.summary())


Model: "functional_61"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 608, 608, 3) 0                                            
__________________________________________________________________________________________________
conv_0 (Conv2D)                 (None, 608, 608, 32) 864         input_13[0][0]                   
__________________________________________________________________________________________________
bnorm_0 (BatchNormalization)    (None, 608, 608, 32) 128         conv_0[0][0]                     
__________________________________________________________________________________________________
leaky_0 (LeakyReLU)             (None, 608, 608, 32) 0           bnorm_0[0][0]                    
______________________________________________________________________________________

In [18]:
# As per original model of 'experiencor'. But here, changed last conv lines to suit 3 X 6 output for 1 class :

def make_yolov3_model():
    input_image = Input(shape=(608, 608, 3))

    # Layer  0 => 4
    x = _conv_block(input_image, [{'filter': 32, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 0},
                                  {'filter': 64, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 1},
                                  {'filter': 32, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 2},
                                  {'filter': 64, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 3}])

    # Layer  5 => 8
    x = _conv_block(x, [{'filter': 128, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 5},
                        {'filter':  64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 6},
                        {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 7}])

    # Layer  9 => 11
    x = _conv_block(x, [{'filter':  64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 9},
                        {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 10}])

    # Layer 12 => 15
    x = _conv_block(x, [{'filter': 256, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 12},
                        {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 13},
                        {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 14}])

    # Layer 16 => 36
    for i in range(7):
        x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 16+i*3},
                            {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 17+i*3}])
        
    skip_36 = x
        
    # Layer 37 => 40
    x = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 37},
                        {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 38},
                        {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 39}])

    # Layer 41 => 61
    for i in range(7):
        x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 41+i*3},
                            {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 42+i*3}])
        
    skip_61 = x
        
    # Layer 62 => 65
    x = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 62},
                        {'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 63},
                        {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 64}])

    # Layer 66 => 74
    for i in range(3):
        x = _conv_block(x, [{'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 66+i*3},
                            {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 67+i*3}])
        
    # Layer 75 => 79
    x = _conv_block(x, [{'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 75},
                        {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 76},
                        {'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 77},
                        {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 78},
                        {'filter':  512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 79}], skip=False)

    # Layer 80 => 82
    yolo_82 = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 80},
                              {'filter':  256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 81},
                              {'filter':  128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 107},
                              {'filter':   64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 108},
                              {'filter':   18, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 109}], skip=False)

    # Layer 83 => 86
    x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 84}], skip=False)
    x = UpSampling2D(2)(x)
    x = concatenate([x, skip_61])

    # Layer 87 => 91
    x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 87},
                        {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 88},
                        {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 89},
                        {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 90},
                        {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 91}], skip=False)

    # Layer 92 => 94
    yolo_94 = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 92},
                              {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True,  'leaky': True, 'layer_idx': 93},
                              {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True, 'layer_idx': 110},
                              {'filter':  64, 'kernel': 1, 'stride': 1, 'bnorm': True,  'leaky': True, 'layer_idx': 111},
                              {'filter':  18, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 112}], skip=False)

    # Layer 95 => 98
    x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True,   'layer_idx': 96}], skip=False)
    x = UpSampling2D(2)(x)
    x = concatenate([x, skip_36])

    # Layer 99 => 106
    yolo_106 = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 99},
                               {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 100},
                               {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 101},
                               {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 102},
                               {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 103},
                               {'filter':  64, 'kernel': 3, 'stride': 1, 'bnorm': True,  'leaky': True,  'layer_idx': 104},
                               {'filter':  18, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 105}], skip=False)

#    model = Model(input_image, [yolo_82, yolo_94, yolo_106])

    final_1 = Reshape((grid_y_axis_1,grid_x_axis_1,num_anchors,info))(yolo_82)
    final_2 = Reshape((grid_y_axis_2,grid_x_axis_2,num_anchors,info))(yolo_94)
    final_3 = Reshape((grid_y_axis_3,grid_x_axis_3,num_anchors,info))(yolo_106)
    
    model_1 = Model(input_image, final_1)
    model_2 = Model(input_image, final_2)
    model_3 = Model(input_image, final_3)
    
#    return model
    return model_1, model_2, model_3

In [21]:
input_size = (608, 608, 3)


my_model_1, _, _ = make_yolov3_model()
_, my_model_2, _ = make_yolov3_model()
_, _, my_model_3 = make_yolov3_model()
# my_model_4 = make_yolov3_model()

print(my_model_3.summary())


Model: "functional_151"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           [(None, 608, 608, 3) 0                                            
__________________________________________________________________________________________________
conv_0 (Conv2D)                 (None, 608, 608, 32) 864         input_28[0][0]                   
__________________________________________________________________________________________________
bnorm_0 (BatchNormalization)    (None, 608, 608, 32) 128         conv_0[0][0]                     
__________________________________________________________________________________________________
leaky_0 (LeakyReLU)             (None, 608, 608, 32) 0           bnorm_0[0][0]                    
_____________________________________________________________________________________

In [22]:
# Making a list of image path

inv_directory = '/home/scar3crow/Downloads/drg_all'   
                                                        
inv_new_image = ['/home/scar3crow/Downloads/drg_all/{}'.format(i) for i in os.listdir(inv_directory)] # making the list
inv_new_image.sort() # Sorting the list

num_images = len(inv_new_image)

print('Number of images = ', num_images)


Number of images =  647


In [23]:
# Check sizes of exiting images & Create a Dataframe with image id and height(row) and width(column):

rows = []
columns = []
image_sl = []
df_new = pd.DataFrame()

for i in range(len(inv_new_image)):
    image = cv2.imread(inv_new_image[i]) ## Loading image
    height, width, _ = image.shape
    rows.append(height)
    columns.append(width)
    image_sl.append(inv_new_image[i])
        
row_values = pd.Series(rows)
col_values = pd.Series(columns)
image_num = pd.Series(image_sl)


df_new.insert(loc=0, column='image_serial', value=image_num)
df_new.insert(loc=1, column='rows', value=row_values)
df_new.insert(loc=2, column='columns', value=col_values)

df_new.tail(3)

Unnamed: 0,image_serial,rows,columns
644,/home/scar3crow/Downloads/drg_all/97.jpg,2479,3508
645,/home/scar3crow/Downloads/drg_all/98.jpg,2479,3508
646,/home/scar3crow/Downloads/drg_all/99.jpg,2479,3508


In [25]:
r_new_data_8 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_8.csv')

for i in range(len(r_new_data_8)):
    
    f_name = r_new_data_8['#filename'][i]
    idx = r_new_data_8.index[r_new_data_8['#filename'] == f_name]
    f_name_rev = str(f_name) + '.jpg'
    
    r_new_data_8.at[idx, '#filename'] = f_name_rev
    

In [26]:
# Loading output of VGG Image Annotation tool and create a dataframe

r_new_data = pd.DataFrame()

r_new_data_1 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_1.csv')
r_new_data_2 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_2.csv')
r_new_data_3 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_3.csv')
r_new_data_4 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_4.csv')
r_new_data_5 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_5.csv')
r_new_data_6 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_6.csv')
r_new_data_7 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_7.csv')
r_new_data_9 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_9.csv')
r_new_data_10 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_10.csv')
r_new_data_11 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_11.csv')
r_new_data_12 = pd.read_csv('/home/scar3crow/Downloads/drg_via/via_drg_12.csv')

r_new_data = pd.concat([r_new_data_1, r_new_data_2, r_new_data_3, r_new_data_4, r_new_data_5, r_new_data_6, r_new_data_7, r_new_data_8, r_new_data_9, r_new_data_10, r_new_data_11, r_new_data_12], axis = 0, ignore_index = True)

num_obj = r_new_data['region_count'][0] # number of objects in each photo
r_new_data.drop(r_new_data.columns[[1, 2, 3, 4]], axis=1, inplace=True) # reduce unnecessary columns

r_new_data = r_new_data[(r_new_data['#filename'] != '33.jpg') & (r_new_data['#filename'] != '20.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '74.jpg') & (r_new_data['#filename'] != '42.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '96.jpg') & (r_new_data['#filename'] != '28.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '47.jpg') & (r_new_data['#filename'] != '122.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '63.jpg') & (r_new_data['#filename'] != '97.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '56.jpg') & (r_new_data['#filename'] != '38.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '36.jpg') & (r_new_data['#filename'] != '309.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '314.jpg') & (r_new_data['#filename'] != '317.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '318.jpg') & (r_new_data['#filename'] != '324.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '330.jpg') & (r_new_data['#filename'] != '331.jpg')].reset_index(drop=True)
r_new_data = r_new_data[(r_new_data['#filename'] != '525.jpg') & (r_new_data['#filename'] != '322.jpg')].reset_index(drop=True)

r_new_data.sort_values(by=['#filename'], ascending=True) # Sorting based on image-id
num_images = r_new_data["#filename"].nunique() # Find out number of unique images

print('Number of unique images = ', num_images)
r_new_data.head(5)

Number of unique images =  634


Unnamed: 0,#filename,region_shape_attributes,region_attributes
0,1.jpg,"{""name"":""rect"",""x"":1886,""y"":1863,""width"":1457,...","{""text"":""item""}"
1,2.jpg,"{""name"":""rect"",""x"":2230,""y"":1615,""width"":1243,...","{""text"":""item""}"
2,3.jpg,"{""name"":""rect"",""x"":2143,""y"":1855,""width"":1200,...","{""text"":""item""}"
3,4.jpg,"{""name"":""rect"",""x"":2071,""y"":1840,""width"":1393,...","{""text"":""item""}"
4,5.jpg,"{""name"":""rect"",""x"":2224,""y"":1330,""width"":1140,...","{""text"":""item""}"


In [27]:
# Making a dataframe for Image_id, x, y, width, height, class, image_width and image_height

x = []
y = []
width = []
height = []
obj_class = []
i_width = []
i_height = []
img_path = []
img_index = []

for i in range(len(r_new_data)):
    
    r_size = r_new_data.values[i, 1][1:(len(r_new_data.values[i, 1])-1)]
    r_size_par = r_size.split(",")
    
    x.append(int("".join(filter(str.isdigit, r_size_par[1]))))
    y.append(int("".join(filter(str.isdigit, r_size_par[2]))))
    width.append(int("".join(filter(str.isdigit, r_size_par[3]))))
    height.append(int("".join(filter(str.isdigit, r_size_par[4]))))
    
    r_attribs = r_new_data.values[i, 2][1:(len(r_new_data.values[i, 2])-1)]
    r_attribs_par = r_attribs.split(':')[1]
    obj_class.append(r_attribs_par[1:(len(r_attribs_par)-1)])
    
    foto_id = r_new_data['#filename'][i]

    i_path = '/home/scar3crow/Downloads/drg_all/' + foto_id
    foto_index = int(df_new[df_new['image_serial'] == i_path].index[0])
    foto_width = df_new['columns'][foto_index]
    foto_height = df_new['rows'][foto_index]
    i_width.append(foto_width)
    i_height.append(foto_height)
    img_path.append(i_path)
    img_index.append(foto_index)
    
x_values = pd.Series(x)
y_values = pd.Series(y)
width_values = pd.Series(width)
height_values = pd.Series(height)
class_values = pd.Series(obj_class)
i_width_values = pd.Series(i_width)
i_height_values = pd.Series(i_height)
img_path_values = pd.Series(img_path)
img_index_values = pd.Series(img_index)

r_new_data.insert(loc=1, column='img_idx', value=img_index_values)
r_new_data.insert(loc=2, column='i_path', value=img_path_values)
r_new_data.insert(loc=3, column='x', value=x_values)
r_new_data.insert(loc=4, column='y', value=y_values)
r_new_data.insert(loc=5, column='width', value=width_values)
r_new_data.insert(loc=6, column='height', value=height_values)
r_new_data.insert(loc=7, column='obj_class', value=class_values)
r_new_data.insert(loc=8, column='img_wd', value=i_width_values)
r_new_data.insert(loc=9, column='img_ht', value=i_height_values)

r_new_data.drop(r_new_data.columns[[10, 11]], axis=1, inplace=True) # reduce unnecessary columns

r_new_data.rename({'#filename': 'img_id'}, axis=1, inplace=True) # changing column name

r_new_data[0:3]


Unnamed: 0,img_id,img_idx,i_path,x,y,width,height,obj_class,img_wd,img_ht
0,1.jpg,0,/home/scar3crow/Downloads/drg_all/1.jpg,1886,1863,1457,107,item,3509,2480
1,2.jpg,256,/home/scar3crow/Downloads/drg_all/2.jpg,2230,1615,1243,410,item,3509,2480
2,3.jpg,415,/home/scar3crow/Downloads/drg_all/3.jpg,2143,1855,1200,115,item,3509,2480


In [28]:
## Find out image with max boxes - to check whether annotation is correct :

max_box_image = r_new_data['img_id'].value_counts().max()
image_with_max_box =  r_new_data['img_id'].value_counts().idxmax()
print(image_with_max_box, 'with', max_box_image, 'boxes')

1134.jpg with 1 boxes


In [29]:
def single_image_info(lines):
    
    ## lines will be a dataframe like, for i in range(num_images), lines = r_new_data[i*5:(i+1)*5]
    
    line_idx = lines.iat[0, 1]
    pic_path = lines.iat[0, 2]
    img_width = lines.iat[0, 8]
    img_height = lines.iat[0, 9]
    
    boxes = []
    labels = []
    for i in range(len(lines)):
        
        index_of = lines['obj_class'][i]
        label = categories.index(index_of)
        x_min, y_min, x_max, y_max = float(lines.iat[i,3]), float(lines.iat[i,4]), float(lines.iat[i,3]+lines.iat[i,5]), float(lines.iat[i,4]+lines.iat[i,6])
        boxes.append([x_min, y_min, x_max, y_max])
        labels.append(label)
        
    boxes = np.asarray(boxes, np.float32)
    labels = np.asarray(labels, np.int64)
    
    return line_idx, pic_path, boxes, labels, img_width, img_height  ## boxes are in format xmin, ymin, xmax, ymax


In [30]:
## Creating the complete data set :

all_image_line = []
limit_lower = 0
limit_upper = 0
for i in range(num_images):
    image_line = []
    
    img = r_new_data['img_id'][limit_upper]
    kount = r_new_data['img_id'].value_counts()[img]
       
    limit_lower = limit_upper
    limit_upper = limit_lower + kount
    
    lines = r_new_data[limit_lower:limit_upper].reset_index(drop=True)
    """
    print('===================', i)
    print(img)
    print(limit_lower)
    print(limit_upper)
    """
    
    line_idx, pic_path, boxes, labels, img_width, img_height = single_image_info(lines)
    image_line.append(line_idx)
    image_line.append(pic_path)
    image_line.append(boxes)
    image_line.append(labels)
    image_line.append(img_width)
    image_line.append(img_height)
    all_image_line.append(image_line)
    
print(len(all_image_line))
print(all_image_line[3])   ##  boxes are in format xmin, ymin, xmax, ymax

634
[467, '/home/scar3crow/Downloads/drg_all/4.jpg', array([[2071., 1840., 3464., 2056.]], dtype=float32), array([0]), 3509, 2480]


In [31]:
print('Number of unique images = ', r_new_data['img_id'].nunique())  # print total no, of unique images
print('Number of classes in diff. categories = ', r_new_data['obj_class'].value_counts())

Number of unique images =  634
Number of classes in diff. categories =  item    634
Name: obj_class, dtype: int64


In [33]:
# Train and Test split

data_train, data_val = train_test_split(all_image_line, train_size = 0.90 , shuffle = True)

num_all_bbox = len(all_image_line) * len(all_image_line[0][2])
num_bb_train = len(data_train) * len(data_train[0][2])
num_bb_val = len(data_val) * len(data_val[0][2])
print(num_all_bbox, num_bb_train, num_bb_val)

634 570 64




In [34]:
# calculating anchors from true boundary boxes :

def iou_kmeans(box, clusters):
    """
    Calculates the Intersection over Union (IoU) between a box and k clusters.
    :param box: tuple or array, shifted to the origin (i. e. width and height)
    :param clusters: numpy array of shape (k, 2) where k is the number of clusters
    :return: numpy array of shape (k, 0) where k is the number of clusters
    """
    x = np.minimum(clusters[:, 0], box[0])
    y = np.minimum(clusters[:, 1], box[1])
    if np.count_nonzero(x == 0) > 0 or np.count_nonzero(y == 0) > 0:
        raise ValueError("Box has no area")

    intersection = x * y
    box_area = box[0] * box[1]
    cluster_area = clusters[:, 0] * clusters[:, 1]

    iou = intersection / (box_area + cluster_area - intersection)

    return iou

def kmeans(boxes, k, dist=np.median):
    """
    Calculates k-means clustering with the Intersection over Union (IoU) metric.
    :param boxes: numpy array of shape (r, 2), where r is the number of rows
    :param k: number of clusters
    :param dist: distance function
    :return: numpy array of shape (k, 2)
    """
    rows = boxes.shape[0]

    distances = np.empty((rows, k))
    last_clusters = np.zeros((rows,))

    np.random.seed()

    # the Forgy method will fail if the whole array contains the same rows
    clusters = boxes[np.random.choice(rows, k, replace=False)]


    while True:
        for row in range(rows):
            distances[row] = 1 - iou_kmeans(boxes[row], clusters)

        nearest_clusters = np.argmin(distances, axis=1)

        if (last_clusters == nearest_clusters).all():
            break

        for cluster in range(k):
            clusters[cluster] = dist(boxes[nearest_clusters == cluster], axis=0)

        last_clusters = nearest_clusters

    return clusters

In [35]:
# Finding out anchors :
## Firstly, converting true boundary box width, height to width & height with respect to target image :
## finaly find anchors. Anchors here are in absolute size w.r.t. target image but not as % of target image or 
## as multiple of unit grids.

num_all_bb = len(r_new_data) # if no. of bboxes varies for images, this formula should be used 

anchors_wrt_target = np.zeros((3,2))

# num_all_bb = len(all_image_line) * len(all_image_line[0][2])  ## from all image line data, this formula to be used if no. of bboxes are same for all images

b_box_wrt_target = np.zeros((num_all_bb,2))

for i in range(num_all_bb):
    
    image_w = r_new_data['img_wd'][i]
    image_h = r_new_data['img_ht'][i]

    x_ratio = target_w / image_w 
    y_ratio = target_h / image_h

    # x_ratio = 1.    ## since we shall only pad the images 
    # y_ratio = 1.    ## since we shall only pad the images
    
    anchor_w = r_new_data['width'][i] * x_ratio
    anchor_h = r_new_data['height'][i] * y_ratio
    b_box_wrt_target[i, 0] = anchor_w
    b_box_wrt_target[i, 1] = anchor_h
    
anchors_wrt_target = kmeans(b_box_wrt_target, num_anchors)

print(anchors_wrt_target.shape)
print(anchors_wrt_target)     ## anchors wrt target image in abs. value and in format width, height


(3, 2)
[[224.01368301  31.88382412]
 [224.44697834  69.89915288]
 [206.94184721 133.62827822]]


In [36]:
nchors_wrt_target = [224.01368301, 31.88382412, 224.44697834, 69.89915288, 206.94184721, 133.62827822]
anchors_wrt_target = np.reshape(anchors_wrt_target, [3,2])
anchors_wrt_target

array([[224.01368301,  31.88382412],
       [224.44697834,  69.89915288],
       [206.94184721, 133.62827822]])

In [43]:
## Pre-processing the original data to get y_true :

def process_box(ori_boxes, ori_img_width, ori_img_height, labels, target_size, class_num, anchors):
    '''
    Generate the y_true label, i.e. the ground truth feature_map.
    params:
        boxes: [N, 5] shape, float32 dtype. `x_min, y_min, x_max, y_mix, mixup_weight`.
        labels: [N] shape, int64 dtype.
        class_num: int64 num.
        anchors: [3,2] shape, float32 dtype.
    '''
    
    img_width = ori_img_width
    img_height = ori_img_height
    boxes = ori_boxes           ## boxes in format xmin, ymin, xmax, ymax
  
    x_ratio = target_size[1] / img_width
    y_ratio = target_size[0] / img_height
      
    boxes_wrt_target = np.zeros((1,4))
    box_centers_target = np.zeros((1,2))

    boxes_wrt_target[:,0] = boxes[:,0] * x_ratio  # xmin absolute value wrt target image
    boxes_wrt_target[:,1] = boxes[:,1] * y_ratio  # ymin absolute value wrt target image
    boxes_wrt_target[:,2] = boxes[:,2] * x_ratio  # xmax absolute value wrt target image
    boxes_wrt_target[:,3] = boxes[:,3] * y_ratio  # ymax absolute value wrt target image
    
    # In above, boxes_wrt_target shape is (5, 4), now this will be taken to (5. 5) by adding 1 at end
#    boxes_wrt_target = np.concatenate((boxes_wrt_target, np.full(shape=(boxes_wrt_target.shape[0], 1), fill_value=1., dtype=np.float32)), axis=-1)
    box_centers_target = (boxes_wrt_target[:, 0:2] + boxes_wrt_target[:, 2:4]) / 2  ## centers wrt target, abs values
    
    box_sizes = boxes[:, 2:4] - boxes[:, 0:2]  #xmax-xmin = width and ymax-ymin = height wrt original image
    box_sizes[:,0] = box_sizes[:,0] * x_ratio  # width w.r.t target image in absolute value
    box_sizes[:,1] = box_sizes[:,1] * y_ratio  # width w.r.t target image in absolute value
    
#    y_true_13 = np.zeros((target_size[1] // 32, target_size[0] // 32, 3, 6 + class_num), np.float32)
    y_true_19 = np.zeros((target_size[1] // 32, target_size[0] // 32, 3, 5 + class_num), np.float32)
    y_true_38 = np.zeros((target_size[1] // 16, target_size[0] // 16, 3, 5 + class_num), np.float32)
    y_true_76 = np.zeros((target_size[1] // 8, target_size[0] // 8, 3, 5 + class_num), np.float32)

#    y_true = [y_true_13]
    
    box_sizes = np.expand_dims(box_sizes, 1)
    mins = np.maximum(- box_sizes / 2, - anchors / 2)
    maxs = np.minimum(box_sizes / 2, anchors / 2)
    whs = maxs - mins

    iou = (whs[:, :, 0] * whs[:, :, 1]) / (
                box_sizes[:, :, 0] * box_sizes[:, :, 1] + anchors[:, 0] * anchors[:, 1] - whs[:, :, 0] * whs[:, :,
                                                                                                         1] + 1e-10)
    best_match_idx = np.argmax(iou, axis=1)
    
#    print(best_match_idx)

#    anchor_mask = np.zeros((target_size[1] // 32, target_size[0] // 32, 3))

    grid_stride = [32, 16, 8]  ## = targetsize / no. of grid cellS
    
    
    
    for grid in grid_stride:
    
        for i, idx in enumerate(best_match_idx):

            x = int(np.floor(box_centers_target[i, 0] / grid))
            y = int(np.floor(box_centers_target[i, 1] / grid))
            k = int(idx)
            c = int(labels[i])

            print(x, y, k, c)
        

# Very Imp : Now preparing y_true: all values x_center, y_cemter, width & height are being taken to % of target image
            if grid == 32:
                y_true_19[y, x, k, :2] = box_centers_target[i] / target_size[0] #since target_size[0] = target_size[1]
                y_true_19[y, x, k, 2:4] = box_sizes[i] / target_size[0]
                y_true_19[y, x, k, 4] = 1.
                y_true_19[y, x, k, 5 + c] = 1.
            
            if grid == 16:
                y_true_38[y, x, k, :2] = box_centers_target[i] / target_size[0] #since target_size[0] = target_size[1]
                y_true_38[y, x, k, 2:4] = box_sizes[i] / target_size[0]
                y_true_38[y, x, k, 4] = 1.
                y_true_38[y, x, k, 5 + c] = 1.
                
            if grid == 8:
                y_true_76[y, x, k, :2] = box_centers_target[i] / target_size[0] #since target_size[0] = target_size[1]
                y_true_76[y, x, k, 2:4] = box_sizes[i] / target_size[0]
                y_true_76[y, x, k, 4] = 1.
                y_true_76[y, x, k, 5 + c] = 1.
                        
#        print(y_true_13[y,x])

    return y_true_19, y_true_38, y_true_76      ## all data are w.r.to target image in % of target image and NOT w,r,t, grid cells


In [42]:
## Single image-wise image/boundary box preprocessing:

def parse_data(line, class_num, target_size, anchors):   ## (mode, letterbox_resize):
    '''
    param:
        line: a line from the training/test txt file
        class_num: totol class nums.
        target_size: the size of image to be resized to. [width, height] format.
        anchors: anchors.
        mode: 'train' or 'val'. When set to 'train', data_augmentation will be applied.
        letterbox_resize: whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image.
    '''
    
    img_idx, pic_path, boxes, labels,img_width, img_height = line  # boxes in format xmin, ymin, xmax, ymax
    img = cv2.imread(pic_path)
    img_resized = cv2.resize(img,(target_size[0], target_size[1]))
    
#    img_resized = unsharp_mask(img_resized)
    
#    img_resized, pad_top, pad_bottom = pad_images_to_same_size(img)
    
#    image_height = img_height + pad_top + pad_bottom
    
    # expand the 2nd dimension, mix up weight default to 1.
#    boxes = np.concatenate((boxes, np.full(shape=(boxes.shape[0], 1), fill_value=1., dtype=np.float32)), axis=-1)

    img_resized = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB).astype(np.float32)

    # the input of yolo_v3 should be in range 0~1, lets change to -0.5 to +0.5
    img_resized = (img_resized - 127.5)/ 255.

    y_true_19, y_true_38, y_true_76 = process_box(boxes, img_width, img_height, labels, target_size, class_num, anchors)

    return img_idx, img_resized, y_true_19, y_true_38, y_true_76


In [44]:
## Making the data ready for entering into network :

anchors = anchors_wrt_target
image_index = []
image_resized = []
image_y_true_19 = []
image_y_true_38 = []
image_y_true_76 = []

for i in range(len(data_train)):

    line = data_train[i]
    
    img_idx, img_resized, y_true_19, y_true_38, y_true_76 = parse_data(line, class_num, target_size, anchors)
    
    
    image_index.append(img_idx)
    image_resized.append(img_resized)
    image_y_true_19.append(y_true_19)
    image_y_true_38.append(y_true_38)
    image_y_true_76.append(y_true_76)
    
    print('train img_index. = ', img_idx)
    print('train sl. no. = ', i)
    print('=============================')
train_image_index = image_index
X_train = np.array(image_resized).astype(np.float32)
Y_train_19 = np.array(image_y_true_19).astype(np.float32)
Y_train_38 = np.array(image_y_true_38).astype(np.float32)
Y_train_76 = np.array(image_y_true_76).astype(np.float32)


image_index = []
image_resized = []
image_y_true = []
image_anchor_mask = []

for i in range(len(data_val)):
    line = data_val[i]
    
    
    img_idx, img_resized, y_true_19, y_true_38, y_true_76 = parse_data(line, class_num, target_size, anchors)
    
    image_index.append(img_idx)
    image_resized.append(img_resized)
    image_y_true_19.append(y_true_19)
    image_y_true_38.append(y_true_38)
    image_y_true_76.append(y_true_76)
    
    print('val img_index. = ', img_idx)
    print('val sl. no. = ', i)
    print('=============================')
    
val_image_index = image_index
X_val = np.array(image_resized).astype(np.float32)
Y_val_19 = np.array(image_y_true_19).astype(np.float32)
Y_val_38 = np.array(image_y_true_38).astype(np.float32)
Y_val_76 = np.array(image_y_true_76).astype(np.float32)


image_index = []
image_resized = []
image_y_true = []


14 11 2 0
29 23 2 0
58 47 2 0
train img_index. =  30
train sl. no. =  0
14 13 0 0
28 27 0 0
56 55 0 0
train img_index. =  426
train sl. no. =  1
14 15 0 0
29 30 0 0
59 60 0 0
train img_index. =  276
train sl. no. =  2
14 14 1 0
29 28 1 0
59 56 1 0
train img_index. =  252
train sl. no. =  3
15 15 0 0
30 30 0 0
60 60 0 0
train img_index. =  629
train sl. no. =  4
15 14 1 0
30 29 1 0
60 58 1 0
train img_index. =  154
train sl. no. =  5
15 14 1 0
31 28 1 0
62 56 1 0
train img_index. =  612
train sl. no. =  6
14 12 1 0
29 25 1 0
58 50 1 0
train img_index. =  164
train sl. no. =  7
13 14 0 0
27 28 0 0
55 56 0 0
train img_index. =  421
train sl. no. =  8
14 12 2 0
29 25 2 0
58 50 2 0
train img_index. =  186
train sl. no. =  9
14 14 1 0
28 29 1 0
57 58 1 0
train img_index. =  433
train sl. no. =  10
14 14 0 0
28 29 0 0
57 59 0 0
train img_index. =  349
train sl. no. =  11
14 12 2 0
28 24 2 0
57 49 2 0
train img_index. =  63
train sl. no. =  12
14 11 2 0
28 23 2 0
56 47 2 0
train img_index. =  

5 15 1 0
11 31 1 0
22 62 1 0
train img_index. =  488
train sl. no. =  82
14 10 2 0
29 21 2 0
58 42 2 0
train img_index. =  177
train sl. no. =  83
14 15 0 0
29 30 0 0
59 60 0 0
train img_index. =  372
train sl. no. =  84
14 14 0 0
29 29 0 0
59 58 0 0
train img_index. =  409
train sl. no. =  85
15 15 0 0
30 30 0 0
60 60 0 0
train img_index. =  273
train sl. no. =  86
14 14 1 0
29 28 1 0
58 57 1 0
train img_index. =  392
train sl. no. =  87
14 13 1 0
29 26 1 0
58 52 1 0
train img_index. =  91
train sl. no. =  88
14 13 2 0
29 27 2 0
59 54 2 0
train img_index. =  452
train sl. no. =  89
15 14 1 0
31 29 1 0
63 58 1 0
train img_index. =  476
train sl. no. =  90
14 15 0 0
29 30 0 0
59 60 0 0
train img_index. =  220
train sl. no. =  91
14 13 2 0
28 27 2 0
56 54 2 0
train img_index. =  363
train sl. no. =  92
4 13 2 0
9 27 2 0
18 54 2 0
train img_index. =  300
train sl. no. =  93
14 12 1 0
29 25 1 0
58 51 1 0
train img_index. =  139
train sl. no. =  94
14 14 1 0
28 29 1 0
56 58 1 0
train img_in

14 13 0 0
29 26 0 0
58 53 0 0
train img_index. =  142
train sl. no. =  162
5 15 1 0
11 30 1 0
22 61 1 0
train img_index. =  533
train sl. no. =  163
14 14 1 0
28 29 1 0
57 59 1 0
train img_index. =  613
train sl. no. =  164
14 13 1 0
28 27 1 0
57 55 1 0
train img_index. =  403
train sl. no. =  165
4 13 2 0
9 26 2 0
19 53 2 0
train img_index. =  530
train sl. no. =  166
13 12 1 0
26 24 1 0
52 48 1 0
train img_index. =  86
train sl. no. =  167
14 13 2 0
28 26 2 0
56 52 2 0
train img_index. =  622
train sl. no. =  168
14 13 1 0
28 27 1 0
57 54 1 0
train img_index. =  436
train sl. no. =  169
14 11 2 0
29 23 2 0
58 47 2 0
train img_index. =  230
train sl. no. =  170
5 13 2 0
11 27 2 0
23 55 2 0
train img_index. =  483
train sl. no. =  171
16 15 0 0
32 30 0 0
65 61 0 0
train img_index. =  13
train sl. no. =  172
14 13 2 0
29 26 2 0
59 53 2 0
train img_index. =  214
train sl. no. =  173
14 14 1 0
29 29 1 0
58 59 1 0
train img_index. =  260
train sl. no. =  174
14 15 0 0
29 30 0 0
58 60 0 0
t

14 13 2 0
29 27 2 0
59 54 2 0
train img_index. =  453
train sl. no. =  242
15 14 0 0
31 29 0 0
63 59 0 0
train img_index. =  217
train sl. no. =  243
15 15 0 0
30 30 0 0
60 61 0 0
train img_index. =  575
train sl. no. =  244
14 12 2 0
29 24 2 0
58 48 2 0
train img_index. =  184
train sl. no. =  245
14 11 2 0
28 23 2 0
57 46 2 0
train img_index. =  50
train sl. no. =  246
14 15 0 0
29 30 0 0
58 60 0 0
train img_index. =  368
train sl. no. =  247
14 14 1 0
29 29 1 0
59 58 1 0
train img_index. =  227
train sl. no. =  248
14 14 0 0
28 28 0 0
56 57 0 0
train img_index. =  603
train sl. no. =  249
14 15 0 0
29 30 0 0
59 61 0 0
train img_index. =  536
train sl. no. =  250
13 14 0 0
26 29 0 0
53 58 0 0
train img_index. =  447
train sl. no. =  251
14 11 2 0
28 23 2 0
57 47 2 0
train img_index. =  558
train sl. no. =  252
14 13 0 0
28 26 0 0
56 53 0 0
train img_index. =  83
train sl. no. =  253
15 14 0 0
30 29 0 0
61 59 0 0
train img_index. =  628
train sl. no. =  254
14 15 0 0
28 30 0 0
57 60 0

14 14 1 0
28 29 1 0
56 58 1 0
train img_index. =  456
train sl. no. =  321
14 14 0 0
29 29 0 0
58 58 0 0
train img_index. =  543
train sl. no. =  322
14 15 0 0
29 30 0 0
59 60 0 0
train img_index. =  165
train sl. no. =  323
14 13 1 0
28 26 1 0
56 52 1 0
train img_index. =  75
train sl. no. =  324
14 14 2 0
29 28 2 0
58 56 2 0
train img_index. =  386
train sl. no. =  325
15 13 1 0
30 27 1 0
60 54 1 0
train img_index. =  89
train sl. no. =  326
14 11 2 0
29 23 2 0
58 47 2 0
train img_index. =  84
train sl. no. =  327
15 14 0 0
30 29 0 0
61 59 0 0
train img_index. =  308
train sl. no. =  328
14 12 1 0
29 25 1 0
58 50 1 0
train img_index. =  115
train sl. no. =  329
14 13 0 0
28 26 0 0
56 53 0 0
train img_index. =  87
train sl. no. =  330
13 12 2 0
27 25 2 0
54 51 2 0
train img_index. =  233
train sl. no. =  331
15 15 0 0
31 30 0 0
62 60 0 0
train img_index. =  521
train sl. no. =  332
14 14 1 0
29 29 1 0
59 59 1 0
train img_index. =  472
train sl. no. =  333
14 11 2 0
29 23 2 0
58 47 2 0

14 14 0 0
29 29 0 0
58 58 0 0
train img_index. =  600
train sl. no. =  401
15 15 0 0
31 30 0 0
62 60 0 0
train img_index. =  303
train sl. no. =  402
5 15 1 0
11 30 1 0
22 60 1 0
train img_index. =  529
train sl. no. =  403
14 13 2 0
29 27 2 0
58 55 2 0
train img_index. =  246
train sl. no. =  404
14 15 0 0
29 30 0 0
58 60 0 0
train img_index. =  285
train sl. no. =  405
14 14 0 0
29 28 0 0
59 57 0 0
train img_index. =  542
train sl. no. =  406
13 12 2 0
27 24 2 0
54 49 2 0
train img_index. =  106
train sl. no. =  407
14 12 1 0
28 25 1 0
56 50 1 0
train img_index. =  130
train sl. no. =  408
14 12 2 0
28 24 2 0
56 49 2 0
train img_index. =  150
train sl. no. =  409
14 14 1 0
29 29 1 0
58 58 1 0
train img_index. =  621
train sl. no. =  410
14 11 2 0
29 23 2 0
59 47 2 0
train img_index. =  133
train sl. no. =  411
14 14 1 0
29 29 1 0
59 58 1 0
train img_index. =  381
train sl. no. =  412
14 13 2 0
28 26 2 0
57 53 2 0
train img_index. =  12
train sl. no. =  413
4 15 1 0
9 30 1 0
19 60 1 0

14 14 0 0
29 29 0 0
59 58 0 0
train img_index. =  360
train sl. no. =  481
14 12 1 0
29 25 1 0
58 50 1 0
train img_index. =  70
train sl. no. =  482
14 12 2 0
29 25 2 0
59 50 2 0
train img_index. =  4
train sl. no. =  483
14 13 1 0
29 26 1 0
58 52 1 0
train img_index. =  114
train sl. no. =  484
14 12 1 0
28 24 1 0
56 49 1 0
train img_index. =  131
train sl. no. =  485
4 15 1 0
9 30 1 0
18 60 1 0
train img_index. =  518
train sl. no. =  486
14 13 2 0
28 26 2 0
57 52 2 0
train img_index. =  348
train sl. no. =  487
14 13 0 0
29 26 0 0
58 53 0 0
train img_index. =  185
train sl. no. =  488
14 12 2 0
28 25 2 0
57 50 2 0
train img_index. =  619
train sl. no. =  489
14 12 1 0
29 25 1 0
58 50 1 0
train img_index. =  116
train sl. no. =  490
13 14 1 0
27 28 1 0
55 56 1 0
train img_index. =  321
train sl. no. =  491
14 12 2 0
29 24 2 0
58 48 2 0
train img_index. =  191
train sl. no. =  492
14 14 1 0
28 29 1 0
56 58 1 0
train img_index. =  282
train sl. no. =  493
14 14 1 0
28 28 1 0
57 57 1 0


5 15 1 0
11 30 1 0
23 61 1 0
train img_index. =  513
train sl. no. =  561
14 13 2 0
29 27 2 0
58 54 2 0
train img_index. =  319
train sl. no. =  562
15 14 0 0
30 29 0 0
61 59 0 0
train img_index. =  288
train sl. no. =  563
6 15 0 0
12 31 0 0
24 63 0 0
train img_index. =  494
train sl. no. =  564
15 14 1 0
31 29 1 0
62 58 1 0
train img_index. =  618
train sl. no. =  565
15 13 2 0
31 27 2 0
63 55 2 0
train img_index. =  26
train sl. no. =  566
14 13 1 0
28 27 1 0
56 54 1 0
train img_index. =  445
train sl. no. =  567
14 14 0 0
28 29 0 0
57 58 0 0
train img_index. =  428
train sl. no. =  568
4 13 2 0
8 26 2 0
16 53 2 0
train img_index. =  480
train sl. no. =  569


MemoryError: Unable to allocate 2.35 GiB for an array with shape (570, 608, 608, 3) and data type float32

In [45]:
def xywh_to_x1y1x2y2(box):
    xy = box[..., 0:2]
    wh = box[..., 2:4]

    x1y1 = xy - wh / 2
    x2y2 = xy + wh / 2

    y_box = K.concatenate([x1y1, x2y2], axis=-1)
    return y_box

def broadcast_iou(box_a, box_b):
    """
    calculate iou between box_a and multiple box_b in a broadcast way
    inputs: box_a: a tensor full of boxes, eg. (B, N, 4), box is in x1y1x2y2
            box_b: another tensor full of boxes, eg. (B, M, 4)
    """

    # (B, N, 1, 4)
    box_a = tf.expand_dims(box_a, -2)
    # (B, 1, M, 4)
    box_b = tf.expand_dims(box_b, -3)
    # (B, N, M, 4)
    new_shape = tf.broadcast_dynamic_shape(tf.shape(box_a), tf.shape(box_b))

    # (B, N, M, 4)
    # (B, N, M, 4)
    box_a = tf.broadcast_to(box_a, new_shape)
    box_b = tf.broadcast_to(box_b, new_shape)

    # (B, N, M, 1)
    al, at, ar, ab = tf.split(box_a, 4, -1)
    bl, bt, br, bb = tf.split(box_b, 4, -1)

    # (B, N, M, 1)
    left = tf.math.maximum(al, bl)
    right = tf.math.minimum(ar, br)
    top = tf.math.maximum(at, bt)
    bot = tf.math.minimum(ab, bb)

    # (B, N, M, 1)
    iw = tf.clip_by_value(right - left, 0, 1)
    ih = tf.clip_by_value(bot - top, 0, 1)
    i = iw * ih

    # (B, N, M, 1)
    area_a = (ar - al) * (ab - at)
    area_b = (br - bl) * (bb - bt)
    union = area_a + area_b - i

    # (B, N, M)
    iou = tf.squeeze(i / (union + 1e-7), axis=-1)

    return iou

## https://github.com/ethanyanjiali/deep-vision/blob/master/YOLO/tensorflow/yolov3.py#L213

def calc_ignore_mask(ignore_thresh, true_box, pred_box):
    
        # YOLOv3:
        # "If the bounding box prior is not the best but does overlap a ground
        # truth object by more than some threshold we ignore the prediction,
        # following [17]. We use the threshold of .5."
        # calculate the iou for each pair of pred bbox and true bbox, then find the best among them

        # (None, 13, 13, 3, 4)
        
        true_box_reorganised = xywh_to_x1y1x2y2(true_box)  # reorganised to x1, y1, x2, y2
        pred_box_reorganised = xywh_to_x1y1x2y2(pred_box)
        
        true_box_shape = tf.shape(true_box_reorganised)  
        # (None, 13, 13, 3, 4)
        pred_box_shape = tf.shape(pred_box_reorganised)  
        # (None, 507, 4)
        true_box_reorganised = tf.reshape(true_box_reorganised, [true_box_shape[0], -1, 4])
        # sort true_box to have non-zero boxes rank first
        true_box_reorganised = tf.sort(true_box_reorganised, axis=1, direction="DESCENDING")
        # (None, 100, 4)
        # only use maximum 100 boxes per groundtruth to calcualte IOU, otherwise
        # GPU emory comsumption would explode for a matrix like (16, 52*52*3, 52*52*3, 4)
        true_box_reorganised = true_box_reorganised[:, 0:100, :]
        # (None, 507, 4)
        pred_box_reorganised = tf.reshape(pred_box_reorganised, [pred_box_shape[0], -1, 4])

        # https://github.com/dmlc/gluon-cv/blob/06bb7ec2044cdf3f433721be9362ab84b02c5a90/gluoncv/model_zoo/yolo/yolo_target.py#L198
        # (None, 507, 507)
        iou = broadcast_iou(pred_box_reorganised, true_box_reorganised)
        # (None, 507)
        best_iou = tf.reduce_max(iou, axis=-1)
        # (None, 13, 13, 3)
        best_iou = tf.reshape(best_iou, [pred_box_shape[0], pred_box_shape[1], pred_box_shape[2], pred_box_shape[3]])
        # ignore_mask = 1 => don't ignore
        # ignore_mask = 0 => should ignore
        ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32)
        # (None, 13, 13, 3, 1)
        ignore_mask = tf.expand_dims(ignore_mask, axis=-1)
        
        return ignore_mask

In [46]:
anchors = np.reshape(anchors_wrt_target, [1,1,1,3,2])

In [47]:
## made on 16/7/2020 at 8:48 pm

from functools import partial

# anchors = np.reshape(anchors_wrt_target, [1,1,1,3,2])

def my_custom_loss(y_true, y_pred):
    
    def pre_loss(my_custom_loss, anchors):
        
        num_anchors = len(anchors)
        num_classes = 1
        ignore_thresh = 0.5
        grid_size = [25., 25.]
        grid_stride = 800. / grid_size[0]
        batch_shape = y_pred.get_shape()
#        batch_size = batch_shape[0]
        batch_size = 8.
    
        scaled_anchors = anchors / grid_stride
    
        Lambda_Coord = 5.0
        Lambda_no_obj = 0.5
    
        grid_x = np.arange(grid_size[1])
        grid_y = np.arange(grid_size[0])
    
        a = np.array(np.meshgrid(grid_x, grid_y))
        b = np.array(np.meshgrid(grid_x, grid_y))
        c = np.array(np.meshgrid(grid_x, grid_y))
        d = np.concatenate((a,b,c), axis = 0)
        e = d.transpose(2, 1, 0)
        grid_final = np.reshape(e,[1,25,25,3,2])
        grid_final[..., [0,1]] = grid_final[..., [1,0]]
    
        tot_loss = tf.zeros(1, dtype='float32')

        obj_mask = y_true[..., 4:5]

## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 

        pred_box_xy = K.sigmoid(y_pred[..., :2]) + grid_final  # this gives x & y in no. of cells. x & y w.r.t. target
                                                               # image = (x & y in no. of cells) / no. of cells
        pred_box_xy_wrt_target_image = (pred_box_xy * grid_stride) / 608.
        true_box_xy_wrt_target_image = y_true[..., :2]

        xy_arr = Lambda_Coord * K.square(true_box_xy_wrt_target_image - pred_box_xy_wrt_target_image)
    
        xy_loss = K.sum(xy_arr * obj_mask) / batch_size
    
## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
                       
        pred_box_wdht = K.exp(y_pred[..., 2:4]) * (anchors / 608.)
    
        true_box_wdht = y_true[..., 2:4]
    
        wh_arr = Lambda_Coord * K.square(true_box_wdht - pred_box_wdht)
    
        wh_loss = K.sum(wh_arr * obj_mask) / batch_size
    
## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
        
        pred_obj_mask = K.sigmoid(y_pred[..., 4:5])  # shape = 28, 17, 17, 3, 1
       
        true_box_wrt_ti = K.concatenate([true_box_xy_wrt_target_image, true_box_wdht], axis = -1)  ## in x,y,w,h format
        pred_box_wrt_ti = K.concatenate([pred_box_xy_wrt_target_image, pred_box_wdht], axis = -1)  ## in x,y,w,h format
    
        ignore_mask = calc_ignore_mask(ignore_thresh, true_box_wrt_ti, pred_box_wrt_ti)
        
#        bce = tf.keras.losses.BinaryCrossentropy()        
#        obj_loss = K.sum(bce(obj_mask, pred_obj_mask) * obj_mask)

        obj_loss_arr = K.square(obj_mask - pred_obj_mask)
        obj_loss = K.sum(obj_loss_arr * obj_mask) / batch_size
    
        no_obj_mask = 1. - obj_mask
                
        noobj_loss_arr = Lambda_no_obj * K.square(obj_mask - pred_obj_mask)
        noobj_loss = K.sum(noobj_loss_arr * no_obj_mask * ignore_mask) / batch_size
        
        
## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

        true_classes = y_true[..., 5:6]
    
        pred_classes = K.sigmoid(y_pred[..., 5:6])
        
#        cce = tf.keras.losses.CategoricalCrossentropy()
    
#        class_loss = K.sum(bce(true_classes, pred_classes) * obj_mask)

        class_loss_arr = K.square(true_classes - pred_classes)
        class_loss = K.sum(class_loss_arr * obj_mask) / batch_size
        
#        tot_loss = xy_loss + wh_loss + obj_loss + noobj_loss + class_loss

        tot_loss = xy_loss + wh_loss + obj_loss + noobj_loss + class_loss

        
        return tot_loss
    
    loss = pre_loss(my_custom_loss, anchors)
    
    return loss
    
