In [1]:
cd models/research/object_detection

/home/scar3crow/Dropbox/WorkStation-Subrata/python/models/research/object_detection


In [281]:
import numpy as np
import pandas as pd
import cv2
import os
import tqdm
from scipy.io import loadmat

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from PIL import Image
import pytesseract

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras import backend as K

from utils import *

from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.layers import *

from keras.applications import MobileNetV2
from keras.applications import InceptionResNetV2

from keras.models import Model
from keras.models import model_from_json


In [282]:
target_size = [208, 208]
target_w = 208 # target sizes of image in model input
target_h = 208 #target sizes of image in model input

grid_y_axis = 13  # each image is to be segmented to 13 x 13 grid
grid_x_axis = 13  # # each image is to be segmented to 13 x 13 grid

grid_w = target_w / grid_x_axis  # grid cell width
grid_h = target_h / grid_y_axis  # grid cell height

channels = 3
num_anchors = 2
classes = 5 # vendor, invoice, inv_date, po, buyer
info = 5 + classes    # pc, x, y, h, w, and class probabilities

categories = ['vendor', 'invoice', 'inv_date', 'po', 'buyer'] # details of classes

In [283]:
# Making a list of image path

inv_directory = '/home/scar3crow/Downloads/8-6-new-scan'  ## 'invoices' is a zip file of jpg images in ...../Downloads 
                                                        
inv_new_image = ['/home/scar3crow/Downloads/8-6-new-scan/{}'.format(i) for i in os.listdir(inv_directory)] # making the list
inv_new_image.sort() # Sorting the list

print('Number of images = ', len(inv_new_image))
inv_new_image[20]

Number of images =  36


'/home/scar3crow/Downloads/8-6-new-scan/121a.jpg'

In [284]:
# Check sizes of exiting images & Create a Dataframe with image id and height(row) and width(column):

rows = []
columns = []
image_sl = []
df_new = pd.DataFrame()

for i in range(len(inv_new_image)):
    image = cv2.imread(inv_new_image[i]) ## Loading image
    height, width, _ = image.shape
    rows.append(height)
    columns.append(width)
    image_sl.append(inv_new_image[i])
    
row_values = pd.Series(rows)
col_values = pd.Series(columns)
image_num = pd.Series(image_sl)


df_new.insert(loc=0, column='image_serial', value=image_num)
df_new.insert(loc=1, column='rows', value=row_values)
df_new.insert(loc=2, column='columns', value=col_values)

df_new.head(3)

Unnamed: 0,image_serial,rows,columns
0,/home/scar3crow/Downloads/8-6-new-scan/101a.jpg,160,416
1,/home/scar3crow/Downloads/8-6-new-scan/102a.jpg,406,870
2,/home/scar3crow/Downloads/8-6-new-scan/103a.jpg,260,416


In [285]:
# Loading output of VGG Image Annotation tool and create a dataframe

r_new_data = pd.read_csv('/home/scar3crow/Downloads/via_new_data.csv')
num_obj = r_new_data['region_count'][0] # number of objects in each photo
r_new_data.drop(r_new_data.columns[[1, 2, 3, 4]], axis=1, inplace=True) # reduce unnecessary columns
r_new_data.sort_values(by=['#filename'], ascending=True) # Sorting based on image-id
num_images = r_new_data["#filename"].nunique() # Find out number of unique images

print('Number of classes = ', num_obj)
print('Number of unique images = ', num_images)
r_new_data[58:61]

Number of classes =  5
Number of unique images =  36


Unnamed: 0,#filename,region_shape_attributes,region_attributes
58,63a.jpg,"{""name"":""rect"",""x"":211,""y"":64,""width"":76,""heig...","{""text"":""po""}"
59,63a.jpg,"{""name"":""rect"",""x"":2,""y"":68,""width"":165,""heigh...","{""text"":""buyer""}"
60,101a.jpg,"{""name"":""rect"",""x"":6,""y"":23,""width"":119,""heigh...","{""text"":""vendor""}"


In [1]:
# Making a dataframe for Image_id, x, y, width, height, class, image_width and image_height

x = []
y = []
width = []
height = []
obj_class = []
i_width = []
i_height = []



for i in range(len(r_new_data)):
    
    r_size = r_new_data.values[i, 1][1:(len(r_new_data.values[i, 1])-1)]
    r_size_par = r_size.split(",")
    
    x.append(int("".join(filter(str.isdigit, r_size_par[1]))))
    y.append(int("".join(filter(str.isdigit, r_size_par[2]))))
    width.append(int("".join(filter(str.isdigit, r_size_par[3]))))
    height.append(int("".join(filter(str.isdigit, r_size_par[4]))))
    
    r_attribs = r_new_data.values[i, 2][1:(len(r_new_data.values[i, 2])-1)]
    r_attribs_par = r_attribs.split(':')[1]
    obj_class.append(r_attribs_par[1:(len(r_attribs_par)-1)])
    
    foto_id = r_new_data['#filename'][i]
    img_path = '/home/scar3crow/Downloads/8-6-new-scan/' + foto_id
    foto_index = df_new.index[df_new['image_serial'] == img_path]
    foto_width = df_new['columns'][foto_index].tolist()
    foto_height = df_new['rows'][foto_index].tolist()
    i_width.append(foto_width[0])
    i_height.append(foto_height[0])
    img_path.append(i_path)
    
x_values = pd.Series(x)
y_values = pd.Series(y)
width_values = pd.Series(width)
height_values = pd.Series(height)
class_values = pd.Series(obj_class)
i_width_values = pd.Series(i_width)
i_height_values = pd.Series(i_height)



r_new_data.insert(loc=1, column='x', value=x_values)
r_new_data.insert(loc=2, column='y', value=y_values)
r_new_data.insert(loc=3, column='width', value=width_values)
r_new_data.insert(loc=4, column='height', value=height_values)
r_new_data.insert(loc=5, column='obj_class', value=class_values)
r_new_data.insert(loc=6, column='img_wd', value=i_width_values)
r_new_data.insert(loc=7, column='img_ht', value=i_height_values)



r_new_data.drop(r_new_data.columns[[8, 9]], axis=1, inplace=True) # reduce unnecessary columns

r_new_data.rename({'#filename': 'img_id'}, axis=1, inplace=True) # changing column name

r_new_data[3:6]

NameError: name 'r_new_data' is not defined

In [287]:
print('Number of unique images = ', r_new_data['img_id'].nunique())  # print total no, of unique images

print('Number of classes in diff. categories = ', r_new_data['obj_class'].value_counts())

Number of unique images =  36
Number of classes in diff. categories =  buyer      38
date       36
vendor     36
invoice    36
po         33
order       1
Name: obj_class, dtype: int64


In [288]:
# We have to correct above :

# To find smallest width & height boxes in 'buyer' which should be 'po'
gb = r_new_data.groupby('obj_class')    
[gb.get_group('buyer') for x in gb.groups]


[       img_id    x    y  width  height obj_class  image_width  image_height
 4     50a.jpg    5   57    206      56     buyer          416           209
 9     51a.jpg    4   53    152      64     buyer          416           194
 14    52a.jpg    1   50    161      74     buyer          416           188
 19    53a.jpg    0   50    177      76     buyer          416           194
 24    54a.jpg   31  103    186      61     buyer          416           168
 29    55a.jpg    1   56    183      74     buyer          416           144
 34    56a.jpg    1   56    166      62     buyer          416           123
 39    59a.jpg    3   58    175      62     buyer          416           200
 44    60a.jpg    0   44    165      52     buyer          416           106
 49    61a.jpg    1   56    155      63     buyer          416           121
 54    62a.jpg    4   58    163      61     buyer          416           123
 59    63a.jpg    2   68    165      55     buyer          416           191

In [289]:
# Correcting above wrong spelling & converting buyer to po of object classes and rechecking

id_1 = r_new_data.index[r_new_data['obj_class'] == 'order'] # Finding the index
id_2 = r_new_data.index[r_new_data['obj_class'] == 'date'] # to change 'date' to 'inv_date' to be consistent with old data

r_new_data.at[id_1, 'obj_class'] = 'po' # writing the correct spelling 
r_new_data.at[88, 'obj_class'] = 'po' # # 'buyer' to 'po'
r_new_data.at[163, 'obj_class'] = 'po' # # 'buyer' to 'po'
r_new_data.at[id_2, 'obj_class'] = 'inv_date' # # 'date' to 'inv_date'

print('Number of unique images = ', r_new_data['img_id'].nunique())  # print total no, of unique images
print('Number of unique classes = ', r_new_data['obj_class'].nunique())
print('Number of classes in diff. categories = ', r_new_data['obj_class'].value_counts()) 


Number of unique images =  36
Number of unique classes =  5
Number of classes in diff. categories =  vendor      36
buyer       36
po          36
inv_date    36
invoice     36
Name: obj_class, dtype: int64


In [290]:
# Converting categories into one-hot-coding :

categories = ['vendor', 'invoice', 'inv_date', 'po', 'buyer']

values = np.array(categories)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
cat_encoded = onehot_encoder.fit_transform(integer_encoded)
                               
print(cat_encoded)

[[0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]]


In [291]:
## iou based on width and height for the purpose of calculating anchors through k-means :

def iou_kmeans(box, clusters):
    """
    Calculates the Intersection over Union (IoU) between a box and k clusters.
    :param box: tuple or array, shifted to the origin (i. e. width and height)
    :param clusters: numpy array of shape (k, 2) where k is the number of clusters
    :return: numpy array of shape (k, 0) where k is the number of clusters
    """
    x = np.minimum(clusters[:, 0], box[0])
    y = np.minimum(clusters[:, 1], box[1])
    if np.count_nonzero(x == 0) > 0 or np.count_nonzero(y == 0) > 0:
        raise ValueError("Box has no area")

    intersection = x * y
    box_area = box[0] * box[1]
    cluster_area = clusters[:, 0] * clusters[:, 1]

    iou = intersection / (box_area + cluster_area - intersection)

    return iou


In [292]:
## calculating anchors from true boundary boxes :

def kmeans(boxes, k, dist=np.median):
    """
    Calculates k-means clustering with the Intersection over Union (IoU) metric.
    :param boxes: numpy array of shape (r, 2), where r is the number of rows
    :param k: number of clusters
    :param dist: distance function
    :return: numpy array of shape (k, 2)
    """
    rows = boxes.shape[0]

    distances = np.empty((rows, k))
    last_clusters = np.zeros((rows,))

    np.random.seed()

    # the Forgy method will fail if the whole array contains the same rows
    clusters = boxes[np.random.choice(rows, k, replace=False)]


    while True:
        for row in range(rows):
            distances[row] = 1 - iou_kmeans(boxes[row], clusters)

        nearest_clusters = np.argmin(distances, axis=1)

        if (last_clusters == nearest_clusters).all():
            break

        for cluster in range(k):
            clusters[cluster] = dist(boxes[nearest_clusters == cluster], axis=0)

        last_clusters = nearest_clusters

    return clusters

In [293]:
## Finding out anchors :
## Firstly, converting true boundary box width, height to width & height with respect to grid cells :
## and then convert dimensions w.r.t. target dimensions and then devide by cell dimension finalyy dind anchors


num_all_bb = len(r_new_data) # total number of boundary boxes = no. of images * 5

b_box_wrt_cell = np.zeros((num_all_bb,2))

for i in range(num_all_bb):
    
    image_w = r_new_data['image_width'][i]
    image_h = r_new_data['image_height'][i]

    x_ratio = target_w / image_w 
    y_ratio = target_h / image_h
    
    anchor_w = r_new_data['width'][i] * x_ratio / (target_w/grid_x_axis)
    anchor_h = r_new_data['height'][i] * y_ratio / (target_h/grid_y_axis)
    b_box_wrt_cell[i, 0] = anchor_w
    b_box_wrt_cell[i, 1] = anchor_h
    
anchors_wrt_cell = kmeans(b_box_wrt_cell, 2)

print(anchors_wrt_cell.shape)
print(anchors_wrt_cell)


(2, 2)
[[5.125      4.33333333]
 [2.03125    1.54761905]]


In [294]:
## Creating true boundary box array of x, y, w, h, class where x, y, w, h are in the range of [0, 1] indicating
## all with respect to target image size

image_list = r_new_data['img_id'].unique() # make a list of unique images
anchors = anchors_wrt_cell
true_boxes = np.zeros((num_images, num_obj, 5))

for i in range(len(image_list)):
    
    r_new_data_slice = r_new_data.loc[r_new_data['img_id'].isin([image_list[i]])].reset_index(drop=True)

    obj = 0

    for j in range(len(r_new_data_slice)):
    
        image_w = r_new_data_slice['image_width'][j]
        image_h = r_new_data_slice['image_height'][j]
    
        x_ratio = target_size[1] / image_w
        y_ratio = target_size[0] / image_h
    
        xmin = r_new_data_slice['x'][j] * x_ratio
        ymin = r_new_data_slice['y'][j] * y_ratio
        
        xmax = (r_new_data_slice['x'][j] + r_new_data_slice['width'][j]) * x_ratio
        ymax = (r_new_data_slice['y'][j] + r_new_data_slice['height'][j]) * y_ratio
        
        w = (r_new_data_slice['width'][j] * x_ratio) / target_size[1]
        h = (r_new_data_slice['height'][j] * y_ratio) / target_size[0]
    
        x = (xmin + (xmax-xmin)/2) / target_size[1]
        y = (ymin + (ymax-ymin)/2) / target_size[0] 
        
        true_boxes[i, j][0] = x
        true_boxes[i, j][1] = y
        true_boxes[i, j][2] = w
        true_boxes[i, j][3] = h
        true_boxes[i, j][4] = obj
        obj = obj+1

print(true_boxes.shape)


(36, 5, 5)


In [17]:
## Creating y_true for training or here, it is called matching_true_boxes :

def preprocess_true_boxes(true_boxes, anchors, target_size):
    """Find detector in YOLO where ground truth box should appear.
    Parameters
    ----------
    true_boxes : array
        List of ground truth boxes in form of relative x, y, w, h, class.
        Relative coordinates are in the range [0, 1] indicating a percentage
        of the original image dimensions.
    anchors : array
        List of anchors in form of w, h.
        Anchors are assumed to be in the range [0, conv_size] where conv_size
        is the spatial dimension of the final convolutional features.
    image_size : array-like
        List of image dimensions in form of h, w in pixels.
    Returns
    -------
    detectors_mask : array
        0/1 mask for detectors in [conv_height, conv_width, num_anchors, 1]
        that should be compared with a matching ground truth box.
    matching_true_boxes: array
        Same shape as detectors_mask with the corresponding ground truth box
        adjusted for comparison with predicted parameters at training time.
        
    """
    
    height, width = target_size
    num_anchors = len(anchors)
    
    conv_height = height // 16  ## cell dimension is 16 X 16 and no. of cells are 13 X 13
    conv_width = width // 16   ## cell dimension is 16 X 16 and no. of cells are 13 X 13
        
    detectors_mask = np.zeros((conv_height, conv_width, num_anchors, 1), dtype=np.float32)
    matching_true_boxes = np.zeros((conv_height, conv_width, num_anchors, info),dtype=np.float32)

    for box in true_boxes:
        
        # scale box to convolutional feature spatial dimensions
        box_class = int(box[4:5])
        box = box[0:4] * np.array([conv_width, conv_height, conv_width, conv_height])
        
        i = np.floor(box[1]).astype('int')
             
#        j = min(np.floor(box[0]).astype('int'),1)
        j = np.floor(box[0]).astype('int')
                
        best_iou = 0
        best_anchor = 0
                
        for k, anchor in enumerate(anchors):
            # Find IOU between box shifted to origin and anchor box.
            box_maxes = box[2:4] / 2.
            box_mins = -box_maxes
            anchor_maxes = (anchor / 2.)
            anchor_mins = -anchor_maxes

            intersect_mins = np.maximum(box_mins, anchor_mins)
            intersect_maxes = np.minimum(box_maxes, anchor_maxes)
            intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
            intersect_area = intersect_wh[0] * intersect_wh[1]
            box_area = box[2] * box[3]
            anchor_area = anchor[0] * anchor[1]
            iou = intersect_area / (box_area + anchor_area - intersect_area)
            if iou > best_iou:
                best_iou = iou
                best_anchor = k
                
        if best_iou > 0:
            detectors_mask[i, j, best_anchor] = 1
            
            adjusted_box = np.array(
                [   1,
                    box[0] - j, box[1] - i,
                    np.log(box[2] / anchors[best_anchor][0]),
                    np.log(box[3] / anchors[best_anchor][1])
                ],
                dtype=np.float32)
            matching_true_boxes[i, j, best_anchor] = np.hstack((adjusted_box, cat_encoded[box_class]))
    return detectors_mask, matching_true_boxes

In [18]:
# Preparing Input(X) and Target(Y) file for training :

X_final = [] # X_final list to convert to np array later
Y_true_final = [] # Y_final list to convert to np array later
Y_mask_final = []

image_list = r_new_data['img_id'].unique() # make a list of unique images

for i in image_list:
    
    image_path = '/home/scar3crow/Downloads/8-6-new-scan/' + i
    
    x = cv2.imread(image_path)
    x_ratio = target_w / x.shape[1]
    y_ratio = target_h / x.shape[0]
    img = cv2.resize(x,(target_w, target_h))
    
    X_final.append(img)
    
Y_true = np.zeros((grid_y_axis,grid_x_axis,num_anchors,info))
Y_mask = np.zeros((grid_y_axis,grid_x_axis,num_anchors,1))

for j in range(len(image_list)):
    
    Y_mask, Y_true = preprocess_true_boxes(true_boxes[j], anchors_wrt_cell, target_size)
    
    Y_true_final.append(Y_true)
    Y_mask_final.append(Y_mask)
    

X = np.array(X_final) 
#X_final = []
Y_true_target = np.array(Y_true_final)
Y_mask_target = np.array(Y_mask_final)
#Y_final = []

X = (X - 127.5)/127.5  # X normalising since pixels vary from 0 to 255
    
# np.save('/home/scar3crow/Downloads/Data1/X_short.npy',X)
# np.save('/home/scar3crow/Downloads/Data1/Y_short.npy',Y)


In [20]:
X.shape

(36, 208, 208, 3)

In [19]:
Y_true_target.shape

(36, 13, 13, 2, 10)

In [20]:
Y_mask_target.shape

(36, 13, 13, 2, 1)

In [23]:
XX = X
YY = Y_true_target
ZZ = Y_mask_target

X_train , X_val , Y_train , Y_val  = train_test_split(XX, YY,train_size = 0.8 , shuffle = True)




In [24]:
def my_model(input_shape):
    
    
    inp = Input(input_shape)
   
    model = InceptionResNetV2( input_tensor= inp , include_top=False, weights='imagenet')
    last_layer = model.output
    
    last_3 = Flatten()(last_layer)
    last_2 = Dense(640, activation = 'relu')(last_3)
    last_1 = Dense(320, activation = 'relu')(last_2)
    last_0 = Dense(5, activation = 'softmax')(last_1)
        
    model = Model(inp,last_0)
    
    return model


In [25]:
input_size = (target_h,target_w,3)

my_invoice_yolo_model = my_model(input_size)

print(my_invoice_yolo_model.summary())




Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 208, 208, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 103, 103, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 103, 103, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 103, 103, 32) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________

In [26]:
model_yolo = my_invoice_yolo_model
model_yolo.layers.pop()
model_yolo.layers.pop()
model_yolo.layers.pop()
model_yolo.layers.pop()

model_yolo.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 208, 208, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 103, 103, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 103, 103, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 103, 103, 32) 0           batch_normalization_1[0][0]      
____________________________________________________________________________________________

activation_17 (Activation)      (None, 23, 23, 48)   0           batch_normalization_17[0][0]     
__________________________________________________________________________________________________
conv2d_13 (Conv2D)              (None, 23, 23, 32)   10240       mixed_5b[0][0]                   
__________________________________________________________________________________________________
conv2d_15 (Conv2D)              (None, 23, 23, 32)   9216        activation_14[0][0]              
__________________________________________________________________________________________________
conv2d_18 (Conv2D)              (None, 23, 23, 64)   27648       activation_17[0][0]              
__________________________________________________________________________________________________
batch_normalization_13 (BatchNo (None, 23, 23, 32)   96          conv2d_13[0][0]                  
__________________________________________________________________________________________________
batch_norm

__________________________________________________________________________________________________
conv2d_139 (Conv2D)             (None, 11, 11, 160)  143360      activation_138[0][0]             
__________________________________________________________________________________________________
batch_normalization_139 (BatchN (None, 11, 11, 160)  480         conv2d_139[0][0]                 
__________________________________________________________________________________________________
activation_139 (Activation)     (None, 11, 11, 160)  0           batch_normalization_139[0][0]    
__________________________________________________________________________________________________
conv2d_137 (Conv2D)             (None, 11, 11, 192)  208896      block17_15_ac[0][0]              
__________________________________________________________________________________________________
conv2d_140 (Conv2D)             (None, 11, 11, 192)  215040      activation_139[0][0]             
__________

conv2d_199 (Conv2D)             (None, 5, 5, 256)    172032      activation_198[0][0]             
__________________________________________________________________________________________________
batch_normalization_196 (BatchN (None, 5, 5, 192)    576         conv2d_196[0][0]                 
__________________________________________________________________________________________________
batch_normalization_199 (BatchN (None, 5, 5, 256)    768         conv2d_199[0][0]                 
__________________________________________________________________________________________________
activation_196 (Activation)     (None, 5, 5, 192)    0           batch_normalization_196[0][0]    
__________________________________________________________________________________________________
activation_199 (Activation)     (None, 5, 5, 256)    0           batch_normalization_199[0][0]    
__________________________________________________________________________________________________
block8_9_m

In [27]:
last_2 = Conv2D(768,(5,5) , activation='relu' , padding='same')(model_yolo.layers[-1].output)
    
last_1  = Conv2D(384,(3,3) , activation='relu' , padding='same')(last_2)
    
last_0 = Conv2D(1690,(3,3) , activation='relu' , padding='valid')(last_1)

# last = Conv2D(5070,(3,3) , activation='relu', padding='valid')(last_0)

last = Conv2D(3380,(3,3), padding='valid')(last_0)
    
final = Reshape((13, 13, 2, 10))(last)
        
model_yolo_1 = Model(model_yolo.input, final)
    

model_yolo_1.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 208, 208, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 103, 103, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 103, 103, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 103, 103, 32) 0           batch_normalization_1[0][0]      
____________________________________________________________________________________________

In [28]:
opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model_yolo_1.compile(optimizer= opt,loss='categorical_crossentropy',metrics=['accuracy'])
model_yolo_1.fit(X_train, Y_train, epochs= 5, batch_size = 4, validation_data=(X_val,Y_val))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 28 samples, validate on 8 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7fe8a9046278>

In [29]:
model_yolo_1.fit(X_train, Y_train, epochs= 15, batch_size = 4, validation_data=(X_val,Y_val))

Train on 28 samples, validate on 8 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x7fe8a8b7e860>

In [21]:
## Creating y_true for training or here, it is called matching_true_boxes :

def preprocess_true_boxes(true_boxes, anchors, target_size):
    """Find detector in YOLO where ground truth box should appear.
    Parameters
    ----------
    true_boxes : array
        List of ground truth boxes in form of relative x, y, w, h, class.
        Relative coordinates are in the range [0, 1] indicating a percentage
        of the original image dimensions.
    anchors : array
        List of anchors in form of w, h.
        Anchors are assumed to be in the range [0, conv_size] where conv_size
        is the spatial dimension of the final convolutional features.
    image_size : array-like
        List of image dimensions in form of h, w in pixels.
    Returns
    -------
    detectors_mask : array
        0/1 mask for detectors in [conv_height, conv_width, num_anchors, 1]
        that should be compared with a matching ground truth box.
    matching_true_boxes: array
        Same shape as detectors_mask with the corresponding ground truth box
        adjusted for comparison with predicted parameters at training time.
        
    """
    
    height, width = target_size
    num_anchors = len(anchors)
    
    conv_height = height // 16  ## cell dimension is 16 X 16 and no. of cells are 13 X 13
    conv_width = width // 16   ## cell dimension is 16 X 16 and no. of cells are 13 X 13
        
    detectors_mask = np.zeros((conv_height, conv_width, num_anchors, 1), dtype=np.float32)
    matching_true_boxes = np.zeros((conv_height, conv_width, num_anchors, info),dtype=np.float32)

    for box in true_boxes:
        
        # scale box to convolutional feature spatial dimensions
        box_class = int(box[4:5])
        box = box[0:4] * np.array([conv_width, conv_height, conv_width, conv_height])
        
        i = np.floor(box[1]).astype('int')
             
#        j = min(np.floor(box[0]).astype('int'),1)
        j = np.floor(box[0]).astype('int')
                
        best_iou = 0
        best_anchor = 0
                
        for k, anchor in enumerate(anchors):
            # Find IOU between box shifted to origin and anchor box.
            box_maxes = box[2:4] / 2.
            box_mins = -box_maxes
            anchor_maxes = (anchor / 2.)
            anchor_mins = -anchor_maxes

            intersect_mins = np.maximum(box_mins, anchor_mins)
            intersect_maxes = np.minimum(box_maxes, anchor_maxes)
            intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
            intersect_area = intersect_wh[0] * intersect_wh[1]
            box_area = box[2] * box[3]
            anchor_area = anchor[0] * anchor[1]
            iou = intersect_area / (box_area + anchor_area - intersect_area)
            if iou > best_iou:
                best_iou = iou
                best_anchor = k
                
        if best_iou > 0:
            detectors_mask[i, j, best_anchor] = 1
            
            adjusted_box = np.array(
                [   
                    box[0] - j, box[1] - i,
                    np.log(box[2] / anchors[best_anchor][0]),
                    np.log(box[3] / anchors[best_anchor][1]),1
                ],
                dtype=np.float32)
            matching_true_boxes[i, j, best_anchor] = np.hstack((adjusted_box, cat_encoded[box_class]))
    return detectors_mask, matching_true_boxes

In [22]:
# Preparing Input(X) and Target(Y) file for training :

X_final = [] # X_final list to convert to np array later
Y_true_final = [] # Y_final list to convert to np array later
Y_mask_final = []

image_list = r_new_data['img_id'].unique() # make a list of unique images

for i in image_list:
    
    image_path = '/home/scar3crow/Downloads/8-6-new-scan/' + i
    
    x = cv2.imread(image_path)
    x_ratio = target_w / x.shape[1]
    y_ratio = target_h / x.shape[0]
    img = cv2.resize(x,(target_w, target_h))
    
    X_final.append(img)
    
Y_true = np.zeros((grid_y_axis,grid_x_axis,num_anchors,info))
Y_mask = np.zeros((grid_y_axis,grid_x_axis,num_anchors,1))

for j in range(len(image_list)):
    
    Y_mask, Y_true = preprocess_true_boxes(true_boxes[j], anchors_wrt_cell, target_size)
    
    Y_true_final.append(Y_true)
    Y_mask_final.append(Y_mask)
    

X = np.array(X_final) 
#X_final = []
Y_true_target = np.array(Y_true_final)
Y_mask_target = np.array(Y_mask_final)
#Y_final = []

X = (X - 127.5)/127.5  # X normalising since pixels vary from 0 to 255
    
# np.save('/home/scar3crow/Downloads/Data1/X_short.npy',X)
# np.save('/home/scar3crow/Downloads/Data1/Y_short.npy',Y)


In [258]:
image_list[4]

'54a.jpg'

In [240]:
image_list[16]

'105a.jpg'

In [160]:
r_new_data[20:25]

Unnamed: 0,img_id,x,y,width,height,obj_class,image_width,image_height
20,54a.jpg,87,7,259,56,vendor,416,168
21,54a.jpg,254,99,97,19,invoice,416,168
22,54a.jpg,253,117,113,15,inv_date,416,168
23,54a.jpg,257,132,135,19,po,416,168
24,54a.jpg,31,103,186,61,buyer,416,168


In [23]:
Y_true_target.shape

(36, 13, 13, 2, 10)

In [239]:
print(Y_true_target[16,2,0:13])
    

[[[ 0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00]
  [ 0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00]]

 [[ 0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00]
  [ 0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00]]

 [[ 0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  0.0000000e+00]
  [ 1.2500000e-01  2.5510204e-01 -2.4783616e-01 -2.2204460e-16
    1.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
    0.0000000e+00  1.0000000e+00]]

 [[ 0.0000000e+00  0.0000000e+00  

In [24]:
XX = X
YY = Y_true_target
ZZ = Y_mask_target

X_train , X_val , Y_train , Y_val  = train_test_split(XX, YY,train_size = 0.8 , shuffle = True)




In [25]:
def my_model(input_shape):
    
    
    inp = Input(input_shape)
   
    model = InceptionResNetV2( input_tensor= inp , include_top=False, weights='imagenet')
    last_layer = model.output
    
    last_3 = Flatten()(last_layer)
    last_2 = Dense(640, activation = 'relu')(last_3)
    last_1 = Dense(320, activation = 'relu')(last_2)
    last_0 = Dense(5, activation = 'softmax')(last_1)
        
    model = Model(inp,last_0)
    
    return model

input_size = (target_h,target_w,3)

my_invoice_yolo_model = my_model(input_size)

#  print(my_invoice_yolo_model.summary())


model_yolo = my_invoice_yolo_model
model_yolo.layers.pop()
model_yolo.layers.pop()
model_yolo.layers.pop()
model_yolo.layers.pop()

#  model_yolo.summary()

last_2 = Conv2D(768,(5,5) , activation='relu' , padding='same')(model_yolo.layers[-1].output)
    
last_1  = Conv2D(384,(3,3) , activation='relu' , padding='same')(last_2)
    
last_0 = Conv2D(1690,(3,3) , activation='relu' , padding='valid')(last_1)

# last = Conv2D(5070,(3,3) , activation='relu', padding='valid')(last_0)

last = Conv2D(3380,(3,3), padding='valid')(last_0)
    
final = Reshape((13, 13, 2, 10))(last)
        
model_yolo_3 = Model(model_yolo.input, final)
    

model_yolo_3.summary()




Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 208, 208, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 103, 103, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 103, 103, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 103, 103, 32) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________

In [35]:
opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model_yolo_2.compile(optimizer= opt,loss='categorical_crossentropy',metrics=['accuracy'])
model_yolo_2.fit(X_train, Y_train, epochs= 10, batch_size = 4, validation_data=(X_val,Y_val))

Train on 28 samples, validate on 8 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fe870041908>

In [36]:
opt = Adam(lr=0.00001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model_yolo_2.compile(optimizer= opt,loss='categorical_crossentropy',metrics=['accuracy'])
model_yolo_2.fit(X_train, Y_train, epochs= 10, batch_size = 4, validation_data=(X_val,Y_val))

Train on 28 samples, validate on 8 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fe867b7ecf8>

In [37]:
def my_model(input_shape):
    
    
    inp = Input(input_shape)
   
    model = InceptionResNetV2( input_tensor= inp , include_top=False, weights='imagenet')
    last_layer = model.output
    
    last_3 = Flatten()(last_layer)
    last_2 = Dense(640, activation = 'relu')(last_3)
    last_1 = Dense(320, activation = 'relu')(last_2)
    last_0 = Dense(5, activation = 'softmax')(last_1)
        
    model = Model(inp,last_0)
    
    return model

input_size = (target_h,target_w,3)

my_invoice_yolo_model = my_model(input_size)

#  print(my_invoice_yolo_model.summary())


model_yolo = my_invoice_yolo_model
model_yolo.layers.pop()
model_yolo.layers.pop()
model_yolo.layers.pop()
model_yolo.layers.pop()

#  model_yolo.summary()

last_2 = Conv2D(768,(5,5) , activation='relu' , padding='same')(model_yolo.layers[-1].output)
    
last_1  = Conv2D(384,(3,3) , activation='relu' , padding='same')(last_2)
    
last_0 = Conv2D(1690,(3,3) , activation='relu' , padding='valid')(last_1)

# last = Conv2D(5070,(3,3) , activation='relu', padding='valid')(last_0)

last = Conv2D(3380,(3,3), activation = 'relu', padding='valid')(last_0)
    
final = Reshape((13, 13, 2, 10))(last)
        
model_yolo_2 = Model(model_yolo.input, final)
    

model_yolo_2.summary()


Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 208, 208, 3)  0                                            
__________________________________________________________________________________________________
conv2d_415 (Conv2D)             (None, 103, 103, 32) 864         input_3[0][0]                    
__________________________________________________________________________________________________
batch_normalization_407 (BatchN (None, 103, 103, 32) 96          conv2d_415[0][0]                 
__________________________________________________________________________________________________
activation_407 (Activation)     (None, 103, 103, 32) 0           batch_normalization_407[0][0]    
____________________________________________________________________________________________

batch_normalization_533 (BatchN (None, 11, 11, 160)  480         conv2d_541[0][0]                 
__________________________________________________________________________________________________
activation_533 (Activation)     (None, 11, 11, 160)  0           batch_normalization_533[0][0]    
__________________________________________________________________________________________________
conv2d_539 (Conv2D)             (None, 11, 11, 192)  208896      block17_12_ac[0][0]              
__________________________________________________________________________________________________
conv2d_542 (Conv2D)             (None, 11, 11, 192)  215040      activation_533[0][0]             
__________________________________________________________________________________________________
batch_normalization_531 (BatchN (None, 11, 11, 192)  576         conv2d_539[0][0]                 
__________________________________________________________________________________________________
batch_norm

In [26]:
opt = Adam(lr=0.00001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model_yolo_3.compile(optimizer= opt,loss='categorical_crossentropy',metrics=['accuracy'])
model_yolo_3.fit(X_train, Y_train, epochs= 10, batch_size = 4, validation_data=(X_val,Y_val))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 28 samples, validate on 8 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fd82b3fd080>

In [63]:
Y_train[0,0,0:13]

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.,

In [62]:
Y_train.shape

(28, 13, 13, 2, 10)

In [90]:
Y_train[5,2, 0:13]

array([[[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00]],

       [[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00]],

       [[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00],
        [ 1.2500000e-01,  2.5510204e-01, -2.4783616e-01,

In [102]:
Y_train[5,12, 0:13]

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.,

In [295]:
##  https://github.com/Tony607/YOLO_Object_Localization_Keras/blob/master/yad2k/models/keras_yolo.py

##  voc_anchors = np.array(
##    [[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])

## voc_classes = [
##    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
##    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
##    "pottedplant", "sheep", "sofa", "train", "tvmonitor"]


def preprocess_true_boxes(true_boxes, anchors, image_size):
    """Find detector in YOLO where ground truth box should appear.
    Parameters
    ----------
    true_boxes : array
        List of ground truth boxes in form of relative x, y, w, h, class.
        Relative coordinates are in the range [0, 1] indicating a percentage
        of the original image dimensions.
    anchors : array
        List of anchors in form of w, h.
        Anchors are assumed to be in the range [0, conv_size] where conv_size
        is the spatial dimension of the final convolutional features.
    image_size : array-like
        List of image dimensions in form of h, w in pixels.
    Returns
    -------
    detectors_mask : array
        0/1 mask for detectors in [conv_height, conv_width, num_anchors, 1]
        that should be compared with a matching ground truth box.
    matching_true_boxes: array
        Same shape as detectors_mask with the corresponding ground truth box
        adjusted for comparison with predicted parameters at training time.
        
    """
    
    height, width = target_size
    print(height, width)
    num_anchors = len(anchors)
    # Downsampling factor of 5x 2-stride max_pools == 32.
    # TODO: Remove hardcoding of downscaling calculations.
#    assert height % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
#    assert width % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
    conv_height = height // 16
    conv_width = width // 16
    num_box_params = true_boxes.shape[1]
    detectors_mask = np.zeros(
        (conv_height, conv_width, num_anchors, 1), dtype=np.float32)
    
    matching_true_boxes = np.zeros(
        (conv_height, conv_width, num_anchors, info),
        dtype=np.float32)

    for box in true_boxes:
        print('true box sizes =', box)
        # scale box to convolutional feature spatial dimensions
        box_class = int(box[4:5])
        box = box[0:4] * np.array(
            [conv_width, conv_height, conv_width, conv_height])
        print('##################################')
        print(box)
        i = np.floor(box[1]).astype('int')
        print('++++++++++++++++++', i)
        
#        j = min(np.floor(box[0]).astype('int'),1)
        j = np.floor(box[0]).astype('int')
        print('====================', j)
        
        best_iou = 0
        best_anchor = 0
                
        for k, anchor in enumerate(anchors):
            # Find IOU between box shifted to origin and anchor box.
            box_maxes = box[2:4] / 2.
            box_mins = -box_maxes
            anchor_maxes = (anchor / 2.)
            anchor_mins = -anchor_maxes

            intersect_mins = np.maximum(box_mins, anchor_mins)
            intersect_maxes = np.minimum(box_maxes, anchor_maxes)
            intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
            intersect_area = intersect_wh[0] * intersect_wh[1]
            box_area = box[2] * box[3]
            anchor_area = anchor[0] * anchor[1]
            iou = intersect_area / (box_area + anchor_area - intersect_area)
            if iou > best_iou:
                best_iou = iou
                best_anchor = k
                
        print('iou = ', best_iou, best_anchor)
        if best_iou > 0:
            detectors_mask[i, j, best_anchor] = 1
            print('??????????????????', i, j, box_class)
            print('[[[[[[[[[]]]]]]]]]', box[3], '+++++++++++++++', anchors[best_anchor][1])
            adjusted_box = np.array(
                [   1,
                    box[0] - j, box[1] - i,
                    np.log(round(box[2], 5) / round(anchors[best_anchor][0], 5)),
                    np.log(round(box[3], 5) / round(anchors[best_anchor][1], 5))
                 ],
                dtype=np.float32)
            matching_true_boxes[i, j, best_anchor] = np.hstack((adjusted_box, cat_encoded[box_class]))
    return detectors_mask, matching_true_boxes

In [296]:
## x & y w.r.t. cell and w,h w.r.t. image

target_size = [208, 208]


## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

anchors = anchors_wrt_cell

r_new_data_slice = r_new_data.loc[r_new_data['img_id']=='105a.jpg'].reset_index(drop=True) # slicing

true_boxes = np.zeros((len(r_new_data_slice),5))

obj = 0

for j in range(len(r_new_data_slice)):
    
    image_w = r_new_data_slice['image_width'][j]
    image_h = r_new_data_slice['image_height'][j]
    
    x_ratio = target_size[1] / image_w
    
    y_ratio = target_size[0] / image_h
    
#    print(x_ratio, y_ratio)
    

    
    xmin = r_new_data_slice['x'][j] * x_ratio
    ymin = r_new_data_slice['y'][j] * y_ratio
        
    xmax = (r_new_data_slice['x'][j] + r_new_data_slice['width'][j]) * x_ratio
    ymax = (r_new_data_slice['y'][j] + r_new_data_slice['height'][j]) * y_ratio
        
#    obj_class = r_new_data_slice['obj_class'][j]
    
    
#    obj = j+1
    
    
        
    w = (r_new_data_slice['width'][j] * x_ratio) / target_size[1]
    
    h = (r_new_data_slice['height'][j] * y_ratio) / target_size[0]
    
    
    x = (xmin + (xmax-xmin)/2) / target_size[1]
    y = (ymin + (ymax-ymin)/2) / target_size[0] 
    
#    x = (x * grid_x_axis/image_size[1]) - int(x * grid_x_axis/image_size[1])
#    y = (y * grid_y_axis/image_size[0]) - int(y * grid_y_axis/image_size[0])
    
    true_boxes[j][0] = x
    true_boxes[j][1] = y
    true_boxes[j][2] = w
    true_boxes[j][3] = h
    true_boxes[j][4] = obj
    obj = obj+1

print(true_boxes.shape)

r_new_data_slice.head()


(5, 5)


Unnamed: 0,img_id,x,y,width,height,obj_class,image_width,image_height
0,105a.jpg,4,1,128,49,vendor,416,147
1,105a.jpg,236,4,47,21,invoice,416,147
2,105a.jpg,327,3,53,20,inv_date,416,147
3,105a.jpg,236,56,63,20,po,416,147
4,105a.jpg,4,53,158,80,buyer,416,147


In [277]:
true_boxes

array([[0.16346154, 0.17346939, 0.30769231, 0.33333333, 0.        ],
       [0.62379808, 0.09863946, 0.11298077, 0.14285714, 1.        ],
       [0.84975962, 0.08843537, 0.12740385, 0.13605442, 2.        ],
       [0.64302885, 0.44897959, 0.15144231, 0.13605442, 3.        ],
       [0.19951923, 0.63265306, 0.37980769, 0.54421769, 4.        ]])

In [278]:
c, d = preprocess_true_boxes(true_boxes, anchors, target_size)

208 208
true box sizes = [0.16346154 0.17346939 0.30769231 0.33333333 0.        ]
##################################
[2.125      2.25510204 4.         4.33333333]
++++++++++++++++++ 2
iou =  0.7804878048780485 1
?????????????????? 2 2 0
[[[[[[[[[]]]]]]]]] 4.333333333333333 +++++++++++++++ 4.333333333333334
true box sizes = [0.62379808 0.09863946 0.11298077 0.14285714 1.        ]
##################################
[8.109375   1.28231293 1.46875    1.85714286]
++++++++++++++++++ 1
iou =  0.6317204301075269 0
?????????????????? 1 8 1
[[[[[[[[[]]]]]]]]] 1.857142857142857 +++++++++++++++ 1.5476190476190477
true box sizes = [0.84975962 0.08843537 0.12740385 0.13605442 2.        ]
##################################
[11.046875    1.14965986  1.65625     1.76870748]
++++++++++++++++++ 1
iou =  0.7303149606299215 0
?????????????????? 1 11 2
[[[[[[[[[]]]]]]]]] 1.768707482993197 +++++++++++++++ 1.5476190476190477
true box sizes = [0.64302885 0.44897959 0.15144231 0.13605442 3.        ]
###########

In [272]:
d[2,2]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [279]:
d[2,0]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [280]:
d[2,:]

array([[[ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ],
        [ 1.        ,  0.125     ,  0.25510204, -0.24783616,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  1.        ]],

       [[ 0.        ,  0.        ,  0.        

In [265]:
## x & y w.r.t. cell and w,h w.r.t. image

target_size = [208, 208]


## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

anchors = anchors_wrt_cell

r_new_data_slice = r_new_data.loc[r_new_data['img_id']=='54a.jpg'].reset_index(drop=True) # slicing

true_boxes = np.zeros((len(r_new_data_slice),5))

obj = 0

for j in range(len(r_new_data_slice)):
    
    image_w = r_new_data_slice['image_width'][j]
    image_h = r_new_data_slice['image_height'][j]
    
    x_ratio = target_size[1] / image_w
    
    y_ratio = target_size[0] / image_h
    
#    print(x_ratio, y_ratio)
    

    
    xmin = r_new_data_slice['x'][j] * x_ratio
    ymin = r_new_data_slice['y'][j] * y_ratio
        
    xmax = (r_new_data_slice['x'][j] + r_new_data_slice['width'][j]) * x_ratio
    ymax = (r_new_data_slice['y'][j] + r_new_data_slice['height'][j]) * y_ratio
        
#    obj_class = r_new_data_slice['obj_class'][j]
    
    
#    obj = j+1
    
    
        
    w = (r_new_data_slice['width'][j] * x_ratio) / target_size[1]
    
    h = (r_new_data_slice['height'][j] * y_ratio) / target_size[0]
    
    
    x = (xmin + (xmax-xmin)/2) / target_size[1]
    y = (ymin + (ymax-ymin)/2) / target_size[0] 
    
#    x = (x * grid_x_axis/image_size[1]) - int(x * grid_x_axis/image_size[1])
#    y = (y * grid_y_axis/image_size[0]) - int(y * grid_y_axis/image_size[0])
    
    true_boxes[j][0] = x
    true_boxes[j][1] = y
    true_boxes[j][2] = w
    true_boxes[j][3] = h
    true_boxes[j][4] = obj
    obj = obj+1

print(true_boxes.shape)

r_new_data_slice.head()


(5, 5)


Unnamed: 0,img_id,x,y,width,height,obj_class,image_width,image_height
0,54a.jpg,87,7,259,56,vendor,416,168
1,54a.jpg,254,99,97,19,invoice,416,168
2,54a.jpg,253,117,113,15,inv_date,416,168
3,54a.jpg,257,132,135,19,po,416,168
4,54a.jpg,31,103,186,61,buyer,416,168


In [266]:
true_boxes

array([[0.52043269, 0.20833333, 0.62259615, 0.33333333, 0.        ],
       [0.72716346, 0.64583333, 0.23317308, 0.11309524, 1.        ],
       [0.74399038, 0.74107143, 0.27163462, 0.08928571, 2.        ],
       [0.78004808, 0.8422619 , 0.32451923, 0.11309524, 3.        ],
       [0.29807692, 0.79464286, 0.44711538, 0.36309524, 4.        ]])

In [268]:
a, b = preprocess_true_boxes(true_boxes, anchors, target_size)

208 208
true box sizes = [0.52043269 0.20833333 0.62259615 0.33333333 0.        ]
##################################
[6.765625   2.70833333 8.09375    4.33333333]
++++++++++++++++++ 2
iou =  0.6332046332046332 1
?????????????????? 2 6 0
[[[[[[[[[]]]]]]]]] 4.333333333333334 +++++++++++++++ 4.333333333333334
true box sizes = [0.72716346 0.64583333 0.23317308 0.11309524 1.        ]
##################################
[9.453125   8.39583333 3.03125    1.4702381 ]
++++++++++++++++++ 8
iou =  0.6472746331236898 0
?????????????????? 8 9 1
[[[[[[[[[]]]]]]]]] 1.4702380952380953 +++++++++++++++ 1.5476190476190477
true box sizes = [0.74399038 0.74107143 0.27163462 0.08928571 2.        ]
##################################
[9.671875   9.63392857 3.53125    1.16071429]
++++++++++++++++++ 9
iou =  0.48267326732673277 0
?????????????????? 9 9 2
[[[[[[[[[]]]]]]]]] 1.1607142857142858 +++++++++++++++ 1.5476190476190477
true box sizes = [0.78004808 0.8422619  0.32451923 0.11309524 3.        ]
#############

In [269]:
round(4.333333333333333333333, 5)

4.33333

In [297]:
# Preparing Input(X) and Target(Y) file for training :

X_final = [] # X_final list to convert to np array later
Y_true_final = [] # Y_final list to convert to np array later
Y_mask_final = []

image_list = r_new_data['img_id'].unique() # make a list of unique images

for i in image_list:
    
    image_path = '/home/scar3crow/Downloads/8-6-new-scan/' + i
    
    x = cv2.imread(image_path)
    x_ratio = target_w / x.shape[1]
    y_ratio = target_h / x.shape[0]
    img = cv2.resize(x,(target_w, target_h))
    
    X_final.append(img)
    
Y_true = np.zeros((grid_y_axis,grid_x_axis,num_anchors,info))
Y_mask = np.zeros((grid_y_axis,grid_x_axis,num_anchors,1))

for j in range(len(image_list)):
    
    Y_mask, Y_true = preprocess_true_boxes(true_boxes[j], anchors_wrt_cell, target_size)
    
    Y_true_final.append(Y_true)
    Y_mask_final.append(Y_mask)
    

X = np.array(X_final) 
#X_final = []
Y_true_target = np.array(Y_true_final)
Y_mask_target = np.array(Y_mask_final)
#Y_final = []

X = (X - 127.5)/127.5  # X normalising since pixels vary from 0 to 255
    
# np.save('/home/scar3crow/Downloads/Data1/X_short.npy',X)
# np.save('/home/scar3crow/Downloads/Data1/Y_short.npy',Y)


208 208


IndexError: tuple index out of range