In [73]:
import tensorflow as tf
import numpy as np
import tensorflow.keras as keras
from tensorflow.keras.layers import ZeroPadding2D,Conv2D,BatchNormalization,Dense,Flatten,Concatenate,Activation,Input,LeakyReLU,MaxPooling2D,GlobalAveragePooling2D,AveragePooling2D,Dropout
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.optimizers import Adam
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
import pandas as pd
import glob
import os
from tqdm import tqdm
import cv2 as cv
from PIL import Image,ImageDraw
from tensorflow.keras.preprocessing import image_dataset_from_directory
import argparse
import xml.etree.ElementTree as ET
import tensorflow.keras.backend as K

In [2]:
classes_num = {'aeroplane': 0, 'bicycle': 1, 'bird': 2, 'boat': 3, 'bottle': 4, 'bus': 5,
               'car': 6, 'cat': 7, 'chair': 8, 'cow': 9, 'diningtable': 10, 'dog': 11,
               'horse': 12, 'motorbike': 13, 'person': 14, 'pottedplant': 15, 'sheep': 16,
               'sofa': 17, 'train': 18, 'tvmonitor': 19}
sets = ["train","val"]

In [10]:
root = ET.parse("/content/drive/MyDrive/YOLO/VOC2007/VOCdevkit/VOC2007/Annotations/000005.xml")
root.getroot()

<Element 'annotation' at 0x7fc893c7fd10>

In [11]:
def convert_annotation(image_id,f):
  file_path = os.path.join("/content/drive/MyDrive/YOLO/VOC2007/VOCdevkit/VOC2007/Annotations/%s.xml" %(image_id))
  tree = ET.parse(file_path)
  root = tree.getroot()

  for obj in root.iter("object"):
    cls_name = obj.find("name").text
    difficult = obj.find("difficult").text
    classes = list(classes_num.keys())

    if cls_name not in classes or int(difficult) == 1:
      continue
    
    cls_id = classes_num[cls_name]
    b_box = obj.find("bndbox")

    box = (int(b_box.find('xmin').text), int(b_box.find('ymin').text),
             int(b_box.find('xmax').text), int(b_box.find('ymax').text))

    f.write(" " + ",".join([str(a) for a in box]) + "," + str(cls_id))



In [12]:
for image_set in sets:
  print(image_set)
  with open(os.path.join("/content/drive/MyDrive/YOLO/VOC2007/VOCdevkit/VOC2007/ImageSets/Main/%s.txt"%(image_set)),"r") as f:
    image_ids = f.read().strip().split()
    
  with open(os.path.join("/content/drive/MyDrive/YOLO/VOC2007/VOCdevkit", 'My_%s.txt' % (image_set)), 'w') as f:
      for image_id in image_ids:
          f.write('/content/drive/MyDrive/YOLO/VOC2007/VOCdevkit/VOC%s/JPEGImages/%s.jpg' % (str(2007), image_id))
          convert_annotation(image_id, f)
          f.write('\n')

train
val


In [47]:
def read(image_path,label):
  image = Image.open(image_path)
  image_w, image_h = image.size
  image = image.resize((448,448))
  image = np.array(image) / 255.


  label_matrix = np.zeros((7,7,30))
  
  for l in label:
    l = l.split(",")
    l = np.array(l,dtype=np.int)
    xmin = l[0]
    ymin = l[1]
    xmax = l[2]
    ymax = l[3]
    cls_id = l[4]

    x = (xmin + xmax) / 2 / image_w
    y = (ymin + ymax) / 2 / image_h
    w = (xmax - xmin) / image_w
    h = (ymax - ymin) / image_h

    loc = [7*x, 7*y]
    loc_i = int(loc[1])
    loc_j = int(loc[0])

    y = loc[1] - loc_i
    x = loc[0] - loc_j

    if label_matrix[loc_i,loc_j,24] == 0:
      label_matrix[loc_i, loc_j, cls_id] = 1
      label_matrix[loc_i,loc_j,20:24] = [x, y, w, h]
      label_matrix[loc_i,loc_j,24] = 1

    return image, label_matrix

In [48]:
class img_generator(keras.utils.Sequence):
  def __init__(self,x,y,batch_size):
    self.x = x
    self.y = y
    self.batch_size = batch_size

  def __len__(self):
    temp = int(np.ceil(len(self.x) / self.batch_size))
    return temp

  def __getitem__(self,idx):
    batch_x = self.x[idx * self.batch_size : (idx + 1) * self.batch_size]
    batch_y = self.y[idx * self.batch_size : (idx + 1) * self.batch_size]

    train_image = []
    train_label = []

    for i in range(0, len(batch_x)):
      img_path = batch_x[i]
      label = batch_y[i]
      image, label_matrix = read(img_path, label)
      train_image.append(image)
      train_label.append(label_matrix)
      
    return np.array(train_image), np.array(train_label)
    

In [49]:
img_path = []

for i in glob.glob("/content/drive/MyDrive/YOLO/VOC2007/VOCdevkit/VOC2007/JPEGImages/*"):
  img_path.append(i)

In [67]:
x_train_dataset = []
x_val_dataset = []

with open("/content/drive/MyDrive/YOLO/VOC2007/VOCdevkit/My_train.txt","r") as f:
  x_train_dataset += f.readlines()

with open("/content/drive/MyDrive/YOLO/VOC2007/VOCdevkit/My_val.txt","r") as f:
  x_val_dataset += f.readlines()

x_train = []
y_train = []

x_val = []
y_val = []

for item in x_train_dataset:
  item = item.replace("\n","").split(" ")
  if item[0] in img_path:
    x_train.append(item[0])
    temp = []
    for i in range(1, len(item)):
      temp.append(item[i])

    y_train.append(temp)


for item in x_val_dataset:
  item = item.replace("\n","").split(" ")
  if item[0] in img_path:
    x_val.append(item[0])
    temp = []
    for i in range(1, len(item)):
      temp.append(item[i])

    y_val.append(temp)


In [69]:
batch_size = 64 #64 used in YOLO paper

train_gen = img_generator(x_train, y_train, batch_size)
val_gen = img_generator(x_val, y_val, batch_size)

train_x, train_y = train_gen.__getitem__(1)
val_x, val_y = val_gen.__getitem__(1)

In [70]:
train_x.shape,train_y.shape,val_x.shape,val_y.shape

((64, 448, 448, 3), (64, 7, 7, 30), (64, 448, 448, 3), (64, 7, 7, 30))

In [53]:
class reshape_op(tf.keras.layers.Layer):
  
  def __init__(self,target_shape):
    super(reshape_op,self).__init__()
    self.target_shape = tuple(target_shape)

  def getconfig(self):
    config = super().getconfig.copy()
    config.update({"target_shape":self.target_shape})
    return config

  def call(self,input):
    s = [self.target_shape[0],self.target_shape[1]] #Grid size
    c = 20 #classes
    b = 2 #boxes
    idx1 = s[0] * s[1] * c
    idx2 =  idx1 + s[0] * s[1] * b
  
    class_probs = K.reshape(input[:, :idx1], (K.shape(input)[0],) + tuple([s[0], s[1], c]))
    class_probs = K.softmax(class_probs)

    #confidence
    confs = K.reshape(input[:, idx1:idx2], (K.shape(input)[0],) + tuple([s[0], s[1], b]))
    confs = K.sigmoid(confs)

    # boxes
    boxes = K.reshape(input[:, idx2:], (K.shape(input)[0],) + tuple([s[0], s[1], b * 4]))
    boxes = K.sigmoid(boxes)

    outputs = K.concatenate([class_probs, confs, boxes])
    return outputs

In [54]:
def my_model(input_shape = (448,448,3)):
  base_model = tf.keras.applications.InceptionResNetV2(
      input_shape = (448,448,3),
      weights = "imagenet",
      include_top = False
  )
  base_model.trainable = False
  inputs = Input(shape = input_shape)
  x = base_model(inputs)
  x = MaxPooling2D(pool_size = 2, strides = 2)(x)
  x = Conv2D(filters = 1536, kernel_size = 3, activation = "relu")(x)
  x = MaxPooling2D(pool_size = 2, strides = 2)(x)
  x = GlobalAveragePooling2D()(x)
  x = Flatten()(x)
  x = Dense(units = 2048, activation = "relu")(x)
  x = Dropout(0.4)(x)
  x = Dense(units = 2048, activation = "relu")(x)
  x = Dropout(0.4)(x)
  x = Dense(units = 1470, activation = "sigmoid")(x)
  x = reshape_op(target_shape=(7,7,30))(x)

  model = Model(inputs, x)

  return model

In [55]:
model = my_model()

In [56]:
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        [(None, 448, 448, 3)]     0         
_________________________________________________________________
inception_resnet_v2 (Functio (None, 12, 12, 1536)      54336736  
_________________________________________________________________
max_pooling2d_33 (MaxPooling (None, 6, 6, 1536)        0         
_________________________________________________________________
conv2d_1422 (Conv2D)         (None, 4, 4, 1536)        21235200  
_________________________________________________________________
max_pooling2d_34 (MaxPooling (None, 2, 2, 1536)        0         
_________________________________________________________________
global_average_pooling2d_1 ( (None, 1536)              0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 1536)              0   

In [74]:
# def create_model(input_shape=(448,448,3)):
#   x_input = Input(input_shape)
  
#   x = Conv2D(filters=64,kernel_size=(7,7),strides=1,padding="same")(x_input)
#   x = BatchNormalization(axis=-1)(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = MaxPooling2D(pool_size=2,strides=2)(x)

#   x = Conv2D(filters=192, kernel_size=(3,3), strides=1,padding="same")(x)
#   x = BatchNormalization(axis=-1)(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = MaxPooling2D(pool_size=2,strides=2)(x)

#   x = Conv2D(filters=128, kernel_size=1)(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=256,kernel_size=3,padding="same")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=256, kernel_size=1)(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=512,kernel_size=3,padding="same")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = MaxPooling2D(pool_size=2,strides=2)(x)

#   x = Conv2D(filters=256, kernel_size=1)(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=512,kernel_size=3,padding="same")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=256, kernel_size=1)(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=512,kernel_size=3,padding="same")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=256, kernel_size=1)(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=512,kernel_size=3,padding="same")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=256, kernel_size=1)(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=512,kernel_size=3,padding="same")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=512,kernel_size=1)(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=1024, kernel_size=3, padding="same")(x)
#   x = MaxPooling2D(pool_size=2,strides=2)(x)

#   x = Conv2D(filters=512, kernel_size=1)(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=1024, kernel_size=3,padding="same")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=512, kernel_size=1)(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=1024, kernel_size=3,padding="same")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=1024,kernel_size=3,padding="same")(x)
#   x = Conv2D(filters=1024,kernel_size=3,strides=2,padding="same")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=1024,kernel_size=3,padding="valid")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)
#   x = Conv2D(filters=1024,kernel_size=3,padding="valid")(x)
#   x = BatchNormalization()(x)
#   x = LeakyReLU(alpha=0.1)(x)

#   x = Flatten()(x)
#   x = Dense(units=512)(x)
#   x = Dense(units=1024)(x)
#   x = Dropout(rate=0.5)(x)
#   x = Dense(units=1470,activation="sigmoid")(x)
#   x = reshape_op(target_shape=(7,7,30))(x)

#   mod = Model(inputs=x_input, outputs=x)

#   return mod


In [75]:
# mod = model((448,448,3))

In [76]:
# mod.summary()

In [58]:
class CustomLearningRateScheduler(keras.callbacks.Callback):
    """Learning rate scheduler which sets the learning rate according to schedule.

  Arguments:
      schedule: a function that takes an epoch index
          (integer, indexed from 0) and current learning rate
          as inputs and returns a new learning rate as output (float).
  """

    def __init__(self, schedule):
        super(CustomLearningRateScheduler, self).__init__()
        self.schedule = schedule

    def on_epoch_begin(self, epoch, logs=None):
        if not hasattr(self.model.optimizer, "lr"):
            raise ValueError('Optimizer must have a "lr" attribute.')
        # Get the current learning rate from model's optimizer.
        lr = float(tf.keras.backend.get_value(self.model.optimizer.learning_rate))
        # Call schedule function to get the scheduled learning rate.
        scheduled_lr = self.schedule(epoch, lr)
        # Set the value back to the optimizer before this epoch starts
        tf.keras.backend.set_value(self.model.optimizer.lr, scheduled_lr)
        if epoch == 0 or epoch == 75 or epoch == 105:
          print("\nEpoch %d: Learning rate is %f." % (epoch, scheduled_lr))


LR_SCHEDULE = [
    # (epoch to start, learning rate) tuples
    (0, 0.01),
    (75, 0.001),
    (105, 0.0001),
]


def lr_schedule(epoch, lr):
    """Helper function to retrieve the scheduled learning rate based on epoch."""
    if epoch < LR_SCHEDULE[0][0] or epoch > LR_SCHEDULE[-1][0]:
        return lr
    for i in range(len(LR_SCHEDULE)):
        if epoch == LR_SCHEDULE[i][0]:
            return LR_SCHEDULE[i][1]
    return lr

In [59]:
def xywh2minmax(xy, wh):
    xy_min = xy - wh / 2
    xy_max = xy + wh / 2

    return xy_min, xy_max

In [60]:
def iou(pred_mins, pred_maxes, true_mins, true_maxes):
    intersect_mins = K.maximum(pred_mins, true_mins)
    intersect_maxes = K.minimum(pred_maxes, true_maxes)
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    pred_wh = pred_maxes - pred_mins
    true_wh = true_maxes - true_mins
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas

    return iou_scores

In [61]:
def yolo_head(feats):
    conv_dims = K.shape(feats)[1:3]  # assuming channels last
    # In YOLO the height index is the inner most iteration.
    conv_height_index = K.arange(0, stop=conv_dims[0])
    conv_width_index = K.arange(0, stop=conv_dims[1])
    conv_height_index = K.tile(conv_height_index, [conv_dims[1]])

    # TODO: Repeat_elements and tf.split doesn't support dynamic splits.
    # conv_width_index = K.repeat_elements(conv_width_index, conv_dims[1], axis=0)
    conv_width_index = K.tile(K.expand_dims(conv_width_index, 0), [conv_dims[0], 1])
    conv_width_index = K.flatten(K.transpose(conv_width_index))
    conv_index = K.transpose(K.stack([conv_height_index, conv_width_index]))
    conv_index = K.reshape(conv_index, [1, conv_dims[0], conv_dims[1], 1, 2])
    conv_index = K.cast(conv_index, K.dtype(feats))

    conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats))

    box_xy = (feats[..., :2] + conv_index) / conv_dims * 448
    box_wh = feats[..., 2:4] * 448

    return box_xy, box_wh


In [62]:
def yolo_loss(y_true, y_pred):
    label_class = y_true[..., :20]  # ? * 7 * 7 * 20
    label_box = y_true[..., 20:24]  # ? * 7 * 7 * 4
    response_mask = y_true[..., 24]  # ? * 7 * 7
    response_mask = K.expand_dims(response_mask)  # ? * 7 * 7 * 1

    predict_class = y_pred[..., :20]  # ? * 7 * 7 * 20
    predict_trust = y_pred[..., 20:22]  # ? * 7 * 7 * 2
    predict_box = y_pred[..., 22:]  # ? * 7 * 7 * 8

    _label_box = K.reshape(label_box, [-1, 7, 7, 1, 4])
    _predict_box = K.reshape(predict_box, [-1, 7, 7, 2, 4])

    label_xy, label_wh = yolo_head(_label_box)  # ? * 7 * 7 * 1 * 2, ? * 7 * 7 * 1 * 2
    label_xy = K.expand_dims(label_xy, 3)  # ? * 7 * 7 * 1 * 1 * 2
    label_wh = K.expand_dims(label_wh, 3)  # ? * 7 * 7 * 1 * 1 * 2
    label_xy_min, label_xy_max = xywh2minmax(label_xy, label_wh)  # ? * 7 * 7 * 1 * 1 * 2, ? * 7 * 7 * 1 * 1 * 2

    predict_xy, predict_wh = yolo_head(_predict_box)  # ? * 7 * 7 * 2 * 2, ? * 7 * 7 * 2 * 2
    predict_xy = K.expand_dims(predict_xy, 4)  # ? * 7 * 7 * 2 * 1 * 2
    predict_wh = K.expand_dims(predict_wh, 4)  # ? * 7 * 7 * 2 * 1 * 2
    predict_xy_min, predict_xy_max = xywh2minmax(predict_xy, predict_wh)  # ? * 7 * 7 * 2 * 1 * 2, ? * 7 * 7 * 2 * 1 * 2

    iou_scores = iou(predict_xy_min, predict_xy_max, label_xy_min, label_xy_max)  # ? * 7 * 7 * 2 * 1
    best_ious = K.max(iou_scores, axis=4)  # ? * 7 * 7 * 2
    best_box = K.max(best_ious, axis=3, keepdims=True)  # ? * 7 * 7 * 1

    box_mask = K.cast(best_ious >= best_box, K.dtype(best_ious))  # ? * 7 * 7 * 2

    no_object_loss = 0.5 * (1 - box_mask * response_mask) * K.square(0 - predict_trust)
    object_loss = box_mask * response_mask * K.square(1 - predict_trust)
    confidence_loss = no_object_loss + object_loss
    confidence_loss = K.sum(confidence_loss)

    class_loss = response_mask * K.square(label_class - predict_class)
    class_loss = K.sum(class_loss)

    _label_box = K.reshape(label_box, [-1, 7, 7, 1, 4])
    _predict_box = K.reshape(predict_box, [-1, 7, 7, 2, 4])

    label_xy, label_wh = yolo_head(_label_box) 
    predict_xy, predict_wh = yolo_head(_predict_box)
    box_mask = K.expand_dims(box_mask)
    response_mask = K.expand_dims(response_mask)

    box_loss = 5 * box_mask * response_mask * K.square((label_xy - predict_xy) / 448)
    box_loss += 5 * box_mask * response_mask * K.square((K.sqrt(label_wh) - K.sqrt(predict_wh)) / 448)
    box_loss = K.sum(box_loss)

    loss = confidence_loss + class_loss + box_loss

    return loss

In [63]:
model.compile(optimizer="adam",loss=yolo_loss)

In [71]:
checkpoint_path = "/content/drive/MyDrive/YOLO/VOC2007/checkpoint/"

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [None]:
hist = model.fit(train_gen,
               epochs=135,
               callbacks=[CustomLearningRateScheduler(lr_schedule), model_checkpoint_callback],
               validation_data = val_gen
        )