In [1]:
import cv2, numpy as np
import time
import math as mth
from PIL import Image, ImageDraw, ImageFont
import scipy.io
from keras.models import Sequential
from keras import initializations
from keras.initializations import normal, identity
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import RMSprop, SGD, Adam
from keras import backend as K
import random
from scipy import ndimage
from keras.preprocessing import image as keras_image_helper
from sklearn.preprocessing import OneHotEncoder
import math
import numpy, scipy
from scipy import interpolate
import scipy.ndimage
import xml.etree.ElementTree as ET
from tqdm.notebook import trange, tqdm

Using Theano backend.


## Helper functions to manipulate image and features

In [2]:
# Spatial side of the final region descriptor: descriptors are pooled or
# interpolated to feature_size x feature_size x p, p being the number of channels
feature_size = 7
# the relative scale reduction of the shallower feature map compared to the initial image input
scale_reduction_shallower_feature = 16
# the relative scale reduction of the deeper feature map compared to the initial image input
scale_reduction_deeper_feature = 32
# scaling of the input image; obtain_descriptor_from_feature_map multiplies the
# region width by factor_x_input and the region height by factor_y_input
factor_x_input = float(1)
factor_y_input = float(1)


def interpolate_2d_features(features):
    """Resample one 2-D feature channel to feature_size x feature_size.

    A degree-1 spline (kx=1, ky=1) is fitted on the integer index grid of
    `features` and evaluated on `feature_size` evenly spaced points spanning
    the same index range along each axis.
    """
    grid_0 = np.arange(features.shape[0])
    grid_1 = np.arange(features.shape[1])
    samples_0 = np.linspace(grid_0.min(), grid_0.max(), feature_size)
    samples_1 = np.linspace(grid_1.min(), grid_1.max(), feature_size)
    spline = interpolate.RectBivariateSpline(grid_0, grid_1, features, kx=1, ky=1)
    return spline(samples_0, samples_1)


def interpolate_3d_features(features):
    """Interpolate every channel of a channel-first feature volume.

    Each ``features[c]`` plane is resized with interpolate_2d_features and
    stored in a (512, feature_size, feature_size) array.

    NOTE(review): the channel count is hard-coded to 512; inputs with fewer
    channels leave the remaining planes at zero.
    """
    resized = np.zeros([512, feature_size, feature_size])
    for channel, plane in enumerate(features):
        resized[channel, :, :] = interpolate_2d_features(plane)
    return resized


def pop_layer(model):
    """Remove the last layer of `model` in place and return the model.

    After the removal the new last layer (if any) gets an empty
    ``outbound_nodes`` list and becomes the single model output; an emptied
    model has its outputs and node lists reset. ``model.built`` is cleared in
    both cases.

    Raises:
        Exception: if `model.outputs` is empty.
    """
    if not model.outputs:
        raise Exception('Sequential model cannot be popped: model is empty.')
    model.layers.pop()
    remaining = model.layers
    if remaining:
        new_last = remaining[-1]
        new_last.outbound_nodes = []
        model.outputs = [new_last.output]
    else:
        model.outputs = []
        model.inbound_nodes = []
        model.outbound_nodes = []
    model.built = False
    return model


def get_convolutional_vgg16_compiled(vgg_weights_path):
    """Build the compiled VGG-16 and pop its last six layers.

    With the vgg_16 defined in this file those six layers are the Flatten,
    Dense and Dropout layers that follow the last pooling layer.
    """
    model_vgg = obtain_compiled_vgg_16(vgg_weights_path)
    layers_left_to_drop = 6
    while layers_left_to_drop > 0:
        model_vgg = pop_layer(model_vgg)
        layers_left_to_drop -= 1
    return model_vgg


def get_feature_maps(model, img):
    """Return ``[shallower_map, deeper_map]`` computed for `img`."""
    shallower_map = get_feature_map_4(model, img)
    deeper_map = get_feature_map_8(model, img)
    return [shallower_map, deeper_map]


# get deeper feature map
def get_feature_map_8(model, im):
    """Evaluate `model` on `im` and return its final output as a feature map.

    Args:
        model: Keras model; ``model.outputs`` is evaluated.
        im: image array with channels on the last axis (height, width, 3),
            as required by the ``transpose((2, 0, 1))`` below. Presumably
            RGB - confirm against the image loader.

    Returns:
        The first model output for the single input image, batch axis removed.
    """
    im = im.astype(np.float32)
    # The input is channels-last, so the channel flip and the mean subtraction
    # must always act on the last axis. The former 'th' branch indexed axis 0
    # as channels, which flipped the image rows and shifted only rows 0-2.
    # 'RGB'->'BGR'
    im = im[:, :, ::-1]
    # Zero-center by mean pixel
    im[:, :, 0] -= 103.939
    im[:, :, 1] -= 116.779
    im[:, :, 2] -= 123.68
    # channels-first layout plus a leading batch axis of size 1
    im = im.transpose((2, 0, 1))
    im = np.expand_dims(im, axis=0)
    # NOTE(review): a backend function is rebuilt on every call; consider
    # caching it if this turns out to be a hot path.
    inputs = [K.learning_phase()] + model.inputs
    _convout1_f = K.function(inputs, model.outputs)
    # learning-phase flag 0 (test mode in Keras) followed by the image batch
    outputs = _convout1_f([0] + [im])
    # first output tensor, first (only) sample of the batch
    return np.asarray(outputs[0])[0]


# get shallower feature map
def get_feature_map_4(model, im):
    """Evaluate `model` on `im` and return the output of ``model.layers[23]``.

    With the vgg_16 defined in this file, index 23 is the fourth
    MaxPooling2D layer.

    Args:
        model: Keras model exposing ``inputs`` and ``layers``.
        im: image array with channels on the last axis (height, width, 3),
            as required by the ``transpose((2, 0, 1))`` below. Presumably
            RGB - confirm against the image loader.

    Returns:
        The layer output for the single input image, batch axis removed.
    """
    im = im.astype(np.float32)
    # The input is channels-last, so the channel flip and the mean subtraction
    # must always act on the last axis. The former 'th' branch indexed axis 0
    # as channels, which flipped the image rows and shifted only rows 0-2.
    # 'RGB'->'BGR'
    im = im[:, :, ::-1]
    # Zero-center by mean pixel
    im[:, :, 0] -= 103.939
    im[:, :, 1] -= 116.779
    im[:, :, 2] -= 123.68
    # channels-first layout plus a leading batch axis of size 1
    im = im.transpose((2, 0, 1))
    im = np.expand_dims(im, axis=0)
    # NOTE(review): a backend function is rebuilt on every call; consider
    # caching it if this turns out to be a hot path.
    inputs = [K.learning_phase()] + model.inputs
    _convout1_f = K.function(inputs, [model.layers[23].output])
    # learning-phase flag 0 (test mode in Keras) followed by the image batch
    outputs = _convout1_f([0] + [im])
    # first output tensor, first (only) sample of the batch
    return np.asarray(outputs[0])[0]


def crop_roi(feature_map, coordinates):
    """Crop a spatial window out of a channel-first feature map.

    Args:
        feature_map: array indexed as (channel, axis 1, axis 2).
        coordinates: ``(start_1, start_2, extent_1, extent_2)``; the values
            may be floats.

    Returns:
        ``feature_map[:, start_1:start_1 + extent_1, start_2:start_2 + extent_2]``
        with every bound truncated to an integer.
    """
    # NumPy rejects float slice bounds, and the caller derives these values
    # from divisions, so each bound is truncated explicitly.
    start_1 = int(coordinates[0])
    stop_1 = int(coordinates[0] + coordinates[2])
    start_2 = int(coordinates[1])
    stop_2 = int(coordinates[1] + coordinates[3])
    return feature_map[:, start_1:stop_1, start_2:stop_2]


# this method decides whether to use the deeper or the shallower feature map
# and then crops and interpolates if necessary the features to obtain a final descriptor of 7x7xp
def obtain_descriptor_from_feature_map(feature_maps, region_coordinates):
    """Build the fixed-size descriptor of a region from precomputed feature maps.

    Args:
        feature_maps: ``[shallower_map, deeper_map]``, each indexed as
            (channel, axis 1, axis 2).
        region_coordinates: ``(x, y, width, height)`` of the region in input
            image units.

    Returns:
        The result of interpolate_3d_features when the cropped window is
        smaller than feature_size along either spatial axis, otherwise the
        result of extract_features_from_roi.

    NOTE(review): those two helpers order their output axes differently
    (channels first vs. channels last) - confirm that consumers flatten
    them consistently.
    """
    initial_width = region_coordinates[2]*factor_x_input
    initial_height = region_coordinates[3]*factor_y_input
    # region side length relative to the descriptor side length
    scale_aux = math.sqrt(initial_height*initial_width)/math.sqrt(feature_size*feature_size)
    if scale_aux > scale_reduction_deeper_feature:
        scale = scale_reduction_deeper_feature
        feature_map = feature_maps[1]
    else:
        scale = scale_reduction_shallower_feature
        feature_map = feature_maps[0]
    # window size in feature-map units, never smaller than the descriptor
    new_width = max(initial_width/scale, feature_size)
    new_height = max(initial_height/scale, feature_size)
    xo = region_coordinates[0]/scale
    yo = region_coordinates[1]/scale
    # read the spatial extents directly instead of copying the whole map
    map_shape = np.shape(feature_map)
    # shift the window back inside the map, then clamp at the origin
    if new_width + xo > map_shape[1]:
        xo = map_shape[1] - new_width
    if new_height + yo > map_shape[2]:
        yo = map_shape[2] - new_height
    if xo < 0:
        xo = 0
    if yo < 0:
        yo = 0
    # Slice bounds must be integers: truncate start and stop separately and
    # hand crop_roi an integer (start, start, extent, extent) window.
    x_start, x_stop = int(xo), int(xo + new_width)
    y_start, y_stop = int(yo), int(yo + new_height)
    new_coordinates = np.array([x_start, y_start, x_stop - x_start, y_stop - y_start])
    roi = crop_roi(feature_map, new_coordinates)
    # `or` replaces the former `a < fs & b < fs` test, in which `&` bound
    # tighter than `<`; the selected branch is the same for every input.
    if roi.shape[1] < feature_size or roi.shape[2] < feature_size:
        features = interpolate_3d_features(roi)
    else:
        features = extract_features_from_roi(roi)
    return features


# ROI-pooling features
def extract_features_from_roi(roi):
    """Max-pool a channel-first ROI into a feature_size x feature_size grid.

    The two spatial axes of `roi` are split into feature_size bins each; the
    last bin along an axis extends to the end of that axis so leftover
    cells are not dropped.

    Args:
        roi: array indexed as (channel, axis 1, axis 2). Only the first 512
            channels are read (hard-coded, as before).

    Returns:
        Array of shape (feature_size, feature_size, 512) holding the
        per-bin, per-channel maxima.
    """
    roi_width = roi.shape[1]
    roi_height = roi.shape[2]
    # floor division: bin sizes are used as slice bounds and must be integers
    bin_width = roi_width // feature_size
    bin_height = roi_height // feature_size
    pooled_values = np.zeros([feature_size, feature_size, 512])
    last_bin = feature_size - 1
    for i in range(feature_size):
        x_start = i * bin_width
        x_stop = roi_width if i == last_bin else (i + 1) * bin_width
        for k in range(feature_size):
            y_start = k * bin_height
            y_stop = roi_height if k == last_bin else (k + 1) * bin_height
            patch = roi[:512, x_start:x_stop, y_start:y_stop]
            # maximum over both spatial axes, one value per channel
            pooled_values[i, k, :] = np.max(patch, axis=(1, 2))
    return pooled_values


def calculate_all_initial_feature_maps(images, model, image_names):
    """Compute the feature maps of every image.

    One get_feature_maps result is produced per entry counted by
    ``np.size(image_names)``, using ``images[z]`` converted to an array.
    """
    return [get_feature_maps(model, np.array(images[z]))
            for z in range(np.size(image_names))]


def get_image_descriptor_for_image(image, model):
    """Evaluate ``model.layers[33]`` on `image` resized to 224x224.

    With the vgg_16 defined in this file, index 33 is the Dropout layer that
    follows the first 4096-unit Dense layer.

    Args:
        image: image array accepted by ``cv2.resize``; channels end up on
            the last axis. Presumably RGB - confirm against the loader.
        model: Keras model exposing ``inputs`` and ``layers``.

    Returns:
        The list returned by the backend function (a single output array).
    """
    im = cv2.resize(image, (224, 224)).astype(np.float32)
    # The resized image is channels-last, so the channel flip and the mean
    # subtraction must always act on the last axis. The former 'th' branch
    # indexed axis 0 as channels, which flipped the image rows and shifted
    # only rows 0-2.
    # 'RGB'->'BGR'
    im = im[:, :, ::-1]
    # Zero-center by mean pixel
    im[:, :, 0] -= 103.939
    im[:, :, 1] -= 116.779
    im[:, :, 2] -= 123.68
    # channels-first layout plus a leading batch axis of size 1
    im = im.transpose((2, 0, 1))
    im = np.expand_dims(im, axis=0)
    inputs = [K.learning_phase()] + model.inputs
    _convout1_f = K.function(inputs, [model.layers[33].output])
    # learning-phase flag 0 (test mode in Keras) followed by the image batch
    return _convout1_f([0] + [im])


def get_conv_image_descriptor_for_image(image, model):
    """Evaluate ``model.layers[31]`` on `image` resized to 224x224.

    With the vgg_16 defined in this file, index 31 is the Flatten layer that
    follows the last pooling layer.

    Args:
        image: image array accepted by ``cv2.resize``; channels end up on
            the last axis. Presumably RGB - confirm against the loader.
        model: Keras model exposing ``inputs`` and ``layers``.

    Returns:
        The list returned by the backend function (a single output array).
    """
    im = cv2.resize(image, (224, 224)).astype(np.float32)
    # The resized image is channels-last, so the channel flip and the mean
    # subtraction must always act on the last axis. The former 'th' branch
    # indexed axis 0 as channels, which flipped the image rows and shifted
    # only rows 0-2.
    # 'RGB'->'BGR'
    im = im[:, :, ::-1]
    # Zero-center by mean pixel
    im[:, :, 0] -= 103.939
    im[:, :, 1] -= 116.779
    im[:, :, 2] -= 123.68
    # channels-first layout plus a leading batch axis of size 1
    im = im.transpose((2, 0, 1))
    im = np.expand_dims(im, axis=0)
    inputs = [K.learning_phase()] + model.inputs
    _convout1_f = K.function(inputs, [model.layers[31].output])
    # learning-phase flag 0 (test mode in Keras) followed by the image batch
    return _convout1_f([0] + [im])


def obtain_compiled_vgg_16(vgg_weights_path):
    """Build VGG-16 from `vgg_weights_path` and compile it.

    The model is compiled with Nesterov-momentum SGD and a categorical
    cross-entropy loss.
    """
    network = vgg_16(vgg_weights_path)
    network.compile(
        optimizer=SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True),
        loss='categorical_crossentropy',
    )
    return network


def vgg_16(weights_path=None):
    """Assemble the VGG-16 network as a Keras ``Sequential`` model.

    Five convolutional stages (zero-padded 3x3 ReLU convolutions followed by
    2x2 max pooling) feed a Flatten layer, two 4096-unit ReLU Dense layers
    with 0.5 dropout, and a 1000-way softmax. The input shape is
    (3, 224, 224).

    Args:
        weights_path: optional weights file; loaded when truthy.
    """
    # (number of filters, number of convolutions) for each stage
    conv_stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    model = Sequential()
    is_first_layer = True
    for filters, repeats in conv_stages:
        for _ in range(repeats):
            if is_first_layer:
                # only the very first layer declares the input shape
                model.add(ZeroPadding2D((1, 1), input_shape=(3, 224, 224)))
                is_first_layer = False
            else:
                model.add(ZeroPadding2D((1, 1)))
            model.add(Convolution2D(filters, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(4096, activation='relu'))
        model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))

    if weights_path:
        model.load_weights(weights_path)

    return model

## Helper Functions to deal with reinforcement learning

In [3]:
# Different actions that the agent can do
number_of_actions = 6
# Actions captured in the history vector
actions_of_history = 4
# Visual descriptor size
visual_descriptor_size = 25088
# Reward movement action
reward_movement_action = 1
# Reward terminal action
reward_terminal_action = 3
# IoU required to consider a positive detection
iou_threshold = 0.5


def update_history_vector(history_vector, action):
    """Record `action` in the one-hot action history.

    The history holds ``actions_of_history`` consecutive one-hot slots of
    ``number_of_actions`` entries each.

    Args:
        history_vector: flat history of length
            ``number_of_actions * actions_of_history``.
        action: 1-based action id; ``action - 1`` is the index set inside
            the slot (so 0 would wrap around to the last entry).

    Returns:
        While free slots remain, `history_vector` itself, updated in place
        with the action written to the first free slot. Once the history is
        full, a new array in which the oldest slot has been dropped and the
        action appended as the newest slot.
    """
    action_vector = np.zeros(number_of_actions)
    action_vector[action - 1] = 1
    # every recorded action contributes exactly one non-zero entry
    recorded_actions = np.count_nonzero(history_vector)
    if recorded_actions < actions_of_history:
        start = number_of_actions * recorded_actions
        # Copy the whole slot. The previous loop stopped one entry short, so
        # the last action (index number_of_actions - 1) was never recorded.
        history_vector[start:start + number_of_actions] = action_vector
        return history_vector
    updated_history_vector = np.zeros(number_of_actions * actions_of_history)
    # Shift every older slot one position towards the front. The previous
    # loop also stopped one entry short and dropped the final component.
    updated_history_vector[:-number_of_actions] = history_vector[number_of_actions:]
    updated_history_vector[-number_of_actions:] = action_vector
    return updated_history_vector


def get_state(image, history_vector, model_vgg):
    """Stack the visual descriptor of `image` on top of the action history.

    Returns a column vector with ``visual_descriptor_size`` descriptor rows
    followed by ``number_of_actions * actions_of_history`` history rows.
    """
    visual_column = np.reshape(
        get_conv_image_descriptor_for_image(image, model_vgg),
        (visual_descriptor_size, 1),
    )
    history_column = np.reshape(
        history_vector,
        (number_of_actions*actions_of_history, 1),
    )
    return np.vstack((visual_column, history_column))


def get_state_pool45(history_vector, region_descriptor):
    """Stack `region_descriptor` on top of the 24-entry history column.

    `history_vector` is reshaped to (24, 1) and appended below
    `region_descriptor` with ``np.vstack``.
    """
    history_column = np.reshape(history_vector, (24, 1))
    stacked_state = np.vstack((region_descriptor, history_column))
    return stacked_state


def get_reward_movement(iou, new_iou):
    """Reward for a movement action.

    Returns ``+reward_movement_action`` when `new_iou` is strictly greater
    than `iou`, otherwise ``-reward_movement_action``.
    """
    improved = new_iou > iou
    return reward_movement_action if improved else - reward_movement_action


def get_reward_trigger(new_iou):
    """Reward for the terminal (trigger) action.

    Returns ``+reward_terminal_action`` when `new_iou` is strictly greater
    than ``iou_threshold``, otherwise ``-reward_terminal_action``.
    """
    detected = new_iou > iou_threshold
    return reward_terminal_action if detected else - reward_terminal_action


def get_q_network(weights_path):
    """Build and compile the Q-network, optionally loading stored weights.

    The network maps a 25112-dimensional state to 6 linear outputs through
    two 1024-unit ReLU layers with 0.2 dropout. All Dense layers use a
    normal initializer with scale 0.01. It is compiled with Adam (lr=1e-6)
    and an MSE loss.

    Args:
        weights_path: weights file to load, or the string "0" to keep the
            freshly initialized weights.
    """
    small_normal = lambda shape, name: normal(shape, scale=0.01, name=name)

    q_network = Sequential()
    q_network.add(Dense(1024, init=small_normal, input_shape=(25112,)))
    q_network.add(Activation('relu'))
    q_network.add(Dropout(0.2))
    q_network.add(Dense(1024, init=small_normal))
    q_network.add(Activation('relu'))
    q_network.add(Dropout(0.2))
    q_network.add(Dense(6, init=small_normal))
    q_network.add(Activation('linear'))
    q_network.compile(loss='mse', optimizer=Adam(lr=1e-6))
    if weights_path != "0":
        q_network.load_weights(weights_path)
    return q_network


def get_array_of_q_networks_for_pascal(weights_path, class_object):
    """Create one Q-network per PASCAL category (20 in total).

    Args:
        weights_path: folder holding stored weights, or "0" to initialize
            every network from scratch.
        class_object: 1-based id of the category whose stored weights are
            loaded; every other network is freshly initialized.

    Returns:
        ``np.array([networks])`` - the list wrapped once more, so callers
        index it as ``models[0][category]``.

    NOTE(review): the weights file name ends in "h5" without a dot;
    presumably this matches the name used when the weights are saved -
    verify before changing it.
    """
    q_networks = []
    for class_index in range(20):
        use_stored_weights = weights_path != "0" and class_index == (class_object-1)
        if use_stored_weights:
            source = weights_path + "/model" + str(class_index) + "h5"
        else:
            source = "0"
        q_networks.append(get_q_network(source))
    return np.array([q_networks])

## Helper functions to visualize

In [4]:
# TrueType font (size 32) used by the draw_sequences* helpers for the text
# they write onto the visualization canvas
path_font = "../fonts/FreeMono.ttf"
font = ImageFont.truetype(path_font, 32)


def string_for_action(action):
    """Return the display label of an action id (0-6), or None if unknown."""
    labels = {
        0: "START",
        1: 'up-left',
        2: 'up-right',
        3: 'down-left',
        4: 'down-right',
        5: 'center',
        6: 'TRIGGER',
    }
    return labels.get(action)


def draw_sequences(i, k, step, action, draw, region_image, background, path_testing_folder, iou, reward,
                   gt_mask, region_mask, image_name, save_boolean):
    """Draw one search step onto the visualization canvas.

    Each step occupies a column that starts at x = 1000 * step. From top to
    bottom it holds the region image, a footnote with the action, reward and
    IoU, the ground-truth mask and the region mask.

    Args:
        i, k: indices appended to the output file name (the caller passes
            the epoch and the object index).
        draw: ``ImageDraw`` handle bound to `background`.
        save_boolean: the canvas is saved to disk only when this equals 1.

    Returns:
        `background`, modified in place.
    """
    column_x = 1000 * step
    gt_mask_image = Image.fromarray(255 * gt_mask)
    region_mask_image = Image.fromarray(255 * region_mask)
    footnote = ('action: ' + string_for_action(action) + ' ' + 'reward: ' + str(reward)
                + ' Iou:' + str(iou))
    draw.text((column_x, 550), footnote, (0, 0, 0), font=font)
    background.paste(Image.fromarray(region_image), (column_x, 70))
    background.paste(gt_mask_image, (column_x, 700))
    background.paste(region_mask_image, (column_x, 1400))
    file_name = path_testing_folder + '/' + image_name + str(i) + '_object_' + str(k) + '.png'
    if save_boolean == 1:
        background.save(file_name)
    return background


def draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder,
                        region_mask, image_name, save_boolean):
    """Draw one test-time search step onto the visualization canvas.

    The column starting at x = 1000 * step receives the region image, the
    rounded Q-values, the chosen action and the region mask.

    Args:
        qval: Q-value predictions; row 0 is rounded to two decimals for
            display.
        draw: ``ImageDraw`` handle bound to `background`.
        save_boolean: the canvas is saved to disk only when this equals 1.

    Returns:
        `background`, modified in place.
    """
    column_x = 1000 * step
    region_pixels = np.asarray(region_image, np.uint8)
    background.paste(Image.fromarray(region_pixels), (column_x, 70))
    background.paste(Image.fromarray(255 * region_mask), (column_x, 700))
    footnote = 'action: ' + str(action)
    rounded_q_values = [round(value, 2) for value in qval.copy()[0]]
    draw.text((column_x, 550), footnote, (0, 0, 0), font=font)
    draw.text((column_x, 500), str(rounded_q_values), (0, 0, 0), font=font)
    file_name = path_testing_folder + image_name + '.png'
    if save_boolean == 1:
        background.save(file_name)
    return background

In [5]:

def calculate_iou(img_mask, gt_mask):
    """Intersection over union of two masks.

    Any non-zero cell counts as part of a mask. Neither input is modified.

    Args:
        img_mask: mask of the observed region.
        gt_mask: ground-truth mask of the same shape.

    Returns:
        ``|img_mask AND gt_mask| / |img_mask OR gt_mask|`` as a float, or
        0.0 when both masks are empty.
    """
    # Logical operations on booleans replace the bitwise operations on raw
    # float bit patterns, and the caller's array is no longer scaled in place.
    region = np.asarray(img_mask) != 0
    truth = np.asarray(gt_mask) != 0
    union = np.count_nonzero(np.logical_or(region, truth))
    if union == 0:
        # both masks empty: define the IoU as 0 instead of dividing by zero
        return 0.0
    intersection = np.count_nonzero(np.logical_and(region, truth))
    return float(intersection) / float(union)


def calculate_overlapping(img_mask, gt_mask):
    """Fraction of the ground-truth mask covered by `img_mask`.

    Any non-zero cell counts as part of a mask. Neither input is modified.

    Args:
        img_mask: mask of the observed region.
        gt_mask: ground-truth mask of the same shape.

    Returns:
        ``|img_mask AND gt_mask| / |gt_mask|`` as a float, or 0.0 when
        `gt_mask` is empty.
    """
    # Logical operations on booleans replace the bitwise operations on raw
    # float bit patterns, and the caller's array is no longer scaled in place.
    region = np.asarray(img_mask) != 0
    truth = np.asarray(gt_mask) != 0
    truth_area = np.count_nonzero(truth)
    if truth_area == 0:
        # empty ground truth: nothing to cover, avoid dividing by zero
        return 0.0
    covered = np.count_nonzero(np.logical_and(region, truth))
    return float(covered) / float(truth_area)


def follow_iou(gt_masks, mask, array_classes_gt_objects, object_id, last_matrix, available_objects):
    """Find the ground-truth object of class `object_id` best covered by `mask`.

    For each ground-truth object of the requested class the entry in the
    result column is its IoU with `mask` when the object is still available
    (flag equal to 1) and -1 otherwise; objects of other classes keep 0.

    Returns:
        ``(iou, new_iou, results, ind)`` where `ind` is the index of the
        largest entry, `new_iou` that largest entry (a one-element row),
        `iou` the value `last_matrix` holds at `ind`, and `results` the
        full column of scores.
    """
    num_objects = np.size(array_classes_gt_objects)
    results = np.zeros([num_objects, 1])
    for idx in range(num_objects):
        if array_classes_gt_objects[idx] != object_id:
            continue
        if available_objects[idx] == 1:
            results[idx] = calculate_iou(mask, gt_masks[:, :, idx])
        else:
            results[idx] = -1
    new_iou = max(results)
    ind = np.argmax(results)
    return last_matrix[ind], new_iou, results, ind

## Random Image Utilities

In [6]:
def get_all_ids(annotations):
    """Return the class-id column of every annotation, in order."""
    return [get_ids_objects_from_annotation(annotations[idx])
            for idx in range(len(annotations))]


def get_all_images(image_names, path_voc):
    """Load every JPEG named in the first row of `image_names`.

    Files are read from ``path_voc + '/JPEGImages/' + name + '.jpg'``; the
    names are taken from ``image_names[0]``.
    """
    loaded = []
    for position in range(np.size(image_names)):
        jpeg_path = path_voc + '/JPEGImages/' + image_names[0][position] + '.jpg'
        loaded.append(keras_image_helper.load_img(jpeg_path, False))
    return loaded


def get_all_images_pool(image_names, path_voc):
    """Load every JPEG named in the flat sequence `image_names`.

    Files are read from ``path_voc + '/JPEGImages/' + name + '.jpg'``.
    """
    loaded = []
    for position in range(np.size(image_names)):
        jpeg_path = path_voc + '/JPEGImages/' + image_names[position] + '.jpg'
        loaded.append(keras_image_helper.load_img(jpeg_path, False))
    return loaded


def load_images_names_in_data_set(data_set_name, path_voc):
    """Read the image names listed in a VOC image-set file.

    Args:
        data_set_name: file stem inside ``<path_voc>/ImageSets/Main``.
        path_voc: root folder of the VOC data set.

    Returns:
        One string per line of the file. For sets whose name starts with
        "aeroplane", "bird" or "cow", only the first whitespace-separated
        token of each line is kept (the image name, without the label that
        follows it).
    """
    file_path = path_voc + '/ImageSets/Main/' + data_set_name + '.txt'
    # the context manager closes the file, which was previously left open
    with open(file_path) as f:
        image_names = [line.strip('\n') for line in f]
    if data_set_name.startswith(("aeroplane", "bird", "cow")):
        return [x.split(None, 1)[0] for x in image_names]
    return image_names


def load_images_labels_in_data_set(data_set_name, path_voc):
    """Read the per-image labels of a VOC image-set file.

    Each line is split once on whitespace; the part after the first token
    is returned with newlines stripped.

    Args:
        data_set_name: file stem inside ``<path_voc>/ImageSets/Main``.
        path_voc: root folder of the VOC data set.

    Raises:
        IndexError: if a line holds no second token.
    """
    file_path = path_voc + '/ImageSets/Main/' + data_set_name + '.txt'
    # the context manager closes the file, which was previously left open
    with open(file_path) as f:
        return [line.split(None, 1)[1].strip('\n') for line in f]


def mask_image_with_mean_background(mask_object_found, image):
    """Overwrite the masked pixels of `image` with the mean pixel value.

    Every position where `mask_object_found` equals 1 gets
    (103.939, 116.779, 123.68) written to channels 0, 1 and 2.

    Returns:
        `image` itself - the input array is modified in place.
    """
    rows, cols = np.nonzero(np.asarray(mask_object_found) == 1)
    image[rows, cols, 0] = 103.939
    image[rows, cols, 1] = 116.779
    image[rows, cols, 2] = 123.68
    return image

## VOC toolkit helper functions

In [7]:
def get_bb_of_gt_from_pascal_xml_annotation(xml_name, voc_path):
    """Parse the ground-truth boxes of one PASCAL VOC annotation file.

    Reads ``voc_path + '/Annotations/' + xml_name + '.xml'`` and collects,
    for every top-level ``object`` element, its ``name`` and the corners of
    its direct ``bndbox`` child.

    Returns:
        Array with one row per object: ``[class_id, xmin, xmax, ymin, ymax]``.
    """
    tree = ET.parse(voc_path + '/Annotations/' + xml_name + '.xml')
    names = []
    corners = {'xmin': [], 'xmax': [], 'ymin': [], 'ymax': []}
    for element in tree.getroot():
        if element.tag != 'object':
            continue
        for field in element:
            if field.tag == 'name':
                names.append(field.text)
            elif field.tag == 'bndbox':
                for corner in field:
                    if corner.tag in corners:
                        corners[corner.tag].append(corner.text)
    num_objects = np.size(names)
    category_and_bb = np.zeros([num_objects, 5])
    for row in range(num_objects):
        category_and_bb[row][0] = get_id_of_class_name(names[row])
        category_and_bb[row][1] = corners['xmin'][row]
        category_and_bb[row][2] = corners['xmax'][row]
        category_and_bb[row][3] = corners['ymin'][row]
        category_and_bb[row][4] = corners['ymax'][row]
    return category_and_bb


def get_all_annotations(image_names, voc_path):
    """Parse the annotation of every image named in ``image_names[0]``."""
    return [get_bb_of_gt_from_pascal_xml_annotation(image_names[0][idx], voc_path)
            for idx in range(np.size(image_names))]


def generate_bounding_box_from_annotation(annotation, image_shape):
    """Rasterize every annotated box into its own binary mask.

    Args:
        annotation: rows of ``[class_id, xmin, xmax, ymin, ymax]``.
        image_shape: shape whose first two entries size the masks.

    Returns:
        Array of shape (image_shape[0], image_shape[1], number of boxes);
        slice ``[:, :, i]`` is 1 on rows ymin:ymax and columns xmin:xmax of
        box i and 0 elsewhere.
    """
    num_boxes = annotation.shape[0]
    masks = np.zeros([image_shape[0], image_shape[1], num_boxes])
    for box_index, box in enumerate(annotation):
        x_min, x_max, y_min, y_max = (int(value) for value in box[1:5])
        masks[y_min:y_max, x_min:x_max, box_index] = 1
    return masks


def get_ids_objects_from_annotation(annotation):
    """Return the first column of `annotation`, which holds the class ids."""
    class_id_column = annotation[:, 0]
    return class_id_column


def get_id_of_class_name(class_name):
    """Map a PASCAL VOC class name to its 1-based id (1-20).

    Returns None for a name that is not one of the twenty classes.
    """
    pascal_classes = (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    )
    ids_by_name = {name: class_id for class_id, name in enumerate(pascal_classes, start=1)}
    return ids_by_name.get(class_name)

## Main Function to test

In [14]:
# path of the PASCAL VOC database used for training (VOC2007 here)
path_voc = "../data/voc/VOC2007/"
# path of other PASCAL VOC dataset, if you want to train with 2007 and 2012 train datasets
path_voc2 = "../data/voc/VOC2012/"
# path of where to store the models
path_model = "../model/models_image_zooms"
# path of where to store visualizations of search sequences
path_testing_folder = '../results/voc/train/'
# path of VGG16 weights
path_vgg = "../model/vgg16_weights.h5"

######## PARAMETERS ########
# Epoch to start from; 0 trains from scratch, any other value loads stored weights
epochs_id = 0
# Class category of PASCAL that the RL agent will be searching
class_object = 1
# Scale of subregion for the hierarchical regions (to deal with 2/4, 3/4)
scale_subregion = float(3)/4
scale_mask = float(1)/(scale_subregion*4)
# 1 if you want to obtain visualizations of the search for objects
bool_draw = 0
# How many steps can run the agent until finding one object
number_of_steps = 10
# Boolean to indicate if you want to use the two databases, or just one
two_databases = 0
epochs = 50
gamma = 0.90
epsilon = 1
batch_size = 100
# Pointer to where to store the last experience in the experience replay buffer,
# actually there is a pointer for each PASCAL category, in case all categories
# are trained at the same time
h = np.zeros([20])
# Each replay memory (one for each possible category) has a capacity of 1000 experiences
buffer_experience_replay = 1000
# Init replay memories
replay = [[] for i in range(20)]
reward = 0

######## MODELS ########

model_vgg = obtain_compiled_vgg_16(path_vgg)

# If you want to train it from first epoch, first option is selected. Otherwise,
# when making checkpointing, weights of last stored weights are loaded for a particular class object

if epochs_id == 0:
    models = get_array_of_q_networks_for_pascal("0", class_object)
else:
    models = get_array_of_q_networks_for_pascal(path_model, class_object)

######## LOAD IMAGE NAMES ########

if two_databases == 1:
    image_names1 = np.array([load_images_names_in_data_set('trainval', path_voc)])
    image_names2 = np.array([load_images_names_in_data_set('trainval', path_voc2)])
    # Both arrays have shape (1, N); join them along the name axis so the
    # result stays a single row indexable as image_names[0][j]. Joining along
    # axis 0 fails when the two sets differ in size.
    image_names = np.concatenate([image_names1, image_names2], axis=1)
else:
    image_names = np.array([load_images_names_in_data_set('trainval', path_voc)])

######## LOAD IMAGES ########

if two_databases == 1:
    images1 = get_all_images(image_names1, path_voc)
    images2 = get_all_images(image_names2, path_voc2)
    # get_all_images returns plain lists; list concatenation keeps the loaded
    # images as they are instead of asking NumPy to stack images of
    # differing sizes.
    images = images1 + images2
else:
    images = get_all_images(image_names, path_voc)

In [21]:
for i in range(epochs_id, epochs_id + epochs):
    for j in trange(np.size(image_names)):
        masked = 0
        not_finished = 1
        image = np.array(images[j])
        image_name = image_names[0][j]
        annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc)
        if two_databases == 1:
            if j < np.size(image_names1):
                annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc)
            else:
                annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc2)
        gt_masks = generate_bounding_box_from_annotation(annotation, image.shape)
        array_classes_gt_objects = get_ids_objects_from_annotation(annotation)
        region_mask = np.ones([image.shape[0], image.shape[1]])
        shape_gt_masks = np.shape(gt_masks)
        available_objects = np.ones(np.size(array_classes_gt_objects))
        # Iterate through all the objects in the ground truth of an image
        for k in range(np.size(array_classes_gt_objects)):
            # Init visualization
            background = Image.new('RGBA', (10000, 2500), (255, 255, 255, 255))
            draw = ImageDraw.Draw(background)
            # We check whether the ground truth object is of the target class category
            if array_classes_gt_objects[k] == class_object:
                gt_mask = gt_masks[:, :, k]
                step = 0
                new_iou = 0
                # this matrix stores the IoU of each object of the ground-truth, just in case
                # the agent changes of observed object
                last_matrix = np.zeros([np.size(array_classes_gt_objects)])
                region_image = image
                offset = (0, 0)
                size_mask = (image.shape[0], image.shape[1])
                original_shape = size_mask
                old_region_mask = region_mask
                region_mask = np.ones([image.shape[0], image.shape[1]])
                # If the ground truth object is already masked by other already found masks, do not
                # use it for training
                if masked == 1:
                    for p in range(gt_masks.shape[2]):
                        overlap = calculate_overlapping(old_region_mask, gt_masks[:, :, p])
                        if overlap > 0.60:
                            available_objects[p] = 0
                # We check if there are still objects to be found
                if np.count_nonzero(available_objects) == 0:
                    not_finished = 0
                # follow_iou determines, at each time step, which ground truth object overlaps
                # the most with the visual region, so that we can calculate the rewards appropriately
                iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, array_classes_gt_objects,
                                                              class_object, last_matrix, available_objects)
                new_iou = iou
                gt_mask = gt_masks[:, :, index]
                # init of the history vector that indicates past actions (6 actions * 4 steps in the memory)
                history_vector = np.zeros([24])
                # computation of the initial state
                state = get_state(region_image, history_vector, model_vgg)
                # status indicates whether the agent is still alive and has not triggered the terminal action
                status = 1
                action = 0
                reward = 0
                if step > number_of_steps:
                    background = draw_sequences(i, k, step, action, draw, region_image, background,
                                                path_testing_folder, iou, reward, gt_mask, region_mask, image_name,
                                                bool_draw)
                    step += 1
                while (status == 1) & (step < number_of_steps) & not_finished:
                    category = int(array_classes_gt_objects[k]-1)
                    model = models[0][category]
                    qval = model.predict(state.T, batch_size=1)
                    background = draw_sequences(i, k, step, action, draw, region_image, background,
                                                path_testing_folder, iou, reward, gt_mask, region_mask, image_name,
                                                bool_draw)
                    step += 1
                    # we force terminal action in case actual IoU is higher than 0.5, to train faster the agent
                    if (i < 100) & (new_iou > 0.5):
                        action = 6
                    # epsilon-greedy policy
                    elif random.random() < epsilon:
                        action = np.random.randint(1, 7)
                    else:
                        action = (np.argmax(qval))+1
                    # terminal action: no crop is performed, the reward comes from get_reward_trigger
                    if action == 6:
                        iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask,
                                                                      array_classes_gt_objects, class_object,
                                                                      last_matrix, available_objects)
                        gt_mask = gt_masks[:, :, index]
                        reward = get_reward_trigger(new_iou)
                        background = draw_sequences(i, k, step, action, draw, region_image, background,
                                                    path_testing_folder, iou, reward, gt_mask, region_mask,
                                                    image_name, bool_draw)
                        step += 1
                    # movement action, we perform the crop of the corresponding subregion
                    else:
                        region_mask = np.zeros(original_shape)
                        # the new window is the current one shrunk by scale_subregion on both axes
                        size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion)
                        # displacement unit along each axis, measured on the already shrunk window
                        shift_axis0 = size_mask[0] * scale_mask
                        shift_axis1 = size_mask[1] * scale_mask
                        # (axis-0 shift, axis-1 shift) of the sub-window inside the current region:
                        # actions 1-4 are the four corner windows, action 5 is the centred window.
                        # Each axis uses its own extent; the centred window previously derived its
                        # axis-1 shift from size_mask[0], which mis-placed the crop on non-square
                        # regions.
                        # NOTE(review): any inference loop reproducing these crops must use the
                        # same geometry - confirm.
                        shifts = {
                            1: (0, 0),
                            2: (0, shift_axis1),
                            3: (shift_axis0, 0),
                            4: (shift_axis0, shift_axis1),
                            5: (shift_axis0 / 2, shift_axis1 / 2),
                        }
                        # offset_aux is relative to the current region_image, offset is the
                        # accumulated position used to paint region_mask
                        offset_aux = shifts[action]
                        offset = (offset[0] + offset_aux[0], offset[1] + offset_aux[1])
                        region_image = region_image[int(offset_aux[0]):int(offset_aux[0] + size_mask[0]),
                                       int(offset_aux[1]):int(offset_aux[1] + size_mask[1])]
                        region_mask[int(offset[0]):int(offset[0] + size_mask[0]),
                                    int(offset[1]):int(offset[1] + size_mask[1])] = 1
                        iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask,
                                                                      array_classes_gt_objects, class_object,
                                                                      last_matrix, available_objects)
                        gt_mask = gt_masks[:, :, index]
                        # movement reward compares the IoU before and after the move
                        reward = get_reward_movement(iou, new_iou)
                        iou = new_iou
                    history_vector = update_history_vector(history_vector, action)
                    new_state = get_state(region_image, history_vector, model_vgg)
                    # Experience replay storage
                    if len(replay[category]) < buffer_experience_replay:
                        # buffer still filling up: store the transition, no training yet
                        replay[category].append((state, action, reward, new_state))
                    else:
                        # buffer full: h[category] is the write cursor, advanced with wrap-around
                        if h[category] < (buffer_experience_replay-1):
                            h[category] += 1
                        else:
                            h[category] = 0
                        h_aux = int(h[category])
                        replay[category][h_aux] = (state, action, reward, new_state)
                        minibatch = random.sample(replay[category], batch_size)
                        X_train = []
                        y_train = []
                        # we pick from the replay memory a sampled minibatch and generate the training samples.
                        # The replayed transition is unpacked into its own names so that the agent's
                        # live `action`, `reward` and `new_state` are not overwritten by a random memory.
                        for old_state, mem_action, mem_reward, mem_new_state in minibatch:
                            old_qval = model.predict(old_state.T, batch_size=1)
                            newQ = model.predict(mem_new_state.T, batch_size=1)
                            maxQ = np.max(newQ)
                            # target = current prediction, with only the taken action's entry replaced
                            y = old_qval.T
                            if mem_action != 6:  # non-terminal state
                                update = mem_reward + (gamma * maxQ)
                            else:  # terminal state
                                update = mem_reward
                            y[mem_action-1] = update  # target output
                            X_train.append(old_state)
                            y_train.append(y)
                        # stack the samples and drop the trailing singleton axis of each column vector
                        X_train = np.array(X_train).astype("float32")[:, :, 0]
                        y_train = np.array(y_train).astype("float32")[:, :, 0]
                        hist = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, verbose=0)
                        models[0][category] = model
                    # Advance to the state actually reached by the agent on every step, including
                    # while the replay buffer is still filling.
                    state = new_state
                    if action == 6:
                        status = 0
                        masked = 1
                        # we mask object found with ground-truth so that agent learns faster
                        image = mask_image_with_mean_background(gt_mask, image)
                    else:
                        masked = 0
                available_objects[index] = 0
    # Linear epsilon annealing: one 0.1 step each time this point is reached, floored at 0.1.
    # The floor is enforced with max() because repeated float subtraction drifts: 1.0 minus
    # 0.1 nine times gives 0.10000000000000014, which still passes the `> 0.1` test and
    # would otherwise be pushed down to ~1e-16, switching exploration off entirely.
    if epsilon > 0.1:
        epsilon = max(0.1, epsilon - 0.1)
    # Persist the Q-network of the class being trained (index class_object - 1): one
    # snapshot tagged with the epoch counter i, plus a file without the epoch tag that is
    # overwritten on every save.
    # NOTE(review): both file names end in 'h5' with no dot before it; kept as-is because
    # whatever loads these weights has to use the same names - confirm.
    trained_index = class_object - 1
    for t in range(np.size(models)):
        if t != trained_index:
            continue
        epoch_weights_path = path_model + '/model' + str(t) + '_epoch_' + str(i) + 'h5'
        latest_weights_path = path_model + '/model' + str(t) + 'h5'
        model = models[0][t]
        model.save_weights(epoch_weights_path, overwrite=True)
        model.save_weights(latest_weights_path, overwrite=True)

  0%|          | 0/5011 [00:00<?, ?it/s]

KeyboardInterrupt: 