In [1]:
import torchvision.models as models
from torchvision import transforms
from torch import nn
from PIL import Image, ImageDraw, ImageFont

import copy
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import torch
import random
import numpy as np
from tqdm.notebook import trange, tqdm
import os, glob,shutil
import torch.optim as optim

In [48]:
def string_for_action(action):
    """Return the display label for a numeric agent action.

    0 is the pseudo-action "START", 1-5 are the five sub-region moves and
    6 is the terminal "TRIGGER". Any other value yields None.
    """
    labels = (
        (0, "START"),
        (1, 'up-left'),
        (2, 'up-right'),
        (3, 'down-left'),
        (4, 'down-right'),
        (5, 'center'),
        (6, 'TRIGGER'),
    )
    # Walk the table with == so the comparison semantics match a plain
    # if/elif chain for any numeric type.
    for code, label in labels:
        if action == code:
            return label
    return None


def draw_sequences(i, k, step, action, draw, region_image, background, path_testing_folder, iou, reward,
                   gt_mask, region_mask, image_name, save_boolean):
    """Paste one search step (region crop, masks and caption) onto a canvas.

    Each step occupies its own 1000-pixel-wide column of ``background``:
    the normalised region crop at y=70, the caption at y=550, the
    ground-truth mask at y=700 and the region mask at y=1400.

    Args:
        i, k: indices used only to build the output file name.
        step: column index of this step on the canvas.
        action: numeric action, rendered through ``string_for_action``.
        draw: drawing handle used to write the caption (``draw.text``).
        region_image: tensor with three axes; it is detached, moved to CPU
            and its first axis becomes the last before pasting.
        background: canvas image that receives the pastes.
        path_testing_folder: folder prefix of the output file.
        iou, reward: numbers printed in the caption, rounded to 2 decimals.
        gt_mask, region_mask: arrays multiplied by 255 and turned into images.
        image_name: string used in the output file name.
        save_boolean: the canvas is written to disk only when this equals 1.

    Returns:
        The ``background`` canvas that was passed in, after pasting.
    """
    gt_mask_img = Image.fromarray(255 * gt_mask)
    region_mask_img = Image.fromarray(255 * region_mask)

    # All elements of this step share the same x coordinate.
    column_x = 1000 * step

    action_label = string_for_action(action)
    caption_font = ImageFont.truetype('../fonts/FreeMono.ttf', 30)
    caption = 'action: ' + action_label + ' '
    caption = caption + 'reward: ' + str(round(reward, 2))
    caption = caption + ' Iou:' + str(round(iou, 2))
    draw.text((column_x, 550), str(caption), (0, 0, 0), font=caption_font)

    # Channels-first -> channels-last, then min-max rescale to 0..255.
    pixels = np.array(region_image.detach().cpu()).transpose(1, 2, 0)
    lowest = pixels.min()
    highest = pixels.max()
    pixels = ((pixels - lowest) / (highest - lowest)) * 255
    crop_img = Image.fromarray(pixels.astype(np.uint8))

    background.paste(crop_img, (column_x, 70))
    background.paste(gt_mask_img, (column_x, 700))
    background.paste(region_mask_img, (column_x, 1400))

    file_name = path_testing_folder + '/' + image_name + str(i) + '_object_' + str(k) + '.png'
    if save_boolean == 1:
        background.save(file_name)
    return background

def draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder,
                        region_mask, image_name, save_boolean):
    """Paste one test-time search step onto a canvas.

    The step is laid out in its own 1000-pixel-wide column: the region
    image at y=70, the Q-value text at y=500, the action caption at y=550
    and the region mask at y=700.

    Args:
        step: column index of this step on the canvas.
        action: value printed after ``'action: '`` via ``str``.
        qval: value printed as-is via ``str``.
        draw: drawing handle used for the two text lines.
        region_image: array-like converted to ``uint8`` before pasting.
        background: canvas image that receives the pastes.
        path_testing_folder: prefix of the output file (no separator is added).
        region_mask: array multiplied by 255 and turned into an image.
        image_name: string appended to the prefix, followed by ``.png``.
        save_boolean: the canvas is written to disk only when this equals 1.

    Returns:
        The ``background`` canvas that was passed in, after pasting.
    """
    column_x = 1000 * step

    region_pixels = np.asarray(region_image, np.uint8)
    background.paste(Image.fromarray(region_pixels), (column_x, 70))

    background.paste(Image.fromarray(255 * region_mask), (column_x, 700))

    caption = 'action: ' + str(action)
    qval_text = str(qval)
    draw.text((column_x, 550), caption, (0, 0, 0))
    draw.text((column_x, 500), qval_text, (0, 0, 0))

    file_name = path_testing_folder + image_name + '.png'
    if save_boolean == 1:
        background.save(file_name)
    return background

def mask_image_with_mean_background(mask_object_found, image):
    """Return a copy of ``image`` with the masked pixels painted over.

    Every position where ``mask_object_found`` equals 1 gets the values
    (0.485, 0.456, 0.406) in channels 0, 1 and 2 respectively.

    The input image is NOT modified. The previous implementation aliased
    the input (``new_image = image``) and therefore wrote into the caller's
    tensor, which permanently altered dataset images that are reused
    across epochs.

    Args:
        mask_object_found: 2-D array or tensor; entries equal to 1 select
            the pixels to overwrite.
        image: channels-first array or tensor with at least 3 channels and
            spatial size at least that of the mask.

    Returns:
        A new tensor (if ``image`` is a tensor) or a new NumPy array with
        the selected pixels replaced.
    """
    fill_values = (0.485, 0.456, 0.406)
    rows, cols = np.shape(mask_object_found)[:2]

    if torch.is_tensor(image):
        new_image = image.clone()
        if torch.is_tensor(mask_object_found):
            selected = (mask_object_found == 1).to(new_image.device)
        else:
            selected = torch.as_tensor(np.asarray(mask_object_found) == 1,
                                       device=new_image.device)
    else:
        new_image = np.array(image, copy=True)
        selected = np.asarray(mask_object_found) == 1

    for channel, value in enumerate(fill_values):
        # Basic slicing yields a view, so the boolean-mask assignment
        # writes through to new_image. Only the mask-sized top-left window
        # is touched, as in the original per-pixel loop.
        new_image[channel, :rows, :cols][selected] = value
    return new_image


In [23]:
class res_model_no_top(nn.Module):
    """Pretrained ResNet-18 truncated after a named top-level child.

    The children of ``models.resnet18(pretrained=True)`` are collected in
    definition order up to and including the child whose name equals
    ``output_layer``, and chained into ``self.net``.
    """

    def __init__(self, output_layer):
        super().__init__()
        self.output_layer = output_layer
        self.pretrained = models.resnet18(pretrained=True)

        kept_children = []
        for child_name, child_module in self.pretrained.named_children():
            kept_children.append(child_module)
            if child_name == self.output_layer:
                break
        self.children_list = kept_children

        self.net = nn.Sequential(*self.children_list)
        # Drop the reference to the full network; only the truncated stack is kept.
        self.pretrained = None

    def forward(self, x):
        """Run ``x`` through the truncated network and return its output."""
        return self.net(x)

# Number of distinct actions the agent can take (actions 1-5 move to a
# sub-region, action 6 is the terminal trigger)
number_of_actions = 6
# Number of past actions captured in the history vector
actions_of_history = 4
# Length of the flattened visual descriptor; get_state reshapes the
# feature-extractor output to (1, feature_shape)
feature_shape = 131072
# Feature extractors keyed by device string, so the pretrained backbone is
# built and moved to the device once instead of on every get_state call.
_feature_extractor_cache = {}


def _get_feature_extractor(device):
    """Return a cached, eval-mode ``res_model_no_top('layer4')`` for ``device``."""
    key = str(device)
    if key not in _feature_extractor_cache:
        extractor = res_model_no_top('layer4').to(device)
        # Inference only: use the stored BatchNorm statistics rather than
        # statistics of the single image being processed.
        extractor.eval()
        _feature_extractor_cache[key] = extractor
    return _feature_extractor_cache[key]


def get_state(image, history_vector):
    """Build the agent state: visual descriptor concatenated with action history.

    The image region is rescaled to 512x512, passed through the truncated
    ResNet, flattened to ``(1, feature_shape)`` and horizontally stacked
    with the history vector reshaped to
    ``(1, number_of_actions * actions_of_history)``.

    Fixes over the previous version:
      * the backbone is cached instead of re-created on every call;
      * the backbone runs in eval mode;
      * the region is resized with bilinear interpolation. ``np.resize``
        does not interpolate; it repeats or truncates the flattened data,
        which scrambles any region that is not already 512x512.

    Args:
        image: channels-first image tensor (3, H, W).
        history_vector: tensor holding
            ``number_of_actions * actions_of_history`` values.

    Returns:
        Tensor of shape ``(1, feature_shape + number_of_actions * actions_of_history)``
        on the CUDA device when available, otherwise on the CPU.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    image_ = image.clone().detach().to(device).float()
    with torch.no_grad():
        image_ = nn.functional.interpolate(image_[None], size=(512, 512),
                                           mode='bilinear', align_corners=False)
        descriptor_image = _get_feature_extractor(device)(image_)
        descriptor_image = descriptor_image.reshape((1, feature_shape))
        history_vector = torch.reshape(history_vector, (1, number_of_actions*actions_of_history))
        # hstack needs both operands on the same device.
        history_vector = history_vector.to(device)
        state = torch.hstack((descriptor_image, history_vector))
    return state


In [4]:
def update_history_vector(history_vector, action):
    """Append ``action`` (one-hot encoded) to the flat action history.

    The history is a flat vector of ``actions_of_history`` consecutive
    one-hot blocks of length ``number_of_actions``. While fewer than
    ``actions_of_history`` actions are stored, the new block is written into
    the next free slot; once full, the oldest block is dropped and the new
    one is appended at the end.

    Fixes over the previous version: both copy loops stopped one element
    early (``... - 1`` on the range end). As a result the last position of
    a block (action 6) was never written in the fill phase, and the last
    element of the shifted history was lost in the shift phase.

    Args:
        history_vector: 1-D tensor of length
            ``number_of_actions * actions_of_history``.
        action: 1-based action id; ``action - 1`` is the one-hot index.

    Returns:
        A new float tensor with the updated history, on the CUDA device
        when available, otherwise on the CPU.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    history = np.array(history_vector.clone().detach().cpu())

    action_vector = np.zeros(number_of_actions)
    action_vector[action - 1] = 1

    # Each stored action contributes exactly one non-zero entry.
    stored_actions = np.count_nonzero(history)
    if stored_actions < actions_of_history:
        start = number_of_actions * stored_actions
        history[start:start + number_of_actions] = action_vector
        return torch.Tensor(history).to(device)

    # History full: shift left by one block and append the new action.
    total_length = number_of_actions * actions_of_history
    updated_history_vector = np.concatenate(
        (history[number_of_actions:total_length], action_vector))
    return torch.Tensor(updated_history_vector).to(device)

In [5]:
# Magnitude of the reward given for a movement action
reward_movement_action = 1
# Magnitude of the reward given for the terminal (trigger) action
reward_terminal_action = 3
# IoU that a triggered region must exceed to count as a positive detection
iou_threshold = 0.5
def get_reward_movement(iou, new_iou):
    """Reward a move: positive if the IoU strictly improved, negative otherwise."""
    improved = new_iou > iou
    return reward_movement_action if improved else - reward_movement_action


def get_reward_trigger(new_iou):
    """Reward the trigger: positive if the IoU strictly exceeds the threshold."""
    detected = new_iou > iou_threshold
    return reward_terminal_action if detected else - reward_terminal_action

In [21]:
class q_network(nn.Module):
    """Fully connected Q-network: input layer, N hidden layers, linear output.

    Every linear layer except the output layer is followed by a ReLU.

    Args:
        num_hidden_layer: number of extra ``dim_hidden_layer`` x
            ``dim_hidden_layer`` layers after the input layer.
        dim_hidden_layer: width of the hidden layers.
        output_dim: number of outputs (one Q-value per action).
        input_dim: width of the input state. Defaults to ``131072 + 24``,
            the value that was previously hard-coded, so existing callers
            and saved state dicts are unaffected.
    """

    def __init__(self, num_hidden_layer, dim_hidden_layer, output_dim, input_dim=131072 + 24):
        super().__init__()
        # Attribute names and creation order are kept as before so that
        # state_dict keys and seeded initialisation stay the same.
        self.input_linear = nn.Linear(input_dim, dim_hidden_layer)
        self.relu1 = nn.ReLU()
        self.linears = nn.ModuleList(nn.Linear(dim_hidden_layer, dim_hidden_layer) for _ in range(num_hidden_layer))
        self.relus = nn.ModuleList(nn.ReLU() for _ in range(num_hidden_layer))
        self.output_linear = nn.Linear(dim_hidden_layer, output_dim)

    def forward(self, x):
        """Map a batch of states ``(batch, input_dim)`` to ``(batch, output_dim)`` Q-values."""
        x = self.relu1(self.input_linear(x))
        for linear, relu in zip(self.linears, self.relus):
            x = relu(linear(x))
        return self.output_linear(x)

In [7]:
def calculate_iou(img_mask, gt_mask):
    """Intersection-over-union of two masks of equal shape.

    A position belongs to the intersection when the product of the two
    masks is positive, and to the union when their sum is positive.

    Fixes over the previous version:
      * the ground-truth mask is no longer modified in place
        (``gt_mask *= 1.0`` mutated the caller's array and raised a casting
        error for integer masks);
      * an empty union returns 0.0 instead of raising ZeroDivisionError.

    Args:
        img_mask: array-like mask of the current region.
        gt_mask: array-like ground-truth mask.

    Returns:
        The IoU as a Python float in [0, 1].
    """
    region = np.asarray(img_mask, dtype=float)
    target = np.asarray(gt_mask, dtype=float)
    intersection = np.count_nonzero(np.multiply(region, target) > 0)
    union = np.count_nonzero((region + target) > 0)
    if union == 0:
        return 0.0
    return float(intersection) / float(union)


def calculate_overlapping(img_mask, gt_mask):
    """Fraction of the ground-truth mask that is covered by ``img_mask``.

    Computed as (non-zero entries of ``img_mask * gt_mask``) divided by
    (non-zero entries of ``gt_mask``).

    Fixes over the previous version:
      * the ground-truth mask is no longer modified in place
        (``gt_mask *= 1.0`` mutated the caller's array and raised a casting
        error for integer masks);
      * an empty ground-truth mask returns 0.0 instead of raising
        ZeroDivisionError.

    Args:
        img_mask: array-like mask of the region.
        gt_mask: array-like ground-truth mask.

    Returns:
        The covered fraction as a Python float.
    """
    region = np.asarray(img_mask, dtype=float)
    target = np.asarray(gt_mask, dtype=float)
    covered = np.count_nonzero(np.multiply(region, target))
    total = np.count_nonzero(target)
    if total == 0:
        return 0.0
    return float(covered) / float(total)


def follow_iou(gt_masks, mask, last_matrix, available_objects):
    """Score ``mask`` against every available ground-truth mask.

    Objects whose ``available_objects`` flag is not 1 receive a score of -1
    so they can never be selected over an available one.

    Args:
        gt_masks: 3-D array indexed as ``gt_masks[k, :, :]``.
        mask: current region mask.
        last_matrix: per-object scores from the previous call.
        available_objects: per-object flags; 1 means still searchable.

    Returns:
        Tuple ``(iou, new_iou, results, ind)``: the previous score of the
        best-matching object, its new score, the full score vector and the
        index of the best-matching object.
    """
    object_count = len(gt_masks)
    results = np.zeros(object_count)
    for obj in range(object_count):
        if available_objects[obj] != 1:
            results[obj] = -1
            continue
        results[obj] = calculate_iou(mask, gt_masks[obj, :, :])
    best_score = max(results)
    best_index = np.argmax(results)
    return last_matrix[best_index], best_score, results, best_index

In [37]:
def load_images_names_in_data_set(data_set_name, path_voc):
    """Read the image names listed in ``<path_voc>/ImageSets/Main/<data_set_name>.txt``.

    For data sets whose name starts with "aeroplane", "bird" or "cow", each
    line carries extra whitespace-separated columns and only the first
    token is returned. For any other data set the whole line (without the
    trailing newline) is returned.

    The file is now closed deterministically via a ``with`` block; the
    previous version leaked the handle.

    Args:
        data_set_name: base name of the list file, without extension.
        path_voc: root folder of the data set.

    Returns:
        List of image-name strings in file order.
    """
    file_path = path_voc + '/ImageSets/Main/' + data_set_name + '.txt'
    with open(file_path) as f:
        image_names = [line.strip('\n') for line in f]
    if data_set_name.startswith(("aeroplane", "bird", "cow")):
        return [x.split(None, 1)[0] for x in image_names]
    return image_names
    
def load_images_labels_in_data_set(data_set_name, path_voc):
    """Read the label column of ``<path_voc>/ImageSets/Main/<data_set_name>.txt``.

    Each line is split once on whitespace; everything after the first token
    (without the trailing newline) is returned as that line's label.

    The file is now closed deterministically via a ``with`` block; the
    previous version leaked the handle.

    Args:
        data_set_name: base name of the list file, without extension.
        path_voc: root folder of the data set.

    Returns:
        List of label strings in file order.

    Raises:
        IndexError: if a line has no second whitespace-separated token.
    """
    file_path = path_voc + '/ImageSets/Main/' + data_set_name + '.txt'
    with open(file_path) as f:
        return [line.split(None, 1)[1].strip('\n') for line in f]

def get_all_images(image_names):
    """Load and normalise every image in ``image_names``.

    Each path is opened, converted to RGB, turned into a tensor and
    normalised with mean (0.485, 0.456, 0.406) and std (0.229, 0.224, 0.225).
    Paths that fail to load are skipped with a warning, so the two returned
    lists always stay aligned.

    Fixes over the previous version:
      * the failure branch referenced an undefined name (``image_name``)
        instead of the loop variable, so a single unreadable file either
        raised NameError or removed an unrelated entry;
      * the bare ``except`` is replaced by an explicit handler that reports
        what was skipped;
      * image files are closed after loading.

    Args:
        image_names: iterable of image file paths.

    Returns:
        Tuple ``(images, names)``: the list of image tensors, and for each
        one the file's base name up to the first underscore.
    """
    import warnings

    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )])

    images = []
    names = []
    for path in image_names:
        try:
            with Image.open(path) as handle:
                tensor = preprocess(handle.convert('RGB'))
        except Exception as err:
            # Best effort: keep loading the rest, but say what was dropped.
            warnings.warn('Skipping image %r: %s' % (path, err))
            continue
        images.append(tensor)
        names.append(os.path.basename(path).split('_')[0])
    return images, names

def get_bb_of_gt_from_pascal_xml_annotation(xml_name, voc_path):
    """Read object classes and bounding boxes from one annotation XML file.

    The file ``<voc_path>/Annotations/<xml_name>.xml`` is parsed. For every
    ``object`` element directly under the root, the text of its ``name``
    children and of the ``xmin``/``xmax``/``ymin``/``ymax`` children of its
    ``bndbox`` children is collected.

    Args:
        xml_name: annotation file name without extension.
        voc_path: root folder of the data set.

    Returns:
        Float array of shape ``(n_objects, 5)`` with columns
        ``[class_id, x_min, x_max, y_min, y_max]``; ``class_id`` comes from
        ``get_id_of_class_name``.
    """
    xml_path = voc_path + '/Annotations/' + xml_name + '.xml'
    root = ET.parse(xml_path).getroot()

    names = []
    coords = {'xmin': [], 'xmax': [], 'ymin': [], 'ymax': []}
    for obj in root.findall('object'):
        for field in obj:
            if field.tag == 'name':
                names.append(field.text)
            elif field.tag == 'bndbox':
                for corner in field:
                    if corner.tag in coords:
                        coords[corner.tag].append(corner.text)

    object_count = len(names)
    category_and_bb = np.zeros([object_count, 5])
    for row in range(object_count):
        category_and_bb[row, 0] = get_id_of_class_name(names[row])
        category_and_bb[row, 1] = coords['xmin'][row]
        category_and_bb[row, 2] = coords['xmax'][row]
        category_and_bb[row, 3] = coords['ymin'][row]
        category_and_bb[row, 4] = coords['ymax'][row]
    return category_and_bb
def get_id_of_class_name (class_name):
    """Return the 1-based id of a class name, or None if it is unknown.

    Ids follow the order of the table below: 'aeroplane' is 1 and
    'tvmonitor' is 20.
    """
    ordered_names = (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    )
    for class_id, known_name in enumerate(ordered_names, start=1):
        if class_name == known_name:
            return class_id
    return None
    
def generate_bounding_box_from_annotation(annotation, image_shape):
    """Rasterise bounding boxes into one binary mask per annotation row.

    Each annotation row is ``[class_id, x_min, x_max, y_min, y_max]``; the
    matching mask is 1 inside ``[y_min:y_max, x_min:x_max]`` (clipped to the
    image) and 0 elsewhere.

    Fixes over the previous version:
      * the x-range was clipped with the image height (``image_shape[1]``)
        instead of the width (``image_shape[2]``), which truncated boxes on
        images wider than tall;
      * coordinates are cast to 64-bit integers instead of ``int16``, which
        overflowed for coordinates above 32767.

    Args:
        annotation: array-like of shape ``(n, 5)``.
        image_shape: channels-first shape ``(C, H, W)``.

    Returns:
        Float array of shape ``(n, H, W)``.
    """
    annotation = np.array(annotation, dtype=np.int64)
    length_annotation = annotation.shape[0]
    height, width = image_shape[1], image_shape[2]
    masks = np.zeros([length_annotation, height, width])
    for i in range(length_annotation):
        x_lo, x_hi = max(annotation[i][1], 0), min(annotation[i][2], width)
        y_lo, y_hi = max(annotation[i][3], 0), min(annotation[i][4], height)
        masks[i, y_lo:y_hi, x_lo:x_hi] = 1
    return masks

## Vedai Helper Functions

In [14]:
def get_annotation_vedai(annot_path_vedai512, filename):
    """Parse ``<annot_path_vedai512>/<filename>.txt`` into an annotation array.

    For every line, the 4th whitespace-separated field is read as an
    integer label and the last eight fields as integers. The output row is
    ``[label, min(A), max(A), min(B), max(B)]``, where A is the first four
    and B the last four of those eight integers.

    NOTE(review): downstream code treats columns 1-2 as the x-range and
    3-4 as the y-range, which presumes the eight trailing fields are four
    x-corners followed by four y-corners. Confirm against the annotation
    format.

    Args:
        annot_path_vedai512: folder containing the annotation text files.
        filename: annotation file name without extension.

    Returns:
        Float array of shape ``(n_lines, 5)``.
    """
    with open(annot_path_vedai512+f'/{filename}.txt') as fp:
        lines = fp.readlines()

    annotation = np.zeros((len(lines), 5))
    for row, line in enumerate(lines):
        fields = line.split()
        label = int(fields[3])
        corners = [int(token) for token in fields[-8:]]
        leading, trailing = corners[:4], corners[-4:]
        annotation[row] = np.array([label, min(leading), max(leading),
                                    min(trailing), max(trailing)])
    return annotation
def show_image_mask(image, annotation):
    """Show an image, the image with its boxes filled in, and the box mask.

    Three panels are drawn side by side: 'Original', 'Masked' (every box
    region of a copy of the image set to 1) and 'Mask' (a 2-D array that is
    1 inside any box and 0 elsewhere).

    Args:
        image: array with a ``copy`` method, indexed as ``[row, col, channel]``.
        annotation: iterable of rows; elements 1-2 give the column range and
            elements 3-4 the row range of each box.
    """
    masked_image = image.copy()
    mask = np.zeros(np.shape(image)[:2])
    for annot in annotation:
        row_span = slice(int(annot[3]), int(annot[4]))
        col_span = slice(int(annot[1]), int(annot[2]))
        masked_image[row_span, col_span, :] = 1
        mask[row_span, col_span] = 1

    plt.figure(figsize=(15,5))
    panels = ((image, 'Original'), (masked_image, 'Masked'), (mask, 'Mask'))
    for position, (content, title) in enumerate(panels, start=1):
        axis = plt.subplot(1, 3, position)
        axis.imshow(content)
        axis.axis('off')
        axis.set_title(title)
    plt.show()

In [9]:
# Disabled PASCAL VOC 2007 configuration, kept for reference:
# path_voc07 = '../data/voc/VOC2007'
# Folder the training cell writes model checkpoints into
path_model = '../model/'
# image_names_ = np.array([load_images_names_in_data_set('trainval', path_voc07)])

In [10]:
# Folder searched for the VEDAI *.png images
data_path_vedai512 = '../data/vedai/Vehicules512'
# Folder holding one <image name>.txt annotation file per image
annot_path_vedai512 = '../data/vedai/Annotations512'

In [11]:
# images, image_names = get_all_images(image_names_[0], path_voc07)
# len(images) == len(image_names)

In [16]:
# Collect every PNG in the image folder and load them all into memory.
# `image_names` holds, per image, the file's base name up to the first
# underscore; the training cell uses it to look up the annotation file.
image_paths_512 = glob.glob(data_path_vedai512+'/*.png')
images, image_names = get_all_images(image_paths_512)

In [50]:
######## PARAMETERS ########

# Class category that the RL agent will be searching
# NOTE(review): not referenced by the training cell shown in this notebook
class_object = 1
# Scale of a sub-region relative to its parent region (each move keeps 3/4 of each side)
scale_subregion = float(3)/4
# Factor applied to the new region size to obtain the offset of a shifted sub-region
scale_mask = float(1)/(scale_subregion*4)
# 1 if you want to obtain visualizations of the search for objects
# (the training cell overrides this per image)
bool_draw = 1
# Maximum number of steps the agent may take while searching for one object
number_of_steps = 14
# Boolean to indicate if you want to use the two databases, or just one
two_databases = 0
# Number of passes over the image set
epochs = 50
# Discount factor applied to the bootstrapped Q-value
gamma = 0.90
# Initial exploration rate of the epsilon-greedy policy (decayed after every epoch)
epsilon = 1
# Number of experiences sampled from the replay buffer per training step
batch_size = 100
# Pointer to where to store the last experience in the experience replay buffer.
# It is an array so that there could be one pointer per category; here a single
# slot is used.
h = np.zeros([1])
# Capacity of the replay memory: 1000 experiences
buffer_experience_replay = 1000
# Init replay memories (a list of buffers; only index 0 is used)
replay = [[] for i in range(1)]
reward = 0

# Prefix under which the visualisation images are written
path_testing_folder = '../results/vedai/train/.'

# Run on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Online Q-network: 2 hidden layers of width 1024, one output per action.
q_net = q_network(2, 1024, 6)
q_net = q_net.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(q_net.parameters(), lr=0.001)

# Target network with the same architecture, started from the online
# network's weights; the training cell re-synchronises it periodically.
q_net_target = q_network(2, 1024, 6)
q_net_target = q_net_target.to(device)
q_net_target.load_state_dict(q_net.state_dict())

In [51]:
# DQN training loop for the hierarchical object-search agent.
#
# For every epoch and every image, one search episode is run per annotated
# object. In each step the agent either moves to one of five sub-regions
# (actions 1-5) or triggers (action 6). Transitions go into a replay buffer;
# once the buffer is full, every step also trains q_net on a sampled
# minibatch and periodically synchronises q_net_target.
#
# Fixes over the previous version of this cell:
#   * the minibatch loop unpacked each memory into `action`, `reward` and
#     `new_state`, clobbering the live step's values; the terminal check and
#     the state hand-over then used a random sampled transition;
#   * `state = new_state` only ran once the buffer was full, so every
#     transition stored while filling the buffer carried the initial state;
#   * checkpoints were named with the image index `i` instead of epoch `j`;
#   * repeated `epsilon -= 0.1` drifts slightly above 0.1 and is then
#     decremented once more to ~0; the decay is now floored at 0.1;
#   * the dataset tensor was used directly (aliased on CPU), so masking
#     found objects could leak into later epochs; a private copy is used;
#   * an unreachable `if step > number_of_steps` drawing block (step had
#     just been set to 0) and a dead, wrongly-shaped `region_mask`
#     initialisation were removed;
#   * the 'center' action used the row size for the column offset.
loss_curve = []
epochs_id=0
# Number of training steps between two target-network synchronisations
update_target = 100

num_target = 0
for j in range(epochs_id, epochs_id+epochs):
    count_bool_draw = 0
    for i in trange(len(images)):
        count_bool_draw+=1
        not_finished = 1
        masked = 0
        # Private copy: found objects are painted over below and that must
        # not modify the dataset tensor reused in later epochs.
        image = images[i].clone().to(device)
        image_name = image_names[i]
        annotation = get_annotation_vedai(annot_path_vedai512, image_name)
        gt_masks = generate_bounding_box_from_annotation(annotation, image.shape)
        shape_gt_masks = np.shape(gt_masks)
        # One search episode per annotated object
        for k in range(shape_gt_masks[0]):
            background = Image.new('RGBA', (14000, 2500), (255, 255, 255, 255))
            draw = ImageDraw.Draw(background)
            gt_mask = gt_masks[k]
            step = 0
            new_iou = 0
            # Only every 20th image has its visualisation saved
            bool_draw = 1 if count_bool_draw%20 == 0 else 0
            region_image = image.clone().detach()
            offset = (0, 0)
            size_mask = (image.shape[1], image.shape[2])
            original_shape = size_mask
            region_mask = np.ones([image.shape[1], image.shape[2]])
            old_region_mask = np.zeros([image.shape[1], image.shape[2]])
            available_objects = np.ones(gt_masks.shape[0])
            last_matrix = np.zeros(gt_masks.shape[0])
            if masked == 1:
                for p in range(gt_masks.shape[0]):
                    overlap = calculate_overlapping(old_region_mask, gt_masks[p,:,:])
                    if overlap > 0.60:
                        available_objects[p] = 0
            # We check if there are still objects to be found
            if np.count_nonzero(available_objects) == 0:
                not_finished = 0

            iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, last_matrix, available_objects)
            new_iou = iou
            gt_mask = gt_masks[index,:, :]
            history_vector = torch.zeros([24])
            history_vector = history_vector.to(device)
            # computation of the initial state
            state = get_state(region_image, history_vector)
            # status indicates whether the agent is still alive and has not triggered the terminal action
            status = 1
            action = 0
            reward = 0

            while status == 1 and step < number_of_steps and not_finished:
                qval = q_net(state)
                background = draw_sequences(j, k, step, action, draw, region_image, background,
                                    path_testing_folder, iou, reward, gt_mask, region_mask, image_name,
                                    bool_draw)
                step += 1
                # we force terminal action in case actual IoU is higher than 0.5, to train faster the agent
                if i < 100 and new_iou > 0.5:
                    action = 6
                # epsilon-greedy policy
                elif random.random() < epsilon:
                    action = np.random.randint(1, 7)
                else:
                    action = int(torch.argmax(qval))+1
                # terminal action
                if action == 6:
                    iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, last_matrix, available_objects)
                    gt_mask = gt_masks[index, :, :]
                    reward = get_reward_trigger(new_iou)
                    background = draw_sequences(j, k, step, action, draw, region_image, background,
                                                path_testing_folder, iou, reward, gt_mask, region_mask, image_name,
                                                bool_draw)
                    step += 1
                else:
                    # movement action: shrink the region and shift it
                    region_mask = np.zeros(original_shape)
                    size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion)
                    # offset_aux is relative to the current crop, offset to the full image
                    if action == 1:
                        offset_aux = (0, 0)
                    elif action == 2:
                        offset_aux = (0, size_mask[1] * scale_mask)
                        offset = (offset[0], offset[1] + size_mask[1] * scale_mask)
                    elif action == 3:
                        offset_aux = (size_mask[0] * scale_mask, 0)
                        offset = (offset[0] + size_mask[0] * scale_mask, offset[1])
                    elif action == 4:
                        offset_aux = (size_mask[0] * scale_mask,
                                      size_mask[1] * scale_mask)
                        offset = (offset[0] + size_mask[0] * scale_mask,
                                  offset[1] + size_mask[1] * scale_mask)
                    elif action == 5:
                        offset_aux = (size_mask[0] * scale_mask / 2,
                                      size_mask[1] * scale_mask / 2)
                        offset = (offset[0] + size_mask[0] * scale_mask / 2,
                                  offset[1] + size_mask[1] * scale_mask / 2)
                    region_image = region_image[:,int(offset_aux[0]):int(offset_aux[0] + size_mask[0]),
                                   int(offset_aux[1]):int(offset_aux[1] + size_mask[1])]
                    region_mask[int(offset[0]):int(offset[0] + size_mask[0]), int(offset[1]):int(offset[1] + size_mask[1])] = 1
                    iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, last_matrix, available_objects)
                    gt_mask = gt_masks[index, :, :]
                    reward = get_reward_movement(iou, new_iou)
                    iou = new_iou
                history_vector = update_history_vector(history_vector, action)
                new_state = get_state(region_image, history_vector)
                # Experience replay storage
                if len(replay[0]) < buffer_experience_replay:
                    replay[0].append((state, action, reward, new_state))
                else:
                    # Buffer full: overwrite in ring order, then train
                    if h[0] < (buffer_experience_replay-1):
                        h[0] += 1
                    else:
                        h[0] = 0
                    h_aux = int(h[0])
                    replay[0][h_aux] = (state, action, reward, new_state)
                    minibatch = random.sample(replay[0], batch_size)
                    X_train = []
                    y_train = []
                    # we pick from the replay memory a sampled minibatch and generate the training samples
                    # (mem_* names keep the live step's action/reward/new_state intact)
                    for mem_state, mem_action, mem_reward, mem_new_state in minibatch:
                        with torch.no_grad():
                            old_qval = q_net_target(mem_state)
                            newQ = q_net(mem_new_state)
                            newQ_target = q_net_target(mem_new_state)
                        # Double-DQN style: online net picks the action, target net scores it
                        maxQ_arg = int(newQ.argmax())
                        maxQ = float(newQ_target[0,maxQ_arg])
                        # NOTE(review): targets for the non-taken actions come from the
                        # target network, so they also contribute to the loss.
                        y = old_qval
                        if mem_action != 6: #non-terminal state
                            update = (mem_reward + (gamma * maxQ))
                        else: #terminal state
                            update = mem_reward
                        y[0][mem_action-1] = update #target output
                        X_train.append(mem_state)
                        y_train.append(y)
                    X_train = torch.cat(X_train)
                    y_train = torch.cat(y_train)
                    optimizer.zero_grad()
                    y_pred = q_net(X_train)
                    loss = criterion(y_pred, y_train)
                    loss.backward()
                    optimizer.step()
                    loss_curve.append(loss.item())
                    num_target+=1
                    if num_target%update_target == 0:
                        q_net_target.load_state_dict(q_net.state_dict())
                # Advance the state on every step, whether or not training ran
                state = new_state
                if action == 6:
                    status = 0
                    masked = 1
                    # we mask object found with ground-truth so that agent learns faster
                    image = mask_image_with_mean_background(gt_mask, image)
                else:
                    masked = 0
    # Linear epsilon decay, floored at 0.1 to absorb float rounding
    if epsilon > 0.1:
        epsilon = max(0.1, epsilon - 0.1)
    # One checkpoint per epoch plus a rolling "latest" checkpoint
    string = path_model + '/model_epoch_' + str(j) + '.h5'
    string2 = path_model + '/model.h5'
    torch.save(q_net.state_dict(), string)
    torch.save(q_net.state_dict(), string2)

  0%|          | 0/2573 [00:00<?, ?it/s]



  0%|          | 0/2573 [00:00<?, ?it/s]

  0%|          | 0/2573 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [53]:
# Show the epoch index the training loop had reached when it was interrupted
j


2

In [54]:
# Manual checkpoint after the interrupted run: save the current weights under
# the epoch index `j` and as the rolling "latest" file.
string = path_model + '/model_epoch_' + str(j) + '.h5'
string2 = path_model + '/model.h5'
torch.save(q_net.state_dict(), string)
torch.save(q_net.state_dict(), string2)

In [56]:
# NOTE(review): `epoch` is never defined in this notebook, so this cell raises
# NameError (see its output). The loop variable is `j` and the epoch count is
# `epochs` - presumably one of those was meant; confirm or delete the cell.
epoch

NameError: name 'epoch' is not defined