In [1]:
!pip3 install torch==1.5.1 torchvision==0.6.1 -f https://download.pytorch.org/whl/cu92/torch_stable.html

Looking in links: https://download.pytorch.org/whl/cu92/torch_stable.html


In [2]:
import torchvision
#import torchvision.datasets.SBDataset as sbd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np

import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
#from sotabench.semantic_segmentation.transforms import Resize


# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Mini-batch size; in the visible code it is only referenced by commented-out loader calls.
batch_size = 32
# Root directory handed to the torchvision dataset constructors below.
PATH="datasets"

class NoisySBDataset():
    """Thin wrapper around ``torchvision.datasets.SBDataset``.

    The wrapped dataset is stored in ``self.dataset``; an optional callable
    stored in ``self.transforms`` is applied to the image (only) of every
    sample at retrieval time.
    """

    def __init__(self, path, image_set="train", transforms = None, download=True):
        """Build the underlying SBDataset.

        Args:
            path: root directory passed to ``SBDataset`` as ``root``.
            image_set: split name forwarded to ``SBDataset``.
            transforms: optional callable applied to each image.
            download: forwarded to ``SBDataset``.
        """
        super().__init__()

        self.transforms = transforms

        # Original author's warning: with download=True AND the files already
        # downloaded, construction never finishes running.
        self.dataset = torchvision.datasets.SBDataset(
            root=path,
            image_set=image_set,
            download=download,
        )

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(model_input, ground_truth)`` for sample ``idx``.

        The image is passed through ``self.transforms`` when that attribute is
        truthy; the ground truth is returned untouched.
        """
        sample_image, truth = self.dataset[idx]
        if not self.transforms:
            return (sample_image, truth)
        return (self.transforms(sample_image), truth)


def get_transform(train):
    """Return the tensor-conversion + normalisation pipeline for images.

    Args:
        train: accepted for interface compatibility. It has no effect on the
            returned pipeline (the size values previously derived from it were
            never used, so they have been removed).

    Returns:
        A ``transforms.Compose`` that applies ``ToTensor`` followed by
        ``Normalize`` with the mean/std listed below.
    """
    return transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    
def make_image_transform(image_transform_params: dict,
                         transform: object):
    """Combine an optional resize/crop step with an existing transform.

    Args:
        image_transform_params: dict with key ``'image_mode'`` set to
            ``'none'``, ``'shrink'`` or ``'crop'``. For ``'shrink'`` and
            ``'crop'`` it must also contain
            ``'output_image_size': {'width': ..., 'height': ...}``.
        transform: transform applied after the resize/crop step, or ``None``.

    Returns:
        ``transform`` unchanged when no preprocessing is requested, the
        preprocessing step alone when ``transform`` is ``None``, otherwise a
        ``transforms.Compose`` of both (preprocessing first).

    Raises:
        ValueError: if ``'image_mode'`` is not one of the supported values
            (previously this surfaced as an UnboundLocalError).
    """
    resize_image = image_transform_params['image_mode']
    if resize_image == 'none':
        preprocess_image = None
    elif resize_image in ('shrink', 'crop'):
        output_size = image_transform_params['output_image_size']
        # torchvision expects sizes as (height, width); the original code
        # passed (width, height), which transposed non-square outputs.
        size = (output_size['height'], output_size['width'])
        if resize_image == 'shrink':
            preprocess_image = transforms.Resize(size)
        else:
            preprocess_image = transforms.CenterCrop(size)
    else:
        raise ValueError(
            "Unsupported image_mode %r; expected 'none', 'shrink' or 'crop'"
            % (resize_image,))

    if preprocess_image is None:
        return transform
    if transform is None:
        return preprocess_image
    return transforms.Compose([preprocess_image, transform])


def read_voc_dataset(download=True, year='2007'):
    """Build the Pascal VOC detection datasets for the train and val splits.

    Every image is resized to 224x224 and converted to a tensor. The datasets
    are rooted at the module-level ``PATH``.

    Args:
        download: forwarded to ``torchvision.datasets.VOCDetection``.
        year: VOC release year, forwarded to ``VOCDetection``.

    Returns:
        ``(train_dataset, val_dataset)`` -- the datasets themselves, not
        DataLoaders. The DataLoaders the previous version built were never
        returned or used, so they are no longer created.
    """
    resize_to_tensor = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    def _load_split(image_set):
        # One VOCDetection per split, sharing the same preprocessing.
        return torchvision.datasets.VOCDetection(
            PATH, year=year, image_set=image_set,
            download=download, transform=resize_to_tensor)

    voc_data = _load_split('train')
    voc_val = _load_split('val')
    return voc_data, voc_val

def get_images_labels(dataloader):
    """Return the first ``(images, labels)`` pair produced by ``dataloader``.

    A fresh iterator is created on every call, so repeated calls keep
    returning the first item the iterable yields.
    """
    first_images, first_labels = next(iter(dataloader))
    return first_images, first_labels

def read_sbd_dataset(batch_size, download=True):
    """Build DataLoaders over the SBD train and val splits.

    Images are resized to 224x224 and converted to tensors; ground truth is
    left as delivered by the dataset. Batches are returned as plain lists of
    ``(image, truth)`` samples (identity collate), and are not shuffled.

    Args:
        batch_size: number of samples per batch. The previous version ignored
            this argument and always used 32.
        download: forwarded to ``NoisySBDataset``.

    Returns:
        ``(train_loader, val_loader)``.
    """
    resize_to_tensor = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    def _keep_samples(samples):
        # Identity collate: hand the list of samples back without stacking.
        return samples

    def _make_loader(image_set):
        dataset = NoisySBDataset(PATH, image_set=image_set,
                                 download=download, transforms=resize_to_tensor)
        return DataLoader(dataset, batch_size=batch_size, shuffle=False,
                          collate_fn=_keep_samples)

    train_loader = _make_loader('train')
    val_loader = _make_loader('val')
    return train_loader, val_loader




#train_loader, val_loader = read_sbd_dataset(batch_size, download=False)
#data_loader = torch.utils.data.DataLoader( batch_size=batch_size, dataset=data, shuffle=shuffle, num_workers=0, collate_fn=lambda x: x )

In [3]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
"""


for (image, ground_truth) in train_loader:
    print(ground_truth)
"""
def show_new_bdbox(image, xmin, xmax, ymin, ymax):
    """Display ``image`` with a red rectangle spanning the given box.

    ``image`` is re-ordered with two transposes (axes 0<->2, then 0<->1)
    before being handed to ``imshow``. The rectangle is anchored at
    ``(xmin, ymin)`` and sized ``(xmax - xmin, ymax - ymin)``.
    """
    _figure, axes = plt.subplots(1)
    displayable = image.transpose(0, 2).transpose(0, 1)
    axes.imshow(displayable)

    box_outline = patches.Rectangle(
        (xmin, ymin),
        xmax - xmin,
        ymax - ymin,
        linewidth=3,
        edgecolor='r',
        facecolor='none',
    )

    # Draw the box on top of the image, then render the figure.
    axes.add_patch(box_outline)
    plt.show()


def show_bdbox(train_loader, index):
    """Display sample ``index`` together with its first annotated box.

    ``train_loader`` is indexed directly (``train_loader[index]``) and must
    yield ``(img, target)`` where ``target`` is a VOC-style annotation dict.
    Box coordinates are rescaled from the annotated image size to a 224-pixel
    grid. The image tensor and the target dict are also printed.
    """
    _figure, axes = plt.subplots(1)
    img, target = train_loader[index]
    axes.imshow(img.transpose(0, 2).transpose(0, 1))

    print(img)
    print(target)

    annotation = target['annotation']
    first_box = annotation['object'][0]['bndbox']

    # Horizontal coordinates are normalised by the annotated width,
    # vertical ones by the annotated height, then mapped onto 224 pixels.
    xmin = ( int(first_box['xmin']) / int(annotation['size']['width']) ) * 224
    xmax = ( int(first_box['xmax']) / int(annotation['size']['width']) ) * 224
    ymin = ( int(first_box['ymin']) / int(annotation['size']['height']) ) * 224
    ymax = ( int(first_box['ymax']) / int(annotation['size']['height']) ) * 224

    box_outline = patches.Rectangle(
        (xmin, ymin),
        xmax - xmin,
        ymax - ymin,
        linewidth=3,
        edgecolor='r',
        facecolor='none',
    )

    # Draw the box on top of the image, then render the figure.
    axes.add_patch(box_outline)
    plt.show()

def extract(index, loader, target_size=224):
    """Return sample ``index`` and its first annotated box, rescaled.

    Args:
        index: position of the sample in ``loader``.
        loader: indexable collection yielding ``(img, target)`` where
            ``target`` is a VOC-style annotation dict.
        target_size: side length (in pixels) of the grid the box is mapped
            onto. Defaults to 224, the value that used to be hard-coded.

    Returns:
        ``(img, [xmin, xmax, ymin, ymax])`` with the box expressed in
        ``target_size`` coordinates.
    """
    img, target = loader[index]
    annotation = target['annotation']

    # Depending on how the XML was parsed, a single object may be stored as a
    # bare dict instead of a one-element list; accept both layouts.
    objects = annotation['object']
    first_object = objects[0] if isinstance(objects, (list, tuple)) else objects
    bndbox = first_object['bndbox']

    image_width = int(annotation['size']['width'])
    image_height = int(annotation['size']['height'])

    xmin = ( int(bndbox['xmin']) / image_width ) * target_size
    xmax = ( int(bndbox['xmax']) / image_width ) * target_size
    ymin = ( int(bndbox['ymin']) / image_height ) * target_size
    ymax = ( int(bndbox['ymax']) / image_height ) * target_size

    return img, [xmin, xmax, ymin, ymax]

In [7]:
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch.optim as optim
import cv2 as cv
import sys

# One replay-memory record: the state an action was taken in, the action, the
# state that followed (stored as None when the episode ended) and the reward.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


def sizeof_fmt(num, suffix='B'):
    """Format ``num`` as a human-readable size using binary (1024) prefixes.

    Adapted from Fred Cirera, https://stackoverflow.com/a/1094933/1870254.
    Values that are still >= 1024 after the 'Zi' prefix are reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[position], suffix)
        num = num / 1024.0
        position += 1
    return "%.1f %s%s" % (num, 'Yi', suffix)


class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` records.

    ``memory`` grows until it holds ``capacity`` entries; after that,
    ``position`` wraps around and new transitions overwrite the oldest ones.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Store one transition built from ``args`` at the write position."""
        still_growing = len(self.memory) < self.capacity
        if still_growing:
            # Reserve the slot that is about to be written.
            self.memory.append(None)
        slot = self.position
        self.memory[slot] = Transition(*args)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions chosen without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)


class Agent():
    """DQN agent that iteratively deforms a bounding box to localise an object.

    Boxes handled by ``do_action``, ``compute_reward``,
    ``compute_trigger_reward`` and ``train`` use the layout
    ``[xmin, xmax, ymin, ymax]``. ``intersection_over_union`` keeps its
    corner layout ``[x1, y1, x2, y2]``; the reward helpers convert between
    the two.

    A state is a ``(1, 90 + 4096)`` tensor: the last 10 entries of the action
    history (9 values each) followed by the feature vector of the current
    image region.

    Action indices: 0 = trigger (end of episode), 1-4 shrink the box from the
    left / right / top / bottom edge, 5-8 grow it on the same edges.
    """

    def __init__(self, alpha=0.2, nu=0.3, threshold=0.6, num_episodes=50 ):
        """Create the networks, optimiser, replay memory and action history.

        Args:
            alpha: scaling factor in [0, 1]; each action moves one edge by
                ``alpha`` times the current box width/height.
            nu: magnitude of the reward given for the trigger action.
            threshold: IoU a triggered box must reach to be rewarded.
            num_episodes: number of episodes run by ``train``.
        """
        self.BATCH_SIZE = 128
        self.GAMMA = 0.999
        self.EPS_START = 0.9
        self.EPS_END = 0.05
        self.EPS_DECAY = 200
        self.TARGET_UPDATE = 10
        # Number of most recent history entries that are part of the state.
        self.HISTORY_LENGTH = 10

        # Every image region is resized to this size before feature extraction.
        screen_height, screen_width = 224, 224
        self._resize = transforms.Resize((screen_height, screen_width))

        self.n_actions = 9
        # The extractor is only ever used for inference: keep it on the same
        # device as the Q-networks and in eval mode.
        self.feature_extractor = FeatureExtractor().to(device)
        self.feature_extractor.eval()
        self.policy_net = DQN(screen_height, screen_width, self.n_actions).to(device)
        self.target_net = DQN(screen_height, screen_width, self.n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.episode_durations = []

        self.alpha = alpha          # in [0, 1]: scaling factor of each box move
        self.nu = nu                # magnitude of the trigger reward
        self.threshold = threshold  # IoU required for a positive trigger reward
        self.num_episodes = num_episodes
        # NOTE(review): the history is pre-filled with 20 rows of 100s rather
        # than zeros; kept as in the original -- confirm this is intended.
        self.actions_history = [[100] * 9 for _ in range(20)]

    def select_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Epsilon decays exponentially from ``EPS_START`` to ``EPS_END`` as
        ``steps_done`` grows.

        Returns:
            A ``(1, 1)`` long tensor holding the chosen action index.
        """
        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            math.exp(-1. * self.steps_done / self.EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                # Greedy choice: index of the largest predicted Q-value.
                output = self.policy_net(state)
                return torch.argmax(output).view(1, 1)
        return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    def intersection_over_union(self, boxA, boxB):
        """Return the IoU of two boxes given as ``[x1, y1, x2, y2]``.

        Returns 0 when the boxes do not overlap.
        """
        # Corners of the intersection rectangle.
        xA = max(boxA[0], boxB[0])
        yA = max(boxA[1], boxB[1])
        xB = min(boxA[2], boxB[2])
        yB = min(boxA[3], boxB[3])

        interArea = max(xB - xA, 0) * max(yB - yA, 0)
        if interArea == 0:
            return 0

        boxAArea = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
        boxBArea = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))

        # Union = sum of both areas minus the part counted twice.
        return interArea / float(boxAArea + boxBArea - interArea)

    @staticmethod
    def _to_corners(box):
        """Convert ``[xmin, xmax, ymin, ymax]`` to ``[x1, y1, x2, y2]``."""
        xmin, xmax, ymin, ymax = box
        return [xmin, ymin, xmax, ymax]

    def _box_iou(self, box, ground_truth):
        """IoU of two boxes given in the ``[xmin, xmax, ymin, ymax]`` layout."""
        return self.intersection_over_union(self._to_corners(box),
                                            self._to_corners(ground_truth))

    def compute_reward(self, actual_state, previous_state, ground_truth):
        """Reward for a box move: -1 if the IoU decreased, +1 otherwise.

        All three boxes use the ``[xmin, xmax, ymin, ymax]`` layout.
        """
        res = self._box_iou(actual_state, ground_truth) - self._box_iou(previous_state, ground_truth)
        if res < 0:
            return -1
        return 1

    def compute_trigger_reward(self, actual_state, ground_truth):
        """Reward for the trigger action.

        Returns ``+nu`` when the IoU with the ground truth reaches
        ``threshold`` and ``-nu`` otherwise (the previous version had the
        signs inverted, rewarding the agent for stopping on a bad box).
        Both boxes use the ``[xmin, xmax, ymin, ymax]`` layout.
        """
        res = self._box_iou(actual_state, ground_truth)
        if res >= self.threshold:
            return self.nu
        return -self.nu

    def do_action(self, image, action, xmin, xmax, ymin, ymax):
        """Apply ``action`` to the box and return the new box.

        Args:
            image: image the box lives in, or ``None``. When given, its last
                two dimensions are taken as (height, width) and the result is
                clamped to ``[0, width - 1]`` x ``[0, height - 1]``.
            action: action index (0-8) as an int or a one-element
                array/tensor. The previous version ignored this argument and
                applied a random action instead.
            xmin, xmax, ymin, ymax: current box.

        Returns:
            ``[xmin, xmax, ymin, ymax]`` after the move.

        Raises:
            ValueError: if ``action`` is not in 0..8.
        """
        action_index = int(action.item()) if hasattr(action, 'item') else int(action)
        alpha_h = self.alpha * (ymax - ymin)
        alpha_w = self.alpha * (xmax - xmin)

        if action_index == 0:
            pass  # trigger: the box is left untouched
        elif action_index == 1:
            xmin += alpha_w
        elif action_index == 2:
            xmax -= alpha_w
        elif action_index == 3:
            ymin += alpha_h
        elif action_index == 4:
            ymax -= alpha_h
        elif action_index == 5:
            xmin -= alpha_w
        elif action_index == 6:
            xmax += alpha_w
        elif action_index == 7:
            ymin -= alpha_h
        elif action_index == 8:
            ymax += alpha_h
        else:
            raise ValueError("Unknown action index: %r" % (action_index,))

        if image is not None:
            # Keep the box inside the image so it can be used for cropping.
            height, width = image.shape[-2], image.shape[-1]
            xmin = min(max(xmin, 0), width - 1)
            xmax = min(max(xmax, 0), width - 1)
            ymin = min(max(ymin, 0), height - 1)
            ymax = min(max(ymax, 0), height - 1)
        return [xmin, xmax, ymin, ymax]

    def _crop(self, image, box):
        """Cut ``box`` out of a (C, H, W) ``image``; always at least 1x1 pixel."""
        xmin, xmax, ymin, ymax = box
        height, width = image.shape[-2], image.shape[-1]
        x0 = min(max(int(xmin), 0), width - 1)
        y0 = min(max(int(ymin), 0), height - 1)
        x1 = min(max(int(xmax), x0 + 1), width)
        y1 = min(max(int(ymax), y0 + 1), height)
        # Rows (dim 1) are indexed by y, columns (dim 2) by x.
        return image[:, y0:y1, x0:x1]

    def _build_state(self, region):
        """Turn an image region into a ``(1, 90 + 4096)`` state tensor."""
        pil_region = transforms.functional.to_pil_image(region.cpu())
        resized = transforms.ToTensor()(self._resize(pil_region))
        # The extractor is a fixed feature provider: no autograd graph must be
        # built here, otherwise it would be kept alive by the replay memory.
        with torch.no_grad():
            features = self.feature_extractor(resized.unsqueeze(0).to(device))
        recent = np.array(self.actions_history[-self.HISTORY_LENGTH:], dtype=np.float32)
        historic = torch.from_numpy(recent.flatten()).to(device)
        state = torch.cat((historic, torch.flatten(features).to(device)), dim=0)
        return state.unsqueeze(0)

    def optimize_model(self):
        """Run one optimisation step of the policy network on a replay batch.

        Does nothing until the replay memory holds ``BATCH_SIZE`` transitions.
        """
        if len(self.memory) < self.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose a batch of Transitions into a Transition of batches.
        batch = Transition(*zip(*transitions))

        # Terminal transitions are stored with next_state=None.
        non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                      device=device, dtype=torch.bool)
        state_batch = torch.cat(batch.state)    # (B, state_dim)
        action_batch = torch.cat(batch.action)  # (B, 1)
        reward_batch = torch.cat(batch.reward)  # (B,)

        # Q(s_t, a_t) for the actions that were actually taken.
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # V(s_{t+1}) from the target network; 0 for terminal states.
        next_state_values = torch.zeros(self.BATCH_SIZE, device=device)
        if non_final_mask.any():
            non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
            next_state_values[non_final_mask] = \
                self.target_net(non_final_next_states).max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch

        # Huber loss between predicted and bootstrapped Q-values.
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self, image, labels):
        """Train the agent on a single image.

        Args:
            image: (C, H, W) image tensor.
            labels: ground-truth box as ``[xmin, xmax, ymin, ymax]`` in the
                pixel coordinates of ``image``.

        Each episode starts from the full-image box and ends when the agent
        selects the trigger action (index 0).
        """
        height, width = image.shape[1], image.shape[2]
        initial_box = [0, width - 1, 0, height - 1]
        ground_truth = labels

        for i_episode in range(self.num_episodes):
            print("\n\nEpisode : ("+str(i_episode)+"/"+str(self.num_episodes)+")")
            # Every episode restarts from the whole image.
            xmin, xmax, ymin, ymax = initial_box
            state = self._build_state(image)

            for t in count():
                action = self.select_action(state)
                action_index = int(action.item())

                # Record the action as a one-hot row; the history length stays constant.
                self.actions_history.append(list(np.eye(self.n_actions)[action_index]))
                del self.actions_history[0]

                done = action_index == 0

                previous_box = [xmin, xmax, ymin, ymax]
                xmin, xmax, ymin, ymax = self.do_action(image, action_index, xmin, xmax, ymin, ymax)
                current_box = [xmin, xmax, ymin, ymax]

                if done:
                    reward = self.compute_trigger_reward(current_box, ground_truth)
                    next_state = None
                else:
                    reward = self.compute_reward(current_box, previous_box, ground_truth)
                    next_state = self._build_state(self._crop(image, current_box))

                # Float dtype: an integer dtype would truncate +/-nu to 0.
                reward = torch.tensor([reward], device=device, dtype=torch.float32)

                self.memory.push(state, action, next_state, reward)
                state = next_state

                self.optimize_model()
                if done:
                    self.episode_durations.append(t + 1)
                    break

            # Periodically sync the target network with the policy network.
            if i_episode % self.TARGET_UPDATE == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

        print('Complete')
        plt.ioff()
        plt.show()

In [5]:
class FeatureExtractor(nn.Module):
    """Pretrained VGG16 truncated before its last two classifier layers."""

    def __init__(self):
        super().__init__()
        backbone = torchvision.models.vgg16(pretrained=True)
        backbone.eval()  # so that dropout is not applied

        # Copy the whole convolutional part.
        conv_layers = list(backbone.features.children())
        self.features = nn.Sequential(*conv_layers)

        # Background on features vs. classifier:
        # https://www.kaggle.com/carloalbertobarbano/vgg16-transfer-learning-pytorch
        # Keep only part of the classifier: dropping the last 2 layers stops at relu7.
        head_layers = list(backbone.classifier.children())
        self.classifier = nn.Sequential(*head_layers[:-2])

    def forward(self, x):
        """Run the conv stack, flatten per sample, then apply the kept classifier layers."""
        feature_maps = self.features(x)
        flattened = feature_maps.view(feature_maps.size(0), -1)
        return self.classifier(flattened)
class DQN(nn.Module):
    """Q-network mapping a state vector to one Q-value per action.

    The input has ``90 + 4096`` features. Two hidden layers of 1024 units are
    each followed by a ReLU; without a non-linearity the three stacked linear
    layers would collapse into a single affine map.
    """

    def __init__(self, h, w, outputs):
        """Build the network.

        Args:
            h: unused, kept for interface compatibility.
            w: unused, kept for interface compatibility.
            outputs: number of actions, i.e. size of the output layer
                (previously ignored in favour of a hard-coded 9).
        """
        super(DQN, self).__init__()
        # Only Linear layers live in the Sequential so that parameter names
        # (classifier.0/1/2) stay the same as before; the activations are
        # applied functionally in forward().
        self.classifier = nn.Sequential(
            nn.Linear( in_features= 90 + 4096, out_features=1024),
            nn.Linear( in_features= 1024, out_features=1024),
            nn.Linear( in_features= 1024, out_features=outputs)
        )

    def forward(self, x):
        """Return Q-values for a single state or a batch of states.

        The last dimension of ``x`` must be ``90 + 4096``; the last dimension
        of the result is ``outputs``.
        """
        x = torch.relu(self.classifier[0](x))
        x = torch.relu(self.classifier[1](x))
        return self.classifier[2](x)

In [None]:
# NOTE: read_voc_dataset returns the two VOCDetection datasets themselves (not
# DataLoaders), so despite their names these objects are indexed directly below.
train_loader, val_loader = read_voc_dataset(download=False, year='2012')

#print(labels)
#print("size : "+str(len(labels['annotation']['object'][0]['bndbox']['xmin'])) ) # ['annotation']['object'][0]['bndbox']['xmin']
# Index of the single training sample used for this run.
index = 13
#show_bdbox(train_loader, index)

# alpha is the scaling factor applied to the box by each action.
agent = Agent(alpha=0.2)
# extract() returns the image and its first annotated box as [xmin, xmax, ymin, ymax].
image, [xmin, xmax, ymin, ymax] = extract(index, train_loader)
agent.train(image, [xmin, xmax, ymin, ymax])



Episode : (0/50)






Episode : (1/50)


Episode : (2/50)


Episode : (3/50)


Episode : (4/50)


Episode : (5/50)


Episode : (6/50)


Episode : (7/50)


Episode : (8/50)


Episode : (9/50)


Episode : (10/50)


Episode : (11/50)


Episode : (12/50)


Episode : (13/50)


Episode : (14/50)


Episode : (15/50)


Episode : (16/50)


Episode : (17/50)


Episode : (18/50)


Episode : (19/50)


Episode : (20/50)


Episode : (21/50)


Episode : (22/50)


Episode : (23/50)


Episode : (24/50)


Episode : (25/50)


Episode : (26/50)


Episode : (27/50)


Episode : (28/50)


Episode : (29/50)
