In [172]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from PIL import Image
from torch import nn
from torch.autograd import Variable
from torch.utils.data import Dataset
from torchvision.transforms.functional import to_tensor

In [2]:
# Locate the CelebAMask-HQ images and the attribute annotation table.
attribute_file = 'CelebAMask-HQ-attribute-anno.txt'
datadir = '/home/declan/Data/Faces/'
img_dir = os.path.join(datadir, 'CelebA-HQ-img')
attr_dir = os.path.join(datadir, attribute_file)  # NOTE: despite the name, this is the annotation *file* path.
attributes = pd.read_csv(attr_dir, delimiter=' ')

# Keep the attribute columns of interest, subsample the rows, and map -1 to 0.
good_data = (
    attributes[[
        'Bald', 'Smiling', 'Wearing_Earrings', 'Wearing_Hat', 'Wearing_Lipstick',
        'Wearing_Necktie', 'Wearing_Necklace', 'Gray_Hair', 'Eyeglasses',
    ]]
    .sample(frac=.05)  ##### CHANGE THIS LATER ######
    .clip(lower=0)
)

# Drop rows in which every selected attribute is zero.
good_data = good_data.loc[(good_data != 0).any(axis=1)]

# The frame's index is used as the list of image file names.
img_names = good_data.index.to_numpy()

In [18]:
class FaceDataset(Dataset):
    """CelebAMask-HQ face images paired with a subset of their attribute labels.

    On construction the attribute table is read, restricted to
    ``ATTRIBUTE_COLUMNS``, randomly subsampled, clipped so that -1 becomes 0,
    and rows whose selected attributes are all zero are dropped. Every
    remaining image is loaded eagerly into memory as a tensor.

    Args:
        datadir: Directory holding ``CelebA-HQ-img/`` and the annotation file.
        frac: Fraction of annotation rows to sample (kept small while prototyping).

    Attributes:
        images: list of image tensors produced by ``to_tensor``.
        features: 2-D array of the selected attributes, one row per image.
    """

    # Attribute columns kept as the target features.
    ATTRIBUTE_COLUMNS = ['Bald', 'Smiling', 'Wearing_Earrings', 'Wearing_Hat', 'Wearing_Lipstick',
                         'Wearing_Necktie', 'Wearing_Necklace', 'Gray_Hair', 'Eyeglasses']

    def __init__(self, datadir='/home/declan/Data/Faces/', frac=.01):
        # Load the features.
        attribute_file = 'CelebAMask-HQ-attribute-anno.txt'
        img_dir = os.path.join(datadir, 'CelebA-HQ-img')
        attr_path = os.path.join(datadir, attribute_file)
        attributes = pd.read_csv(attr_path, delimiter=' ')

        good_data = attributes[self.ATTRIBUTE_COLUMNS]
        good_data = good_data.sample(frac=frac)  ##### CHANGE THIS LATER ######
        good_data = good_data.clip(lower=0)  # Set -1 to 0.
        good_data = good_data[(good_data != 0).any(axis=1)]  # Drop rows with only zeros.

        # The frame's index holds the image file names.
        img_names = good_data.index.values
        self.images = [self._load_image(os.path.join(img_dir, name)) for name in img_names]
        self.features = good_data.values

    @staticmethod
    def _load_image(path):
        """Read one image file into a tensor, closing the file handle afterwards."""
        with Image.open(path) as img:
            # to_tensor reads the pixel data, so conversion must happen before the file closes.
            return to_tensor(img)

    def __getitem__(self, index):
        """Return the ``(image, features)`` pair stored at ``index``."""
        return self.images[index], self.features[index]

    def __len__(self):
        """Return the number of loaded images."""
        return len(self.images)

array(['0.jpg', '1.jpg', '2.jpg', ..., '29997.jpg', '29998.jpg',
       '29999.jpg'], dtype=object)

### Parameters to keep track of.

### Create a training function.

In [None]:
def train(opts):
    """Train the agent using the settings in ``opts`` (not implemented yet).

    Args:
        opts: Training options. No fields are read yet.

    Raises:
        NotImplementedError: Always, until the training loop is written.
    """
    # TODO: build the dataset and agent, then alternate gather_trajectory / update_policy.
    raise NotImplementedError("train() has not been implemented yet.")

### Create the policy network.

In [239]:
class Policy(nn.Module):
    """Recurrent policy that looks at one small view at a time.

    Each call to ``forward`` encodes the current view and its location, fuses
    them, updates an LSTM belief state, and from the new hidden state produces
    (a) a probability distribution over 256 candidate actions and
    (b) a 3x512x512 reconstruction of the full image.
    """

    def __init__(self):
        super(Policy, self).__init__()

        # Network settings
        self.rnn_hidden_size = 256
        # Retained for callers that read or set it. forward() no longer needs it because the
        # initial LSTM state is created on the same device (and dtype) as the input features.
        self.iscuda = False
        # The input size for location embedding / proprioception stack
        self.input_size_loc = 2 # Relative camera position
        self.input_size_act = 256

        # (1) Sense image: Takes in BxCx32x32 image input and converts it to Bx256 matrix.
        self.sense_im = nn.Sequential(self._conv(3, 16),      # Bx16x16x16
                                      self._conv(16, 32),     # Bx32x8x8
                                      self._conv(32, 64),     # Bx64x4x4
                                      View((-1, 1024)),       # Bx1024
                                      self._linear(1024, 256) # Bx256
                                     )

        # (2) Sense proprioception stack: Converts proprioception inputs to 16-D vector.
        self.sense_pro = self._linear(self.input_size_loc, 16)

        # (3) Fuse: Fusing the outputs of (1) and (2) to give 256-D vector per image
        # May be appropriate to add activation function later or change to self._linear.
        self.fuse = nn.Sequential(self._linear(272, 256), # Bx256
                                  nn.Linear(256, 256),    # Bx256
                                  nn.BatchNorm1d(256)
                                 )

        # (4) Aggregator: View aggregating LSTM
        self.aggregate = nn.LSTM(input_size=256, hidden_size=self.rnn_hidden_size, num_layers=1)

        # (5) Act module: Takes in aggregator hidden state to produce probability distribution over actions
        self.act = nn.Sequential(self._linear(self.input_size_act, 128),
                                 self._linear(128, 128),
                                 nn.Linear(128, 256)    # because (512/32)**2=256
                                )

        # (6) Decode module: Given the current representation of the image, reconstruct the full view.
        # Each _deconv stage doubles the spatial size (kernel 4, stride 2, padding 1).
        self.decode = nn.Sequential(self._linear(256, 1024), # Bx1024
                                    View((-1, 64, 4, 4)),    # Bx64x4x4
                                    self._deconv(64, 64),    # Bx64x8x8
                                    self._deconv(64, 32),    # Bx32x16x16
                                    self._deconv(32, 32),    # Bx32x32x32
                                    self._deconv(32, 16),    # Bx16x64x64
                                    self._deconv(16, 16),    # Bx16x128x128
                                    self._deconv(16, 8),     # Bx8x256x256
                                    self._deconv(8, 3)       # Bx3x512x512
                                   )

    def forward(self, x, hidden=None):
        """Process one view and update the recurrent belief state.

        Args:
            x: dict with ``'im'`` (Bx3x32x32 view) and ``'pro'`` (Bx2 view location).
            hidden: LSTM state ``(h, c)`` from the previous step, each of shape
                (1, B, rnn_hidden_size), or None to start from zeros. A list is
                accepted as well as a tuple.

        Returns:
            probs: BxA action probabilities (A = 256), each row summing to 1.
            hidden: updated LSTM state ``(h, c)``.
            decoded: Bx3x512x512 reconstruction decoded from the new hidden state.
        """
        # "Sense" the image and the location, then concatenate the two embeddings (Bx272).
        view_features = self.sense_im(x['im'])
        loc_features = self.sense_pro(x['pro'])
        x = torch.cat([view_features, loc_features], dim=1)
        batch_size = x.shape[0]

        # An LSTM carries two state tensors: the hidden state h and the cell state c,
        # each shaped (num_layers, batch_size, hidden_size).
        if hidden is None:
            # new_zeros inherits dtype and device from the features, so the state always
            # matches the model input (CPU/GPU, float32/float64) without a manual flag.
            h0 = x.new_zeros(1, batch_size, self.rnn_hidden_size)
            hidden = (h0, h0.clone())
        else:
            # nn.LSTM documents its state as a tuple; normalise lists passed by older callers.
            hidden = tuple(hidden)

        # Fuse the proprioceptive representation and the view representation.
        x = self.fuse(x)

        # Update the belief state about the image.
        # Note: input to aggregate lstm has to be (seq_length x batch_size x input_dims)
        # Since we are feeding in the inputs one by one, it is 1 x batch_size x 256
        x, hidden = self.aggregate(x.unsqueeze(0), hidden)

        # Define input to the action and decoding layers: the new hidden state h as Bx256.
        act_input = hidden[0].view(batch_size, -1)

        # Predict the probability of all actions.
        probs = F.softmax(self.act(act_input), dim=1)

        # Decode the whole image using the decoder.
        decoded = self.decode(act_input)

        return probs, hidden, decoded

    def _linear(self, in_size, out_size):
        """Linear -> BatchNorm1d -> ReLU block."""
        return nn.Sequential(
            nn.Linear(in_size, out_size),
            nn.BatchNorm1d(out_size),
            nn.ReLU(inplace=True)
        )

    def _conv(self, in_size, out_size):
        """5x5 same-padded convolution -> stride-2 max-pool (halves H and W) -> ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_size, out_size, kernel_size=5, stride=1, padding=2),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True)
        )

    def _deconv(self, in_size, out_size):
        """Transposed convolution that doubles H and W, followed by ReLU."""
        return nn.Sequential(
            nn.ConvTranspose2d(in_size, out_size, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True)
        )

class View(nn.Module):
    """Wraps ``Tensor.view`` as a module so a reshape can sit inside ``nn.Sequential``."""

    def __init__(self, shape):
        super().__init__()
        # Target shape, unpacked into ``view`` on every forward call.
        self.shape = shape

    def forward(self, x):
        """Return ``x`` viewed with the stored target shape."""
        target_shape = self.shape
        return x.view(*target_shape)

### Test that the Policy network works (or at least doesn't throw errors).

In [240]:
# Smoke test: push one dummy batch through the Policy and check that nothing raises.
# The input is 32 all-zero RGB images of size 512x512 (zeros, not random values).
tensor = torch.zeros([32, 3, 512, 512])
state_object = StateObject(tensor)   # Create the view object.

# With no action supplied, get_view() returns the upper-left 32x32 view and its location.
im, pro = state_object.get_view()
im = torch.as_tensor(im, dtype=torch.float32)
pro = torch.as_tensor(pro, dtype=torch.float32)

# ---- Policy forward pass ----
policy_input = {'im': im, 'pro': pro}

model = Policy()
probs, hidden, decoded = model(policy_input)

torch.Size([32, 256])
torch.Size([32, 16])


In [234]:
# Shape of the action distribution returned by the policy smoke test.
probs.shape

torch.Size([32, 256])

In [217]:
# Shape of the view-location (proprioception) input fed to the policy.
pro.shape

torch.Size([32, 2])

In [218]:
# Shape of the image view fed to the policy.
im.shape

torch.Size([32, 3, 32, 32])

### Testing out different view encoders.

In [126]:
# Prototype of the view encoder: three conv + max-pool stages followed by a linear layer.
# The shape is printed after every pooling stage and after the final projection.
tensor = torch.zeros([64, 3, 32, 32])

# Layers are created in the same order as they are applied.
conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=1, padding=2)
maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2)
maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
conv3 = nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2)
maxpool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
linear = nn.Linear(1024, 256)

# Stage 1
x = maxpool1(conv1(tensor))
print(x.shape)

# Stage 2
x = maxpool2(conv2(x))
print(x.shape)

# Stage 3
x = maxpool3(conv3(x))
print(x.shape)

# Flatten each sample and project to the 256-D embedding.
x = linear(x.view(64, -1))
print(x.shape)

torch.Size([64, 16, 16, 16])
torch.Size([64, 32, 8, 8])
torch.Size([64, 64, 4, 4])
torch.Size([64, 256])


### Testing out different view decoders.

In [157]:
# Prototype of the decoder: a linear layer expands the 256-D code to 64x4x4, seven
# stride-2 transposed convolutions upsample it, and a final stride-1 layer maps to 3 channels.
tensor = torch.zeros([64, 256])
linear = nn.Linear(256, 1024)
convT1 = nn.ConvTranspose2d(64, 64, kernel_size=4, stride=2, padding=1)
convT2 = nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1)
convT3 = nn.ConvTranspose2d(32, 32, kernel_size=4, stride=2, padding=1)
convT4 = nn.ConvTranspose2d(32, 16, kernel_size=4, stride=2, padding=1)
convT5 = nn.ConvTranspose2d(16, 16, kernel_size=4, stride=2, padding=1)
convT6 = nn.ConvTranspose2d(16, 8, kernel_size=4, stride=2, padding=1)
convT7 = nn.ConvTranspose2d(8, 8, kernel_size=4, stride=2, padding=1)
conv8 = nn.ConvTranspose2d(8, 3, kernel_size=3, stride=1, padding=1)

# Expand the code and reshape it into a 64-channel 4x4 feature map.
x = linear(tensor).view(-1, 64, 4, 4)

# Upsampling stages, applied in order convT1 .. convT7.
x = convT3(convT2(convT1(x)))
x = convT6(convT5(convT4(x)))
x = convT7(x)
print(x.shape)

# Final channel projection; the cell displays the resulting shape.
x = conv8(x)
x.shape

torch.Size([64, 8, 512, 512])


torch.Size([64, 3, 512, 512])

In [112]:
class Agent(BaseAgent):
    """
    This agent implements the policy from Policy class and uses REINFORCE / Actor-Critic for policy improvement

    NOTE(review): BaseAgent is not defined in this notebook. This class relies on it to
    provide self.T, self.policy, self.iscuda, self.R_avg and self.avg_count - confirm.
    """
    def __init__(self, opts, mode='train'):
        super(Agent, self).__init__(opts, mode=mode)

    def gather_trajectory(self, state_object, eval_opts=None, pano_maps=None, opts=None):
        """
        gather_trajectory gets an observation, updates its belief of the state, decodes the
        full image and takes the next action. This is done repeatedly for T time steps.

        INPUTS:
            state_object: object exposing batch_size, images and get_view(actions)
            eval_opts: None while training; during testing a dict whose 'greedy' entry
                       selects argmax actions instead of sampled ones
            pano_maps, opts: accepted so existing keyword calls keep working; unused here

        OUTPUTS (in order):
            log_probs: list of T-1 tensors (batch_size,), log-probability of each chosen action
            rec_errs: list of T tensors (batch_size,), per-sample reconstruction MSE
            rewards: list of T-1 tensors (batch_size,); only the last one is non-zero and
                     holds the negative final reconstruction error
            entropies: list of T-1 tensors (batch_size,), entropy of the action distribution
            decoded: reconstruction from the final time step
            values: None (this policy has no value head)
            visited_idxes: list of T entries recording where the agent looked
            decoded_all: empty list (kept for interface compatibility; never populated)
            actions_taken: (batch_size, T-1) tensor of chosen action indices
        """

        # Setup variables to store trajectory information.
        rewards = []
        log_probs = []
        rec_errs = []
        entropies = []
        hidden = None
        visited_idxes = []
        batch_size = state_object.batch_size
        decoded_all = []
        values = None  # No critic in this policy; update_policy accepts values=None.
        actions_taken = torch.zeros(batch_size, self.T-1)

        device = torch.device('cuda' if self.iscuda else 'cpu')
        # The reconstruction target is the full image batch; move it once, outside the loop.
        target = torch.as_tensor(state_object.images, dtype=torch.float32).to(device)
        # One-hot encoding of the last chosen action. None makes get_view return the initial view.
        next_action = None

        for t in range(self.T):
            # ---- Observe the view selected by the previous action ----
            im, pro = state_object.get_view(next_action)
            im = torch.as_tensor(im, dtype=torch.float32)
            pro = torch.as_tensor(pro, dtype=torch.float32)

            # Keep track of visited locations. Use the state object's own index when it has
            # one; otherwise fall back to the view location returned by get_view.
            visited_idxes.append(getattr(state_object, 'idx', pro))

            # ---- Policy forward pass ----
            policy_input = {'im': im.to(device), 'pro': pro.to(device)}

            # Note: decoded and hidden correspond to the previous transition
            # probs correspond to the new transition, where the action
            # probabilities of the current state are estimated for PG update.
            probs, hidden, decoded = self.policy(policy_input, hidden)

            # Per-sample reconstruction loss (corresponding to the previous transition).
            # A per-sample error (rather than one scalar) is required by the running reward
            # average below and by update_policy, which reads the batch size from it.
            ###### NOTE: MAY BE APPROPRIATE TO USE A DIFFERENT LOSS FUNCTION HERE.
            rec_err = F.mse_loss(decoded, target, reduction='none').flatten(start_dim=1).mean(dim=1)

            # Reconstruction reward is obtained only at the final step
            # If there is only one step (T=1), then do not provide rewards
            # Note: This reward corresponds to the previous action
            if t < self.T-1 or t == 0:
                reward = torch.zeros(batch_size, device=device)
            else:
                reward = -rec_err.detach() # Disconnects reward from future updates
                self.R_avg = (self.R_avg * self.avg_count + reward.sum())/(self.avg_count + batch_size)
                self.avg_count += batch_size
            if t > 0:
                rewards[t-1] += reward

            # There are self.T reconstruction errors as opposed to self.T-1 rewards
            rec_errs.append(rec_err)

            # ---- Sample action ----
            # except for the last time step when only the selected view from previous step is used in aggregate.
            if t < self.T - 1:
                # Act based on the policy network.
                if eval_opts is None or eval_opts['greedy'] == False:
                    act = probs.multinomial(num_samples=1).detach()
                else:
                    # This works only while evaluating, not while training
                    act = probs.argmax(dim=1, keepdim=True)
                # Compute entropy
                entropy = -(probs*((probs+1e-7).log())).sum(dim=1)
                # Store log probabilities of selected actions
                log_prob = (probs.gather(1, act).squeeze(1)+1e-7).log()

                actions_taken[:, t] = act[:, 0].cpu()
                # Apply the action: the next get_view call receives a one-hot row per sample,
                # so its argmax picks exactly the chosen view.
                next_action = F.one_hot(act[:, 0], num_classes=probs.size(1)).cpu().numpy()

                # Intrinsic reward for the current action. There is no expert signal in this
                # notebook, so it starts at zero; the final reconstruction reward is added
                # to the last entry on the following step.
                rewards.append(torch.zeros(batch_size, device=device))
                log_probs.append(log_prob)
                entropies.append(entropy)

        return log_probs, rec_errs, rewards, entropies, decoded, values, visited_idxes, decoded_all, actions_taken

NameError: name 'BaseAgent' is not defined

In [None]:
def update_policy(self, rewards, log_probs, task_errs, entropies, values=None):
    """
    This function will take the rewards, log probabilities and task-specific errors from
    the trajectory and perform the parameter updates for the policy using
    REINFORCE / Actor-Critic.
    INPUTS:
        rewards: list of T-1 Tensors containing reward for each batch at each time step
        log_probs: list of T-1 logprobs Tensors of each transition of batch
        task_errs: list of T error Tensors (one entry per sample) for each transition of batch
        entropies: list of T-1 entropy Tensors for each transition of batch
        values: list of T-1 predicted values for each transition of batch (currently unused)
    Reads self.T, self.R_avg, self.R_avg_expert, self.reward_scale_expert, self.reward_scale,
    self.lambda_entropy, self.policy and self.optimizer. Returns nothing; the policy
    parameters are updated in place by one optimizer step.
    """
    # ---- Setup initial values ----
    batch_size = task_errs[0].size(0)
    # Allocate accumulators on the same device as the errors instead of relying on a flag.
    device = task_errs[0].device
    R = torch.zeros(batch_size, device=device) # Reward accumulator
    B = 0 # Baseline accumulator - used primarily for the average baseline case
    loss = torch.zeros(1, device=device)

    # ---- Task-specific error computation
    for t in reversed(range(self.T)):
        loss = loss + task_errs[t].sum()/batch_size

    # --- REINFORCE loss based on T-1 transitions ----
    # Note: This will automatically be ignored when self.T = 1
    for t in reversed(range(self.T-1)):
        R = R + rewards[t] # A one sample MC estimate of Q[t]
        if t == self.T-2:
            # The final-step reward baseline is added once, at the last transition.
            B += self.R_avg
        B += self.R_avg_expert * self.reward_scale_expert
        # The advantage is treated as a constant: no gradient flows through it.
        adv = (R - B).detach()
        # PG loss
        loss_term_1 = - (log_probs[t]*self.reward_scale*adv).sum()/batch_size
        # Entropy loss, maximize entropy
        loss_term_2 = - self.lambda_entropy*entropies[t].sum()/batch_size
        loss = loss + loss_term_1 + loss_term_2

    self.optimizer.zero_grad()
    loss.backward()
    # In-place variant; the un-suffixed clip_grad_norm is deprecated.
    nn.utils.clip_grad_norm_(self.policy.parameters(), 10)
    self.optimizer.step()

In [None]:
# Forward pass: roll out one trajectory and collect everything needed for the update.
(
    log_probs,
    rec_errs,
    rewards,
    entropies,
    decoded,
    values,
    visited_idxes,
    decoded_all,
    _,
) = agent.gather_trajectory(state, eval_opts=None, pano_maps=pano_maps, opts=opts)

# Backward pass
agent.update_policy(rewards, log_probs, rec_errs, entropies, values)

### Define a state object to retrieve the views corresponding to the max-probability actions.

In [205]:
class StateObject():
    """Holds a batch of images and serves square views cut from a regular grid.

    The image plane is divided into ``view_size`` x ``view_size`` cells. Action index
    ``a`` addresses the cell at grid row ``a // grid_cols`` and grid column
    ``a % grid_cols``. With the defaults (512x512 images, 32x32 views) that is a
    16x16 grid with 256 actions.

    Args:
        images: (batch_size x 3 x 512 x 512) batch of images (torch tensor or numpy array).
        view_size: side length of one view in pixels.
    """
    def __init__(self, images, view_size=32):
        self.images = images
        self.batch_size = images.shape[0]
        self.view_size = view_size
        self.grid_rows = images.shape[2] // view_size
        self.grid_cols = images.shape[3] // view_size
        # Create a list to map from action index to views. Each entry is
        # ((first_row, last_row), (first_col, last_col)) with INCLUSIVE last indices.
        action_to_view = []
        for i in range(self.grid_rows):
            for j in range(self.grid_cols):
                start_idx_x = i*view_size
                end_idx_x = (i+1)*view_size-1
                start_idx_y = j*view_size
                end_idx_y = (j+1)*view_size-1
                action_to_view.append(( (start_idx_x, end_idx_x), (start_idx_y, end_idx_y) ))
        self.action_to_view = action_to_view

    def get_view(self, actions=None):
        """Return the view selected by the most probable action for every image.

        Args:
            actions: (batch_size x num_actions) array or tensor of action probabilities,
                or None to get the upper-left view (used for the first observation).

        Returns:
            views: (batch_size x C x view_size x view_size) crops, same container type as
                ``self.images``.
            view_locs: (batch_size x 2) numpy array of (grid_row / grid_rows,
                grid_col / grid_cols), so the upper-left view is at (0, 0).
        """
        size = self.view_size
        # `is None` rather than `== None`: comparing an array with == is elementwise.
        if actions is None:
            return self.images[:, :, 0:size, 0:size], np.array([[0, 0] for _ in range(self.batch_size)])

        if torch.is_tensor(actions):
            actions = actions.detach().cpu().numpy()
        action_inds = np.argmax(np.asarray(actions), axis=1) # indexes of max prob action.

        # Normalised grid position of each chosen view, in the same (row, col) order
        # as the entries of action_to_view.
        view_locs = np.array([[(a // self.grid_cols) / self.grid_rows,
                               (a % self.grid_cols) / self.grid_cols] for a in action_inds])

        # Crop every sample directly. The stored end indices are inclusive, hence the +1.
        crops = []
        for sample, a in enumerate(action_inds):
            (row_start, row_end), (col_start, col_end) = self.action_to_view[a]
            crops.append(self.images[sample, :, row_start:row_end+1, col_start:col_end+1])
        views = torch.stack(crops) if torch.is_tensor(self.images) else np.stack(crops)
        return views, view_locs