# Report - Continuous Control Project

## Learning Algorithm


A simplified version of the Proximal Policy Optimization (PPO) learning algorithm has been used in this project.
PPO was introduced by OpenAI in 2017 and represents a simplification of the Trust Region Policy Optimization (TRPO) algorithm. Compared to its predecessor, PPO achieves state-of-the-art learning while being less complex and easier to implement and tune.

Starting with the neural networks used for the policies, the agent uses actor-critic style multi-layer perceptron networks with 2 hidden layers of 400 and 300 units respectively, ReLU activations on the hidden layers and a tanh activation on the actor's output. The actor network outputs the means of a Gaussian distribution over the 4 actions, while the critic learns to predict the state value of any given state. The hidden layer sizes have been chosen according to this paper from McGill University: https://arxiv.org/pdf/1708.04133. A Xavier-style weight initialization is implemented in order to improve the learning capabilities of the policies. Finally, the actor-critic policy outputs the actions, log probabilities, entropy loss (used in the regularization term of the surrogate loss) and estimated state values (used in calculating the advantages).

In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

# Fan-in scaled uniform weight initialization for better learning
def w_init(layer):
    """Return symmetric uniform bounds for initializing ``layer.weight``.

    The bounds are ``(-lim, lim)`` with ``lim = 1 / sqrt(fan_in)``, where
    ``fan_in`` is the number of inputs feeding each unit of the layer.

    Args:
        layer: a layer exposing a 2-D ``weight`` laid out as
            ``(out_features, in_features)``, e.g. ``torch.nn.Linear``.

    Returns:
        Tuple ``(-lim, lim)`` suitable for ``weight.data.uniform_(*bounds)``.
    """
    # nn.Linear stores its weight as (out_features, in_features), so the
    # fan-in is the size of dimension 1 (dimension 0 is the fan-out).
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)


class ActorCriticPolicy(nn.Module):
    """Actor-critic policy with a fixed-variance Gaussian over the actions.

    The actor produces the mean of the Gaussian and the critic produces a
    value estimate for each state. The standard deviation is a constant
    tensor of ones; it is not trained.
    """

    def __init__(self, state_size, action_size, fc1_size, fc2_size, seed):
        """Build the actor and critic networks.

        Args:
            state_size: size of one state vector.
            action_size: number of action components.
            fc1_size: width of the first hidden layer of both networks.
            fc2_size: width of the second hidden layer of both networks.
            seed: seed forwarded to both networks.
        """
        super().__init__()

        self.actor = Actor(state_size, action_size, fc1_size, fc2_size, seed)
        self.critic = Critic(state_size, fc1_size, fc2_size, seed)
        # Constant unit standard deviation. It is deliberately a plain tensor
        # (not a Parameter/buffer) so that `parameters()` and `state_dict()`
        # stay unchanged; `forward` moves it to the right device on demand.
        self.std = torch.ones(1, action_size)

    def forward(self, states, actions=None):
        """Evaluate the policy on ``states``.

        Args:
            states: tensor whose last dimension is the state vector; any
                number of leading batch dimensions is accepted.
            actions: optional actions to score. When omitted, actions are
                sampled from the Gaussian.

        Returns:
            Tuple ``(actions, log_prob, entropy_loss, estimated_values)``:
            the given or sampled actions, their log-probability summed over
            the action dimension (kept as a trailing dimension of size 1),
            the entropy averaged over the action dimension, and the critic's
            output.
        """
        estimated_actions = self.actor(states)
        estimated_values = self.critic(states)

        # Define a gaussian distribution over the actions given by actor net.
        # `self.std` is not tracked by the module, so follow the actor output
        # to whatever device the networks live on.
        std = self.std.to(estimated_actions.device)
        gaussian = Normal(estimated_actions, std)

        if actions is None:
            actions = gaussian.sample()

        # Reduce over the last (action) dimension so that the same code works
        # for a single step (agents, actions) and for batches of steps.
        log_prob = torch.sum(gaussian.log_prob(actions), dim=-1, keepdim=True)
        entropy_loss = torch.sum(gaussian.entropy(), dim=-1) / estimated_actions.size(-1)

        return actions, log_prob, entropy_loss, estimated_values


class Actor(nn.Module):
    """Three-layer network mapping a state to tanh-squashed action values."""

    def __init__(self, state_size, action_size, fc1_size, fc2_size, seed):
        """Create the layers and draw their initial weights.

        Args:
            state_size: number of input features.
            action_size: number of output features.
            fc1_size: width of the first hidden layer.
            fc2_size: width of the second hidden layer.
            seed: value passed to ``torch.manual_seed`` before the layers
                are created.
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weight matrices of all three layers (biases untouched)."""
        for hidden_layer in (self.fc1, self.fc2):
            low, high = w_init(hidden_layer)
            hidden_layer.weight.data.uniform_(low, high)
        # The output layer gets a small fixed range.
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Return the tanh of the output layer for ``state``."""
        features = state
        for hidden_layer in (self.fc1, self.fc2):
            features = F.relu(hidden_layer(features))
        return torch.tanh(self.fc3(features))


class Critic(nn.Module):
    """Three-layer network mapping a state to a single unbounded value."""

    def __init__(self, state_size, fc1_size, fc2_size, seed):
        """Create the layers and draw their initial weights.

        Args:
            state_size: number of input features.
            fc1_size: width of the first hidden layer.
            fc2_size: width of the second hidden layer.
            seed: value passed to ``torch.manual_seed`` before the layers
                are created.
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, 1)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weight matrices of all three layers (biases untouched)."""
        for hidden_layer in (self.fc1, self.fc2):
            low, high = w_init(hidden_layer)
            hidden_layer.weight.data.uniform_(low, high)
        # The output layer gets a small fixed range.
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Return the raw output of the last layer for ``state``."""
        features = state
        for hidden_layer in (self.fc1, self.fc2):
            features = F.relu(hidden_layer(features))
        return self.fc3(features)


For storing the trajectories of each agent, the following data structure has been created:

In [None]:
import numpy as np

class Trajectory(object):
    """Step-by-step storage for one rollout plus its running score.

    Data is kept in ``self.traj``, a dict of lists keyed by 'state',
    'reward', 'prob', 'action', 'value' and 'done'. ``self.score`` holds
    the sum of the mean reward of every step added so far.
    """

    def __init__(self):
        self.traj = {
            'state': [],
            'reward': [],
            'prob': [],
            'action': [],
            'value': [],
            'done': [],
        }
        self.score = 0.

    def add(self, states, rewards, log_probs, actions, values, dones):
        """Append one step of data and add its mean reward to the score."""
        step = {
            'state': states,
            'reward': rewards,
            'prob': log_probs,
            'action': actions,
            'value': values,
            'done': dones,
        }
        for key, item in step.items():
            self.traj[key].append(item)
        self.score += np.mean(rewards)

    def __len__(self):
        """Return the number of stored steps."""
        return len(self.traj['state'])

    def __getitem__(self, key):
        """Return the list stored under ``key``."""
        return self.traj[key]


Additionally, we implement a Batcher class in order to batch data from the trajectories in chunks that would allow the learning algorithm to make a good gradient update.

In [None]:
class Batcher:
    '''Serves consecutive, equally sized slices of several parallel sequences.'''

    def __init__(self, batch_size, data):
        '''Store the sequences and position the window on the first batch.

        The data length is taken from the first sequence in ``data``.
        '''
        self.batch_size = batch_size
        self.data = data
        self.data_length = len(data[0])
        self.reset()

    def reset(self):
        '''Move the window back to the first batch.'''
        self.batch_start = 0
        self.batch_end = self.batch_start + self.batch_size

    def end(self):
        '''Return True once the window start has reached the data length.'''
        return self.batch_start >= self.data_length

    def next_batch(self):
        '''Return the current slice of every sequence and advance the window.'''
        window = slice(self.batch_start, self.batch_end)
        batch = [sequence[window] for sequence in self.data]
        self.batch_start = self.batch_end
        # The last window is clipped to the data length.
        self.batch_end = min(self.batch_start + self.batch_size, self.data_length)
        return batch

    def shuffle(self):
        '''Reorder every sequence with one shared random permutation.'''
        order = np.arange(self.data_length)
        np.random.shuffle(order)
        self.data = [sequence[order] for sequence in self.data]

For the initialization of the PPO learning agent, we take inspiration from the OpenAI Baselines PPO implementation: https://github.com/openai/baselines. Although we end up using different hidden layer sizes for our policies, we keep the batch size, learning rate, reward discount factor, gradient updates per trajectory, surrogate function's cliprange, value function loss coefficient and entropy loss coefficient the same. These parameters, together with updating every episode, resulted in very fast learning.

We choose the optimizer to be Adam, as it is a popular choice for policy gradient algorithms.

In [None]:
class PPO:
    '''Agent holding an actor-critic policy, its Adam optimizer and the PPO settings.'''

    def __init__(self, state_size, action_size, fc1_size=64, fc2_size=64, num_agents=1, seed=0, batch_size=128,
                 lr=0.00025, tau=0.95, gamma=0.99, nminibatches=4, cliprange=0.2, vf_coef=0.5, ent_coef=0.01, learn_every=1):
        '''Store the hyper-parameters and build the policy and optimizer.

        Every argument is kept as an attribute of the same name.

        Args:
            state_size, action_size: dimensions forwarded to the policy.
            fc1_size, fc2_size: hidden layer widths forwarded to the policy.
            num_agents: number of agents.
            seed: seed forwarded to the policy.
            batch_size: batch size.
            lr: learning rate given to the Adam optimizer.
            tau, gamma: decay/discount factors.
            nminibatches: stored only by this constructor.
            cliprange: clipping range.
            vf_coef, ent_coef: loss coefficients.
            learn_every: stored only by this constructor.
        '''
        # Environment and network dimensions
        self.state_size, self.action_size = state_size, action_size
        self.fc1_size, self.fc2_size = fc1_size, fc2_size
        self.num_agents = num_agents
        self.seed = seed

        # Optimisation settings
        self.batch_size = batch_size
        self.lr = lr
        self.nminibatches = nminibatches
        self.learn_every = learn_every

        # Objective settings
        self.tau, self.gamma = tau, gamma
        self.cliprange = cliprange
        self.vf_coef, self.ent_coef = vf_coef, ent_coef

        policy = ActorCriticPolicy(state_size, action_size, fc1_size, fc2_size, seed)
        self.policy = policy.to(device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        # Step counter, starts at zero
        self.t_step = 0


The surrogate function loss is composed of the clipped ratio between the new policy and the one used to get the trajectory, mean squared error of the predicted state values by the critic and the entropy loss that encourages exploration in early stages of training.

In [1]:
 def _surrogate(self, policy, old_probs, states, actions, rewards, advantages):
        """Compute the negated clipped-surrogate objective for one batch.

        The objective is the element-wise minimum of the clipped and
        unclipped ratio terms, minus ``self.vf_coef`` times the MSE between
        the policy's value output and the normalized discounted future
        rewards, plus ``self.ent_coef`` times the policy's entropy output.

        Args:
            policy: callable such that ``policy(states, actions)`` returns
                ``(actions, log_probs, entropy, values)``.
            old_probs: log-probabilities recorded when the data was collected.
            states: batch of states passed to ``policy``.
            actions: batch of actions re-evaluated by ``policy``.
            rewards: 2-D batch of rewards; axis 0 is discounted by row
                position and axis 1 is the normalization axis.
            advantages: 2-D batch of advantage estimates.

        Returns:
            Scalar tensor holding the mean of the negated objective, so that
            minimizing it maximizes the objective.
        """
        # Discount rewards: row k is scaled by gamma**k.
        # NOTE(review): this treats axis 0 of `rewards` as consecutive time
        # steps starting at t=0 -- confirm callers pass rows in that order.
        discount = self.gamma**np.arange(len(rewards))
        dis_rewards = np.asarray(rewards)*discount[:, np.newaxis]
        # Convert to future rewards (reverse cumulative sum along axis 0)
        fut_rewards = dis_rewards[::-1].cumsum(axis=0)[::-1]
        # Normalize rewards across axis 1
        mean = np.mean(fut_rewards, axis=1)
        std = np.std(fut_rewards, axis=1) + 1.0e-10 # zero std would lead to NAN errors
        norm_rewards = (fut_rewards-mean[:,np.newaxis]) / std[:,np.newaxis]

        # Convert data to float tensors on `device`. Every input that meets
        # the policy output must live on the same device as the policy, so
        # `states` and `advantages` are moved as well. `as_tensor` avoids
        # re-wrapping (and copying) inputs that are already tensors;
        # `detach` keeps the inputs out of the autograd graph.
        def to_device(data):
            return torch.as_tensor(data, dtype=torch.float, device=device).detach()

        states = to_device(states)
        actions = to_device(actions)
        old_probs = to_device(old_probs)
        advantages = to_device(advantages)
        norm_rewards = to_device(norm_rewards)

        _, new_probs, entropy_loss, values = policy(states, actions)

        # Find new/old policy ratio (difference of log-probabilities)
        ratio = torch.exp(new_probs - old_probs)

        # Add a trailing axis so the 2-D terms line up with the policy outputs
        advantages = advantages[:, :, np.newaxis]
        norm_rewards = norm_rewards[:, :, np.newaxis]
        entropy_loss = entropy_loss[:, :, np.newaxis]

        # Define surrogate loss
        surr = ratio * advantages
        surr_clip = torch.clamp(ratio, 1-self.cliprange, 1+self.cliprange) * advantages
        vf_loss = torch.nn.MSELoss()

        loss = torch.min(surr, surr_clip) - self.vf_coef*vf_loss(values, norm_rewards) + self.ent_coef*entropy_loss

        return -loss.mean()

Finally, the agent takes the trajectory data collected, calculates the advantages and performs stochastic gradient ascent updates.

In [None]:
def _learn(self, trajectories):
        """Compute advantages for a rollout and run one pass of batched updates.

        Advantages are built by a backward recursion over the steps: the
        one-step TD error of each step plus the running advantage decayed by
        ``self.tau * self.gamma``, with both the bootstrap value and the
        running advantage zeroed where ``done`` is set. They are then
        normalized and the rollout is consumed in shuffled batches of
        ``self.batch_size`` steps, taking one optimizer step per batch.

        Args:
            trajectories: object indexable by 'state', 'reward', 'prob',
                'action', 'value' and 'done', each yielding per-step data
                that ``torch.Tensor`` can convert.
        """
        states = torch.Tensor(trajectories['state'])
        rewards = torch.Tensor(trajectories['reward'])
        old_probs = torch.Tensor(trajectories['prob'])
        actions = torch.Tensor(trajectories['action'])
        old_values = torch.Tensor(trajectories['value'])
        dones = torch.Tensor(trajectories['done'])

        # Calculate the advantages, walking the rollout from last step to first
        processed_rollout = [None] * (len(dones))
        advantages = torch.zeros(self.num_agents, 1)
        i_max = len(states)
        for i in reversed(range(i_max)):
            # 1 where the episode continues, 0 where it ended at this step
            terminals_ = 1. - dones[i].unsqueeze(1)
            rwrds_ = rewards[i].unsqueeze(1)
            values_ = old_values[i]
            # The last step has no successor, so its own value is reused
            next_value_ = old_values[min(i_max-1, i+1)]

            td_error = rwrds_ + self.gamma * terminals_ * next_value_.detach()
            td_error -= values_.detach()
            advantages = advantages * self.tau * self.gamma * terminals_ + td_error
            processed_rollout[i] = advantages

        advantages = torch.stack(processed_rollout).squeeze(2)
        # Normalize; the epsilon guards against a zero std, which would turn
        # every advantage (and then the loss and the weights) into NaN.
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1.0e-10)

        # Learn in batches
        # NOTE(review): the step indices are shuffled before batching, while
        # `_surrogate` discounts `rewards` by row position within the batch
        # -- confirm that discounting over shuffled rows is intended.
        batcher = Batcher(self.batch_size, [np.arange(states.size(0))])
        batcher.shuffle()
        while not batcher.end():
            batch_indices = batcher.next_batch()[0]
            # Build the index tensor directly as integers (no float round-trip)
            batch_indices = torch.as_tensor(batch_indices, dtype=torch.long)

            loss = self._surrogate(self.policy,
                                   old_probs[batch_indices],
                                   states[batch_indices],
                                   actions[batch_indices],
                                   rewards[batch_indices],
                                   advantages[batch_indices])

            # Take gradient step
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

## Training
The algorithm managed to reach a score over 30 points in just 100 episodes. The following plot showcases the evolution of the reward in time.
![training.jpg](attachment:training.jpg)


## Ideas of Future Work

Given the good performance of the implementation, some additional time could be spent in optimizing the algorithm and bringing it closer to the production-ready version from OpenAI.

The next steps of this project would be to implement the other policy gradient and actor-critic learning algorithms and compare their performances with PPO: DDPG, SAC, TD3, A3C etc.
