In [1]:
from unityagents import UnityEnvironment
from collections import deque

import numpy as np

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.autograd import Variable

In [2]:
# Launch the single-agent (1 agent) Reacher build.
# The path is relative to the notebook's working directory and points at a Windows
# executable, so it has to be adjusted when running elsewhere.
env = UnityEnvironment(file_name="./Reacher_Windows_x86_64_1_agent/Reacher.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Get the default brain: take the first brain name the environment exposes.
# The name is the key into env.brains here and into the result of env.reset(...)
# in the following cells.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
# Start a fresh episode in training mode and keep the info for our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the problem dimensions first ...
states = env_info.vector_observations
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
state_size = states.shape[1]

# ... then report them: agent count, action width, observation width, sample state.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print(
    'There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size
    )
)
print('The state for the first agent looks like:', states[0])

Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
# Run on the first CUDA GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
# Kept as a module-level tensor so any cell that refers to `pi` keeps working.
# It is created on the CPU and moved next to its operand inside `normal`, so
# defining it does not require a CUDA device.
pi = torch.FloatTensor([math.pi])

def normal(x, mu, sigma_sq):
    """
        Element-wise density of a Gaussian with mean `mu` and variance `sigma_sq`,
        evaluated at `x`.

        Parameters
        ----------
        x: torch.Tensor
        Points at which the density is evaluated. Treated as a constant
        (detached), so gradients only flow through `mu` and `sigma_sq`.
        mu: torch.Tensor
        Mean of the distribution.
        sigma_sq: torch.Tensor
        Variance of the distribution (not the standard deviation).

        Returns
        -------
        torch.Tensor
        exp(-(x - mu)^2 / (2 * sigma_sq)) / sqrt(2 * pi * sigma_sq)
    """
    # The sample must not carry a graph back to mu: for x = mu + noise the term
    # (x - mu) would otherwise not depend on mu at all and its gradient would vanish.
    x = x.detach()
    a = (-(x - mu).pow(2) / (2 * sigma_sq)).exp()
    # `.to(sigma_sq)` matches both device and dtype of the variance tensor.
    b = 1 / (2 * sigma_sq * pi.to(sigma_sq).expand_as(sigma_sq)).sqrt()
    return a * b

def probs_to_actions(mu, sigma_sq):
    """
        Sample an action around `mu` and return it with its log-density.

        The `sigma_sq` argument is currently ignored: it is overwritten with ones,
        so exploration always uses a unit-variance Gaussian.

        Parameters
        ----------
        mu: torch.Tensor
        Action means; the last dimension is the action size.
        sigma_sq: torch.Tensor
        Accepted for interface compatibility, not used (see above).

        Returns
        -------
        (torch.Tensor, torch.Tensor)
        The sampled actions on the CPU and their element-wise log-densities
        (on the device of `mu`), both reshaped to (-1, action size).
    """
    action_dim = mu.size(-1)
    # Noise is drawn on the CPU and then moved to the device/dtype of mu.
    eps = torch.randn(mu.size()).to(mu)
    sigma_sq = torch.ones_like(mu)
    action = mu + sigma_sq.sqrt() * eps
    prob = normal(action, mu, sigma_sq)

    return action.cpu().reshape(-1, action_dim), torch.log(prob).reshape(-1, action_dim)

In [7]:
class Policy(nn.Module):
    """
        Actor network mapping a state to the parameters of a Gaussian over actions.

        Parameters
        ----------
        state_size: int
        Width of the state vector fed to the first layer.
        action_size: int
        Width of each of the two output heads.
        fc1_size: int
        Units in the first hidden layer.
        fc2_size: int
        Units in the second hidden layer.
    """
    def __init__(self, state_size, action_size, fc1_size=256, fc2_size=128):
        super(Policy, self).__init__()

        # The input width follows state_size instead of a hard-coded 33.
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)

        self.fc3_mu = nn.Linear(fc2_size, action_size)
        self.fc3_sigma_squared = nn.Linear(fc2_size, action_size)

    def forward(self, x):
        """
            Return (mu, sigma) for a batch of states.

            mu is squashed to [-1, 1] by hardtanh; sigma is the ReLU of the
            fc3_sigma_squared head, so it is non-negative and can be exactly zero.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        mu = F.hardtanh(self.fc3_mu(x))
        sigma = F.relu(self.fc3_sigma_squared(x))

        return mu, sigma

    def act(self, state):
        """
            Sample actions for a numpy batch of states.

            Returns whatever probs_to_actions returns for the predicted (mu, sigma).
        """
        # Send the input to wherever this module's weights live rather than
        # relying on a notebook-level `device` variable.
        module_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(module_device)
        mu, sigma = self.forward(state)
        return probs_to_actions(mu, sigma)

In [8]:
class Value(nn.Module):
    """
        Critic network that estimates a Value Function used as a baseline

        Parameters
        ----------
        state_size: int
        Width of the state vector fed to the first layer.
        fc1_size: int
        Units in the first hidden layer.
        fc2_size: int
        Units in the second hidden layer.
    """
    def __init__(self, state_size, fc1_size=128, fc2_size=64):
        super(Value, self).__init__()

        # The input width follows state_size instead of a hard-coded 33.
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, 1)

    def forward(self, x):
        """Return one scalar value per input state (last dimension of size 1)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def estimate(self, state):
        """Run the critic on a numpy array of states and return a tensor of values."""
        # Send the input to wherever this module's weights live rather than
        # relying on a notebook-level `device` variable.
        module_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().to(module_device)
        return self.forward(state)

In [9]:
def unroll_trajectory(env, brain_name, policy, t_max=1000, gamma=0.995):
    """
        Unroll a trajectory and return
        the log-probabilities, states, actions, rewards, critic targets
        and advantages recorded at every step.

        The critic is the notebook-level `value_network`.

        Parameters
        ----------
        env: unityagents.UnityEnvironment
        Environment to play on.
        brain_name: String
        Name of brain used for UnityEnvironment
        policy: torch.nn.Module
        Object whose `act(states)` returns (actions, log-probabilities).
        If the argument has no `act` attribute, the notebook-level
        `policy_network` is used instead.
        t_max: int
        Maximum number of steps in the trajectory
        gamma: float
        Discount factor used for the bootstrapped critic targets

        Returns
        -------
        dict of numpy.ndarray, every array indexed by step along its first axis:
        "log_probs", "states", "actions", "rewards", "values" (one-step
        bootstrapped targets r + gamma * V(s')) and "advantages"
        (targets minus V(s)).
    """
    # Honour the `policy` argument. Callers that pass something that is not a
    # policy in this slot keep working through the notebook-level network.
    actor = policy if hasattr(policy, "act") else policy_network

    env_info = env.reset(train_mode=True)[brain_name]

    state_list      = []
    reward_list     = []
    prob_list       = []
    action_list     = []
    values_list     = []
    advantages_list = []

    states = env_info.vector_observations

    for _ in range(t_max):

        a, p = actor.act(states)
        actions = a.cpu().detach().numpy()
        action_probabilities = p.cpu().detach().numpy()

        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations

        prob_list.append(action_probabilities)
        state_list.append(states)
        action_list.append(actions)

        reward_list.append(env_info.rewards)
        # Column vector, so it lines up with the critic output (one row per
        # state) instead of broadcasting to a square matrix for several agents.
        rewards = np.asarray(env_info.rewards, dtype=np.float64).reshape(-1, 1)

        cur_value    = value_network.estimate(states).cpu().detach().numpy()
        next_value   = value_network.estimate(next_states).cpu().detach().numpy()

        # One-step target: bootstrap from the value of the NEXT state.
        values       = rewards + gamma * next_value
        values_list.append(values)
        advantages   = values - cur_value
        advantages_list.append(advantages)

        if np.any(env_info.local_done):
            break

        states = next_states

    return {
        "log_probs"  : np.array(prob_list),
        "states"     : np.array(state_list),
        "actions"    : np.array(action_list),
        "rewards"    : np.array(reward_list),
        "values"     : np.array(values_list),
        "advantages" : np.array(advantages_list)
    }

In [10]:
def surrogate(old_probs, states, rewards, advantages, epsilon=0.2, actions=None):
    """
        Clipped surrogate objective for the notebook-level `policy_network`.

        Parameters
        ----------
        old_probs: numpy.ndarray
        Log-probabilities recorded while the trajectory was collected,
        indexed by step along the first axis.
        states: numpy.ndarray
        States of the trajectory, indexed by step along the first axis.
        rewards: numpy.ndarray
        2-D array of per-step weights indexed as rewards[:, step]; the mean
        over the first axis weights each step.
        advantages: numpy.ndarray
        Indexed by step along the first axis; the mean of each step's entry
        is subtracted from that step's terms.
        epsilon: float
        Half-width of the clipping interval around 1 for the ratio.
        actions: numpy.ndarray, optional
        Actions that were actually taken (e.g. trajectory["actions"]). When
        given, the new log-probabilities are those of these actions under the
        current policy. When omitted, the previous behaviour is kept and
        `policy_network.act` samples new actions and scores those.

        Returns
        -------
        torch.Tensor
        Scalar sum of the clipped objective over steps and action dimensions.
    """
    old_log_probs = torch.tensor(old_probs, dtype=torch.float, device=device)

    if actions is None:
        # NOTE(review): this scores freshly sampled actions, not the ones behind
        # old_probs; pass `actions` to obtain a proper importance ratio.
        _, new_log_probs = policy_network.act(states)
    else:
        state_batch = torch.as_tensor(states, dtype=torch.float, device=device)
        mu, _ = policy_network(state_batch)
        action_batch = torch.as_tensor(actions, dtype=torch.float, device=device).reshape(mu.shape)
        # Unit-variance Gaussian log-density, the distribution probs_to_actions samples from.
        new_log_probs = -0.5 * (action_batch - mu).pow(2) - 0.5 * math.log(2 * math.pi)
    new_log_probs = new_log_probs.reshape(old_log_probs.shape)

    # Both inputs are LOG-probabilities, so the probability ratio is the
    # exponential of their difference (not their quotient).
    ratio = torch.exp(new_log_probs - old_log_probs)
    clipped_ratio = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)

    # One weight / offset per step, shaped to broadcast over the remaining axes.
    steps = ratio.shape[0]
    per_step = (steps,) + (1,) * (ratio.dim() - 1)
    R = torch.as_tensor(
        np.mean(rewards, axis=0)[:steps], dtype=torch.float, device=device
    ).reshape(per_step)
    A = torch.as_tensor(
        np.asarray(advantages)[:steps].reshape(steps, -1).mean(axis=1),
        dtype=torch.float, device=device
    ).reshape(per_step)

    # NOTE(review): A is built from numpy data, so subtracting it shifts the
    # returned value but contributes no gradient to the policy.
    return (torch.min(ratio * R, clipped_ratio * R) - A).sum()

In [11]:
def a3c(n_episodes=300, max_t=1000, print_every=100):
    """
        Train the notebook-level policy and value networks.

        Each episode collects one trajectory, fits the critic on the recorded
        targets one step at a time, then takes three gradient steps on the
        clipped surrogate objective.

        Parameters
        ----------
        n_episodes: int
        Maximum number of training episodes.
        max_t: int
        Maximum number of steps per trajectory.
        print_every: int
        Print the running average score every this many episodes.

        Returns
        -------
        list
        Score of every episode played (sum of rewards over the trajectory,
        averaged over agents).
    """
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes+1):

        # The third positional slot is the policy; max_t goes into t_max.
        trajectory = unroll_trajectory(env, brain_name, policy_network, max_t)

        mean_rewards = np.mean(np.sum(trajectory['rewards'], axis=0))
        scores_deque.append(mean_rewards)
        scores.append(mean_rewards)

        # Modification 2: Use Future Rewards
        # Rewards are stored one row per step, so the reverse cumulative sum runs
        # along axis 0. Transposing gives one row per agent and works for
        # trajectories of any length (no reshape to max_t needed).
        step_rewards = trajectory['rewards']
        future_rewards = np.flip(np.flip(step_rewards, axis=0).cumsum(axis=0), axis=0)
        R = np.ascontiguousarray(future_rewards.T)

        # Modification 1: Batch Normalization of Reward Signal
        if R.shape[0] > 10:
            R_mean = np.mean(R, axis=0)
            R_std = np.std(R, axis=0) + 1.0e-10
            R = np.nan_to_num((R - R_mean) / R_std)

        # Modification 4: Value-function baseline
        for step in range(trajectory['states'].shape[0]):
            state = trajectory['states'][step]
            value = trajectory['values'][step]
            estimated_value = value_network.estimate(state)
            value = torch.from_numpy(value).float().to(device)
            value_loss = value_criterion(estimated_value, value)
            value_optimizer.zero_grad()
            value_loss.backward()
            value_optimizer.step()

        # Modification 3: PPO
        # NOTE(review): trajectory['actions'] is not forwarded to surrogate();
        # confirm whether the objective should score the executed actions.
        for _ in range(3):
            policy_loss = -surrogate(trajectory['log_probs'], trajectory['states'], R, trajectory['advantages'])
            policy_optimizer.zero_grad()
            policy_loss.backward()
            policy_optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

        # Only declare success once the averaging window is full; otherwise a
        # single good episode ends training and "i_episode-100" goes negative.
        window_full = len(scores_deque) == scores_deque.maxlen
        if window_full and np.mean(scores_deque)>=30.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
            break

    return scores

In [None]:
policy_network   = Policy(state_size=33, action_size=action_size).to(device)
policy_optimizer = optim.Adam(policy_network.parameters(), lr=1e-4)

value_network    = Value(state_size=33).to(device)
value_criterion  = nn.MSELoss()
value_optimizer  = optim.Adam(value_network.parameters(), lr=1e-4)

%time scores = a3c()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Score of every training episode, in the order the episodes were played.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()