# Continuous Control

---

### 1. Start the Environment

In [1]:
from unityagents import UnityEnvironment
import numpy as np

# Launch the Unity environment from the build located next to this notebook.
# NOTE(review): './Reacher.app' looks like a macOS application bundle - point
# file_name at the matching build when running on another platform.
env = UnityEnvironment(file_name='./Reacher.app')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [2]:
# get the default brain: use the first brain name reported by the environment
# and look up the matching brain object (read later for the action size)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [3]:
# reset the environment in training mode and keep the info for our brain
env_info = env.reset(train_mode=True)[brain_name]

# number of agents reported by the environment after the reset
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action, as declared by the brain
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space: one row of observations per agent, so the second
# dimension is the length of a single agent's state vector
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import random
import copy
from collections import deque, namedtuple

In [5]:
def random_sample(indices, batch_size):
    """Yield shuffled mini-batches drawn from a 1-D collection of indices.

    The indices are permuted once with ``np.random.permutation``.  Every full
    batch of ``batch_size`` entries is yielded first; if the number of indices
    is not a multiple of ``batch_size`` the leftover entries are yielded as a
    final, shorter batch.
    """
    shuffled = np.asarray(np.random.permutation(indices))
    total = len(shuffled)
    full_span = total // batch_size * batch_size

    # Full-sized batches, in shuffled order.
    for start in range(0, full_span, batch_size):
        yield shuffled[start:start + batch_size]

    # Remainder batch, only when something is left over.
    if total % batch_size:
        yield shuffled[full_span:]

def hidden_init(layer):
    """Return the ``(-lim, lim)`` uniform-init range for ``layer``.

    ``lim`` is ``1 / sqrt(fan_in)``, where ``fan_in`` is the number of inputs
    feeding each output unit of the layer.

    Args:
        layer: A module exposing a ``weight`` tensor whose first dimension
            indexes the output units (e.g. ``nn.Linear``).

    Returns:
        A ``(low, high)`` tuple symmetric around zero.
    """
    # nn.Linear stores its weight as (out_features, in_features), so size()[0]
    # would be the fan-out.  The number of elements in a single output row is
    # the fan-in, and that also holds for convolution kernels.
    fan_in = layer.weight.data[0].numel()
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

def layer_init(layer, w_scale=1.0):
    """Initialise ``layer`` in place and hand it back.

    The weight matrix receives an orthogonal initialisation that is then
    multiplied by ``w_scale``; the bias is set to zero.  The same layer object
    is returned so the call can wrap a constructor expression.
    """
    weight = layer.weight.data
    nn.init.orthogonal_(weight)
    weight *= w_scale  # in-place scaling of the freshly initialised weights
    nn.init.constant_(layer.bias.data, 0)
    return layer

In [None]:
# Models
class SubNetwork(nn.Module):
    """Fully connected network: input -> 512 -> 256 -> output.

    Both hidden layers use leaky-ReLU activations; the output layer is linear
    and its weights are scaled by 1e-3 at initialisation.
    """

    def __init__(self, input_size, output_size, seed):
        super().__init__()
        # torch.manual_seed seeds the global RNG; the returned generator is
        # kept on the module.
        self.seed = torch.manual_seed(seed)

        first_hidden, second_hidden = 512, 256
        self.fc1 = layer_init(nn.Linear(input_size, first_hidden))
        self.fc2 = layer_init(nn.Linear(first_hidden, second_hidden))
        self.fc3 = layer_init(nn.Linear(second_hidden, output_size), 1e-3)

        # NOTE(review): bn1 and dropout are registered here but forward()
        # never calls them; they only add entries to the module's state.
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Map a batch of inputs to raw (un-activated) outputs."""
        hidden = x
        for dense in (self.fc1, self.fc2):
            hidden = F.leaky_relu(dense(hidden))
        return self.fc3(hidden)
            
class ActorAndCritic(nn.Module):
    """Actor and critic sub-networks sharing one module and one optimizer.

    The actor output is squashed with tanh and used as the mean of a Normal
    distribution; the standard deviation is ``softplus`` of a learnable,
    state-independent parameter vector.  The critic outputs one value per
    observation.
    """

    def __init__(self, state_size, action_size, seed = 109):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        # The two heads are built with their own fixed seeds (110 and 111).
        self.actor = SubNetwork(state_size, action_size, seed = 110)
        self.critic = SubNetwork(state_size, 1, seed = 111)
        # Raw std parameter; softplus in forward() keeps the scale positive.
        self.std = nn.Parameter(torch.zeros(action_size))

    def forward(self, obs, action=None):
        """Return ``(value, action_distribution)`` for a batch of observations.

        ``action`` is accepted for call compatibility but is not used here.
        """
        value = self.critic(obs)
        mean = torch.tanh(self.actor(obs))
        scale = F.softplus(self.std)
        return value, torch.distributions.Normal(mean, scale)
    
# Build the actor-critic model from the sizes measured on the environment
# (state_size / action_size are set in the reset cell) instead of hard-coding
# 33 and 4, then create the single Adam optimizer that updates both heads.
network = ActorAndCritic(state_size, action_size)
optimizer = optim.Adam(network.parameters(), lr = 2e-4)

In [None]:
def get_prediction(action_distribution):
    """Sample an action from ``action_distribution`` and clamp it to [-1, 1].

    Args:
        action_distribution: A ``torch.distributions`` object exposing
            ``sample()``.

    Returns:
        A tensor of sampled actions with every entry limited to [-1, 1].
    """
    # torch.clamp keeps the result a tensor on its original device; np.clip on
    # a tensor only works through NumPy's array-wrapping fallback.
    actions = torch.clamp(action_distribution.sample(), -1, 1)
    return actions

def evaluate_actions_against_states(states, actions):
    """Score ``actions`` under the current policy for the given ``states``.

    Runs the module-level ``network`` on ``states`` and returns the critic's
    value estimates together with the log-probability of ``actions`` under the
    resulting action distribution.
    """
    state_values, policy = network(states, actions)
    return state_values, get_log_prob(policy, actions)
    
def get_log_prob(action_distribution, actions):
    """Return the joint log-probability of each action vector.

    The action dimensions are modelled as independent, so the log-density of
    a whole action vector is the sum (not the mean) of the per-dimension
    log-probabilities.

    Args:
        action_distribution: Distribution whose ``log_prob`` returns one value
            per action dimension.
        actions: Actions to score; the last dimension indexes the action
            components.

    Returns:
        Tensor with the last dimension reduced to size 1.
    """
    return action_distribution.log_prob(actions).sum(-1, keepdim=True)
    
def loss_func(i_episode, idx, max_t, old_log_probs, actions, values, rewards, states, dones_num, returns, advantages, epoch = 10, gae_lamda = 0.95, discount = 0.99, epsilon=0.2, beta = 0.01):
    """Run clipped-surrogate (PPO-style) updates on one collected rollout.

    Fills ``advantages`` (generalised advantage estimates) and ``returns``
    (discounted reward-to-go) in place for steps ``0..idx``, then performs
    ``epoch`` passes of shuffled mini-batch updates (batch size 64) on the
    module-level ``network`` through the module-level ``optimizer``.

    Args:
        i_episode: Unused; kept so existing callers keep working.
        idx: Index of the last valid step in the rollout buffers.
        max_t: Unused; kept so existing callers keep working.
        old_log_probs: Log-probabilities of the taken actions, recorded when
            the rollout was collected, indexed ``[step, agent, 1]``.
        actions: Taken actions, ``[step, agent, action_dim]``.
        values: Critic estimates recorded during the rollout,
            ``[step, agent, 1]``.  The caller must have filled this buffer.
        rewards: Rewards, ``[step, agent, 1]``.
        states: Observations, ``[step, agent, state_dim]``.
        dones_num: Continuation mask, 1 where the episode goes on after the
            step and 0 where it ended, ``[step, agent, 1]``.
        returns: Output buffer for discounted returns (written in place).
        advantages: Output buffer for advantages (written in place).
        epoch: Number of passes over the rollout.
        gae_lamda: GAE lambda.
        discount: Reward discount factor.
        epsilon: Clipping range for the probability ratio.
        beta: Weight of the entropy bonus.
    """
    n_steps = idx + 1

    # Single backward sweep computing both GAE advantages and discounted
    # returns.  The last step has no successor value in the buffer, so its TD
    # error is not bootstrapped.
    advantage = torch.zeros((1, ))
    return_indiv = torch.zeros((1, ))
    for i in reversed(range(n_steps)):
        if i == idx:
            td = rewards[i] - values[i]
        else:
            td = rewards[i] + (discount * dones_num[i] * values[i + 1]) - values[i]
        advantage = advantage * gae_lamda * discount * dones_num[i] + td
        advantages[i] = advantage.detach()

        return_indiv = rewards[i] + discount * dones_num[i] * return_indiv
        returns[i] = return_indiv.detach()

    # Normalise the advantages and actually use the normalised tensor below
    # (previously the raw advantages overwrote it).  The small constant guards
    # against a zero standard deviation.
    valid_advantages = advantages[:n_steps].detach()
    normalized_advantages = (valid_advantages - valid_advantages.mean()) / (valid_advantages.std() + 1e-8)

    # Flatten [step, agent, feature] into [sample, feature]; feature sizes are
    # taken from the buffers rather than hard-coded.
    old_log_probs_0 = old_log_probs[:n_steps].detach().reshape(-1, 1)
    states_0 = states[:n_steps].detach().reshape(-1, states.shape[-1])
    actions_0 = actions[:n_steps].detach().reshape(-1, actions.shape[-1])
    advantages_0 = normalized_advantages.reshape(-1, 1)
    returns_0 = returns[:n_steps].detach().reshape(-1, 1)
    n_samples = states_0.shape[0]

    for _ in range(epoch):
        for sampled_idx in random_sample(range(n_samples), 64):
            sampled_idx = torch.as_tensor(sampled_idx, dtype=torch.long)
            old_log_probs_1 = old_log_probs_0[sampled_idx]
            states_1 = states_0[sampled_idx]
            actions_1 = actions_0[sampled_idx]
            advantages_1 = advantages_0[sampled_idx]
            returns_1 = returns_0[sampled_idx]

            # Re-evaluate the stored actions under the current policy.
            values_1, distribution = network(states_1)
            new_log_probs_1 = get_log_prob(distribution, actions_1)

            # Probability ratio computed in log space for numerical stability.
            ratios = torch.exp(new_log_probs_1 - old_log_probs_1)
            ratios_alt = torch.clamp(ratios, 1 - epsilon, 1 + epsilon)
            L_clipped = -torch.min(ratios * advantages_1, ratios_alt * advantages_1)

            # Entropy bonus taken from the policy distribution itself.  The
            # previous hand-rolled term took log(1 - density), which is NaN
            # as soon as a continuous density exceeds 1.
            entropy = distribution.entropy().mean()

            L_loss = torch.mean(L_clipped) - beta * entropy
            loss = torch.mean((values_1 - returns_1)**2)
            total_loss = (L_loss + loss)

            optimizer.zero_grad()
            total_loss.backward()
            nn.utils.clip_grad_norm_(network.parameters(), 1)
            optimizer.step()

In [None]:
def training(n_episodes=100, max_t=1000):
    """Collect rollouts from the environment and train the global ``network``.

    Each episode runs ``max_t`` environment steps.  Transitions are written
    into fixed-size rollout buffers of 250 steps; every time the buffers fill
    up, ``loss_func`` is called to update the network.

    Args:
        n_episodes: Maximum number of episodes to run.
        max_t: Number of environment steps per episode.

    Returns:
        List with one per-agent score array per completed episode.
    """
    rollout_len = 250  # number of steps collected between updates
    scores = []
    scores_window = deque(maxlen=100)

    # Rollout buffers, indexed [step, agent, feature].
    log_probs = torch.empty(rollout_len, num_agents, 1)
    states = torch.empty(rollout_len, num_agents, state_size)
    actions = torch.empty(rollout_len, num_agents, action_size)
    rewards = torch.empty(rollout_len, num_agents, 1)
    dones_num = torch.empty(rollout_len, num_agents, 1)
    values = torch.empty(rollout_len, num_agents, 1)
    returns = torch.empty(rollout_len, num_agents, 1)
    advantages = torch.empty(rollout_len, num_agents, 1)

    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode = True)[brain_name]
        state = torch.tensor(env_info.vector_observations, dtype = torch.float)
        score = np.zeros(num_agents)
        for t in range(max_t):
            # Rollout collection needs no gradients; disabling autograd also
            # keeps the buffers free of graph history.  One forward pass
            # yields both the value and the distribution used for log_prob.
            with torch.no_grad():
                value, dist = network(state)
                action = get_prediction(dist)
                log_prob = get_log_prob(dist, action)
            env_info = env.step(action.detach().numpy())[brain_name]
            next_state, reward, done = torch.tensor(env_info.vector_observations, dtype = torch.float), env_info.rewards, env_info.local_done

            idx = t % rollout_len
            log_probs[idx] = log_prob
            states[idx] = state
            actions[idx] = action
            # Store the critic estimate: loss_func reads this buffer for GAE.
            values[idx] = value
            rewards[idx] = torch.tensor(reward, dtype = torch.float).unsqueeze(1)
            # Per-agent continuation mask: 1 while running, 0 once done.
            dones_num[idx] = 1.0 - torch.tensor(done, dtype = torch.float).unsqueeze(1)

            score += reward
            state = next_state

            if idx == rollout_len - 1:
                loss_func(i_episode, idx, max_t, log_probs, actions, values, rewards, states, dones_num, returns, advantages, epoch = 10, gae_lamda = 0.98, discount = 0.98, epsilon=0.2)

        scores_window.append(score)
        scores.append(score)
        if i_episode % 5 == 0:
            print('\rEpisode {}\tReward: {:.2f}\tAverage Reward: {:.2f}'.format(i_episode, np.mean(score), np.mean(scores_window)))
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window)>=30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            # Save the trained actor-critic weights; the file name is kept
            # unchanged for compatibility with existing checkpoints.
            torch.save(network.state_dict(), 'dqn.pth')
            break
    return scores

# Run training with the default settings (n_episodes=100, max_t=1000) and keep
# the per-episode scores it returns.
scores = training()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# plot the scores against the episode index
# (each entry of `scores` is the per-agent score array appended by training())
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()