# Continuous Control

---

### 1. Start the Environment

In [1]:
from unityagents import UnityEnvironment
import numpy as np

env = UnityEnvironment(file_name='apps/Tennis.app')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [2]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [3]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import random
import copy
from collections import deque, namedtuple

In [5]:
# Helper Functions
def random_sample(indices, batch_size):
    indices = np.asarray(np.random.permutation(indices))
    batches = indices[:len(indices) // batch_size * batch_size].reshape(-1, batch_size)
    for batch in batches:
        yield batch
    r = len(indices) % batch_size
    if r:
        yield indices[-r:]

def layer_init(layer, w_scale=1.0):
    nn.init.orthogonal_(layer.weight.data)
    layer.weight.data.mul_(w_scale)
    nn.init.constant_(layer.bias.data, 0)
    return layer

def stack_tensor(some_list):
    return torch.cat(some_list[:1000], dim=0)

In [6]:
# Models
def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Critic(nn.Module):
    def __init__(self, seed):
        super(Critic, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(48, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, 2)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)
        
    def forward(self, states):
        states = states.reshape(-1, 48)
        x = F.relu(self.fc1(states))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
class Actor(nn.Module):
    def __init__(self, seed):
        super(Actor, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(24, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, 2)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)
        
    def forward(self, states):
        x = F.relu(self.fc1(states))
        x = F.relu(self.fc2(x))
        x = F.tanh(self.fc3(x))
        return x  
    
class ActorAndCritic(nn.Module):
    def __init__(self, seed):
        super(ActorAndCritic, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.sd = nn.Parameter(torch.zeros(2))
        self.actor = Actor(1)
        self.critic = Critic(1)
        
    def forward(self, states):
        x = self.actor(states)
        mean = F.tanh(x)
        values = self.critic(states).unsqueeze(2)
        dist = torch.distributions.Normal(mean, F.softplus(self.sd))
        actions = torch.clamp(dist.sample(), -1, 1)
        return actions, values, dist.log_prob(actions).mean(2).unsqueeze(2), dist

In [9]:
class Agent(nn.Module):
    def __init__(self):
        super(Agent, self).__init__()
        self.network = ActorAndCritic(5)
        self.optimizer = optim.Adam(self.network.parameters(), lr = 5e-5)
    
    def act(self, states):
        with torch.no_grad():
            actions, values, log_probs, dist = self.network(states)
        return actions, values, log_probs, dist
    
    def step(self, old_log_probs, states, actions, rewards, dones, values):
        rewards = torch.tensor(rewards, dtype = torch.float).unsqueeze(2)
        dones = torch.tensor(dones, dtype = torch.int).unsqueeze(2)
        
        advantages = [0.0]*250
        returns = [0.0]*250
        advantage = torch.zeros((2, 1))
        return_ = torch.zeros((2, 1))
        for i in reversed(range(250)):
            if i == 250 - 1:
                td = (rewards[i] - values[i]).squeeze(0)
            else:
                td = (rewards[i] + (0.99 * (1-dones[i]) * values[i + 1]) - values[i]).squeeze(0)
            advantage = advantage * 0.95 * 0.99 * (1-dones[i]) + td
            return_ = rewards[i] + 0.99 * (1-dones[i]) * return_           
            advantages[i] = advantage
            returns[i] = return_
            
        old_log_probs, states, actions, values, advantages, returns = \
        torch.stack(old_log_probs), torch.stack(states), torch.stack(actions), \
        torch.stack(values), torch.stack(advantages), torch.stack(returns)
        advantages = (advantages - advantages.mean()) / advantages.std()
        
        for e in range(10):
            samples = random_sample(np.arange(250), 50)
            for sample in samples:
                states_1 = states[sample]
                actions_1 = actions[sample]
                old_log_probs_1 = old_log_probs[sample]
                advantages_1 = advantages[sample]
                returns_1 = returns[sample]
                
                _, values_1, _, dist = self.network(states_1)
                new_log_probs_1 = dist.log_prob(actions_1).mean(2).unsqueeze(2)

                new_probs, old_probs = torch.exp(new_log_probs_1), torch.exp(old_log_probs_1)
                ratios = new_probs / old_probs
                ratios_clipped = torch.clamp(ratios, 0.9, 1.1)
                entropy = -(new_probs * torch.log(old_probs + 1e-10) + (1.0 - new_probs) * torch.log(1.0 - old_probs + 1e-10))
                entropy[torch.isnan(entropy)] = 0
                
                rewards_clipped = -torch.min(ratios * advantages_1, ratios_clipped * advantages_1)
                rewards_clipped = torch.mean(rewards_clipped + 0.01 * entropy)
                loss = torch.mean((returns_1 - values_1)**2)
                total_loss = rewards_clipped + loss
                
                self.optimizer.zero_grad()
                total_loss.backward()
                nn.utils.clip_grad_norm_(self.network.parameters(), 10) 
                self.optimizer.step()

agent = Agent()

In [11]:
scores = []
scores_window = deque(maxlen=100)
log_probs_all, states_all, actions_all, rewards_all, dones_all, values_all = [], [], [], [], [], []
sample_num = 0
for i in range(5000):                                         
    env_info = env.reset(train_mode=True)[brain_name]     
    states = torch.tensor(env_info.vector_observations, dtype = torch.float).unsqueeze(0)
    score = np.zeros(num_agents) 
    while True:
        actions, values, log_probs, _ = agent.act(states)
        env_info = env.step(actions.squeeze(0).detach().numpy())[brain_name]           
        next_states = torch.tensor(env_info.vector_observations, dtype = torch.float).unsqueeze(0)
        dones = env_info.local_done
        
        log_probs_all.append(log_probs.squeeze(0))
        states_all.append(states.squeeze(0))
        actions_all.append(actions.squeeze(0))
        rewards_all.append(env_info.rewards)
        dones_all.append(dones)
        values_all.append(values)
        score += env_info.rewards
        states = next_states 
        
        if sample_num == 249:
            agent.step(log_probs_all, states_all, actions_all, rewards_all, dones_all, values_all)
            log_probs_all, states_all, actions_all, rewards_all, dones_all, values_all = [], [], [], [], [], []
        
        sample_num = (sample_num + 1) % 250
        if np.any(dones):                                  
            break
        
    scores_window.append(score)       
    scores.append(score)
    print('\rEpisode {}\tReward: {:.5f}\tAverage Reward: {:.5f}'.format(i, np.mean(score), np.mean(scores_window)))

Episode 0	Reward: 0.14500	Average Reward: 0.14500
Episode 1	Reward: 0.04500	Average Reward: 0.09500
Episode 2	Reward: 0.04500	Average Reward: 0.07833
Episode 3	Reward: 0.04500	Average Reward: 0.07000
Episode 4	Reward: -0.00500	Average Reward: 0.05500
Episode 5	Reward: 0.19500	Average Reward: 0.07833
Episode 6	Reward: 0.04500	Average Reward: 0.07357
Episode 7	Reward: 0.04500	Average Reward: 0.07000
Episode 8	Reward: 0.04500	Average Reward: 0.06722
Episode 9	Reward: 0.14500	Average Reward: 0.07500
Episode 10	Reward: 0.29500	Average Reward: 0.09500
Episode 11	Reward: -0.00500	Average Reward: 0.08667
Episode 12	Reward: 0.14500	Average Reward: 0.09115
Episode 13	Reward: 0.04500	Average Reward: 0.08786
Episode 14	Reward: 0.04500	Average Reward: 0.08500
Episode 15	Reward: 0.04500	Average Reward: 0.08250
Episode 16	Reward: 0.09500	Average Reward: 0.08324
Episode 17	Reward: 0.04500	Average Reward: 0.08111
Episode 18	Reward: 0.04500	Average Reward: 0.07921
Episode 19	Reward: 0.14500	Average Rewa

Episode 160	Reward: 0.14500	Average Reward: 0.21400
Episode 161	Reward: 0.24500	Average Reward: 0.21600
Episode 162	Reward: 0.04500	Average Reward: 0.21600
Episode 163	Reward: 1.14500	Average Reward: 0.22400
Episode 164	Reward: 0.09500	Average Reward: 0.22100
Episode 165	Reward: 0.04500	Average Reward: 0.21750
Episode 166	Reward: -0.00500	Average Reward: 0.21700
Episode 167	Reward: 0.09500	Average Reward: 0.21550
Episode 168	Reward: 0.49500	Average Reward: 0.22000
Episode 169	Reward: 0.04500	Average Reward: 0.21900
Episode 170	Reward: 0.04500	Average Reward: 0.21650
Episode 171	Reward: 0.99500	Average Reward: 0.22300
Episode 172	Reward: 0.24500	Average Reward: 0.22450
Episode 173	Reward: 0.19500	Average Reward: 0.22500
Episode 174	Reward: 0.04500	Average Reward: 0.22500
Episode 175	Reward: 0.09500	Average Reward: 0.22500
Episode 176	Reward: 0.04500	Average Reward: 0.22500
Episode 177	Reward: 0.04500	Average Reward: 0.22500
Episode 178	Reward: -0.00500	Average Reward: 0.22350
Episode 17

Episode 318	Reward: 0.19500	Average Reward: 0.19350
Episode 319	Reward: 0.34500	Average Reward: 0.19300
Episode 320	Reward: 0.14500	Average Reward: 0.19400
Episode 321	Reward: 0.34500	Average Reward: 0.19700
Episode 322	Reward: 0.34500	Average Reward: 0.20000
Episode 323	Reward: 0.24500	Average Reward: 0.19850
Episode 324	Reward: 0.09500	Average Reward: 0.19750
Episode 325	Reward: 0.09500	Average Reward: 0.19800
Episode 326	Reward: 0.04500	Average Reward: 0.19750
Episode 327	Reward: 0.49500	Average Reward: 0.19850
Episode 328	Reward: 0.54500	Average Reward: 0.20050
Episode 329	Reward: 0.89500	Average Reward: 0.20800
Episode 330	Reward: 0.19500	Average Reward: 0.20850
Episode 331	Reward: 0.04500	Average Reward: 0.20700
Episode 332	Reward: -0.00500	Average Reward: 0.20650
Episode 333	Reward: 0.49500	Average Reward: 0.21100
Episode 334	Reward: 0.09500	Average Reward: 0.21150
Episode 335	Reward: 0.09500	Average Reward: 0.21150
Episode 336	Reward: -0.00500	Average Reward: 0.20800
Episode 33

Episode 476	Reward: 0.14500	Average Reward: 0.26650
Episode 477	Reward: 0.49500	Average Reward: 0.26800
Episode 478	Reward: 0.49500	Average Reward: 0.27200
Episode 479	Reward: 0.09500	Average Reward: 0.27250
Episode 480	Reward: 0.29500	Average Reward: 0.27450
Episode 481	Reward: 0.29500	Average Reward: 0.27450
Episode 482	Reward: 0.29500	Average Reward: 0.27300
Episode 483	Reward: 0.39500	Average Reward: 0.27150
Episode 484	Reward: 0.24500	Average Reward: 0.26750
Episode 485	Reward: 0.54500	Average Reward: 0.27150
Episode 486	Reward: 0.24500	Average Reward: 0.27200
Episode 487	Reward: 0.49500	Average Reward: 0.27650
Episode 488	Reward: 0.44500	Average Reward: 0.28050
Episode 489	Reward: 0.09500	Average Reward: 0.27700
Episode 490	Reward: 0.29500	Average Reward: 0.27700
Episode 491	Reward: 0.34500	Average Reward: 0.27350
Episode 492	Reward: 0.04500	Average Reward: 0.26950
Episode 493	Reward: -0.00500	Average Reward: 0.26850
Episode 494	Reward: 0.19500	Average Reward: 0.26850
Episode 495

Episode 635	Reward: 0.59500	Average Reward: 0.23940
Episode 636	Reward: 0.49500	Average Reward: 0.24340
Episode 637	Reward: 0.34500	Average Reward: 0.24490
Episode 638	Reward: 0.44500	Average Reward: 0.24840
Episode 639	Reward: -0.00500	Average Reward: 0.24840
Episode 640	Reward: 0.04500	Average Reward: 0.24790
Episode 641	Reward: 0.44500	Average Reward: 0.25090
Episode 642	Reward: 0.09500	Average Reward: 0.24590
Episode 643	Reward: 0.04500	Average Reward: 0.24090
Episode 644	Reward: 0.19500	Average Reward: 0.24290
Episode 645	Reward: -0.00500	Average Reward: 0.23890
Episode 646	Reward: -0.00500	Average Reward: 0.23640
Episode 647	Reward: 0.04500	Average Reward: 0.23590
Episode 648	Reward: 0.09500	Average Reward: 0.23540
Episode 649	Reward: 0.04500	Average Reward: 0.23490
Episode 650	Reward: 0.49500	Average Reward: 0.23890
Episode 651	Reward: 0.44500	Average Reward: 0.24290
Episode 652	Reward: 0.04500	Average Reward: 0.23990
Episode 653	Reward: 0.69500	Average Reward: 0.24540
Episode 6

Episode 793	Reward: 0.49500	Average Reward: 0.23495
Episode 794	Reward: 0.44500	Average Reward: 0.23845
Episode 795	Reward: 0.79500	Average Reward: 0.24345
Episode 796	Reward: 0.04500	Average Reward: 0.24345
Episode 797	Reward: 0.24500	Average Reward: 0.24545
Episode 798	Reward: 0.19500	Average Reward: 0.24595
Episode 799	Reward: 0.04500	Average Reward: 0.24545
Episode 800	Reward: 0.44500	Average Reward: 0.24595
Episode 801	Reward: 0.04500	Average Reward: 0.24445
Episode 802	Reward: 0.14500	Average Reward: 0.24145
Episode 803	Reward: 0.24500	Average Reward: 0.24145
Episode 804	Reward: 0.04500	Average Reward: 0.24145
Episode 805	Reward: 0.04500	Average Reward: 0.24145
Episode 806	Reward: 0.04500	Average Reward: 0.23245
Episode 807	Reward: 0.34500	Average Reward: 0.23545
Episode 808	Reward: 0.09500	Average Reward: 0.22745
Episode 809	Reward: 0.19500	Average Reward: 0.22895
Episode 810	Reward: -0.00500	Average Reward: 0.22645
Episode 811	Reward: 0.44500	Average Reward: 0.21095
Episode 812

Episode 951	Reward: 0.19500	Average Reward: 0.25555
Episode 952	Reward: 0.04500	Average Reward: 0.25505
Episode 953	Reward: 0.09500	Average Reward: 0.25555
Episode 954	Reward: 0.09500	Average Reward: 0.25205
Episode 955	Reward: 0.09500	Average Reward: 0.25255
Episode 956	Reward: 0.24500	Average Reward: 0.25355
Episode 957	Reward: 0.04500	Average Reward: 0.25355
Episode 958	Reward: 0.34500	Average Reward: 0.25255
Episode 959	Reward: 0.24500	Average Reward: 0.25355
Episode 960	Reward: 0.09500	Average Reward: 0.25405
Episode 961	Reward: 0.04500	Average Reward: 0.25355
Episode 962	Reward: 0.24500	Average Reward: 0.25555
Episode 963	Reward: 0.59500	Average Reward: 0.25755
Episode 964	Reward: 0.04500	Average Reward: 0.25805
Episode 965	Reward: 0.09500	Average Reward: 0.25005
Episode 966	Reward: 0.44500	Average Reward: 0.25405
Episode 967	Reward: 0.04500	Average Reward: 0.25205
Episode 968	Reward: 0.14500	Average Reward: 0.25055
Episode 969	Reward: 0.39500	Average Reward: 0.25355
Episode 970	

Episode 1107	Reward: 1.19500	Average Reward: 0.23040
Episode 1108	Reward: 0.19500	Average Reward: 0.23140
Episode 1109	Reward: 0.74500	Average Reward: 0.23840
Episode 1110	Reward: 0.54500	Average Reward: 0.23540
Episode 1111	Reward: 0.24500	Average Reward: 0.23340
Episode 1112	Reward: 0.94500	Average Reward: 0.23695
Episode 1113	Reward: 0.24500	Average Reward: 0.23895
Episode 1114	Reward: 0.19500	Average Reward: 0.23745
Episode 1115	Reward: 0.09500	Average Reward: 0.23795
Episode 1116	Reward: 0.14500	Average Reward: 0.23895
Episode 1117	Reward: 0.29500	Average Reward: 0.24145
Episode 1118	Reward: 0.49500	Average Reward: 0.24595


ValueError: Expected parameter loc (Tensor of shape (50, 2, 2)) of distribution Normal(loc: torch.Size([50, 2, 2]), scale: torch.Size([50, 2, 2])) to satisfy the constraint Real(), but found invalid values:
tensor([[[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan]]], grad_fn=<TanhBackward0>)

In [None]:
env.close()