# Continuous Control: Tennis (Multi-Agent DDPG)

---

### 1. Start the Environment

In [1]:
from unityagents import UnityEnvironment
import numpy as np

# Start the Unity environment from the Tennis build at this relative path.
env = UnityEnvironment(file_name='apps/Tennis.app')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [2]:
# Get the default brain: the first name the environment lists, and the brain
# object registered under that name. `brain_name` is also the key used to
# index the results of env.reset() / env.step() in later cells.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [3]:
# Reset in training mode and keep only the info belonging to our brain.
env_info = env.reset(train_mode=True)[brain_name]

# How many agents the environment reports.
num_agents = len(env_info.agents)
print('Number of agents: {}'.format(num_agents))

# Dimensionality of a single agent's action vector.
action_size = brain.vector_action_space_size
print('Size of each action: {}'.format(action_size))

# Observations come back as one row per agent; the row length is the state size.
states = env_info.vector_observations
state_size = states.shape[1]
summary = 'There are {} agents. Each observes a state with length: {}'
print(summary.format(states.shape[0], state_size))
print('The state for the first agent looks like: {}'.format(states[0]))

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [4]:
import copy
import os
import random
from collections import deque, namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [5]:
# Models
def hidden_init(layer):
    """Return the symmetric uniform range used to initialise a hidden layer.

    The bound is ``1 / sqrt(fan_in)``, where ``fan_in`` is the number of
    inputs feeding each unit of the layer.

    Args:
        layer: A layer exposing a 2-D ``weight``; ``nn.Linear`` stores it as
            ``(out_features, in_features)``.

    Returns:
        tuple: ``(-lim, lim)``, ready to be unpacked into ``uniform_``.
    """
    # nn.Linear keeps its weight as (out_features, in_features), so the fan-in
    # is dimension 1; dimension 0 is the number of output units (fan-out).
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Critic(nn.Module):
    """Value network scoring joint (states, actions) inputs.

    ``forward`` flattens every sample to 48 state features and 4 action
    features and produces 2 values per sample.
    """

    def __init__(self, seed):
        """Seed torch's RNG, create the three linear layers and initialise them.

        Args:
            seed: Value handed to ``torch.manual_seed``.
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(48, 400)
        # 404 = 400 hidden activations + 4 action values concatenated in forward().
        self.fc2 = nn.Linear(404, 300)
        self.fc3 = nn.Linear(300, 2)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights: hidden layers from ``hidden_init``, output from +/-3e-3."""
        for hidden_layer in (self.fc1, self.fc2):
            low, high = hidden_init(hidden_layer)
            hidden_layer.weight.data.uniform_(low, high)
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, states, actions):
        """Map states and actions to values.

        Args:
            states: Tensor reshaped here to ``(-1, 48)``.
            actions: Tensor reshaped here to ``(-1, 4)``.

        Returns:
            Tensor with 2 values for each row of the flattened inputs.
        """
        flat_states = states.reshape(-1, 48)
        flat_actions = actions.reshape(-1, 4)
        state_features = F.relu(self.fc1(flat_states))
        # Actions join the network after the first layer has processed the states.
        joined = torch.cat((state_features, flat_actions), -1)
        return self.fc3(F.relu(self.fc2(joined)))
    
class Actor(nn.Module):
    """Policy network: 24 inputs -> 400 -> 300 -> 2 outputs squashed by tanh."""

    def __init__(self, seed):
        """Seed torch's RNG, create the three linear layers and initialise them.

        Args:
            seed: Value handed to ``torch.manual_seed``.
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(24, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, 2)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights: hidden layers from ``hidden_init``, output from +/-3e-3."""
        for hidden_layer in (self.fc1, self.fc2):
            low, high = hidden_init(hidden_layer)
            hidden_layer.weight.data.uniform_(low, high)
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, states):
        """Return one action per input row; tanh keeps every component in [-1, 1].

        Args:
            states: Tensor whose last dimension has 24 features.
        """
        hidden = F.relu(self.fc1(states))
        hidden = F.relu(self.fc2(hidden))
        return F.tanh(self.fc3(hidden))

In [6]:
class Agent(nn.Module):
    """DDPG-style learner: one actor, one critic with a target copy, and a replay buffer.

    The replay buffer is a set of preallocated tensors used as a ring buffer.
    There is no target actor: next actions for the TD target come from the
    current actor, while the value of those actions comes from
    ``critic_target``.

    Args:
        buffer_size: Number of transitions the replay buffer can hold.
        batch_size: Number of transitions drawn per learning step; learning
            starts once this many transitions have been stored.
        gamma: Discount factor applied to the bootstrapped next-state value.
        tau: Interpolation factor of the soft update of ``critic_target``.
    """

    def __init__(self, buffer_size=100000, batch_size=128, gamma=0.99, tau=0.001):
        super(Agent, self).__init__()
        self.actor = Actor(1)
        self.critic = Critic(1)
        self.critic_target = Critic(1)
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr = 1e-4)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr = 1e-4)

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        # Replay storage, one row per environment step covering every agent.
        self.states = torch.empty(buffer_size, num_agents, 24)
        self.actions = torch.empty(buffer_size, num_agents, 2)
        self.rewards = torch.empty(buffer_size, num_agents, 1)
        self.next_states = torch.empty(buffer_size, num_agents, 24)
        self.dones = torch.empty(buffer_size, num_agents, 1)
        self.idx = 0      # next slot to write
        self.count = 0    # total transitions ever pushed
        self.scale = 2    # exploration-noise scale, decayed on every act()

    def act(self, states):
        """Return exploratory actions for ``states``, clipped to [-1, 1].

        Args:
            states: Tensor of observations accepted by the actor.

        Returns:
            torch.Tensor: The actor's output plus Gaussian noise, clamped.
        """
        with torch.no_grad():
            actions = self.actor(states)
        self.scale *= 0.9999
        # A single 2-element noise vector is drawn per call and broadcast over
        # the leading dimension, i.e. every agent receives the same perturbation.
        # Convert it explicitly so the arithmetic stays entirely in torch.
        noise = torch.from_numpy(np.random.standard_normal(2) * self.scale).to(actions.dtype)
        return torch.clamp(actions + noise, -1, 1)

    def push(self, states, actions, rewards, next_states, dones):
        """Store one transition, overwriting the oldest slot once the buffer is full.

        Args:
            states, actions, next_states: Tensors assignable to one buffer row.
            rewards, dones: Per-agent sequences; converted to float column tensors.
        """
        self.states[self.idx] = states
        self.actions[self.idx] = actions
        self.rewards[self.idx] = torch.tensor(rewards, dtype = torch.float).unsqueeze(1)
        self.next_states[self.idx] = next_states
        self.dones[self.idx] = torch.tensor(dones, dtype = torch.float).unsqueeze(1)

        self.count += 1
        self.idx = self.count % self.buffer_size

    def sample(self):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            tuple: ``(states, actions, rewards, next_states, dones)`` batches.

        Raises:
            ValueError: From ``random.sample`` if fewer than ``batch_size``
                transitions have been stored.
        """
        # Every filled slot is eligible, including the last one once the
        # buffer has wrapped around.
        filled = min(self.count, self.buffer_size)
        batch_idxs = random.sample(range(filled), self.batch_size)
        return self.states[batch_idxs], self.actions[batch_idxs], self.rewards[batch_idxs], self.next_states[batch_idxs], self.dones[batch_idxs]

    def step(self):
        """Run one critic update, one soft target update and one actor update.

        Does nothing until at least ``batch_size`` transitions are stored.
        """
        if self.count < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.sample()

        # The TD target is a constant for the critic loss, so build it without
        # autograd; otherwise gradients are computed for (and accumulate on)
        # the actor and the target critic for no purpose.
        with torch.no_grad():
            next_actions = self.actor(next_states)
            q_next = self.critic_target(next_states, next_actions).unsqueeze(2)
            q_target = rewards + self.gamma * q_next * (1 - dones)

        q_expected = self.critic(states, actions).unsqueeze(2)
        td_error = torch.mean((q_target - q_expected)**2)

        self.optimizer_critic.zero_grad()
        td_error.backward()
        self.optimizer_critic.step()

        # Soft update: move the target critic a fraction tau towards the critic.
        for target_param, local_param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)

        # Actor update: maximise the critic's value of the actor's own actions.
        actions_pred = self.actor(states)
        rewards_pred = self.critic(states, actions_pred).unsqueeze(2)
        rewards_calc = -torch.mean(rewards_pred)

        self.optimizer_actor.zero_grad()
        rewards_calc.backward()
        self.optimizer_actor.step()

agent = Agent()

In [7]:
# Train for at most 1500 episodes. Each episode's score is the per-agent sum of
# rewards; progress is reported as the mean over agents and over the last
# 100 episodes.
scores = []
scores_window = deque(maxlen=100)
for i in range(1500):
    env_info = env.reset(train_mode=True)[brain_name]
    states = torch.tensor(env_info.vector_observations, dtype = torch.float)
    score = np.zeros(num_agents)
    while True:
        actions = agent.act(states)
        env_info = env.step(actions.cpu().data.numpy())[brain_name]
        next_states = torch.tensor(env_info.vector_observations, dtype = torch.float)
        rewards = env_info.rewards
        dones = env_info.local_done
        score += rewards
        agent.push(states, actions, rewards, next_states, dones)
        agent.step()
        states = next_states
        # The episode ends as soon as any agent reports done.
        if np.any(dones):
            break
    scores_window.append(score)
    scores.append(score)

    print('\rEpisode {}\tReward: {:.5f}\tAverage Reward: {:.5f}'.format(i, np.mean(score), np.mean(scores_window)))
    # Only declare success once the window holds a full 100 episodes; the
    # reported episode count (i-100) is meaningless before that.
    window_full = len(scores_window) == scores_window.maxlen
    if window_full and np.mean(scores_window) >= 2.2:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i-100, np.mean(scores_window)))
        # Make sure the target directory exists so a finished run cannot fail
        # at the very last step and lose the trained weights.
        os.makedirs('checkpoint', exist_ok=True)
        torch.save(agent.critic.state_dict(), 'checkpoint/trained_weights_ddpg_v2_critic.pth')
        torch.save(agent.actor.state_dict(), 'checkpoint/trained_weights_ddpg_v2_policy.pth')
        break



Episode 0	Reward: -0.00500	Average Reward: -0.00500
Episode 1	Reward: -0.00500	Average Reward: -0.00500
Episode 2	Reward: -0.00500	Average Reward: -0.00500
Episode 3	Reward: 0.04500	Average Reward: 0.00750
Episode 4	Reward: -0.00500	Average Reward: 0.00500
Episode 5	Reward: -0.00500	Average Reward: 0.00333
Episode 6	Reward: -0.00500	Average Reward: 0.00214
Episode 7	Reward: -0.00500	Average Reward: 0.00125
Episode 8	Reward: -0.00500	Average Reward: 0.00056
Episode 9	Reward: -0.00500	Average Reward: 0.00000
Episode 10	Reward: -0.00500	Average Reward: -0.00045
Episode 11	Reward: -0.00500	Average Reward: -0.00083
Episode 12	Reward: -0.00500	Average Reward: -0.00115
Episode 13	Reward: 0.04500	Average Reward: 0.00214
Episode 14	Reward: -0.00500	Average Reward: 0.00167
Episode 15	Reward: -0.00500	Average Reward: 0.00125
Episode 16	Reward: -0.00500	Average Reward: 0.00088
Episode 17	Reward: -0.00500	Average Reward: 0.00056
Episode 18	Reward: -0.00500	Average Reward: 0.00026
Episode 19	Reward:

Episode 157	Reward: -0.00500	Average Reward: 0.00350
Episode 158	Reward: -0.00500	Average Reward: 0.00350
Episode 159	Reward: -0.00500	Average Reward: 0.00350
Episode 160	Reward: 0.04500	Average Reward: 0.00400
Episode 161	Reward: 0.09500	Average Reward: 0.00500
Episode 162	Reward: 0.09500	Average Reward: 0.00550
Episode 163	Reward: -0.00500	Average Reward: 0.00550
Episode 164	Reward: -0.00500	Average Reward: 0.00550
Episode 165	Reward: -0.00500	Average Reward: 0.00550
Episode 166	Reward: -0.00500	Average Reward: 0.00550
Episode 167	Reward: -0.00500	Average Reward: 0.00550
Episode 168	Reward: -0.00500	Average Reward: 0.00550
Episode 169	Reward: -0.00500	Average Reward: 0.00550
Episode 170	Reward: 0.04500	Average Reward: 0.00600
Episode 171	Reward: -0.00500	Average Reward: 0.00600
Episode 172	Reward: -0.00500	Average Reward: 0.00550
Episode 173	Reward: -0.00500	Average Reward: 0.00550
Episode 174	Reward: -0.00500	Average Reward: 0.00550
Episode 175	Reward: -0.00500	Average Reward: 0.005

Episode 313	Reward: -0.00500	Average Reward: 0.01100
Episode 314	Reward: -0.00500	Average Reward: 0.01100
Episode 315	Reward: -0.00500	Average Reward: 0.01100
Episode 316	Reward: -0.00500	Average Reward: 0.01100
Episode 317	Reward: -0.00500	Average Reward: 0.01100
Episode 318	Reward: -0.00500	Average Reward: 0.01100
Episode 319	Reward: 0.04500	Average Reward: 0.01150
Episode 320	Reward: -0.00500	Average Reward: 0.01000
Episode 321	Reward: -0.00500	Average Reward: 0.01000
Episode 322	Reward: -0.00500	Average Reward: 0.01000
Episode 323	Reward: -0.00500	Average Reward: 0.00900
Episode 324	Reward: -0.00500	Average Reward: 0.00900
Episode 325	Reward: -0.00500	Average Reward: 0.00900
Episode 326	Reward: -0.00500	Average Reward: 0.00900
Episode 327	Reward: -0.00500	Average Reward: 0.00900
Episode 328	Reward: 0.04500	Average Reward: 0.00950
Episode 329	Reward: -0.00500	Average Reward: 0.00900
Episode 330	Reward: -0.00500	Average Reward: 0.00900
Episode 331	Reward: 0.04500	Average Reward: 0.00

Episode 469	Reward: 0.04500	Average Reward: 0.00500
Episode 470	Reward: -0.00500	Average Reward: 0.00500
Episode 471	Reward: 0.09500	Average Reward: 0.00600
Episode 472	Reward: -0.00500	Average Reward: 0.00550
Episode 473	Reward: -0.00500	Average Reward: 0.00550
Episode 474	Reward: -0.00500	Average Reward: 0.00550
Episode 475	Reward: 0.04500	Average Reward: 0.00600
Episode 476	Reward: -0.00500	Average Reward: 0.00600
Episode 477	Reward: -0.00500	Average Reward: 0.00600
Episode 478	Reward: 0.09500	Average Reward: 0.00700
Episode 479	Reward: -0.00500	Average Reward: 0.00700
Episode 480	Reward: -0.00500	Average Reward: 0.00700
Episode 481	Reward: -0.00500	Average Reward: 0.00600
Episode 482	Reward: -0.00500	Average Reward: 0.00600
Episode 483	Reward: -0.00500	Average Reward: 0.00600
Episode 484	Reward: 0.04500	Average Reward: 0.00650
Episode 485	Reward: -0.00500	Average Reward: 0.00650
Episode 486	Reward: -0.00500	Average Reward: 0.00600
Episode 487	Reward: 0.04500	Average Reward: 0.00650

Episode 626	Reward: 0.04500	Average Reward: 0.02350
Episode 627	Reward: -0.00500	Average Reward: 0.02300
Episode 628	Reward: -0.00500	Average Reward: 0.02250
Episode 629	Reward: 0.04500	Average Reward: 0.02300
Episode 630	Reward: -0.00500	Average Reward: 0.02300
Episode 631	Reward: -0.00500	Average Reward: 0.02250
Episode 632	Reward: -0.00500	Average Reward: 0.02250
Episode 633	Reward: 0.04500	Average Reward: 0.02300
Episode 634	Reward: -0.00500	Average Reward: 0.02300
Episode 635	Reward: -0.00500	Average Reward: 0.02250
Episode 636	Reward: 0.04500	Average Reward: 0.02250
Episode 637	Reward: 0.04500	Average Reward: 0.02250
Episode 638	Reward: 0.04500	Average Reward: 0.02300
Episode 639	Reward: 0.04500	Average Reward: 0.02350
Episode 640	Reward: 0.04500	Average Reward: 0.02350
Episode 641	Reward: 0.04500	Average Reward: 0.02350
Episode 642	Reward: 0.04500	Average Reward: 0.02350
Episode 643	Reward: -0.00500	Average Reward: 0.02350
Episode 644	Reward: -0.00500	Average Reward: 0.02300
Epi

Episode 783	Reward: 0.04500	Average Reward: 0.06200
Episode 784	Reward: 0.24500	Average Reward: 0.06400
Episode 785	Reward: -0.00500	Average Reward: 0.06400
Episode 786	Reward: 0.04500	Average Reward: 0.06400
Episode 787	Reward: 0.04500	Average Reward: 0.06400
Episode 788	Reward: 0.04500	Average Reward: 0.06400
Episode 789	Reward: 0.04500	Average Reward: 0.06350
Episode 790	Reward: 0.04500	Average Reward: 0.06350
Episode 791	Reward: 0.04500	Average Reward: 0.06400
Episode 792	Reward: 0.04500	Average Reward: 0.06450
Episode 793	Reward: 0.04500	Average Reward: 0.06450
Episode 794	Reward: 0.04500	Average Reward: 0.06450
Episode 795	Reward: 0.04500	Average Reward: 0.06400
Episode 796	Reward: 0.29500	Average Reward: 0.06600
Episode 797	Reward: 0.04500	Average Reward: 0.06550
Episode 798	Reward: 0.04500	Average Reward: 0.06550
Episode 799	Reward: 0.04500	Average Reward: 0.06550
Episode 800	Reward: 0.04500	Average Reward: 0.06550
Episode 801	Reward: 0.04500	Average Reward: 0.06600
Episode 802

Episode 941	Reward: 0.04500	Average Reward: 0.10200
Episode 942	Reward: 0.14500	Average Reward: 0.10250
Episode 943	Reward: 0.14500	Average Reward: 0.10300
Episode 944	Reward: 0.04500	Average Reward: 0.10300
Episode 945	Reward: 0.24500	Average Reward: 0.10500
Episode 946	Reward: 0.14500	Average Reward: 0.10400
Episode 947	Reward: 0.14500	Average Reward: 0.10500
Episode 948	Reward: 0.04500	Average Reward: 0.10150
Episode 949	Reward: 0.04500	Average Reward: 0.10150
Episode 950	Reward: 0.04500	Average Reward: 0.10150
Episode 951	Reward: 0.09500	Average Reward: 0.10100
Episode 952	Reward: 0.14500	Average Reward: 0.09700
Episode 953	Reward: 0.34500	Average Reward: 0.10000
Episode 954	Reward: 0.14500	Average Reward: 0.10100
Episode 955	Reward: 0.29500	Average Reward: 0.10350
Episode 956	Reward: 0.04500	Average Reward: 0.10350
Episode 957	Reward: 0.04500	Average Reward: 0.10350
Episode 958	Reward: 0.04500	Average Reward: 0.10350
Episode 959	Reward: 0.04500	Average Reward: 0.10350
Episode 960	

Episode 1097	Reward: 0.04500	Average Reward: 2.00495
Episode 1098	Reward: 2.60000	Average Reward: 2.02500
Episode 1099	Reward: 2.65000	Average Reward: 2.05105
Episode 1100	Reward: 2.60000	Average Reward: 2.07710
Episode 1101	Reward: 2.65000	Average Reward: 2.10065
Episode 1102	Reward: 2.65000	Average Reward: 2.12320
Episode 1103	Reward: 2.60000	Average Reward: 2.12320
Episode 1104	Reward: 2.65000	Average Reward: 2.14925
Episode 1105	Reward: 2.65000	Average Reward: 2.17530
Episode 1106	Reward: 2.60000	Average Reward: 2.19535
Episode 1107	Reward: 2.65000	Average Reward: 2.21890

Environment solved in 1007 episodes!	Average Score: 2.22


In [8]:
# Close the environment now that training has finished.
env.close()