# Continuous Control

---

You are welcome to use this coding environment to train your agent for the project.  Follow the instructions below to get started!

### 1. Start the Environment

Run the next code cell to install a few packages.  This line will take a few minutes to run!

In [1]:
!pip -q install ./python

[31mtensorflow 1.7.1 has requirement numpy>=1.13.3, but you'll have numpy 1.12.1 which is incompatible.[0m
[31mipython 6.5.0 has requirement prompt-toolkit<2.0.0,>=1.0.15, but you'll have prompt-toolkit 3.0.5 which is incompatible.[0m


The environments corresponding to both versions of the environment are already saved in the Workspace and can be accessed at the file paths provided below.  

Please select one of the two options below for loading the environment.

In [None]:
from unityagents import UnityEnvironment
import numpy as np

from collections import deque
import matplotlib.pyplot as plt

# select this option to load version 1 (with a single agent) of the environment
env = UnityEnvironment(file_name='/Reacher_Windows_x86_64/Reacher_Windows_x86_64/Reacher.exeReacher_One_Linux_NoVis.x86_64')

# select this option to load version 2 (with 20 agents) of the environment
# env = UnityEnvironment(file_name='/data/Reacher_Linux_NoVis/Reacher.x86_64')

Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [3]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### 2. Examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [4]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [  0.00000000e+00  -4.00000000e+00   0.00000000e+00   1.00000000e+00
  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00  -1.00000000e+01   0.00000000e+00
   1.00000000e+00  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   5.75471878e+00  -1.00000000e+00
   5.55726671e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
  -1.68164849e-01]


In [5]:
from collections import deque, namedtuple

import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [6]:
BUFFER_SIZE = int(1e6)
BATCH_SIZE = 32
ACTOR_LR = 1e-4
CRITIC_LR = 1e-3
TAU = 1e-3
BETA = 1
BETA_DECAY = 0.99
BETA_MIN = 0.01
GAMMA = 0.99
UPDATE_EVERY = 1
TIMES_UPDATE = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
class CriticNetwork(nn.Module):
    
    def __init__(self, state_size, action_size):

        super(CriticNetwork, self).__init__()

        self.critic_layer_1 = nn.Linear(state_size, 64)
        self.critic_layer_2 = nn.Linear(64+action_size, 64)
        self.critic_out = nn.Linear(64, 1)
    
    def forward(self, state, action):
#         print(state.size(), action.size())
        x = F.relu(self.critic_layer_1(state))
        x = torch.cat([x, action], dim=-1).float()
        x = F.relu(self.critic_layer_2(x))
        return self.critic_out(x)

class ActorNetwork(nn.Module):

    def __init__(self, state_size, action_size):

        super(ActorNetwork, self).__init__()

        self.actor_layer_1 = nn.Linear(state_size, 64)
        self.actor_layer_2 = nn.Linear(64, 64)
        self.actor_out = nn.Linear(64, action_size)
    
    def forward(self, state):
        x = F.relu(self.actor_layer_1(state))
        x = F.relu(self.actor_layer_2(x))
        return F.tanh(self.actor_out(x))

In [8]:
class Network:
    def __init__(self, state_size, action_size):
        self.actor = ActorNetwork(state_size, action_size).to(device)
        self.critic = CriticNetwork(state_size, action_size).to(device)

    def copy(self, network):
        # Actor
        for target_param, local_param in zip(self.actor.parameters(), network.actor.parameters()):
            target_param.data.copy_(local_param.data)
        
        # Critic
        for target_param, local_param in zip(self.critic.parameters(), network.critic.parameters()):
            target_param.data.copy_(local_param.data)
    
    def soft_update(self, network, tau=TAU):
        
        # Actor
        for target_param, local_param in zip(self.actor.parameters(), network.actor.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
        
        # Critic
        for target_param, local_param in zip(self.critic.parameters(), network.critic.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [9]:
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, seed=0):
        """Initialize a ReplayBuffer object.

        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.memory = deque(maxlen=buffer_size)  
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
    
    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)
    
    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
  
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [10]:
class Agent:

    def __init__(self, state_size, action_size, action_low=-1, action_high=1):
        
        self.state_size = state_size
        self.action_size = action_size
        self.a_low = action_low
        self.a_high = action_high
        self.network = Network(state_size, action_size)
        
        self.actor_opt = optim.Adam(self.network.actor.parameters(), lr=ACTOR_LR)
        self.critic_opt = optim.Adam(self.network.critic.parameters(), lr=CRITIC_LR)

        self.target_network = Network(state_size, action_size)
        self.target_network.copy(self.network)
        self.memory = ReplayBuffer()
        self.t_step = 0
    
    def act(self, state, eta):
        
        state = torch.tensor(state).float().to(device)
        
        action = self.network.actor.forward(state)# + eta
        action = action.data.cpu().numpy()
        action = np.clip(action, self.a_low, self.a_high)

        return action
    
    def step(self, state, action, reward, next_state, done):
        
        self.memory.add(state, action, reward, next_state, done)
        
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        if len(self.memory) > BATCH_SIZE and self.t_step == 0:
            for i in range(TIMES_UPDATE):
                experiences = self.memory.sample()
                self.learn(experiences)
    
    def learn(self, experiences, gamma=GAMMA):
        
        states, actions, rewards, next_states, dones = experiences

        Q_target_next = self.target_network.critic.forward(next_states, self.target_network.actor.forward(next_states).detach())
        Q_target = rewards + (gamma * Q_target_next * (1-dones))
        
        Q_predicted = self.network.critic.forward(states, actions,)
        critic_loss = F.mse_loss(Q_predicted, Q_target)
        
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        Q_val = self.network.critic.forward(states, self.network.actor(states)).mean()
        actor_loss = -(Q_val.mean())
        
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        self.target_network.soft_update(self.network)

In [11]:
def ddpg(agent, env, brain_name, n_episodes=2000, max_t=1000, gamma=GAMMA, beta=BETA, beta_decay=BETA_DECAY, beta_min = BETA_MIN):
    
    scores = []
    scores_window = deque(maxlen=100)
    
    for i_ep in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = torch.tensor(env_info.vector_observations[0]).float().to(device)
        score = 0
        for t in range(max_t):
            eta = np.random.uniform(-beta, beta)
            action = agent.act(state, eta)
            env_info = env.step(action.reshape(1, agent.action_size))[brain_name]
            next_state = torch.tensor(env_info.vector_observations[0]).float().to(device)
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}, Max score: {}, Min score: {}'.format(i_ep, np.mean(scores_window), np.max(scores_window), np.min(scores_window)), end="")
        beta = max(beta_min, beta*beta_decay)

        if i_ep % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_ep, np.mean(scores_window)))
            torch.save(agent.network.actor.state_dict(), f'ddpg_actor_checkpoint.pth') 
            torch.save(agent.network.critic.state_dict(), f'ddpg_critic_checkpoint.pth')

        if np.mean(scores_window) > 30:
            print('\rSolved goal on episode {} with average score {}'.format(i_episode, np.mean(scores_window)))
            torch.save(agent.network.actor.state_dict(), f'ddpg_actor_solution.pth') 
            torch.save(agent.network.critic.state_dict(), f'ddpg_critic_solution.pth')
            
    return scores

In [12]:
agent = Agent(state_size, action_size)

In [None]:
scores = ddpg(agent, env, brain_name, n_episodes=500, max_t=100)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

Episode 100	Average Score: 0.01, Max score: 0.09999999776482582, Min score: 0.0
Episode 109	Average Score: 0.01, Max score: 0.09999999776482582, Min score: 0.0