# Continuous Control

---

### 1. Environment setup

Run the second version of the environment (with 20 agents)

In [1]:
from unityagents import UnityEnvironment
import numpy as np

env = UnityEnvironment(file_name='./Reacher_Linux/Reacher.x86_64')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


### 2. Register important stuff

In [2]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
n_agents = len(env_info.agents)
state_dim = env_info.vector_observations.shape[1]
action_dim = brain.vector_action_space_size
print(f"n_agents: {n_agents}, state_dim: {state_dim}, action_dim: {action_dim}")

n_agents: 20, state_dim: 33, action_dim: 4


In [3]:
states = env_info.vector_observations
actions = np.random.randn(n_agents, action_dim)
env_info = env.step(actions)[brain_name]
rewards = env_info.rewards
dones = env_info.local_done
for val, name in zip([states, actions, rewards, dones], ['states', 'actions', 'rewards', 'dones']):
    print(f"type({name}): {type(val)}")
    if isinstance(val, np.ndarray):
        print(f"{name}.shape: {val.shape}, {name}.dtype: {val.dtype}")
        
print(states.max(), states.min())

type(states): <class 'numpy.ndarray'>
states.shape: (20, 33), states.dtype: float64
type(actions): <class 'numpy.ndarray'>
actions.shape: (20, 4), actions.dtype: float64
type(rewards): <class 'list'>
type(dones): <class 'list'>
7.980512619018555 -10.0


### 3. Train the agent

In [4]:
from collections import deque
from control.agent import Agent
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Some helper functions
def solved(episode, scores, goal=30):
    return len(scores) == 100 and np.mean(scores) >= goal

def status(episode, scores, losses):
    print(
        "\rEpisode {}\t Mean Scores: {:.2f} \t Mean Losses: {:.2f}".format(
            episode, np.mean(scores), np.mean(losses), end=""
        )
    )
    if i_episode % 100 == 0:
        clear_output(True)
        plot(episode, scores, losses)

def plot(episode, scores, losses):
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title(f"Episode: {episode}, Last 100 scores")
    plt.plot(scores)
    plt.subplot(132)
    plt.title('Last 100 losses')
    plt.plot(losses)
    plt.show()
    
def plot_final(scores):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

In [6]:
env_info = env.reset(train_mode=True)[brain_name]
scores = deque(maxlen=100)
losses = deque(maxlen=100)
all_scores = []
n_episodes = 10

agent = Agent(state_dim, action_dim, n_agents)

for i_episode in range(1, n_episodes+1):
    dones = np.repeat(False, n_agents)
    agent_scores = []
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    
    while not any(dones):
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        loss = agent.step(next_states, rewards, dones)
        if loss:
            losses.append(loss)
        agent_scores.append(rewards)
        states = next_states
    
    score = np.sum(agent_scores, axis=1).mean()
    scores.append(score)
    all_scores.append(score)
    status(i_episode, scores, losses)
    
    if solved(i_episode, scores):
        print(f"Solved Control problem at episode: {i_episode}, with average score: {np.mean(scores):.2f}")
        print("Saving model parameters to 'checkpoint.pth'")
        agent.model.checkpoint()
        break

plot_final(all_scores)

Episode 1	 Mean Scores: 0.01 	 Mean Losses: 321718.83
Episode 2	 Mean Scores: 0.00 	 Mean Losses: 20039785.93
Episode 3	 Mean Scores: 0.00 	 Mean Losses: 179188146.44
Episode 4	 Mean Scores: 0.00 	 Mean Losses: 744045157.25


KeyboardInterrupt: 

In [None]:
# env.close()