# Continuous Control(No Success)

In [1]:
from collections import deque
from unityagents import UnityEnvironment
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import os

from agent import *

### Start the Environment

In [2]:
# select this option to load version 1 (with a single agent) of the environment
env_file_name = os.path.abspath("Reacher_Linux/Reacher.x86_64")
env = UnityEnvironment(file_name=env_file_name, no_graphics=True)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
print(brain_name)

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


ReacherBrain
Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


### Training Script

In [3]:
def layer_init(layer, w_scale=1.0):
    nn.init.orthogonal_(layer.weight.data)
    layer.weight.data.mul_(w_scale)
    nn.init.constant_(layer.bias.data, 0)
    return layer

class ContinuousActorNet(nn.Module):
    """Deep Q Model."""

    def __init__(self, state_size, action_size, seed=0):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """
        super(ContinuousActorNet, self).__init__()
        self.seed = torch.manual_seed(seed)


        self.mu   = nn.Sequential(layer_init(nn.Linear(state_size, 128), 1e-3), nn.ReLU(inplace=True),
                                  layer_init(nn.Linear(128       , 128), 1e-3), nn.ReLU(inplace=True),
                                  layer_init(nn.Linear(128       , 64 ), 1e-3), nn.ReLU(inplace=True),
                                  layer_init(nn.Linear(64        , action_size), 1e-3), nn.Tanh())
        self.std_val = 0
    def forward(self, state, i_episode):
        mean = self.mu(state)
        sigma = max(0.5 - (0.1 - 0.5)/100 * i_episode, 0.01)
        dist = torch.distributions.Normal(mean, torch.tensor(sigma))
        action = dist.sample()
        log_prob = dist.log_prob(action)
        entropy = dist.entropy()
        return action, log_prob, entropy
    
ContinuousActorNet(state_size, action_size)

class ContinuousCriticNet(nn.Module):
    """Deep Deterministic Critic Model."""

    def __init__(self, state_size, seed=0):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """
        super(ContinuousCriticNet, self).__init__()
        self.seed = torch.manual_seed(seed)
        
        # estimate advantage by TD error
        self.model = nn.Sequential(layer_init(nn.Linear(state_size, 128), 0.5), nn.ReLU(inplace=True),
                                   layer_init(nn.Linear(128       , 64), 0.5), nn.ReLU(inplace=True),
                                   layer_init(nn.Linear(64, 1), 0.5), nn.ReLU(inplace=True))

    def forward(self, state):
        return self.model(state)

ContinuousCriticNet(state_size)

def train(params, n_episodes=1800, max_t=1000):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    
    critic = ContinuousCriticNet(state_size).to(device)
    critic_optimizer = optim.Adam(critic.parameters(), lr=params.critic_lr)
        
    actor = ContinuousActorNet(state_size, action_size).to(device)
    actor_optimizer = optim.Adam(actor.parameters(), lr=params.actor_lr)
    
    
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes+1):

        
        for i in range(2):
            env_info = env.reset(train_mode=True)[brain_name] # reset the environment
            state = env_info.vector_observations[0]           # get the current state
            collect_value = []
            collect_state = []
            collect_reward = []
            collect_done   = []
            collect_log_prob = []
            collect_entropy  = []
            collect_next  = []
            score = 0
            for t in range(max_t):
                state = torch.tensor(state, dtype=torch.float)
                action, log_prob, entropy = actor(state, i_episode)
                value = critic(state)
                action = np.clip(action.cpu().data.numpy(), -1, 1)
                env_info   = env.step(action)[brain_name]      # send the action to the environment
                next_state = env_info.vector_observations[0]   # get the next state
                reward     = env_info.rewards[0]               # get the reward
                done       = env_info.local_done[0]            # see if episode has finished
                collect_value.append(value)
                collect_state.append(state)
                collect_reward.append(reward)
                collect_done.append(done)
                collect_log_prob.append(log_prob)
                collect_entropy.append(entropy)
                collect_next.append(next_state)
                state = next_state
                score += reward
                if done:
                    break 

            v = critic(torch.tensor(next_state, dtype=torch.float)).detach()
            returns = []
            for reward, done in zip(reversed(collect_reward), reversed(collect_done)):
                q = reward + params.gamma * (1 - done) * v
                returns.insert(0, q)

            values   = 1e2*torch.stack(collect_value)
            states   = torch.stack(collect_state)
            returns  = 1e2*torch.tensor(returns, dtype=torch.float).unsqueeze(dim=1)
            log_prob = torch.stack(collect_log_prob)
            entropy  = torch.stack(collect_entropy)
            next_states = torch.tensor(collect_next, dtype=torch.float).unsqueeze(dim=1)
        
            loss_critic = F.mse_loss(returns, values)

            critic_optimizer.zero_grad()
            loss_critic.backward()
            critic_optimizer.step()
        print(f'loss_critic:{loss_critic}')
        values = critic(states)
        
        adv = returns - values
        tmp = -log_prob * adv.detach()
        
        loss_actor = torch.mean(-log_prob * adv.detach())
        
        actor_optimizer.zero_grad()    
        loss_actor.backward()
        actor_optimizer.step()


        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if i_episode % 100 == 0 and np.mean(scores_window)>=30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            #torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')

    return scores

In [4]:
def plot_scores(scores, filename, rolling_window=100):
    """Plot scores and optional rolling mean using specified window."""
    plt.figure(figsize=(19.20,10.80))
    plt.plot(scores, color='b', linestyle='-', linewidth=0.75) 
    plt.title("Scores");
    rolling_mean = pd.Series(scores).rolling(rolling_window, min_periods=1).mean()
    plt.plot(rolling_mean, color='r', linestyle='-', linewidth=0.75)
    if filename is not None:
        plt.savefig(filename)

### Train A2C

In [None]:
class Params:
    """Set up configuration here."""
    def __init__(self):
        self.__dict__.update(**{
            'gamma' : 0.5,            # discount factor
            'actor_lr' : 1e-6,               # learning rate 
            'critic_lr' : 1e-3,
            'entropy_weight': 1e-3
})
        
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

params = Params()


# agent = ContinuousACagent(state_size=state_size, action_size=action_size, params=params, device=device)
scores = train(params, n_episodes=50, max_t=1000)

#filename = model_name + '.png'
plot_scores(scores=scores, filename=None)

loss_critic:1.5989094972610474
loss_actor:0.6171565651893616
Episode 1	Average Score: 0.57loss_critic:0.0
loss_actor:0.0
Episode 2	Average Score: 0.28loss_critic:0.0
loss_actor:0.0
Episode 3	Average Score: 0.19loss_critic:0.0
loss_actor:0.0
Episode 4	Average Score: 0.14loss_critic:0.14900000393390656
loss_actor:0.03128720447421074
Episode 5	Average Score: 0.19loss_critic:0.0
loss_actor:0.0
Episode 6	Average Score: 0.16loss_critic:0.09300000220537186
loss_actor:0.016963299363851547
Episode 7	Average Score: 0.17loss_critic:0.008999999612569809
loss_actor:0.004105106461793184
Episode 8	Average Score: 0.15loss_critic:0.0
loss_actor:0.0
Episode 9	Average Score: 0.14loss_critic:0.09799999743700027
loss_actor:0.031199676916003227
Episode 10	Average Score: 0.15loss_critic:0.3050000071525574
loss_actor:0.06510315835475922
Episode 11	Average Score: 0.21loss_critic:0.0
loss_actor:0.0
Episode 12	Average Score: 0.19loss_critic:0.0
loss_actor:0.0
Episode 13	Average Score: 0.18loss_critic:0.0
loss_ac