# Continuous Control

---

Congratulations on completing the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program!  In this notebook, you will learn how to control an agent in a more challenging environment, where the goal is to train a creature with four legs to walk forward.  **Note that this exercise is optional!**

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [1]:
from unityagents import UnityEnvironment
import numpy as np

# Launch the Unity Crawler executable. The path is relative to the notebook's
# working directory; the file name suggests a Linux build without visualization.
env = UnityEnvironment(file_name='Crawler_Linux_NoVis/Crawler.x86_64')

# Use the first brain exposed by the environment as the one driven from Python.
# `brain_name` keys the per-brain results returned by env.reset() / env.step().
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset the environment in training mode and keep this brain's info object.
env_info = env.reset(train_mode=True)[brain_name]

# Number of agents simulated in parallel by the environment.
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# Size of the action vector each agent expects.
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# Examine the state space: `states` is indexed [agent, feature], so shape[1]
# is the per-agent observation length.
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: CrawlerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 129
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 20
        Vector Action descriptions: , , , , , , , , , , , , , , , , , , , 


Number of agents: 12
Size of each action: 20
There are 12 agents. Each observes a state with length: 129
The state for the first agent looks like: [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  2.25000000e+00
  1.00000000e+00  0.00000000e+00  1.78813934e-07  0.00000000e+00
  1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  6.06093168e-01 -1.42857209e-01 -6.06078804e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.33339906e+00 -1.42857209e-01
 -1.33341408e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -6.0609

In [2]:
#######################################################################
# Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com)    #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

from deep_rl import *

import torch
import numpy as np
from deep_rl.utils import *
import torch.multiprocessing as mp
from collections import deque
from skimage.io import imsave
from deep_rl.network import *
from deep_rl.component import *


class BaseAgent:
    """Shared bookkeeping for agents: logging, checkpoints, evaluation, recording.

    This base class does not create ``self.task``, ``self.network`` or
    ``self.total_steps``; the methods below read them, so a subclass must set
    them. Subclasses must also implement ``eval_step`` (and ``record_step`` if
    ``record_episode`` is used).
    """

    def __init__(self, config):
        """Store ``config`` and initialise logging and reward bookkeeping."""
        self.config = config
        self.logger = get_logger(tag=config.tag, log_level=config.log_level)
        self.task_ind = 0              # index into config.tasks (see switch_task)
        self.episode_rewards = []      # mean returns from the latest eval_episodes()
        self.rewards = None            # info['all_rewards'] from the latest step
        self.rewards_deque = None      # info['rewards_deque'] from the latest step
        self.episodic_return = None    # last training return kept by record_online_return
        self.best_score = 0            # best evaluation mean that triggered a save

    def close(self):
        """Release the training task via the ``close_obj`` helper."""
        close_obj(self.task)

    def save(self, filename):
        """Write network weights to ``<filename>.model`` and normalizer state to ``<filename>.stats``."""
        torch.save(self.network.state_dict(), '%s.model' % (filename))
        with open('%s.stats' % (filename), 'wb') as f:
            pickle.dump(self.config.state_normalizer.state_dict(), f)

    def load(self, filename):
        """Restore what ``save`` wrote; tensors are mapped onto the CPU.

        NOTE: ``pickle.load`` can execute arbitrary code, so only load
        ``.stats`` files that this notebook produced.
        """
        state_dict = torch.load('%s.model' % filename, map_location=lambda storage, loc: storage)
        self.network.load_state_dict(state_dict)
        with open('%s.stats' % (filename), 'rb') as f:
            self.config.state_normalizer.load_state_dict(pickle.load(f))

    def eval_step(self, state):
        """Return the action to take for ``state`` during evaluation (subclass hook)."""
        raise NotImplementedError

    def eval_episode(self):
        """Run ``config.eval_env`` until any agent reports done.

        Returns:
            numpy array with the summed reward of each agent. Its shape follows
            the rewards returned by the environment, so no agent count is
            assumed.
        """
        env = self.config.eval_env
        state = env.reset()
        rewards = None
        while True:
            action = self.eval_step(state)
            state, reward, done, info = env.step(action)
            reward = np.asarray(reward, dtype=np.float64)
            # Size the accumulator from the first reward instead of a fixed 12.
            rewards = reward.copy() if rewards is None else rewards + reward
            if np.any(done):
                return rewards

    def eval_episodes(self):
        """Evaluate for ``config.eval_episodes`` episodes, log and maybe checkpoint.

        The model is saved to ``./data/model-DDPG.bin`` when the mean return
        beats ``self.best_score`` and the standard error of the mean is above
        0.01.

        Returns:
            dict with key ``'episodic_return_test'`` holding the mean return.
        """
        episodic_returns = []
        for ep in range(self.config.eval_episodes):
            total_rewards = self.eval_episode()
            episodic_returns.append(np.mean(total_rewards))
        self.episode_rewards = episodic_returns

        mean_return = np.mean(episodic_returns)
        std_error = np.std(episodic_returns) / np.sqrt(len(episodic_returns))
        self.logger.info('steps %d, episodic_return_test %.2f(%.2f)' % (
            self.total_steps, mean_return, std_error
        ))
        if mean_return > self.best_score and std_error > 0.01:
            self.save('./data/model-DDPG.bin')
            self.best_score = mean_return
        return {
            'episodic_return_test': mean_return,
        }

    def record_online_return(self, info, offset=0):
        """Copy training statistics out of the ``info`` returned by the task.

        Args:
            info: dict with keys ``'episodic_return'``, ``'all_rewards'`` and
                ``'rewards_deque'``, or a tuple of such dicts.
            offset: position of ``info`` inside an enclosing tuple; currently
                only passed along, not used.

        Raises:
            NotImplementedError: if ``info`` is neither a dict nor a tuple.
        """
        if isinstance(info, dict):
            ret = info['episodic_return']
            if ret is not None:
                ret = np.mean(ret)
            self.rewards = info['all_rewards']
            self.rewards_deque = info['rewards_deque']
            if ret is None:
                # No episode finished on this step.
                return
            # Keep only every 100th episode's return. Without an episode
            # history there is nothing to count, so the return is kept as is
            # (this path used to fail on an unbound local variable).
            if self.rewards is None or len(self.rewards) % 100 == 0:
                self.episodic_return = ret
        elif isinstance(info, tuple):
            for i, info_ in enumerate(info):
                self.record_online_return(info_, i)
        else:
            raise NotImplementedError

    def switch_task(self):
        """Advance to the next entry of ``config.tasks`` once its step budget is used.

        ``config.max_steps`` is divided evenly among the tasks. Does nothing
        when no tasks are configured or the last task is already active.
        """
        config = self.config
        if not config.tasks:
            return
        if self.task_ind + 1 >= len(config.tasks):
            # Already on the last task; stepping past max_steps must not index
            # beyond config.tasks.
            return
        segs = np.linspace(0, config.max_steps, len(config.tasks) + 1)
        if self.total_steps > segs[self.task_ind + 1]:
            self.task_ind += 1
            self.task = config.tasks[self.task_ind]
            self.states = self.task.reset()
            self.states = config.state_normalizer(self.states)

    def record_episode(self, dir, env):
        """Play one episode in ``env`` and write one frame per step into ``dir``.

        The episode ends when ``info['episodic_return']`` is not ``None``.
        """
        mkdir(dir)
        steps = 0
        state = env.reset()
        while True:
            self.record_obs(env, dir, steps)
            action = self.record_step(state)
            state, reward, done, info = env.step(action)
            ret = info['episodic_return']
            steps += 1
            if ret is not None:
                break

    def record_step(self, state):
        """Return the action to take for ``state`` while recording (subclass hook)."""
        raise NotImplementedError

    # For DMControl
    def record_obs(self, env, dir, steps):
        """Render the first wrapped environment and save it as ``<dir>/<steps>.png``."""
        env = env.env.envs[0]
        obs = env.render(mode='rgb_array')
        imsave('%s/%04d.png' % (dir, steps), obs)

class DDPGAgent(BaseAgent):
    """Actor-critic agent with a target network, replay buffer and action noise.

    All components (task, networks, replay, noise process) are built from
    factory callables stored on ``config``.
    """

    def __init__(self, config):
        """Build task, online/target networks, replay buffer and noise process."""
        # BaseAgent.__init__ already stores config on self.
        BaseAgent.__init__(self, config)
        self.task = config.task_fn()
        self.network = config.network_fn()
        self.target_network = config.network_fn()
        # Start the target network as an exact copy of the online network.
        self.target_network.load_state_dict(self.network.state_dict())
        self.replay = config.replay_fn()
        self.random_process = config.random_process_fn()
        self.total_steps = 0
        self.state = None  # set lazily on the first call to step()

    def soft_update(self, target, src):
        """Blend ``src`` parameters into ``target`` with weight ``config.target_network_mix``."""
        mix = self.config.target_network_mix
        for target_param, param in zip(target.parameters(), src.parameters()):
            target_param.detach_()
            target_param.copy_(target_param * (1.0 - mix) + param * mix)

    def eval_step(self, state):
        """Return the noise-free action for ``state`` as a numpy array.

        The state normalizer is switched to read-only for the duration of the
        call and restored afterwards, even if the forward pass raises.
        """
        normalizer = self.config.state_normalizer
        normalizer.set_read_only()
        try:
            action = self.network(normalizer(state))
        finally:
            normalizer.unset_read_only()
        return to_np(action)

    def step(self):
        """Take one environment step, store the transitions, then learn once.

        Learning starts when the replay buffer holds at least
        ``config.warm_up`` transitions. The previous random-action warm-up
        branch was guarded by ``total_steps < 0`` and could never run, so it
        has been removed; actions always come from the network plus noise.
        """
        config = self.config
        if self.state is None:
            self.random_process.reset_states()
            self.state = self.task.reset()
            self.state = config.state_normalizer(self.state)

        action = to_np(self.network(self.state))
        action += self.random_process.sample()
        action = np.clip(action, -1, 1)

        next_state, reward, done, info = self.task.step(action)
        next_state = config.state_normalizer(next_state)
        self.record_online_return(info)
        reward = config.reward_normalizer(reward)

        # One transition per agent: zip iterates over the leading axis.
        experiences = list(zip(self.state, action, reward, next_state, done))
        self.replay.feed_batch(experiences)

        if np.any(done):
            self.random_process.reset_states()

        self.state = next_state
        self.total_steps += 1

        if self.replay.size() >= config.warm_up:
            self._learn(self.replay.sample())

    def _learn(self, experiences):
        """Run one critic update, one actor update and one soft target update."""
        config = self.config
        states, actions, rewards, next_states, terminals = experiences
        states = tensor(states)
        actions = tensor(actions)
        rewards = tensor(rewards).unsqueeze(-1)
        next_states = tensor(next_states)
        # mask is 0 for terminal transitions so they do not bootstrap.
        mask = tensor(1 - terminals).unsqueeze(-1)

        # Critic target: r + discount * mask * Q_target(s', actor_target(s')).
        phi_next = self.target_network.feature(next_states)
        a_next = self.target_network.actor(phi_next)
        q_next = self.target_network.critic(phi_next, a_next)
        q_next = config.discount * mask * q_next
        q_next.add_(rewards)
        q_next = q_next.detach()
        phi = self.network.feature(states)
        q = self.network.critic(phi, actions)
        critic_loss = (q - q_next).pow(2).mul(0.5).sum(-1).mean()

        self.network.zero_grad()
        critic_loss.backward()
        self.network.critic_opt.step()

        # Actor loss: negative critic value of the actor's own action. The
        # features fed to the critic are detached.
        phi = self.network.feature(states)
        action = self.network.actor(phi)
        policy_loss = -self.network.critic(phi.detach(), action).mean()

        self.network.zero_grad()
        policy_loss.backward()
        self.network.actor_opt.step()

        self.soft_update(self.target_network, self.network)

In [3]:
def run_steps_custom(agent, save_interval=200):
    """Drive ``agent`` until ``config.max_steps`` steps, logging, evaluating and saving.

    Args:
        agent: object exposing ``config``, ``total_steps``, ``rewards``,
            ``rewards_deque``, ``logger``, ``step()``, ``switch_task()``,
            ``eval_episodes()`` and ``save(path)``.
        save_interval: write a checkpoint each time the number of finished
            training episodes reaches a new multiple of this value. A falsy
            value disables periodic checkpoints.

    Returns:
        ``(True, rewards_deque, rewards_all)`` once ``agent.total_steps``
        reaches ``config.max_steps``. ``rewards_deque`` is
        ``agent.rewards_deque`` and ``rewards_all`` is ``agent.rewards`` (an
        empty list if no episode finished). If ``config.max_steps`` is falsy
        the loop runs until interrupted.
    """
    config = agent.config
    agent_name = agent.__class__.__name__
    t0 = time.time()
    last_saved_episode = 0
    while True:
        rewards = agent.rewards
        rewards_deque = agent.rewards_deque

        if config.log_interval and not agent.total_steps % config.log_interval and (rewards is not None) and (rewards_deque is not None):
            elapsed = time.time() - t0
            # Guard against a zero-length interval on coarse clocks.
            steps_per_second = config.log_interval / elapsed if elapsed > 0 else float('inf')
            agent.logger.info('Episode %d,last %d episodes, mean rewards  %.2f,  steps %d, %.2f steps/s' % (len(rewards),len(rewards_deque),np.mean(rewards_deque),agent.total_steps, steps_per_second))
            t0 = time.time()

        # Stop at the step budget so the caller gets the tuple it unpacks. The
        # task is left open because the Unity environment is a module-level
        # object that later cells may still use.
        if config.max_steps and agent.total_steps >= config.max_steps:
            return True, rewards_deque, (rewards if rewards is not None else [])

        if config.eval_interval and not agent.total_steps % config.eval_interval:
            agent.eval_episodes()

        # Periodic checkpoint. The old test used a list that was never filled,
        # so it never fired. Remember the last saved episode count so one
        # multiple does not save on every step of the following episode.
        if save_interval and rewards is not None:
            episodes = len(rewards)
            if episodes and episodes % save_interval == 0 and episodes != last_saved_episode:
                agent.save('./data/model-%s.bin' % (agent_name))
                last_saved_episode = episodes

        agent.step()
        agent.switch_task()

class CrawlerTask():
    """Adapter exposing the module-level Unity Crawler ``env`` through reset()/step().

    Every instance wraps the same global ``env`` (and uses the global ``brain``
    and ``brain_name``), so two instances share one simulation. Besides
    stepping the environment it keeps per-episode reward statistics and
    publishes them through the ``info`` dict returned by ``step``.
    """

    # Episodic return recorded for an agent whose accumulated reward became NaN.
    NAN_EPISODE_RETURN = -5

    def __init__(self):
        self.name = 'Crawler'
        self.env = env
        self.action_dim = brain.vector_action_space_size
        self.state_dim = brain.vector_observation_space_size
        # Per-agent reward sum of the running episode; sized from the first
        # reward vector in step() instead of assuming 12 agents.
        self.total_rewards = None
        self.rewards = []                       # mean return of every finished episode
        self.rewards_deque = deque(maxlen=100)  # mean return of the last 100 episodes
        # 'all_rewards' stays None until the first episode finishes; the other
        # keys are present from the start so consumers can index them safely.
        self.info = {
            'all_rewards': None,
            'rewards_deque': self.rewards_deque,
            'episodic_return': None,
        }

    def reset(self):
        """Reset the Unity environment in training mode and return the observations."""
        env_info = self.env.reset(train_mode=True)[brain_name]
        return np.array(env_info.vector_observations)

    def step(self, action):
        """Apply ``action`` (clipped to [-1, 1]) and return ``(next_state, reward, done, info)``.

        When any agent reports done, the episode statistics are recorded, the
        whole environment is reset, and the post-reset observations are
        returned as ``next_state``.

        NaN step rewards are returned as 0.0 so they cannot reach the replay
        buffer. The episode total still records ``NAN_EPISODE_RETURN`` for any
        agent that produced a NaN.

        NOTE(review): the reset replaces ``next_state`` for every agent while
        ``done`` is returned unchanged, so agents that were not done get a
        post-reset ``next_state`` with ``done == False``. Confirm this is
        acceptable for learning.
        """
        action = np.clip(action, -1, 1)
        env_info = self.env.step(action)[brain_name]
        next_state = env_info.vector_observations   # next state
        raw_reward = np.asarray(env_info.rewards, dtype=np.float64)
        done = env_info.local_done

        if self.total_rewards is None or self.total_rewards.shape != raw_reward.shape:
            self.total_rewards = np.zeros_like(raw_reward)
        # NaN is deliberately allowed into the running total so the affected
        # agent can be penalised when the episode ends.
        self.total_rewards += raw_reward

        if np.any(done):
            self.total_rewards[np.isnan(self.total_rewards)] = self.NAN_EPISODE_RETURN
            self.info['episodic_return'] = self.total_rewards
            self.rewards_deque.append(np.mean(self.total_rewards))
            self.rewards.append(np.mean(self.total_rewards))
            self.info['all_rewards'] = self.rewards
            self.info['rewards_deque'] = self.rewards_deque

            # Rebind rather than zero in place: info['episodic_return'] still
            # refers to the finished episode's array.
            self.total_rewards = np.zeros_like(raw_reward)
            next_state = self.reset()
        else:
            self.info['rewards_deque'] = self.rewards_deque
            self.info['episodic_return'] = None

        # Never hand NaN rewards to the learner.
        reward = np.where(np.isnan(raw_reward), 0.0, raw_reward)
        return np.array(next_state), reward, np.array(done), self.info

    def seed(self, random_seed):
        """Ignore ``random_seed`` and return the constant 10."""
        return 10
    
class FCBody(nn.Module):
    """Stack of fully connected layers, each followed by the ``gate`` activation.

    Args:
        state_dim: size of the input feature vector.
        hidden_units: tuple with the output width of each linear layer.
        gate: activation applied after every linear layer.

    ``feature_dim`` is the width of the last layer. A ``BatchNorm1d`` module is
    registered as ``bn`` but is not applied in ``forward``.
    """

    def __init__(self, state_dim, hidden_units=(64, 64), gate=F.relu):
        super().__init__()
        widths = (state_dim,) + hidden_units
        linear_layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            linear_layers.append(layer_init(nn.Linear(n_in, n_out)))
        self.layers = nn.ModuleList(linear_layers)
        self.gate = gate
        self.feature_dim = widths[-1]
        self.bn = nn.BatchNorm1d(state_dim)

    def forward(self, x):
        """Pass ``x`` through every linear layer and activation in order."""
        hidden = x
        for fc in self.layers:
            hidden = self.gate(fc(hidden))
        return hidden

class TwoLayerFCBodyWithAction(nn.Module):
    """Two-layer body whose second layer also receives the action.

    Args:
        state_dim: size of the state input.
        action_dim: size of the action input.
        hidden_units: pair ``(first_width, second_width)``.
        gate: activation applied after both linear layers.

    ``feature_dim`` is the second layer's width. A ``BatchNorm1d`` module is
    registered as ``bn`` but is not applied in ``forward``.
    """

    def __init__(self, state_dim, action_dim, hidden_units=(64, 64), gate=F.relu):
        super().__init__()
        first_width, second_width = hidden_units
        self.fc1 = layer_init(nn.Linear(state_dim, first_width))
        self.fc2 = layer_init(nn.Linear(first_width + action_dim, second_width))
        self.gate = gate
        self.feature_dim = second_width
        self.bn = nn.BatchNorm1d(state_dim)

    def forward(self, x, action):
        """Encode ``x``, concatenate ``action`` along dim 1, and encode the result."""
        state_features = self.gate(self.fc1(x))
        joint_input = torch.cat([state_features, action], dim=1)
        return self.gate(self.fc2(joint_input))
    
    

def ddpg_continuous(**kwargs):
    """Configure a DDPG agent for the Crawler task and hand it to the training loop.

    Args:
        **kwargs: merged into the config before the settings below are
            assigned. NOTE(review): assuming ``merge`` sets attributes, any
            key that is also assigned below is overwritten — confirm.

    Returns:
        Whatever ``run_steps_custom(agent)`` returns.

    NOTE(review): ``config.eval_env`` and the agent's training task are both
    ``CrawlerTask`` instances, which wrap the same module-level Unity ``env``.
    Resetting one for evaluation therefore resets the simulation used for
    training.
    """
    config = Config()
    config.merge(kwargs)

    # Task and evaluation environment.
    config.task_fn = lambda: CrawlerTask()
    config.eval_env = config.task_fn()

    # Schedule: total budget, evaluation cadence and logging cadence (in steps).
    config.max_steps = int(1e6)
    config.eval_interval = int(1e4)
    config.eval_episodes = 100
    config.log_interval = 2000

    # Actor and critic bodies use 400 and 300 hidden units with leaky ReLU.
    # The critic's learning rate is ten times the actor's.
    config.network_fn = lambda: DeterministicActorCriticNet(
        config.state_dim, config.action_dim,
        actor_body=FCBody(config.state_dim, (400, 300), gate=F.leaky_relu),
        critic_body=TwoLayerFCBodyWithAction(
            config.state_dim, config.action_dim, (400, 300), gate=F.leaky_relu),
        actor_opt_fn=lambda params: torch.optim.Adam(params, lr=1e-4),
        critic_opt_fn=lambda params: torch.optim.Adam(params, lr=1e-3))

    # Replay, discounting, exploration noise and target-network blending.
    config.replay_fn = lambda: Replay(memory_size=int(1e6), batch_size=64)
    config.discount = 0.95
    config.random_process_fn = lambda: OrnsteinUhlenbeckProcess(
        size=(config.action_dim,), std=LinearSchedule(0.2))
    config.warm_up = 100  # replay size required before learning starts
    config.target_network_mix = 1e-3
    config.state_normalizer = MeanStdNormalizer()

    agent = DDPGAgent(config)
    return run_steps_custom(agent)

# Start training. This call blocks for the whole run, and the three names are
# bound only if ddpg_continuous() returns. The recorded output below ends in a
# KeyboardInterrupt, so in that run they were never assigned.
success, rewards_deque, rewards_all = ddpg_continuous()

[46.36691351 60.94302569 53.68047563 62.80065738 60.94780986 46.41073941
 53.64643728 59.26645052 53.64720165 51.92119759 46.40961678 53.64566409]
[46.36835261 60.94152983 53.66961987 62.80444224 60.94746152 46.40976042
 53.64644056 59.2430989  53.64355861 51.92580052 46.41207995 53.64669492]
[46.36914111 60.9391034  53.67124568 62.8001318  60.94200714 46.41203495
 53.64643728 59.26406856 53.64119779 51.92351066 46.41301041 53.64262676]
[46.36843417 60.94264391 53.66961987 62.80729213 60.94746152 46.4124224
 53.64326122 59.24886562 53.64091565 51.92781532 46.41207995 53.64544154]
[46.36564573 60.94007856 53.67644963 62.80327294 60.94178956 46.40742585
 53.64541854 59.26538642 53.6409972  51.92091964 46.40982031 53.64721798]
[46.36810303 60.94402548 53.68094024 62.80424288 60.94311022 46.40686226
 53.64634162 59.26107077 53.6385817  51.92292935 46.41295648 53.64810141]
[46.3672579  60.9396341  53.66669454 62.80327294 60.9437475  46.40737394
 53.6483718  59.26538642 53.64390635 51.921197

KeyboardInterrupt: 