# Continuous Control

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
from unityagents import UnityEnvironment
import copy

import numpy as np
import random
import torch
from collections import deque
import matplotlib.pyplot as plt
#from ddpg_agent import Agent
import os
import pickle
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from mqtt_writer import SummaryWriter

In [2]:
# Launch the Unity Reacher environment and grab a handle on its (single) brain.
# The commented-out lines are alternative builds that were used at other times;
# swap the active line to switch build.
#env = UnityEnvironment(file_name='one_agent_vis/Reacher_Linux/Reacher.x86_64')
#env = UnityEnvironment(file_name='20_agents/Reacher_Linux/Reacher.x86_64')
#env = UnityEnvironment(file_name='20_agents/Reacher_Linux_NoVis/Reacher.x86_64')
#env = UnityEnvironment(file_name='one_agent/Reacher_Linux_NoVis/Reacher.x86_64')
env = UnityEnvironment(file_name='./Reacher_Linux_NoVis/Reacher.x86_64')
# Only the first brain exposed by the environment is used throughout the notebook.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# Initial reset; the returned info is used below to read state/action sizes and agent count.
env_info = env.reset(train_mode=True)[brain_name]
# Metrics/log sink from the project-local mqtt_writer module.
# NOTE(review): presumably mirrors the tensorboard SummaryWriter interface - confirm.
writer = SummaryWriter(comment="-reacher-ga")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Number of agents simulated in parallel by the loaded environment build.
NUM_AGENTS = len(env_info.agents)

# Summary of the environment dimensions and a few settings, printed for reference.
# NOTE(review): 'hidden_size' and 'sigma' differ from HIDDEN_LAYER / NOISE_STD
# defined further down, and this dict is only printed in the visible cells -
# confirm which values are authoritative.
config = dict(
    state_size=env_info.vector_observations.shape[1],
    action_size=brain.vector_action_space_size,
    number_of_agents=len(env_info.agents),
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    hidden_size=256,
    sigma=0.05,
)
print(config)

{'state_size': 33, 'action_size': 4, 'number_of_agents': 20, 'device': device(type='cpu'), 'hidden_size': 256, 'sigma': 0.05}


### 2. Set Parameters



In [4]:
# --- Environment-derived values -------------------------------------------
BRAIN_NAME = env.brain_names[0]                      # key used to index env.reset()/env.step() results
STATE_SIZE = env_info.vector_observations.shape[1]   # observation features per agent
ACTION_SIZE = brain.vector_action_space_size         # action components per agent
NUM_AGENTS = len(env_info.agents)                    # agents stepped in parallel

# --- Genetic-algorithm hyperparameters ------------------------------------
POPULATION_SIZE = 25                     # networks evaluated per generation
PARENTS_COUNT = POPULATION_SIZE // 5     # top-scoring networks allowed to reproduce
NOISE_STD = 0.01                         # scale of the Gaussian noise added on mutation
MAX_STEPS = 1000                         # cap on environment steps per evaluation
HIDDEN_LAYER = 128                       # default hidden-layer width of Net


In [5]:
class Net(nn.Module):
    """Deterministic policy network mapping observations to actions.

    Three fully connected layers with tanh activations. The final tanh keeps
    every action component in [-1, 1].

    Args:
        obs_size: Number of features in one observation vector.
        act_size: Number of action components produced per observation.
        hid_size: Width of both hidden layers.
    """

    def __init__(self, obs_size, act_size, hid_size=HIDDEN_LAYER):
        super().__init__()

        self.mu = nn.Sequential(
            nn.Linear(obs_size, hid_size),
            nn.Tanh(),
            nn.Linear(hid_size, hid_size),
            nn.Tanh(),
            nn.Linear(hid_size, act_size),
            nn.Tanh(),
        )
        # Bookkeeping label for telling population members apart; it is not
        # used by the forward pass.
        self.netId = 0

    def forward(self, x):
        """Return the actions for observation(s) `x`.

        Args:
            x: Observations as a tensor, NumPy array or nested sequence whose
                last dimension is `obs_size`.

        Returns:
            Tensor of actions with last dimension `act_size`, values in [-1, 1].
        """
        # as_tensor converts array-likes to float32 and hands back an existing
        # float32 tensor unchanged, replacing the legacy torch.Tensor(...)
        # constructor.
        x = torch.as_tensor(x, dtype=torch.float32)
        return self.mu(x)

    def setNetId(self, netId):
        """Store the identifier assigned to this network."""
        self.netId = netId

    def getNetId(self):
        """Return the identifier previously stored with setNetId (0 by default)."""
        return self.netId

In [10]:
def evaluate(env, net):
    """Run one episode with `net` controlling every agent and return its score.

    The episode stops as soon as any agent reports done or after MAX_STEPS
    environment steps, whichever comes first.

    Reads the module-level `writer` and `step_count` to log a "step" scalar
    once the episode has finished.

    Args:
        env: Unity environment; must expose `reset(train_mode=...)` and
            `step(actions)` returning a mapping indexed by BRAIN_NAME.
        net: Callable mapping a batch of observations to a tensor of actions.

    Returns:
        Mean over the NUM_AGENTS agents of the reward accumulated in the episode.
    """
    env_info = env.reset(train_mode=True)[BRAIN_NAME]
    states = env_info.vector_observations
    scores = np.zeros(NUM_AGENTS)
    steps = 0
    # The genetic algorithm never backpropagates, so skip autograd bookkeeping
    # for the whole rollout.
    with torch.no_grad():
        while True:
            actions = net(torch.as_tensor(states, dtype=torch.float32))
            env_info = env.step(actions.detach().cpu().numpy())[BRAIN_NAME]
            steps += 1
            scores += env_info.rewards
            states = env_info.vector_observations

            if steps >= MAX_STEPS or np.any(env_info.local_done):
                break
    writer.add_scalar("step", step_count, step_count)
    return np.mean(scores)

In [7]:
def mutate_parent(net, noise_std=None):
    """Return a mutated copy of `net`; the parent itself is left untouched.

    Every parameter of the copy receives independent Gaussian noise drawn from
    NumPy's global random generator and scaled by `noise_std`.

    Args:
        net: The `torch.nn.Module` to copy and perturb.
        noise_std: Scale applied to the standard-normal noise. Defaults to the
            module-level NOISE_STD when omitted.

    Returns:
        A deep copy of `net` with perturbed parameters.
    """
    if noise_std is None:
        noise_std = NOISE_STD
    new_net = copy.deepcopy(net)
    # Parameters are edited in place, so keep autograd out of the picture.
    with torch.no_grad():
        for p in new_net.parameters():
            noise = np.random.normal(size=tuple(p.shape)).astype(np.float32)
            # Match the parameter's dtype and device so the in-place add also
            # works for networks that do not live on the CPU as float32.
            noise_t = torch.as_tensor(noise, dtype=p.dtype, device=p.device)
            p.add_(noise_std * noise_t)
    return new_net


In [8]:
# Starting point of the search: a network restored from a saved checkpoint.
# The hidden width comes from HIDDEN_LAYER so it stays in step with the
# checkpoints written by the training loop.
good_net = Net(STATE_SIZE, ACTION_SIZE, HIDDEN_LAYER)
# The network is never moved off the CPU in this notebook, so map the stored
# tensors to the CPU; this also lets a checkpoint saved on a GPU machine load
# on a CPU-only one.
good_net.load_state_dict(torch.load("models/best-ga.pth", map_location="cpu"))
# Module-level counter that evaluate() reads when logging its "step" scalar.
step_count = 1

In [12]:
# Baseline: one-episode score (mean over agents) of the loaded checkpoint before any mutation.
evaluate(env, good_net)

6.55349985351786

In [13]:
# Initial population: POPULATION_SIZE independently mutated copies of the
# loaded checkpoint, each tagged with its position in the list.
nets = [mutate_parent(good_net) for _ in range(POPULATION_SIZE)]

for net_id, candidate in enumerate(nets):
    candidate.setNetId(net_id)



In [None]:
#nets

In [14]:
# Score every member of the initial population. `step_count` is the
# module-level counter evaluate() logs, so it is advanced (1, 2, ...) before
# each evaluation by using it directly as the loop counter.
population = []
step_count = 0
for step_count, candidate in enumerate(nets, start=1):
    fitness = evaluate(env, candidate)
    population.append((candidate, fitness))

In [None]:
# Genetic-algorithm training loop.
#
# Each generation: rank the population by fitness, report statistics over the
# top PARENTS_COUNT members, checkpoint the best network when it has improved
# enough, then build the next population from the unchanged elite plus
# mutated copies of randomly chosen parents.
writer.reset()
timestamp = int(time.time())
log_msg = "%d New Run  Pop size: %d Noise: %.3f  Parents: %d" % (
    timestamp, POPULATION_SIZE, NOISE_STD, PARENTS_COUNT)
writer.log(log_msg)
gen_idx = 0
prev_reward = 0  # best score at the time of the last checkpoint
while True:
    population.sort(key=lambda p: p[1], reverse=True)
    rewards = [p[1] for p in population[:PARENTS_COUNT]]
    reward_mean = np.mean(rewards)
    reward_max = np.max(rewards)
    reward_std = np.std(rewards)

    writer.add_scalar("episode", gen_idx, gen_idx)
    writer.add_scalar("average", reward_mean, gen_idx)
    writer.add_scalar("reward_std", reward_std, gen_idx)
    writer.add_scalar("score", reward_max, gen_idx)
    msg = "%d: reward_mean=%.2f, reward_max=%.2f, reward_std=%.2f" % (
        gen_idx, reward_mean, reward_max, reward_std)
    writer.log(msg)
    print(msg)
    # Stop once the parents' average score exceeds 30.
    if reward_mean > 30:
        print("Solved in %d steps" % gen_idx)
        break
    # Checkpoint only when the best score beats the last saved one by more than 1.
    if reward_max > prev_reward + 1:
        best_net = population[0][0]
        torch.save(best_net.state_dict(), f"models/ga-{HIDDEN_LAYER}-{int(reward_max)}.pth")
        # Remember the score that was actually saved. Storing the parents'
        # mean here would keep the threshold below the elite's score and
        # re-save the same network every generation.
        prev_reward = reward_max

    # Generate the next population; the current best is carried over unchanged
    # together with its already-measured fitness.
    prev_population = population
    population = [population[0]]
    for _ in range(POPULATION_SIZE-1):
        parent_idx = np.random.randint(0, PARENTS_COUNT)
        parent = prev_population[parent_idx][0]
        net = mutate_parent(parent)
        # Advance the counter evaluate() logs, as the initial population
        # evaluation does; otherwise every evaluation here logs the same step.
        step_count += 1
        fitness = evaluate(env, net)
        population.append((net, fitness))
    gen_idx += 1


0: reward_mean=6.19, reward_max=6.33, reward_std=0.08
1: reward_mean=6.70, reward_max=7.83, reward_std=0.60
2: reward_mean=6.80, reward_max=7.83, reward_std=0.70
