# HER robotic problem ~ Reacher 20 Arms ( UnityML ) ~ experimental WIP
- *Wheeler task definition ( task wrapper, State decoder settings, NeuralNetwork, ReplayBuffer, .. )*

### Import generics

In [1]:
import os, time

os.chdir("..")

import numpy as np
import toml, gym

import torch
from torch.multiprocessing import Queue, Process

### Prepare statistic helpers

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

def print_stats(scores, title):
    """Plot `scores` against their 1-based position and show the figure.

    The y axis is labelled 'Rewards'; `title` is used as the x-axis label.
    """
    positions = np.arange(1, len(scores) + 1)
    figure = plt.figure()
    figure.add_subplot(111)
    plt.plot(positions, scores)
    plt.ylabel('Rewards')
    plt.xlabel(title)
    plt.show()

### Load task configs ~ these should be adapted offline for the particular task

In [3]:
# Read both TOML configs. The context managers close the file handles
# deterministically instead of leaving that to the garbage collector.
with open('cfg.toml') as cfg_file:
    CFG = toml.loads(cfg_file.read())
with open('gym.toml') as gym_file:
    GYM_CFG = toml.loads(gym_file.read())

torch.set_default_tensor_type(CFG['tensor'])
print(CFG['task'])
# Override the task name read from cfg.toml with the path of the Reacher binary.
CFG['task'] = "data/Reacher.x86_64"

MountainCarContinuous-v0


### Import wheeler environment and particular utils we want to use ~ general ones ( shared across tasks )

In [4]:
from utils.task import Task
from utils.taskinfo import *

from utils.rbf import *
from utils.normalizer import *

from utils.taskmgr import *
from utils.replay import *

from utils.fastmem import Memory

from utils.curiosity import *

from utils.unity import unity_factory

from agent.zer0bot import agent_launch

### Define Task wrapper ~ when the goal is met, how to step ( update the reward function, .. ), when / how to reset

In [5]:
# Distance threshold shared by the helpers below: fun_reward compares the goal
# distance against 2 * CLOSE_ENOUGH, and sample_goal perturbs goals by a radius
# smaller than CLOSE_ENOUGH.
CLOSE_ENOUGH = 1.25

def extract_goal(state):
    """Return the slice `state[-7:-4]`, i.e. the goal part of a state.

    The slice is anchored at the end of `state`; per the original author's
    note this keeps working when states are stacked, because the latest
    state is the one being read (assumes the latest state sits at the end —
    TODO confirm against the encoder).
    """
    goal_slice = slice(-7, -4)
    return state[goal_slice]

def goal_distance(goal_a, goal_b):
    """Euclidean distance between two goals, taken over the last axis.

    `goal_b` is reshaped to the shape of `goal_a` before subtracting, so both
    must hold the same number of elements.
    """
    aligned_b = goal_b.reshape(goal_a.shape)
    delta = goal_a - aligned_b
    return np.linalg.norm(delta, axis=-1)

def fun_reward(s, n, goal, objective_id, cfg, her):
    """Sparse reward for the full 3D navigation task.

    The goal part of state `s` is compared with `goal`: the result is 0 where
    their distance is at most 2 * CLOSE_ENOUGH and -1 where it is larger
    (a numpy integer value obtained by negating the boolean comparison).

    `n`, `objective_id`, `cfg` and `her` are kept for call-site compatibility
    and are not used. Only the full-task reward is implemented here; the
    experimental per-axis ( objective_id % 4 ) shaped variant sat behind an
    unconditional return, could never execute, and is therefore not part of
    this function.
    """
    # A norm is never negative, so no extra np.abs is needed before comparing.
    distance = goal_distance(extract_goal(s), goal)
    return -1 * (distance > 2 * CLOSE_ENOUGH)

def sample_goal(cfg, trajectory_goal, trajectory_state):
    """Sample a relabelled goal near either the achieved position or the trajectory goal.

    Up to three attempts are made. Each attempt picks a random radius below
    CLOSE_ENOUGH and a random angle, shifts two distinct randomly chosen
    coordinates ( out of cfg['her_state_size'] ) of the chosen target by the
    cosine / sine offsets, and returns `(goal, target)` as soon as the shifted
    goal lies within CLOSE_ENOUGH of the target.

    If no attempt is accepted, a pair is returned whose two entries are each
    independently either the achieved position or `trajectory_goal`.

    NOTE(review): `random` is not imported in the visible cells — presumably
    it comes in through one of the star imports; confirm which module it is,
    since that decides how often each branch below is taken.
    """
    n_dims = cfg['her_state_size']
    achieved = extract_goal(trajectory_state.copy())
    for _ in range(3):  # be careful: extremely expensive
        radius = np.abs(np.random.rand() * CLOSE_ENOUGH)
        angle = np.random.rand() * np.pi * 2
        offset_a = np.cos(angle) * radius
        offset_b = np.sin(angle) * radius
        axes = np.random.choice(n_dims, 2, p=[1/n_dims]*n_dims, replace=False)

        if random.randint(0, 5):
            target = achieved
        else:
            target = trajectory_goal
        goal = target.copy()
        goal[axes[0]] += offset_a
        goal[axes[1]] += offset_b

        if np.abs(goal_distance(goal, target)) < CLOSE_ENOUGH:
            return goal, target

    # Fallback: both entries are drawn independently, in this order.
    first = achieved if random.randint(0, 3) else trajectory_goal
    second = achieved if random.randint(0, 3) else trajectory_goal
    return (first, second)

def goal_select(total_after, n_step):
    """Pick an offset ( in steps ahead of the current one ) used for goal relabelling.

    Returns 0 without drawing any random number when at most `n_step + 1`
    steps remain. Otherwise it randomly returns an offset within the next
    `n_step` steps, or 0, or an offset in `1 .. total_after - 1 - n_step`.

    NOTE(review): `random` is not imported in the visible cells — presumably
    provided by a star import; the branch probabilities depend on which
    `randint` this is ( inclusive vs exclusive upper bound ), so confirm.
    """
    if total_after <= n_step + 1:  # only the last n_step states remain
        return 0

    pick_near = random.randint(0, 2)
    if pick_near:
        return random.randint(1, n_step)

    pick_zero = random.randint(0, 3)
    if pick_zero == 0:
        return 0

    return random.randint(1, total_after - 1 - n_step)

In [6]:
class GymTask(Task):
    """Task wrapper: custom reset / step / goal handling on top of `Task`."""

    def reset(self, seed = None, test = False):
        """Reset the environment and store the first returned state as goal source.

        `seed` is accepted but not used. In test mode the state is returned
        unchanged; otherwise it is flattened and wrapped in a one-element list.
        """
        env_params = {"goal_size":5., "goal_speed":0.}
        initial = super().reset(env_params, test)[0]
        self.goal_ = initial

        if test:
            return initial # we will get array of states

        return [ initial.reshape(-1) ]

    def step_ex(self, action, test = False):
        """Step the environment; outside test mode the reward is recomputed via fun_reward."""
        state, done, env_reward = self.env.step(self.bot_id, self.objective_id, action)
        self.goal_ = state.copy()

        if test:
            reward = env_reward
        else:
            reward = fun_reward(state, None, self.goal(0), self.objective_id, self.cfg, False)

        return action, state, reward, done, True

    def goal_met(self, states, rewards, n_steps):
        """True once the summed absolute rewards exceed 30."""
        magnitude = sum(map(abs, rewards))
        return magnitude > 30

    def goal(self, ind = 0):
        """Return the goal part of the stored state as a (1, -1) array.

        When the stored state has 20 entries along its first axis, entry
        `ind` is used ( presumably one row per arm — TODO confirm ); otherwise
        the whole stored state is flattened and used.
        """
        if self.goal_.shape[0] == 20:
            source = self.goal_[ind]
        else:
            source = self.goal_
        return extract_goal(source.reshape(-1)).reshape(1, -1)

    def update_goal(self, _, goals, states, n_goals, n_states, updates):
        """Yield `(reward, goal, state, next_goal, next_state)` per transition.

        Where the matching `updates` flag is truthy, the goal pair is replaced
        by one sampled from a later step of the same trajectory; the reward is
        always recomputed with fun_reward against the ( possibly new ) goal.
        """
        transitions = zip(goals, states, n_goals, n_states, updates)
        for idx, (goal, state, next_goal, next_state, relabel) in enumerate(transitions):
            if relabel:
                offset = goal_select(len(states) - idx, self.cfg['n_step'])
                picked = idx + offset
                goal, next_goal = sample_goal(self.cfg, goals[picked], states[picked])

            reward = fun_reward(state, None, goal, self.objective_id, self.cfg, True)
            yield (reward, goal, state, next_goal, next_state)

### Generic proxy for creating our Task ( mainly for multiprocess environments )
- but it can also wrap function-approximator outputs ( e.g. squash action values with tanh, sigmoid, .. ) - this does not work well with PPO yet

In [7]:
class GymInfo(TaskInfo):
    """TaskInfo specialisation that builds GymTask instances."""

    def __init__(self, replaybuf, factory, Mgr, args):
        # 33 / 4 match the observation and action sizes reported by the Unity
        # brain; -1 / +1 are presumably the action bounds ( cf. action_low /
        # action_high used in new ) — TODO confirm against TaskInfo.
        sizes = (33, 4)
        bounds = (-1, +1)
        super().__init__(
                *sizes, *bounds,
                CFG,
                replaybuf,
                factory, Mgr, args)

    def new(self, cfg, bot_id, objective_id):
        """Create a GymTask; it is reset immediately when `objective_id` is -1."""
        task = GymTask(
                cfg, self.env,
                objective_id, bot_id,
                self.action_low, self.action_high)
        if objective_id == -1:
            task.reset()
        return task

### Implement callback for testing the policy ~ every X training rounds we want to test it ~ enable visuals if you want

In [8]:
def context_callback(count, print_every):
    """Build the policy-testing callback.

    The call counter and collected test scores live as attributes on
    `context_callback` itself ( `z` and `scores` ) and are reset each time
    this factory is called.

    The returned `callback(task, agent, scores)` prints a summary every
    `print_every` calls, then runs up to `count` test episodes. It returns
    None as soon as an episode reports not done; if all of them are done it
    prints a banner and returns the accumulated test scores.
    """
    context_callback.z = 0
    context_callback.scores = []

    def callback(task, agent, scores):
        shared = context_callback
        shared.z += 1
        if shared.z % print_every == 0:
            recent_mean = np.mean(shared.scores[-print_every:])
            total = sum(scores)
            nonzero = sum(r != 0 for r in scores)
            print("\nTEST review : ", recent_mean, "LAST stats :", total, nonzero, len(scores))
        # we could save scores to the main queue and average them, or ..
        # run testing w/ visuals :
        for _ in range(count):
            done, states, rewards = task.test_policy(agent)
            shared.scores.append(sum(rewards))
            if not done:
                return None

        banner = "="*80
        print("\n")
        print(banner)
        print("training over", shared.z * GYM_CFG['n_simulations'] * GYM_CFG['mcts_rounds'])
        print(banner)

        return shared.scores

    return callback

### Prepare neural network which we will be using

In [9]:
from models import ddpg_model, noisy_model, state_action_model

def CriticNN(state_size, action_size, wrap_value, cfg):
    """Build the critic from state_action_model with fcs1_units=256 and fc2_units=128."""
    layer_sizes = {"fcs1_units": 256, "fc2_units": 128}
    return state_action_model.Critic(state_size, action_size, wrap_value, cfg, **layer_sizes)

def ActorNN(state_size, action_size, wrap_action, cfg):
    """Build the noisy actor with `hiddens=[400, 300]`.

    A smaller `hiddens=[128, 64]` configuration is the alternative to try;
    it used to sit in a second, unreachable return statement.
    """
    return noisy_model.Actor(state_size, action_size, wrap_action, cfg, hiddens=[400, 300])

### Select encoders

In [10]:
from utils.encoders import *
from utils.rnn import *#GRUEncoder

def encoderstack():
    """Return the state encoder: a GlobalNormalizer built from GYM_CFG and 33.

    33 matches the per-agent observation size reported by the Unity brain
    ( presumably the normalizer's input width — TODO confirm ).

    NOTE: the GRU variant ( GRUEncoder wrapped with the normalizer in a
    StackedEncoder, together with GYM_CFG overrides max_n_episode=70 and
    batch_size=16 ) sat after the return and never executed, so it is not
    part of this function; reintroduce it explicitly when gae + rnn is wanted.
    """
    return GlobalNormalizer(GYM_CFG, 33)

### Cook Task : replay buffer ( fast / prio-gae-rnn ) + task manager ( local / remote / unity )

In [11]:
def taskfactory():
    """Assemble the GymInfo: ReplayBuffer + unity environment factory + remote task manager.

    `Memory` ( utils.fastmem ) is the alternative buffer that can be passed
    instead of `ReplayBuffer`.
    """
    env_factory = unity_factory(CFG, CFG['total_simulations'])
    manager_args = (LocalTaskManager, 1 + GYM_CFG['n_simulations'])
    return GymInfo(ReplayBuffer, env_factory, RemoteTaskManager, manager_args)

### Glue it all together ~ select buffer, encoders, agents, ... and RUN!!

In [None]:
class GoalEncoder(IEncoder):
    """Encoder that passes goals through a single bias-free linear layer."""
    def __init__(self, cfg, size):
        super().__init__(cfg)
        self.size = size
        # Linear map from `size` inputs to cfg['her_state_features'] outputs, no bias.
        self.net = torch.nn.Linear(size, cfg['her_state_features'], bias=False)
    def out_size(self):
        # Not meant to be called. Raised explicitly because a bare `assert False`
        # is stripped under `python -O`, which would silently return None.
        raise AssertionError("...")
    def forward(self, goals, _):
        # The second argument is ignored; only `goals` is projected.
        return self.net(goals)

def main():
    """Wire encoder, task factory, test callback and agents together, then plot the scores.

    Blocks on the queue until the launched agent side puts the final scores on it.
    """
    print(CFG)

    encoder = encoderstack()
    factory = taskfactory()
    test_task = factory.new(GYM_CFG, 0, -1)

    test_callback = context_callback(10, 10)

    def run_test(agent, stop_q):
        # Bind the locally created task so the callee only has to supply the rest.
        return test_callback(test_task, agent, stop_q)

    result_q = Queue()
    goal_encoder = GoalEncoder(GYM_CFG, GYM_CFG['her_state_size'])
    agent_launch(0, GYM_CFG, factory, encoder, ActorNN, CriticNN, result_q, run_test, goal_encoder)

    scores = result_q.get()
    print("FINISHED!")
    print_stats(scores, "learning algorithm")

if __name__ == '__main__':
    main()

{'tensor': 'torch.DoubleTensor', 'task': 'data/Reacher.x86_64', 'total_simulations': 20, 'cross_exp_size': 5000, 'max_reward_val': 1000, 'min_reward_val': -1000}


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 
INFO:unityagents:
Academy Reset with parameters : 	goal_size -> 5.0, goal_speed -> 0.0


UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153
UNITY SERVER PROCESS PING ~ running at 3153


INFO:unityagents:
Academy Reset with parameters : 	goal_size -> 5.0, goal_speed -> 0.0


[0>   1::     0] training =  0, steps =  36, max_step = 10000, reward=0.000000 ::[[ 1.          1.         -0.77776737 -1.        ]]: 