# MountainCarContinuous-v0 environment of OpenAI Gym
- *Wheeler task definition ( task wrapper, State decoder settings, NeuralNetwork, ReplayBuffer, .. )*

### Import generics

In [1]:
import os, time

# Switch the working directory to the parent folder; the relative config
# paths opened later in this file ('cfg.toml', 'gym.toml') resolve against it.
# NOTE(review): presumably this is also what makes the `utils`, `agent` and
# `models` packages importable from here — confirm against the repo layout.
os.chdir("..")

import numpy as np
import toml, gym

import torch
from torch.multiprocessing import Queue, Process

### Load task configs ~ this should be adapted offline for the particular task

In [2]:
# Parse both configuration files. Context managers are used so the file
# handles are closed right after reading instead of lingering until
# garbage collection.
with open('cfg.toml') as cfg_file:
    CFG = toml.loads(cfg_file.read())
with open('gym.toml') as gym_cfg_file:
    GYM_CFG = toml.loads(gym_cfg_file.read())

# The default tensor type is taken from the shared config.
torch.set_default_tensor_type(CFG['tensor'])

# Override the task name in the in-memory config only; the file on disk
# is not modified.
CFG['task'] = "MountainCarContinuous-v0"

### Import wheeler environment and particular utils we want to use ~ general ones ( shared across tasks )

In [3]:
from utils.task import Task
from utils.taskinfo import *

from utils.rbf import *
from utils.normalizer import *

from utils.taskmgr import *
from utils.replay import *

from utils.fastmem import Memory

from utils.curiosity import *

from agent.zer0bot import agent_launch

### Define Task wrapper ~ when is goal met, how to step ( update rewards function, .. ), when / how to reset

In [None]:
class MCarTask(Task):
    """Task wrapper for the mountain-car environment.

    Adds on top of the base ``Task``:
      * a per-episode success counter (``reward``) and its history (``rewards``),
      * a shaped reward for non-terminal training steps,
      * the goal criterion used when testing the policy.
    """

    def __init__(self, cfg, env, objective_id, bot_id, action_low, action_high, rewarder):
        # The counters are created before the base constructor runs.
        # NOTE(review): presumably because the base constructor may call
        # reset(), which touches them — confirm against utils.task.Task.
        self.reward = 0
        self.rewards = []

        super().__init__(
                cfg,
                env,
                objective_id,
                bot_id,
                action_low, action_high)

        # Curiosity module handed in by the task factory. It is stored but
        # not consulted by the reward shaping currently active in step_ex().
        self.rewarder = rewarder

    def reset(self, seed = None, test = False):
        """Reset through the base class and roll the success counter over.

        The counter accumulated since the previous reset is appended to
        ``self.rewards`` and then zeroed. Returns whatever the base
        ``reset`` returned.
        """
        state = super().reset(seed, test)
        self.rewards.append(self.reward)
        self.reward = 0
        # First element of the returned state; kept for reference only,
        # nothing in this class reads it back.
        self.prev_state = state[0]
        return state

    def step_ex(self, action, test = False):
        """Advance the environment by one step.

        Returns the tuple ``(action, state, reward, done, True)``.
        In test mode the state is reshaped to a single row and the
        environment reward is passed through unmodified. In training mode
        the reward of every non-terminal step is shaped (see below).
        NOTE(review): the meaning of the trailing ``True`` is defined by
        the base class / caller — confirm there.
        """
        state, reward, done, _ = self.env.step(self.bot_id, self.objective_id, action)

        if test:
            return action, state.reshape(1, -1), reward, done, True

        # Count an episode as a success when it terminates with a positive reward.
        self.reward += (done and reward > 0)

        # NOTE: an experimental curiosity-based reward (built on
        # self.rewarder.weight / self.rewarder.update) used to be parked here
        # inside a string literal; it was never executed and referenced
        # undefined names, so it has been dropped from the code path.

        # Reward shaping for non-terminal steps: penalise by
        # 1 - |cos(pi/3) + state[0]|.
        # NOTE(review): presumably state[0] is the car position, making the
        # penalty largest near position -0.5 — confirm against the env.
        # (A disabled extra condition here was `sum(self.rewards) < 3`.)
        if not done:
            true_state = np.abs(np.cos(np.pi/3.) + state[0])
            reward += -(1. - true_state)

        return action, state, reward, done, True

    def goal_met(self, states, rewards, n_steps):
        """Report the summed test rewards and tell whether they exceed 90."""
        total = sum(rewards)
        print("TEST : ", total)
        return total > 90.

### Generic proxy for creating our Task ( mainly for multiprocess environments )
- it can also wrap the function approximator's outputs ( e.g. squash the action value with tanh, sigmoid, .. ) - this does not work well with PPO at the moment

In [None]:
class MCarInfo(TaskInfo):
    """Task description / factory proxy for the mountain-car task.

    Passes the state size (length of a freshly reset observation), an action
    size of 1 and the action bounds -1 / +1 to the base ``TaskInfo``, and
    owns the curiosity module shared with every task created through ``new``.
    """

    def __init__(self, env, replaybuf, factory, Mgr, args):
        super().__init__(
                len(env.reset()), 1, -1, +1,
                CFG,
                replaybuf,
                factory, Mgr, args)

        # NOTE(review): state_size, action_size, action_range and wrap_action
        # are expected to be provided by TaskInfo — confirm in utils.taskinfo.
        self.rewarder = CuriosityPrio(
                self.state_size, self.action_size,
                self.action_range, self.wrap_action, "cpu", GYM_CFG)

    def new(self, cfg, bot_id, objective_id):
        """Create a task wrapper for the given bot / objective pair."""
        return MCarTask(cfg,
                self.env,
                objective_id, bot_id,
                self.action_low, self.action_high,
                self.rewarder)

    @staticmethod
    def factory(ind): # bare metal task creation
        """Create a raw gym environment for the ``ind``-th task.

        The task name is re-read from 'cfg.toml' on every call rather than
        taken from the module-level CFG.
        """
        print("created %i-th task"%ind)
        # Context manager closes the config file; the local name is lower-case
        # so it no longer shadows the module-level CFG.
        with open('cfg.toml') as cfg_file:
            cfg = toml.loads(cfg_file.read())
        return gym.make(cfg['task'])

#    def wrap_action(self, x):
#        return torch.tanh(x)

### Implement callback for testing policy ~ per X training rounds, we want to test it ~ enable visuals if you want

In [None]:
def callback(task, agent, scores):
    try: callback.z += 1
    except: callback.z = 0
    
    # we can save scores to main queue, and avarage them, or we can ..
    # run testing w/ visuals :
    done = all(task.test_policy(agent)[0] for _ in range(10))
    if not done:
        return False

    print("\n")
    print("="*80)
    print("training over", callback.z * GYM_CFG['n_simulations'] * GYM_CFG['mcts_rounds'])
    print("="*80)

    for i in range(100): print("total steps : training : %i :: %i >"%(
        callback.z * GYM_CFG['mcts_rounds'] * GYM_CFG['n_simulations'],
        len(task.test_policy(agent)[2])))

    return True

### Prepare neural network which we will be using

In [None]:
from models import ddpg_model, noisy_model

def CriticNN(state_size, action_size, wrap_value, cfg):
    """Build the DDPG critic network with hidden layer widths 400 and 300."""
    layer_widths = {"fcs1_units": 400, "fc2_units": 300}
    return ddpg_model.Critic(
        state_size, action_size, wrap_value, cfg,
        **layer_widths)

def ActorNN(state_size, action_size, wrap_action, cfg):
    """Build the actor network: the noisy-layer actor with hidden sizes 256, 256, 128."""
    # The plain DDPG actor was previously listed after this return and could
    # never be reached. To switch, return instead:
    #   ddpg_model.Actor(state_size, action_size, wrap_action, cfg, fc1_units=400, fc2_units=300)
    return noisy_model.Actor(state_size, action_size, wrap_action, cfg, hiddens=[256, 256, 128])

### Select encoders

In [None]:
from utils.encoders import *
from utils.rnn import *#GRUEncoder

def encoderstack(env):
    """Assemble the state encoder pipeline for ``env``.

    Layers, innermost first: RBF features -> global normalisation -> GRU.
    Each stacking step is given the length of a freshly reset observation.
    """
    rbf = RBFEncoder(GYM_CFG, env, [5., 2., 1., .5], [20] * 4)

    # Normalise the RBF features and stack the two together.
    normalizer = GlobalNormalizer(GYM_CFG, rbf.total_size())
    normalized = StackedEncoder(GYM_CFG, len(env.reset()), rbf, normalizer)

    # Recurrent layer on top of the normalised features.
    recurrent = GRUEncoder(GYM_CFG, normalized.total_size())
    return StackedEncoder(GYM_CFG, len(env.reset()), normalized, recurrent)

### Cook Task : replay buffer ( fast / prio-gae-rnn ) + task manager ( local / remote / unity )

In [None]:
def taskfactory(env):
    """Create the task info object: ReplayBuffer storage with a local task manager."""
    # Alternatives that were previously left in the body (the second one as
    # an unreachable return):
    #   MCarInfo(env, Memory, MCarInfo.factory, LocalTaskManager, ())
    #   MCarInfo(env, ReplayBuffer, MCarInfo.factory, RemoteTaskManager,
    #            (LocalTaskManager, 1 + GYM_CFG['n_simulations']))
    return MCarInfo(env, ReplayBuffer, MCarInfo.factory, LocalTaskManager, ())

### Glue it all together ~ select buffer, encoders, agents, ... and RUN!!

In [None]:
def main():
    """Wire environment, encoders, task factory and networks together and launch the agent."""
    print(CFG)

    environment = gym.make(CFG['task'])

    state_encoder = encoderstack(environment)
    factory = taskfactory(environment)
    # Dedicated task instance used only by the evaluation callback.
    eval_task = factory.new(GYM_CFG, 0, -1)

    stop_q = Queue()

    def callback_task(agent, stop_q):
        # Bind the evaluation task; the second argument is forwarded as-is.
        return callback(eval_task, agent, stop_q)

    agent_launch(
        0, GYM_CFG, factory, state_encoder,
        ActorNN, CriticNN,
        stop_q, callback_task)


if __name__ == '__main__':
    main()

{'tensor': 'torch.DoubleTensor', 'task': 'MountainCarContinuous-v0', 'total_simulations': 2, 'cross_exp_size': 5000, 'max_reward_val': 1000, 'min_reward_val': -1000}
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
created 0-th task
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
created 1-th task
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[0>   1::     0] training =  6, steps = 300, max_step =  67, reward=-238.428510 ::[[-1.]]: [ TARGET:-1.061843 replay::1567 ]<----- ]<----TEST :  -99.8