# Pendulum-v0 environment of OpenAI Gym
- *Wheeler task definition (task wrapper, state decoder settings, neural network, replay buffer, ...)*

### Import generics

In [1]:
import os, time

# Move the working directory one level up before anything else is loaded.
# NOTE(review): presumably this lets the relative paths used below ('cfg.toml',
# 'gym.toml') and the project packages resolve from the repository root — confirm.
os.chdir("..")

import numpy as np
import toml, gym

import torch
from torch.multiprocessing import Queue, Process

### Load task configs ~ these should be adapted offline for the particular task

In [2]:
# Parse both TOML configuration files. The context managers close the file
# handles right after reading instead of leaving them open until garbage collection.
with open('cfg.toml') as cfg_file:
    CFG = toml.loads(cfg_file.read())
with open('gym.toml') as gym_cfg_file:
    GYM_CFG = toml.loads(gym_cfg_file.read())

# The default tensor type for torch is taken from the general config.
torch.set_default_tensor_type(CFG['tensor'])

# Override the environment id in code; every gym.make(CFG['task']) in this file uses it.
CFG['task'] = "Pendulum-v0"

### Import wheeler environment and particular utils we want to use ~ general ones ( shared across tasks )

In [3]:
from utils.task import Task
from utils.taskinfo import *

from utils.rbf import *
from utils.normalizer import *

from utils.taskmgr import *
from utils.replay import *

from utils.fastmem import Memory

from utils.curiosity import *

from agent.zer0bot import agent_launch

### Define Task wrapper ~ when is goal met, how to step ( update rewards function, .. ), when / how to reset

In [4]:
class GymTask(Task):
    """Task wrapper: forwards steps to the environment and decides when the goal is met."""

    def step_ex(self, action, test = False):
        """Perform one environment step for this bot / objective.

        Returns the tuple ``(action, state, reward, done, True)``. When `test`
        is truthy the state is returned as ``state.reshape(1, -1)``.
        """
        state, reward, done, _ = self.env.step(self.bot_id, self.objective_id, action)
        observed = state.reshape(1, -1) if test else state
        return action, observed, reward, done, True

    def goal_met(self, states, rewards, n_steps):
        """Print the summed rewards and report whether the sum is above -150.

        `states` and `n_steps` are accepted but not used here.
        """
        total = sum(rewards)
        print("TEST : ", total)
        return total > -150.

### Generic proxy for creating our Task (mainly for multiprocess environments)
- it can also wrap the function approximator's output values (action value through tanh, sigmoid, ...) - this does not work well with PPO at the moment

In [5]:
class GymInfo(TaskInfo):
    """TaskInfo specialisation that describes a Gym environment and creates GymTask objects."""

    def __init__(self, env, replaybuf, factory, Mgr, args):
        """Read sizes and action bounds from `env` and hand everything to TaskInfo.

        The state size is the length of the observation returned by
        ``env.reset()``; the bounds are the first entries of the action space's
        ``low`` / ``high``. The literal ``1`` is passed as the second argument.
        NOTE(review): presumably that is the action size — confirm against TaskInfo.
        """
        state_size = len(env.reset())
        lowest_action = float(env.action_space.low[0])
        highest_action = float(env.action_space.high[0])
        super().__init__(
                state_size, 1,
                lowest_action, highest_action,
                CFG,
                replaybuf,
                factory, Mgr, args)

    def new(self, cfg, bot_id, objective_id):
        """Create a GymTask for the given bot / objective.

        NOTE(review): `self.env`, `self.action_low` and `self.action_high` are
        not assigned in this class — presumably set by TaskInfo.__init__; confirm.
        """
        task = GymTask(cfg,
                self.env,
                objective_id, bot_id,
                self.action_low, self.action_high)
        return task

    @staticmethod
    def factory(ind): # bare metal task creation
        """Announce and create the `ind`-th raw Gym environment for CFG['task']."""
        print("created %i-th task"%ind)
        raw_env = gym.make(CFG['task'])
        return raw_env

### Implement callback for testing policy ~ per X training rounds, we want to test it ~ enable visuals if you want

In [6]:
def callback(task, agent, scores):
    """Test the current policy and report whether training can stop.

    Runs ``task.test_policy(agent)`` up to 10 times and requires the first
    element of every result to be truthy. If any run fails, returns False
    immediately. Otherwise prints a summary banner followed by 100 further
    test runs, and returns True.

    `scores` is accepted for interface compatibility but is not used here.
    The number of invocations is tracked in the function attribute
    ``callback.z`` (0 on the first call).
    """
    # Only the "attribute does not exist yet" case is expected on the first call;
    # a bare except would also hide unrelated errors (e.g. KeyboardInterrupt).
    try:
        callback.z += 1
    except AttributeError:
        callback.z = 0

    # we can save scores to main queue, and average them, or we can ..
    # run testing w/ visuals :
    done = all(task.test_policy(agent)[0] for _ in range(10))
    if not done:
        return False

    # Same value was previously recomputed for every printed line.
    trained_steps = callback.z * GYM_CFG['n_simulations'] * GYM_CFG['mcts_rounds']

    print("\n")
    print("="*80)
    print("training over", trained_steps)
    print("="*80)

    for _ in range(100):
        print("total steps : training : %i :: %i >"%(
            trained_steps,
            len(task.test_policy(agent)[2])))

    return True

### Prepare neural network which we will be using

In [7]:
from models import ddpg_model, noisy_model, state_action_model

def CriticNN(state_size, action_size, wrap_value, cfg):
    """Build the critic network; all arguments are forwarded to the model constructor.

    Uses ``state_action_model.Critic`` with ``fcs1_units=256, fc2_units=128``.
    """
    # Alternative that previously sat here as an unreachable second return:
    #   ddpg_model.Critic(state_size, action_size, wrap_value, cfg, fcs1_units=400, fc2_units=300)
    return state_action_model.Critic(state_size, action_size, wrap_value, cfg, fcs1_units=256, fc2_units=128)

def ActorNN(state_size, action_size, wrap_action, cfg):
    """Build the actor network; all arguments are forwarded to the model constructor.

    Uses ``noisy_model.Actor`` with ``hiddens=[128, 64]``.
    """
    # Alternative that previously sat here as an unreachable second return:
    #   ddpg_model.Actor(state_size, action_size, wrap_action, cfg, fc1_units=400, fc2_units=300)
    return noisy_model.Actor(state_size, action_size, wrap_action, cfg, hiddens=[128, 64])

### Select encoders

In [8]:
from utils.encoders import *
from utils.rnn import *#GRUEncoder

def encoderstack(env):
    """Assemble the state encoder: a GlobalNormalizer followed by a GRUEncoder.

    The normalizer is sized from the length of ``env.reset()``; the GRU encoder
    is sized from ``norm.total_size()``. Both are combined in a StackedEncoder,
    on which ``share_memory()`` is called before it is returned.
    """
    normalizer = GlobalNormalizer(GYM_CFG, len(env.reset()))
    recurrent = GRUEncoder(GYM_CFG, normalizer.total_size())  # GRU / LSTM variants
    stacked = StackedEncoder(GYM_CFG, len(env.reset()), normalizer, recurrent)
    # NOTE(review): presumably needed so worker processes see the same parameters — confirm.
    stacked.share_memory()
    return stacked

### Cook Task : replay buffer ( fast / prio-gae-rnn ) + task manager ( local / remote / unity )

In [9]:
def taskfactory(env):
    """Create the GymInfo describing `env`, using ReplayBuffer and LocalTaskManager.

    Other combinations that were previously left here as commented-out or
    unreachable code:
      * GymInfo(env, Memory, GymInfo.factory, LocalTaskManager, ())
      * GymInfo(env, ReplayBuffer, GymInfo.factory, RemoteTaskManager,
                (LocalTaskManager, 1 + GYM_CFG['n_simulations']))
    """
    return GymInfo(env, ReplayBuffer, GymInfo.factory, LocalTaskManager, ())

### Glue it all together ~ select buffer, encoders, agents, ... and RUN!!

In [None]:
def main():
    """Print the config, build environment, encoder and task info, then launch the agent."""
    print(CFG)

    environment = gym.make(CFG['task'])

    state_encoder = encoderstack(environment)
    info = taskfactory(environment)
    # Task instance used only by the testing callback (bot id 0, objective id -1).
    eval_task = info.new(GYM_CFG, 0, -1)

    def callback_task(agent, stop_q):
        # Bind the evaluation task; the second argument is passed on as `scores`.
        return callback(eval_task, agent, stop_q)

    stop_queue = Queue()
    agent_launch(0, GYM_CFG, info, state_encoder, ActorNN, CriticNN, stop_queue, callback_task)


if __name__ == '__main__':
    main()

{'tensor': 'torch.DoubleTensor', 'task': 'Pendulum-v0', 'total_simulations': 2, 'cross_exp_size': 5000, 'max_reward_val': 1000, 'min_reward_val': -1000}
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
created 0-th task
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
created 1-th task
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[0>   1::     0] training =  0, steps = 174, max_step = 10000, reward=-997.664009 ::[[0.10530957]]: >>train tensor(-15.9997) 128:0 ]<----
[0>   1::     0] training

>>train tensor(-9.4303) 116
>>train tensor(-8.8663) 83
>>train tensor(-10.8985) 123
>>train tensor(-9.2911) 181
>>train tensor(-7.8279) 74
>>train tensor(-10.5022) 111
>>train tensor(-9.7862) 160
>>train tensor(-8.3146) 54
[0>   4::     0] training =  0, steps =  12, max_step = 200, reward=-30.646167 ::[[-0.06167549]]: >>train tensor(-12.2773) 63600 ]<----
[0>   4::     0] training =  0, steps =  31, max_step = 200, reward=-125.606928 ::[[-0.10604215]]: >>train tensor(-11.3493) 57
[0>   4::     0] training =  0, steps =  45, max_step = 200, reward=-232.736858 ::[[-0.04812173]]: >>train tensor(-12.4035) 55
[0>   4::     0] training =  0, steps =  50, max_step = 200, reward=-242.549156 ::[[-0.02034905]]: >>train tensor(-11.9962) 73
[0>   4::     0] training =  0, steps =  51, max_step = 200, reward=-243.196361 ::[[-0.03371138]]: >>train tensor(-15.2209) 75
[0>   4::     0] training =  0, steps =  54, max_step = 200, reward=-244.649478 ::[[-0.0492515]]: >>train tensor(-14.9127) 60
[0>   4

[0>   6::     0] training =  0, steps =  33, max_step = 200, reward=-138.468087 ::[[0.01057432]]: >>train tensor(-8.0901) 168
[0>   6::     0] training =  0, steps =  55, max_step = 200, reward=-248.583268 ::[[-0.00773125]]: >>train tensor(-8.9464) 133:1000 ]<----
[0>   6::     0] training =  0, steps =  76, max_step = 200, reward=-364.232739 ::[[-0.07279162]]: >>train tensor(-11.1425) 1151000 ]<----
[0>   6::     0] training =  0, steps =  82, max_step = 200, reward=-375.249973 ::[[-0.0683756]]:  >>train tensor(-7.1414) 122
>>train tensor(-8.8371) 127
[0>   6::     0] training =  0, steps =  83, max_step = 200, reward=-380.213486 ::[[-0.06057989]]: [ TARGET:-24.020991 replay::1000 ]<---->>train tensor(-6.0132) 60
[0>   6::     0] training =  0, steps =  99, max_step = 200, reward=-484.089881 ::[[-0.13246004]]: >>train tensor(-7.5053) 80::1000 ]<----
[0>   6::     0] training =  0, steps = 122, max_step = 200, reward=-603.878156 ::[[-0.12660127]]: >>train tensor(-7.5870) 115
>>train te

[0>   8::     0] training =  0, steps = 161, max_step = 200, reward=-837.229040 ::[[-0.05003108]]: >>train tensor(-1.2012) 209:1400 ]<----
[0>   8::     0] training =  0, steps = 162, max_step = 200, reward=-838.155081 ::[[-0.0560782]]: >>train tensor(-2.1114) 72
>>train tensor(-3.1915) 83
>>train tensor(-1.2395) 240
>>train tensor(-2.1853) 111
>>train tensor(1.00000e-02 *
       -9.4719) 126
[0>   8::     0] training =  0, steps = 163, max_step = 200, reward=-839.014744 ::[[-0.05346183]]: [ TARGET:-63.417068 replay::1400 ]<---->>train tensor(-1.9982) 132
>>train tensor(-2.2458) 80
[0>   8::     0] training =  0, steps = 188, max_step = 200, reward=-960.936574 ::[[-0.08347767]]: >>train tensor(-2.9638) 244:1400 ]<----
[0>   8::     0] training =  0, steps = 200, max_step = 200, reward=-1053.921558 ::[[-0.05482761]]: >>train tensor(-2.7194) 132
>>train tensor(-2.8730) 233
>>train tensor(-3.0980) 234
>>train tensor(-1.5175) 160
>>train tensor(-2.1428) 223
>>train tensor(-2.1274) 114
[0> 

[0>  11::     0] training =  0, steps =  83, max_step = 200, reward=-468.343739 ::[[-0.09128656]]: [ TARGET:-71.922881 replay::2000 ]<---->>train tensor(-2.6213) 176
[0>  11::     0] training =  0, steps = 122, max_step = 200, reward=-689.267920 ::[[-0.37220622]]: >>train tensor(-2.7704) 109
>>train tensor(0.2566) 95
>>train tensor(1.6749) 137
>>train tensor(3.1861) 196
>>train tensor(2.8237) 201
>>train tensor(4.5157) 223
>>train tensor(2.4948) 134
[0>  11::     0] training =  0, steps = 128, max_step = 200, reward=-708.345438 ::[[-0.20357562]]: >>train tensor(-0.6819) 40::2000 ]<----
[0>  11::     0] training =  0, steps = 161, max_step = 200, reward=-888.857587 ::[[-0.22103453]]: >>train tensor(0.2998) 168y::2000 ]<----
[0>  11::     0] training =  0, steps = 162, max_step = 200, reward=-901.267140 ::[[-0.0822542]]: >>train tensor(-1.0014) 176
>>train tensor(0.4808) 216
>>train tensor(1.3191) 156
>>train tensor(1.00000e-02 *
       6.2484) 293
>>train tensor(1.6625) 179
[0>  11::   

[0>  14::     0] training =  0, steps =  88, max_step = 200, reward=-675.867423 ::[[-0.04405738]]: >>train tensor(1.00000e-02 *:2600 ]<----
       2.1413) 277
[0>  14::     0] training =  0, steps = 122, max_step = 200, reward=-936.189510 ::[[-0.03412247]]: >>train tensor(-3.4257) 355
>>train tensor(-1.7677) 257
>>train tensor(-1.6153) 293
>>train tensor(-2.1922) 258
>>train tensor(-0.9728) 167
>>train tensor(-1.7939) 100
[0>  14::     0] training =  0, steps = 123, max_step = 200, reward=-945.711799 ::[[-0.0260147]]: [ TARGET:-144.309655 replay::2600 ]<---->>train tensor(-1.1716) 91
[0>  14::     0] training =  0, steps = 126, max_step = 200, reward=-975.177207 ::[[-0.01799293]]: >>train tensor(-2.5106) 40
[0>  14::     0] training =  0, steps = 160, max_step = 200, reward=-1234.450717 ::[[-0.02203341]]: >>train tensor(-0.3401) 177::2600 ]<----
[0>  14::     0] training =  0, steps = 162, max_step = 200, reward=-1249.729030 ::[[-0.01761814]]: >>train tensor(-1.4826) 354
>>train tensor

[0>  17::     0] training =  0, steps =  82, max_step = 200, reward=-631.678875 ::[[-0.15300154]]: >>train tensor(-0.2667) 291
>>train tensor(1.1526) 322
>>train tensor(1.9268) 270
>>train tensor(1.2423) 312
>>train tensor(0.4356) 400
>>train tensor(-0.1344) 365
>>train tensor(0.5242) 381
>>train tensor(-1.9967) 326
>>train tensor(-2.0260) 240
[0>  17::     0] training =  0, steps =  85, max_step = 200, reward=-650.450909 ::[[-0.09414863]]: >>train tensor(-0.1507) 310::3200 ]<----
[0>  17::     0] training =  0, steps = 122, max_step = 200, reward=-931.637948 ::[[-0.09435746]]: >>train tensor(-2.0602) 237
>>train tensor(-1.1954) 441
>>train tensor(-0.8684) 160
>>train tensor(-0.9652) 134
>>train tensor(-0.3475) 338
>>train tensor(-0.3809) 273
[0>  17::     0] training =  0, steps = 123, max_step = 200, reward=-939.636605 ::[[-0.0939582]]: [ TARGET:-189.381906 replay::3200 ]<---->>train tensor(1.7010) 88
[0>  17::     0] training =  0, steps = 125, max_step = 200, reward=-957.962316 ::[

>>train tensor(-0.7114) 346
[0>  20::     0] training =  0, steps =  85, max_step = 200, reward=-583.133557 ::[[-0.17245399]]: [ TARGET:-246.778482 replay::3800 ]<---->>train tensor(-0.9281) 339
[0>  20::     0] training =  0, steps = 122, max_step = 200, reward=-830.744182 ::[[-0.18158035]]: >>train tensor(-2.2179) 266
>>train tensor(-0.1910) 135
>>train tensor(0.8455) 222
>>train tensor(1.2595) 101
>>train tensor(1.5898) 110
>>train tensor(0.6412) 476
[0>  20::     0] training =  0, steps = 123, max_step = 200, reward=-835.333467 ::[[-0.18496491]]: [ TARGET:-229.142941 replay::3800 ]<---->>train tensor(0.5377) 295
[0>  20::     0] training =  0, steps = 127, max_step = 200, reward=-852.871126 ::[[-0.40255989]]: >>train tensor(-1.0367) 40
[0>  20::     0] training =  0, steps = 162, max_step = 200, reward=-1086.324642 ::[[-0.3489864]]:  >>train tensor(1.00000e-02 *
       -2.7775) 371
>>train tensor(-0.8186) 368
>>train tensor(-0.2814) 219
>>train tensor(0.3073) 131
>>train tensor(0.7

>>train tensor(-0.3681) 149
>>train tensor(-1.4383) 455
>>train tensor(-0.4474) 265
>>train tensor(-0.8645) 142
[0>  23::     0] training =  0, steps = 123, max_step = 200, reward=-648.345969 ::[[-0.18769546]]: [ TARGET:-286.686298 replay::4400 ]<---->>train tensor(0.2689) 411
[0>  23::     0] training =  0, steps = 124, max_step = 200, reward=-649.061784 ::[[-0.14925852]]: tensor(-0.9984) 40
[0>  23::     0] training =  0, steps = 162, max_step = 200, reward=-853.194009 ::[[-0.15470082]]: >>train tensor(0.7570) 307
>>train tensor(-1.1121) 305
>>train tensor(-0.6236) 458
>>train tensor(-0.2109) 143
>>train tensor(-1.2750) 465
>>train tensor(-1.6268) 226
>>train tensor(-0.5635) 436
[0>  23::     0] training =  0, steps = 167, max_step = 200, reward=-896.512672 ::[[-0.28251349]]: >>train tensor(0.9704) 80ay::4400 ]<----
[0>  23::     0] training =  0, steps = 200, max_step = 200, reward=-1040.939048 ::[[-0.25504425]]: >>train tensor(-1.5970) 164
>>train tensor(-0.4035) 367
>>train tensor

>>train tensor(0.4609) 40
[0>  26::     0] training =  0, steps = 162, max_step = 200, reward=-846.215894 ::[[-0.17746883]]: >>train tensor(-0.3907) 176
>>train tensor(-1.3511) 283
>>train tensor(-0.2284) 173
>>train tensor(-1.3804) 271
>>train tensor(0.1592) 162
>>train tensor(-1.0827) 284
[0>  26::     0] training =  0, steps = 163, max_step = 200, reward=-852.556385 ::[[-0.17476275]]: [ TARGET:-334.760496 replay::5000 ]<---->>train tensor(0.5145) 541
>>train tensor(1.00000e-02 *
       -4.6360) 80
[0>  26::     0] training =  0, steps = 200, max_step = 200, reward=-1068.301135 ::[[-0.20418282]]: >>train tensor(-0.3613) 481
>>train tensor(1.00000e-02 *
       2.0484) 279
>>train tensor(0.4247) 368
>>train tensor(0.3951) 217
>>train tensor(0.4176) 592
>>train tensor(-0.6370) 551
>>train tensor(1.00000e-03 *
       1.9535) 146
[0>  26::     0] training =  0, steps = 200, max_step = 200, reward=-1068.301135 ::[[-0.20192667]]: [ TARGET:-340.495147 replay::5000 ]<----TEST :  -1261.4033556

[0>  29::     0] training =  0, steps = 162, max_step = 200, reward=-836.403427 ::[[-0.2077422]]:  >>train tensor(-1.0010) 514
>>train tensor(-2.4357) 398
>>train tensor(-3.6489) 156
>>train tensor(-0.7796) 628
>>train tensor(-0.2635) 534
>>train tensor(0.6261) 215
[0>  29::     0] training =  0, steps = 163, max_step = 200, reward=-837.722139 ::[[-0.20769997]]: [ TARGET:-374.619291 replay::5600 ]<---->>train tensor(-0.5544) 479
[0>  29::     0] training =  0, steps = 167, max_step = 200, reward=-843.170331 ::[[-0.1455405]]: >>train tensor(1.9288) 80
[0>  29::     0] training =  0, steps = 200, max_step = 200, reward=-1037.487139 ::[[-0.14129453]]: >>train tensor(-1.5762) 482
>>train tensor(-1.8890) 567
>>train tensor(-1.5403) 314
>>train tensor(-0.4861) 621
>>train tensor(0.7815) 220
>>train tensor(0.1501) 347
[0>  29::     0] training =  0, steps = 200, max_step = 200, reward=-1037.487139 ::[[-0.14126447]]: [ TARGET:-368.347342 replay::5600 ]<---->>train tensor(-1.3823) 395
TEST :  -