In [1]:
import numpy as np
import argparse
from copy import deepcopy
import gym
import torch
import time

from normalised_env import NormalizedEnv
from wolptinger import Wolptinger
from utils import *
from evaluator import Evaluator
from ContinuousCartPole import ContinuousCartPoleEnv

In [2]:
def train(num_iterations, agent, env, evaluate, validate_steps, output, max_episode_length=None, debug=False, warmup=None):
    """Run the agent/environment interaction loop for ``num_iterations`` steps.

    Args:
        num_iterations: total number of environment steps to perform.
        agent: object providing ``reset``, ``random_action``, ``select_action``,
            ``observe``, ``update_policy`` and ``save_model``.
        env: environment providing ``reset()`` and a ``step(action)`` that
            returns ``(next_state, reward, done, info)``.
        evaluate: callable ``evaluate(env, policy, debug=, visualize=, save=)``
            returning a reward figure, or ``None`` to skip evaluation.
        validate_steps: evaluate every ``validate_steps`` steps (``<= 0``
            disables evaluation).
        output: location handed to ``agent.save_model``.
        max_episode_length: if truthy, an episode is forced to end once it
            reaches this many steps.
        debug: when true, print evaluation and per-episode summaries.
        warmup: number of initial steps that use ``agent.random_action()``;
            policy updates start once ``step > warmup``. When ``None`` the
            value is read from the notebook-level ``args.warmup``.

    Returns:
        None. The agent is trained in place and checkpointed through
        ``agent.save_model``.
    """
    if warmup is None:
        # Backward-compatible fallback to the notebook-global configuration.
        warmup = args.warmup

    # Checkpoint roughly three times per run. Clamp to 1 so that very short
    # runs (num_iterations < 3) do not end up with a zero modulus.
    save_interval = max(1, int(num_iterations / 3))

    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None

    while step < num_iterations:
        # Start of an episode: fetch a fresh observation and tell the agent.
        if observation is None:
            # The reference implementation deep-copied the observation here.
            observation = env.reset()
            agent.reset(observation)

        # Random exploration during warm-up, learned policy afterwards.
        if step < warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(observation)

        next_state, reward, done, _ = env.step(action)
        if max_episode_length and episode_steps >= max_episode_length - 1:
            done = True

        # Let the agent record the transition, then learn once warm-up is over.
        agent.observe(reward, next_state, done)
        if step > warmup:
            agent.update_policy()

        # Periodic evaluation.
        if evaluate is not None and validate_steps > 0 and step % validate_steps == 0:
            # The evaluation policy goes through agent.select_action, which may
            # overwrite the agent's bookkeeping of the current state/action.
            # Snapshot it first and restore it afterwards so the next stored
            # transition is not built from evaluation data.
            saved_state = getattr(agent, 's_t', next_state)
            saved_action = getattr(agent, 'a_t', action)

            policy = lambda x: agent.select_action(x, decay_epsilon=False)
            validate_reward = evaluate(env, policy, debug=False, visualize=False, save=False)
            if debug:
                prYellow('[Evaluate] Step_{:07d}: mean_reward:{}'.format(step, validate_reward))

            agent.s_t = saved_state
            agent.a_t = saved_action
            # NOTE(review): `evaluate` receives the same `env` that is in the
            # middle of a training episode. If the evaluator resets or steps
            # it, the environment no longer matches `next_state` -- consider
            # passing a dedicated evaluation environment. TODO confirm.

        # Save intermediate training model.
        if step % save_interval == 0:
            agent.save_model(output)

        # Advance the counters and move on to the next state.
        step += 1
        episode_steps += 1
        episode_reward += reward
        observation = next_state

        # End of episode.
        if done:
            if debug:
                prLightPurple(f'#{episode}: episode_reward: {episode_reward} steps:{step}')

            # An extra `agent.memory.append(...)` of the terminal observation
            # was commented out in this notebook and stays disabled.

            # Reset the per-episode state; a None observation triggers
            # env.reset() at the top of the loop.
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

In [3]:
def test(num_episodes, agent, env, evaluate, model_path, visualize=True, debug=False):

    agent.load_weights(model_path)
    agent.is_training = False
    agent.eval()
    policy = lambda x: agent.select_action(x, decay_epsilon=False)

    for i in range(num_episodes):
        validate_reward = evaluate(env, policy, debug=debug, visualize=visualize, save=False)
        if debug: 
            prYellow('[Evaluate] #{}: mean_reward:{}'.format(i, validate_reward))

In [4]:
class Arguments(object):
    """Plain attribute container holding this notebook's run configuration.

    Every option is stored as an ordinary instance attribute, so individual
    values can be overridden after construction (e.g. ``args.mode = 'test'``).
    The whole object is later handed to the agent constructor.
    """

    def __init__(self):
        defaults = (
            # --- run control ------------------------------------------------
            ('mode', 'train'),               # 'train' or 'test'
            ('env', 'ContinuousCartPole'),   # earlier alternative: "InvertedPendulum-v2"
            # --- agent hyperparameters (consumed by the agent; their exact
            # --- meaning is defined there, not in this notebook) -------------
            ('h1', 128),
            ('h2', 128),
            ('rate', 1e-3),
            ('prate', 1e-3),
            ('warmup', 100),                 # initial steps taken with random actions
            ('discount', 0.99),
            ('bsize', 64),
            ('rmsize', 1000),
            ('window_length', 1),
            ('tau', 0.001),
            ('ou_theta', 0.15),
            ('ou_sigma', 0.2),
            ('ou_mu', 0.0),
            # --- evaluation / bookkeeping -----------------------------------
            ('validate_episodes', 20),
            ('max_episode_length', 500),
            ('validate_steps', 2000),
            ('output', 'output'),
            ('debug', True),
            ('init_w', 0.003),
            ('train_iter', 20000),
            ('epsilon', 10000),              # 50000 was used previously
            ('seed', -1),                    # only values > 0 trigger seeding
            ('max_actions', 1e6),
            ('resume', 'default'),
            ('k_ratio', 1e-6),
            # The pre-generated action space is passed in here.
            ('action_space', None),
        )
        for option, value in defaults:
            setattr(self, option, value)


args = Arguments()

In [5]:
# Resolve the run-specific output directory and, for test mode, the default
# checkpoint location to load from.
args.output = get_output_folder(args.output, args.env)
if args.resume == 'default':
    args.resume = 'output/{}-run0'.format(args.env)

# Alternatives tried earlier: NormalizedEnv(gym.make(args.env)) or gym.make(args.env).
env = ContinuousCartPoleEnv()
# Expose the action bounds to the agent through the shared config object.
args.low = env.action_space.low
args.high = env.action_space.high

# Seed every random source visible from this notebook. torch is seeded as
# well so that runs with a fixed seed are repeatable on the torch side too.
if args.seed > 0:
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]

agent = Wolptinger(nb_states, nb_actions, args)
evaluate = Evaluator(args.validate_episodes, 
    args.validate_steps, args.output, max_episode_length=args.max_episode_length)

start_time = time.time()

if args.mode == 'train':
    train(args.train_iter, agent, env, evaluate, 
        args.validate_steps, args.output, max_episode_length=args.max_episode_length, debug=args.debug)
    end_time = time.time()

elif args.mode == 'test':
    test(args.validate_episodes, agent, env, evaluate, args.resume,
        visualize=True, debug=args.debug)

else:
    raise RuntimeError('undefined mode {}'.format(args.mode))

rd: 6.0 steps:8969[00m
[94m #1524: episode_reward: 6.0 steps:8975[00m
[94m #1525: episode_reward: 5.0 steps:8980[00m
[94m #1526: episode_reward: 6.0 steps:8986[00m
[94m #1527: episode_reward: 5.0 steps:8991[00m
[94m #1528: episode_reward: 6.0 steps:8997[00m
[94m #1529: episode_reward: 5.0 steps:9002[00m
[94m #1530: episode_reward: 6.0 steps:9008[00m
[94m #1531: episode_reward: 6.0 steps:9014[00m
[94m #1532: episode_reward: 6.0 steps:9020[00m
[94m #1533: episode_reward: 6.0 steps:9026[00m
[94m #1534: episode_reward: 6.0 steps:9032[00m
[94m #1535: episode_reward: 5.0 steps:9037[00m
[94m #1536: episode_reward: 6.0 steps:9043[00m
[94m #1537: episode_reward: 6.0 steps:9049[00m
[94m #1538: episode_reward: 6.0 steps:9055[00m
[94m #1539: episode_reward: 6.0 steps:9061[00m
[94m #1540: episode_reward: 6.0 steps:9067[00m
[94m #1541: episode_reward: 5.0 steps:9072[00m
[94m #1542: episode_reward: 6.0 steps:9078[00m
[94m #1543: episode_reward: 6.0 steps:9084[

KeyboardInterrupt: 