In [1]:
import numpy as np
import argparse
from copy import deepcopy
import gym
import torch
import time

In [2]:
import os
import sys

# Make the project's local packages importable from the notebook's working
# directory. `os` must be imported here: the cell uses os.path/os.getcwd and
# no earlier cell imports it, so a fresh kernel would raise NameError.
sys.path.append(os.path.join(os.getcwd(), 'Simulator'))
sys.path.append(os.path.join(os.getcwd(), 'Wolptinger'))

In [3]:
from Wolptinger.ddpg import DDPG
from Wolptinger.utils import *
from Wolptinger.evaluator import Evaluator

from Simulator.WordCounting import WordCountingEnv

In [4]:
# Load IPython's autoreload extension and set it to mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
def train(num_iterations, agent, env, evaluate, validate_steps, output, max_episode_length=None, debug=False, warmup=None):
    """Run the agent/environment interaction loop for ``num_iterations`` steps.

    Args:
        num_iterations: Total number of environment steps to run.
        agent: Object providing ``reset``, ``random_action``, ``select_action``,
            ``observe``, ``update_policy`` and ``save_model``.
        env: Environment providing ``reset()`` and a ``step(action)`` that
            returns ``(next_state, reward, done, info)``.
        evaluate: Optional callable invoked as
            ``evaluate(env, policy, debug=..., visualize=..., save=...)``;
            pass ``None`` to skip periodic evaluation.
        validate_steps: Evaluate every this many steps (only when > 0).
        output: Passed through to ``agent.save_model``.
        max_episode_length: If set, an episode is forced to end once it has
            run this many steps.
        debug: When true, print the selected actions and per-episode summaries.
        warmup: Number of initial steps that use ``agent.random_action()``.
            Defaults to the notebook-level ``args.warmup`` when omitted, which
            is what the function read before this parameter existed.

    Returns:
        None. The agent is mutated in place and checkpoints are written via
        ``agent.save_model``.
    """
    if warmup is None:
        # Backward-compatible fallback to the notebook-global configuration.
        warmup = args.warmup

    # int(num_iterations / 3) is 0 for fewer than three iterations, which made
    # the modulo in the checkpoint test raise ZeroDivisionError; clamp to >= 1.
    save_interval = max(1, int(num_iterations / 3))

    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None

    while step < num_iterations:
        # Reset at the start of every episode.
        if observation is None:
            # the original method used deepcopy here
            observation = env.reset()
            agent.reset(observation)

        # Random exploration during warm-up, learned policy afterwards.
        if step < warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(observation)
            if debug:
                print(action)

        next_state, reward, done, info = env.step(action)
        # Force termination on the last permitted step of the episode.
        if max_episode_length and episode_steps >= max_episode_length - 1:
            done = True

        # Let the agent record the transition, then learn once warm-up is over.
        agent.observe(reward, next_state, done)
        if step > warmup:
            agent.update_policy()

        # Periodic evaluation.
        # I personally don't think this will work, also, this will let the
        # model memorize a wrong action.
        if evaluate is not None and validate_steps > 0 and step % validate_steps == 0:
            policy = lambda x: agent.select_action(x, decay_epsilon=False)
            validate_reward = evaluate(env, policy, debug=False, visualize=False, save=False)
            if debug:
                prYellow('[Evaluate] Step_{:07d}: mean_reward:{}'.format(step, validate_reward))
            # NOTE(review): this restores the pre-step observation/action on the
            # agent after evaluation; whether that matches the agent's internal
            # bookkeeping (and the env state after evaluate) needs confirming.
            agent.s_t = observation
            agent.a_t = action

        # Save an intermediate training model (step 0 and every third of the run).
        if step % save_interval == 0:
            agent.save_model(output)

        # Advance counters and move to the next state.
        step += 1
        episode_steps += 1
        episode_reward += reward
        observation = next_state

        # End of episode: report and reset the per-episode bookkeeping.
        if done:
            if debug:
                prLightPurple(f'#{episode}: episode_reward: {episode_reward} steps:{step}')

            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

In [6]:
def test(num_episodes, agent, env, evaluate, model_path, visualize=True, debug=False):

    agent.load_weights(model_path)
    agent.is_training = False
    agent.eval()
    policy = lambda x: agent.select_action(x, decay_epsilon=False)

    for i in range(num_episodes):
        validate_reward = evaluate(env, policy, debug=debug, visualize=visualize, save=False)
        if debug: 
            prYellow('[Evaluate] #{}: mean_reward:{}'.format(i, validate_reward))

In [7]:
class Arguments(object):
    """Attribute bag holding the run configuration used by this notebook.

    Stands in for a command-line argument namespace: every setting is a plain
    instance attribute populated with its default in ``__init__``.
    """

    def __init__(self):
        settings = {
            # 'train' or 'test'; selects which branch the run cell executes.
            'mode': 'train',
            # Environment name, used for the output/resume folder names.
            # Alternative that was tried: "InvertedPendulum-v2".
            'env': 'WordCountingEnv',
            # The following are consumed by the DDPG agent (not visible here);
            # presumably network sizes, learning rates and replay settings.
            'h1': 128,
            'h2': 128,
            'rate': 1e-3,
            'prate': 1e-3,
            # Number of initial training steps that use random actions.
            'warmup': 100,
            'discount': 0.99,
            'bsize': 64,
            'rmsize': 10000,
            'window_length': 1,
            'tau': 0.001,
            'ou_theta': 0.15,
            'ou_sigma': 0.2,
            'ou_mu': 0.0,
            # Evaluation / episode limits passed to train(), test(), Evaluator.
            'validate_episodes': 20,
            'max_episode_length': 50,
            'validate_steps': 2000,
            # Base output folder (expanded by get_output_folder in the run cell).
            'output': 'output',
            'debug': True,
            'init_w': 0.003,
            # Total number of training steps.
            'train_iter': 15000,
            # Previously 50000.
            'epsilon': 10000,
            # Seeding is only applied when this is > 0.
            'seed': -1,
            # 'default' is replaced by a run-specific path in the run cell.
            'resume': 'default',
            'k_ratio': 1e-6,
            # we pass in the pre_generated action space
            'action_space': None,
        }
        for key, value in settings.items():
            setattr(self, key, value)


args = Arguments()

In [10]:
# Resolve the output folder for this run and the default checkpoint to resume from.
args.output = get_output_folder(args.output, args.env)
if args.resume == 'default':
    args.resume = 'WordCountingDDPG/{}-run0'.format(args.env)

# env = NormalizedEnv(gym.make(args.env))
# env = gym.make(args.env)
env = WordCountingEnv()
# Expose the action bounds to the agent through the shared configuration.
args.low = env.action_space.low
args.high = env.action_space.high

# Seed every random source in play when a positive seed is configured.
# torch is seeded as well: it is imported by this notebook and, if the agent's
# networks are built with it, NumPy/env seeding alone is not reproducible.
if args.seed > 0:
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]

agent = DDPG(nb_states, nb_actions, args)
evaluate = Evaluator(args.validate_episodes,
    args.validate_steps, args.output, max_episode_length=args.max_episode_length)

start_time = time.time()

if args.mode == 'train':
    # Periodic evaluation is deliberately disabled here (evaluate=None).
    train(args.train_iter, agent, env, None,
        args.validate_steps, args.output, max_episode_length=args.max_episode_length, debug=args.debug)
    end_time = time.time()

elif args.mode == 'test':
    test(args.validate_episodes, agent, env, evaluate, args.resume,
        visualize=True, debug=args.debug)

else:
    raise RuntimeError('undefined mode {}'.format(args.mode))

Building Topology
spout 2
WordCount 9
Database 9
torch.Size([128, 17])
torch.Size([128, 17])
[96m 0: reward is -1.7222163899998169[00m
[96m 1: reward is -1.8696246599998745[00m
[96m 2: reward is -1.752544070000428[00m
[96m 3: reward is -1.7792574400003514[00m
[96m 4: reward is -1.5527319100002153[00m
[96m 5: reward is -1.7818512800003588[00m
[96m 6: reward is -1.8534155400002992[00m
[96m 7: reward is -1.7785978500006017[00m
[96m 8: reward is -1.669319539999811[00m
[96m 9: reward is -1.6885948700001234[00m
[96m 10: reward is -1.6419122299996682[00m
[96m 11: reward is -1.6859567200004002[00m
[96m 12: reward is -1.5615080999997648[00m
[96m 13: reward is -1.6870391199994796[00m
[96m 14: reward is -1.5944494599996875[00m
[96m 15: reward is -1.8363346100011435[00m
[96m 16: reward is -1.8443893199983803[00m
[96m 17: reward is -1.6805662600000584[00m
[96m 18: reward is -1.6444585100004785[00m
[96m 19: reward is -1.5479083900008546[00m
[96m 20: reward is

  return Variable(
  next_q_values.volatile=False


[96m 101: reward is -1.5659244299993245[00m
[ 0.06516823  0.11785434  0.02791397 -0.14884204  0.1198983   0.05136436
  0.04915169  0.13614656 -0.03353372 -0.08920311  0.01615299  0.03847342
 -0.1017075  -0.0086747  -0.1237223 ]
[96m 102: reward is -1.5968825999996346[00m
[ 0.18322055  0.21276349 -0.06742755 -0.23770496  0.18557934  0.15207975
  0.12226377  0.20514634 -0.13555361 -0.18081748  0.11856009  0.09364305
 -0.08795589 -0.10496526 -0.23731962]
[96m 103: reward is -1.5915890000011226[00m
[ 0.29013205  0.33842716 -0.22619314 -0.35250434  0.28674763  0.27323574
  0.18270303  0.30739683 -0.22035141 -0.337819    0.24536519  0.04608582
 -0.02764674 -0.23514836 -0.36312115]
[96m 104: reward is -1.6619202200024912[00m
[ 0.42546284  0.47937873 -0.4260175  -0.47522664  0.31862664  0.33527112
  0.14246957  0.3830762  -0.36543766 -0.43298492  0.38440472 -0.05051708
  0.07771321 -0.3945063  -0.5080956 ]
[96m 105: reward is -1.5872966699998268[00m
[ 0.5785282   0.6146932  -0.575459

KeyboardInterrupt: 