In [54]:
import myenv
import gym
from gym import wrappers
import numpy as np
import time
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl

In [55]:
class QFunction(chainer.Chain):
    def __init__(self, obs_size, n_actions, n_hidden_channels=100):
        super(QFunction, self).__init__()
        with self.init_scope():
            self.l0 = L.Linear(obs_size, n_hidden_channels)
            self.l1 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l2 = L.Linear(n_hidden_channels, n_actions)
    def __call__(self, x, test=False):
        h = F.tanh(self.l0(x))
        h = F.tanh(self.l1(h))
        return chainerrl.action_value.DiscreteActionValue(self.l2(h))

In [56]:
env = gym.make('CartPoleODE-v0')
env = wrappers.Monitor(env, 'videos+1', video_callable=(lambda ep: ep % 10 == 0), force=True)

In [57]:
gamma = 0.99
alpha = 0.5
max_number_of_steps = 2000
num_episodes = 1000

In [58]:
q_func = QFunction(env.observation_space.shape[0], env.action_space.n)
optimizer = chainer.optimizers.Adam(eps=1e-2)
optimizer.setup(q_func)
explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
    start_epsilon = 1.0,
    end_epsilon = 0.1,
    decay_steps = num_episodes,
    random_action_func = env.action_space.sample
)
replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer(capacity=10**6)
phi = lambda x: x.astype(np.float32, copy=False)

In [59]:
agent = chainerrl.agents.DoubleDQN(
    q_func, optimizer, replay_buffer, gamma, explorer,
    replay_start_size = 500,
    update_interval = 1,
    target_update_interval = 100,
    phi = phi
)

In [60]:
for episode in range(num_episodes):
    observation = env.reset()
    done = False
    reward = 0
    R = 0
    for t in range(max_number_of_steps):
        if episode%10==0:
            env.render('human')
        action = agent.act_and_train(observation, reward)
        observation, reward, done, info = env.step(action)
        R += reward
        if done:
            break

    agent.stop_episode_and_train(observation, reward, done)
    if episode % 10 == 0:
        agent.save('agent+1/agent_' + str(episode))
        print('episode: ', episode, 'R: ', R, 'statistics', agent.get_statistics())
    

('episode: ', 0, 'R: ', 31.0, 'statistics', [(u'average_q', 0.009944357832667661), (u'average_loss', 0)])
('episode: ', 10, 'R: ', 152.0, 'statistics', [(u'average_q', 2.311275805222879), (u'average_loss', 0.10093903745184106)])
('episode: ', 20, 'R: ', 262.0, 'statistics', [(u'average_q', 17.980564842392393), (u'average_loss', 0.4097942978721389)])
('episode: ', 30, 'R: ', 286.0, 'statistics', [(u'average_q', 44.657851759475896), (u'average_loss', 0.7363527600786323)])
('episode: ', 40, 'R: ', 265.0, 'statistics', [(u'average_q', 57.30028061447649), (u'average_loss', 0.8300193989091991)])
('episode: ', 50, 'R: ', 969.0, 'statistics', [(u'average_q', 75.08933075314522), (u'average_loss', 0.4982966949476077)])


Error: Tried to reset environment which is not done. While the monitor is active for CartPoleODE-v0, you cannot call reset() unless the episode is over.