In [2]:
import myenv
import gym
from gym import wrappers
import numpy as np
import time
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl

In [3]:
from tqdm import tqdm

In [4]:
gamma = 0.999
alpha = 0.5

In [5]:
class QFunction(chainer.Chain):
    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        super(QFunction, self).__init__()
        with self.init_scope():
            self.l0 = L.Linear(obs_size, n_hidden_channels)
            self.l1 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l2 = L.Linear(n_hidden_channels, n_actions)
    def __call__(self, x, test=False):
        h1 = F.tanh(self.l0(x))
        h2 = F.tanh(self.l1(h1))
        return chainerrl.action_value.DiscreteActionValue(self.l2(h2))

In [6]:
env = gym.make('LiftingODE-v0')
env = wrappers.Monitor(env, 'videos', video_callable=(lambda ep: ep % 10 == 0), force=True)
num_episodes = 20000



In [7]:
q_func = QFunction(env.observation_space.shape[0], env.action_space.n)
optimizer = chainer.optimizers.Adam(1e-2)
optimizer.setup(q_func)
explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
    start_epsilon=1.0, 
    end_epsilon=0.1,
    decay_steps = num_episodes,
    random_action_func = env.action_space.sample
)
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10**6)
phi = lambda x: x.astype(np.float32, copy=False)

In [8]:
agent = chainerrl.agents.DQN(
    q_func, optimizer, replay_buffer, gamma, explorer,
    replay_start_size = 500,
    update_interval = 1,
    target_update_interval = 100,
    phi = phi,
#     gpu = 0
)

In [9]:
for episode in range(num_episodes):
    observation = env.reset()
    done = False
    reward = 0
    R = 0
    while not done and R<20:
        if episode%10==0:
            env.render('human')
        action = agent.act_and_train(observation, reward)
        observation, reward, done, info = env.step(action)
        R += reward
    agent.stop_episode_and_train(observation, reward, done)
    if episode%10==0:
        print('episode: ', episode, 'R :', R, 'statistics :', agent.get_statistics())
        agent.save('agent/agent_' + str(episode))

('episode: ', 0, 'R :', 3.0, 'statistics :', [(u'average_q', 0.08647044220550745), (u'average_loss', 0)])
('episode: ', 10, 'R :', 2.0, 'statistics :', [(u'average_q', 0.3539118973240027), (u'average_loss', 0.00584154601132023)])
('episode: ', 20, 'R :', 0.0, 'statistics :', [(u'average_q', 0.5963866581002584), (u'average_loss', 0.007267065643337132)])
('episode: ', 30, 'R :', 1.0, 'statistics :', [(u'average_q', 1.2502951846538273), (u'average_loss', 0.005970460586989977)])
('episode: ', 40, 'R :', 1.0, 'statistics :', [(u'average_q', 1.8611040247709516), (u'average_loss', 0.011306380839403949)])
('episode: ', 50, 'R :', 2.0, 'statistics :', [(u'average_q', 2.6011256942431387), (u'average_loss', 0.014935865346787472)])
('episode: ', 60, 'R :', 2.0, 'statistics :', [(u'average_q', 2.793401704007595), (u'average_loss', 0.014317843581185126)])
('episode: ', 70, 'R :', 1.0, 'statistics :', [(u'average_q', 3.5483770584134433), (u'average_loss', 0.021430002019112673)])
('episode: ', 80, 'R 

Error: Tried to reset environment which is not done. While the monitor is active for LiftingODE-v0, you cannot call reset() unless the episode is over.