In [2]:
# coding:utf-8
"""
リフティングのDQNプログラム（ODE使用）
Copyright(c) 2018 Koji Makino and Hiromitsu Nishizaki All Rights Reserved.
"""
import myenv
import gym  #倒立振子(cartpole)の実行環境
from gym import wrappers  #gymの画像保存
import numpy as np
import time
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
 

gamma = 0.999
alpha = 0.5

# Q-関数の定義
class QFunction(chainer.Chain):
    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        super().__init__()
        with self.init_scope():
            self.l0=L.Linear(obs_size, n_hidden_channels)
            self.l1=L.Linear(n_hidden_channels, n_hidden_channels)
            self.l2=L.Linear(n_hidden_channels, n_actions)

    def __call__(self, x, test=False):
        h1 = F.tanh(self.l0(x))#leaky_relu
        h2 = F.tanh(self.l1(h1))#leaky_relu
        return chainerrl.action_value.DiscreteActionValue(self.l2(h2))

env = gym.make('Lifting-v0')
env = gym.wrappers.Monitor(env, 'videos', force=True, video_callable=(lambda ep: ep % 100 == 0))
num_episodes = 20000  #総試行回数
q_func = QFunction(env.observation_space.shape[0], env.action_space.n)
optimizer = chainer.optimizers.Adam(eps=1e-2)
optimizer.setup(q_func)
explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(start_epsilon=1.0, end_epsilon=0.1, decay_steps=num_episodes, random_action_func=env.action_space.sample)
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
phi = lambda x: x.astype(np.float32, copy=False)
agent = chainerrl.agents.DQN(
    q_func, optimizer, replay_buffer, gamma, explorer,
    replay_start_size=500, update_interval=1, target_update_interval=100, phi=phi)

for episode in range(num_episodes):  #試行数分繰り返す
    observation = env.reset()
    done = False
    reward = 0
    R = 0
    while not done and R<10:
        if episode%10==0:
            env.render('human')
        action = agent.act_and_train(observation, reward)
        observation, reward, done, info = env.step(action)
        R += reward
    agent.stop_episode_and_train(observation, reward, done)
    if episode % 10 == 0:
        print('episode:', episode,
              'R:', R,
              'statistics:', agent.get_statistics())
agent.save('agent')


episode: 0 R: 0.0 statistics: [('average_q', -0.00550827032901434), ('average_loss', 0), ('n_updates', 0)]
episode: 10 R: 0.0 statistics: [('average_q', -0.06895247450791817), ('average_loss', 0), ('n_updates', 0)]
episode: 20 R: 0.0 statistics: [('average_q', -0.08239578676854652), ('average_loss', 0.007163189883062057), ('n_updates', 301)]
episode: 30 R: 0.0 statistics: [('average_q', -0.055954282744113), ('average_loss', 0.007353568374299347), ('n_updates', 613)]
episode: 40 R: 0.0 statistics: [('average_q', 0.015184216981965986), ('average_loss', 0.008122228113647493), ('n_updates', 966)]
episode: 50 R: 1.0 statistics: [('average_q', 0.16944947422874407), ('average_loss', 0.010650966451022634), ('n_updates', 1486)]
episode: 60 R: 1.0 statistics: [('average_q', 0.26680623872219733), ('average_loss', 0.008888552787466789), ('n_updates', 1798)]
episode: 70 R: 1.0 statistics: [('average_q', 0.42298978167615764), ('average_loss', 0.009401562992188089), ('n_updates', 2336)]
episode: 80 R

Error: Tried to reset environment which is not done. While the monitor is active for Lifting-v0, you cannot call reset() unless the episode is over.