In [0]:
# Install ChainerRL into the notebook runtime via a shell escape; no version is pinned here.
!pip install chainerrl

In [0]:
# Attach Google Drive to the Colab filesystem at the path below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import sys

# Directory on the mounted Drive that is added to the module search path
# (presumably where the project-local `myenv` module lives -- confirm).
DQN_MODULE_DIR = '/content/drive/My Drive/ML/chainer_book/DQN'

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if DQN_MODULE_DIR not in sys.path:
    sys.path.append(DQN_MODULE_DIR)

In [0]:
import myenv
import gym
from gym import wrappers
import numpy as np
import time
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl

In [0]:
!export DISPLAY=':99.0'
!Xvfb :99 -screen 0 1400x900x24 > /dev/null 2>&1 &

In [0]:
from tqdm import tqdm
from pyglet import gl
from pyglet.gl import *

ImportError: ignored

In [0]:
# gamma: handed to the DQN agent constructor further down as its `gamma` argument.
# alpha: not referenced by any other cell visible in this part of the notebook.
gamma, alpha = 0.999, 0.5

In [0]:
class QFunction(chainer.Chain):
    """Fully connected Q-network: two tanh hidden layers and a linear output layer.

    The output layer has one unit per action, and the result is wrapped in
    ``chainerrl.action_value.DiscreteActionValue``.
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        """Create the three linear links.

        Args:
            obs_size: Input size of the first linear layer.
            n_actions: Output size of the last linear layer.
            n_hidden_channels: Width of both hidden layers (default 50).
        """
        super(QFunction, self).__init__()
        with self.init_scope():
            # Registration order l0 -> l1 -> l2 is kept as in the original.
            self.l0 = L.Linear(obs_size, n_hidden_channels)
            self.l1 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l2 = L.Linear(n_hidden_channels, n_actions)

    def __call__(self, x, test=False):
        """Run the network on ``x`` and return the wrapped action values.

        Args:
            x: Input passed to the first linear layer.
            test: Accepted for interface compatibility; not used in the body.

        Returns:
            A ``DiscreteActionValue`` built from the output layer's result.
        """
        hidden = x
        for layer in (self.l0, self.l1):
            hidden = F.tanh(layer(hidden))
        q_values = self.l2(hidden)
        return chainerrl.action_value.DiscreteActionValue(q_values)

In [6]:
# Number of training episodes; the same value is reused further down as the
# explorer's `decay_steps`.
num_episodes = 20000

# Build the 'Lifting-v0' environment (presumably registered by `import myenv` -- confirm).
env = gym.make('Lifting-v0')
# env = gym.wrappers.Monitor(env, 'liffting_video', video_callable=(lambda ep: ep % 10 == 0), force=True)



In [0]:
# Q-network sized from the environment's observation vector length and its
# number of discrete actions.
q_func = QFunction(
    obs_size=env.observation_space.shape[0],
    n_actions=env.action_space.n,
)

# Adam optimizer constructed with 1e-2 as its first argument, bound to the Q-network.
optimizer = chainer.optimizers.Adam(1e-2)
optimizer.setup(q_func)

# Epsilon-greedy exploration with epsilon moving linearly from 1.0 to 0.1.
# NOTE(review): ChainerRL documents `decay_steps` as a number of agent steps,
# not episodes -- confirm that decaying over `num_episodes` steps is intended.
explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
    start_epsilon=1.0,
    end_epsilon=0.1,
    decay_steps=num_episodes,
    random_action_func=env.action_space.sample,
)

# Experience replay storage with room for 10**6 entries.
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10**6)


def phi(x):
    """Cast an observation to float32, avoiding a copy where NumPy allows it."""
    return x.astype(np.float32, copy=False)

In [0]:
# Assemble the DQN agent from the Q-network, optimizer, replay buffer, gamma
# and explorer defined in earlier cells; the keyword options are passed through
# with the same values as before.
agent = chainerrl.agents.DQN(
    q_func,
    optimizer,
    replay_buffer,
    gamma,
    explorer,
    phi=phi,
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
    # gpu=0
)

In [10]:
# env = wrappers.Monitor(env, '/content/drive/My Drive/ML/chainer_book/DQN/videos', video_callable=(lambda ep: ep % 100 == 0), force=True)

# An episode is cut short once its accumulated reward reaches this value.
EPISODE_RETURN_CAP = 10
# Progress is printed and the agent is saved every this many episodes.
CHECKPOINT_INTERVAL = 100

for episode in range(num_episodes):
    observation = env.reset()
    done = False
    reward = 0  # value handed to the first act_and_train call of the episode
    R = 0       # reward accumulated over the current episode
    while not done and R < EPISODE_RETURN_CAP:
        action = agent.act_and_train(observation, reward)
        observation, reward, done, info = env.step(action)
        R += reward
    # `done` is still False here when the loop stopped because of the reward cap.
    agent.stop_episode_and_train(observation, reward, done)
    if episode % CHECKPOINT_INTERVAL == 0:
        # Build the line as a single string: under Python 2, print(a, b, ...)
        # prints the repr of a tuple, whereas this prints the same plain text
        # on both Python 2 and Python 3.
        print('episode:  {} R : {} statistics : {}'.format(
            episode, R, agent.get_statistics()))
        agent.save('agent/agent_' + str(episode))

('episode: ', 0, 'R :', 2.0, 'statistics :', [(u'average_q', -0.007513555959701296), (u'average_loss', 0), (u'n_updates', 0)])
('episode: ', 100, 'R :', 0.0, 'statistics :', [(u'average_q', 1.3066995356155904), (u'average_loss', 0.0171602198349568), (u'n_updates', 3637)])
('episode: ', 200, 'R :', 1.0, 'statistics :', [(u'average_q', 2.8373330588281784), (u'average_loss', 0.024963693443963084), (u'n_updates', 7360)])
('episode: ', 300, 'R :', 3.0, 'statistics :', [(u'average_q', 4.9562329013195265), (u'average_loss', 0.04838513058168517), (u'n_updates', 13309)])
('episode: ', 400, 'R :', 9.0, 'statistics :', [(u'average_q', 12.181255788085656), (u'average_loss', 0.1903241491844252), (u'n_updates', 23987)])
('episode: ', 500, 'R :', 1.0, 'statistics :', [(u'average_q', 18.98941568451045), (u'average_loss', 0.28050252274170295), (u'n_updates', 36219)])
('episode: ', 600, 'R :', 1.0, 'statistics :', [(u'average_q', 26.41136568346018), (u'average_loss', 0.3065977784067083), (u'n_updates', 