In [2]:
from __future__ import print_function

import logging
import sys

import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import gym
import numpy as np

In [3]:
# Build the CartPole environment and report the spaces it exposes.
env = gym.make('CartPole-v0')
for label, space in (('observation space:', env.observation_space),
                     ('action space:', env.action_space)):
    print(label, space)

# Start an episode; reset() hands back the first observation.
obs = env.reset()
# env.render()  # optional: render the environment
print('initial observation:', obs)

# Take one randomly sampled action and show everything step() returns.
action = env.action_space.sample()
obs, r, done, info = env.step(action)
for label, value in (('next observation:', obs),
                     ('reward:', r),
                     ('done:', done),
                     ('info:', info)):
    print(label, value)

[2017-09-19 20:21:53,276] Making new env: CartPole-v0


('observation space:', Box(4,))
('action space:', Discrete(2))
('initial observation:', array([-0.03789894,  0.0038022 ,  0.0068747 , -0.048693  ]))
('next observation:', array([-0.0378229 , -0.19141765,  0.00590084,  0.24615101]))
('reward:', 1.0)
('done:', False)
('info:', {})


In [4]:
# Sizes taken from the environment: first axis of the observation shape and
# the number of discrete actions.
obs_size, n_actions = env.observation_space.shape[0], env.action_space.n

In [5]:
# ChainerRL's predefined Q-function for discrete actions, configured with
# 2 hidden layers of 50 channels each.
q_func = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
    obs_size,
    n_actions,
    n_hidden_channels=50,
    n_hidden_layers=2,
)

In [6]:
# Adam optimizer (eps = 0.01) attached to the Q-function.
optimizer = chainer.optimizers.Adam(eps=0.01)
optimizer.setup(q_func)

In [7]:
# Discount factor applied to future rewards.
gamma = 0.95

In [8]:
# Exploration policy: epsilon-greedy with a constant epsilon of 0.3; random
# actions are sampled from the environment's action space.
explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    random_action_func=env.action_space.sample,
    epsilon=0.3,
)


In [9]:
# DQN relies on experience replay, so create the buffer it samples from.
# The capacity is one million entries.
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=1000000)


In [10]:
def phi(x):
    """Feature extractor: return the observation as a float32 array.

    Observations from CartPole-v0 are numpy.float64, while Chainer only
    accepts numpy.float32 by default, so the agent is given this converter.
    ``copy=False`` lets NumPy hand back the input itself when it is already
    float32 instead of allocating a new array.

    Args:
        x: Observation array (any object with a NumPy-style ``astype``).

    Returns:
        ``x`` converted to ``numpy.float32``.
    """
    return x.astype(np.float32, copy=False)


In [11]:
# Assemble the Double DQN agent from the components defined above; this is
# the object that interacts with the environment.
agent = chainerrl.agents.DoubleDQN(
    q_func,
    optimizer,
    replay_buffer,
    gamma,
    explorer,
    phi=phi,
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
)

In [12]:
n_episodes = 200
max_episode_len = 200
for episode in range(1, n_episodes + 1):
    # Fresh episode: initial observation, no reward yet, not finished.
    obs, reward, done = env.reset(), 0, False
    R = 0  # return (sum of rewards)
    # t is the number of steps taken so far; stop early once done is set.
    for t in range(1, max_episode_len + 1):
        # Uncomment to watch the behaviour
        # env.render()
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        R += reward
        if done:
            break
    # Report progress on every tenth episode.
    if episode % 10 == 0:
        print('episode:', episode,
              'R:', R,
              'statistics:', agent.get_statistics())
    # Hand the final transition of the episode to the agent.
    agent.stop_episode_and_train(obs, reward, done)
print('Finished.')

('episode:', 10, 'R:', 9.0, 'statistics:', [(u'average_q', -0.02210699027348437), (u'average_loss', 0)])
('episode:', 20, 'R:', 8.0, 'statistics:', [(u'average_q', -0.04065060468379197), (u'average_loss', 0)])
('episode:', 30, 'R:', 16.0, 'statistics:', [(u'average_q', -0.05744586046366472), (u'average_loss', 0)])
('episode:', 40, 'R:', 13.0, 'statistics:', [(u'average_q', -0.07290433053616992), (u'average_loss', 0)])
('episode:', 50, 'R:', 12.0, 'statistics:', [(u'average_q', -0.03458843703789683), (u'average_loss', 0.1234621904767346)])
('episode:', 60, 'R:', 10.0, 'statistics:', [(u'average_q', 0.10075702616147279), (u'average_loss', 0.22860829044088962)])
('episode:', 70, 'R:', 9.0, 'statistics:', [(u'average_q', 0.3482971352422558), (u'average_loss', 0.2680381093193151)])
('episode:', 80, 'R:', 12.0, 'statistics:', [(u'average_q', 0.6775768589941548), (u'average_loss', 0.29100983285419463)])
('episode:', 90, 'R:', 10.0, 'statistics:', [(u'average_q', 1.049088807338526), (u'average

In [13]:
# Run 10 rendered test episodes using agent.act (not act_and_train),
# each capped at 200 steps.
for test_episode in range(10):
    obs, done = env.reset(), False
    R = 0  # sum of rewards in this test episode
    t = 0  # steps taken in this test episode
    while t < 200 and not done:
        env.render()
        action = agent.act(obs)
        obs, r, done, _ = env.step(action)
        R, t = R + r, t + 1
    print('test episode:', test_episode, 'R:', R)
    agent.stop_episode()

('test episode:', 0, 'R:', 9.0)
('test episode:', 1, 'R:', 8.0)
('test episode:', 2, 'R:', 10.0)
('test episode:', 3, 'R:', 8.0)
('test episode:', 4, 'R:', 8.0)
('test episode:', 5, 'R:', 10.0)
('test episode:', 6, 'R:', 9.0)
('test episode:', 7, 'R:', 10.0)
('test episode:', 8, 'R:', 10.0)
('test episode:', 9, 'R:', 10.0)


In [14]:
# Write the agent out to the 'agent' directory.
agent_dir = 'agent'
agent.save(agent_dir)

In [15]:
ls

[34mAirbnb[m[m/
[31mAnaconda.sh[m[m*
[31mAnaconda3-4.2.0-Linux-x86_64.sh[m[m*
[34mAndroidStudioProjects[m[m/
[34mApplications[m[m/
[34mAurius Expense[m[m/
[34mBurningMan[m[m/
[34mCarND-Keras-Lab[m[m/
ChainerRL_Pole.ipynb
[34mCooking[m[m/
[34mCreative Cloud Files[m[m/
[34mDLPython[m[m/
[34mDeep-Learning-Experiments[m[m/
[34mDeepLearning_Presentation[m[m/
DeepLearning_Presentation.tar.gz
[34mDesktop[m[m/
Dockerfile.devel
[34mDocuments[m[m/
[34mDownloads[m[m/
[34mDropbox[m[m/
FullSizeRender.jpg
[34mHtml[m[m/
[34mHuman_Level_Control_through_Deep_Reinforcement_Learning[m[m/
IMG_0611.m4v
[34mIOT[m[m/
[34mIdeaProjects[m[m/
[34mIdeas[m[m/
[34mJSAnimation[m[m/
[34mLibrary[m[m/
[34mLinh[m[m/
[34mMedical[m[m/
[34mMedical Receipts[m[m/
[34mMovies[m[m/
[34mMusic[m[m/
[34mNotes[m[m/
[34mNumpy[m[m/
[34mPayments[m[m/
[34mPictures[m[m/
Presentation2.pptx
[34mPublic

In [None]:
# Route INFO-level log records to stdout so they show up in the notebook.
# gym.undo_logger_setup() turns off gym's default logger settings, but the
# function is not available in every gym release, so call it only if present.
if hasattr(gym, 'undo_logger_setup'):
    gym.undo_logger_setup()
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

# Continue training the same agent with ChainerRL's built-in experiment loop.
chainerrl.experiments.train_agent_with_evaluation(
    agent, env,
    steps=2000,           # Train the agent for 2000 steps
    eval_n_runs=10,       # 10 episodes are sampled for each evaluation
    max_episode_len=200,  # Maximum length of each episode
    eval_interval=1000,   # Evaluate the agent after every 1000 steps
    outdir='result')      # Save everything to 'result' directory

outdir:result step:15 episode:0 R:15.0
statistics:[(u'average_q', 12.368850708979599), (u'average_loss', 0.585744226739545)]
outdir:result step:25 episode:1 R:10.0
statistics:[(u'average_q', 12.427505000192768), (u'average_loss', 0.585453625865528)]
outdir:result step:37 episode:2 R:12.0
statistics:[(u'average_q', 12.493370433376787), (u'average_loss', 0.5838082273837416)]
outdir:result step:51 episode:3 R:14.0
statistics:[(u'average_q', 12.5585641897944), (u'average_loss', 0.591990578029855)]
outdir:result step:65 episode:4 R:14.0
statistics:[(u'average_q', 12.628557601632549), (u'average_loss', 0.5916609175030797)]
outdir:result step:86 episode:5 R:21.0
statistics:[(u'average_q', 12.732738894275212), (u'average_loss', 0.6367328541630722)]
outdir:result step:95 episode:6 R:9.0
statistics:[(u'average_q', 12.785399766745888), (u'average_loss', 0.6297449793037597)]
outdir:result step:107 episode:7 R:12.0
statistics:[(u'average_q', 12.852770482330733), (u'average_loss', 0.6278774889204287