In [1]:
from __future__ import print_function
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import numpy as np
import gym
import logging

In [2]:
from tqdm import tqdm

In [3]:
def phi(obs):
    """Cast an observation to ``float32`` before it is handed to the model.

    Args:
        obs: Object exposing ``astype`` (presumably the NumPy array returned
            by the environment).

    Returns:
        The result of ``obs.astype(np.float32)``.
    """
    as_float32 = obs.astype(np.float32)
    return as_float32

In [4]:
class A3CLSTMSoftmax(chainer.Chain, chainerrl.agents.a3c.A3CModel, chainerrl.recurrent.RecurrentChainMixin):
    """Recurrent A3C model with a shared CNN trunk and separate policy/value heads.

    Three convolution layers (each followed by ReLU and 2x2 max pooling) are
    shared by both heads. After the trunk, the policy branch
    (``l4p`` -> ``l5p`` -> ``pi``) and the value branch
    (``l4v`` -> ``l5v`` -> ``v``) each own an LSTM and a dense layer.
    The policy head is a softmax over 6 outputs; the value head is a scalar.
    """

    def __init__(self):
        super(A3CLSTMSoftmax, self).__init__()
        with self.init_scope():
            # Shared convolutional trunk (3 -> 16 -> 32 -> 64 channels).
            self.conv1 = L.Convolution2D(3, 16, (11, 9), 1, 0)
            self.conv2 = L.Convolution2D(16, 32, (11, 9), 1, 0)
            self.conv3 = L.Convolution2D(32, 64, (10, 9), 1, 0)
            # One LSTM per head, both fed with the 14976 trunk features.
            self.l4p = L.LSTM(14976, 1024)  # policy network
            self.l4v = L.LSTM(14976, 1024)  # value network
            # One dense layer per head.
            self.l5p = L.Linear(1024, 1024)
            self.l5v = L.Linear(1024, 1024)
            # Output layers: softmax policy over 6 outputs and a scalar value.
            self.pi = chainerrl.policies.SoftmaxPolicy(L.Linear(1024, 6))
            self.v = L.Linear(1024, 1)

    def pi_and_v(self, state):
        """Compute the policy output and the value output for ``state``.

        Args:
            state: 4-D array whose last axis is moved to position 1 before
                the convolutions (presumably batch, height, width, channel
                -- confirm against the environment's observations).

        Returns:
            Tuple ``(pout, vout)``: the output of the softmax policy head and
            the output of the value head.
        """
        features = np.asarray(state.transpose(0, 3, 1, 2), dtype=np.float32)
        # Shared trunk: conv -> ReLU -> 2x2 max pooling, three times.
        for conv in (self.conv1, self.conv2, self.conv3):
            features = F.max_pooling_2d(F.relu(conv(features)), ksize=2, stride=2)
        # Each head runs its own LSTM followed by its own dense layer.
        policy_hidden = F.relu(self.l5p(self.l4p(features)))
        value_hidden = F.relu(self.l5v(self.l4v(features)))
        return self.pi(policy_hidden), self.v(value_hidden)

In [5]:
# Number of worker processes handed to the asynchronous trainer.
n_process = 8
# Directory passed to the Monitor wrapper and to the trainer as output location.
outdir = 'result'
chainerrl.misc.set_random_seed(0)
# One seed per process index (0 .. n_process - 1); looked up inside make_env.
process_seeds = np.arange(n_process)

In [6]:
def make_env(process_idx, test=False):
    """Create and seed a SpaceInvaders-v0 environment for one worker process.

    Args:
        process_idx: Index into the module-level ``process_seeds`` array.
        test: When true, rewards are left unscaled, no Monitor is attached and
            the seed is taken from the top of the 32-bit range instead.

    Returns:
        The seeded environment (wrapped in ``gym.wrappers.Monitor`` for
        process 0 when ``test`` is false).
    """
    def scale_reward(reward):
        # Scale rewards to 1/100 of their raw value.
        return reward*0.01

    env = gym.make('SpaceInvaders-v0')
    process_seed = int(process_seeds[process_idx])
    if test:
        # Mirror the seed from the top of the 32-bit range.
        env_seed = 2 ** 32 - 1 - process_seed
    else:
        env_seed = process_seed
        chainerrl.misc.env_modifiers.make_reward_filtered(env, scale_reward)
        if process_idx == 0:
            # Only the first worker is wrapped in a Monitor writing to outdir.
            env = gym.wrappers.Monitor(env, outdir)
    env.seed(env_seed)
    return env

In [7]:
# NOTE(review): despite its name, this value is later passed to
# train_agent_async as `max_episode_len`, not used as a number of episodes.
num_episodes = 100000

In [8]:
# Build the model and attach the asynchronous RMSprop optimizer to it.
model = A3CLSTMSoftmax()
optimizer = chainerrl.optimizers.rmsprop_async.RMSpropAsync(lr=0.001, eps=0.1, alpha=0.99)
optimizer.setup(model)

<chainerrl.optimizers.rmsprop_async.RMSpropAsync at 0x7f4a4ef035c0>

In [9]:
# A3C agent built from the model and optimizer above; `phi` converts each
# observation to float32 before it reaches the model.
agent = chainerrl.agents.a3c.A3C(
    model, 
    optimizer, 
    t_max = 8,
    gamma = 0.995,
    beta = 0.1,
    phi = phi,
)

In [10]:
# Show logs for debugging: gym logger level set to 0, standard logging at DEBUG.
gym.logger.set_level(0)
logging.basicConfig(level=logging.DEBUG)

In [0]:
# Start asynchronous training with `n_process` workers; each worker builds its
# own environment through `make_env`.
# NOTE(review): `num_episodes` is forwarded as `max_episode_len`, so it acts as
# a per-episode length limit rather than an episode count -- confirm intent.
chainerrl.experiments.train_agent_async(
    agent=agent,
    outdir=outdir,
    processes=n_process,
    make_env=make_env,
    profile=True,
    steps=1000000,
    eval_interval=None,
    max_episode_len=num_episodes,
    logger=gym.logger,
)

INFO: Making new env: SpaceInvaders-v0
INFO: Making new env: SpaceInvaders-v0
INFO: Making new env: SpaceInvaders-v0
INFO: Making new env: SpaceInvaders-v0
INFO: Making new env: SpaceInvaders-v0
INFO: Making new env: SpaceInvaders-v0
INFO: Making new env: SpaceInvaders-v0
INFO: Making new env: SpaceInvaders-v0
INFO: Creating monitor directory result
INFO: Starting new video recorder writing to /content/result/openaigym.video.0.210.video000000.mp4
DEBUG: Capturing video frame: path=/content/result/openaigym.video.0.210.video000000.mp4
DEBUG: Starting ffmpeg with "ffmpeg -nostats -loglevel error -y -r 30 -f rawvideo -s:v 160x210 -pix_fmt rgb24 -i - -vcodec libx264 -pix_fmt yuv420p /content/result/openaigym.video.0.210.video000000.mp4"


DEBUG:chainerrl.agents.a3c:t:1 r:0 a:4 pout:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[ 0.10047702 -0.03848258 -0.063296   -0.23922749 -0.17477234 -0.46331877]] probs:[[0.21010141 0.18284352 0.17836238 0.14958815 0.15954737 0.11955714]] entropy:[1.7769206]


DEBUG: Capturing video frame: path=/content/result/openaigym.video.0.210.video000000.mp4


DEBUG:chainerrl.agents.a3c:t:2 r:0.0 a:0 pout:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[ 0.17721948 -0.10529566 -0.08396527 -0.24916878 -0.17999867 -0.5354867 ]] probs:[[0.22899948 0.17263931 0.17636134 0.14950524 0.16021259 0.11228199]] entropy:[1.7698739]


DEBUG: Capturing video frame: path=/content/result/openaigym.video.0.210.video000000.mp4


DEBUG:chainerrl.agents.a3c:t:3 r:0.0 a:4 pout:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[ 0.15943252 -0.13753036 -0.08677285 -0.23704201 -0.18496138 -0.5467509 ]] probs:[[0.22728851 0.16889164 0.17768544 0.15289412 0.16106795 0.11217237]] entropy:[1.7707405]


DEBUG: Capturing video frame: path=/content/result/openaigym.video.0.210.video000000.mp4


DEBUG:chainerrl.agents.a3c:t:4 r:0.0 a:4 pout:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[ 0.07777552 -0.17124036 -0.0807454  -0.22030777 -0.21522921 -0.49890226]] probs:[[0.21353868 0.16646783 0.182235   0.15849684 0.15930383 0.11995782]] entropy:[1.7773774]


DEBUG: Capturing video frame: path=/content/result/openaigym.video.0.210.video000000.mp4


DEBUG:chainerrl.agents.a3c:t:5 r:0.0 a:0 pout:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[ 0.06859897 -0.17762242 -0.08160009 -0.21894354 -0.21987191 -0.49692038]] probs:[[0.21232034 0.16598125 0.18270944 0.15926248 0.1591147  0.12061177]] entropy:[1.7778729]


DEBUG: Capturing video frame: path=/content/result/openaigym.video.0.210.video000000.mp4


DEBUG:chainerrl.agents.a3c:t:6 r:0.0 a:4 pout:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[ 0.06144023 -0.18308373 -0.07998002 -0.21936794 -0.22319332 -0.4950864 ]] probs:[[0.21133333 0.16549033 0.18346363 0.15959327 0.15898393 0.12113553]] entropy:[1.7782129]


DEBUG: Capturing video frame: path=/content/result/openaigym.video.0.210.video000000.mp4


DEBUG:chainerrl.agents.a3c:t:7 r:0.0 a:4 pout:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[ 0.12597479 -0.16489074 -0.08105923 -0.23151934 -0.19576657 -0.5424179 ]] probs:[[0.22234808 0.16623102 0.18076721 0.15551622 0.16117693 0.11396045]] entropy:[1.7729106]


DEBUG: Capturing video frame: path=/content/result/openaigym.video.0.210.video000000.mp4


DEBUG:chainerrl.agents.a3c:t:8 r:0.0 a:1 pout:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[ 0.12220032 -0.17342998 -0.07492602 -0.23524088 -0.19677885 -0.5447308 ]] probs:[[0.22198617 0.16517156 0.18226993 0.1552713  0.16135968 0.11394138]] entropy:[1.772862]


DEBUG: Capturing video frame: path=/content/result/openaigym.video.0.210.video000000.mp4


DEBUG:chainerrl.agents.a3c:pi_loss:[-1.5852811] v_loss:[[0.00061139]]
DEBUG:chainerrl.agents.a3c:grad norm:322.08468898218257


In [0]:
#-*- coding: utf-8 -*-
"""
Training on the Space Invaders game (A3C version)
Copyright(c) Hiromitsu Nishizaki and Koji Makino All Rights Reserved.
"""
from __future__ import print_function
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import numpy as np
import gym
import logging

# Type-conversion helper
def phi(obs):
    """Cast an observation to ``float32`` before it is handed to the model.

    Args:
        obs: Object exposing ``astype`` (presumably the NumPy array returned
            by the environment).

    Returns:
        The result of ``obs.astype(np.float32)``.
    """
    as_float32 = obs.astype(np.float32)
    return as_float32

# A3C LSTM Softmax
class A3CLSTMSoftmax(chainer.Chain, chainerrl.agents.a3c.A3CModel, chainerrl.recurrent.RecurrentChainMixin):
    """Recurrent A3C model with a shared CNN trunk and separate policy/value heads.

    The three convolution layers are shared by the policy network and the
    value function. After the trunk, the policy branch
    (``l4p`` -> ``l5p`` -> ``pi``) and the value branch
    (``l4v`` -> ``l5v`` -> ``v``) each own an LSTM and a dense layer.
    """

    # The CNN part is shared between the policy network and the value function.
    def __init__(self):
        super(A3CLSTMSoftmax, self).__init__()
        with self.init_scope():
            self.conv1 = L.Convolution2D(3, 16, (11, 9), 1, 0)   # first convolution layer (16 channels)
            self.conv2 = L.Convolution2D(16, 32, (11, 9), 1, 0)  # second convolution layer (32 channels)
            self.conv3 = L.Convolution2D(32, 64, (10, 9), 1, 0)  # third convolution layer (64 channels)
            self.l4p = L.LSTM(14976, 1024)  # policy network
            self.l4v = L.LSTM(14976, 1024)  # value network
            self.l5p = L.Linear(1024, 1024)
            self.l5v = L.Linear(1024, 1024)
            self.pi = chainerrl.policies.SoftmaxPolicy(L.Linear(1024, 6))  # policy network
            self.v = L.Linear(1024, 1)  # value network

    def pi_and_v(self, state):
        """Compute the policy output and the value output for ``state``.

        Args:
            state: 4-D array whose last axis is moved to position 1 before
                the convolutions (presumably batch, height, width, channel
                -- confirm against the environment's observations).

        Returns:
            Tuple ``(pout, vout)``: the output of the softmax policy head and
            the output of the value head.
        """
        state = np.asarray(state.transpose(0, 3, 1, 2), dtype=np.float32)
        h1 = F.max_pooling_2d(F.relu(self.conv1(state)), ksize=2, stride=2)
        h2 = F.max_pooling_2d(F.relu(self.conv2(h1)), ksize=2, stride=2)
        h3 = F.max_pooling_2d(F.relu(self.conv3(h2)), ksize=2, stride=2)  # shared up to here
        h4p = self.l4p(h3)
        h4v = self.l4v(h3)
        h5p = F.relu(self.l5p(h4p))
        # The value branch must go through its own dense layer (l5v), not the
        # policy branch's l5p; otherwise l5v is never used or trained.
        h5v = F.relu(self.l5v(h4v))
        pout = self.pi(h5p)  # output of the policy network
        vout = self.v(h5v)  # output of the value network
        return pout, vout


# Main function
def main():
    """Train the A3C agent on SpaceInvaders-v0 using asynchronous workers.

    Sets up seeds, the per-process environment factory, the model, the
    optimizer and the agent, enables debug logging, and then runs
    ``chainerrl.experiments.train_agent_async``.
    """
    # Basic settings (number of worker processes and output directory).
    n_process = 4
    outdir = 'result'

    # Space Invaders environment setup: global seed plus one seed per process.
    chainerrl.misc.set_random_seed(0)
    process_seeds = np.arange(n_process)

    def make_env(process_idx, test=False):
        """Create and seed the environment for worker ``process_idx``."""
        def scale_reward(reward):
            # Scale rewards to 1/100 of their raw value.
            return reward * 0.01

        def record_video(ep):
            # Same predicate as the original lambda passed to the Monitor.
            return ep % 1 == 0

        env = gym.make('SpaceInvaders-v0')
        process_seed = int(process_seeds[process_idx])
        if test:
            # Mirror the seed from the top of the 32-bit range.
            env_seed = 2 ** 32 - 1 - process_seed
        else:
            env_seed = process_seed
            chainerrl.misc.env_modifiers.make_reward_filtered(env, scale_reward)
            if process_idx == 0:
                # Only the first worker is wrapped in a Monitor writing to outdir.
                env = gym.wrappers.Monitor(env, outdir, video_callable=record_video, force=True)
        env.seed(env_seed)
        return env

    # Reinforcement-learning parameter.
    # NOTE(review): despite its name, this value is passed below as
    # `max_episode_len`, not used as a number of episodes -- confirm intent.
    num_episodes = 10

    # A3C setup: model, asynchronous RMSprop optimizer and agent.
    model = A3CLSTMSoftmax()
    optimizer = chainerrl.optimizers.rmsprop_async.RMSpropAsync(lr=0.001, eps=0.1, alpha=0.99)
    optimizer.setup(model)

    agent = chainerrl.agents.a3c.A3C(
        model,
        optimizer,
        t_max=8,
        gamma=0.995,
        beta=0.1,
        phi=phi,
    )

    # Show logs for debugging.
    gym.logger.set_level(0)
    logging.basicConfig(level=logging.DEBUG)

    # Run the episodes and start reinforcement learning (using the trainer).
    training_options = dict(
        agent=agent,
        outdir=outdir,
        processes=n_process,
        make_env=make_env,
        profile=True,
        steps=1000000,
        eval_interval=None,
        max_episode_len=num_episodes,
        logger=gym.logger,
    )
    chainerrl.experiments.train_agent_async(**training_options)

# Start training only when this code runs as the main module.
if __name__ == '__main__':
    main()