# A simple example using A3C
We begin by importing the required packages; see also the ChainerRL A3C example script: https://github.com/chainer/chainerrl/blob/master/examples/atari/a3c/train_a3c.py

In [1]:
import chainer
import chainer.functions as F
import chainerrl
from chainerrl.agents import a3c
from chainerrl import experiments
import chainer.links as L
from chainerrl import misc
from chainerrl.optimizers.nonbias_weight_decay import NonbiasWeightDecay
from chainerrl.optimizers import rmsprop_async
from chainerrl import policy
from chainerrl import v_function

import multiprocessing as mp
import os

import gym
import numpy as np
from pyCICY import CICY

Define the CICY we want to study.

In [4]:
# Matrix handed to pyCICY's CICY constructor, written one row per line.
# NOTE(review): presumably the configuration matrix of the manifold — confirm
# the expected row/column layout against the pyCICY documentation.
M = CICY([
    [1, 0, 1, 1],
    [1, 0, 1, 1],
    [1, 1, 1, 0],
    [1, 1, 1, 0],
    [1, 1, 0, 1],
    [1, 1, 0, 1],
])

Next, we register and load the gym environment.

In [5]:
# Register the custom environment under the id 'CICY-v0' so gym.make can build it.
# NOTE(review): the meaning of `r` and `max` is defined by the flip_4 entry point,
# which is not visible here — confirm against gym_CICYlbmodels.
gym.envs.register(
    id='CICY-v0',
    entry_point='gym_CICYlbmodels.envs.env_flip_4:flip_4',
    kwargs=dict(M=M, r=2, max=2),
)

In [6]:
# Build one instance of the 'CICY-v0' environment for interactive inspection
# and for the evaluation loops further down.
env = gym.make('CICY-v0')

  result = entry_point.load(False)


In [7]:
# Show the spaces first, then reset and take one randomly sampled step to see
# what the environment hands back.
for label, space in (('observation space:', env.observation_space),
                     ('action space:', env.action_space)):
    print(label, space)

obs = env.reset()
print('initial observation:', obs)

action = env.action_space.sample()
obs, r, done, info = env.step(action)
for label, value in (('next observation:', obs), ('reward:', r),
                     ('done:', done), ('info:', info)):
    print(label, value)

observation space: Box(5, 6)
action space: Discrete(48)
initial observation: [[ 0  1  0  0  1  0]
 [ 1  1 -1  1 -1  1]
 [ 1 -1 -1  1  0 -1]
 [ 0  0  0 -1  0 -1]
 [-2 -1  2 -1  0  1]]
next observation: [[ 0  1  0  0  1  0]
 [ 1  1 -1  1 -1  1]
 [ 1 -1 -1  1  0 -1]
 [ 0  0 -1 -1  0 -1]
 [-2 -1  3 -1  0  1]]
reward: -0.6000000000000001
done: False
info: {}


Set some hyperparameters.

In [8]:
# --- Optimizer settings (consumed by RMSpropAsync and its hooks) ---
lr = 7e-4
eps = 1e-2
alpha = 0.99
weight_decay = 0.0  # the weight-decay hook is only added when this is > 0

# --- Agent settings (passed to a3c.A3C) ---
gamma = 0.999
beta = 1e-2
t_max = 5

# --- Model dimensions ---
obs_size = 5 * M.len  # input width of the network's first linear layer
n_actions = env.action_space.n

# --- Training / evaluation schedule ---
nsteps = 1000
max_episode_len = 10000
eval_interval = 50000
eval_n_steps = 5 * 10 ** 4
eval_n_episodes = 2
processes = 4
outdir = 'data_a3c'

# --- Seeding: one distinct seed per worker process ---
seed = 1
process_seeds = seed * processes + np.arange(processes)

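Define the shared network body (the class is named `QFunction`, but it is used below as the feature extractor feeding the policy and value heads).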

In [9]:
class QFunction(chainer.Chain):
    """Fully connected network with four linear layers and tanh activations.

    Despite its name, the output is a tanh-squashed vector of width
    ``n_output_channels`` rather than a set of action values.
    """

    def __init__(self, obs_size, n_output_channels, n_hidden_channels=50):
        """Create the four linear layers.

        Args:
            obs_size (int): Input width of the first layer.
            n_output_channels (int): Output width of the last layer.
            n_hidden_channels (int): Width of the three hidden layers.
        """
        # Plain attributes recording the input/output widths for callers.
        self.n_input_channels = obs_size
        self.n_output_channels = n_output_channels

        super().__init__()

        width = n_hidden_channels
        with self.init_scope():
            # Created in the order l0..l3; the attribute names are the link names.
            self.l0 = L.Linear(obs_size, width)
            self.l1 = L.Linear(width, width)
            self.l2 = L.Linear(width, width)
            self.l3 = L.Linear(width, n_output_channels)

    def __call__(self, x, test=False):
        """Apply every layer followed by tanh and return the result.

        Args:
            x (ndarray or chainer.Variable): An observation
            test (bool): a flag indicating whether it is in test mode
                (accepted but not used by this implementation)
        """
        h = x
        for layer in (self.l0, self.l1, self.l2, self.l3):
            h = F.tanh(layer(h))
        return h

In [10]:
class A3CFF(chainer.ChainList, a3c.A3CModel):
    """A3C model: a shared QFunction body followed by a softmax policy and a value head."""

    def __init__(self, n_input, n_actions, n_hidden):
        """Build the shared body and both heads.

        Args:
            n_input: Input width handed to the shared body.
            n_actions: Number of discrete actions for the softmax policy.
            n_hidden: Output width of the shared body, i.e. the input width of
                both heads. The body's own hidden width keeps its default.
        """
        self.head = QFunction(obs_size=n_input, n_output_channels=n_hidden)
        feature_size = self.head.n_output_channels
        self.pi = policy.FCSoftmaxPolicy(feature_size, n_actions)
        self.v = v_function.FCVFunction(feature_size)
        super().__init__(self.head, self.pi, self.v)

    def pi_and_v(self, state):
        """Return the policy output and the value output for ``state``."""
        features = self.head(state)
        return self.pi(features), self.v(features)

In [11]:
# 200 is the width of the shared feature vector fed to the policy and value heads.
model = A3CFF(n_input=obs_size, n_actions=n_actions, n_hidden=200)

In [10]:
# Draw the computational graph and save it in the output directory.
# The directory is created first: nothing earlier in the notebook creates it,
# so on a fresh run writing the graph file into it would fail.
os.makedirs(outdir, exist_ok=True)

# A single all-zero observation with a leading batch axis.
fake_obs = chainer.Variable(
    np.zeros(obs_size, dtype=np.float32)[None], name='observation')
with chainerrl.recurrent.state_reset(model):
    # The state of the model is reset again after drawing the graph
    chainerrl.misc.draw_computational_graph(
        [model(fake_obs)],
        os.path.join(outdir, 'model'))

In [12]:
# RMSprop optimizer for the model, with gradient clipping at 40.
opt = rmsprop_async.RMSpropAsync(lr=lr, eps=eps, alpha=alpha)
opt.setup(model)
opt.add_hook(chainer.optimizer.GradientClipping(40))
if weight_decay > 0:
    # Use the notebook-level `weight_decay` hyperparameter. The original
    # referenced `args.weight_decay`, but no `args` object exists in this
    # notebook, so enabling weight decay raised a NameError.
    opt.add_hook(NonbiasWeightDecay(weight_decay))

def phi(x):
    """Cast an observation array to float32 without copying if it already is float32."""
    return x.astype(np.float32, copy=False)

Define the agent.

In [13]:
# A3C agent built from the model and optimizer; phi preprocesses each
# observation before it reaches the model.
agent = a3c.A3C(
    model,
    opt,
    t_max=t_max,
    gamma=gamma,
    beta=beta,
    phi=phi,
)

In [14]:
def make_env(process_idx, test):
    """Create a seeded 'CICY-v0' environment for one worker process.

    Args:
        process_idx: Index into ``process_seeds`` selecting this worker's seed.
        test: If truthy, the seed is mirrored to ``2 ** 31 - 1 - seed`` so that
            test environments use different seeds from training environments.

    Returns:
        The newly created, seeded environment.
    """
    base_seed = process_seeds[process_idx]
    if test:
        env_seed = 2 ** 31 - 1 - base_seed
    else:
        env_seed = base_seed
    new_env = gym.make('CICY-v0')
    # Convert to a plain Python int before seeding.
    new_env.seed(int(env_seed))
    return new_env

In [16]:
# Linearly decay the learning rate to zero
def lr_setter(env, agent, value):
    """Store ``value`` as the learning rate of ``agent``'s optimizer; ``env`` is unused."""
    setattr(agent.optimizer, 'lr', value)

# Hook that feeds lr_setter a value interpolated from `lr` down to 0 over `nsteps` steps.
lr_decay_hook = experiments.LinearInterpolationHook(nsteps, lr, 0, lr_setter)

# Run asynchronous training with `processes` workers, each using its own
# environment from make_env.
training = experiments.train_agent_async(
    agent=agent,
    make_env=make_env,
    processes=processes,
    outdir=outdir,
    steps=nsteps,
    max_episode_len=max_episode_len,
    eval_interval=eval_interval,
    global_step_hooks=[lr_decay_hook],
    save_best_so_far_agent=False,
    profile=False,
)

  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)


Test the trained agent.

In [17]:
# Roll out 20 episodes with the agent choosing every action. Each episode ends
# when the environment reports done or after 10000-1 steps, and its summed
# reward is printed.
for i in range(20):
    obs, done = env.reset(), False
    R, t = 0, 0
    while t < 10000 - 1 and not done:
        action = agent.act(obs)
        obs, r, done, _ = env.step(action)
        R, t = R + r, t + 1
    print('test episode:', i, 'R:', R)
    # Tell the agent the episode is over before the next reset.
    agent.stop_episode()

test episode: 0 R: 2937.199999999979
test episode: 1 R: 2188.999999999962
test episode: 2 R: 1502.5999999999194
test episode: 3 R: 3492.400000000012
test episode: 4 R: 1429.599999999941
test episode: 5 R: 1807.5999999998871
test episode: 6 R: 2165.3999999999905
test episode: 7 R: 2667.0000000000537
test episode: 8 R: 2194.2000000000176
test episode: 9 R: 2658.3999999999714
test episode: 10 R: 2776.8000000000293
test episode: 11 R: 1977.3999999998925
test episode: 12 R: 3122.6000000000304
test episode: 13 R: 2649.8000000000247
test episode: 14 R: 2047.9999999998986
test episode: 15 R: 1022.9999999999559
test episode: 16 R: 2473.9999999999245
test episode: 17 R: 2210.5999999999544
test episode: 18 R: 2704.4000000000237
test episode: 19 R: 1976.9999999999243


For comparison, a random walker that picks every action uniformly at random.

In [19]:
# Baseline: the same 20-episode rollout as for the agent, but every action is
# drawn uniformly from range(2*4*M.len).
# NOTE(review): this bound presumably equals env.action_space.n (printed above
# as Discrete(48)) — confirm before changing the environment's kwargs.
for i in range(20):
    obs, done = env.reset(), False
    R, t = 0, 0
    while t < 10000 - 1 and not done:
        action = np.random.randint(2 * 4 * M.len)
        obs, r, done, _ = env.step(action)
        R, t = R + r, t + 1
    print('test episode:', i, 'R:', R)

test episode: 0 R: 2209.7999999999483
test episode: 1 R: 1596.199999999934
test episode: 2 R: 2896.199999999987
test episode: 3 R: 995.9999999999548
test episode: 4 R: 2710.200000000023
test episode: 5 R: 1249.999999999951
test episode: 6 R: 2679.799999999957
test episode: 7 R: 1879.9999999999177
test episode: 8 R: 1755.59999999991
test episode: 9 R: 1389.399999999928
test episode: 10 R: 2108.1999999999266
test episode: 11 R: 2351.800000000016
test episode: 12 R: 2685.4000000000106
test episode: 13 R: 2557.8000000000093
test episode: 14 R: 3415.0000000000937
test episode: 15 R: 2871.400000000014
test episode: 16 R: 2448.1999999999553
test episode: 17 R: 2150.599999999932
test episode: 18 R: 1197.1999999999662
test episode: 19 R: 2800.800000000037


In [21]:
from gym_CICYlbmodels.envs.stack import create_stack, create_stack_h, _quick_index

In [22]:
# Build the stack for M.
# NOTE(review): the two 2s presumably mirror the r=2 / max=2 kwargs used when
# registering the environment — confirm against create_stack's signature.
stack = create_stack(M, 2, 2)

In [23]:
# Number of entries in the stack (shown as the cell output).
len(stack)

2980

In [24]:
# Sanity check: print 'failed' for every stack entry whose first component of
# M.line_co(...) is positive. No output means no entry tripped the check.
for entry in stack:
    h = M.line_co(entry)
    if not (h[0] > 0):
        continue
    print('failed')