In [0]:
!pip install chainerrl

In [0]:
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl

In [0]:
from tqdm import tqdm

In [0]:
class QFunction(chainer.Chain):
    """Small fully connected Q-network for a discrete action space.

    Two tanh-activated hidden layers are followed by a linear output layer
    with one unit per action; the output is wrapped in a
    ``DiscreteActionValue``.
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=2):
        """Create the three linear layers.

        Args:
            obs_size: Input size of the first layer.
            n_actions: Output size of the last layer (one value per action).
            n_hidden_channels: Width of both hidden layers.
        """
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(obs_size, n_hidden_channels)
            self.l2 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l3 = L.Linear(n_hidden_channels, n_actions)

    def __call__(self, x, test=False):
        """Run the network on ``x`` and return the wrapped action values.

        Args:
            x: Input passed to the first linear layer.
            test: Accepted for interface compatibility; not used by this method.

        Returns:
            A ``chainerrl.action_value.DiscreteActionValue`` built from the
            output of the last layer.
        """
        hidden = F.tanh(self.l1(x))
        hidden = F.tanh(self.l2(hidden))
        q_values = self.l3(hidden)
        return chainerrl.action_value.DiscreteActionValue(q_values)

In [0]:
def random_action():
    """Return action 0 or action 1, drawn at random with ``np.random.choice``.

    This function is handed to the explorer as its ``random_action_func``.
    """
    candidate_actions = [0, 1]
    return np.random.choice(candidate_actions)

In [0]:
def step(state, action):
    """Advance the two-state toy environment by one action.

    Transitions:
        * state 0, action 0      -> state 1, reward 0
        * state 0, other action  -> state 0, reward 0
        * other state, action 0  -> state 0, reward 0
        * other state, other action -> state 1, reward 1

    Args:
        state: Current state; only whether it equals 0 matters. The training
            loop passes a one-element NumPy array.
        action: Chosen action; only whether it equals 0 matters.

    Returns:
        A tuple ``(next_state, reward)``: ``next_state`` is a one-element
        NumPy array holding 0 or 1, ``reward`` is the integer 0 or 1.
    """
    in_state_zero = bool(state == 0)
    chose_action_zero = bool(action == 0)

    # (in_state_zero, chose_action_zero) -> (next state, reward)
    transitions = {
        (True, True): (1, 0),
        (True, False): (0, 0),
        (False, True): (0, 0),
        (False, False): (1, 1),
    }
    following_state, reward = transitions[in_state_zero, chose_action_zero]
    return np.array([following_state]), reward

In [0]:
# --- Training budget ---
num_episodes = 20          # number of episodes run by the training loop
max_number_of_steps = 5    # environment steps taken in each episode

# --- Learning parameters ---
gamma = 0.9   # passed to the DQN agent as its discount factor
alpha = 0.5   # NOTE(review): not referenced by any cell shown here

In [0]:
# Q-network with a 1-dimensional observation and 2 actions, trained with Adam.
q_func = QFunction(1, 2)
optimizer = chainer.optimizers.Adam(eps=1e-2)
optimizer.setup(q_func)

# Epsilon-greedy exploration, annealed linearly from 1.0 down to 0.1.
# The explorer measures its decay in agent time steps, not in episodes, so the
# decay horizon is the whole training budget (episodes x steps per episode).
# Passing `num_episodes` alone would finish the decay after only a few episodes.
explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
    start_epsilon=1.0,
    end_epsilon=0.1,
    decay_steps=num_episodes * max_number_of_steps,
    random_action_func=random_action,
)
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10**6)


def phi(x):
    """Cast an observation array to float32; passed to the agent as ``phi``."""
    return x.astype(np.float32, copy=False)

In [0]:
# DQN does not perform any update until the replay buffer holds at least
# `replay_start_size` transitions. A fixed threshold of 500 is larger than the
# whole training budget when the run is short (e.g. 20 episodes x 5 steps =
# 100 transitions), in which case the network would never be trained on a
# fresh kernel. Cap the warm-up at half of the budget, but never go below the
# minibatch size, because a minibatch cannot be sampled from a smaller buffer.
# For large budgets the threshold remains 500.
dqn_minibatch_size = 32  # ChainerRL's DQN default, spelled out for the floor below
total_training_steps = num_episodes * max_number_of_steps
agent = chainerrl.agents.DQN(
    q_func, optimizer, replay_buffer, gamma, explorer,
    replay_start_size=max(dqn_minibatch_size,
                          min(500, total_training_steps // 2)),
    minibatch_size=dqn_minibatch_size,
    update_interval=1,
    target_update_interval=100,
    phi=phi,
)

In [25]:
# Train for `num_episodes` episodes of `max_number_of_steps` steps each,
# printing every transition and the per-episode total reward.
for episode in tqdm(range(num_episodes)):
    # Every episode restarts in state 0 with nothing accumulated yet.
    state = np.array([0])
    R, reward = 0, 0
    # NOTE(review): the episode ends because the step limit is reached, yet it
    # is reported to the agent as done=True — confirm this is intended.
    done = True
    for t in range(max_number_of_steps):
        # The agent is given the reward produced by the previous step.
        action = agent.act_and_train(state, reward)
        next_state, reward = step(state, action)
        # Log the state the action was taken in, the action, and its reward.
        print(state, action, reward)
        state = next_state
        R += reward
    agent.stop_episode_and_train(state, reward, done)
    print('episode : %d total reward %d' % (episode + 1, R))

# Persist the trained agent to the 'agent' directory.
agent.save('agent')

100%|██████████| 20/20 [00:00<00:00, 145.84it/s]

[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 1 total reward 4
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 2 total reward 4
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 3 total reward 4
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 4 total reward 4
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 5 total reward 4
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 6 total reward 4
[0] 1 0
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
episode : 7 total reward 3
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 8 total reward 4
[0] 0 0
[1] 1 1
[1] 1 1
[1] 0 0
[0] 0 0
episode : 9 total reward 2
[0] 0 0
[1] 1 1
[1] 0 0
[0] 0 0
[1] 1 1
episode : 10 total reward 2
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 11 total reward 4
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 12 total reward 4
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 13 total reward 4
[0] 0 0
[1] 1 1
[1] 1 1
[1] 0 0
[0] 0 0
episode : 14 total reward 2
[0] 0 0
[1] 1 1
[1] 1 1
[1] 1 1
[1] 1 1
episode : 15 tota


