In [1]:
import math
import os
import random

import gym
import numpy as np

import mxnet as mx
from mxnet import gluon, autograd, nd
from mxnet.gluon import nn

from tqdm import tqdm, trange
from mxboard import SummaryWriter

In [2]:

# from IPython.display import clear_output
# import matplotlib.pyplot as plt
# %matplotlib inline

In [3]:
from collections import deque

class ReplayBuffer(object):
    """Bounded store of environment transitions used for experience replay.

    Transitions are kept in a ``deque`` created with ``maxlen=capacity``, so
    once the buffer is full each new transition pushes out the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition.

        ``state`` and ``next_state`` get a new leading axis of length 1 so
        that ``sample`` can stack a batch with a single concatenate.
        """
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns a 5-tuple ``(states, actions, rewards, next_states, dones)``.
        ``states`` and ``next_states`` are arrays concatenated along axis 0;
        the other three are plain tuples of the stored values.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return (
            np.concatenate(states),
            actions,
            rewards,
            np.concatenate(next_states),
            dones,
        )

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)


In [4]:
from wrappers import make_atari, wrap_deepmind, wrap_mxnet

In [5]:
# Gym id of the task this notebook trains on.
env_id = "CartPole-v0"
# Shared environment instance: later cells read its observation/action spaces
# and drive it with env.reset() / env.step().
env = gym.make(env_id)

In [6]:
class DQN(nn.Block):
    """Two-layer fully connected Q-network with an epsilon-greedy action helper.

    Parameters
    ----------
    input_shape
        Shape of one observation. Accepted for interface compatibility but not
        used: the first Dense layer is created without ``in_units``.
    n_actions : int
        Number of discrete actions. Sets the width of the output layer and the
        range random exploratory actions are drawn from.
    **kwargs
        Forwarded unchanged to ``nn.Block.__init__``.
    """

    def __init__(self, input_shape, n_actions, **kwargs):
        super(DQN, self).__init__(**kwargs)

        # Keep the action count on the network so act() does not have to reach
        # for a global environment object.
        self._n_actions = n_actions

        with self.name_scope():
            self.fc1 = nn.Dense(128)
            self.fc2 = nn.Dense(n_actions, in_units=128)

    def forward(self, x):
        """Return one Q-value per action for each row of ``x``."""
        out = self.fc1(x)
        out = nd.relu(out)
        out = self.fc2(out)
        return out

    def act(self, state, epsilon, ctx):
        """Choose an action for a single observation, epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index in
        ``[0, n_actions)`` is returned; otherwise the index of the largest
        Q-value predicted for ``state``.

        Parameters
        ----------
        state
            One observation (no batch axis); converted to float32 and given a
            leading batch axis of length 1 before the forward pass.
        epsilon : float
            Exploration probability.
        ctx
            MXNet context on which the observation array is created.

        Returns
        -------
        int
            The selected action index.
        """
        if random.random() > epsilon:
            state = nd.array(np.float32(state), ctx=ctx).expand_dims(0)
            q_value = self.forward(state)
            # argmax over the action axis of a one-row batch yields a single
            # element, which asscalar() extracts directly.
            action = int(nd.argmax(q_value, axis=1).asscalar())
        else:
            action = random.randrange(self._n_actions)
        return action

In [7]:
def compute_td_loss(batch_size, net, loss_fn, ctx):
    """Compute the one-step temporal-difference loss on a sampled minibatch.

    A batch is drawn from the module-level ``replay_buffer`` and discounted
    with the module-level ``gamma``; both must be defined before calling.

    The bootstrap target ``reward + gamma * max_a' Q(s', a') * (1 - done)`` is
    detached from the autograd graph. The training loop calls this function
    inside ``autograd.record()``, so without the detach gradients would also
    flow through the target, which is supposed to act as a fixed regression
    label.

    Parameters
    ----------
    batch_size : int
        Number of transitions to sample.
    net
        Q-network; called on the state batch and the next-state batch.
    loss_fn
        Callable taking ``(predicted_q, target_q)`` and returning the loss.
    ctx
        MXNet context on which the batch arrays are created.

    Returns
    -------
    The value returned by ``loss_fn`` for the sampled batch.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = nd.array(np.float32(state), ctx=ctx)
    next_state = nd.array(np.float32(next_state), ctx=ctx)
    action     = nd.array(action, ctx=ctx)
    reward     = nd.array(reward, ctx=ctx)
    done       = nd.array(done, ctx=ctx)

    q_values      = net(state)
    next_q_values = net(next_state)

    # Select Q(s, a) for the action actually taken in each row by gathering
    # with (row index, action index) pairs.
    row_index = nd.arange(action.shape[0], ctx=ctx).expand_dims(-1)
    col_index = action.expand_dims(-1)
    q_value = nd.gather_nd(q_values, nd.stack(row_index, col_index, axis=0))

    # (1 - done) zeroes the bootstrap term for transitions that ended an episode.
    next_q_value     = next_q_values.max(1)
    expected_q_value = (reward + gamma * next_q_value * (1 - done)).detach()

    return loss_fn(q_value, expected_q_value)

In [8]:
def plot(frame_idx, rewards, losses):
    """Redraw the training curves: episode rewards (left) and losses (middle).

    The reward panel title shows ``frame_idx`` and the mean of the last ten
    entries of ``rewards``.

    NOTE: relies on ``clear_output`` and ``plt`` being in scope; the imports
    that would provide them are commented out near the top of this notebook,
    so they must be enabled before this function is called.
    """
    clear_output(True)
    plt.figure(figsize=(20, 5))

    # Panel 1 of a 1x3 grid: reward per finished episode.
    plt.subplot(1, 3, 1)
    recent_mean = np.mean(rewards[-10:])
    plt.title('frame %s. reward: %s' % (frame_idx, recent_mean))
    plt.plot(rewards)

    # Panel 2 of the same grid: loss per training step.
    plt.subplot(1, 3, 2)
    plt.title('loss')
    plt.plot(losses)

    plt.show()

In [9]:
# Use the GPU when MXNet can see one; otherwise fall back to the CPU so the
# notebook still runs end to end on machines without a GPU.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

In [10]:
# Replay memory of at most 200 transitions; the training loop only starts
# updating the network once more than `replay_initial` transitions are stored.
replay_buffer = ReplayBuffer(200)
replay_initial = 100

# Q-network sized from the environment's observation and action spaces,
# with its parameters initialised on the chosen context.
net = DQN(env.observation_space.shape, env.action_space.n)
net.initialize(ctx=ctx)

# Loss between predicted and target Q-values, and the optimiser that updates
# every parameter of the network.
loss_fn = gluon.loss.L2Loss()
trainer = gluon.Trainer(
    net.collect_params(),
    'adam',
    {'learning_rate':0.01},
)

In [11]:
# Exploration schedule: epsilon starts at `epsilon_start` and decays
# exponentially towards `epsilon_final`, with `epsilon_decay` as the time
# constant measured in frames.
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500


def epsilon_by_frame(frame_idx):
    """Return the exploration probability to use at frame ``frame_idx``.

    The three schedule constants are read at call time, so changing them
    after this cell has run affects subsequent calls.
    """
    decay_factor = math.exp(-1. * frame_idx / epsilon_decay)
    return epsilon_final + (epsilon_start - epsilon_final) * decay_factor

In [12]:
# Training hyperparameters: total environment frames, minibatch size and
# discount factor.
num_frames, batch_size, gamma = 20000, 32, 0.99

# Bookkeeping filled in by the training loop: one loss per update step, one
# total reward per finished episode, and the running reward of the episode
# currently in progress.
losses, all_rewards = [], []
episode_reward = 0

state = env.reset()
writer = SummaryWriter(logdir='./logs', filename_suffix="_Cart_Pole")

In [13]:
# Start from a fresh episode. Resetting the running return here (not only in
# the hyperparameter cell) keeps this cell correct when it is re-run after an
# interruption in the middle of an episode.
state = env.reset()
episode_reward = 0
current_best = 0.0

# Make sure the checkpoint directory exists before the first save into it.
os.makedirs('./models', exist_ok=True)

for frame_idx in range(1, num_frames + 1):
    # Act epsilon-greedily and store the resulting transition.
    epsilon = epsilon_by_frame(frame_idx)
    action = net.act(state, epsilon, ctx)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        # Episode finished: log it, checkpoint on a new best running mean
        # (over the last 100 episodes), and start the next episode.
        state = env.reset()
        all_rewards.append(episode_reward)
        writer.add_scalar("reward", episode_reward, frame_idx)
        mean_reward = np.mean(all_rewards[-100:])
        print("%d: done %d games, mean reward %.3f, episode reward %.3f,  eps %.2f" % (
                frame_idx, len(all_rewards), mean_reward, episode_reward, epsilon,
            ))
        if current_best < mean_reward:
            print("save current best model")
            net.save_parameters('./models/cartpole_best_model')
            current_best = mean_reward
        episode_reward = 0
        writer.add_scalar("epsilon", epsilon, frame_idx)
        writer.add_scalar("mean_reward", mean_reward, frame_idx)

    # Only train once the buffer holds more than `replay_initial` transitions.
    if len(replay_buffer) > replay_initial:
        # Record just the forward pass; backpropagate after leaving the scope.
        with autograd.record():
            loss = compute_td_loss(batch_size, net, loss_fn, ctx)
        loss.backward()
        trainer.step(batch_size)
        losses.append(loss.sum().asscalar())
        writer.add_scalar("loss", loss.mean().asscalar(), frame_idx)
#     if frame_idx % 10000 == 0:
#         plot(frame_idx, all_rewards, losses)

# Push any buffered summary events to disk now that training has finished.
writer.flush()

27: done 1 games, mean reward 27.000, episode reward 27.000,  eps 0.95
save current best model
57: done 2 games, mean reward 28.500, episode reward 30.000,  eps 0.89
save current best model
71: done 3 games, mean reward 23.667, episode reward 14.000,  eps 0.87
82: done 4 games, mean reward 20.500, episode reward 11.000,  eps 0.85
106: done 5 games, mean reward 21.200, episode reward 24.000,  eps 0.81
121: done 6 games, mean reward 20.167, episode reward 15.000,  eps 0.79
136: done 7 games, mean reward 19.429, episode reward 15.000,  eps 0.76
162: done 8 games, mean reward 20.250, episode reward 26.000,  eps 0.73
181: done 9 games, mean reward 20.111, episode reward 19.000,  eps 0.70
218: done 10 games, mean reward 21.800, episode reward 37.000,  eps 0.65
241: done 11 games, mean reward 21.909, episode reward 23.000,  eps 0.62
266: done 12 games, mean reward 22.167, episode reward 25.000,  eps 0.59
284: done 13 games, mean reward 21.846, episode reward 18.000,  eps 0.57
362: done 14 gam