In [1]:
import math, random
import os

import gym
import numpy as np

import mxnet as mx
from mxnet import gluon, autograd, nd
from mxnet.gluon import nn

from tqdm import tqdm, trange
from mxboard import SummaryWriter


In [2]:
from collections import deque

class ReplayBuffer(object):
    """Bounded FIFO memory of (state, action, reward, next_state, done) transitions.

    Once ``capacity`` transitions are stored, appending a new one discards the
    oldest (behaviour of ``deque(maxlen=capacity)``).
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Store one transition.

        ``state`` and ``next_state`` get a leading axis of length 1 so that
        ``sample`` can concatenate them into a batch.
        """
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns a 5-tuple: states and next states are concatenated into numpy
        arrays along axis 0; actions, rewards and done flags are plain tuples.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [3]:
from wrappers import make_atari, wrap_deepmind, wrap_mxnet

In [4]:
# Build the Pong environment by applying the three wrappers in order:
# make_atari -> wrap_deepmind (with frame stacking enabled) -> wrap_mxnet.
env_id = "PongNoFrameskip-v4"
env = wrap_mxnet(wrap_deepmind(make_atari(env_id), frame_stack=True))

In [5]:
class DuelingDQN(nn.Block):
    """Dueling deep Q-network: a shared convolutional trunk feeding separate
    state-value and advantage heads.

    The heads are combined as ``Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')``,
    where the mean is taken per sample over the action axis.

    Parameters
    ----------
    input_shape : sequence of int
        Observation shape. Only ``input_shape[0]`` is read; it is used as the
        number of input channels of the first convolution.
    n_actions : int
        Number of discrete actions, i.e. the width of the advantage head.
    **kwargs
        Forwarded to ``nn.Block``.
    """

    def __init__(self, input_shape, n_actions, **kwargs):
        super(DuelingDQN, self).__init__(**kwargs)
        # Remembered so act() can explore without depending on a global env.
        self.n_actions = n_actions

        with self.name_scope():
            # Conv2D positional arguments are (channels, kernel_size, strides).
            self.conv1 = nn.Conv2D(32, 8, 4, in_channels=input_shape[0])
            self.bn1 = nn.BatchNorm()
            self.conv2 = nn.Conv2D(64, 4, 2, in_channels=32)
            self.bn2 = nn.BatchNorm()
            self.conv3 = nn.Conv2D(64, 3, 1, in_channels=64)
            self.bn3 = nn.BatchNorm()

            # Advantage head: one output per action.
            self.adv_layer = nn.Sequential()
            self.adv_layer.add(nn.Dense(512, activation='relu'))
            self.adv_layer.add(nn.Dense(n_actions, in_units=512))

            # Value head: a single scalar per sample.
            self.value_layer = nn.Sequential()
            self.value_layer.add(nn.Dense(512, activation='relu'))
            self.value_layer.add(nn.Dense(1, in_units=512))

    def forward(self, x):
        """Return Q-values of shape ``(batch, n_actions)``.

        ``x`` is a batch of observations whose first axis is the batch axis.
        NOTE(review): assumes layout (batch, channels, height, width), as
        expected by Conv2D's default layout -- confirm against wrap_mxnet.
        """
        out = x
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, bn in stages:
            out = nd.relu(bn(conv(out)))

        # Flatten everything except the batch axis for the dense heads.
        out = nd.reshape(out, shape=(x.shape[0], -1))
        adv_out = self.adv_layer(out)
        value_out = self.value_layer(out)

        # Centre the advantages per sample (over the action axis only).
        # A global mean over the whole batch would make each sample's
        # Q-values depend on the other samples in the batch.
        return value_out + adv_out - adv_out.mean(axis=1, keepdims=True)

    def act(self, state, epsilon, ctx):
        """Pick an action epsilon-greedily for a single observation.

        With probability ``epsilon`` a uniformly random action index in
        ``[0, n_actions)`` is returned; otherwise the arg-max of the predicted
        Q-values. ``state`` is a single (un-batched) observation; ``ctx`` is
        the MXNet context on which the forward pass runs.
        """
        if random.random() > epsilon:
            # Add the batch axis expected by forward().
            state = nd.array(np.float32(state), ctx=ctx).expand_dims(0)
            q_value = self.forward(state)
            action = int(nd.argmax(q_value, axis=1).asscalar())
        else:
            action = random.randrange(self.n_actions)
        return action

In [37]:
# x = mx.random.uniform(shape=(1,4,84,84))

In [38]:
# net= DuelingDQN((4,84,84), 6)
# net.initialize()

In [42]:
# with autograd.record():
#     pred= net(x)
#     target = nd.ones(shape=(1,6))
#     loss = pred - target

In [43]:
# loss.backward()

In [6]:
def _select_per_row(values, index, ctx):
    """Return ``values[i, index[i]]`` for every row ``i`` as a 1-D NDArray."""
    rows = nd.arange(index.shape[0], ctx=ctx)
    # gather_nd takes indices shaped (ndim, batch): the first row selects the
    # sample, the second row selects the column (action) for that sample.
    return nd.gather_nd(values, nd.stack(rows, index, axis=0))


def compute_td_loss(batch_size, current_model, target_model, loss_fn, ctx):
    """Compute the per-sample double-DQN temporal-difference loss for one batch.

    A batch is drawn from the module-level ``replay_buffer``. The online
    network (``current_model``) chooses the greedy next action, the target
    network (``target_model``) evaluates it, and the discount is the
    module-level ``gamma``.

    Parameters
    ----------
    batch_size : int
        Number of transitions to sample.
    current_model, target_model : callable
        Networks mapping a batch of states to per-action Q-values.
    loss_fn : callable
        Called as ``loss_fn(prediction, target)``.
    ctx : mxnet Context
        Device on which the batch tensors are created.

    Returns
    -------
    Whatever ``loss_fn`` returns for the selected Q-values and their targets.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = nd.array(np.float32(state), ctx=ctx)
    next_state = nd.array(np.float32(next_state), ctx=ctx)
    action     = nd.array(action, ctx=ctx)
    reward     = nd.array(reward, ctx=ctx)
    done       = nd.array(done, ctx=ctx)

    # Q(s, a) for the actions actually taken -- the only part that needs gradients.
    q_value = _select_per_row(current_model(state), action, ctx)

    # The bootstrap target is a constant with respect to the optimisation.
    # Compute it with recording paused so no graph is kept for it and the
    # networks run in inference mode (in particular, the target network's
    # BatchNorm statistics are not touched) even when the caller is inside
    # autograd.record().
    with autograd.pause():
        next_action = nd.argmax(current_model(next_state), axis=1)
        next_q_value = _select_per_row(target_model(next_state), next_action, ctx)
        # (1 - done) removes the bootstrap term on terminal transitions.
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    return loss_fn(q_value, expected_q_value.detach())

In [7]:
# Train on the first GPU when one is visible; otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

In [8]:
def update_target(current_model, target_model):
    """Copy the online network's parameters into the target network.

    The copy goes through a checkpoint file on disk: ``current_model`` is
    saved and ``target_model`` then loads that same file.
    """
    checkpoint = './tmp/dueliing_dqn_current_model'
    # Create the scratch directory on demand; without it the very first
    # synchronisation fails on a fresh checkout that has no ./tmp yet.
    checkpoint_dir = os.path.dirname(checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    current_model.save_parameters(checkpoint)
    target_model.load_parameters(checkpoint)

In [9]:
# Replay memory holding at most 100000 transitions; the training loop starts
# optimising once it contains more than `replay_initial` of them.
replay_buffer = ReplayBuffer(100000)
replay_initial = 10000

# Online and target networks share one architecture and are built in that order.
current_model, target_model = (
    DuelingDQN(env.observation_space.shape, env.action_space.n) for _ in range(2)
)
current_model.initialize(ctx=ctx)
target_model.initialize(ctx=ctx)

# Only the online network is handed to the optimiser.
loss_fn = gluon.loss.L2Loss()
trainer = gluon.Trainer(
    current_model.collect_params(),
    optimizer='adam',
    optimizer_params={'learning_rate': 0.0001},
)

In [10]:
# Exploration schedule: epsilon decays exponentially from `epsilon_start`
# towards `epsilon_final`, with `epsilon_decay` frames as the time constant.
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 50000


def epsilon_by_frame(frame_idx):
    """Return the exploration rate to use at frame ``frame_idx``."""
    decay_factor = math.exp(-1. * frame_idx / epsilon_decay)
    return epsilon_final + (epsilon_start - epsilon_final) * decay_factor

In [11]:
# Training length, minibatch size and discount factor.
num_frames, batch_size, gamma = 1400000, 32, 0.99

# Bookkeeping filled in by the training loop.
losses, all_rewards = [], []
episode_reward = 0

state = env.reset()
# Event writer for the scalars logged by the training loop; events go under ./logs.
writer = SummaryWriter(logdir='./logs', filename_suffix="_DuelingDQN")

In [None]:
# Best-model checkpoints are written under ./models; create it on demand so
# the first save cannot fail on a missing directory.
if not os.path.isdir('./models'):
    os.makedirs('./models')

# Number of frames between copies of the online weights into the target network.
target_update_interval = 1000

# Always begin on a fresh episode: resetting the reward together with the
# environment keeps a re-run of this cell (e.g. after an interrupt) from
# adding a half-finished episode's reward to the next one.
state = env.reset()
episode_reward = 0
current_best = -100

for frame_idx in range(1, num_frames + 1):
    # Act epsilon-greedily and store the resulting transition.
    epsilon = epsilon_by_frame(frame_idx)
    action = current_model.act(state, epsilon, ctx)

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        # Episode finished: log it, checkpoint if the running mean over the
        # last 100 episodes improved, then start a new episode.
        state = env.reset()
        all_rewards.append(episode_reward)
        writer.add_scalar("reward", episode_reward, frame_idx)
        mean_reward = np.mean(all_rewards[-100:])
        print("%d: done %d games, mean reward %.3f, reward %.3f, eps %.2f" % (
                frame_idx, len(all_rewards), mean_reward, episode_reward, epsilon,
            ))
        if current_best < mean_reward:
            print("save current best model")
            current_model.save_parameters('./models/double_dqn_best_model')
            current_best = mean_reward
        episode_reward = 0
        writer.add_scalar("epsilon", epsilon, frame_idx)
        writer.add_scalar("mean_reward", mean_reward, frame_idx)

    # Learn only once the buffer holds more than `replay_initial` transitions.
    if len(replay_buffer) > replay_initial:
        with autograd.record():
            loss = compute_td_loss(batch_size, current_model, target_model, loss_fn, ctx)
        # Only the forward pass needs to be recorded; backprop runs outside the scope.
        loss.backward()
        trainer.step(batch_size)
        losses.append(loss.sum().asscalar())
        writer.add_scalar("loss", loss.mean().asscalar(), frame_idx)

    if frame_idx % target_update_interval == 0:
        print("update target model")
        update_target(current_model, target_model)

935: done 1 games, mean reward -21.000, reward -21.000, eps 0.98
save current best model
update target model
1974: done 2 games, mean reward -20.000, reward -19.000, eps 0.96
save current best model
update target model
2931: done 3 games, mean reward -20.000, reward -20.000, eps 0.94
update target model
3692: done 4 games, mean reward -20.250, reward -21.000, eps 0.93
update target model
4449: done 5 games, mean reward -20.400, reward -21.000, eps 0.92
update target model
5208: done 6 games, mean reward -20.500, reward -21.000, eps 0.90
update target model
6070: done 7 games, mean reward -20.429, reward -20.000, eps 0.89
update target model
7029: done 8 games, mean reward -20.500, reward -21.000, eps 0.87
7788: done 9 games, mean reward -20.556, reward -21.000, eps 0.86
update target model
8701: done 10 games, mean reward -20.400, reward -19.000, eps 0.84
update target model
9458: done 11 games, mean reward -20.455, reward -21.000, eps 0.83
update target model
10234: done 12 games, mea

In [None]:
# Scratch: display the last per-sample loss left in the kernel by the training loop.
loss

In [None]:
    # Scratch cell: repeats the first half of compute_td_loss by hand so the
    # intermediate arrays can be inspected in the cells that follow.
    # NOTE: this rebinds the globals `state`, `action`, `reward`, `next_state`
    # and `done` that the training loop also uses, and it needs
    # `replay_buffer`, `batch_size`, `ctx` and both models to exist already.
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)
    state      = nd.array((np.float32(state)), ctx=ctx)
    next_state = nd.array(np.float32(next_state), ctx=ctx)
    action     = nd.array(action, ctx=ctx)
    reward     = nd.array(reward, ctx=ctx)
    done       = nd.array(done, ctx=ctx)
   
    # Per-action Q-values: online net on both states, target net on the next state.
    q_values      = current_model(state)
    next_q_values = current_model(next_state)
    next_q_state_values = target_model(next_state)
    
    # Greedy next action according to the online network.
    next_action = nd.argmax(next_q_values,1)

In [14]:
# Scratch: display the terminal flags of the sampled batch.
done


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 32 @gpu(0)>

In [15]:
# Scratch: greedy next-action index per sample, taken from the online network's Q-values.
nd.argmax(next_q_values,1)


[4. 4. 4. 4. 4. 4. 1. 4. 4. 4. 1. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 1. 4.]
<NDArray 32 @gpu(0)>

In [16]:
# Scratch: select Q(s, a) for the taken actions and the target network's value
# for the greedy next actions. The index tensor is shaped (2, batch, 1), so
# both results come out as (batch, 1) -- see the shape checks below.
# NOTE: not idempotent -- `q_values` is overwritten with its own gathered
# result, so re-running this cell operates on the already-gathered array.
q_values = nd.gather_nd(q_values, nd.stack(nd.arange(action.shape[0], ctx=ctx).expand_dims(-1),action.expand_dims(-1), axis=0))
next_q_value = nd.gather_nd(next_q_state_values, nd.stack(nd.arange(next_action.shape[0], ctx=ctx).expand_dims(-1),\
                                                                  next_action.expand_dims(-1), axis=0))
    

In [17]:
# Scratch: shape of the gathered Q-values before squeezing.
q_values.shape

(32, 1)

In [18]:
# Scratch: shape of the gathered next-state values before squeezing.
next_q_value.shape

(32, 1)

In [21]:
# Scratch: drop the trailing singleton axis so both arrays line up
# element-wise with the 1-D `reward` and `done` arrays.
q_values = q_values.squeeze()
next_q_value = next_q_value.squeeze()

In [23]:
# Scratch: TD target = reward + discounted next-state value; the bootstrap
# term is multiplied by (1 - done), so it vanishes where done is 1.
expected_q_value = reward + gamma * next_q_value * (1 - done)

In [25]:
# Scratch: per-sample loss between the selected Q-values and their TD targets.
loss_fn(q_values,expected_q_value)


[7.77847767e-01 8.75926670e-03 9.11522985e-01 1.61150675e-02
 1.02776647e+00 5.36763556e-02 3.00979435e-01 7.47589052e-01
 1.31033516e+00 1.39697935e-04 6.04455243e-04 5.77379346e-01
 1.18786907e+00 1.15350652e+00 1.82130409e-03 3.74929160e-01
 9.89390373e-01 6.70911074e-01 1.20459557e+00 7.89059550e-02
 1.23624317e-02 6.92399263e-01 3.42928439e-01 1.04627311e+00
 8.34418647e-03 3.46169651e-01 1.58085692e+00 7.51657188e-01
 2.93571725e-02 1.33847715e-02 5.35753276e-03 8.88727009e-01]
<NDArray 32 @gpu(0)>

In [28]:
# Scratch: this reports (32, 1) even though In [21] squeezed the array, so the
# cells were run out of order (presumably the gather cell was re-run in
# between). Do not rely on this state after a Restart & Run All.
next_q_value.shape

(32, 1)

In [33]:
# Scratch: discounted next-state value, kept in a throwaway variable for the shape check below.
a = gamma * next_q_value 

In [34]:
# Scratch: multiplying by the scalar `gamma` keeps the shape of `next_q_value`.
a.shape

(32, 1)