In [1]:
# built-in
import math
import os
import random
import time
from collections import deque
from copy import deepcopy
#tihrd party
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader

In [2]:
# global constant
SEED = 0
BATCH_SIZE = 32
LR = 0.001
ADAM_EPS = 0.01 / BATCH_SIZE
USE_TENSORBOARD = True

# C51 hyperparameter
V_MAX = 10
V_MIN = -10
N_ATOMS = 51
DELTA_Z = (V_MAX - V_MIN) / (N_ATOMS - 1)

In [3]:
# set device
use_cuda = torch.cuda.is_available()
print('cuda:', use_cuda)
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

# set tensorboard
# tensorboard and tensorboardX must be installed.
# pip install tensorboardX
if USE_TENSORBOARD:
    from tensorboardX import SummaryWriter
    writer = SummaryWriter()

cuda: True


In [4]:
class DistrDQN(nn.Module):
    def __init__(self, in_dim, n_actions, n_atoms):
        super(DistrDQN, self).__init__()
        self.dense = nn.Sequential(
            nn.Linear(in_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )
        self.fc = nn.Sequential(
            nn.Linear(64, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions * n_atoms)
        )
        self.log_softmax = nn.LogSoftmax(dim=-1)

        self.register_buffer('support', torch.arange(V_MIN, V_MAX + DELTA_Z, DELTA_Z))

    def forward(self, x):
        out = self.dense(x)
        out = out.view(out.size(0), -1)
        out = self.fc(out).view(out.size(0), -1, N_ATOMS)
        out = self.log_softmax(out)
        probs = out.exp()

        return probs

In [5]:
def projection(next_p, rewards, dones):
    proj_p = np.zeros((BATCH_SIZE, N_ATOMS), dtype=np.float32)
    for atom in range(N_ATOMS):
        z = np.minimum(V_MAX, np.maximum(V_MIN, rewards + (V_MIN + atom * DELTA_Z) * 0.9))
        b = (z - V_MIN) / DELTA_Z
        l = np.floor(b).astype(np.int64)
        u = np.ceil(b).astype(np.int64)

        eq_mask = u == l
        proj_p[eq_mask, l[eq_mask]] += next_p[eq_mask, atom]
        ne_mask = u != l
        proj_p[ne_mask, l[ne_mask]] += next_p[ne_mask, atom] * (u - b)[ne_mask]
        proj_p[ne_mask, u[ne_mask]] += next_p[ne_mask, atom] * (b - l)[ne_mask]

        if dones.any():
            proj_p[dones] = 0.0
            z = np.minimum(V_MAX, np.maximum(V_MIN, rewards[dones]))
            b = (z - V_MIN) / DELTA_Z
            l = np.floor(b).astype(np.int64)
            u = np.ceil(b).astype(np.int64)

            eq_mask = u == l
            eq_dones = dones.copy()
            eq_dones[dones] = eq_mask
            if eq_dones.any():
                proj_p[eq_dones, l] = 1.0

            ne_mask = u != l
            ne_dones = dones.copy()
            ne_dones[dones] = ne_mask
            if ne_dones.any():
                proj_p[ne_dones, l] = (u - b)[ne_mask]
                proj_p[ne_dones, u] = (b - l)[ne_mask]

    return proj_p

In [6]:
def train(net, tgt_net, rep_memory):
    net.train()
    optimizer = optim.Adam(net.parameters(), lr=LR, eps=ADAM_EPS)
    
    train_data = []
    train_data.extend(random.sample(rep_memory, BATCH_SIZE))
    
    dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, pin_memory=use_cuda)
    
    for i, (s, a, r, _s, d) in enumerate(dataloader):
        s_batch = s.to(device).float()
        a_batch = a.to(device).long()
        _s_batch = _s.to(device).float()
        rewards = r.detach().cpu().numpy()
        dones = d.detach().cpu().numpy().astype(np.bool)
        
        with torch.no_grad():
            _p_batch = tgt_net(_s_batch)
            _weights = _p_batch * tgt_net.support
            _q_batch = _weights.sum(dim=2)
            _q_batch_np = _q_batch.cpu().numpy()[0]
            _action_batch_np = np.argmax(_q_batch_np)
            _p_best = _p_batch[range(BATCH_SIZE), _action_batch_np]
            _p_best_np = _p_best.cpu().numpy()
            
        proj_p_np = projection(_p_best_np, rewards, dones)
        proj_p = torch.tensor(proj_p_np).to(device).float()
        
        p_batch = net(s_batch)
        p_acting = p_batch[range(BATCH_SIZE), a_batch.data]
        
        loss = -(proj_p * (p_acting + 1e-8).log()).sum(dim=1).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        assert loss == loss 

In [7]:
# make an environment
env = gym.make("CartPole-v0")
env.seed(SEED)
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

# make two nerual network
net = DistrDQN(obs_dim, n_actions, N_ATOMS).to(device)
target_net = deepcopy(net)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m




In [8]:
# play
n_episodes = 500
memory_size = 10000
learn_start = 2000
update_frq = 1
epsilon = 1.0
eps_min = 0.02
total_steps = 0
n_dones = 0
rewards = []
is_learned = False
is_solved = False

# make a replay memory
rep_memory = deque(maxlen=memory_size)

for i in range(n_episodes):
    obs = env.reset()
    ep_reward = 0
    ep_steps = 0
    while True:
        env.render()
        # epsilon greedy
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            target_net.eval()
            with torch.no_grad():
                state = torch.tensor([obs]).to(device).float()
                probs = target_net(state)
                weights = probs * net.support
                q = weights.sum(dim=2)
                q_np = q.cpu().numpy()[0]
            action = np.argmax(q_np)

        _obs, reward, done, _ = env.step(action)
        
        rep_memory.append((obs, action, reward, _obs, done))
        
        obs = _obs
        ep_reward += reward
        ep_steps += 1
        total_steps += 1

        # tensorboard
        if USE_TENSORBOARD:
            writer.add_scalar('reward', ep_reward, i + 1)
        
        # learning
        if len(rep_memory) >= learn_start:
            if len(rep_memory) == learn_start:
                is_learned = True
                print('\n==========  Learning Start  ==========')
            train(net, target_net, rep_memory)

            # epsilon decay
            epsilon -= 1 / 10**4
            epsilon = max(eps_min, epsilon)

        if done:
            rewards.append(ep_reward)
            n_dones += 1
            print('{:3} Episode in {:3} steps, reward {:.2f}'.format(i + 1, ep_steps, ep_reward))
            if is_learned:
                # sync target net
                if n_dones % update_frq == 0:
                    target_net.load_state_dict(net.state_dict())

            # evaluate
            if len(rewards) > 20:
                if np.mean(rewards[-21:-1]) >= 200:
                    is_solved = True
                    print('\nCartpole is sloved! {:3} Episode in {:3} steps'.format(i + 1, total_steps))
            break
    
    if is_solved:                  
        break

env.close()

  1 Episode in  14 steps, reward 14.00
  2 Episode in  12 steps, reward 12.00
  3 Episode in  12 steps, reward 12.00
  4 Episode in  20 steps, reward 20.00
  5 Episode in  25 steps, reward 25.00
  6 Episode in  10 steps, reward 10.00
  7 Episode in  18 steps, reward 18.00
  8 Episode in  12 steps, reward 12.00
  9 Episode in  11 steps, reward 11.00
 10 Episode in  15 steps, reward 15.00
 11 Episode in  56 steps, reward 56.00
 12 Episode in  18 steps, reward 18.00
 13 Episode in  11 steps, reward 11.00
 14 Episode in  23 steps, reward 23.00
 15 Episode in  70 steps, reward 70.00
 16 Episode in  10 steps, reward 10.00
 17 Episode in  16 steps, reward 16.00
 18 Episode in  17 steps, reward 17.00
 19 Episode in  20 steps, reward 20.00
 20 Episode in  17 steps, reward 17.00
 21 Episode in  11 steps, reward 11.00
 22 Episode in  29 steps, reward 29.00
 23 Episode in  17 steps, reward 17.00
 24 Episode in  36 steps, reward 36.00
 25 Episode in  39 steps, reward 39.00
 26 Episode in  22 steps,

211 Episode in  19 steps, reward 19.00
212 Episode in  25 steps, reward 25.00
213 Episode in  98 steps, reward 98.00
214 Episode in  17 steps, reward 17.00
215 Episode in  41 steps, reward 41.00
216 Episode in  33 steps, reward 33.00
217 Episode in  80 steps, reward 80.00
218 Episode in 103 steps, reward 103.00
219 Episode in  33 steps, reward 33.00
220 Episode in  31 steps, reward 31.00
221 Episode in  61 steps, reward 61.00
222 Episode in  67 steps, reward 67.00
223 Episode in  23 steps, reward 23.00
224 Episode in 120 steps, reward 120.00
225 Episode in  57 steps, reward 57.00
226 Episode in  36 steps, reward 36.00
227 Episode in 134 steps, reward 134.00
228 Episode in  47 steps, reward 47.00
229 Episode in  20 steps, reward 20.00
230 Episode in  37 steps, reward 37.00
231 Episode in  26 steps, reward 26.00
232 Episode in  24 steps, reward 24.00
233 Episode in 145 steps, reward 145.00
234 Episode in  98 steps, reward 98.00
235 Episode in  54 steps, reward 54.00
236 Episode in  47 st