In [1]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random

In [2]:
# --- Hyperparameters -------------------------------------------------------
GAMMA = 0.99                # discount factor applied to the bootstrapped next-state value
BATCH_SIZE = 32             # number of transitions drawn from the replay buffer per update
BUFFER_SIZE = 50000         # maximum number of transitions kept in the replay buffer
MIN_REPLAY_SIZE = 1000      # random-policy transitions collected before training starts
EPSILON_START = 1.0         # exploration rate at step 0
EPSILON_END = 0.02          # exploration rate once the decay has finished
EPSILON_DECAY = 10000       # number of steps over which epsilon is interpolated
TARGET_UPDATE_FREQ = 1000   # steps between copies of the online weights into the target net
MAX_EP = 25000              # index of the last training *step* (despite the name, not episodes)

# --- Metric accumulators (appended to every 1000 training steps) ------------
REWARD_ACC = list()
LOSS_ACC = list()

# --- Reproducibility --------------------------------------------------------
# The stdlib `random` module drives both the epsilon-greedy draw and the
# minibatch sampling, so it has to be seeded alongside torch and numpy.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [6]:
class Network(nn.Module):
    """Small fully connected Q-network: observation -> one Q-value per action.

    The input width is the product of ``env.observation_space.shape`` and the
    output width is ``env.action_space.n``; a single 64-unit Tanh hidden layer
    sits in between.
    """

    def __init__(self, env):
        """Build the layers from the observation and action spaces of ``env``."""
        super().__init__()

        in_features = int(np.prod(env.observation_space.shape))

        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Tanh(),
            nn.Linear(64, env.action_space.n)
        )

    def forward(self, x):
        """Return the Q-values produced by the underlying sequential model for ``x``."""
        return self.net(x)

    def act(self, state):
        """Return the greedy action (a Python ``int``) for a single observation.

        ``state`` is converted to a float32 tensor and given a leading batch
        dimension of size one before being passed through the network.
        """
        state_t = torch.as_tensor(state, dtype=torch.float32)

        # Action selection is pure inference: skip autograd bookkeeping so no
        # graph is built (and then discarded) on every environment step.
        with torch.no_grad():
            q_values = self(state_t.unsqueeze(0))

        max_q_index = torch.argmax(q_values, dim=1)[0]
        action = max_q_index.item()

        return action

In [7]:
# Environment and bookkeeping containers.
env = gym.make('CartPole-v1')

# Bounded FIFO of (state, action, reward, done, new_state) transitions.
replay_buffer = deque(maxlen=BUFFER_SIZE)

# Returns of the most recent 100 episodes; starts with a single 0.0 entry so
# the running mean is defined before the first episode has finished.
reward_buffer = deque(maxlen=100)
reward_buffer.append(0.0)
episode_reward = 0.0

# Two networks of identical architecture; the target net starts as an exact
# copy of the online net's weights.
online_net, target_net = Network(env), Network(env)
target_net.load_state_dict(online_net.state_dict())

# Only the online network's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(params=online_net.parameters(), lr=5e-4)

In [8]:
# Warm-up: fill the replay buffer with MIN_REPLAY_SIZE transitions gathered by
# sampling actions uniformly from the action space.
state = env.reset()

for _ in range(MIN_REPLAY_SIZE):
    action = env.action_space.sample()
    new_state, reward, done, _ = env.step(action)

    transition = (state, action, reward, done, new_state)
    replay_buffer.append(transition)

    # Continue from the successor state, or start a new episode once this one ended.
    state = env.reset() if done else new_state

In [11]:
# Double-DQN training loop: act epsilon-greedily, store the transition, then
# take one optimizer step on a minibatch sampled from the replay buffer.
state = env.reset()
# Start the running return from zero so that re-executing this cell never
# inherits a partial episode total left over from an earlier run.
episode_reward = 0.0

for step in itertools.count():
    # Linear epsilon schedule from EPSILON_START to EPSILON_END over the first
    # EPSILON_DECAY steps; np.interp holds the end value for later steps.
    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])

    random_sample = random.random()

    if random_sample <= epsilon:
        action = env.action_space.sample()
    else:
        action = online_net.act(state)

    new_state, reward, done, _ = env.step(action)
    transition = (state, action, reward, done, new_state)
    replay_buffer.append(transition)
    state = new_state

    episode_reward = episode_reward + reward

    if done:
        state = env.reset()
        reward_buffer.append(episode_reward)
        episode_reward = 0.0

    # Sample a minibatch and regroup it field by field in a single pass;
    # each resulting array has BATCH_SIZE entries along its first axis.
    transitions = random.sample(replay_buffer, BATCH_SIZE)
    states, actions, rewards, dones, new_states = (
        np.asarray(column) for column in zip(*transitions)
    )

    # The trailing unsqueeze turns the per-sample scalars into column vectors so
    # they line up with the (batch, 1) tensors produced by gather below.
    states_t = torch.as_tensor(states, dtype=torch.float32)
    actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
    rewards_t = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(-1)
    dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
    new_states_t = torch.as_tensor(new_states, dtype=torch.float32)

    # The bootstrap target is a constant with respect to the optimized
    # parameters, so build it without autograd. This avoids constructing a
    # graph through target_net and accumulating never-used gradients on it.
    with torch.no_grad():
        # Double DQN: the online net picks the next action ...
        online_with_new_states = online_net(new_states_t)
        argmax_online_with_new_states = online_with_new_states.argmax(dim=1, keepdim=True)

        # ... and the target net supplies the value of that action.
        offline_with_new_states = target_net(new_states_t)
        target_q_vals = torch.gather(input=offline_with_new_states, dim=1, index=argmax_online_with_new_states)

        # (1 - done) removes the bootstrap term for transitions flagged as done.
        targets = rewards_t + GAMMA * (1 - dones_t) * target_q_vals

    # Q-values of the actions that were actually taken.
    q_values = online_net(states_t)
    action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)
    loss = nn.functional.smooth_l1_loss(action_q_values, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Periodically copy the online weights into the target network.
    if step % TARGET_UPDATE_FREQ == 0:
        target_net.load_state_dict(online_net.state_dict())

    # Progress report and metric logging every 1000 steps.
    if step % 1000 == 0:
        print()
        print('Step', step)
        print('Avg REWARD',np.mean(reward_buffer))
        print('Loss', loss)
        REWARD_ACC.append(np.mean(reward_buffer))
        LOSS_ACC.append(loss.item())

    # MAX_EP is compared against the step counter, so the loop performs
    # MAX_EP + 1 iterations (steps 0..MAX_EP inclusive).
    if step == MAX_EP:
        break


Step 0
Avg REWARD 0.0
Loss tensor(0.4712, grad_fn=<SmoothL1LossBackward0>)

Step 1000
Avg REWARD 22.0
Loss tensor(0.0004, grad_fn=<SmoothL1LossBackward0>)

Step 2000
Avg REWARD 22.75862068965517
Loss tensor(0.0170, grad_fn=<SmoothL1LossBackward0>)

Step 3000
Avg REWARD 25.18
Loss tensor(0.0853, grad_fn=<SmoothL1LossBackward0>)

Step 4000
Avg REWARD 28.63
Loss tensor(0.0572, grad_fn=<SmoothL1LossBackward0>)

Step 5000
Avg REWARD 35.55
Loss tensor(0.0023, grad_fn=<SmoothL1LossBackward0>)

Step 6000
Avg REWARD 41.88
Loss tensor(0.2255, grad_fn=<SmoothL1LossBackward0>)

Step 7000
Avg REWARD 51.29
Loss tensor(0.1358, grad_fn=<SmoothL1LossBackward0>)

Step 8000
Avg REWARD 60.48
Loss tensor(0.0020, grad_fn=<SmoothL1LossBackward0>)

Step 9000
Avg REWARD 67.57
Loss tensor(0.0034, grad_fn=<SmoothL1LossBackward0>)

Step 10000
Avg REWARD 79.98
Loss tensor(0.1708, grad_fn=<SmoothL1LossBackward0>)

Step 11000
Avg REWARD 89.34
Loss tensor(0.0058, grad_fn=<SmoothL1LossBackward0>)

Step 12000
Avg REWA

In [12]:
# Persist the logged metrics: reward averages on the first line, losses on the
# second, each written as the string form of its Python list.
with open('DDQN.txt', 'w') as f:
    f.write("\n".join(str(series) for series in (REWARD_ACC, LOSS_ACC)))