In [1]:
from lib import wrappers
from lib import dqn_model

import argparse
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter

DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
MEAN_REWARD_BOUND = 19.5
GAMMA = 0.99
BATCH_SIZE = 64
REPLAY_SIZE = 10000
REPLAY_START_SIZE = 10000
LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 1000
EPSILON_START = 1.0
EPSILON_FINAL = 0.02
EPSILON_DECAY_LAST_FRAME = 10**5

In [2]:
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

In [3]:
class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)
        
    def __len__(self):
        return len(self.buffer)
    
    def append(self, experience):
        self.buffer.append(experience)
        
    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])
        return np.array(states), np.array(actions), np.array(rewards, dtype=np.float32), np.array(dones, dtype=np.uint8), np.array(next_states)
    

In [4]:
class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()
        
    def _reset(self):
        self.state = env.reset()
        self.total_reward = 0.0
        
    def play_step(self, net, epsilon=0.0, device="cpu"):
        done_reward = None
        
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward
        new_state = new_state
        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward
    
def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch
    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)
    state_action_values = net(states_v).gather(1,actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values = next_state_values.detach()
    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)
    

In [None]:
if __name__ == "__main__":
    device = torch.device("cuda")
    env = wrappers.make_env(DEFAULT_ENV_NAME)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    writer = SummaryWriter(comment="-"+DEFAULT_ENV_NAME)
    print(net)
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START
    optimizer= optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None
    while True:
        env.render()
        frame_idx += 1
        epsilon = max(EPSILON_FINAL, EPSILON_START- frame_idx / EPSILON_DECAY_LAST_FRAME)
        reward = agent.play_step(net, epsilon, device=device)
        if reward is not None:
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            mean_reward = np.mean(total_rewards[-100:])
            print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s"%(frame_idx, len(total_rewards), mean_reward,epsilon,speed))
            writer.add_scalar("epsilon",epsilon,frame_idx)
            writer.add_scalar("speed",speed,frame_idx)
            writer.add_scalar("reward_100", mean_reward,frame_idx)
            writer.add_scalar("reward",reward,frame_idx)
            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), DEFAULT_ENV_NAME + "best.dat")
                if best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved"%(best_mean_reward,mean_reward))
                    best_mean_reward = mean_reward
            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!"% frame_idx)
                break
        if len(buffer) < REPLAY_START_SIZE:
            continue
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())
            
        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()
    writer.close()

DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)
[33mWARN: <class 'lib.wrappers.MaxAndSkipEnv'> doesn't implement 'reset' method, but it implements deprecated '_reset' method.[0m
1156: done 1 games, mean reward -20.000, eps 0.99, speed 137.96 f/s
2112: done 2 games, mean reward -20.000, eps 0.98, speed 168.12 f/s
3257: done 3 games, mean reward -19.667, eps 0.97, speed 169.18 f/s
4376: done 4 games, mean reward -19.250, eps 0.96, speed 171.64 f/s
5380: done 5 games, mean reward -19.600, eps 0.95, speed 169.48 f/s
6276: done 6 games, mean reward -19.667, eps 0.94, speed 172.38 f/s
7195: done 7 games, mean reward -19.571,

199439: done 112 games, mean reward -14.410, eps 0.02, speed 25.89 f/s
202389: done 113 games, mean reward -14.150, eps 0.02, speed 25.70 f/s
205116: done 114 games, mean reward -13.840, eps 0.02, speed 25.93 f/s
207747: done 115 games, mean reward -13.550, eps 0.02, speed 25.86 f/s
209513: done 116 games, mean reward -13.140, eps 0.02, speed 25.84 f/s
211886: done 117 games, mean reward -12.860, eps 0.02, speed 25.89 f/s
214929: done 118 games, mean reward -12.630, eps 0.02, speed 25.91 f/s
217375: done 119 games, mean reward -12.310, eps 0.02, speed 25.87 f/s
219276: done 120 games, mean reward -11.930, eps 0.02, speed 25.90 f/s
221570: done 121 games, mean reward -11.600, eps 0.02, speed 25.93 f/s
223589: done 122 games, mean reward -11.210, eps 0.02, speed 25.93 f/s
226147: done 123 games, mean reward -10.920, eps 0.02, speed 25.87 f/s
228098: done 124 games, mean reward -10.540, eps 0.02, speed 25.87 f/s
231078: done 125 games, mean reward -10.320, eps 0.02, speed 25.88 f/s
234132

447054: done 230 games, mean reward 16.430, eps 0.02, speed 25.77 f/s
448837: done 231 games, mean reward 16.470, eps 0.02, speed 25.72 f/s
450773: done 232 games, mean reward 16.570, eps 0.02, speed 25.75 f/s
452556: done 233 games, mean reward 16.570, eps 0.02, speed 25.79 f/s
454618: done 234 games, mean reward 16.570, eps 0.02, speed 25.76 f/s
456836: done 235 games, mean reward 16.520, eps 0.02, speed 25.73 f/s
458906: done 236 games, mean reward 16.530, eps 0.02, speed 25.82 f/s
461633: done 237 games, mean reward 16.440, eps 0.02, speed 25.79 f/s
463330: done 238 games, mean reward 16.470, eps 0.02, speed 25.74 f/s
465075: done 239 games, mean reward 16.470, eps 0.02, speed 25.78 f/s
467155: done 240 games, mean reward 16.480, eps 0.02, speed 25.81 f/s
468913: done 241 games, mean reward 16.520, eps 0.02, speed 25.77 f/s
471275: done 242 games, mean reward 16.470, eps 0.02, speed 25.83 f/s
473000: done 243 games, mean reward 16.500, eps 0.02, speed 25.74 f/s
475166: done 244 gam

673126: done 348 games, mean reward 17.580, eps 0.02, speed 25.66 f/s
674791: done 349 games, mean reward 17.580, eps 0.02, speed 19.40 f/s
676896: done 350 games, mean reward 17.560, eps 0.02, speed 21.34 f/s
