In [1]:
import wrappers
import dqn_model

In [2]:
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim

from tensorboardX import SummaryWriter

In [3]:
# Identifier of the environment handed to wrappers.make_env
DEFAULT_ENV_NAME = 'PongNoFrameskip-v4'

# Mean-reward target; the name suggests it is the threshold at which
# training counts as solved -- confirm against the training loop.
MEAN_REWARD_BOUND = 19.0

In [4]:
# --- DQN training hyperparameters ---

# Discount factor used in the Bellman approximation
GAMMA = 0.99

# Number of transitions sampled from the replay buffer per batch
BATCH_SIZE = 32

# Maximum number of transitions kept in the replay buffer
REPLAY_SIZE = 10000

# Number of frames to collect before training starts, so that the replay
# buffer is populated first
REPLAY_START_SIZE = 10000

# Learning rate passed to the Adam optimiser
LEARNING_RATE = 1e-4

# How often (in frames) the target model is synchronised
SYNC_TARGET_FRAMES = 1000

In [5]:
# Epsilon-greedy exploration schedule: epsilon starts at EPSILON_START,
# is reduced by frame_idx / EPSILON_DECAY_LAST_FRAME and is never allowed
# to drop below EPSILON_FINAL.
EPSILON_START = 1.0
EPSILON_FINAL = 0.01
EPSILON_DECAY_LAST_FRAME = 150000

In [6]:
# Run on the first CUDA GPU when one is available; otherwise fall back to
# the CPU so the notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Name of the environment used for this run
ENV = DEFAULT_ENV_NAME

In [7]:
# Code for Experience Buffer

# One environment transition: the state the action was taken in, the action,
# the reward received, the episode-termination flag and the resulting state.
Experience = collections.namedtuple(
    'Experience', field_names=['state', 'action', 'reward',
                               'done', 'new_state'])


class ExperienceBuffer:
    """Fixed-capacity replay buffer of `Experience` transitions.

    Backed by a `collections.deque` with `maxlen=capacity`, so once the
    buffer is full every append discards the oldest stored transition.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most `capacity` transitions."""
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Store one transition at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of NumPy arrays
            `(states, actions, rewards, dones, next_states)`, each with
            `batch_size` as its first dimension. Actions are int64,
            rewards float32 and dones bool.

        Raises:
            ValueError: if `batch_size` exceeds the number of stored
                transitions (sampling is done without replacement).
        """
        indices = np.random.choice(len(self.buffer), batch_size,
                                   replace=False)
        # Transpose the list of transitions into one sequence per field.
        states, actions, rewards, dones, next_states = \
            zip(*[self.buffer[idx] for idx in indices])
        # The batch has to be returned: without this the caller gets None.
        return (np.array(states),
                np.array(actions, dtype=np.int64),
                np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.bool_),
                np.array(next_states))

In [17]:
class Agent:
    """Steps through an environment and records transitions in a buffer.

    The environment must provide `reset()` returning the first observation,
    `step(action)` returning `(new_state, reward, is_done, info)` and
    `action_space.sample()`. The buffer only needs an `append` method.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Start a new episode and clear the accumulated episode reward."""
        # Use the agent's own environment, not a module-level `env`, so the
        # class does not depend on notebook cell execution order.
        self.state = self.env.reset()
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Perform one environment step and store the transition.

        With probability `epsilon` a random action is sampled from the
        environment's action space; otherwise the current state is passed
        through `net` and the action with the highest Q-value is taken.
        The resulting transition is appended to the experience buffer.

        Args:
            net: model mapping a batch of states to per-action Q-values.
            epsilon: probability of taking a random action.
            device: device the state tensor is moved to before calling `net`.

        Returns:
            The total reward of the episode if this step ended it (the
            agent is then reset for a new episode), otherwise None.
        """
        done_reward = None

        # Exploration vs. exploitation
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # Add a leading batch dimension of size 1 for the network.
            state_a = np.array([self.state])
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward,
                         is_done, new_state)

        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()

        return done_reward

    @staticmethod
    def calc_loss(batch, net, tgt_net, device="cpu", gamma=None):
        """Compute the MSE loss between predicted and Bellman-target Q-values.

        Declared as a static method so that both `Agent.calc_loss(...)` and
        `agent.calc_loss(...)` work.

        Args:
            batch: 5-tuple `(states, actions, rewards, dones, next_states)`
                of equally long sequences or arrays.
            net: model being trained; gradients flow through it.
            tgt_net: target model used for the next-state values; no
                gradients flow through it.
            device: device the batch tensors are moved to.
            gamma: discount factor; defaults to the module-level `GAMMA`.

        Returns:
            A scalar tensor holding the loss.
        """
        if gamma is None:
            gamma = GAMMA

        states, actions, rewards, dones, next_states = batch

        # np.asarray avoids `np.array(..., copy=False)`, which raises on
        # NumPy >= 2.0 whenever a copy is unavoidable.
        states_v = torch.tensor(np.asarray(states)).to(device)
        next_states_v = torch.tensor(np.asarray(next_states)).to(device)
        # gather() needs int64 indices; the loss needs matching float dtypes.
        actions_v = torch.tensor(np.asarray(actions),
                                 dtype=torch.int64).to(device)
        rewards_v = torch.tensor(np.asarray(rewards),
                                 dtype=torch.float32).to(device)
        done_mask = torch.tensor(np.asarray(dones, dtype=bool)).to(device)

        # Q-values predicted by `net` for the actions that were actually
        # taken, picked out with gather().
        state_action_values = net(states_v).gather(
            1, actions_v.unsqueeze(-1)).squeeze(-1)

        # The target values are treated as constants: computing them without
        # gradient tracking keeps backpropagation of the loss from touching
        # the next-state predictions.
        with torch.no_grad():
            next_state_values = tgt_net(next_states_v).max(1)[0]
            # A terminal transition has no future reward to discount.
            next_state_values[done_mask] = 0.0

        expected_state_action_values = next_state_values * gamma + rewards_v

        return nn.MSELoss()(state_action_values,
                            expected_state_action_values)

Create an environment with all the required wrappers applied, the neural network that we are going to train, and a target network with the same architecture.

In [18]:
# Build the wrapped environment from ENV, the same name that labels the
# TensorBoard writer, so the two cannot drift apart.
env = wrappers.make_env(ENV)

In [19]:
# Observation shape and number of discrete actions, one per line
print(env.observation_space.shape, env.action_space.n, sep="\n")

(4, 84, 84)
6


In [20]:
# Two separately constructed DQN models with identical architecture:
# `net` first, then `tgt_net`, both moved to the selected device.
net, tgt_net = (
    dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    for _ in range(2)
)

In [21]:
# TensorBoard writer; the run comment is "-" followed by the environment name
writer = SummaryWriter(comment="-%s" % ENV)

In [22]:
# Print the layer structure of the network that will be trained
print(net)

DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


Create an experience replay buffer of the required size and pass
it to the agent.

In [23]:
# Replay buffer limited to REPLAY_SIZE transitions
buffer = ExperienceBuffer(capacity=REPLAY_SIZE)

# The agent plays in `env` and stores every transition in `buffer`
agent = Agent(env=env, exp_buffer=buffer)

# Exploration starts at its initial value
epsilon = EPSILON_START

Before the training loop, create an optimizer, a buffer for full-episode rewards,
a counter of frames, and several variables to track our speed.

In [24]:
# Adam optimiser over the parameters of the network being trained
optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE)

# Total reward of every finished episode
total_rewards = list()

# frame_idx counts all frames played; ts_frame is the frame index at the
# last speed measurement
frame_idx, ts_frame = 0, 0

# Best mean reward seen so far (None until one is recorded)
best_m_reward = None

# Wall-clock time of the last speed measurement
ts = time.time()

In this block, we ask the agent to make a single step in the environment (using our current network and value for epsilon). This function returns a non-None result only if this is the final step in the episode

In [26]:
# NOTE(review): this loop only plays the environment and logs statistics.
# No optimisation step (sampling a batch, computing the loss, updating
# `net`, syncing `tgt_net`) is performed in this cell.
while True:
    # Count the iteration and decrease epsilon, never below EPSILON_FINAL
    frame_idx += 1
    epsilon = max(EPSILON_FINAL,
                  EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

    # The environment step must happen inside the loop: calling play_step
    # once before it would leave `reward` frozen and the loop would spin
    # without ever playing another frame.
    reward = agent.play_step(net, epsilon, device=device)

    # play_step returns a reward only on the step that ends an episode
    if reward is not None:
        total_rewards.append(reward)

        # Frames per second since the previous finished episode; divide by
        # the elapsed time, not by the absolute timestamp.
        elapsed = time.time() - ts
        speed = (frame_idx - ts_frame) / elapsed if elapsed > 0 else 0.0
        ts_frame = frame_idx
        ts = time.time()

        # Mean reward over the last 100 finished episodes
        m_reward = np.mean(total_rewards[-100:])
        print("%d : done %d games, reward %.3f, "
              "eps %.2f, speeed %.2f f/s" % (
            frame_idx, len(total_rewards), m_reward, epsilon,
            speed
            ))
        writer.add_scalar("epsilon", epsilon, frame_idx)
        writer.add_scalar("speed", speed, frame_idx)
        writer.add_scalar("reward_100", m_reward, frame_idx)
        writer.add_scalar("reward", reward, frame_idx)

        # Stop once the mean reward exceeds the configured bound, so the
        # cell can finish without a manual interrupt.
        if m_reward > MEAN_REWARD_BOUND:
            print("Solved in %d frames!" % frame_idx)
            break

KeyboardInterrupt: 