In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import gym
import numpy as np
from collections import deque
from gym import spaces
import cv2
cv2.ocl.setUseOpenCL(False)

In [4]:
class FeedForwardNeuralNet(nn.Module):
    """Two-hidden-layer fully connected network with batch normalisation.

    Maps a batch of flat state vectors of shape ``(batch, state_size)`` to
    one output per action, shape ``(batch, action_size)``.

    Args:
        state_size: number of input features per state.
        action_size: number of outputs (one per action).
        hidden_size: width of both hidden layers.
    """

    def __init__(self, state_size, action_size, hidden_size):
        super(FeedForwardNeuralNet, self).__init__()
        self.dl1 = nn.Linear(state_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dl2 = nn.Linear(hidden_size, hidden_size)
        # The activations here are 2-D (batch, hidden_size), so the 1-D
        # variant is required; BatchNorm2d rejects anything but 4-D input.
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.out = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the network output for a batch of states ``x``.

        In training mode the BatchNorm1d layers need more than one sample
        per batch; switch the module to ``eval()`` for single-state inference.
        """
        x = F.relu(self.bn1(self.dl1(x)))
        x = F.relu(self.bn2(self.dl2(x)))
        return self.out(x)

In [5]:
# Device used for models and tensors: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
device

device(type='cuda')

In [7]:
class ReplayBuffer():
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Each stored item is a 5-tuple; ``sample`` returns its five fields as
    stacked arrays in the same order they were stored.

    Args:
        max_size: maximum number of transitions kept. Floats such as ``1e6``
            are accepted and truncated to an integer.

    Raises:
        ValueError: if ``max_size`` is smaller than 1.
    """

    def __init__(self, max_size=1e6):
        max_size = int(max_size)
        if max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.storage = []
        self.max_size = max_size
        # Index of the oldest entry, i.e. the next slot to overwrite once full.
        self.ptr = 0

    def add(self, data):
        """Append ``data``; once the buffer is full, overwrite the oldest entry."""
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = data
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(data)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple of five arrays, one per tuple field. The first two are
            stacked as-is; the last three are reshaped to ``(batch_size, 1)``.

        Raises:
            ValueError: if the buffer is empty.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty replay buffer")

        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in ind]

        def column(field):
            # np.asarray avoids a copy when possible but, unlike
            # np.array(..., copy=False) on NumPy 2, never raises when a copy
            # is unavoidable (e.g. for Python scalars).
            return np.array([np.asarray(item[field]) for item in batch])

        return (
            column(0),
            column(1),
            column(2).reshape(-1, 1),
            column(3).reshape(-1, 1),
            column(4).reshape(-1, 1),
        )

In [8]:
class NoopResetEnv(gym.Wrapper):
    """Wrapper that starts every episode with a number of no-op steps.

    After resetting the wrapped environment, ``reset`` steps it with action 0
    a number of times and returns the last observation obtained.

    Args:
        env: environment to wrap; its first action meaning must be ``'NOOP'``.
        noop_max: upper limit used when drawing the number of no-op steps.
    """

    def __init__(self, env, noop_max=30):
        super().__init__(env)
        self.noop_max = noop_max
        # When set (not None), this fixed count replaces the random draw.
        self.override_num_noops = None
        self.noop_action = 0
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """Reset the wrapped env, run the no-op steps, return the last observation.

        Expects ``env.step`` to return a 4-tuple ``(obs, reward, done, info)``.
        """
        self.env.reset(**kwargs)

        noops = self.override_num_noops
        if noops is None:
            # presumably np_random is a NumPy RandomState, in which case the
            # count is uniform over 1..noop_max — verify for the gym version used
            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)
        assert noops > 0

        obs = None
        for _step in range(noops):
            obs, _reward, done, _info = self.env.step(self.noop_action)
            if done:
                # The episode ended during the no-ops: start over.
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward the action unchanged to the wrapped environment."""
        return self.env.step(ac)

In [9]:
class WarpFrame(gym.ObservationWrapper):
    """Observation wrapper that resizes frames and returns them channel-first.

    Frames are optionally converted from RGB to grayscale, resized to
    ``width`` x ``height`` and transposed to ``(channels, height, width)``.

    Args:
        env: environment to wrap.
        width: output frame width in pixels.
        height: output frame height in pixels.
        grayscale: if True emit 1 channel, otherwise 3.
    """

    def __init__(self, env, width=84, height=84, grayscale=True):
        super().__init__(env)
        self.width = width
        self.height = height
        self.grayscale = grayscale
        channels = 1 if self.grayscale else 3
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(channels, self.height, self.width),
            dtype=np.uint8,
        )

    def observation(self, frame):
        """Convert one raw frame to the resized, channel-first layout."""
        target_size = (self.width, self.height)  # cv2.resize takes (width, height)
        if self.grayscale:
            gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            resized = cv2.resize(gray, target_size, interpolation=cv2.INTER_AREA)
            # Restore the trailing channel axis dropped by the grayscale conversion.
            resized = np.expand_dims(resized, -1)
        else:
            resized = cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)
        return resized.transpose((2, 0, 1))

In [14]:
# Create the Pong environment used by the agent and the training loop.
# NOTE(review): no NoopResetEnv / WarpFrame wrapper is applied in this cell, while
# DeepQNetwork's first convolution takes 4 input channels — confirm that wrapping and
# frame stacking happen in a cell not shown here.
env = gym.make('PongNoFrameskip-v4')

In [15]:
# Number of discrete actions in the environment; passed to DQNAgent as the Q-network output width.
action_size = env.action_space.n

In [11]:
class DeepQNetwork(nn.Module):
    """Convolutional Q-network: three conv layers, one hidden layer, one output per action.

    The first convolution takes 4 input channels and the hidden layer expects
    a 7 x 7 x 64 feature map, which is what 84 x 84 inputs produce. Pixel
    values are divided by 255 before the first layer.

    Args:
        action_size: number of Q-values produced per input.
        hidden_size: width of the fully connected hidden layer.
    """

    def __init__(self, action_size, hidden_size):
        super().__init__()
        # Convolutional feature extractor, each conv followed by batch norm.
        self.cv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.cv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.cv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        # Fully connected head.
        self.fc = nn.Linear(7 * 7 * 64, hidden_size)
        self.bn4 = nn.BatchNorm1d(hidden_size)
        self.out = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return Q-values of shape ``(batch, action_size)`` for a batch of frames."""
        features = x / 255.0
        conv_stages = ((self.cv1, self.bn1), (self.cv2, self.bn2), (self.cv3, self.bn3))
        for conv, norm in conv_stages:
            features = F.relu(norm(conv(features)))

        batch = features.size(0)
        flat = features.view(batch, -1)
        hidden = F.relu(self.bn4(self.fc(flat)))
        return self.out(hidden)

In [12]:
class DQNAgent():
    """DQN agent with an online network, a target network and an Adam optimizer.

    Args:
        action_size: number of discrete actions.
        hidden_size: hidden width passed to ``DeepQNetwork``.
        learning_rate: Adam learning rate for the online network.
    """

    def __init__(self, action_size, hidden_size, learning_rate):
        self.action_size = action_size
        # Remember the device once so the methods do not depend on a global.
        self.device = device
        self.train_net = DeepQNetwork(action_size, hidden_size).to(self.device)
        self.target_net = DeepQNetwork(action_size, hidden_size).to(self.device)
        self.target_net.load_state_dict(self.train_net.state_dict())
        # The target network is never optimised: keep it in eval mode so its
        # batch-norm layers use the synced running statistics and do not
        # update them during target computation.
        self.target_net.eval()
        self.optimizer = optim.Adam(self.train_net.parameters(), lr=learning_rate)

    def select_action(self, s, eps):
        """Pick an action epsilon-greedily for the single state ``s``.

        With probability ``eps`` a uniformly random action index is returned;
        otherwise the action with the largest online Q-value.

        Returns:
            The chosen action as a Python ``int``.
        """
        if np.random.rand() <= eps:
            return int(np.random.randint(self.action_size))

        # A batch of one cannot go through BatchNorm1d in training mode, so
        # evaluate in eval mode and restore the previous mode afterwards.
        was_training = self.train_net.training
        self.train_net.eval()
        try:
            with torch.no_grad():
                input_state = torch.as_tensor(
                    np.array(s), dtype=torch.float32, device=self.device
                ).unsqueeze(0)
                return int(self.train_net(input_state).max(1)[1])
        finally:
            self.train_net.train(was_training)

    def train(self, replay_buffer, batch_size, discount):
        """Run one optimisation step on a batch sampled from ``replay_buffer``.

        ``replay_buffer.sample(batch_size)`` must return
        ``(states, next_states, actions, rewards, dones)`` with the last three
        shaped ``(batch_size, 1)``.

        Returns:
            The smooth-L1 loss of the step as a 0-d NumPy array.
        """
        x0, x1, a, r, d = replay_buffer.sample(batch_size)
        state_batch = torch.as_tensor(x0, dtype=torch.float32, device=self.device)
        next_state_batch = torch.as_tensor(x1, dtype=torch.float32, device=self.device)
        action_batch = torch.as_tensor(a, dtype=torch.long, device=self.device)
        reward_batch = torch.as_tensor(r, dtype=torch.float32, device=self.device)
        # 1 for non-terminal transitions, 0 for terminal ones (no bootstrapping).
        not_done_batch = torch.as_tensor(1. - d, dtype=torch.float32, device=self.device)

        train_q = self.train_net(state_batch).gather(1, action_batch)

        with torch.no_grad():
            next_q = self.target_net(next_state_batch).max(dim=1, keepdim=True)[0]
            target_net_q = reward_batch + not_done_batch * discount * next_q

        loss = F.smooth_l1_loss(train_q, target_net_q)

        self.optimizer.zero_grad()
        loss.backward()
        # Clip every gradient element to [-1, 1].
        torch.nn.utils.clip_grad_value_(self.train_net.parameters(), 1.0)
        self.optimizer.step()

        return loss.detach().cpu().numpy()

    def update_target_network(self, num_iter, update_every):
        """Copy the online weights into the target network every ``update_every`` iterations."""
        if num_iter % update_every == 0:
            self.target_net.load_state_dict(self.train_net.state_dict())

In [17]:
# Replay memory: capacity in transitions, and the buffer instance used by the training loop.
replay_size = 50000
replay_buffer = ReplayBuffer(max_size=replay_size)
# The target network is synced whenever the step counter is a multiple of this value.
update_target_every = 1000

# Training schedule and optimisation settings.
timesteps = 2000000  # total number of environment steps in the training loop
hidden_size = 512  # hidden-layer width passed to the agent
learning_rate = 0.0001  # learning rate passed to the agent
batch_size = 32  # transitions sampled per training step
start_training_after = 10001  # the loop trains only while ts > this value
discount = 0.99  # discount factor passed to train()

# Exploration rate: decreased by epsilon_step per environment step, from
# epsilon_start down to epsilon_min over the first 15% of the timesteps.
epsilon_start = 1.0
epsilon_min = 0.01
epsilon_decay_steps = timesteps * .15
epsilon_step = (epsilon_start - epsilon_min) / epsilon_decay_steps

# Build the agent with the settings above.
dqn_agent = DQNAgent(action_size, hidden_size, learning_rate)


In [None]:
# Training loop: act epsilon-greedily, store transitions, train once enough
# steps have elapsed, and record per-episode statistics.
stats_rewards_list = []  # one (episode, total_reward, episode_length) tuple per finished episode
stats_every = 10
total_reward = 0
episode = 1
episode_length = 0
stats_loss = 0.
epsilon = epsilon_start
state = env.reset()

for ts in range(timesteps):
    action = dqn_agent.select_action(state, epsilon)
    # Linear decay, floored at epsilon_min.
    epsilon = max(epsilon - epsilon_step, epsilon_min)

    # Expects the 4-tuple step API: (observation, reward, done, info).
    next_state, reward, done, info = env.step(action)
    total_reward += reward
    episode_length += 1
    replay_buffer.add((state, next_state, action, reward, float(done)))

    if ts > start_training_after:
        stats_loss = dqn_agent.train(replay_buffer, batch_size, discount)
        dqn_agent.update_target_network(ts, update_target_every)

    if done:
        state = env.reset()
        stats_rewards_list.append((episode, total_reward, episode_length))
        episode += 1
        total_reward = 0
        episode_length = 0
    else:
        # Advance to the observation just received; without this the agent
        # would keep acting on (and storing) the episode's first observation.
        state = next_state

In [18]:
class DQNAgent():
    """Double-DQN agent with hard and soft target-network updates.

    The online network chooses the next action and the target network
    evaluates it when building the bootstrap target.

    Args:
        action_size: number of discrete actions.
        hidden_size: hidden width passed to ``DeepQNetwork``.
        learning_rate: Adam learning rate for the online network.
    """

    def __init__(self, action_size, hidden_size, learning_rate):
        self.action_size = action_size
        # Remember the device once so the methods do not depend on a global.
        self.device = device
        self.train_net = DeepQNetwork(action_size, hidden_size).to(self.device)
        self.target_net = DeepQNetwork(action_size, hidden_size).to(self.device)
        self.target_net.load_state_dict(self.train_net.state_dict())
        # The target network is never optimised: keep it in eval mode so its
        # batch-norm layers use the synced running statistics and do not
        # update them during target computation.
        self.target_net.eval()
        self.optimizer = optim.Adam(self.train_net.parameters(), lr=learning_rate)

    def select_action(self, s, eps):
        """Pick an action epsilon-greedily for the single state ``s``.

        With probability ``eps`` a uniformly random action index is returned;
        otherwise the action with the largest online Q-value.

        Returns:
            The chosen action as a Python ``int``.
        """
        if np.random.rand() <= eps:
            return int(np.random.randint(self.action_size))

        # A batch of one cannot go through BatchNorm1d in training mode, so
        # evaluate in eval mode and restore the previous mode afterwards.
        was_training = self.train_net.training
        self.train_net.eval()
        try:
            with torch.no_grad():
                input_state = torch.as_tensor(
                    np.array(s), dtype=torch.float32, device=self.device
                ).unsqueeze(0)
                return int(self.train_net(input_state).max(1)[1])
        finally:
            self.train_net.train(was_training)

    def train(self, replay_buffer, batch_size, discount):
        """Run one Double-DQN optimisation step on a sampled batch.

        ``replay_buffer.sample(batch_size)`` must return
        ``(states, next_states, actions, rewards, dones)`` with the last three
        shaped ``(batch_size, 1)``.

        Returns:
            The smooth-L1 loss of the step as a 0-d NumPy array.
        """
        x0, x1, a, r, d = replay_buffer.sample(batch_size)
        state_batch = torch.as_tensor(x0, dtype=torch.float32, device=self.device)
        next_state_batch = torch.as_tensor(x1, dtype=torch.float32, device=self.device)
        action_batch = torch.as_tensor(a, dtype=torch.long, device=self.device)
        reward_batch = torch.as_tensor(r, dtype=torch.float32, device=self.device)
        # 1 for non-terminal transitions, 0 for terminal ones (no bootstrapping).
        not_done_batch = torch.as_tensor(1. - d, dtype=torch.float32, device=self.device)

        train_q = self.train_net(state_batch).gather(1, action_batch)

        with torch.no_grad():
            # Online network selects the action, target network scores it.
            train_argmax = self.train_net(next_state_batch).max(1, keepdim=True)[1]
            next_q = self.target_net(next_state_batch).gather(1, train_argmax)
            target_net_q = reward_batch + not_done_batch * discount * next_q

        loss = F.smooth_l1_loss(train_q, target_net_q)

        self.optimizer.zero_grad()
        loss.backward()
        # Clip every gradient element to [-1, 1].
        torch.nn.utils.clip_grad_value_(self.train_net.parameters(), 1.0)
        self.optimizer.step()

        return loss.detach().cpu().numpy()

    def update_target_network(self, num_iter, update_every):
        """Copy the online weights into the target network every ``update_every`` iterations."""
        if num_iter % update_every == 0:
            self.target_net.load_state_dict(self.train_net.state_dict())

    def update_target_network_soft(self, num_iter, update_every, update_tau=0.001):
        """Blend the online weights into the target network every ``update_every`` iterations.

        Parameters are updated as ``target = (1 - tau) * target + tau * online``.
        Buffers (the batch-norm running statistics and step counters) are
        copied verbatim so the eval-mode target network does not keep stale
        normalisation statistics.
        """
        if num_iter % update_every != 0:
            return
        # No autograd graph is needed for a weight copy.
        with torch.no_grad():
            for target_var, var in zip(self.target_net.parameters(), self.train_net.parameters()):
                target_var.copy_((1. - update_tau) * target_var + update_tau * var)
            for target_buf, buf in zip(self.target_net.buffers(), self.train_net.buffers()):
                target_buf.copy_(buf)

In [19]:
class DuelingDeepQNetwork(nn.Module):
    """Convolutional Q-network with a dueling head.

    Shares the convolutional trunk and hidden layer layout of a standard
    DQN, then splits into a scalar state-value head and a per-action
    advantage head combined as ``Q = V + A - mean(A)``.

    The first convolution takes 4 input channels and the hidden layer expects
    a 7 x 7 x 64 feature map, which is what 84 x 84 inputs produce. Pixel
    values are divided by 255 before the first layer.

    Args:
        action_size: number of Q-values produced per input.
        hidden_size: width of the fully connected hidden layer.
    """

    def __init__(self, action_size, hidden_size):
        super(DuelingDeepQNetwork, self).__init__()
        self.cv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.cv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.cv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.fc = nn.Linear(7 * 7 * 64, hidden_size)
        self.bn4 = nn.BatchNorm1d(hidden_size)
        self.dueling_value = nn.Linear(hidden_size, 1)
        self.dueling_action = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return Q-values of shape ``(batch, action_size)`` for a batch of frames."""
        x = x / 255.0
        x = F.relu(self.bn1(self.cv1(x)))
        x = F.relu(self.bn2(self.cv2(x)))
        x = F.relu(self.bn3(self.cv3(x)))
        x = F.relu(self.bn4(self.fc(x.view(x.size(0), -1))))
        # Evaluate each head once; centring the advantages makes the
        # value / advantage decomposition identifiable.
        advantage = self.dueling_action(x)
        value = self.dueling_value(x)
        return advantage - advantage.mean(dim=1, keepdim=True) + value