## Colab setup

In [None]:
# Install OpenAI Gym; stdout and stderr are discarded to keep the cell quiet.
# NOTE(review): the next cell's pip command installs gym again, so this cell looks redundant.
!pip install gym > /dev/null 2>&1

In [None]:
# Python packages: gym plus pyvirtualdisplay (its Display class is started in the
# visualization section of this notebook). All output is discarded.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# System packages installed through apt: xvfb, python-opengl and ffmpeg.
# Presumably these back the virtual display and the mp4 recording -- TODO confirm.
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [None]:
# Refresh the apt package index, then install cmake (output discarded for both).
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
# Upgrade setuptools. Unlike the other commands in this cell, the output is NOT
# sent to /dev/null (stderr is only merged into stdout), so it stays visible.
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1

# Deep Q-Learning (DQN)


In DQN, the $Q$-function is parameterized by a neural network of parameters $\theta$. The network takes as input a state $s$ and outputs $Q(s, a, \theta)$ for all actions $a$. 

The network is trained in a way that is similar to Fitted Q Iteration. At each time $T$, the agent has observed the transitions $(s_t, a_t, r_t, s_t')_{t=1}^T$, which are stored in a __replay buffer__.

In addition to the network with parameters $\theta$, DQN keeps another network with the same architecture and parameters $\tilde{\theta}$, called __target network__. 
To update the parameters $\theta$, we sample $N$ transitions from the __replay buffer__ and define the loss 

$$
L(\theta) = \sum_{i=1}^N [Q(s_i, a_i, \theta) - (r_i + \gamma\max_{a'}Q(s'_i,a', \tilde{\theta}))]^2
$$

and update 

$$
\theta \gets \theta - \eta \nabla_\theta L(\theta).
$$


Every $C$ iterations, the target network is updated as $\tilde{\theta} \gets \theta$. 

At each time $t$, DQN updates the networks as described above, selects an action according to an $\epsilon$-greedy policy, plays the action and stores the new data in the replay buffer.

In [None]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from copy import deepcopy

import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from IPython.display import clear_output
from pathlib import Path

import random, os.path, math, glob, csv, base64, itertools, sys
from pprint import pprint

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import io
from IPython.display import HTML

## Step 1: Define the parameters

In [None]:
# Environment (shared by the training loop and the visualization cells)
env = gym.make("CartPole-v0")

# Discount factor
GAMMA = 0.99

# Batch size: number of transitions sampled from the replay buffer per update
BATCH_SIZE = 256
# Capacity of the replay buffer (once full, the oldest entries are overwritten)
BUFFER_CAPACITY = 10000
# Update target net every ... episodes
UPDATE_TARGET_EVERY = 20

# Initial value of epsilon
EPSILON_START = 1.0
# Parameter to decrease epsilon: after `ep` episodes,
# epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN) * exp(-ep / DECREASE_EPSILON)
DECREASE_EPSILON = 200
# Minimum value of epsilon
EPSILON_MIN = 0.05

# Number of training episodes
N_EPISODES = 300

# Learning rate (passed to the Adam optimizer)
# NOTE(review): 0.1 is large for Adam; consider something like 1e-3 if training is unstable.
LEARNING_RATE = 0.1

## Step 2: Define the replay buffer

In [None]:

class ReplayBuffer:
    """
    Fixed-capacity circular buffer of transitions.

    Each transition is stored as a tuple
    ``(state, action, reward, next_state, done)``. Once ``capacity``
    transitions have been stored, every new transition overwrites the oldest
    one.
    """

    def __init__(self, capacity):
        """
        Parameters
        ----------
        capacity : int
            Maximum number of transitions kept in memory. Must be positive.

        Raises
        ------
        ValueError
            If ``capacity`` is not strictly positive.
        """
        if capacity <= 0:
            # Fail early: a non-positive capacity would otherwise only surface
            # as an IndexError on the first push.
            raise ValueError("capacity must be a positive integer")
        self.capacity = capacity
        self.memory = []
        # Index of the slot that the next push will write to.
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Saves a transition, overwriting the oldest one when the buffer is full."""
        transition = (state, action, reward, next_state, done)
        if len(self.memory) < self.capacity:
            # Still growing: the write position is always the end of the list.
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """
        Draw ``batch_size`` distinct transitions uniformly at random.

        Returns a tuple of five numpy arrays, each with ``batch_size`` entries
        along the first axis:
         - states
         - actions
         - rewards
         - next states
         - done

        A concrete tuple (rather than a lazy iterator) is returned so the
        result can be unpacked, indexed and reused.

        Raises
        ------
        ValueError
            If ``batch_size`` is larger than the number of stored transitions
            (raised by ``random.sample``).
        """
        samples = random.sample(self.memory, batch_size)
        return tuple(np.asarray(field) for field in zip(*samples))

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

## Step 3: Define the neural network architecture, objective and optimizer

In [None]:
class Net(nn.Module):
    """
    Small fully connected network: Linear -> ReLU -> Linear.

    Takes inputs whose last dimension is `obs_size` and produces `n_actions`
    outputs per input, with a single hidden layer of `hidden_size` units.
    """
    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are listed in order and wrapped in a Sequential container,
        # stored under the attribute name `net`.
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to `x` and return the result."""
        out = self.net(x)
        return out

## Step 4: Implement DQN

In [None]:
#
#  Some useful functions (TO IMPLEMENT)
#

def choose_action(state, epsilon):
    """
    Return action according to an epsilon-greedy exploration policy.

    With probability `epsilon` a random action is sampled from the
    environment's action space; otherwise the action with the largest
    Q-value predicted by `q_net` for `state` is returned.

    Relies on the notebook-level globals `env` and `q_net`, so it must only
    be called once both exist.

    Parameters
    ----------
    state : array-like
        A single observation from the environment.
    epsilon : float
        Exploration probability in [0, 1]; 0 gives the purely greedy action.

    Returns
    -------
    int
        The selected action.
    """
    # Explore: uniformly random action.
    if np.random.rand() < epsilon:
        return env.action_space.sample()

    # Exploit: greedy action w.r.t. the current Q-network. No gradients are
    # needed for action selection.
    with torch.no_grad():
        state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        q_values = q_net(state_tensor)
    return int(q_values.argmax(dim=1).item())

def simulate_agent(n_sim=5):
    """
    Monte Carlo evaluation of DQN agent

    Runs `n_sim` full episodes with the greedy policy induced by the global
    `q_net` and returns the undiscounted sum of rewards of each episode.

    The episodes are played on a deep copy of the global `env`, so the
    environment used by the caller (e.g. the training loop) is left untouched.

    Parameters
    ----------
    n_sim : int
        Number of evaluation episodes.

    Returns
    -------
    numpy.ndarray
        Array of length `n_sim` with the total reward of each episode.
    """
    eval_env = deepcopy(env)
    sum_rewards = np.zeros(n_sim)
    for sim in range(n_sim):
        state = eval_env.reset()
        done = False
        while not done:
            # Greedy action (no exploration) from the current Q-network.
            with torch.no_grad():
                state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
                action = int(q_net(state_tensor).argmax(dim=1).item())
            state, reward, done, _ = eval_env.step(action)
            sum_rewards[sim] += reward
    return sum_rewards

In [None]:
#
# Train
# 

# Evaluate the greedy policy every EVAL_EVERY completed episodes and stop early
# once its mean return reaches REWARD_THRESHOLD.
EVAL_EVERY = 5
REWARD_THRESHOLD = 199

# initialize replay buffer
replay_buffer = ReplayBuffer(BUFFER_CAPACITY)

# create network and target network
hidden_size = 128
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

q_net = Net(obs_size, hidden_size, n_actions)
target_net = Net(obs_size, hidden_size, n_actions)
# Start the target network from the same weights as the online network;
# otherwise the first targets would come from an unrelated random
# initialization until the first periodic synchronization.
target_net.load_state_dict(q_net.state_dict())

# objective and optimizer
optimizer = optim.Adam(params=q_net.parameters(), lr=LEARNING_RATE)


# Algorithm
state = env.reset()
epsilon = EPSILON_START
ep = 0           # number of completed episodes
total_time = 0   # number of environment steps taken so far
while ep < N_EPISODES:
    # take an action and observe outcome
    action = choose_action(state, epsilon)
    next_state, reward, done, _ = env.step(action)

    # add data to replay buffer
    replay_buffer.push(state, action, reward, next_state, done)

    if len(replay_buffer) > BATCH_SIZE:
        #########################
        # UPDATE MODEL
        #########################
        # get batch
        states, actions, rewards, next_states, dones = replay_buffer.sample(BATCH_SIZE)

        # Convert numpy nd_array to torch tensors for calculation
        states = torch.as_tensor(states, dtype=torch.float32)
        actions = torch.as_tensor(actions, dtype=torch.int64)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        next_states = torch.as_tensor(next_states, dtype=torch.float32)
        # 1.0 for non-terminal transitions, 0.0 for terminal ones
        not_done = 1.0 - torch.as_tensor(dones, dtype=torch.float32)

        # 1) Current Q values: the network outputs one value per action;
        #    gather picks the value of the action actually taken.
        Q_values = q_net(states).gather(1, actions.view(-1, 1))

        # 2) Targets
        # 2.a -> max_a' Q(s', a') from the target network. No gradient must
        #        flow through the target, and terminal transitions contribute
        #        no bootstrap value.
        with torch.no_grad():
            Qtarget_values = target_net(next_states).max(dim=1).values * not_done

        # 2.b -> full target r + gamma * max_a' Q(s', a')
        target_Q_values = rewards + GAMMA * Qtarget_values
        target_Q_values = target_Q_values.unsqueeze(1)

        # 3) compute loss (mean squared TD error over the batch)
        loss = F.mse_loss(Q_values, target_Q_values)

        # Optimize the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value = loss.item()

    # update state
    state = next_state

    # end episode if done
    if done:
        state = env.reset()
        ep   += 1
        # `ep` now counts completed episodes, so it is used directly both for
        # the evaluation schedule and for the printed episode number.
        if ep % EVAL_EVERY == 0:
            eval_rewards = simulate_agent()
            print("episode =", ep, ", reward = ", np.mean(eval_rewards))
            if np.mean(eval_rewards) >= REWARD_THRESHOLD:
                break

        # update target network
        if ep % UPDATE_TARGET_EVERY == 0:
            target_net.load_state_dict(q_net.state_dict())
        # decrease epsilon
        epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN) * \
                        np.exp(-1. * ep / DECREASE_EPSILON )    

    total_time += 1

# re-evaluate the agent at the end
rewards = simulate_agent(20)
print("")
print("mean reward after training = ", np.mean(rewards))

## Visualizing the agent

In [None]:
def show_video(directory):
    """
    Display every ``*.mp4`` file found directly in `directory` as an inline
    HTML video element.

    Each file is read in full and embedded as a base64 data URI; the
    resulting video tags are joined with ``<br>`` and shown in the notebook
    output.
    """
    video_tag = '''<video alt="{}" autoplay 
                      loop controls style="height: 400px;">
                      <source src="data:video/mp4;base64,{}" type="video/mp4" />
                 </video>'''
    fragments = [
        video_tag.format(path, base64.b64encode(path.read_bytes()).decode('ascii'))
        for path in Path(directory).glob("*.mp4")
    ]
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(fragments)))
    
def make_seed(seed):
    """
    Seed the random number generators used in this notebook.

    Seeds Python's `random` module (used by `ReplayBuffer.sample`), NumPy's
    global generator and PyTorch's generator with the same value.

    NOTE(review): the Gym environment has its own random state, which is not
    seeded here.

    Parameters
    ----------
    seed : int
        Seed value shared by the three generators.
    """
    random.seed(seed)
    np.random.seed(seed=seed)
    torch.manual_seed(seed=seed)
  
# Create and start a pyvirtualdisplay Display (not visible, 1400x900).
# Presumably this provides the off-screen display needed to render and record
# the environment on a machine without a screen -- TODO confirm.
# NOTE(review): `Display` is already imported in the imports cell, so this
# import is a duplicate.
from pyvirtualdisplay import Display
# NOTE(review): in a notebook the name `display` normally refers to IPython's
# display helper; rebinding it here may shadow that -- consider another name.
display = Display(visible=0, size=(1400, 900))
display.start()

In [None]:
# Wrap the environment in a Monitor that writes a video of every episode to
# ./gym-results (existing results there are overwritten because force=True).
# Note that `env` is rebound to the wrapped environment.
env = Monitor(
    env,
    "./gym-results",
    force=True,
    video_callable=lambda episode: True,
)
# Play a single episode until the environment reports it is done.
for episode in range(1):
    state = env.reset()
    done = False
    while True:
        action = env.action_space.sample() # MODIFY THIS PART TO COMPUTE THE ACTION WITH DQN
        state, reward, done, info = env.step(action)
        if done:
            break
# Close the monitored environment, then display the recorded video(s).
env.close()
show_video("./gym-results")