In [None]:
### This cell is needed to run the code in Colab
!apt-get install -y xvfb x11-utils
!pip install pyvirtualdisplay==0.2.* \
             PyOpenGL==3.1.* \
             PyOpenGL-accelerate==3.1.*
import pyvirtualdisplay


# Create and start an off-screen virtual display through pyvirtualdisplay
# (backed by the Xvfb package installed above). Presumably this is what lets
# the environment be rendered on a machine without a physical screen such as
# Colab - confirm if running elsewhere.
_display = pyvirtualdisplay.Display(visible=False,  # use False with Xvfb
                                    size=(1400, 900))
# Assign the return value of start() to a throwaway name so that the cell does
# not echo it as its output.
_ = _display.start()

In [None]:
import random
import torch
import numpy as np
import gym
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from torch import nn
import torch.nn.functional as F
import torchvision.transforms as T
from PIL import Image
from collections import deque # this python module implements exactly what we need for the replay memeory

In [None]:
# Seed NumPy, PyTorch and Python's `random` module with the same value (0) so
# that every run of the notebook draws the same pseudo-random numbers.
# A loop is used (instead of bare calls) so the cell produces no output.
for _seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    _seed_fn(0)

In [None]:
import os
# CUDA_LAUNCH_BLOCKING=1 is the CUDA setting that makes kernel launches
# synchronous: errors are reported at the call that caused them, at the price
# of slower GPU execution. It is a debugging aid.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# Switch for the training cell, which is guarded by `if not trained:`.
# True  -> training is skipped; the test cell then needs an existing
#          'RL-DQN.torch' weights file.
# False -> the training loop runs and writes 'RL-DQN.torch'.
trained = True

In [None]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training device: {device}")

In [None]:
class ReplayMemory(object):
    """Bounded FIFO store of transitions with uniform random sampling.

    Each transition is kept as the tuple ``(state, action, next_state, reward)``.
    Once ``capacity`` transitions are stored, adding a new one drops the oldest.
    """

    def __init__(self, capacity):
        # A deque with `maxlen` evicts from the opposite end automatically
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def push(self, state, action, next_state, reward):
        """Append one transition, evicting the oldest one if the memory is full."""
        transition = (state, action, next_state, reward)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return up to ``batch_size`` distinct stored transitions, chosen at random.

        When fewer than ``batch_size`` transitions are stored, all of them are
        returned (in random order).
        """
        stored = len(self)
        n_draw = batch_size if batch_size < stored else stored
        return random.sample(self.memory, n_draw)

In [None]:
class ConvDQN(nn.Module):
    """Convolutional Q-network: a stack of 4 frames in, one Q-value per action out.

    Args:
        outputs: number of actions, i.e. size of the output layer.
        w, h: spatial size of the input frames. They are only used to size the
            first linear layer, and only through the product of the two reduced
            sizes, so it does not matter which of the two is the frame height.
    """

    def __init__(self, outputs, w=40, h=60):
        super().__init__()

        # Feature extractor: two strided convolutions, each followed by batch
        # normalisation and an in-place ReLU. The input must have 4 channels.
        self.conv = nn.Sequential(
            nn.Conv2d(4, 16, kernel_size=5, stride=3),
            nn.BatchNorm2d(16),
            nn.ReLU(True),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),
            nn.BatchNorm2d(32),
            nn.ReLU(True),
        )

        def _size_after_conv(size, kernel_size, stride):
            # Output length of an un-padded convolution along one dimension
            return (size - kernel_size) // stride + 1

        # The width of the first linear layer depends on the size of the
        # feature map left by the two convolutions, so derive it here.
        feat_h = _size_after_conv(_size_after_conv(h, 5, 3), 4, 2)
        feat_w = _size_after_conv(_size_after_conv(w, 5, 3), 4, 2)
        n_flat_features = feat_h * feat_w * 32

        # Fully connected head mapping the flattened features to Q-values
        self.head = nn.Sequential(
            nn.Linear(n_flat_features, 512),
            nn.ReLU(True),
            nn.Linear(512, 256),
            nn.ReLU(True),
            nn.Linear(256, outputs),
        )

    def forward(self, x):
        """Return the Q-values for a batch ``x`` of stacked frames."""
        features = self.conv(x)
        # Keep the batch dimension and flatten everything else
        return self.head(features.view(features.size(0), -1))

In [None]:
# Pre-processing applied to every cropped frame in Agent.get_screen:
#   1. convert the tensor to a PIL image,
#   2. resize it to 40 rows x 60 columns (the same values as Agent.ROWS and
#      Agent.COLS) with bicubic interpolation,
#   3. convert back to a tensor,
#   4. reduce it to a single grayscale channel.
resize = T.Compose([T.ToPILImage(),
                    T.Resize((40,60), interpolation=T.InterpolationMode.BICUBIC), 
                    T.ToTensor(),
                    T.Grayscale(num_output_channels=1)])

class Agent:
    """Gym environment wrapper whose states are stacks of rendered frames.

    Instead of the observation returned by the environment, the state handed
    to the network is a tensor of shape ``(1, REM_STEP, ROWS, COLS)`` holding
    the most recent pre-processed screenshots, newest first.
    """

    def __init__(self, env_name):
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.env.seed(0)
        # Frame size (rows x columns) and number of frames kept in the stack
        self.ROWS, self.COLS, self.REM_STEP = 40, 60, 4
        self.image_memory = torch.zeros(self.REM_STEP, self.ROWS, self.COLS)
        self.n_actions = self.env.action_space.n

    def get_cart_location(self, screen_width):
        """Return the horizontal pixel position of the middle of the cart."""
        pixels_per_unit = screen_width / (self.env.x_threshold * 2)
        cart_x = self.env.state[0]
        return int(cart_x * pixels_per_unit + screen_width / 2.0)

    def get_screen(self):
        """Grab a frame, push it on the frame stack and return the stack.

        The returned tensor has a leading batch dimension of size one.
        """
        # The rendered frame is height x width x channels (400x600x3 according
        # to the original author, sometimes larger); move channels first.
        frame = self.env.render(mode='rgb_array').transpose((2, 0, 1))
        _, frame_h, frame_w = frame.shape

        # Vertical crop: keep the band between 40% and 80% of the height,
        # which is where the cart is drawn.
        top, bottom = int(frame_h * 0.4), int(frame_h * 0.8)
        frame = frame[:, top:bottom]

        # Horizontal crop: a window 60% of the width wide, centred on the cart
        # unless that would run past the left or right border.
        view_w = int(frame_w * 0.6)
        half_view = view_w // 2
        cart_px = self.get_cart_location(frame_w)
        if cart_px < half_view:
            columns = slice(view_w)
        elif cart_px > frame_w - half_view:
            columns = slice(-view_w, None)
        else:
            columns = slice(cart_px - half_view, cart_px + half_view)
        frame = frame[:, :, columns]

        # Scale to float32 in [0, 1] and hand over to torch
        frame = torch.from_numpy(np.ascontiguousarray(frame, dtype=np.float32) / 255)

        # Shift the stack by one slot (the oldest frame wraps around to slot 0)
        # and overwrite slot 0 with the newest pre-processed frame.
        self.image_memory = torch.roll(self.image_memory, 1, dims=0)
        self.image_memory[0, :, :] = resize(frame)
        return self.image_memory.unsqueeze(0)

    def reset(self):
        """Reset the environment and fill the whole stack with the first frame."""
        self.env.reset()
        for _ in range(self.REM_STEP):
            stacked_frames = self.get_screen()
        return stacked_frames

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        The environment's own observation and info dict are discarded; the
        next state is the updated frame stack.
        """
        _obs, reward, done, _info = self.env.step(action)
        return self.get_screen(), reward, done

    def render(self):
        """Render the environment with its default mode."""
        self.env.render()
       

In [None]:
# Name of the gym environment to solve, and the frame-stacking wrapper
# (Agent) built around it; the training loop interacts only with `agent`.
env_name = 'CartPole-v0'
agent = Agent(env_name)

In [None]:
# --- Hyperparameters ---
batch_size = 32  # transitions requested from the replay memory at every update step
gamma = 0.999  # discount factor applied to the next-state value in update_step
learning_rate = 5e-5  # learning rate of the Adam optimizer created below
bad_state_penalty = -100  # added to the reward of the step that ends an episode
target_net_update_steps = 25  # the target network is refreshed every this many EPISODES

# Get number of actions from gym action space
n_actions = agent.n_actions

# Two networks with the same architecture: the policy network is the one being
# optimised, the target network provides the bootstrap values in update_step.
policy_net = ConvDQN(n_actions).to(device)
target_net = ConvDQN(n_actions).to(device)
# Start the target network as an exact copy of the policy network and keep it
# in evaluation mode (no optimizer is attached to it).
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = torch.optim.Adam(policy_net.parameters(), lr=learning_rate)
replay_mem = ReplayMemory(100000)  # capacity, in transitions
# The training loop calls update_step only once the replay memory holds MORE
# than this many transitions.
min_samples_for_training = 5000

# Smooth L1 loss between the predicted and the expected Q-values
loss_fn = nn.SmoothL1Loss()

In [None]:
def choose_action_epsilon_greedy(net, state, epsilon):
    """Pick an action with an epsilon-greedy policy.

    With probability ``epsilon`` a random action DIFFERENT from the greedy one
    is returned, otherwise the greedy action (argmax of the network output).

    Args:
        net: network mapping a state to one value per action. It is switched
            to evaluation mode and left in that mode.
        state: input tensor for ``net``; it is moved to the global ``device``.
            Assumed to be a batch of ONE state - the argmax is taken over the
            whole output tensor.
        epsilon: exploration probability, in [0, 1].

    Returns:
        ``(action, net_out)``: the chosen action index (int) and the raw
        network output as a NumPy array.

    Raises:
        ValueError: if ``epsilon`` is outside [0, 1].
    """
    if epsilon > 1 or epsilon < 0:
        raise ValueError('The epsilon value must be between 0 and 1')

    # Evaluate the network output from the current state
    with torch.no_grad():
        net.eval()
        net_out = net(state.to(device))

    # Get the best action (argmax of the network output)
    best_action = int(net_out.argmax())
    # Get the number of possible actions
    action_space_dim = net_out.shape[-1]

    # Greedy by default; replaced below when exploring
    action = best_action
    if random.random() < epsilon:
        # List of non-optimal actions
        non_optimal_actions = [a for a in range(action_space_dim) if a != best_action]
        # With a single available action there is nothing to explore: keep the
        # greedy action instead of calling random.choice on an empty list.
        if non_optimal_actions:
            action = random.choice(non_optimal_actions)

    return action, net_out.cpu().numpy()

In [None]:
### Define exploration profile
# Epsilon decays exponentially from `max_value` with rate `exp_decay`, one
# value per training episode, and is never allowed to go below 0.01.
max_value = 1
num_iterations = 2000
exp_decay = 0.01
exploration_profile = [
    eps if eps >= 0.01 else 0.01  # set minimum exploring rate
    for eps in (max_value * np.exp(-exp_decay * i) for i in range(num_iterations))
]


### Plot exploration profile
# Epsilon as a function of the episode index
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(exploration_profile)
ax.grid()
ax.set_xlabel('Iteration', fontsize = 14)
ax.set_ylabel('Exploration profile (Epsilon)', fontsize = 14)
#plt.savefig("Exploration_vision.pdf", format='pdf')

In [None]:
def update_step(policy_net, target_net, buffer, gamma, batch_size, loss_fn, optimizer):
    """Run one optimisation step of the policy network on a replayed batch.

    The target for each sampled transition is
    ``reward + gamma * max_a Q_target(next_state, a)``, where the second term
    is zero for terminal transitions (``next_state is None``).

    Args:
        policy_net: network being trained; switched to training mode.
        target_net: network providing the next-state values; switched to
            evaluation mode and evaluated without gradients.
        buffer: replay memory exposing ``sample(batch_size)`` and returning
            ``(state, action, next_state, reward)`` tuples. It may return
            FEWER than ``batch_size`` transitions.
        gamma: discount factor.
        batch_size: number of transitions requested from ``buffer``.
        loss_fn: loss applied to (predicted, expected) Q-values.
        optimizer: optimizer holding the parameters of ``policy_net``.

    Returns:
        The loss tensor of this step.
    """
    # Sample a batch of (at most) batch_size transitions from the buffer
    batch = buffer.sample(batch_size)
    # Size every per-sample tensor on what was actually returned: the buffer
    # hands back fewer transitions when it holds less than batch_size.
    n_samples = len(batch)

    states = torch.cat([s[0] for s in batch])
    actions = torch.tensor([s[1] for s in batch], dtype=torch.int64, device=device)
    rewards = torch.tensor([s[3] for s in batch], dtype=torch.float32, device=device)

    # Compute a mask of non-final states (all the elements where the next state is not None)
    non_final_mask = torch.tensor([s[2] is not None for s in batch], dtype=torch.bool, device=device)
    non_final_next_states = [s[2] for s in batch if s[2] is not None]

    # Compute the action values from the batch states
    policy_net.train()
    q_values = policy_net(states.to(device))
    # Select the proper Q value for the corresponding action taken Q(s_t, a)
    state_action_values = q_values.gather(1, actions.unsqueeze(1))

    # Compute the value function of the next states using the target network
    # V(s_{t+1}) = max_a( Q_target(s_{t+1}, a) ). Terminal states keep the
    # value zero they are initialised with.
    target_net.eval()
    next_state_max_q_values = torch.zeros(n_samples, device=device)
    # Skip the target network when every sampled transition is terminal:
    # torch.cat cannot concatenate an empty list.
    if non_final_next_states:
        with torch.no_grad():
            q_values_target = target_net(torch.cat(non_final_next_states).to(device))
        next_state_max_q_values[non_final_mask] = q_values_target.max(dim=1)[0]

    # Compute the expected Q values
    expected_state_action_values = rewards + (next_state_max_q_values * gamma)
    expected_state_action_values = expected_state_action_values.unsqueeze(1) # Set the required tensor shape

    # Compute the loss
    loss = loss_fn(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Apply gradient clipping (rescale the gradients when their total norm exceeds 2, for training stability)
    nn.utils.clip_grad_norm_(policy_net.parameters(), 2)
    optimizer.step()
    return loss
    

### Training

In [None]:
# Training loop: one episode per value of the exploration profile. Training
# stops early once the mean score of the last 50 episodes has been >= 190 for
# `patience` consecutive episodes.
if not trained:
    rewards = []   # score (number of steps) of every finished episode
    mean_rew = []  # running mean of the score over the last 50 episodes
    patience = 10
    solved = False
    for episode_num, epsilon in enumerate(tqdm(exploration_profile)):

        state = agent.reset()
        # Reset the score. The final score will be the total amount of steps before the episode ends
        score = 0
        done = False

        # Go on until the environment reports the end of the episode
        while not done:

            # Choose the action following the policy
            action, _ = choose_action_epsilon_greedy(policy_net, state, epsilon)

            # Apply the action to perform a step
            next_state, reward, done = agent.step(action)

            # Update the final score (+1 for each step)
            score += 1

            # Apply penalty for bad state
            # NOTE(review): `done` may also be raised by the environment's time
            # limit, in which case the penalty hits a successful episode too - confirm.
            if done:
                reward += bad_state_penalty
                next_state = None  # marks the transition as terminal for update_step
                rewards.append(score)
                mean_rew.append(sum(rewards[-50:]) / len(rewards[-50:]))
                if mean_rew[-1] >= 190:
                    patience -= 1
                else:
                    patience = 10
            if patience == 0:
                print("The game has been solved")
                # `break` only leaves this inner loop; the flag lets the
                # episode loop below stop as well.
                solved = True
                break

            # Update the replay memory
            replay_mem.push(state, action, next_state, reward)

            # Update the network
            if len(replay_mem) > min_samples_for_training: # we enable the training only if we have enough samples in the replay memory, otherwise the training will use the same samples too often
                update_step(policy_net, target_net, replay_mem, gamma, batch_size, loss_fn, optimizer)
            # Visually render the environment (disable to speed up the training)
            #agent.render()

            # Set the current state for the next iteration
            state = next_state

        # Update the target network every target_net_update_steps episodes, and
        # once more when the game is solved so that the saved file holds the
        # weights of the policy that solved it.
        if solved or episode_num % target_net_update_steps == 0:
            print('Updating target network...')
            target_net.load_state_dict(policy_net.state_dict()) # This will copy the weights of the policy network to the target network
            torch.save(target_net.state_dict(), 'RL-DQN.torch')

        # Print the final score
        print(f"EPISODE: {episode_num + 1} - FINAL SCORE: {score} - Epsilon: {epsilon}")

        # Stop training instead of running the remaining episodes
        if solved:
            break
    # Plot the results: running mean in red, per-episode score in blue
    plt.figure(figsize=(12,8))
    plt.plot(mean_rew, 'r')
    plt.plot(rewards, 'b', alpha = 0.3)
    plt.ylabel('Score', fontsize=18)
    plt.xlabel('Episode', fontsize=18)
    plt.savefig("score.pdf", format='pdf')

### Test the environment

In [None]:
# Evaluate the saved policy: rebuild the agent and the network, load the
# weights written by the training loop and play greedily (epsilon = 0).
env_name = 'CartPole-v0'
agent = Agent(env_name)
n_actions = agent.n_actions
policy_net = ConvDQN(n_actions).to(device)
policy_net.load_state_dict(torch.load('RL-DQN.torch', map_location=device))

# Number of evaluation episodes (used by both the loop and the summary line)
num_test_episodes = 100
returns = []  # total reward collected in each episode
for num_episode in range(num_test_episodes):
    # Reset the environment and get the initial state
    state = agent.reset()
    # Reset the score. The final score is the sum of the rewards of the episode
    score = 0
    done = False
    # Go on until the environment reports the end of the episode
    while not done:
        # Choose the best action (epsilon 0)
        action, _ = choose_action_epsilon_greedy(policy_net, state, epsilon=0)
        # Apply the action and get the next state, the reward and a flag "done" that is True if the game is ended
        next_state, reward, done = agent.step(action)
        # Visually render the environment
        agent.render()
        # Accumulate the reward of this step
        score += reward
        # Set the current state for the next iteration
        state = next_state
    # Store and print the final score
    returns.append(score)
    print(f"EPISODE {num_episode + 1} - FINAL SCORE: {score}")
print(f"MEAN SCORE OVER {num_test_episodes} EPISODES IS: {np.mean(returns)}")