# DEEP REINFORCEMENT LEARNING - CARTPOLE
In this notebook I will explore the implementation of a deep network to solve a reinforcement learning task.




In [None]:
# NOTE(review): later cells call `env.seed`, `gym.wrappers.Monitor` and unpack four values from
# `env.step`; presumably this needs an older gym release - confirm and pin the version here.
!pip install gym



In [None]:
import random
import torch
import numpy as np
import gym
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from torch import nn
from collections import deque

In [None]:
# System and Python packages used to render the environment on a machine without a monitor.
!apt update
!apt-get install python-opengl -y
# xvfb is presumably the backend used by the pyvirtualdisplay `Display` created below - TODO confirm.
!apt install xvfb -y
!pip install pyvirtualdisplay
# NOTE(review): `piglet` and `pyglet` are different PyPI projects; presumably `pyglet` was intended - confirm before changing.
!pip install piglet

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:8 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Ign:12 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Pack

In [None]:
import glob
import io
import base64
import os
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from gym.wrappers import Monitor

Creation of the display (virtual monitor) to visualize the OpenAI gym environment.

In [None]:
# Start a non-visible 1400x900 virtual display so the gym environment can be rendered without a physical monitor.
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7fb0efd6eef0>

In [None]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_videos():
  """Display every MP4 file found in the ``video/`` folder inline in the notebook.

  Files are shown in sorted path order. Each video is embedded in the output
  as a base64 data URI inside an HTML ``<video>`` tag.
  """
  for mp4 in sorted(glob.glob('video/*.mp4')):
    print(f"\nSHOWING VIDEO {mp4}")
    # Read-only access is enough, and the context manager closes the handle
    # (the file used to be opened in 'r+b' mode and was never closed).
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    
def wrap_env(env, video_callable=None):
  """Return ``env`` wrapped in a gym ``Monitor`` that records into ``./video``.

  ``video_callable`` is forwarded to ``Monitor`` unchanged; ``force=True`` is
  always passed.
  """
  return Monitor(env, './video', force=True, video_callable=video_callable)

## Replay Memory
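Definition of the replay memory: a fixed-capacity queue that stores, at each time step, the transition (state, action, next_state, reward). Once the queue is full, the oldest transition is discarded to make room for the new one.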

In [None]:
class ReplayMemory(object):
    """Fixed-capacity buffer of (state, action, next_state, reward) transitions."""

    def __init__(self, capacity):
        # A bounded deque drops its oldest element when a new one is appended to a full buffer
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def push(self, state, action, next_state, reward):
        """Append one transition to the buffer."""
        transition = (state, action, next_state, reward)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return up to ``batch_size`` transitions drawn at random without replacement."""
        stored = len(self.memory)
        # Fall back to the whole buffer when fewer than batch_size transitions are stored
        n_draw = batch_size if batch_size < stored else stored
        return random.sample(self.memory, n_draw)

## Policies
Definition of the policies used for the evaluation of the current state and the generation of associated Q-values.

### $\epsilon$-Greedy Policy
We assume that the network benefits from strong exploration early in training, and that the gap between the expected and the observed reward shrinks as training proceeds. The exploration probability is therefore decayed exponentially:
$$\epsilon_t=e^{-\alpha t}$$
$$t:\ \text{episode index}$$
$$\alpha:\ \text{adjustable decay rate}$$

In [None]:
def choose_action_epsilon_greedy(net, state, epsilon):
    """Choose an action with an epsilon-greedy rule on the values predicted by ``net``.

    With probability ``epsilon`` a uniformly random action among the
    NON-optimal ones is returned; otherwise the action with the highest
    predicted value is returned.

    Args:
        net: network called on the state; its last output dimension has one entry per action.
        state: current observation, converted to a float32 tensor.
        epsilon: probability in [0, 1] of picking a non-optimal action.

    Returns:
        Tuple ``(action, net_out)`` with the chosen action index (int) and the
        network output as a NumPy array.

    Raises:
        ValueError: if ``epsilon`` is outside [0, 1].
    """
    if epsilon > 1 or epsilon < 0:
        raise ValueError('Epsilon value must be between 0 and 1')

    # Evaluate the network output from the current state (no gradient needed)
    with torch.no_grad():
        net.eval()
        state = torch.tensor(state, dtype=torch.float32)
        net_out = net(state)

    best_action = int(net_out.argmax())
    # Get the number of possible actions
    action_space_dim = net_out.shape[-1]

    action = best_action
    # Select a non optimal action with probability epsilon, otherwise keep the best action
    if random.random() < epsilon:
        non_optimal_actions = [a for a in range(action_space_dim) if a != best_action]
        # With a single available action there is nothing to explore: keep the best action
        if non_optimal_actions:
            action = random.choice(non_optimal_actions)

    return action, net_out.numpy()

### Softmax Policy
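Implementation of the softmax policy. Each action $a$ is sampled with probability $p(a)=\frac{e^{Q(s,a)/T}}{\sum_{a'}e^{Q(s,a')/T}}$, where $T$ is the temperature: a high $T$ gives almost uniform exploration, while $T=0$ always selects the best action.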



In [None]:
def choose_action_softmax(net, state, temperature):
    """Sample an action from the softmax of the network output divided by ``temperature``.

    Args:
        net: network called on the state; its last output dimension has one entry per action.
        state: current observation (a single state), converted to a float32 tensor.
        temperature: non-negative temperature. ``0`` selects the best action.

    Returns:
        Tuple ``(action, net_out)`` with the chosen action index (int) and the
        network output as a NumPy array.

    Raises:
        ValueError: if ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError('The temperature value must be greater than or equal to 0 ')

    # If the temperature is 0, just select the best action using the eps-greedy policy with epsilon = 0
    if temperature == 0:
        return choose_action_epsilon_greedy(net, state, 0)

    # Evaluate the network output from the current state
    with torch.no_grad():
        net.eval()
        state = torch.tensor(state, dtype=torch.float32)
        net_out = net(state)

    # Apply softmax with temperature over the action dimension (the same dimension
    # the number of actions is read from below)
    temperature = max(temperature, 1e-8) # set a minimum to the temperature for numerical stability
    softmax_out = nn.functional.softmax(net_out / temperature, dim=-1).numpy()

    # Sample the action using the softmax output as probability mass function
    all_possible_actions = np.arange(0, softmax_out.shape[-1])
    action = np.random.choice(all_possible_actions, p=softmax_out)

    # Return a plain int, as the temperature == 0 branch does
    return int(action), net_out.numpy()

### Exploration Profiles
Definition of the per-episode exploration schedules: the list of probabilities $\epsilon$ used by the $\epsilon$-greedy policy and the list of temperatures used by the softmax policy.

In [None]:
n_iter = 1000          # Number of entries in each exploration profile
ini_temperature = 5    # First value of the softmax temperature profile

def explr_pr(alpha, dil):
  """Build the two exploration profiles, each with ``n_iter`` entries.

  Args:
    alpha: decay rate of the epsilon profile, ``exp(-alpha * i)``.
    dil: dilation factor of the temperature profile.

  Returns:
    Tuple ``(explor_prof_greedy, explor_prof_soft)`` of two lists: the epsilon
    values and the softmax temperatures, indexed by ``i`` in ``range(n_iter)``.
  """
  epsilons = []
  temperatures = []
  for step in range(n_iter):
    epsilons.append(np.exp(-alpha * step))
    # Temperature: ini_temperature * (2 ** (-ini_temperature * step / n_iter)) ** (step * dil / n_iter)
    base = 2 ** ((-ini_temperature * step) / n_iter)
    exponent = step * dil / n_iter
    temperatures.append(ini_temperature * (base ** exponent))

  return epsilons, temperatures

## Network Definition
Initialization of the network that generates actions for any given state.


In [None]:
class DQN(nn.Module):
    """Fully connected network: state -> 128 -> 128 -> one output per action, with Tanh activations."""

    def __init__(self, state_space_dim, action_space_dim):
        super().__init__()

        hidden_units = 128
        # The attribute name `linear` is part of the state_dict keys, so it must not change
        layers = [
            nn.Linear(state_space_dim, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, action_space_dim),
        ]
        self.linear = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input ``x``."""
        out = self.linear(x)
        return out

## Gym Environment

In [None]:
# Create a seeded CartPole environment and read the sizes of its action and observation spaces
env = gym.make('CartPole-v1')
env.seed(0)

action_space_dim = env.action_space.n
state_space_dim = env.observation_space.shape[0]

print(f"STATE SPACE SIZE: {state_space_dim}\nACTION SPACE SIZE: {action_space_dim}")

STATE SPACE SIZE: 4
ACTION SPACE SIZE: 2


## Network


### Initialization

In [None]:
# Seed the three random number generators used by the notebook
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# PARAMETERS
# Discount factor applied to the value of the next state
gamma = 0.97
# Maximum number of transitions kept in the replay memory
replay_memory_capacity = 10000
# Optimizer learning rate
lr = 1e-2
# Number of episodes to wait before updating the target network
target_net_update_steps = 10
# Number of samples to take from the replay memory for each update
batch_size = 128
# Penalty to the reward when we are in a bad state (in this case when the pole falls down)
bad_state_penalty = 1
# Minimum samples in the replay memory to enable the training
min_samples_for_training = 1000

# Score of every training episode, appended to by the training loop
scores = []

In [None]:
def initialization(replay_memory_capacity, state_space_dim, action_space_dim, lr):
  """Create the objects needed for one training run.

  Args:
    replay_memory_capacity: capacity passed to ``ReplayMemory``.
    state_space_dim: input size of the two networks.
    action_space_dim: output size of the two networks.
    lr: learning rate of the SGD optimizer.

  Returns:
    Tuple ``(replay_mem, policy_net, target_net, optimizer)``. The target
    network starts with a copy of the policy network weights, and the
    optimizer is built on the policy network parameters only.
  """
  # Policy network first, then the target network that mirrors its weights
  policy_net = DQN(state_space_dim, action_space_dim)
  target_net = DQN(state_space_dim, action_space_dim)
  target_net.load_state_dict(policy_net.state_dict())

  # Only the policy network is trained directly
  optimizer = torch.optim.SGD(policy_net.parameters(), lr=lr)

  replay_mem = ReplayMemory(replay_memory_capacity)

  return replay_mem, policy_net, target_net, optimizer

### Initialize the loss function (Huber loss), created once and passed to every call of update_step
loss_fn = nn.SmoothL1Loss()

### Update rule

In [None]:
def update_step(policy_net, target_net, replay_mem, gamma, optimizer, loss_fn, batch_size):
    """Run one optimisation step of the policy network on a batch from the replay memory.

    Args:
        policy_net: network being trained; its output is indexed by action.
        target_net: network used (without gradient) to evaluate the next states.
        replay_mem: object whose ``sample(batch_size)`` returns a list of
            ``(state, action, next_state, reward)`` tuples; ``next_state`` is
            ``None`` for transitions that ended an episode.
        gamma: discount factor applied to the value of the next state.
        optimizer: optimizer holding the policy network parameters.
        loss_fn: loss called as ``loss_fn(predicted, expected)``.
        batch_size: maximum number of transitions to sample.
    """
    # Sample the data from the replay memory
    batch = replay_mem.sample(batch_size)
    batch_size = len(batch)
    if batch_size == 0:
        return  # Nothing to learn from

    # Create tensors for each element of the batch (stacking with NumPy first avoids the slow
    # element-by-element conversion of a list of arrays)
    states  = torch.tensor(np.array([t[0] for t in batch]), dtype=torch.float32)
    actions = torch.tensor([t[1] for t in batch], dtype=torch.int64)
    rewards = torch.tensor([t[3] for t in batch], dtype=torch.float32)

    # Mask of non-final states (all the elements where the next state is not None)
    non_final_mask = torch.tensor([t[2] is not None for t in batch], dtype=torch.bool)

    # Compute all the Q values (forward pass)
    policy_net.train()
    q_values = policy_net(states)
    # Select the proper Q value for the corresponding action taken Q(s_t, a)
    state_action_values = q_values.gather(1, actions.unsqueeze(1))

    # Compute the value function of the next states using the target network
    # V(s_{t+1}) = max_a( Q_target(s_{t+1}, a) ); final states keep a value of 0
    next_state_max_q_values = torch.zeros(batch_size)
    target_net.eval()
    if non_final_mask.any():  # Skip the target network when every sampled transition is final
        non_final_next_states = torch.tensor(
            np.array([t[2] for t in batch if t[2] is not None]), dtype=torch.float32)
        with torch.no_grad():
            q_values_target = target_net(non_final_next_states)
        next_state_max_q_values[non_final_mask] = q_values_target.max(dim=1)[0]

    # Compute the expected Q values
    expected_state_action_values = rewards + (next_state_max_q_values * gamma)
    expected_state_action_values = expected_state_action_values.unsqueeze(1) # Same shape as state_action_values

    # Compute the loss
    loss = loss_fn(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Apply gradient clipping (rescale the gradients so that their total norm is at most 2, for training stability)
    nn.utils.clip_grad_norm_(policy_net.parameters(), 2)
    optimizer.step()

### Training Loop
Training loop. It supports both the softmax policy and the $\epsilon$-greedy policy (selected with the variable `pol`) and repeats the whole training once for each pair of exploration-profile parameters.

In [None]:
# Initialize the Gym environment
env = gym.make('CartPole-v1')
env.seed(0) # Set a random seed for the environment (reproducible results)

# This is for creating the output video in Colab, not required outside Colab
env = wrap_env(env, video_callable=lambda episode_id: episode_id % 100 == 0) # Save a video every 100 episodes

Models = [] # One summary dictionary per trained configuration
# Decay rates of the epsilon profile (alpha) and dilation factors of the temperature profile (dil), one pair per run
alpha = [0.005, 0.01, 0.02, 0.05, 0.1]
dil = [25, 50, 75, 100, 125]
pol = "soft" # Exploration policy: "soft" (softmax) or "greedy" (epsilon-greedy)
if pol not in ("soft", "greedy"):
    raise ValueError(f"Unknown policy '{pol}': expected 'soft' or 'greedy'")

pos_weight = 1           # Weight of the penalty applied when the cart is far from the center
early_stop_score = 450   # A run stops early when the mean of the latest scores exceeds this value
early_stop_window = 5    # Number of latest scores averaged for the early stop
early_stop_warmup = 50   # The early stop is evaluated only after this episode index

for j in range(min(len(alpha), len(dil))):
    explor_prof_greedy, explor_prof_soft = explr_pr(alpha[j], dil[j])
    exploration_profile = explor_prof_soft if pol == "soft" else explor_prof_greedy
    last_episode = len(exploration_profile) - 1
    replay_mem, policy_net, target_net, optimizer = initialization(replay_memory_capacity, state_space_dim, action_space_dim, lr)

    for episode_num, tau in enumerate(tqdm(exploration_profile)):

        # Reset the environment and get the initial state
        state = env.reset()
        # Reset the score. The final score will be the total amount of steps before the episode ends
        score = 0
        done = False

        # Go on until the episode ends
        while not done:

            # Choose the action following the policy
            if pol == "soft":
                action, q_values = choose_action_softmax(policy_net, state, temperature=tau)
            else:
                action, q_values = choose_action_epsilon_greedy(policy_net, state, epsilon=tau)

            # Apply the action and get the next state, the reward and a flag "done" that is True if the game is ended
            next_state, reward, done, info = env.step(action)

            # Apply a penalty that grows with the fourth power of the cart distance from the center
            reward = reward - pos_weight * (next_state[0]**4)

            # Update the final score (+1 for each step)
            score += 1

            # Apply penalty for bad state
            if done:
                # The penalty must LOWER the reward (it used to be added, which rewarded the end of the episode).
                # NOTE(review): `done` is presumably also True when the environment's step limit is reached;
                # confirm whether those transitions should be penalised as well.
                reward -= bad_state_penalty
                next_state = None

            # Update the replay memory
            replay_mem.push(state, action, next_state, reward)

            # Update the network
            if len(replay_mem) > min_samples_for_training: # we enable the training only if we have enough samples in the replay memory, otherwise the training will use the same samples too often
                update_step(policy_net, target_net, replay_mem, gamma, optimizer, loss_fn, batch_size)

            # Visually render the environment (disable to speed up the training)
            env.render()

            # Set the current state for the next iteration
            state = next_state

        scores.append(score)
        # Update the target network every target_net_update_steps episodes
        if episode_num % target_net_update_steps == 0:
            print('Updating target network...')
            target_net.load_state_dict(policy_net.state_dict()) # This will copy the weights of the policy network to the target network

        # Print the final score
        if pol == "soft":
            print(f"EPISODE: {episode_num + 1} - FINAL SCORE: {score} - Temperature: {tau}")
        else:
            print(f"EPISODE: {episode_num + 1} - FINAL SCORE: {score} - Epsilon: {tau}")

        # Early Stop Condition: mean of the latest scores, evaluated only after the warm-up episodes
        early_stop = 0
        if episode_num > early_stop_warmup:
            early_stop = np.mean(scores[-early_stop_window:])
        solved = early_stop > early_stop_score

        # Save the model at the end of the run, whether it was stopped early or used the whole profile
        if solved or episode_num == last_episode:
            model = {"policy": pol, "episode": episode_num, "alpha": alpha[j], "dil": dil[j]}
            Models.append(model)
            name = "Policy_param_" + pol + "_" + str(j) + ".pth"
            torch.save(policy_net.state_dict(), name)
        if solved:
            break

    print("---------------------------------------------------------")
    print()
    print("---------------------------------------------------------")

env.close()

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Updating target network...
EPISODE: 1 - FINAL SCORE: 12 - Temperature: 5.0
EPISODE: 2 - FINAL SCORE: 17 - Temperature: 4.999566801779304
EPISODE: 3 - FINAL SCORE: 41 - Temperature: 4.998267432297046
EPISODE: 4 - FINAL SCORE: 19 - Temperature: 4.99610256689765
EPISODE: 5 - FINAL SCORE: 24 - Temperature: 4.993073330505144
EPISODE: 6 - FINAL SCORE: 16 - Temperature: 4.98918129664899
EPISODE: 7 - FINAL SCORE: 30 - Temperature: 4.984428486101879
EPISODE: 8 - FINAL SCORE: 29 - Temperature: 4.978817365131169
EPISODE: 9 - FINAL SCORE: 15 - Temperature: 4.972350843366072
EPISODE: 10 - FINAL SCORE: 21 - Temperature: 4.965032271283169
Updating target network...
EPISODE: 11 - FINAL SCORE: 31 - Temperature: 4.9568654373133105
EPISODE: 12 - FINAL SCORE: 59 - Temperature: 4.947854564573375
EPISODE: 13 - FINAL SCORE: 29 - Temperature: 4.938004307226863
EPISODE: 14 - FINAL SCORE: 26 - Temperature: 4.927319746477684
EPISODE: 15 - FINAL SCORE: 18 - Temperature: 4.9158063862019965
EPISODE: 16 - FINAL SCOR

## Testing

In [None]:
# Initialize the Gym environment
env = gym.make('CartPole-v1')
env.seed(1) # Set a random seed for the environment (reproducible results)

# This is for creating the output video in Colab, not required outside Colab
env = wrap_env(env, video_callable=lambda episode_id: True) # Save a video every episode
mean_scores = []       # Mean test score of each saved model
n_test_episodes = 10   # Number of test episodes played by each model

# One weight file was saved for every entry of Models: test each of them
for j in range(len(Models)):
    _, policy_net, _, _ = initialization(replay_memory_capacity, state_space_dim, action_space_dim, lr)
    name = "Policy_param_" + pol + "_" + str(j) + ".pth"
    policy_net.load_state_dict(torch.load(name))
    score_m = []
    for num_episode in range(n_test_episodes):
        # Reset the environment and get the initial state
        state = env.reset()
        # Reset the score. The final score is the sum of the rewards returned by the environment
        score = 0
        done = False
        # Go on until the environment reports the end of the episode
        while not done:
            # Choose the best action (temperature 0)
            action, q_values = choose_action_softmax(policy_net, state, temperature=0)
            # Apply the action and get the next state, the reward and a flag "done" that is True if the game is ended
            next_state, reward, done, info = env.step(action)
            # Visually render the environment
            env.render()
            # Accumulate the reward returned by the environment
            score += reward
            # Set the current state for the next iteration
            state = next_state
        # Print the final score
        score_m.append(score)
        print(f"EPISODE {num_episode + 1} - FINAL SCORE: {score}")
    mean_scores.append(np.mean(score_m))
    print("----------------------------------------------------------")
    print()
    print("----------------------------------------------------------")
env.close()

# Summary: parameters, mean test score and exploration profile of each model
for j in range(len(Models)):
    print(Models[j])
    print("FINAL MEAN SCORE: " + str(mean_scores[j]))
    explor_prof_greedy, explor_prof_soft = explr_pr(alpha[j], dil[j])

    plt.figure(figsize=(12,8))
    if pol == "greedy":
        plt.plot(explor_prof_greedy)
    elif pol == "soft":
        plt.plot(explor_prof_soft)
    plt.grid()
    plt.xlabel("Iterations")
    plt.ylabel("Exploration Profiles")
    plt.show()

    print("----------------------------------------------------------")
    print()
    print("----------------------------------------------------------")

In [None]:
# Embed the videos recorded in the video/ folder in the notebook output (not required outside Colab)
show_videos()