<a href="https://colab.research.google.com/github/rahul-jha98/Reinforcement-Learning/blob/master/3.%20Policy%20Gradients%20Learning/Policy%20Gradients%20%20with%20Cartpole/Policy_Learning_with_Cartpole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Step 0: Preparing Colab for rendering the environment and installing GYM 🏗️

In [None]:
!pip install pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [None]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

In [None]:
# Start a virtual display (see the "Step 0" heading: this prepares Colab for rendering the environment).
# NOTE(review): the variable name `display` shadows IPython's built-in display() helper for the
# rest of the notebook; later cells work around this via `ipythondisplay.display`.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f54789d1e10>

## Step 1: Importing the libraries 📚

In [None]:
## Importing GYM to make our environment
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

import tensorflow.compat.v1 as tf      # Tensorflow compat for v1 functionalities
import tensorflow as tf2               # Importing tf2 for some intitalizers
import numpy as np                     # Handle matrices


import random                       # Handling random number generation

from skimage import transform       # Help us to preprocess the frames
from skimage.color import rgb2gray  # Help us to gray our frames

import matplotlib.pyplot as plt      # Display graphs

from collections import deque       # Ordered collection with ends

import warnings # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore')

In [None]:
## Disable eager execution to make it compatible with the tf session module.
## The network below is built from tf.placeholder ops and executed through tf.Session.
tf.disable_eager_execution()

## Step 2: Creating the environment 🎮

In [None]:
# Build the CartPole environment and take the raw environment out of its gym wrappers.
env = gym.make('CartPole-v0')
# NOTE(review): .unwrapped presumably also drops the episode step limit, so an episode only ends
# when the environment itself reports `done` — confirm that this is intended.
env = env.unwrapped

## Due to high variance in Policy Gradient, we set a seed for reproducibility.
# Seeding the environment alone is not enough: actions are sampled with np.random.choice and the
# network weights are randomly initialised, so NumPy and TensorFlow are seeded as well.
SEED = 1
np.random.seed(SEED)
tf.set_random_seed(SEED)  # graph-level seed; has to be set before the network ops are created
env.seed(SEED)

[1]

## Step 3: Set up our hyperparameters ⚗️

In [None]:
## ENVIRONMENT Hyperparameters
# Read the sizes from the space definitions instead of drawing a random observation just to
# measure its length.
state_size = env.observation_space.shape[0]   # length of one observation vector
action_size = env.action_space.n              # number of discrete actions

## TRAINING Hyperparameters
max_episodes = 1000     # number of episodes played during training
learning_rate = 0.005   # step size handed to the Adam optimizer
gamma = 0.95            # Discount rate

## Step 4 : Define the preprocessing functions ⚙️
This function takes <b>the rewards of one episode, discounts them and normalizes the result.</b>

In [None]:
def discount_and_normalize_rewards(episode_rewards, discount_rate=None):
    """Turn the per-step rewards of one episode into normalized discounted returns.

    The return of step t is ``r_t + d * r_(t+1) + d**2 * r_(t+2) + ...``. The returns are then
    shifted to zero mean and scaled to unit standard deviation.

    Args:
        episode_rewards: sequence of numeric rewards, one per time step.
        discount_rate: discount factor ``d``. When omitted, the notebook-level
            hyperparameter ``gamma`` is used.

    Returns:
        A float64 NumPy array with one normalized return per time step. An empty input gives
        an empty array. If all discounted returns are equal (e.g. a single-step episode) the
        result is all zeros, because the standard deviation is zero and dividing by it would
        yield NaN/inf.
    """
    if discount_rate is None:
        discount_rate = gamma

    # Force a float array: np.zeros_like on integer rewards would create an integer buffer and
    # silently truncate every discounted return.
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    discounted_episode_rewards = np.zeros_like(rewards)

    # Walk backwards so every return is built from the return of the following step.
    cumulative = 0.0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * discount_rate + rewards[i]
        discounted_episode_rewards[i] = cumulative

    if discounted_episode_rewards.size == 0:
        return discounted_episode_rewards

    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)

    # A (numerically) zero spread cannot be normalized; all values equal the mean.
    tiny = np.finfo(np.float64).eps * max(1.0, np.abs(discounted_episode_rewards).max())
    if std <= tiny:
        return np.zeros_like(discounted_episode_rewards)

    return (discounted_episode_rewards - mean) / std

## Step 5: Create our Policy Gradient Neural Network model 🧠

<img src="https://raw.githubusercontent.com/simoninithomas/Deep_reinforcement_learning_Course/master/Policy%20Gradients/Cartpole/assets/catpole.png">

The idea is simple:
- Our state which is an array of 4 values will be used as an input.
- Our NN is 3 fully connected layers.
- Our output activation function is softmax, which squashes the outputs into a probability distribution that sums to 1 (for instance, if we have (4, 2, 6) --> softmax --> approximately (0.117, 0.016, 0.867)).

In [None]:
# Policy network graph: maps a batch of states to a probability distribution over the actions
# and defines the policy-gradient loss together with its Adam training op.
# NOTE(review): because of the indentation, the fc1/fc2/fc3/softmax/loss/train scopes are all
# nested inside the "inputs" name scope — confirm that this nesting is intended.
with tf.name_scope("inputs"):
    # Batch of environment states, one row per time step.
    input_ = tf.placeholder(tf.float32, [None, state_size], name="input_")
    # One-hot encoded actions that were taken, one row per time step.
    actions = tf.placeholder(tf.int32, [None, action_size], name="actions")
    # Discounted and normalized return of every time step.
    discounted_episode_rewards_ = tf.placeholder(tf.float32, [None,], name="discounted_episode_rewards")
    
    # Add this placeholder for having this variable in tensorboard
    mean_reward_ = tf.placeholder(tf.float32 , name="mean_reward")

    # First hidden layer: 10 ReLU units.
    with tf.name_scope("fc1"):
        fc1 = tf.layers.Dense(units = 10,
                              activation=tf.nn.relu,
                              kernel_initializer=tf2.keras.initializers.GlorotUniform(),
                              bias_initializer=tf2.keras.initializers.GlorotUniform())(input_)

    # Second hidden layer: action_size ReLU units.
    with tf.name_scope("fc2"):
        fc2 = tf.layers.Dense(units = action_size,
                              activation= tf.nn.relu,
                              kernel_initializer=tf2.keras.initializers.GlorotUniform(),
                              bias_initializer=tf2.keras.initializers.GlorotUniform())(fc1)

    # Output layer: one raw logit per action (no activation here, softmax is applied below).
    with tf.name_scope("fc3"):
        fc3 = tf.layers.Dense(units = action_size,
                              activation= None,
                              kernel_initializer=tf2.keras.initializers.GlorotUniform(),
                              bias_initializer=tf2.keras.initializers.GlorotUniform())(fc2)

    # Action probabilities used for sampling actions while playing.
    with tf.name_scope("softmax"):
        action_distribution = tf.nn.softmax(fc3)

    with tf.name_scope("loss"):
        # tf.nn.softmax_cross_entropy_with_logits computes the cross entropy of the result after applying the softmax function
        # If you have single-class labels, where an object can only belong to one class, you might now consider using 
        # tf.nn.sparse_softmax_cross_entropy_with_logits so that you don't have to convert your labels to a dense one-hot array. 
        # With one-hot labels the cross entropy is the negative log-probability of the action taken;
        # it is weighted by the return of that step and averaged over the episode.
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = fc3, labels = actions)
        loss = tf.reduce_mean(neg_log_prob * discounted_episode_rewards_) 
        
    
    # One Adam step on the loss above, using the notebook-level learning_rate.
    with tf.name_scope("train"):
        train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)  

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


## Step 6: Set up Tensorboard 📊

In [None]:
# TensorBoard event files are written through this writer.
writer = tf.summary.FileWriter("/tensorboard/pg/1")

# Register one scalar summary per tracked quantity: the training loss and the mean reward.
for summary_tag, summary_tensor in (("Loss", loss), ("Reward_mean", mean_reward_)):
    tf.summary.scalar(summary_tag, summary_tensor)

# Single op that evaluates all summaries registered so far.
write_op = tf.summary.merge_all()

## Step 7: Train our Agent 🏃‍♂️

    Algo: 

      Create the NN
      maxReward = 0 # Keep track of maximum reward
      For episode in range(max_episodes):
          episode + 1
          reset environment
          reset stores (states, actions, rewards)
          
          For each step:
              Choose action a
              Perform action a
              Store s, a, r
              If done:
                  Calculate sum reward
                  Calculate the discounted return Gt
                  Optimize

In [None]:
# Train the policy: play one full episode with the current policy, then perform a single
# optimisation step on the (state, action, return) data collected during that episode.
allRewards = []              # total (undiscounted) reward of every finished episode
total_rewards = 0            # sum of all episode rewards so far
maximumRewardRecorded = 0    # best single-episode reward so far

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for episode in range(max_episodes):

        # Every episode starts with empty transition stores
        episode_states, episode_actions, episode_rewards = [], [], []
        episode_rewards_sum = 0

        # Launch the game
        state = env.reset()

        env.render()

        while True:

            # Choose action a: the network outputs PROBABILITIES, so the action is sampled from
            # them rather than picked deterministically.
            # The batch shape uses state_size instead of a hard-coded 4.
            action_probability_distribution = sess.run(action_distribution,
                                                       feed_dict={input_: state.reshape([1, state_size])})

            action = np.random.choice(range(action_probability_distribution.shape[1]),
                                      p=action_probability_distribution.ravel())

            # Perform a
            new_state, reward, done, info = env.step(action)

            # Store s, a, r
            episode_states.append(state)

            # The network is trained on one-hot actions, e.g. [0., 1.], not on the bare action index
            action_ = np.zeros(action_size)
            action_[action] = 1
            episode_actions.append(action_)

            episode_rewards.append(reward)

            if done:
                # Episode statistics
                episode_rewards_sum = np.sum(episode_rewards)
                allRewards.append(episode_rewards_sum)
                total_rewards = np.sum(allRewards)
                mean_reward = np.divide(total_rewards, episode+1)
                maximumRewardRecorded = np.amax(allRewards)

                print("==========================================")
                print("Episode: ", episode)
                print("Reward: ", episode_rewards_sum)
                print("Mean Reward", mean_reward)
                print("Max reward so far: ", maximumRewardRecorded)

                # Calculate discounted reward
                discounted_episode_rewards = discount_and_normalize_rewards(episode_rewards)

                # Feedforward, gradient and backpropagation. The TensorBoard summaries are fetched
                # in the same run, so the episode is pushed through the network only once; the
                # logged loss is therefore the loss that this update step minimised.
                episode_feed = {input_: np.vstack(np.array(episode_states)),
                                actions: np.vstack(np.array(episode_actions)),
                                discounted_episode_rewards_: discounted_episode_rewards,
                                mean_reward_: mean_reward}
                loss_, _, summary = sess.run([loss, train_opt, write_op], feed_dict=episode_feed)

                # Write TF Summaries
                writer.add_summary(summary, episode)
                writer.flush()

                break

            state = new_state

        # Save Model every 100 episodes AND after the last one. Without the second condition the
        # newest checkpoint would come from episode 900 and the final policy would be lost.
        if episode % 100 == 0 or episode == max_episodes - 1:
            saver.save(sess, "./models/model.ckpt")
            print("Model saved")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode:  2
Reward:  58.0
Mean Reward 28.666666666666668
Max reward so far:  58.0
Episode:  3
Reward:  17.0
Mean Reward 25.75
Max reward so far:  58.0
Episode:  4
Reward:  15.0
Mean Reward 23.6
Max reward so far:  58.0
Episode:  5
Reward:  13.0
Mean Reward 21.833333333333332
Max reward so far:  58.0
Episode:  6
Reward:  18.0
Mean Reward 21.285714285714285
Max reward so far:  58.0
Episode:  7
Reward:  16.0
Mean Reward 20.625
Max reward so far:  58.0
Episode:  8
Reward:  22.0
Mean Reward 20.77777777777778
Max reward so far:  58.0
Episode:  9
Reward:  31.0
Mean Reward 21.8
Max reward so far:  58.0
Episode:  10
Reward:  52.0
Mean Reward 24.545454545454547
Max reward so far:  58.0
Episode:  11
Reward:  9.0
Mean Reward 23.25
Max reward so far:  58.0
Episode:  12
Reward:  29.0
Mean Reward 23.692307692307693
Max reward so far:  58.0
Episode:  13
Reward:  16.0
Mean Reward 23.142857142857142
Max reward so far:  58.0
Episode:  14
Re

## Step 8: Watch our Agent try to balance a rod 👀
Now that we trained our agent, we can test it

In [None]:
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay



def show_video(video_dir='video'):
  """Embed the first recorded .mp4 of `video_dir` in the notebook output.

  The file is read, base64-encoded and placed into an inline HTML <video> element.
  If the directory contains no .mp4 file, "Could not find video" is printed instead.

  Args:
    video_dir: directory that is searched for ``*.mp4`` files (default: 'video').
  """
  # Sort the matches so the same file is chosen on every call; glob order is not guaranteed.
  mp4list = sorted(glob.glob('{}/*.mp4'.format(video_dir)))
  if not mp4list:
    print("Could not find video")
    return

  # Read-only access is sufficient, and the context manager closes the file handle again.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a gym Monitor whose output directory is ./video.

  force=True is handed to Monitor; presumably this lets it overwrite earlier
  recordings in that directory — TODO confirm against the gym version in use.
  """
  return Monitor(env, './video', force=True)

In [None]:
# Separate CartPole instance for playback, wrapped by wrap_env (gym Monitor writing to ./video).
playing_env = wrap_env(gym.make('CartPole-v0'))

In [None]:
# Play one episode with the trained policy in the recording environment and print its score.
saver = tf.train.Saver()
with tf.Session() as sess:
    # Load the model
    saver.restore(sess, "./models/model.ckpt")

    state = playing_env.reset()
    # Count the score of THIS episode only. The previous version kept adding to `total_rewards`,
    # which still held the sum of all training rewards, so the printed score was inflated.
    episode_score = 0
    done = False

    while not done:
        # Choose action a: the network outputs PROBABILITIES, so the action is sampled from them.
        # The batch shape uses state_size instead of a hard-coded 4.
        action_probability_distribution = sess.run(action_distribution,
                                                   feed_dict={input_: state.reshape([1, state_size])})
        action = np.random.choice(range(action_probability_distribution.shape[1]),
                                  p=action_probability_distribution.ravel())

        state, reward, done, info = playing_env.step(action)
        episode_score += reward

    print ("Score", episode_score)

    playing_env.close()

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
Score 119488.0


In [None]:
# Embed the first .mp4 found in the video/ directory in the cell output.
show_video()