## Step 1 : Import the libraries

In [1]:
import tensorflow as tf
import numpy as np
import gym

## Step 2 : Create our environment

In [2]:
env = gym.make('CartPole-v0')
env = env.unwrapped
# Policy gradient has high variance, seed for reproducibility
env.seed(1)

[1]

## Step 3 : Setting up hyperparameters

In [3]:
## Environment Hyperparameters
state_size = 4
action_size = env.action_space.n

## Training Hyperparameters
max_episodes = 300
learning_rate = 0.01
gamma = 0.95 # Discount Rate

## Step 4 : Define the preprocessing functions

In [4]:
def discount_and_normalize_rewards(episode_rewards) :
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    cumulative = 0.0
    
    for i in reversed(range(len(episode_rewards))) :
        cumulative = cumulative * gamma + episode_rewards[i]
        discounted_episode_rewards[i] = cumulative
        
    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    discounted_episode_rewards = (discounted_episode_rewards - mean) / (std)
    
    return discounted_episode_rewards

## Step 5 : Create our Policy Gradient Neural Network

The idea is simple :

* Our state which is an array of 4 values will be used as an input
* Our NN is 3 fully connected layers
* Our outpus activation function is softmax that squashes the outputs to a probability distribution (for instance if we have 4,2,6 --> softmax --> (0.4,0.2,0.6))

In [10]:
with tf.name_scope("inputs") :
    input_ = tf.placeholder(tf.float32,[None,state_size],name="input_")
    actions = tf.placeholder(tf.int32,[None,action_size],name="actions")
    discounted_episode_rewards_ = tf.placeholder(tf.float32,[None],name="discounted_episode_rewards")
    
    # Add this placeholder for having this variable in tensorboard
    mean_reward_ = tf.placeholder(tf.float32,name="mean_reward")
    
    with tf.name_scope("fc1") : 
        fc1 = tf.contrib.layers.fully_connected(inputs=input_,
                                                num_outputs=10,
                                                activation_fn=tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())
    with tf.name_scope("fc2") :
        fc2 = tf.contrib.layers.fully_connected(inputs=fc1,
                                                num_outputs=action_size,
                                                activation_fn=tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())
    with tf.name_scope("fc3") :
        fc3 = tf.contrib.layers.fully_connected(inputs=fc2,
                                                num_outputs=action_size,
                                                activation_fn=None,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())
        
    with tf.name_scope("softmax") :
        action_distribution = tf.nn.softmax(fc3)
        
    with tf.name_scope("loss") :
        # tf.nn.softmax_cross_entropy_with_logits computes the cross entropy of the result after applying the softmax function
        # If you have single-class labels, where an object can only belong to one class, you might can consider using
        # tf.nn.sparse_softmax_cross_entropy_with_logits so that you don't have to convert your labels to a dense one-hot array.
        
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=fc3,labels=actions)
        loss = tf.reduce_mean(neg_log_prob*discounted_episode_rewards_)
        
    with tf.name_scope("train") :
        train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

## Step 6 : Setting up Tensorboard

In [11]:
# Setup TensorBoard Writer
writer = tf.summary.FileWriter("/tensorboard/pg/1")

## Losses
tf.summary.scalar("Loss",loss)

## Reward mean
tf.summary.scalar("Reward_mean",mean_reward_)

write_op = tf.summary.merge_all()

## Step 7 : Train our Agent

Create the NN maxReward = 0 # Keep track of maximum reward For episode in range(max_episodes) : episode + 1 reset environmet, reset stores (states,actions,rewards) For each step : Choose action a Perform action a Store s, a, r If done : Calculate Sum reward Calculate gamma Gt Optimize

In [12]:
allRewards = []
total_rewards = 0
maximumRewardRecorded = 0
episode = 0
episode_states, episode_actions, episode_rewards = [], [], []


saver = tf.train.Saver()
#tf.reset_default_graph()
with tf.Session() as sess :
    
    
    sess.run(tf.global_variables_initializer())
    
    for episode in range(max_episodes) :
        
        episode_rewards_sum = 0
        
        # Launch the game
        state = env.reset()
        
        while True :
            
            # Choose action a, remember WE'RE NOT IN A DETERMINISTIC ENVIRONMENT, WE'RE OUTPUT PROBABILITIES
            action_probability_distribution = sess.run(action_distribution,feed_dict={input_:state.reshape([1,4])})
            action = np.random.choice(range(action_probability_distribution.shape[1]),p=action_probability_distribution.ravel()) # Select action w.r.t the actions prob
            
            # Perform a
            new_state, reward, done, info = env.step(action)
            
            # Store s, a, r
            episode_states.append(state)
            
            # For actions because we output only one (the index) we need 2 (1 is for the action taken)
            # We need [0., 1.] (if we take right) not just the index
            action_ = np.zeros(action_size)
            action_[action] = 1
            
            episode_actions.append(action_)
            
            episode_rewards.append(reward)
            
            if done :
                # Calculate sum reward
                episode_rewards_sum = np.sum(episode_rewards)
                
                allRewards.append(episode_rewards_sum)
                
                total_rewards = np.sum(allRewards)
                
                # Mean reward
                mean_reward = np.divide(total_rewards,episode + 1)
                
                maximumRewardRecorded = np.amax(allRewards)
                
                print("=======================================")
                print("Episode: ",episode)
                print("Reward: ",episode_rewards_sum)
                print("Mean Reward ",mean_reward)
                print("Max reward so far: ",maximumRewardRecorded)
                
                # Calculate discounted reward
                discounted_episode_rewards = discount_and_normalize_rewards(episode_rewards)
                
                # Feedforward, gradient and backpropogation
                loss_, _ = sess.run([loss,train_opt],feed_dict={input_:np.vstack(np.array(episode_states)),
                                                                actions:np.vstack(np.array(episode_actions)),
                                                                discounted_episode_rewards_:discounted_episode_rewards})
                
                # Write TF Summaries
                summary = sess.run(write_op,feed_dict={input_:np.vstack(np.array(episode_states)),
                                                       actions:np.vstack(np.array(episode_actions)),
                                                       discounted_episode_rewards_:discounted_episode_rewards,
                                                       mean_reward_:mean_reward})
                
                writer.add_summary(summary,episode)
                writer.flush()
                
                # Reset transition scores
                episode_states, episode_actions, episode_rewards = [], [], []
                
                break
                
            state = new_state
            
        # Save Model
        if episode % 100 :
            saver.save(sess,"./models/model.ckpt")
            print("Model Saved")

Episode:  0
Reward:  19.0
Mean Reward  19.0
Max reward so far:  19.0
Episode:  1
Reward:  58.0
Mean Reward  38.5
Max reward so far:  58.0
Model Saved
Episode:  2
Reward:  23.0
Mean Reward  33.333333333333336
Max reward so far:  58.0
Model Saved
Episode:  3
Reward:  13.0
Mean Reward  28.25
Max reward so far:  58.0
Model Saved
Episode:  4
Reward:  29.0
Mean Reward  28.4
Max reward so far:  58.0
Model Saved
Episode:  5
Reward:  41.0
Mean Reward  30.5
Max reward so far:  58.0
Model Saved
Episode:  6
Reward:  24.0
Mean Reward  29.571428571428573
Max reward so far:  58.0
Model Saved
Episode:  7
Reward:  16.0
Mean Reward  27.875
Max reward so far:  58.0
Model Saved
Episode:  8
Reward:  18.0
Mean Reward  26.77777777777778
Max reward so far:  58.0
Model Saved
Episode:  9
Reward:  42.0
Mean Reward  28.3
Max reward so far:  58.0
Model Saved
Episode:  10
Reward:  13.0
Mean Reward  26.90909090909091
Max reward so far:  58.0
Model Saved
Episode:  11
Reward:  24.0
Mean Reward  26.666666666666668
Max 

Model Saved
Episode:  63
Reward:  10.0
Mean Reward  20.78125
Max reward so far:  80.0
Model Saved
Episode:  64
Reward:  43.0
Mean Reward  21.123076923076923
Max reward so far:  80.0
Model Saved
Episode:  65
Reward:  25.0
Mean Reward  21.181818181818183
Max reward so far:  80.0
Model Saved
Episode:  66
Reward:  20.0
Mean Reward  21.16417910447761
Max reward so far:  80.0
Model Saved
Episode:  67
Reward:  43.0
Mean Reward  21.485294117647058
Max reward so far:  80.0
Model Saved
Episode:  68
Reward:  21.0
Mean Reward  21.47826086956522
Max reward so far:  80.0
Model Saved
Episode:  69
Reward:  18.0
Mean Reward  21.428571428571427
Max reward so far:  80.0
Model Saved
Episode:  70
Reward:  20.0
Mean Reward  21.408450704225352
Max reward so far:  80.0
Model Saved
Episode:  71
Reward:  42.0
Mean Reward  21.694444444444443
Max reward so far:  80.0
Model Saved
Episode:  72
Reward:  14.0
Mean Reward  21.589041095890412
Max reward so far:  80.0
Model Saved
Episode:  73
Reward:  18.0
Mean Reward  

Model Saved
Episode:  124
Reward:  35.0
Mean Reward  23.84
Max reward so far:  80.0
Model Saved
Episode:  125
Reward:  33.0
Mean Reward  23.91269841269841
Max reward so far:  80.0
Model Saved
Episode:  126
Reward:  27.0
Mean Reward  23.937007874015748
Max reward so far:  80.0
Model Saved
Episode:  127
Reward:  15.0
Mean Reward  23.8671875
Max reward so far:  80.0
Model Saved
Episode:  128
Reward:  52.0
Mean Reward  24.085271317829456
Max reward so far:  80.0
Model Saved
Episode:  129
Reward:  17.0
Mean Reward  24.03076923076923
Max reward so far:  80.0
Model Saved
Episode:  130
Reward:  11.0
Mean Reward  23.931297709923665
Max reward so far:  80.0
Model Saved
Episode:  131
Reward:  46.0
Mean Reward  24.098484848484848
Max reward so far:  80.0
Model Saved
Episode:  132
Reward:  16.0
Mean Reward  24.037593984962406
Max reward so far:  80.0
Model Saved
Episode:  133
Reward:  68.0
Mean Reward  24.365671641791046
Max reward so far:  80.0
Model Saved
Episode:  134
Reward:  20.0
Mean Reward  

Model Saved
Episode:  185
Reward:  45.0
Mean Reward  26.349462365591396
Max reward so far:  90.0
Model Saved
Episode:  186
Reward:  18.0
Mean Reward  26.3048128342246
Max reward so far:  90.0
Model Saved
Episode:  187
Reward:  39.0
Mean Reward  26.372340425531913
Max reward so far:  90.0
Model Saved
Episode:  188
Reward:  42.0
Mean Reward  26.455026455026456
Max reward so far:  90.0
Model Saved
Episode:  189
Reward:  25.0
Mean Reward  26.44736842105263
Max reward so far:  90.0
Model Saved
Episode:  190
Reward:  16.0
Mean Reward  26.392670157068064
Max reward so far:  90.0
Model Saved
Episode:  191
Reward:  41.0
Mean Reward  26.46875
Max reward so far:  90.0
Model Saved
Episode:  192
Reward:  15.0
Mean Reward  26.409326424870468
Max reward so far:  90.0
Model Saved
Episode:  193
Reward:  35.0
Mean Reward  26.45360824742268
Max reward so far:  90.0
Model Saved
Episode:  194
Reward:  14.0
Mean Reward  26.38974358974359
Max reward so far:  90.0
Model Saved
Episode:  195
Reward:  24.0
Mean 

Model Saved
Episode:  245
Reward:  136.0
Mean Reward  37.68292682926829
Max reward so far:  217.0
Model Saved
Episode:  246
Reward:  164.0
Mean Reward  38.19433198380567
Max reward so far:  217.0
Model Saved
Episode:  247
Reward:  183.0
Mean Reward  38.778225806451616
Max reward so far:  217.0
Model Saved
Episode:  248
Reward:  24.0
Mean Reward  38.71887550200803
Max reward so far:  217.0
Model Saved
Episode:  249
Reward:  110.0
Mean Reward  39.004
Max reward so far:  217.0
Model Saved
Episode:  250
Reward:  99.0
Mean Reward  39.243027888446214
Max reward so far:  217.0
Model Saved
Episode:  251
Reward:  146.0
Mean Reward  39.666666666666664
Max reward so far:  217.0
Model Saved
Episode:  252
Reward:  144.0
Mean Reward  40.07905138339921
Max reward so far:  217.0
Model Saved
Episode:  253
Reward:  188.0
Mean Reward  40.661417322834644
Max reward so far:  217.0
Model Saved
Episode:  254
Reward:  96.0
Mean Reward  40.87843137254902
Max reward so far:  217.0
Model Saved
Episode:  255
Rewa

In [13]:
with tf.Session() as sess :
    env.reset()
    rewards = []
    
    # Load the model
    saver.restore(sess,"./models/model.ckpt")
    
    for episode in range(10) :
        state = env.reset()
        step = 0
        done = False
        total_rewards = 0
        print("**************************************")
        print("EPISODE ",episode)
        
        while True :
            
            # Chose action a, remeber WE'RE NOT IN A DETERMINISTIC ENVIRONMENT, WE'RE OUPUT PROBABILITIES
            action_probability_distribution = sess.run(action_distribution,feed_dict={input_:state.reshape([1,4])})
            
            # print (action_probability_distribution)
            action = np.random.choice(range(action_probability_distribution.shape[1]),p=action_probability_distribution.ravel()) # select action w.r.t the action prob
            
            new_state, reward, done, info = env.step(action)
            
            total_rewards += reward
            
            if done :
                rewards.append(total_rewards)
                print("Score",total_rewards)
                break
                
            state = new_state
        env.close()
        print("Score over time: " + str(sum(rewards)/10))

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
**************************************
EPISODE  0
Score 263.0
Score over time: 26.3
**************************************
EPISODE  1
Score 603.0
Score over time: 86.6
**************************************
EPISODE  2
Score 341.0
Score over time: 120.7
**************************************
EPISODE  3
Score 289.0
Score over time: 149.6
**************************************
EPISODE  4
Score 410.0
Score over time: 190.6
**************************************
EPISODE  5
Score 20.0
Score over time: 192.6
**************************************
EPISODE  6
Score 470.0
Score over time: 239.6
**************************************
EPISODE  7
Score 239.0
Score over time: 263.5
**************************************
EPISODE  8
Score 286.0
Score over time: 292.1
**************************************
EPISODE  9
Score 342.0
Score over time: 326.3
