## Policy gradient using REINFORCE algorithm

In this demo we create a multi-layer perceptron using tensorflow and train it to estimate our policy for the cart-pole environment in openai gym.

In [1]:
import tensorflow as tf
import numpy as np
import gym

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
env = gym.make('CartPole-v0')
env = env.unwrapped
# Policy gradient has high variance, seed for reproducability
env.seed(1)

[1]

In [3]:
## ENVIRONMENT Hyperparameters
state_size = 4
action_size = env.action_space.n

## TRAINING Hyperparameters
max_episodes = 300
learning_rate = 0.01
gamma = 0.95 # Discount rate

In [4]:
def discount_and_normalize_rewards(episode_rewards):
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    cumulative = 0.0
    for i in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[i]
        discounted_episode_rewards[i] = cumulative
    
    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    discounted_episode_rewards = (discounted_episode_rewards - mean) / (std)
    
    return discounted_episode_rewards

In [6]:
with tf.name_scope("inputs"):
    input_ = tf.placeholder(tf.float32, [None, state_size], name="input_")
    actions = tf.placeholder(tf.int32, [None, action_size], name="actions")
    discounted_episode_rewards_ = tf.placeholder(tf.float32, [None,], name="discounted_episode_rewards")
    
    # Add this placeholder for having this variable in tensorboard
    mean_reward_ = tf.placeholder(tf.float32 , name="mean_reward")

    with tf.name_scope("fc1"):
        fc1 = tf.contrib.layers.fully_connected(inputs = input_,
                                                num_outputs = 10,
                                                activation_fn=tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())

    with tf.name_scope("fc2"):
        fc2 = tf.contrib.layers.fully_connected(inputs = fc1,
                                                num_outputs = action_size,
                                                activation_fn= tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())
    
    with tf.name_scope("fc3"):
        fc3 = tf.contrib.layers.fully_connected(inputs = fc2,
                                                num_outputs = action_size,
                                                activation_fn= None,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())

    with tf.name_scope("softmax"):
        action_distribution = tf.nn.softmax(fc3)

    with tf.name_scope("loss"):
        # tf.nn.softmax_cross_entropy_with_logits computes the cross entropy of the result after applying the softmax function
        # If you have single-class labels, where an object can only belong to one class, you might now consider using 
        # tf.nn.sparse_softmax_cross_entropy_with_logits so that you don't have to convert your labels to a dense one-hot array. 
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = fc3, labels = actions)
        loss = tf.reduce_mean(neg_log_prob * discounted_episode_rewards_) 
        
    
    with tf.name_scope("train"):
        train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [7]:
allRewards = []
total_rewards = 0
maximumRewardRecorded = 0
episode = 0
episode_states, episode_actions, episode_rewards = [],[],[]

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for episode in range(max_episodes):
        
        episode_rewards_sum = 0

        # Launch the game
        state = env.reset()
        
        #env.render()
           
        while True:
            
            # Choose action a, remember WE'RE NOT IN A DETERMINISTIC ENVIRONMENT, WE'RE OUTPUT PROBABILITIES.
            action_probability_distribution = sess.run(action_distribution, feed_dict={input_: state.reshape([1,4])})
            
            action = np.random.choice(range(action_probability_distribution.shape[1]), p=action_probability_distribution.ravel())  # select action w.r.t the actions prob

            # Perform a
            new_state, reward, done, info = env.step(action)

            # Store s, a, r
            episode_states.append(state)
                        
            # For actions because we output only one (the index) we need 2 (1 is for the action taken)
            # We need [0., 1.] (if we take right) not just the index
            action_ = np.zeros(action_size)
            action_[action] = 1
            
            episode_actions.append(action_)
            
            episode_rewards.append(reward)
            if done:
                # Calculate sum reward
                episode_rewards_sum = np.sum(episode_rewards)
                
                allRewards.append(episode_rewards_sum)
                
                total_rewards = np.sum(allRewards)
                
                # Mean reward
                mean_reward = np.divide(total_rewards, episode+1)
                
                
                maximumRewardRecorded = np.amax(allRewards)
                
                print("==========================================")
                print("Episode: ", episode)
                print("Reward: ", episode_rewards_sum)
                print("Mean Reward", mean_reward)
                print("Max reward so far: ", maximumRewardRecorded)
                
                # Calculate discounted reward
                discounted_episode_rewards = discount_and_normalize_rewards(episode_rewards)
                                
                # Feedforward, gradient and backpropagation
                loss_, _ = sess.run([loss, train_opt], feed_dict={input_: np.vstack(np.array(episode_states)),
                                                                 actions: np.vstack(np.array(episode_actions)),
                                                                 discounted_episode_rewards_: discounted_episode_rewards 
                                                                })

            
                
                # Reset the transition stores
                episode_states, episode_actions, episode_rewards = [],[],[]
                
                break
            
            state = new_state
        
        # Save Model
        if episode % 100 == 0:
            saver.save(sess, "./models/model.ckpt")
            print("Model saved")

Episode:  0
Reward:  29.0
Mean Reward 29.0
Max reward so far:  29.0
Model saved
Episode:  1
Reward:  39.0
Mean Reward 34.0
Max reward so far:  39.0
Episode:  2
Reward:  13.0
Mean Reward 27.0
Max reward so far:  39.0
Episode:  3
Reward:  28.0
Mean Reward 27.25
Max reward so far:  39.0
Episode:  4
Reward:  26.0
Mean Reward 27.0
Max reward so far:  39.0
Episode:  5
Reward:  22.0
Mean Reward 26.166666666666668
Max reward so far:  39.0
Episode:  6
Reward:  13.0
Mean Reward 24.285714285714285
Max reward so far:  39.0
Episode:  7
Reward:  13.0
Mean Reward 22.875
Max reward so far:  39.0
Episode:  8
Reward:  25.0
Mean Reward 23.11111111111111
Max reward so far:  39.0
Episode:  9
Reward:  9.0
Mean Reward 21.7
Max reward so far:  39.0
Episode:  10
Reward:  19.0
Mean Reward 21.454545454545453
Max reward so far:  39.0
Episode:  11
Reward:  24.0
Mean Reward 21.666666666666668
Max reward so far:  39.0
Episode:  12
Reward:  40.0
Mean Reward 23.076923076923077
Max reward so far:  40.0
Episode:  13
Rew

Episode:  79
Reward:  21.0
Mean Reward 24.0875
Max reward so far:  98.0
Episode:  80
Reward:  31.0
Mean Reward 24.17283950617284
Max reward so far:  98.0
Episode:  81
Reward:  35.0
Mean Reward 24.304878048780488
Max reward so far:  98.0
Episode:  82
Reward:  20.0
Mean Reward 24.253012048192772
Max reward so far:  98.0
Episode:  83
Reward:  9.0
Mean Reward 24.071428571428573
Max reward so far:  98.0
Episode:  84
Reward:  37.0
Mean Reward 24.223529411764705
Max reward so far:  98.0
Episode:  85
Reward:  14.0
Mean Reward 24.1046511627907
Max reward so far:  98.0
Episode:  86
Reward:  17.0
Mean Reward 24.022988505747126
Max reward so far:  98.0
Episode:  87
Reward:  42.0
Mean Reward 24.227272727272727
Max reward so far:  98.0
Episode:  88
Reward:  27.0
Mean Reward 24.258426966292134
Max reward so far:  98.0
Episode:  89
Reward:  11.0
Mean Reward 24.11111111111111
Max reward so far:  98.0
Episode:  90
Reward:  32.0
Mean Reward 24.197802197802197
Max reward so far:  98.0
Episode:  91
Reward:

Episode:  148
Reward:  20.0
Mean Reward 24.449664429530202
Max reward so far:  98.0
Episode:  149
Reward:  11.0
Mean Reward 24.36
Max reward so far:  98.0
Episode:  150
Reward:  12.0
Mean Reward 24.278145695364238
Max reward so far:  98.0
Episode:  151
Reward:  27.0
Mean Reward 24.29605263157895
Max reward so far:  98.0
Episode:  152
Reward:  96.0
Mean Reward 24.764705882352942
Max reward so far:  98.0
Episode:  153
Reward:  23.0
Mean Reward 24.753246753246753
Max reward so far:  98.0
Episode:  154
Reward:  15.0
Mean Reward 24.690322580645162
Max reward so far:  98.0
Episode:  155
Reward:  24.0
Mean Reward 24.685897435897434
Max reward so far:  98.0
Episode:  156
Reward:  36.0
Mean Reward 24.75796178343949
Max reward so far:  98.0
Episode:  157
Reward:  22.0
Mean Reward 24.740506329113924
Max reward so far:  98.0
Episode:  158
Reward:  15.0
Mean Reward 24.67924528301887
Max reward so far:  98.0
Episode:  159
Reward:  59.0
Mean Reward 24.89375
Max reward so far:  98.0
Episode:  160
Rewa

Episode:  216
Reward:  59.0
Mean Reward 29.516129032258064
Max reward so far:  127.0
Episode:  217
Reward:  54.0
Mean Reward 29.628440366972477
Max reward so far:  127.0
Episode:  218
Reward:  79.0
Mean Reward 29.85388127853881
Max reward so far:  127.0
Episode:  219
Reward:  40.0
Mean Reward 29.9
Max reward so far:  127.0
Episode:  220
Reward:  91.0
Mean Reward 30.176470588235293
Max reward so far:  127.0
Episode:  221
Reward:  44.0
Mean Reward 30.23873873873874
Max reward so far:  127.0
Episode:  222
Reward:  113.0
Mean Reward 30.609865470852018
Max reward so far:  127.0
Episode:  223
Reward:  18.0
Mean Reward 30.553571428571427
Max reward so far:  127.0
Episode:  224
Reward:  28.0
Mean Reward 30.54222222222222
Max reward so far:  127.0
Episode:  225
Reward:  56.0
Mean Reward 30.654867256637168
Max reward so far:  127.0
Episode:  226
Reward:  16.0
Mean Reward 30.590308370044053
Max reward so far:  127.0
Episode:  227
Reward:  97.0
Mean Reward 30.88157894736842
Max reward so far:  127

Episode:  281
Reward:  385.0
Mean Reward 85.87234042553192
Max reward so far:  1119.0
Episode:  282
Reward:  235.0
Mean Reward 86.39929328621908
Max reward so far:  1119.0
Episode:  283
Reward:  251.0
Mean Reward 86.97887323943662
Max reward so far:  1119.0
Episode:  284
Reward:  278.0
Mean Reward 87.64912280701755
Max reward so far:  1119.0
Episode:  285
Reward:  230.0
Mean Reward 88.14685314685315
Max reward so far:  1119.0
Episode:  286
Reward:  260.0
Mean Reward 88.74564459930313
Max reward so far:  1119.0
Episode:  287
Reward:  290.0
Mean Reward 89.44444444444444
Max reward so far:  1119.0
Episode:  288
Reward:  268.0
Mean Reward 90.06228373702422
Max reward so far:  1119.0
Episode:  289
Reward:  229.0
Mean Reward 90.54137931034482
Max reward so far:  1119.0
Episode:  290
Reward:  214.0
Mean Reward 90.96563573883161
Max reward so far:  1119.0
Episode:  291
Reward:  218.0
Mean Reward 91.40068493150685
Max reward so far:  1119.0
Episode:  292
Reward:  262.0
Mean Reward 91.9829351535

In [8]:
with tf.Session() as sess:
    env.reset()
    rewards = []
    
    # Load the model
    saver.restore(sess, "./models/model.ckpt")

    for episode in range(10):
        state = env.reset()
        step = 0
        done = False
        total_rewards = 0
        print("****************************************************")
        print("EPISODE ", episode)

        while True:
            

            # Choose action a, remember WE'RE NOT IN A DETERMINISTIC ENVIRONMENT, WE'RE OUTPUT PROBABILITIES.
            action_probability_distribution = sess.run(action_distribution, feed_dict={input_: state.reshape([1,4])})
            #print(action_probability_distribution)
            action = np.random.choice(range(action_probability_distribution.shape[1]), p=action_probability_distribution.ravel())  # select action w.r.t the actions prob


            new_state, reward, done, info = env.step(action)

            total_rewards += reward

            if done:
                rewards.append(total_rewards)
                print ("Score", total_rewards)
                break
            state = new_state
    env.close()
    print ("Score over time: " +  str(sum(rewards)/10))

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./models/model.ckpt
****************************************************
EPISODE  0
Score 22.0
****************************************************
EPISODE  1
Score 23.0
****************************************************
EPISODE  2
Score 82.0
****************************************************
EPISODE  3
Score 20.0
****************************************************
EPISODE  4
Score 24.0
****************************************************
EPISODE  5
Score 14.0
****************************************************
EPISODE  6
Score 34.0
****************************************************
EPISODE  7
Score 90.0
****************************************************
EPISODE  8
Score 31.0
****************************************************
EPISODE  9
Score 13.0
Score over time: 35.3
