# 0) Installing the dependencies.

In [1]:
!pip install tensorflow-gpu
!pip install numpy
!pip install gym



# 1) Importing the libraries.

In [2]:
import tensorflow as tf
import numpy as np
import gym

# 2) Creating our environment.

To start working with Carpole, we'll first use OpenAI Gym to create our environment.

In [3]:
env = gym.make('CartPole-v0')
env = env.unwrapped  # unwrapped means no custom Wrappers are used
env.seed(1)  # Considering policy gradients have high variability, we'll use a fixed seed for reproducibility

[1]

# 3) Setting up our hyperparameters.

In [4]:
# >>>> ENVIRONMENT HYPERPARAMETERS <<<<
state_size = 4
action_size = env.action_space.n

# >>>> TRAINING HYPERPARAMETERS <<<<
max_episodes = 10000
learning_rate = 0.01
gamma = 0.95 # gamma is the discount rate for future rewards

# 4) Defining our preprocessing function

In our **Policy Gradient** implementation, we're using a preprocessing function to **perform the discounting of our rewards using gamma**.

In [5]:
def discount_and_normalize_rewards(episode_rewards):
    discounted_episode_rewards = np.zeros_like(episode_rewards)  # Empty numpy array with the same size of our rewards
    cumulative = 0.0
    
    # We're taking the sum of all rewards discounted by gamma, in reversed order.
    for i in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[i]
        discounted_episode_rewards[i] = cumulative
    
    # Normalizes the rewards by subtracting them to their mean and dividing the result by the standard deviation.
    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    discounted_episode_rewards = (discounted_episode_rewards - mean) / (std)
        
    return discounted_episode_rewards


# 5) Creating our Policy Gradient Neural Network model with TensorFlow

We'll implement the following model:

- The input of our network is a state, which is an array of size 4 containing:
    - Cart Position;
    - Cart Velocity;
    - Pole Angle;
    - Pole Velocity at Tip.
- Our neural network contains 3 dense layers. The first two with RELU activation and the third with no activation.
- The outut of our network is a probability distribution function for the possible actions, so we're using another dense layer with *softmax* activation function. The possible actions for our environment are:
    - Push cart to the left;
    - Push cart to the right.

In [6]:
with tf.name_scope('inputs'):
    # Placeholder used to get the input, in this case the state
    input_ = tf.placeholder(tf.float32, [None, state_size], name='input_')
    # Placeholder used to output the probability distribution for our actions
    actions = tf.placeholder(tf.int32, [None, action_size], name='actions')
    discounted_episode_rewards_ = tf.placeholder(tf.float32, [None, ], name='discounted_episode_rewards')
    
    # This placeholder is used to record the mean reward info to TensorBoard
    mean_reward_ = tf.placeholder(tf.float32, name='mean_reward')
    
    with tf.name_scope('fc1'):
        fc1 = tf.contrib.layers.fully_connected(inputs=input_,
                                                num_outputs = 10,
                                                activation_fn = tf.nn.relu,
                                                weights_initializer = tf.contrib.layers.xavier_initializer())
        
    with tf.name_scope('fc2'):
        fc2 = tf.contrib.layers.fully_connected(inputs=fc1,
                                                num_outputs = action_size,
                                                activation_fn = tf.nn.relu,
                                                weights_initializer = tf.contrib.layers.xavier_initializer())
        
    with tf.name_scope('fc3'):
        fc3 = tf.contrib.layers.fully_connected(inputs=fc2,
                                                num_outputs = action_size,
                                                activation_fn = None,
                                                weights_initializer = tf.contrib.layers.xavier_initializer())
        
    with tf.name_scope('softmax'):
        action_distribution = tf.nn.softmax(fc3)
    
    with tf.name_scope('loss'):
        # Computing the crossentropy of the result of our network, after applying softmax. This works
        # just like a multiclass classification problem, so we can use cross entropy to calculate our loss.
        # If we were using just one class to classify our output probability distribution, we could use
        # tf.nn.sparse_softmax_cross_entropy_with_logits to calculate our loss.
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=fc3, labels=actions)
        loss = tf.reduce_mean(neg_log_prob * discounted_episode_rewards_)
        
    with tf.name_scope('train'):
        # Optimizes our parameters using 'learning_rate' and minimizing 'loss'
        train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    
    

# 6) Setting up TensorBoard.

To launch TensorBoard, use *tensorboard --logdir=/tensorboard/pg/1* on the command prompt.

In [7]:
# Setting up TensorBoard writer
writer = tf.summary.FileWriter('/tensorboard/pg/1')

# Recording the losses
tf.summary.scalar('Loss', loss)

# Recording the reward mean
tf.summary.scalar('Reward_mean', mean_reward_)

write_op = tf.summary.merge_all()

# 7) Training our agent.

Now we'll use our neural network model to output a probability distribution for the possible actions for our agent given an input state, select an action from this distribution and execute it in our environment. When an episode is over, we'll feed all states, actions and rewards from the episode to train our network's parameters.

In [9]:
all_rewards = []   # Records all rewards so far
total_rewards = 0  # Sum of all rewards
maximum_reward_recorded = 0
episode = 0  # Current episode
episode_states, episode_actions, episode_rewards = [], [], []  # Records all states, actions and rewards for an episode

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for episode in range(max_episodes):
        
        episode_rewards_sum = 0
        state = env.reset()  # Resets the environment before starting the episode
        env.render()
        
        while True:
            # Choosing an action from the network's output, which is a probability distribution over
            # the possible actions given the input state.
            action_probability_distribution = sess.run(action_distribution,
                                                       feed_dict = {input_: state.reshape([1, 4])})
            
            action = np.random.choice(range(action_probability_distribution.shape[1]),
                                      p=action_probability_distribution.ravel())
            
            new_state, reward, done, info = env.step(action)  # Executes the chosen action
            
            # Storing states, actions and rewards
            episode_states.append(state)
            
            # For actions, we'll record a one-hot vector correspondent to the chosen action
            action_ = np.zeros(action_size)
            action_[action] = 1  # Setting index = 'action' to 1
            episode_actions.append(action_)
            
            episode_rewards.append(reward)
            
            if done:
                # Calculating the total reward for the current episode
                episode_rewards_sum = np.sum(episode_rewards_sum)
                
                all_rewards.append(episode_rewards_sum)
                
                total_rewards = np.sum(all_rewards)
                
                mean_reward = np.divide(total_rewards, episode+1)
                
                maximum_reward_recorded = np.amax(all_rewards)
                
                print("===========================")
                print("Episode #", episode)
                print("Reward: ", episode_rewards_sum)
                print("Mean Reward: ", mean_reward)
                print("Max Reward So Far: ", maximum_reward_recorded)
                print("===========================")
                
                # Calculating the discounted rewards for the current episode
                discounted_episode_rewards = discount_and_normalize_rewards(episode_rewards)
                
                # Calculating the loss and training our parameters using the current episode's outputs,
                # that is, feedforward, gradient and backpropagation steps.
                loss_, _ = sess.run([loss, train_opt],
                                    feed_dict={input_: np.vstack(np.array(episode_states)),
                                               actions: np.vstack(np.array(episode_actions)),
                                               discounted_episode_rewards_: discounted_episode_rewards})
                
                # Writing training summaries to TensorBoard
                summary = sess.run(write_op,
                                   feed_dict={input_: np.vstack(np.array(episode_states)),
                                                actions: np.vstack(np.array(episode_actions)),
                                                discounted_episode_rewards_: discounted_episode_rewards,
                                                mean_reward_: mean_reward})
                
                writer.add_summary(summary, episode)
                writer.flush()
                
                # Resetting the stores before beginning a new episode
                episode_states, episode_actions, episode_rewards = [], [], []
                
                break
                
            state = new_state
            
        

Episode # 0
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 1
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 2
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 3
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 4
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 5
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 6
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 7
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 8
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 9
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 10
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 11
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 12
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 13
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 14
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 15
Reward:  0
Mean Reward:  0.0
Max Rewa

Episode # 74
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 75
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 76
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 77
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 78
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 79
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 80
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 81
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 82
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 83
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 84
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 85
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 86
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 87
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 88
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 89
Reward:  0
Mean Reward:  0.

Episode # 150
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 151
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 152
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 153
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 154
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 155
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 156
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 157
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 158
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 159
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 160
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 161
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 162
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 163
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 164
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 165
Reward:  0


Episode # 220
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 221
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 222
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 223
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 224
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 225
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 226
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 227
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 228
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 229
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 230
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 231
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 232
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 233
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 234
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 235
Reward:  0


Episode # 288
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 289
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 290
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 291
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 292
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 293
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 294
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 295
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 296
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 297
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 298
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 299
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 300
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 301
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 302
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 303
Reward:  0


Episode # 356
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 357
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 358
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 359
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 360
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 361
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 362
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 363
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 364
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 365
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 366
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 367
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 368
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 369
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 370
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 371
Reward:  0


Episode # 425
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 426
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 427
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 428
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 429
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 430
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 431
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 432
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 433
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 434
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 435
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 436
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 437
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 438
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 439
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 440
Reward:  0


Episode # 494
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 495
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 496
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 497
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 498
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 499
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 500
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 501
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 502
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 503
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 504
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 505
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 506
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 507
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 508
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 509
Reward:  0


Episode # 563
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 564
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 565
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 566
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 567
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 568
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 569
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 570
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 571
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 572
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 573
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 574
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 575
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 576
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 577
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 578
Reward:  0


Episode # 631
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 632
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 633
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 634
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 635
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 636
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 637
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 638
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 639
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 640
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 641
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 642
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 643
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 644
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 645
Reward:  0
Mean Reward:  0.0
Max Reward So Far:  0
Episode # 646
Reward:  0


KeyboardInterrupt: 