In [1]:
import matplotlib.pyplot as plt
import math

import gym
env = gym.make('CartPole-v0')

import numpy as np
import tensorflow as tf

env.reset()

#Hyperparameters
H_SIZE = 10 #Number of hidden layer neurons
batch_size = 5 #Update Params after every 5 episodes
learning_r = 0.01 #Learning Rate
GAMMA = 0.95 #Discount factor

INPUT_DIM = 4 #Input dimensions

  from ._conv import register_converters as _register_converters


In [2]:
# Initializing 
tf.reset_default_graph()

# Network to define moving left or right
input = tf.placeholder(tf.float32, [None,INPUT_DIM] , name="input_x")
W1 = tf.get_variable("W1", shape=[INPUT_DIM, H_SIZE],
           initializer=tf.contrib.layers.xavier_initializer()) # weight for hidden layer
layer1 = tf.nn.relu(tf.matmul(input,W1))
W2 = tf.get_variable("W2", shape=[H_SIZE, 1],
           initializer=tf.contrib.layers.xavier_initializer()) # weight for output layer
score = tf.matmul(layer1,W2)
probability = tf.nn.sigmoid(score)

# Define network needed for learning a good policy.
tvars = tf.trainable_variables() # for storing weights caches
input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
advantages = tf.placeholder(tf.float32,name="reward_signal") # advantage is total reward over time

# The loss function sends the weights in the direction of making actions 
# that gave good advantage (reward over time) more likely, and actions that didn't less likely.
loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability)) # Entropy generation

loss = -tf.reduce_mean(loglik * advantages) # advantage is total reward over time:
newGrads = tf.gradients(loss,tvars) # get dloss_dtvars

In [3]:
# Optimization 

adam = tf.train.AdamOptimizer(learning_rate=learning_r) # Adam optimizer
W1Grad = tf.placeholder(tf.float32,name="batch_grad1") # Placeholders for final gradients once update happens
W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
batchGrad = [W1Grad,W2Grad] # Update Params after every mini batch: 5 episodes
updateGrads = adam.apply_gradients(zip(batchGrad,tvars))

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)): # calculate backward, last step's reward enjoys least impact from discount
        running_add = running_add * GAMMA + r[t] # aggregating reward iteratively 
        discounted_r[t] = running_add
    return discounted_r

In [12]:
# xs:input_x, 
# drs: store reward after action is taken,
# ys: y = 1 if action == 0 else 0
xs,drs,ys = [],[],[] #Arrays to store parameters till an update happens
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 100
initial = tf.global_variables_initializer()

In [13]:
# Training
with tf.Session() as sess:
    rendering = False
    sess.run(initial)
    input_initial = env.reset() # Initial state of the environment

    # Array to store gradients for each min-batch step
    gradBuffer = sess.run(tvars)
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    
    while episode_number <= total_episodes:
        
        if reward_sum/batch_size > 100 or rendering == True :     #Render environment only after avg reward reaches 100
            #env.render()
            rendering = True
            
        # Format the state for placeholder
        x = np.reshape(input_initial,[1,INPUT_DIM]) # introduce initial state to shape [1,4]
        
        # Run policy network 
        tfprob = sess.run(probability,feed_dict={input: x}) #  calculate sigmoid for initial state
        action = 1 if np.random.uniform() < tfprob else 0 # make an initial move 
        
        xs.append(x) # store the present state to the state list
        y = 1 if action == 0 else 0
        ys.append(y) # store 

        # take action for the state
        input_initial, reward, done, info = env.step(action)
        reward_sum += reward

        drs.append(reward) # store reward after action is taken

        if done: # done means collaps, then entering to a brand new episode
            episode_number += 1
            # Stack the memory arrays to feed in session
            epx = np.vstack(xs) # stack all recorded state in one list
            epy = np.vstack(ys) # ??
            epr = np.vstack(drs)# stack all recorded reward in one list
            
            xs,drs,ys = [],[],[] #Reset Arrays

            # Compute the discounted reward
            discounted_epr = discount_rewards(epr) # expected reward is the aggregation of reward with discounted factor
            
            # normalize and standardize the reward
            discounted_epr -= np.mean(discounted_epr) # scalized the dicounted reward
            discounted_epr /= np.std(discounted_epr) # standardize the dicount reward
            
            # Get and save the gradient
            tGrad = sess.run(newGrads,feed_dict={input: epx, input_y: epy, advantages: discounted_epr})
            for ix,grad in enumerate(tGrad):
                gradBuffer[ix] += grad
                
            # Update Params after Min-Batch number of episodes
            if episode_number % batch_size == 0: 
                sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
                for ix,grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0
                
                # Print details of the present model
                running_reward = reward_sum if running_reward is None else running_reward * 0.95 + reward_sum * 0.01
                print ('Average reward for episode %f.  Total average reward %f.' % (reward_sum/batch_size, running_reward/batch_size))
                
                if reward_sum/batch_size > 200: 
                    print ("Task solved in",episode_number,'episodes')
                    break
                    
                reward_sum = 0
            
            input_initial = env.reset()
        
print (episode_number,'Episodes completed.')

Average reward for episode 14.800000.  Total average reward 14.800000.
Average reward for episode 14.600000.  Total average reward 14.206000.
Average reward for episode 26.600000.  Total average reward 13.761700.
Average reward for episode 27.800000.  Total average reward 13.351615.
Average reward for episode 19.200000.  Total average reward 12.876034.
Average reward for episode 28.800000.  Total average reward 12.520233.
Average reward for episode 16.000000.  Total average reward 12.054221.
Average reward for episode 16.400000.  Total average reward 11.615510.
Average reward for episode 18.200000.  Total average reward 11.216734.
Average reward for episode 24.200000.  Total average reward 10.897898.
Average reward for episode 18.000000.  Total average reward 10.533003.
Average reward for episode 22.200000.  Total average reward 10.228353.
Average reward for episode 23.200000.  Total average reward 9.948935.
Average reward for episode 23.800000.  Total average reward 9.689488.
Average 