# Vanilla Policy Gradient Report


This project solves the CartPole problem from OpenAI Gym. The following is sample code for vanilla policy gradient (PG) provided by https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724#.mtwpvfi8b. My understanding of each line of code is given in the inline comments. 

To summarize, the sample code uses vanilla PG to directly compute the gradient of the loss function L = -A(s,a)*log(pi(a|s)) with respect to the trainable variables, i.e., the weights of the two layers 
in the network, and then minimizes this loss with the Adam optimizer (equivalently, gradient ascent on the expected reward) to update the network. The following image shows the mean reward of the latest 25 episodes, recorded every 100 episodes. 
For each episode, the maximum reward is 200.

<img src="reward.png">

Part 1: Build the PG agent. The agent is a neural network with two fully connected layers. 

In [None]:
class agent():
    """Policy-gradient agent: a two-layer network mapping states to action probabilities.

    Construction adds to the default TensorFlow graph:
      * the forward pass (``state_in`` -> ``output``),
      * the loss L = -mean(A(s,a) * log(pi(a|s))),
      * the gradients of that loss w.r.t. every trainable variable,
      * an op (``update_batch``) that applies externally supplied gradients
        with Adam.

    Args:
        lr: learning rate (float or tensor) handed to the Adam optimizer.
        s_size: number of features in a state.
        a_size: number of discrete actions.
        h_size: number of units in the hidden layer.
        global_step: optional variable incremented each time ``update_batch``
            runs. When omitted, a module-level ``global_step`` is used if one
            exists (the original behaviour); if there is none, no step counter
            is updated.
    """

    def __init__(self, lr, s_size, a_size, h_size, global_step=None):
        if global_step is None:
            # Backward compatibility: earlier versions read the step counter
            # from a module-level variable instead of taking it as an argument.
            global_step = globals().get('global_step')

        #Create placeholder for states
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        #Create a hidden layer, with size of h_size
        hidden = tf.contrib.slim.fully_connected(
            self.state_in, h_size, biases_initializer=None, activation_fn=tf.nn.relu)
        #Calculate output (action probabilities), with size of (batch size, # of actions).
        self.output = tf.contrib.slim.fully_connected(
            hidden, a_size, activation_fn=tf.nn.softmax, biases_initializer=None)

        #Feed the reward and chosen action into the network to compute the loss.
        #The loss function is L = -A(s,a)*log(pi(a|s)). Then use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)
        #Create a list of flat indices (row * a_size + action) that point to the
        #probability of the action actually taken in each row of the output.
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        #Select outputs by indices.
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)
        #Plug the responsible outputs, i.e., policy into the loss function along with the rewards.
        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)

        #List all trainable variables in the graph (here: the weights of the two layers).
        tvars = tf.trainable_variables()
        #One placeholder per variable, so gradients accumulated outside the
        #graph can be fed back in when it is time to update.
        self.gradient_holders = [
            tf.placeholder(tf.float32, name=str(idx) + '_holder')
            for idx, _ in enumerate(tvars)
        ]

        #Compute gradients of the loss with respect to each trainable variable
        self.gradients = tf.gradients(self.loss, tvars)

        #Use Adam optimizer and apply the (gradient holder, variable) pairs to it
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(
            zip(self.gradient_holders, tvars), global_step=global_step)

Part 2: train the agent.

In [None]:
#Clear the Tensorflow graph.
tf.reset_default_graph() 
#The next two lines set the learning rate to decay over time:
#lr = 0.01 * 0.9 ** (global_step / 500), where global_step counts optimizer updates.
#Uses the same TF1 `tf.train` namespace as the rest of the file.
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(0.01, global_step, 500, 0.9)

#Load the agent: 4 state features, 2 actions, 8 hidden units.
myAgent = agent(lr=learning_rate,s_size=4,a_size=2,h_size=8) 
#Set total number of episodes to train agent on.
total_episodes = 5000 
#Set max moves in each episodes
max_ep = 999
#Number of episodes whose gradients are accumulated before each network update
update_frequency = 5

#Create a list for mean reward over time
re = []

init = tf.global_variables_initializer()


# Launch the tensorflow graph
# NOTE: `env` and `discount_rewards` are not defined in this cell; they must
# already exist when it runs.
with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_length = []

    #Create one zero-filled accumulator per trainable variable (same shape as the variable).
    gradBuffer = [np.zeros_like(var) for var in sess.run(tf.trainable_variables())]

    #Start training
    while i < total_episodes:
        #Get the initial state
        s = env.reset()
        running_reward = 0
        #Create buffer of experience for later update
        ep_history = []
        for j in range(max_ep):
            #Probabilistically pick an action given our network outputs.
            #Sample the action index directly: sampling a probability and
            #looking it up again always yields action 0 when probabilities tie.
            a_dist = sess.run(myAgent.output, feed_dict={myAgent.state_in: [s]})
            a = np.random.choice(len(a_dist[0]), p=a_dist[0])

            #Get our reward for taking an action.
            s1, r, d, _ = env.step(a)
            #Add this experience to buffer
            ep_history.append([s, a, r, s1])
            #Set the current state to next state
            s = s1
            running_reward += r
            #If the episode is done, i.e, the pole fell or this episode has gained 200 rewards
            if d:
                #Split the experience into homogeneous arrays; a single array
                #over the mixed [state, action, reward, state] rows is ragged.
                states = np.vstack([step[0] for step in ep_history])
                actions = np.array([step[1] for step in ep_history], dtype=np.int32)
                rewards = np.array([step[2] for step in ep_history], dtype=np.float64)
                #Replace the raw rewards with their discounted counterparts
                returns = discount_rewards(rewards)
                #Feed the actions, rewards and states from experience buffer into the graph
                feed_dict = {myAgent.reward_holder: returns,
                             myAgent.action_holder: actions,
                             myAgent.state_in: states}
                #Compute the gradients of the loss function with respect to the two neural layers
                #Then add them to the gradient buffer created at the beginning.
                #Fetching the list itself (not wrapped in another list) gives one array per variable.
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad

                #If it is time to update the network, update it.
                if i % update_frequency == 0 and i != 0:
                    #Feed gradient holders and the gradient buffer to the graph
                    feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                    sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    #Clear gradient buffer for the next batch of episodes
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0

                total_reward.append(running_reward)
                total_length.append(j)
                break

        #Every 100 episodes, print and record the mean reward of the latest 25 episodes.
        if i % 100 == 0:
            print(np.mean(total_reward[-25:]))
            re.append(np.mean(total_reward[-25:]))
        i += 1