In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

try:
    xrange = xrange
except:
    xrange = range

In [2]:
#env = gym.make('CartPole-v0')
env = gym.make('MountainCar-v0')

[2017-07-29 18:19:29,919] Making new env: MountainCar-v0


In [3]:
gamma = 0.99

def discount_rewards(r):
    """ Take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

In [4]:

class agent():
    
    def forward_model(self, s_size, a_size):
        
        with tf.variable_scope('forward_model'):
            W_action = tf.Variable(tf.random_uniform([a_size, s_size], -1.0, 1.0))

            state_pred = slim.fully_connected(self.state_in, s_size, biases_initializer=None, activation_fn=None)
            state_pred += tf.nn.embedding_lookup(W_action, self.action_holder)


            return state_pred
    
    
    def __init__(self, lr, s_size, a_size, h_size, curiosity_activated=True, intrinsic_reward_weight=1.0):
        
        
        # These lines established the feed-forward part of the network. 
        # The agent takes a state and produces an action.
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        self.next_state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)

        with tf.variable_scope('policy'):
            hidden = slim.fully_connected(self.state_in, h_size, biases_initializer=None, activation_fn=tf.nn.relu)
            self.output = slim.fully_connected(hidden, a_size, activation_fn=tf.nn.softmax, biases_initializer=None) #pi
        # self.chosen_action = tf.argmax(self.output, 1)

        # The next six lines establish the training proceedure. 
        # We feed the reward and chosen action into the network
        # to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32) # Contains discounted future reward for each timestep
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32) # Contains index of actions for each timestep
        
        # Gather the probabilities for the action we took at each timestep
        self.indexes = tf.range(0, tf.shape(self.action_holder)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

        # Takes s, s', a and minimizes (f(s, a) - s')^2
        self.state_pred = self.forward_model(s_size, a_size)
        
        
        self.forward_loss = tf.nn.l2_loss( self.state_pred - self.next_state_in )
        
        forward_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.forward_model_step = forward_optimizer.minimize(self.forward_loss)
        
        
        
        self.intrinsic_reward = self.forward_loss
        
        if curiosity_activated:
            self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * (self.reward_holder + intrinsic_reward_weight*self.intrinsic_reward))
        else:
            self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)
        
        tvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'policy')
        self.gradients = tf.gradients(self.loss, tvars)

        self.gradients_etc = [self.gradients, self.forward_model_step]

        
        # Create gradient placeholders for our parameters so that we can feed in a batch of gradients
        self.gradient_holders = []
        for idx, var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32, name=str(idx) + '_holder')
            self.gradient_holders.append(placeholder)
        
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.take_step = optimizer.apply_gradients(zip(self.gradient_holders, tvars))
        
            
        
    def pick_action(self, observation):
        a_dist = sess.run(self.output, feed_dict={self.state_in:[observation]})
        a = np.random.choice(range(0, a_dist.shape[1]), p=a_dist[0])
        print("probs:", a_dist[0])
#        a = np.random.choice(a_dist[0], p=a_dist[0])
#        a = np.argmax(a_dist == a)
        return a

NameError: name 'curiosity_activated' is not defined

In [8]:
tf.reset_default_graph() #Clear the Tensorflow graph.


total_episodes = 5000 #Set total number of episodes to train agent on.
max_steps = 999
update_frequency = 5

curiosity_activated = False

#Load the agent.
myAgent = agent(lr=1e-2, 
                s_size=env.observation_space.shape[0], 
                a_size=env.action_space.n, 
                h_size=8,
                curiosity_activated=curiosity_activated,
                intrinsic_reward_weight=10.0) 



init = tf.global_variables_initializer()

sess = tf.Session()
# Launch the tensorflow graph

sess.run(init)
i = 0
total_reward = []
total_length = []

gradBuffer = sess.run(tf.trainable_variables())
for ix, grad in enumerate(gradBuffer):
    gradBuffer[ix] = grad * 0

while i < total_episodes:
    s = env.reset()
    running_reward = 0
    ep_history = []
    for j in range(max_steps):
     
        #Probabilistically pick an action given our network outputs.
        a = myAgent.pick_action(s)
        
        #Get our reward for taking an action
        next_state, r, done,_ = env.step(a)
        
        ep_history.append([s, a, r, next_state])
        s = next_state
        running_reward += r
        
        if done:
            #Update the network.
            
            # Compute future discounted reward for each time step
            ep_history = np.array(ep_history)
            future_rewards = discount_rewards(ep_history[:, 2])
            
            feed_dict = { 
                myAgent.reward_holder: future_rewards,
                myAgent.action_holder: ep_history[:, 1],
                myAgent.state_in: np.vstack(ep_history[:, 0]),
                myAgent.next_state_in: np.vstack(ep_history[:, 3]),
                
            }
            
            
            if curiosity_activated:
                grads, _ = sess.run(myAgent.gradients_etc, feed_dict=feed_dict)
            else:
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                
            
            for idx, grad in enumerate(grads):
                gradBuffer[idx] += grad

            if i % update_frequency == 0 and i != 0:
                
                feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                _ = sess.run(myAgent.take_step, feed_dict=feed_dict)
                
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = np.zeros_like(grad)
            
            total_reward.append(running_reward)
            total_length.append(j)
            
            break

       
    if i % 100 == 0:
        print(np.mean(total_reward[-100:]))
    i += 1

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


probs: [ 0.34501144  0.30946103  0.34552759]
probs: [ 0.34501275  0.30957338  0.34541386]
probs: [ 0.34500784  0.30994728  0.34504491]
probs: [ 0.34497145  0.31013218  0.34489635]
probs: [ 0.34492633  0.31033063  0.34474301]
probs: [ 0.3448751   0.31078541  0.34433952]
probs: [ 0.34479371  0.31104481  0.34416148]
probs: [ 0.34470481  0.31131107  0.34398413]
probs: [ 0.34460905  0.31158218  0.34380877]
probs: [ 0.34450716  0.31185606  0.34363675]
probs: [ 0.34439814  0.31188601  0.34371588]
probs: [ 0.34430426  0.31187615  0.34381956]
probs: [ 0.3442263   0.31182662  0.34394705]
probs: [ 0.34416845  0.31222737  0.34360412]
probs: [ 0.34408465  0.3121765   0.34373888]
probs: [ 0.34402174  0.31257349  0.34340474]
probs: [ 0.34393579  0.3127611   0.34330314]
probs: [ 0.34384879  0.31294253  0.34320873]
probs: [ 0.34376141  0.31311646  0.34312212]
probs: [ 0.34367299  0.31303629  0.34329072]
probs: [ 0.34360659  0.31315291  0.34324044]
probs: [ 0.34354171  0.31326044  0.34319785]
probs: [ 0

KeyboardInterrupt: 

In [None]:
myAgent.gradients

In [9]:
env.render(close=True)
env.theta_threshold_radians = 24*2*np.pi/360

for i_episode in range(20):
    observation = env.reset()
    for t in range(200):
        env.render()
        #print(observation)
        action = myAgent.pick_action(observation)
        #print("action", action)
        observation, reward, done, info = env.step(action)
        # print(reward)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break

probs: [ 0.34616742  0.30734244  0.34649014]
probs: [ 0.34611741  0.30741355  0.34646904]
probs: [ 0.34594393  0.30766594  0.3463901 ]
probs: [ 0.34585553  0.30781615  0.34632838]
probs: [ 0.34575996  0.30798194  0.3462581 ]
probs: [ 0.345658    0.30816218  0.34617984]
probs: [ 0.34555033  0.30835554  0.34609416]
probs: [ 0.34543777  0.30856058  0.34600165]
probs: [ 0.34543562  0.30861276  0.34595159]
probs: [ 0.34545109  0.30863088  0.34591803]
probs: [ 0.34548402  0.30861479  0.34590119]
probs: [ 0.34530517  0.30889064  0.34580413]
probs: [ 0.34521624  0.30905494  0.34572881]
probs: [ 0.3451249   0.30922568  0.34564945]
probs: [ 0.34503183  0.30940154  0.34556657]
probs: [ 0.34505236  0.30941805  0.34552959]
probs: [ 0.34497881  0.30955762  0.34546357]
probs: [ 0.34501922  0.30953661  0.3454442 ]
probs: [ 0.34496579  0.30963776  0.34539643]
probs: [ 0.34479728  0.30990431  0.34529838]
probs: [ 0.34460789  0.31021494  0.3451772 ]
probs: [ 0.3445136   0.31040376  0.34508261]
probs: [ 0

KeyboardInterrupt: 