In [1]:
import tensorflow as tf
from keras.layers import *
from keras.models import Model
import gym
import random

Using TensorFlow backend.


In [2]:
# Defining the constants
env_name = "Acrobot-v1"
learning_rate = 0.0001
batch_size = 32
stability_const = 0.01
max_steps = 1000
episodes = 10000
gamma = 0.95
should_save = True
embed_dim = 8
curiosity_level = 0.2
checkpoint = 20
optimizer = tf.train.AdamOptimizer(learning_rate)

In [3]:
# Creating the environment
env = gym.make(env_name)
action_space = env.action_space.n
state_space = env.observation_space.shape[0]

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [4]:
# Creating placeholders for all the variables
action_tf = tf.placeholder(dtype = tf.float32, shape = [None, action_space], name = "Action_Placeholder")
reward_tf = tf.placeholder(dtype = tf.float32, shape = [None, 1], name = "Reward_Placeholder")
state_tf  = tf.placeholder(dtype = tf.float32, shape = [None, state_space], name = "State_placeholder")
next_state_tf = tf.placeholder(dtype = tf.float32, shape = [None, state_space], name = "Next_state_placeholder")

In [None]:
def crossEntropy(logits, predictions):
    term1 = -logits * tf.log(tf.clip_by_value(predictions, clip_value_min = 1e-10, clip_value_max = 1))
    term2 = -(1 - logits) * tf.log(tf.clip_by_value(1 - predictions, clip_value_min = 1e-10, clip_value_max = 1))
    cross_entropy = tf.reduce_mean(term1 + term2)
    return cross_entropy

In [None]:
# Building the network

# Building the embedding layer model
actions_tf = Input(shape = (None, action_space))
states_tf  = Input(shape = (None, state_space))
embedding1 = Input(shape = (None, embed_dim))
embedding2 = Input(shape = (None, embed_dim))

# Creating the embedder
Embed1 = Dense(16, activation = "relu")(states_tf)
Embed2 = Dense(embed_dim, activation = "relu")(Embed1)

Embedder = Model(inputs = [states_tf], outputs = [Embed2])

# Inverse model
A_layer1a = Dense(10, activation = "relu")(embedding1)
A_layer1b = Dense(10, activation = "relu")(embedding2)
A_layer2  = concatenate([A_layer1a, A_layer1b])
A_preds   = Dense(action_space, activation = "linear")(A_layer2)

InverseModel = Model(inputs = [embedding1, embedding2], outputs = [A_preds])

# Creating the forward model
F_layer1a = Dense(10, activation = "relu")(actions_tf)
F_layer1b = Dense(10, activation = "relu")(embedding1)
F_layer2  = concatenate([F_layer1a, F_layer1b])
F_preds   = Dense(embed_dim, activation = "linear")(F_layer2)

ForwardModel = Model(inputs = [actions_tf, embedding1], outputs = [F_preds])

# Creating sequential model policy prediction
AC_A_layer1 = Dense(16, activation = "relu")(states_tf)
AC_A_layer2 = Dense(8, activation = "relu")(AC_A_layer1)
AC_A_preds = Dense(action_space, activation = "softmax")(AC_A_layer2)

#Creating the layers for the q value prediction
C_layer3 = Dense(5, activation = "relu")(actions_tf)
C_layer4 = concatenate([C_layer3, AC_A_layer2])
Q_values = Dense(1, activation = "linear")(C_layer4)

#Creating the layers for the value of the state
V_values = Dense(1, activation = "linear")(AC_A_layer2)

#Creating the advantage function
Adv_values = Subtract()([Q_values, V_values])

Actor = Model(inputs = [states_tf], outputs = [AC_A_preds])
Critic = Model(inputs = [states_tf, actions_tf], outputs = [Adv_values])

In [None]:
# Getting the embedder outputs for the state and next state
state_embeds = Embedder([state_tf])
next_state_embeds = Embedder([next_state_tf])

# Feeding the embedded state to the inverse model and finding the loss
action_preds = InverseModel([state_embeds, next_state_embeds])
inverse_loss = crossEntropy(action_tf, action_preds)

# Feeding the states to the forward model to predict next state
next_state_preds = ForwardModel([action_tf, state_embeds])
intrinsic_reward = tf.losses.cosine_distance(next_state_embeds, next_state_preds, axis = 1)

#Building the actor loss
ac_action_preds = Actor([state_tf])
advantage_values = Critic([state_tf, action_tf])
policy_loss = crossEntropy(action_tf, ac_action_preds) * advantage_values

#Building the critic loss
action_preds_next = tf.argmax(Actor([next_state_tf]), axis = 1)
next_best_action  = tf.one_hot(action_preds_next, action_space)
reward_preds_next = Critic([next_state_tf, next_best_action])
td_loss = stability_const * (gamma * reward_preds_next + reward_tf - advantage_values)

#Finding the total loss and minimizing
total_loss = tf.reduce_mean(td_loss + policy_loss + inverse_loss - curiosity_level * intrinsic_reward)
train = optimizer.minimize(total_loss)

# Creating an operation for training
train = optimizer.minimize(total_loss)

In [None]:
#Creating an init operation
init = tf.global_variables_initializer()

#Creating a session and initialzing all variables
config = tf.ConfigProto(log_device_placement = True)
config.gpu_options.allow_growth = True
sess = tf.Session(config = config)

In [None]:
#Saving the weights
saver_path = './Acrobot_Weights/saved_weights.ckpt'
saver = tf.train.Saver()

#Initialzing all variables
#saver.restore(sess, saver_path)
sess.run(init)

In [None]:
#Training the agent
average_reward = 0
average_steps  = 0
for episode in range(episodes):
    steps = 0
    total_reward = 0
    state   = env.reset()
    prev_states = list()
    actions = list()
    states  = list()
    rewards = list()
    while True:
        prev_state = state
        prev_states.append(state)
        actions_dist = sess.run(ac_action_preds, feed_dict = {state_tf: np.array(state).reshape(1, state_space)})
        action  = np.random.choice(np.arange(0, action_space), p = actions_dist.ravel())
        action_ = np.zeros(action_space)
        action_[action] = 1
        state, reward_ext, done, _ = env.step(action)
        
        # Getting the intrinsic reward from curiosity
        reward = sess.run(intrinsic_reward, feed_dict = {
            state_tf: np.array(prev_state).reshape(1, state_space),
            next_state_tf : np.array(state).reshape(1, state_space),
            action_tf : action_.reshape(1, action_space) 
        })
        
        env.render()
        
        #Storing data in experience replay
        states.append(state)
        actions.append(action_)
        rewards.append(reward)
        
        #Incrementing step and reward
        steps += 1 
        total_reward += reward_ext
            
        if done or steps > max_steps:
            #Training model
            states[steps - 1] = np.zeros(state_space)
            next_states = np.array(states).reshape(steps, state_space)
            states = np.array(prev_states).reshape(steps, state_space)
            actions = np.array(actions).reshape(steps, action_space)
            rewards = np.array(rewards).reshape(steps, 1)
            
            sess.run(train, feed_dict = {
                state_tf: states,
                action_tf: actions,
                reward_tf: rewards,
                next_state_tf: next_states
            })
                
            break
            
    #Adding to the average reward and average steps
    average_reward += total_reward
    average_steps += steps
    
    if episode % checkpoint == 0:
        #Printing stats
        print("======================================================================================================")
        print("Average extrinsic reward : ", average_reward / checkpoint)
        print("Average steps ", average_steps / checkpoint)
        print("Episode Number ", episode)
        print("======================================================================================================")
        if should_save:
            saver.save(sess, saver_path)
#         if average_steps / checkpoint == 200:
#             break

        #Resetting the stats
        average_reward = 0
        average_steps  = 0

Average extrinsic reward :  -25.0
Average steps  25.0
Episode Number  0
Average extrinsic reward :  -487.4
Average steps  487.5
Episode Number  20
Average extrinsic reward :  -488.05
Average steps  488.2
Episode Number  40
Average extrinsic reward :  -488.55
Average steps  488.75
Episode Number  60
Average extrinsic reward :  -471.2
Average steps  471.5
Episode Number  80
Average extrinsic reward :  -473.4
Average steps  473.75
Episode Number  100
Average extrinsic reward :  -473.1
Average steps  473.45
Episode Number  120
Average extrinsic reward :  -454.85
Average steps  455.4
Episode Number  140
Average extrinsic reward :  -447.3
Average steps  447.7
Episode Number  160
Average extrinsic reward :  -433.6
Average steps  434.1
Episode Number  180
Average extrinsic reward :  -444.25
Average steps  444.75
Episode Number  200
Average extrinsic reward :  -454.65
Average steps  455.15
Episode Number  220
Average extrinsic reward :  -437.6
Average steps  438.2
Episode Number  240
Average ex

In [None]:
#Testing the agent
test_length    = 20
average_reward = 0
average_steps  = 0
for i in range(test_length):
    steps = 0
    rewards = 0
    state = env.reset()
    while True:
        env.render()
        actions_dist = sess.run(ac_action_preds, feed_dict = {state_tf: np.array(state).reshape(1, state_space)})
        action  = np.random.choice(np.arange(0, action_space), p = actions_dist.ravel())
        state, reward, done, _ = env.step(action)
        steps += 1
        rewards += reward
        if done:
            break
    average_reward += rewards
    average_steps  += steps

print("======================================================================================================")
print("Average steps ", average_steps / test_length)
print("Average reward ", average_reward / test_length)
print("======================================================================================================")