In [None]:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Create the CartPole-v1 environment that the later cells play episodes in.
env = gym.make('CartPole-v1')
# Reset to the start of an episode; being the cell's last expression, the returned
# value is what the notebook displays for this cell.
env.reset()

## Defining TRPO setup

In [None]:
def get_padded_gradients(loss, var_list):
    """Return d(loss)/d(var) for every variable, substituting zeros for missing gradients.

    ``tf.gradients`` yields ``None`` for a variable the loss does not depend on;
    such entries are replaced by a zero tensor so the result always lines up
    one-to-one with ``var_list``.

    Args:
        loss: tensor to differentiate.
        var_list: list of variables to differentiate with respect to.

    Returns:
        A list of tensors, one per entry of ``var_list``, in the same order.
    """
    gradients = tf.gradients(loss, var_list)
    return [
        # Build the placeholder zeros with the variable's own (non-reference) dtype so
        # that concatenating them with real gradients never hits a dtype mismatch.
        grad if grad is not None else tf.zeros(var.shape, dtype=var.dtype.base_dtype)
        for grad, var in zip(gradients, var_list)
    ]

def get_flattened_gradients(loss, var_list):
    """Return the gradient of ``loss`` w.r.t. ``var_list`` as one rank-1 tensor.

    Each per-variable gradient (zero-padded by ``get_padded_gradients``) is
    reshaped to a vector, and the vectors are concatenated in ``var_list`` order.
    """
    flat_pieces = []
    for gradient in get_padded_gradients(loss, var_list):
        flat_pieces.append(tf.reshape(gradient, [-1]))
    return tf.concat(flat_pieces, 0)

def sum_discounted_rewards(rewards, discount):
    """Compute the discounted return-to-go for every step of one episode.

    Element ``t`` of the result is
    ``rewards[t] + discount * rewards[t+1] + discount**2 * rewards[t+2] + ...``.

    Args:
        rewards: sequence of per-step rewards, in time order. It is not modified.
        discount: multiplicative discount factor applied per step.

    Returns:
        A new list with the same length as ``rewards``; an empty input gives ``[]``.
    """
    discounted_rewards = list(rewards)
    running_return = 0.0
    # Walk backwards so each step reuses the return already accumulated for the
    # step after it: G_t = r_t + discount * G_{t+1}.
    for index in reversed(range(len(discounted_rewards))):
        running_return = discounted_rewards[index] + discount * running_return
        discounted_rewards[index] = running_return
    return discounted_rewards

In [None]:
class RL_Agent(object):
    """Softmax policy network (4 input features, 2 actions) with TRPO helper ops.

    The network and every gradient op are built once in ``__init__``; the query
    methods only feed data into that fixed graph, so repeated calls do not add
    new nodes to the TensorFlow graph.
    """

    def __init__(self, model_name):
        """Build the policy graph under ``model_name`` and initialise its variables.

        Args:
            model_name: variable-scope name used as the prefix of the agent's variables.
        """
        self.model_name = model_name
        self.session = tf.Session()

        with tf.variable_scope(model_name) as scope:
            # Full scope name; used to pick out exactly this agent's variables.
            self.scope_name = scope.name

            self.input_layer = tf.placeholder(shape=[None, 4], dtype=tf.float32)
            self.dense1_layer = tf.layers.dense(self.input_layer,
                                                units=8, use_bias=True,
                                                activation=tf.nn.relu, name="dense1_weights"
                                               )

            # The output layer produces raw logits. A ReLU here would clamp the logits
            # to be non-negative and give zero gradient whenever both pre-activations
            # are negative, freezing the policy at uniform.
            self.dense2_layer = tf.layers.dense(self.dense1_layer,
                                                units=2, use_bias=True,
                                                activation=None, name="dense2_weights"
                                               )

            self.prob_layer = tf.nn.softmax(self.dense2_layer)
            # log_softmax is computed directly from the logits, avoiding log(0) = -inf
            # when a probability underflows.
            self.log_prob_layer = tf.nn.log_softmax(self.dense2_layer)

        self._build_gradient_ops()
        self.session.run(tf.global_variables_initializer())

    def _build_gradient_ops(self):
        """Create, once, the placeholders and ops used by the gradient query methods."""
        variables = self.model_variables()

        self._actions_input = tf.placeholder(shape=[None], dtype=tf.int32)
        self._rewards_input = tf.placeholder(shape=[None], dtype=tf.float32)
        self._vector_input = tf.placeholder(shape=[None], dtype=tf.float32)

        # Gradient of sum_t reward_t * log pi(action_t | state_t).
        action_mask = tf.one_hot(self._actions_input, depth=2, on_value=1.0, off_value=0.0, axis=-1)
        picked_log_prob_actions = tf.reduce_sum(action_mask * self.log_prob_layer, axis=1)
        weighted_log_prob_actions = picked_log_prob_actions * self._rewards_input
        self._grad_log_prob_op = get_flattened_gradients(weighted_log_prob_actions, variables)

        # Negated gradient of (grad(expected log-prob) . vector), averaged over the fed
        # states. The action probabilities are held constant by stop_gradient so only
        # the log-probabilities are differentiated.
        expected_log_prob = tf.reduce_sum(tf.stop_gradient(self.prob_layer) * self.log_prob_layer, axis=1)
        log_prob_grad = get_flattened_gradients(expected_log_prob, variables)
        grad_vector_product = tf.reduce_sum(log_prob_grad * self._vector_input)
        num_states = tf.cast(tf.shape(self.input_layer)[0], tf.float32)
        self._fisher_vector_product_op = -get_flattened_gradients(grad_vector_product, variables) / num_states

    def _to_numpy(self, value, dtype):
        """Convert an array-like or a TensorFlow tensor into a numpy array of ``dtype``.

        Tensors are evaluated in the agent's session so callers that pass
        ``tf.constant(...)`` keep working with the placeholder-based ops.
        """
        if isinstance(value, (tf.Tensor, tf.Variable)):
            value = self.session.run(value)
        return np.asarray(value, dtype=dtype)

    def model_variables(self):
        """Return the trainable variables created under this agent's variable scope."""
        # Match on the scope prefix, not a substring, so another scope whose name
        # merely contains this model's name is not picked up.
        prefix = self.scope_name + "/"
        return [x for x in tf.trainable_variables() if x.name.startswith(prefix)]

    def model_size(self):
        """Return the total number of scalar parameters in the model as a Python int."""
        # Variable shapes are static, so no session call or new graph op is needed.
        return sum(int(x.shape.num_elements()) for x in self.model_variables())

    def predict(self, states):
        """Return the action probabilities for each row of ``states``."""
        return self.session.run(self.prob_layer, feed_dict={self.input_layer: states})

    def grad_log_prob_actions(self, states, actions, rewards):
        """Return the reward-weighted sum of log-probability gradients for one game.

        By design this is meant to be called once per individual game.

        Args:
            states: states visited in the game, one row per step.
            actions: action index taken at each step (array-like or tensor).
            rewards: weight for each step, e.g. the discounted sum of future
                rewards (array-like or tensor).

        Returns:
            Flat numpy vector with one entry per model parameter.
        """
        feed_dict = {
            self.input_layer: states,
            self._actions_input: self._to_numpy(actions, np.int32),
            self._rewards_input: self._to_numpy(rewards, np.float32),
        }
        return self.session.run(self._grad_log_prob_op, feed_dict=feed_dict)

    def fisher_vector_product(self, states, vector):
        """Return the estimated Fisher information matrix multiplied by ``vector``.

        The estimate averages over all rows of ``states``; it is intended to be fed
        the states accumulated over a whole batch of games.

        Args:
            states: states to estimate over, one row per state.
            vector: flat vector with one entry per model parameter (array-like or tensor).

        Returns:
            Flat numpy vector with one entry per model parameter.
        """
        feed_dict = {
            self.input_layer: states,
            self._vector_input: self._to_numpy(vector, np.float32),
        }
        return self.session.run(self._fisher_vector_product_op, feed_dict=feed_dict)

In [None]:
# Start from an empty default graph so that re-running this cell rebuilds the agent
# from scratch instead of adding to the graph left by a previous run.
tf.reset_default_graph()

cartpole_model = RL_Agent("cartpole_model")
# Sanity checks: action probabilities for a fresh initial state, the agent's variables,
# and its parameter count. Single-argument print(...) prints the same on Python 2 and 3.
print(cartpole_model.predict(env.reset().reshape((1, 4))))
print(cartpole_model.model_variables())
print(cartpole_model.model_size())

In [None]:
class TRPO_Learner(object):
    """Collects game roll-outs and gradient estimates for a TRPO update of an agent."""

    def __init__(self, rl_agent, game_env, trpo_delta, discount, batch_size):
        """Store the agent, environment and hyper-parameters.

        Args:
            rl_agent: agent exposing ``session``, ``predict``, ``model_size`` and
                ``grad_log_prob_actions``.
            game_env: environment exposing ``reset()`` and ``step(action)``.
            trpo_delta: TRPO step-size parameter (stored; not used by the methods
                implemented so far).
            discount: reward discount factor passed to ``sum_discounted_rewards``.
            batch_size: number of games played per batch.
        """
        self.session = rl_agent.session
        self.agent = rl_agent
        self.env = game_env

        self.trpo_delta = trpo_delta
        self.discount = discount
        self.batch_size = batch_size

    def play_single_game(self):
        """Play one episode, sampling actions from the agent's predicted probabilities.

        Returns:
            A tuple ``(states, actions, rewards)``: ``states`` is an array with one row
            per step holding the observation the action was chosen from, ``actions`` is
            the list of sampled action indices and ``rewards`` the list of step rewards.
        """
        visited = []
        actions = []
        rewards = []

        observation = self.env.reset().reshape((1, -1))
        done = False

        while not done:
            visited.append(observation)
            prob_actions = self.agent.predict(observation)[0]
            action = np.random.choice(np.arange(len(prob_actions)), p=prob_actions)
            actions.append(action)
            observation, reward, done, info = self.env.step(action)
            observation = observation.reshape((1, -1))
            rewards.append(reward)

        # Stack once at the end instead of re-concatenating the growing array each step.
        states = np.concatenate(visited, axis=0)
        return states, actions, rewards

    def play_batch(self):
        """Play ``batch_size`` games and average their policy-gradient estimates.

        Returns:
            A tuple ``(grad_reward, all_states)``: the per-game gradients from
            ``agent.grad_log_prob_actions`` averaged over the batch, and the states of
            every game stacked row-wise into one array.
        """
        grad_reward = np.zeros(self.agent.model_size())
        all_states = []

        for _ in range(self.batch_size):
            states, actions, rewards = self.play_single_game()
            all_states.append(states)
            discounted_rewards = sum_discounted_rewards(rewards, self.discount)

            # Use the agent this learner was constructed with, not a notebook global.
            grad_reward += self.agent.grad_log_prob_actions(states,
                                                            tf.constant(actions),
                                                            tf.constant(discounted_rewards)
                                                           )

        # A single concatenate over the list works on Python 2 and 3 alike and avoids
        # the repeated copying of a pairwise reduce.
        return grad_reward / self.batch_size, np.concatenate(all_states, axis=0)

    def TRPO_step(self):
        """Collect one batch; the parameter update itself is not implemented yet.

        Returns:
            Always ``0`` for now.
        """
        grad_reward, obs_states = self.play_batch()
        return 0
                    

In [None]:
# Scratch check of numpy shape behaviour.
# Reshaping a length-5 vector with (-1, 1) gives shape (5, 1); this is not the cell's
# last expression, so the notebook does not display it.
np.zeros(5).reshape((-1, 1)).shape
# Concatenating two (1, 5) arrays along axis 0 stacks them into one (2, 5) array.
np.concatenate((np.zeros((1, 5)), np.ones((1, 5))), axis=0)

In [None]:
# Wire the learner to the agent and environment; hyper-parameters are passed by
# keyword so each number is labelled at the call site.
trpo = TRPO_Learner(
    rl_agent=cartpole_model,
    game_env=env,
    trpo_delta=0.05,
    discount=0.95,
    batch_size=50,
)

# Roll out a single episode to get sample data for the cells below.
states, actions, rewards = trpo.play_single_game()

In [None]:
# Gradient estimate for the single episode played above, weighting each step by its
# discounted return. Single-argument print(...) prints the same on Python 2 and 3.
print(cartpole_model.grad_log_prob_actions(states,
                                           tf.constant(actions),
                                           tf.constant(sum_discounted_rewards(rewards, trpo.discount))
                                          ))

In [None]:
# Play a full batch of games: grad_reward is the batch-averaged gradient estimate and
# obs_states holds the states of every game stacked row-wise.
grad_reward, obs_states = trpo.play_batch()

In [None]:
# Evaluate the Fisher-vector product on the batch states, using an all-ones vector with
# one entry per model parameter.
trpo.agent.fisher_vector_product(obs_states, tf.constant([1.0] * trpo.agent.model_size()))