In [2]:
import tensorflow as tf
import gym
import numpy as np
import copy

## A2C

Actor-Critic Architecture

In [3]:
class A2C:
    """Actor and critic networks built together inside one TensorFlow variable scope.

    The critic maps an observation batch to a single value prediction per row;
    the actor maps the same batch to a softmax distribution over
    ``action_space.n`` discrete actions, from which an action is sampled.
    """

    def __init__(self, name : str, obs_space, action_space, sess):
        """Create the placeholder, both networks and the sampling op under scope ``name``.

        Args:
            name: variable-scope name under which every variable of this instance lives.
            obs_space: observation space; only its ``shape`` is read.
            action_space: action space; only its ``n`` is read.
            sess: TensorFlow session used by ``get_action`` / ``get_value``.
        """
        self.sess = sess
        self.scope_name = name
        self.observation = obs_space
        self.action = action_space

        with tf.variable_scope(self.scope_name):
            # batch of observations fed to both networks
            batch_shape = [None] + list(self.observation.shape)
            self.inputs = tf.placeholder(shape = batch_shape, dtype = tf.float32)

            # critic first, then actor
            self.build_network()

            # stochastic action: draw one sample per row and flatten to 1-D
            sampled = tf.multinomial(tf.log(self.act_probs), 1)
            self.act = tf.reshape(sampled, shape = [-1])

    def build_network(self):
        """Build the critic (``self.value``) and the actor (``self.act_probs``)."""
        # critic network gives out a value prediction for the given inputs
        with tf.variable_scope('critic', reuse = tf.AUTO_REUSE):
            hidden = self.inputs
            for width in (16, 32):
                hidden = tf.layers.dense(hidden, units = width, activation = tf.tanh)
            self.value = tf.layers.dense(hidden, units = 1, activation = None)

        # actor network outputs action probabilities
        with tf.variable_scope('actor', reuse = tf.AUTO_REUSE):
            hidden = self.inputs
            for width in (16, 32, 16):
                hidden = tf.layers.dense(hidden, units = width, activation = tf.tanh)
            self.act_probs = tf.layers.dense(hidden, self.action.n, activation = tf.nn.softmax)

    def get_action(self, inputs):
        """Run the sampling op on ``inputs`` and return the sampled actions."""
        feed = {self.inputs : inputs}
        return self.sess.run(self.act, feed_dict = feed)

    def get_value(self, inputs):
        """Run the critic on ``inputs`` and return its value predictions."""
        feed = {self.inputs : inputs}
        return self.sess.run(self.value, feed_dict = feed)

    def trainable_vars(self):
        """Return the trainable variables collected under this instance's scope name.

        These are needed later to copy parameters between policies.
        """
        collection = tf.GraphKeys.TRAINABLE_VARIABLES
        return tf.get_collection(collection, self.scope_name)

## PPO

In [4]:
class PPO:
    """Proximal Policy Optimization with a clipped surrogate objective.

    Holds two ``A2C`` networks: ``pi`` (trained) and ``old_pi`` (a frozen copy
    refreshed by ``update_old_policy``). The minimised loss is
    ``-(clipped_surrogate - clip1 * value_loss + clip2 * entropy)``.
    """

    def __init__(self, env, sess, eps = 0.2, gamma = 0.95, clip1=1, clip2=0.01, learning_rate = 5e-5):
        """Build both policies, the loss and the training op.

        Args:
            env: environment; its ``observation_space`` and ``action_space`` are
                passed to the ``A2C`` networks.
            sess: TensorFlow session used for every ``run`` call.
            eps: the probability ratio is clipped to ``[1 - eps, 1 + eps]``.
            gamma: discount factor for the TD target and the advantage estimates.
            clip1: coefficient of the value loss inside the total loss.
            clip2: coefficient of the entropy bonus inside the total loss.
            learning_rate: Adam learning rate.
        """
        self.sess = sess
        self.eps = eps
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.clip1 = clip1
        self.clip2 = clip2
        # probabilities are clipped into this range before taking the log
        self.act_clip_max = 1
        self.act_clip_min = 1e-10

        self.pi = A2C("pi", env.observation_space, env.action_space, self.sess)
        self.old_pi = A2C("old_pi", env.observation_space, env.action_space, self.sess)

        self.pi_trainable_params = self.pi.trainable_vars()
        self.old_pi_trainable_params = self.old_pi.trainable_vars()

        # ops that copy every pi variable into the matching old_pi variable
        with tf.variable_scope('update_policy'):
            self.update_ops = [old_pi_vals.assign(pi_vals)
                               for pi_vals, old_pi_vals in zip(self.pi_trainable_params, self.old_pi_trainable_params)]

        with tf.variable_scope('training_inputs'):
            self.actions = tf.placeholder(shape = [None], dtype=tf.int32)
            self.rewards = tf.placeholder(shape = [None], dtype=tf.float32)
            self.v_next = tf.placeholder(shape = [None], dtype=tf.float32)
            self.adv = tf.placeholder(shape = [None], dtype=tf.float32)

        # probability each policy assigns to the action that was actually taken
        act_probs = self.hotify_action(self.pi.act_probs)
        act_old_probs = self.hotify_action(self.old_pi.act_probs)

        with tf.variable_scope("loss"):
            # probability ratio computed in log space, clipped away from log(0)
            log_probs = tf.log(tf.clip_by_value(act_probs, self.act_clip_min, self.act_clip_max))
            old_log_probs = tf.log(tf.clip_by_value(act_old_probs, self.act_clip_min, self.act_clip_max))
            ratio = tf.exp(log_probs - old_log_probs)

            # clipped surrogate objective
            clipped_ratio = tf.clip_by_value(ratio, 1 - self.eps, 1 + self.eps)
            surrogate = tf.multiply(ratio, self.adv)
            surrogate_clipped = tf.multiply(clipped_ratio, self.adv)
            clipped_loss = tf.reduce_mean(tf.minimum(surrogate, surrogate_clipped))

            # entropy bonus of the current policy
            entropy = self.get_entropy(self.pi.act_probs)

            # The critic output has shape [batch, 1] while the TD target has shape
            # [batch]. Without the squeeze the subtraction broadcasts to
            # [batch, batch] and every prediction is compared with every target.
            value = tf.squeeze(self.pi.value, axis = 1)
            target = self.rewards + self.gamma * self.v_next
            loss_value = tf.reduce_sum(tf.squared_difference(target, value))

            self.loss = -(clipped_loss - self.clip1 * loss_value + self.clip2 * entropy)
            self.loss_plot = tf.summary.scalar('loss', self.loss)

        opt = tf.train.AdamOptimizer(self.learning_rate, epsilon=1e-5)
        # Only pi's variables are optimised; old_pi changes solely through update_ops.
        self.gradients = opt.compute_gradients(self.loss, var_list = self.pi_trainable_params)
        # Reuse the gradients computed above instead of building them a second time.
        self.train_op = opt.apply_gradients(self.gradients)

    # get action given state
    def get_action(self, inputs):
        """Sample actions from the current policy for ``inputs``."""
        return self.pi.get_action(inputs)

    # get value estimate given state
    def get_value(self, inputs):
        """Return the current critic's value predictions for ``inputs``."""
        return self.pi.get_value(inputs)

    # update old policy network to the new network parameters
    def update_old_policy(self):
        """Copy every trainable variable of ``pi`` into ``old_pi``."""
        self.sess.run(self.update_ops)

    def train_policy(self, inputs, actions, rewards, v_next, advantages):
        """Run one optimisation step on a batch of transitions.

        The same ``inputs`` are fed to both policies so the probability ratio is
        evaluated on identical observations.
        """
        self.sess.run(self.train_op, feed_dict = {self.pi.inputs : inputs,
                                                  self.old_pi.inputs : inputs,
                                                  self.actions: actions,
                                                  self.rewards: rewards,
                                                  self.v_next: v_next,
                                                  self.adv: advantages})

    # get advantage estimates
    def get_gaes(self, rewards, v_preds, v_preds_next, lam = 1.0):
        """Compute generalised advantage estimates for one trajectory.

        Args:
            rewards: per-step rewards.
            v_preds: value prediction for each step's state.
            v_preds_next: value prediction for each step's successor state.
            lam: GAE lambda; the default 1.0 reproduces the plain discounted sum
                of TD residuals.

        Returns:
            A new list with one advantage estimate per step.
        """
        # TD residuals: r_t + gamma * V(s_{t+1}) - V(s_t)
        gaes = [r_t + self.gamma * v_next - v for r_t, v_next, v in zip(rewards, v_preds_next, v_preds)]
        # accumulate backwards so each step includes the discounted future residuals
        for t in reversed(range(len(gaes) - 1)):
            gaes[t] = gaes[t] + self.gamma * lam * gaes[t + 1]
        return gaes

    def hotify_action(self, action):
        """Select, per row, the entry of ``action`` indexed by ``self.actions``.

        Implemented as a one-hot mask followed by a row-wise sum.
        """
        mask = tf.one_hot(self.actions, action.shape[1])
        return tf.reduce_sum(action * mask, 1)

    def get_entropy(self, act_probs):
        """Return the mean (over rows) entropy of the distributions in ``act_probs``."""
        entropy = -tf.reduce_sum(act_probs * tf.log(tf.clip_by_value(act_probs, self.act_clip_min, self.act_clip_max)), 1)
        return tf.reduce_mean(entropy, 0)

In [10]:
def epoch_train(num_epochs, ppo, obs, actions, adv, rewards, v_preds_next, batch_size = 32):
    """Run ``num_epochs`` training steps on random mini-batches of one rollout.

    Each step draws ``batch_size`` row indices uniformly with replacement and
    passes the matching rows of every array to ``ppo.train_policy``.

    Args:
        num_epochs: number of mini-batch updates to perform.
        ppo: object exposing ``train_policy(inputs, actions, rewards, v_next, advantages)``.
        obs: array whose first axis indexes transitions.
        actions, adv, rewards, v_preds_next: arrays aligned with ``obs`` on axis 0.
        batch_size: number of transitions sampled per update (default 32).
    """
    num_transitions = obs.shape[0]
    if num_transitions == 0:
        # nothing to sample from; np.random.randint(0, 0) would raise
        return

    transitions = [obs, actions, adv, rewards, v_preds_next]

    for _ in range(num_epochs):
        # random sampling (with replacement) of row indices shared by all arrays
        index = np.random.randint(0, num_transitions, size = batch_size)
        samples = [np.take(transition, index, axis=0) for transition in transitions]

        # training
        ppo.train_policy(inputs = samples[0],
                         actions = samples[1],
                         advantages = samples[2],
                         rewards = samples[3],
                         v_next = samples[4])

In [11]:
def preprocess(input):
    """Return a copy of ``input`` with a new leading axis of length 1.

    Turns a single observation into a batch of one.
    """
    single = np.array(input)
    return single[np.newaxis, ...]

In [12]:
def z_score(input):
    """Standardise ``input`` to zero mean and unit standard deviation.

    When the standard deviation is exactly zero (all values equal, or a single
    value) the centred values are returned as zeros, because dividing would
    produce NaN.

    Args:
        input: array-like of numbers.

    Returns:
        An array of the same shape holding the standardised values.
    """
    values = np.asarray(input)
    centered = values - values.mean()
    spread = values.std()
    if spread == 0:
        # 0 / 0 would yield NaN; a constant input carries no relative information
        return np.zeros_like(centered)
    return centered / spread

## Hyperparameters

In [13]:
# mini-batch updates performed after each collected episode
num_of_epochs = 6
# outer-loop count of the runner; each iteration collects one full episode
iterations = 5000

## PPO runner

In [14]:
tf.reset_default_graph()

with tf.Session() as sess:
    env = gym.make('CartPole-v0')
    ppo = PPO(env, sess)

    # Create the writer only after the model is built: FileWriter serialises the
    # graph it is given at construction time, so creating it earlier logs an
    # empty graph to TensorBoard.
    tensor_plot = tf.summary.FileWriter('log/ppo', graph = sess.graph)

    try:
        sess.run(tf.global_variables_initializer())
        state = env.reset()

        for i in range(iterations):
            # per-episode buffers
            obs = []
            actions = []
            rewards = []
            values = []
            length = 0

            if i % 1000 == 0:
                print('Episode number: {}'.format(i))

            # roll out one full episode with the current policy
            while True:
                length += 1

                state = preprocess(state)

                # ndarray.item() replaces the deprecated np.asscalar
                action = ppo.get_action(state).item()
                value = ppo.get_value(state).item()

                next_state, reward, done, _ = env.step(action)

                obs.append(state)
                actions.append(action)
                rewards.append(reward)
                values.append(value)

                if done:
                    # NOTE(review): the value of the final state is bootstrapped from
                    # the critic even when the episode truly terminated - confirm
                    # this is intended rather than using 0 for terminal states.
                    next_state = preprocess(next_state)
                    next_value = ppo.get_value(next_state).item()
                    v_preds_next = values[1:] + [next_value]
                    state = env.reset()
                    break
                else:
                    state = next_state

            tensor_plot.add_summary(tf.Summary(value = [tf.Summary.Value(tag="my_ppo_episode_rewards", simple_value = sum(rewards))]), i)
            tensor_plot.add_summary(tf.Summary(value = [tf.Summary.Value(tag="my_ppo_episode_length", simple_value = length)]), i)

            adv = ppo.get_gaes(rewards, values, v_preds_next)

            # stack the per-step batches of one into a single (steps, *obs_shape) array
            obs = np.reshape(obs, newshape=(-1,) + env.observation_space.shape)

            rewards = np.array(rewards)
            v_preds_next = np.array(v_preds_next)
            actions = np.array(actions)
            adv = z_score(np.array(adv))

            # freeze the current policy as the reference before updating it
            ppo.update_old_policy()

            epoch_train(num_of_epochs, ppo, obs, actions, adv, rewards, v_preds_next)
    finally:
        # release the log file and the environment even if training is interrupted
        tensor_plot.close()
        env.close()

Episode number: 0
Episode number: 1000
Episode number: 2000
Episode number: 3000
Episode number: 4000


## Baselines Implementation of PPO

This cell trains the PPO1 implementation from the stable-baselines repository so that my implementation can be compared against it.

In [15]:
from stable_baselines import PPO1

# Reference PPO1 agent on the same environment; logs go next to my PPO logs.
model = PPO1(
    policy='MlpPolicy',
    env='CartPole-v0',
    verbose=1,
    tensorboard_log="log/ppo/",
)
model.learn(total_timesteps=10000)

ModuleNotFoundError: No module named 'stable_baselines'

## Baselines Implementation of TRPO

This cell trains the TRPO implementation from the stable-baselines repository so that my implementation can be compared against it.

In [10]:
from stable_baselines import TRPO

# Reference TRPO agent on the same environment, logged to its own directory.
model = TRPO(
    policy='MlpPolicy',
    env='CartPole-v0',
    verbose=1,
    tensorboard_log="log/trpo/",
)
model.learn(total_timesteps=10000)

Creating environment from the given name, wrapped in a DummyVecEnv.
********** Iteration 0 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.518 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.109 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0187          0
         1   4.86e-05      0.104
         2   3.31e-05      0.346
         3   2.52e-05      0.363
         4   1.08e-07       0.49
         5   1.79e-06       0.49
         6    1.1e-09      0.492
         7   2.73e-11      0.492
[35mdone in 0.154 seconds[0m
Expected: 0.019 Actual: 0.018
Stepsize OK!
[35mvf[0m
[35mdone in 0.076 seconds[0m
---------------------------------
| EpLenMean       | 21.1        |
| EpRewMean       | 21.1        |
| EpThisIter      | 48          |
| EpisodesSoFar   | 48          |
| TimeElapsed     | 0.936       |
| TimestepsSoFar  | 1024        |
| entloss         | 0.0         |
| entropy         | 0.6851318   |
| ev_tdlam_before | -0.00177    |
| mea

<stable_baselines.trpo_mpi.trpo_mpi.TRPO at 0x1e422e929e8>

## Monitor Tensorboard for plots

1. Change directory into this project directory
2. Execute the following command
    `tensorboard --logdir=log/`
3. Open the localhost URL (including the port number) that TensorBoard prints in the terminal to view the plots