Notebook written by [Zhedong Zheng](https://github.com/zhedongzheng)

<img src="img/pg.png" width="400">

In [1]:
import tensorflow as tf
import numpy as np
import gym

In [2]:
class Agent:
    def __init__(self, env, n_in, n_out, sess):
        self.env = env
        self.n_in = n_in
        self.n_out = n_out
        self.sess = sess
        self.build_graph()
    # end constructor


    def build_graph(self):
        self.ph_obs = tf.placeholder(tf.float32, shape=[None, self.n_in])
        self.ph_rewards = tf.placeholder(tf.float32, shape=[None])
        self.ph_actions = tf.placeholder(tf.int32, shape=[None])
        
        x = tf.layers.dense(self.ph_obs, 10, tf.tanh)
        logits = tf.layers.dense(x, self.n_out)
        
        self.op_sample = tf.multinomial(tf.nn.log_softmax(logits), num_samples=1)
        self.op_action = tf.argmax(logits, -1)

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.ph_actions,
                                                              logits=logits)
        self.op_loss = tf.reduce_mean(loss * self.ph_rewards)
        self.op_train = tf.train.AdamOptimizer(2e-3).minimize(self.op_loss)
    # end method


    def train(self,
              nb_episodes=500,
              nb_games_per_update=10,
              nb_max_steps=1000,
              discount_rate=0.99):
        
        self.sess.run(tf.global_variables_initializer())

        for episode in range(nb_episodes):
            ep_rewards = []                         # rewards in one eposide
            ep_obs = []
            ep_actions = []

            for game in range(nb_games_per_update):
                game_rewards = []                   # rewards in one round of game
                obs = self.env.reset()
                for step in range(nb_max_steps):
                    action_val = self.sess.run(self.op_sample,
                                              {self.ph_obs: np.atleast_2d(obs)})
                    obs, reward, done, info = self.env.step(action_val[0,0])
                    ep_obs.append(obs)
                    ep_actions.append(action_val[0,0])
                    reward = -5 if done else reward
                    game_rewards.append(reward)
                    if done:
                        break
                ep_rewards.append(game_rewards)

            ep_rewards = self.discount_and_normalize_rewards(ep_rewards, discount_rate)
            flat_rewards = np.concatenate(ep_rewards)
            _, loss = self.sess.run([self.op_train, self.op_loss],
                                    {self.ph_obs: np.vstack(ep_obs),
                                     self.ph_rewards: flat_rewards,
                                     self.ph_actions: np.array(ep_actions)})
            print("Episode %d/%d | Loss: %.3f | Step: %d" % (episode, nb_episodes, loss, step))
    # end method


    def simulate(self):
        obs = self.env.reset()
        done = False
        count = 0
        while not done:
            self.env.render()
            action_val = self.sess.run(self.op_action,
                                      {self.ph_obs: np.atleast_2d(obs)})
            obs, reward, done, info = self.env.step(action_val[0])
            count += 1
        print("Test Time: %d Steps Completed" % count)
    # end method


    def discount_rewards(self, game_rewards, discount_rate):
        discounted_rewards = np.zeros(len(game_rewards))
        cumulative_rewards = 0
        for step in reversed(range(len(game_rewards))):
            cumulative_rewards = game_rewards[step] + cumulative_rewards * discount_rate
            discounted_rewards[step] = cumulative_rewards
        return discounted_rewards
    # end method


    def discount_and_normalize_rewards(self, ep_rewards, discount_rate):
        discounted = [self.discount_rewards(game_rewards, discount_rate) for game_rewards in ep_rewards]
        flat_rewards = np.concatenate(discounted)
        reward_mean = flat_rewards.mean()
        reward_std = flat_rewards.std()
        return [(game_rewards - reward_mean) / reward_std for game_rewards in discounted]
    # end method
# end class

In [3]:
def main():
    agent = Agent(env = gym.make('CartPole-v1'),
                  n_in = 4,
                  n_out = 2,
                  sess = tf.Session())
    agent.train()
    agent.simulate()

In [4]:
main()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode 0/500 | Loss: 0.055 | Step: 18
Episode 1/500 | Loss: 0.080 | Step: 13
Episode 2/500 | Loss: 0.110 | Step: 9
Episode 3/500 | Loss: 0.049 | Step: 14
Episode 4/500 | Loss: 0.040 | Step: 13
Episode 5/500 | Loss: 0.026 | Step: 9
Episode 6/500 | Loss: 0.065 | Step: 16
Episode 7/500 | Loss: -0.052 | Step: 13
Episode 8/500 | Loss: 0.053 | Step: 13
Episode 9/500 | Loss: 0.034 | Step: 10
Episode 10/500 | Loss: 0.062 | Step: 13
Episode 11/500 | Loss: 0.087 | Step: 9
Episode 12/500 | Loss: 0.058 | Step: 10
Episode 13/500 | Loss: 0.040 | Step: 13
Episode 14/500 | Loss: 0.052 | Step: 13
Episode 15/500 | Loss: 0.023 | Step: 18
Episode 16/500 | Loss: -0.070 | Step: 9
Episode 17/500 | Loss: 0.005 | Step: 15
Episode 18/500 | Loss: 0.011 | Step: 10
Episode 19/500 | Loss: 0.003 | Step: 10
Episode 20/500 | Loss: 0.032 | Step: 15
Episode 21/500 | Loss: 0.057 | Step: 12
Episode 22/500 | Loss: 0

Episode 197/500 | Loss: -0.009 | Step: 33
Episode 198/500 | Loss: -0.007 | Step: 190
Episode 199/500 | Loss: -0.011 | Step: 319
Episode 200/500 | Loss: -0.023 | Step: 102
Episode 201/500 | Loss: -0.014 | Step: 190
Episode 202/500 | Loss: -0.020 | Step: 211
Episode 203/500 | Loss: -0.021 | Step: 187
Episode 204/500 | Loss: -0.012 | Step: 305
Episode 205/500 | Loss: -0.019 | Step: 170
Episode 206/500 | Loss: -0.021 | Step: 134
Episode 207/500 | Loss: -0.025 | Step: 102
Episode 208/500 | Loss: -0.028 | Step: 53
Episode 209/500 | Loss: -0.022 | Step: 56
Episode 210/500 | Loss: -0.022 | Step: 60
Episode 211/500 | Loss: -0.016 | Step: 123
Episode 212/500 | Loss: -0.019 | Step: 213
Episode 213/500 | Loss: -0.013 | Step: 121
Episode 214/500 | Loss: -0.018 | Step: 202
Episode 215/500 | Loss: -0.018 | Step: 245
Episode 216/500 | Loss: -0.020 | Step: 109
Episode 217/500 | Loss: -0.016 | Step: 106
Episode 218/500 | Loss: -0.028 | Step: 138
Episode 219/500 | Loss: -0.024 | Step: 40
Episode 220/500 

Episode 389/500 | Loss: -0.019 | Step: 499
Episode 390/500 | Loss: -0.027 | Step: 499
Episode 391/500 | Loss: -0.014 | Step: 234
Episode 392/500 | Loss: -0.029 | Step: 401
Episode 393/500 | Loss: -0.014 | Step: 499
Episode 394/500 | Loss: -0.014 | Step: 396
Episode 395/500 | Loss: -0.022 | Step: 499
Episode 396/500 | Loss: -0.018 | Step: 418
Episode 397/500 | Loss: -0.024 | Step: 82
Episode 398/500 | Loss: -0.019 | Step: 243
Episode 399/500 | Loss: -0.034 | Step: 288
Episode 400/500 | Loss: -0.031 | Step: 499
Episode 401/500 | Loss: -0.007 | Step: 82
Episode 402/500 | Loss: -0.011 | Step: 499
Episode 403/500 | Loss: -0.016 | Step: 143
Episode 404/500 | Loss: -0.028 | Step: 444
Episode 405/500 | Loss: -0.012 | Step: 499
Episode 406/500 | Loss: -0.017 | Step: 499
Episode 407/500 | Loss: -0.016 | Step: 227
Episode 408/500 | Loss: -0.008 | Step: 24
Episode 409/500 | Loss: -0.010 | Step: 499
Episode 410/500 | Loss: -0.011 | Step: 499
Episode 411/500 | Loss: -0.022 | Step: 277
Episode 412/50