Policy gradient algorithm and agent with neural network policy
Chapter 2, TensorFlow 2 Reinforcement Learning Cookbook | Praveen Palanisamy

In [1]:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import gym

In [2]:
class PolicyNet(keras.Model):
    def __init__(self, action_dim=1):
        super(PolicyNet, self).__init__()
        self.fc1 = layers.Dense(24, activation="relu")
        self.fc2 = layers.Dense(36, activation="relu")
        self.fc3 = layers.Dense(action_dim, activation="softmax")

    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

    def process(self, observations):
        # Process batch observations using `call(x)` behind-the-scenes
        action_probabilities = self.predict_on_batch(observations)
        return action_probabilities

In [3]:
class Agent(object):
    def __init__(self, action_dim=1):
        """Agent with a neural-network brain powered policy

        Args:
            action_dim (int): Action dimension
        """
        self.policy_net = PolicyNet(action_dim=action_dim)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        self.gamma = 0.99

    def policy(self, observation):
        observation = observation.reshape(1, -1)
        observation = tf.convert_to_tensor(observation, dtype=tf.float32)
        action_logits = self.policy_net(observation)
        action = tf.random.categorical(tf.math.log(action_logits), num_samples=1)
        return action

    def get_action(self, observation):
        action = self.policy(observation).numpy()
        return action.squeeze()

    def learn(self, states, rewards, actions):
        discounted_reward = 0
        discounted_rewards = []
        rewards.reverse()
        for r in rewards:
            discounted_reward = r + self.gamma * discounted_reward
            discounted_rewards.append(discounted_reward)
            discounted_rewards.reverse()

        for state, reward, action in zip(states, discounted_rewards, actions):
            with tf.GradientTape() as tape:
                action_probabilities = self.policy_net(np.array([state]), training=True)
                loss = self.loss(action_probabilities, action, reward)
            grads = tape.gradient(loss, self.policy_net.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.policy_net.trainable_variables)
            )

    def loss(self, action_probabilities, action, reward):
        dist = tfp.distributions.Categorical(
            probs=action_probabilities, dtype=tf.float32
        )
        log_prob = dist.log_prob(action)
        loss = -log_prob * reward
        return loss

In [4]:
def train(agent: Agent, env: gym.Env, episodes: int, render=True):
    """Train `agent` in `env` for `episodes`

    Args:
        agent (Agent): Agent to train
        env (gym.Env): Environment to train the agent
        episodes (int): Number of episodes to train
        render (bool): True=Enable/False=Disable rendering; Default=True
    """

    for episode in range(episodes):
        done = False
        state = env.reset()
        total_reward = 0
        rewards = []
        states = []
        actions = []
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards.append(reward)
            states.append(state)
            actions.append(action)
            state = next_state
            total_reward += reward
            if render:
                env.render()
            if done:
                agent.learn(states, rewards, actions)
                print("\n")
            print(f"Episode#:{episode} ep_reward:{total_reward}", end="\r")

In [5]:
if __name__ == "__main__":
    agent = Agent()
    episodes = 2  #  Increase number of episodes to train
    env = gym.make("MountainCar-v0")
    # Set render=True to visualize Agent's actions in the env
    train(agent, env, episodes, render=False)
    env.close()

Episode#:0 ep_reward:-1.0Episode#:0 ep_reward:-2.0Episode#:0 ep_reward:-3.0Episode#:0 ep_reward:-4.0Episode#:0 ep_reward:-5.0Episode#:0 ep_reward:-6.0Episode#:0 ep_reward:-7.0Episode#:0 ep_reward:-8.0Episode#:0 ep_reward:-9.0Episode#:0 ep_reward:-10.0Episode#:0 ep_reward:-11.0Episode#:0 ep_reward:-12.0Episode#:0 ep_reward:-13.0Episode#:0 ep_reward:-14.0Episode#:0 ep_reward:-15.0Episode#:0 ep_reward:-16.0Episode#:0 ep_reward:-17.0Episode#:0 ep_reward:-18.0Episode#:0 ep_reward:-19.0Episode#:0 ep_reward:-20.0Episode#:0 ep_reward:-21.0Episode#:0 ep_reward:-22.0Episode#:0 ep_reward:-23.0Episode#:0 ep_reward:-24.0Episode#:0 ep_reward:-25.0Episode#:0 ep_reward:-26.0Episode#:0 ep_reward:-27.0Episode#:0 ep_reward:-28.0Episode#:0 ep_reward:-29.0Episode#:0 ep_reward:-30.0Episode#:0 ep_reward:-31.0Episode#:0 ep_reward:-32.0Episode#:0 ep_reward:-33.0Episode#:0 ep_reward:-34.0Episode#:0 ep_reward:-35.0Episode#:0 ep_reward:-36.0Episode#:0 ep_reward:-37.0Episode#:0



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.





Episode#:0 ep_reward:-200.0Episode#:1 ep_reward:-1.0Episode#:1 ep_reward:-2.0Episode#:1 ep_reward:-3.0Episode#:1 ep_reward:-4.0Episode#:1 ep_reward:-5.0Episode#:1 ep_reward:-6.0Episode#:1 ep_reward:-7.0Episode#:1 ep_reward:-8.0Episode#:1 ep_reward:-9.0Episode#:1 ep_reward:-10.0Episode#:1 ep_reward:-11.0Episode#:1 ep_reward:-12.0Episode#:1 ep_reward:-13.0Episode#:1 ep_reward:-14.0Episode#:1 ep_reward:-15.0Episode#:1 ep_reward:-16.0Episode#:1 ep_reward:-17.0Episode#:1 ep_reward:-18.0Episode#:1 ep_reward:-19.0Episode#:1 ep_reward:-20.0Episode#:1 ep_reward:-21.0Episode#:1 ep_reward:-22.0Episode#:1 ep_reward:-23.0Episode#:1 ep_reward:-24.0Episode#:1 ep_reward:-25.0Episode#:1 ep_reward:-26.0Episode#:1 ep_reward:-27.0Episode#:1 ep_reward:-28.0Episode#:1 ep_reward:-29.0Episode#:1 ep_reward:-30.0Episode#:1 ep_reward:-31.0Episode#:1 ep_reward:-32.0Episode#:1 ep_reward:-33.0Episode#:1 ep_reward:-34.0Episode#:1 ep_reward:-35.0Episode#:1 ep_reward:-36.0Episode

Episode#:1 ep_reward:-175.0Episode#:1 ep_reward:-176.0Episode#:1 ep_reward:-177.0Episode#:1 ep_reward:-178.0Episode#:1 ep_reward:-179.0Episode#:1 ep_reward:-180.0Episode#:1 ep_reward:-181.0Episode#:1 ep_reward:-182.0Episode#:1 ep_reward:-183.0Episode#:1 ep_reward:-184.0Episode#:1 ep_reward:-185.0Episode#:1 ep_reward:-186.0Episode#:1 ep_reward:-187.0Episode#:1 ep_reward:-188.0Episode#:1 ep_reward:-189.0Episode#:1 ep_reward:-190.0Episode#:1 ep_reward:-191.0Episode#:1 ep_reward:-192.0Episode#:1 ep_reward:-193.0Episode#:1 ep_reward:-194.0Episode#:1 ep_reward:-195.0Episode#:1 ep_reward:-196.0Episode#:1 ep_reward:-197.0Episode#:1 ep_reward:-198.0Episode#:1 ep_reward:-199.0



Episode#:1 ep_reward:-200.0