Neural Actor-Critic algorithm and agent
Chapter N, TensorFlow 2 Reinforcement Learning Cookbook | Praveen Palanisamy

In [1]:
import numpy as np
import tensorflow as tf
import gym
import tensorflow_probability as tfp

In [2]:
class ActorCritic(tf.keras.Model):
    def __init__(self, action_dim):
        super().__init__()
        self.fc1 = tf.keras.layers.Dense(512, activation="relu")
        self.fc2 = tf.keras.layers.Dense(128, activation="relu")
        self.critic = tf.keras.layers.Dense(1, activation=None)
        self.actor = tf.keras.layers.Dense(action_dim, activation=None)

    def call(self, input_data):
        x = self.fc1(input_data)
        x1 = self.fc2(x)
        actor = self.actor(x1)
        critic = self.critic(x1)
        return critic, actor

In [3]:
class Agent:
    def __init__(self, action_dim=4, gamma=0.99):
        """Agent with a neural-network brain powered policy

        Args:
            action_dim (int): Action dimension
            gamma (float) : Discount factor. Default=0.99
        """

        self.gamma = gamma
        self.opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
        self.actor_critic = ActorCritic(action_dim)

    def get_action(self, state):
        _, action_probabilities = self.actor_critic(np.array([state]))
        action_probabilities = tf.nn.softmax(action_probabilities)
        action_probabilities = action_probabilities.numpy()
        dist = tfp.distributions.Categorical(
            probs=action_probabilities, dtype=tf.float32
        )
        action = dist.sample()
        return int(action.numpy()[0])

    def actor_loss(self, prob, action, td):
        prob = tf.nn.softmax(prob)
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        log_prob = dist.log_prob(action)
        loss = -log_prob * td
        return loss

    def learn(self, state, action, reward, next_state, done):
        state = np.array([state])
        next_state = np.array([next_state])

        with tf.GradientTape() as tape:
            value, action_probabilities = self.actor_critic(state, training=True)
            value_next_st, _ = self.actor_critic(next_state, training=True)
            td = reward + self.gamma * value_next_st * (1 - int(done)) - value
            actor_loss = self.actor_loss(action_probabilities, action, td)
            critic_loss = td ** 2
            total_loss = actor_loss + critic_loss
        grads = tape.gradient(total_loss, self.actor_critic.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.actor_critic.trainable_variables))
        return total_loss

In [4]:
def train(agent, env, episodes, render=True):
    """Train `agent` in `env` for `episodes`

    Args:
        agent (Agent): Agent to train
        env (gym.Env): Environment to train the agent
        episodes (int): Number of episodes to train
        render (bool): True=Enable/False=Disable rendering; Default=True
    """

    for episode in range(episodes):

        done = False
        state = env.reset()
        total_reward = 0
        all_loss = []

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            loss = agent.learn(state, action, reward, next_state, done)
            all_loss.append(loss)
            state = next_state
            total_reward += reward
            if render:
                env.render()
            if done:
                print("\n")
            print(f"Episode#:{episode} ep_reward:{total_reward}", end="\r")

In [5]:
if __name__ == "__main__":

    env = gym.make("CartPole-v0")
    agent = Agent(env.action_space.n)
    num_episodes = 2  #  Increase number of episodes to train
    # Set render=True to visualize Agent's actions in the env
    train(agent, env, num_episodes, render=False)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



Episode#:0 ep_reward:1.0Episode#:0 ep_reward:2.0Episode#:0 ep_reward:3.0Episode#:0 ep_reward:4.0Episode#:0 ep_reward:5.0Episode#:0 ep_reward:6.0Episode#:0 ep_reward:7.0Episode#:0 ep_reward:8.0Episode#:0 ep_reward:9.0Episode#:0 ep_reward:10.0Episode#:0 ep_reward:11.0

Episode#:0 ep_reward:12.0Episode#:0 ep_reward:13.0Episode#:0 ep_reward:14.0Episode#:0 ep_reward:15.0Episode#:0 ep_reward:16.0Episode#:0 ep_reward:17.0Episode#:0 ep_reward:18.0Episode#:0 ep_reward:19.0

Episode#:0 ep_reward:20.0Episode#:0 ep_reward:21.0Episode#:0 ep_reward:22.0Episode#:0 ep_reward:23.0Episode#:0 ep_reward:24.0Episode#:0 ep_reward:25.0Episode#:0 ep_reward:26.0Episode#:0 ep_reward:27.0Episode#:0 ep_reward:28.0Episode#:0 ep_reward:29.0Episode#:0 ep_reward:30.0Episode#:0 ep_reward:31.0

Episode#:0 ep_reward:32.0Episode#:0 ep_reward:33.0Episode#:0 ep_reward:34.0Episode#:0 ep_reward:35.0Episode#:0 ep_reward:36.0Episode#:0 ep_reward:37.0Episode#:0 ep_reward:38.0Episode#:0 ep_reward:39.0

Episode#:0 ep_reward:40.0Episode#:0 ep_reward:41.0Episode#:0 ep_reward:42.0Episode#:0 ep_reward:43.0Episode#:0 ep_reward:44.0Episode#:0 ep_reward:45.0Episode#:0 ep_reward:46.0Episode#:0 ep_reward:47.0Episode#:0 ep_reward:48.0Episode#:0 ep_reward:49.0Episode#:0 ep_reward:50.0Episode#:0 ep_reward:51.0

Episode#:0 ep_reward:52.0Episode#:0 ep_reward:53.0Episode#:0 ep_reward:54.0Episode#:0 ep_reward:55.0Episode#:0 ep_reward:56.0Episode#:0 ep_reward:57.0Episode#:0 ep_reward:58.0

Episode#:0 ep_reward:59.0Episode#:0 ep_reward:60.0Episode#:0 ep_reward:61.0

Episode#:0 ep_reward:62.0Episode#:1 ep_reward:1.0Episode#:1 ep_reward:2.0Episode#:1 ep_reward:3.0Episode#:1 ep_reward:4.0Episode#:1 ep_reward:5.0Episode#:1 ep_reward:6.0Episode#:1 ep_reward:7.0Episode#:1 ep_reward:8.0

Episode#:1 ep_reward:9.0Episode#:1 ep_reward:10.0Episode#:1 ep_reward:11.0Episode#:1 ep_reward:12.0Episode#:1 ep_reward:13.0Episode#:1 ep_reward:14.0Episode#:1 ep_reward:15.0

Episode#:1 ep_reward:16.0Episode#:1 ep_reward:17.0Episode#:1 ep_reward:18.0Episode#:1 ep_reward:19.0Episode#:1 ep_reward:20.0Episode#:1 ep_reward:21.0

Episode#:1 ep_reward:22.0