In Actor Critic Method, as the agent takes action and moves through an environment, it learns to map the observed state of the environment to two possible outputs:

- A softmax probability value of each action in the action space - Actor
- An estimated rewards in the future: Sum of all rewards it expects to receive in the future conditioned on each action - Critic.

In [9]:
!pip install --upgrade gym
# !pip install --upgrade keras
# !pip install --upgrade tensorflow

Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/721.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827731 sha256=62b69459e49e371cdb1b7335dcb4e4b288bda046c9ed6efe6ddf02c29ca4819b
  Stored in directory: /root/.cache/pip/wheels/95/51/6c/9bb05ebbe7c5cb8171dfaa3611f32622ca4658d53f31c79077
Successfully built gym
Installing collected packages: gym
  Attempting uninstall: gym
    Found existing installation: gym 0.25.2
    Uninstalling gym-0.25.2:
      Succes

In [4]:
# Setup
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import gymnasium as gym
import numpy as np
import keras
from keras import ops
from keras import layers
import tensorflow as tf


# Other configuration parameters
seed = 42
gamma = 0.99
max_steps_per_episode = 10000
# Create environment
env = gym.make("CartPole-v0")
env.reset(seed=seed)
eps = np.finfo(np.float32).eps.item()

  logger.deprecation(


In [5]:
# Implement Actor Critic Network
# Actor Network: Takes as input the state of the environment and returns softmax output of each action in its action space
# Critic network: Takes as input the state of the environment and return an estimate of total rewards in the future.
num_inputs = 4 # x, y, angle,
num_actions = 2 # left or right
num_hidden_units = 128

inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden_units, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

# Combine actor and critic into a single network
model = keras.Model(inputs=inputs, outputs=[action, critic])
model.summary()

In [6]:
# Train the network
optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber() # Huber loss is robust loss function and less sensitive to outliers
action_prob_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

while True:
    state = env.reset()[0]
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action prob and estimate future rewards using the model
            action_probs, critic_value = model(state)
            # print(f'action probabilities: {action_probs}')
            # print(f'future rewards: {critic_value}')
            critic_value_history.append(critic_value[0, 0])

            # sample action
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_prob_history.append(tf.math.log(action_probs[0, action]))

            # apply sampled action in the environment
            state, reward, done, _, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break
        # update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate expected value of future rewards
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculate loss values to update the network
        history = zip(action_prob_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            diff = ret - value
            # actor loss
            actor_losses.append(-log_prob * diff)
            # update the critic to predict a better estimate of future rewards
            critic_losses.append(huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

        # backpropagate episode loss
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # clear reward and loss history
        action_prob_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        # template = f"running reward: {:.2f} at episode {}"
        print(f"running reward: {running_reward:.2f} at episode {episode_count}")

    if running_reward > 200:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break


running reward: 7.07 at episode 10
running reward: 12.96 at episode 20
running reward: 17.63 at episode 30
running reward: 21.03 at episode 40
running reward: 23.06 at episode 50
running reward: 35.83 at episode 60
running reward: 48.56 at episode 70
running reward: 60.43 at episode 80
running reward: 56.98 at episode 90
running reward: 85.45 at episode 100
running reward: 113.92 at episode 110
running reward: 96.95 at episode 120
running reward: 70.10 at episode 130
running reward: 69.73 at episode 140
running reward: 70.76 at episode 150
running reward: 98.57 at episode 160
Solved at episode 165!
