In [4]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# --- Environment -------------------------------------------------------------
# The sizes of the observation vector and of the discrete action set are read
# from the environment and used to size the networks below.
env = gym.make("CartPole-v1")
num_states = env.observation_space.shape[0]
num_actions = env.action_space.n

# --- Hyperparameters ---------------------------------------------------------
gamma = 0.99  # Discount factor applied to the bootstrapped next-state value
lr_actor = 0.001
lr_critic = 0.005


def _build_mlp(output_units, output_activation=None):
    """Return a Sequential net: two 24-unit ReLU layers plus an output layer.

    Args:
        output_units: Width of the final Dense layer.
        output_activation: Activation of the final Dense layer; ``None``
            leaves the output linear.
    """
    return tf.keras.Sequential([
        layers.Input(shape=(num_states,)),
        layers.Dense(24, activation='relu'),
        layers.Dense(24, activation='relu'),
        layers.Dense(output_units, activation=output_activation),
    ])


# --- Networks ----------------------------------------------------------------
# Actor: maps a state to a softmax distribution over the actions.
actor_model = _build_mlp(num_actions, 'softmax')
# Critic: maps a state to a single unbounded scalar value estimate.
critic_model = _build_mlp(1)

# --- Optimizers --------------------------------------------------------------
# Each network gets its own Adam instance with its own learning rate.
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_actor)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_critic)

# Training function
def _reset_env():
    """Reset ``env`` and return the first observation shaped ``[1, num_states]``.

    Accepts both reset conventions: a bare observation, or an
    ``(observation, info)`` pair as returned by newer Gym releases.
    """
    result = env.reset()
    if isinstance(result, tuple) and len(result) == 2:
        result = result[0]
    return np.reshape(result, [1, num_states])


def _step_env(action):
    """Apply ``action`` and return ``(next_state, reward, terminated, done)``.

    Accepts both step conventions: the legacy four-tuple
    ``(obs, reward, done, info)`` and the five-tuple
    ``(obs, reward, terminated, truncated, info)``. ``terminated`` says whether
    the bootstrap term must be masked; ``done`` says whether the episode loop
    must stop. With the four-tuple form both are the single ``done`` flag.
    """
    result = env.step(action)
    if len(result) == 5:
        next_state, reward, terminated, truncated, _ = result
    else:
        next_state, reward, terminated, _ = result
        truncated = False
    terminated = bool(terminated)
    done = terminated or bool(truncated)
    return np.reshape(next_state, [1, num_states]), reward, terminated, done


def train(num_episodes=50):
    """Train the actor and critic with online one-step TD actor-critic updates.

    After every environment step, the critic is regressed towards the TD
    target ``reward + gamma * V(next_state)`` and the actor is pushed along the
    log-probability of the sampled action, weighted by the TD error. The
    module-level ``actor_model`` and ``critic_model`` are updated in place
    through ``actor_optimizer`` and ``critic_optimizer``. One line with the
    episode index and its total reward is printed per episode.

    Args:
        num_episodes: Number of episodes to run. Defaults to 50.
    """
    for episode in range(num_episodes):
        state = _reset_env()
        episode_reward = 0
        done = False

        while not done:
            state_tensor = tf.convert_to_tensor(state)

            # Fresh single-use tapes per step: a tape spanning the whole
            # episode would keep every forward pass alive and make each
            # update slower and more memory hungry than the previous one.
            with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
                # Select action by sampling from the current policy.
                action_probs = actor_model(state_tensor)
                action = np.random.choice(num_actions, p=np.squeeze(action_probs))

                # Execute action.
                next_state, reward, terminated, done = _step_env(action)

                # TD target and error. The target is held constant
                # (semi-gradient TD): the critic is trained only through
                # V(state), not through the bootstrapped V(next_state).
                state_value = critic_model(state_tensor)
                next_state_value = critic_model(tf.convert_to_tensor(next_state))
                td_target = tf.stop_gradient(
                    reward + gamma * next_state_value * (1.0 - float(terminated))
                )
                td_error = td_target - state_value

                # Actor loss: the TD error acts as a fixed advantage weight,
                # so no gradient flows from the actor loss into the critic.
                action_one_hot = tf.one_hot([action], num_actions)
                log_prob = tf.math.log(
                    tf.reduce_sum(action_probs * action_one_hot, axis=1)
                )
                actor_loss = -log_prob * tf.stop_gradient(td_error)

                # Critic loss: squared TD error.
                critic_loss = tf.square(td_error)

            # Gradients are taken after the tape context has closed so the
            # gradient computation itself is not recorded.
            actor_grad = actor_tape.gradient(
                actor_loss, actor_model.trainable_variables
            )
            critic_grad = critic_tape.gradient(
                critic_loss, critic_model.trainable_variables
            )
            actor_optimizer.apply_gradients(
                zip(actor_grad, actor_model.trainable_variables)
            )
            critic_optimizer.apply_gradients(
                zip(critic_grad, critic_model.trainable_variables)
            )

            episode_reward += reward
            state = next_state

        print(f"Episode: {episode}, Reward: {episode_reward}")

# Run the training loop when this cell executes; it prints one line per episode.
train()


  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Reward: 17.0
Episode: 1, Reward: 13.0
Episode: 2, Reward: 15.0
Episode: 3, Reward: 30.0
Episode: 4, Reward: 36.0
Episode: 5, Reward: 14.0
Episode: 6, Reward: 13.0
Episode: 7, Reward: 17.0
Episode: 8, Reward: 36.0
Episode: 9, Reward: 41.0
Episode: 10, Reward: 29.0
Episode: 11, Reward: 19.0
Episode: 12, Reward: 12.0
Episode: 13, Reward: 22.0
Episode: 14, Reward: 28.0
Episode: 15, Reward: 34.0
Episode: 16, Reward: 13.0
Episode: 17, Reward: 14.0
Episode: 18, Reward: 18.0
Episode: 19, Reward: 14.0
Episode: 20, Reward: 47.0
Episode: 21, Reward: 20.0
Episode: 22, Reward: 63.0
Episode: 23, Reward: 30.0
Episode: 24, Reward: 17.0
Episode: 25, Reward: 35.0
Episode: 26, Reward: 20.0
Episode: 27, Reward: 30.0
Episode: 28, Reward: 26.0
Episode: 29, Reward: 15.0
Episode: 30, Reward: 60.0
Episode: 31, Reward: 37.0
Episode: 32, Reward: 14.0
Episode: 33, Reward: 30.0
Episode: 34, Reward: 32.0
Episode: 35, Reward: 17.0
Episode: 36, Reward: 28.0
Episode: 37, Reward: 37.0
Episode: 38, Reward: 1