In [11]:
import numpy as np
import tensorflow as tf
import gym

In [12]:
# Create the CartPole-v1 environment; its action_space.n sizes the actor's
# output layer and the training loop steps it once per sampled action.
env = gym.make("CartPole-v1")

In [13]:
# Policy network (actor): one hidden ReLU layer followed by a softmax over
# the environment's discrete actions, so the output is a probability vector.
actor = tf.keras.Sequential()
actor.add(tf.keras.layers.Dense(units=32, activation="relu"))
actor.add(tf.keras.layers.Dense(units=env.action_space.n, activation="softmax"))

# Value network (critic): same hidden layer shape, with a single linear
# output that the training loop reads as the scalar value of a state.
critic = tf.keras.Sequential()
critic.add(tf.keras.layers.Dense(units=32, activation="relu"))
critic.add(tf.keras.layers.Dense(units=1))

In [14]:
# Each network gets its own Adam instance; both use the same step size.
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [None]:
# One-step (TD(0)) actor-critic training loop.
#
# For every environment step:
#   1. sample an action from the actor's softmax output,
#   2. step the environment,
#   3. form the TD error  delta = r + gamma * V(s') - V(s),
#   4. update the actor with  -log pi(a|s) * delta  (delta held constant),
#   5. update the critic with  delta**2  (bootstrap term held constant).
num_episodes = 1000
gamma = 0.99  # discount factor applied to the bootstrapped next-state value
max_steps_per_episode = 10000  # safety cap; the inner loop runs fewer than this many steps

for episode in range(num_episodes):
    state = env.reset()
    # gym >= 0.26 returns (obs, info); older versions return obs only
    if isinstance(state, tuple):
        state = state[0]
    episode_reward = 0.0

    for _ in range(1, max_steps_per_episode):
        state_input = np.array([state], dtype=np.float32)

        # persistent=True so the tape can be queried once per loss below
        with tf.GradientTape(persistent=True) as tape:
            action_probs = actor(state_input)
            action = np.random.choice(env.action_space.n, p=action_probs.numpy()[0])

            step_result = env.step(action)
            if len(step_result) == 5:
                # gym >= 0.26 / gymnasium: termination and truncation are separate flags
                next_state, reward, terminated, truncated, _ = step_result
            else:
                # older gym: a single `done` flag; the TimeLimit wrapper marks
                # time-limit cut-offs in the info dict
                next_state, reward, finished, info = step_result
                truncated = bool(info.get("TimeLimit.truncated", False))
                terminated = bool(finished) and not truncated
            done = terminated or truncated

            # defensive: unwrap if the observation comes back as a tuple
            if isinstance(next_state, tuple):
                next_state = next_state[0]

            next_state_input = np.array([next_state], dtype=np.float32)

            state_value = critic(state_input)[0, 0]
            # Only a true terminal state has value 0. A time-limit truncation
            # is not a failure, so we still bootstrap from the critic there.
            if terminated:
                next_state_value = 0.0
            else:
                # The bootstrap term is a fixed target (semi-gradient TD):
                # do not back-propagate the critic loss through V(s').
                next_state_value = tf.stop_gradient(critic(next_state_input)[0, 0])

            advantage = reward + gamma * next_state_value - state_value

            # stabilize log and stop critic gradients flowing into actor
            action_prob = tf.clip_by_value(action_probs[0, action], 1e-8, 1.0)
            actor_loss = -tf.math.log(action_prob) * tf.stop_gradient(advantage)
            critic_loss = tf.square(advantage)

        # Each loss only depends on its own network, so differentiate them
        # separately instead of splitting the gradients of a summed loss.
        actor_grads = tape.gradient(actor_loss, actor.trainable_variables)
        critic_grads = tape.gradient(critic_loss, critic.trainable_variables)
        del tape  # release resources held by the persistent tape

        actor_optimizer.apply_gradients(zip(actor_grads, actor.trainable_variables))
        critic_optimizer.apply_gradients(zip(critic_grads, critic.trainable_variables))

        episode_reward += reward
        state = next_state

        # the episode ends on either termination or truncation
        if done:
            break

    if episode % 10 == 0:
        print(f"Episode {episode}, Reward: {episode_reward}")

env.close()

  if not isinstance(terminated, (bool, np.bool8)):


Episode 0, Reward: 23.0
Episode 10, Reward: 34.0
Episode 20, Reward: 22.0
Episode 30, Reward: 20.0
Episode 40, Reward: 19.0
Episode 50, Reward: 26.0
Episode 60, Reward: 19.0
Episode 70, Reward: 17.0
Episode 80, Reward: 38.0
Episode 90, Reward: 47.0
Episode 100, Reward: 61.0
Episode 110, Reward: 56.0
Episode 120, Reward: 52.0
Episode 130, Reward: 108.0
Episode 140, Reward: 209.0
