In [3]:
import tensorflow as tf
import numpy as np
import gym

env = gym.make('CartPole-v1')  

num_episodes = 1000

for episode in range(num_episodes):
    state = env.reset()
    episode_states, episode_actions, episode_rewards = [], [], []

class PolicyNetwork(tf.keras.Model):
    def __init__(self, num_actions):
        super(PolicyNetwork, self).__init__()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(num_actions, activation='softmax')

    def call(self, state):
        x = self.dense1(state)
        return self.dense2(x)

def compute_loss(logits, actions, advantages):
    action_masks = tf.one_hot(actions, num_actions)
    log_prob = tf.math.log(tf.reduce_sum(action_masks * tf.nn.softmax(logits), axis=1))
    return -tf.reduce_sum(log_prob * advantages)

num_actions = 2
learning_rate = 0.01
gamma = 0.99

policy_network = PolicyNetwork(num_actions)
optimizer = tf.keras.optimizers.Adam(learning_rate)

num_episodes = 1000

for episode in range(num_episodes):
    state = env.reset()
    episode_states, episode_actions, episode_rewards = [], [], []

    while True:
        logits = policy_network(np.expand_dims(state, axis=0))
        action = np.random.choice(num_actions, p=np.squeeze(logits))

        next_state, reward, done, _ = env.step(action)

        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        if done:
            break

        state = next_state

    discounted_rewards = []
    running_add = 0
    for r in reversed(episode_rewards):
        running_add = running_add * gamma + r
        discounted_rewards.insert(0, running_add)

    mean_reward = np.mean(discounted_rewards)
    std_reward = np.std(discounted_rewards)
    normalized_rewards = (discounted_rewards - mean_reward) / (std_reward + 1e-8)

    with tf.GradientTape() as tape:
        logits = policy_network(tf.convert_to_tensor(episode_states, dtype=tf.float32))
        loss = compute_loss(logits, episode_actions, normalized_rewards)

    grads = tape.gradient(loss, policy_network.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy_network.trainable_variables))

    if episode % 10 == 0:
        print(f"Episode: {episode}, Total Reward: {sum(episode_rewards)}")



  deprecation(
  deprecation(


Episode: 0, Total Reward: 20.0
Episode: 10, Total Reward: 34.0
Episode: 20, Total Reward: 51.0
Episode: 30, Total Reward: 39.0
Episode: 40, Total Reward: 85.0
Episode: 50, Total Reward: 82.0
Episode: 60, Total Reward: 198.0
Episode: 70, Total Reward: 484.0
Episode: 80, Total Reward: 99.0
Episode: 90, Total Reward: 118.0
Episode: 100, Total Reward: 40.0
Episode: 110, Total Reward: 64.0
Episode: 120, Total Reward: 67.0
Episode: 130, Total Reward: 97.0
Episode: 140, Total Reward: 269.0
Episode: 150, Total Reward: 130.0
Episode: 160, Total Reward: 77.0
Episode: 170, Total Reward: 62.0
Episode: 180, Total Reward: 48.0
Episode: 190, Total Reward: 80.0
Episode: 200, Total Reward: 39.0
Episode: 210, Total Reward: 48.0
Episode: 220, Total Reward: 75.0
Episode: 230, Total Reward: 93.0
Episode: 240, Total Reward: 118.0
Episode: 250, Total Reward: 500.0
Episode: 260, Total Reward: 314.0
Episode: 270, Total Reward: 136.0
Episode: 280, Total Reward: 103.0
Episode: 290, Total Reward: 98.0
Episode: 30