In [16]:
! pip install gym tensorflow



In [25]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Build the CartPole-v1 environment shared by the later cells.
env = gym.make("CartPole-v1")
# Size of the first observation axis and the value of action_space.n.
# NOTE(review): neither obs_space nor act_space is referenced by the cells
# visible here; they read env.action_space.n directly.
obs_space = env.observation_space.shape[0]
act_space = env.action_space.n


In [26]:
# Training hyperparameters (read as notebook-level globals by later cells).
gamma = 0.99          # discount factor handed to compute_returns
learning_rate = 0.01  # step size given to the Adam optimizer
num_episodes = 1_000  # number of episodes the training loop runs
batch_size = 32       # NOTE(review): not referenced by any cell visible here

In [27]:
class PolicyNetwork(tf.keras.Model):
    """Small softmax policy: one hidden ReLU layer followed by a softmax layer.

    Args:
        hidden_units: Number of units in the hidden dense layer.
        num_actions: Number of units in the softmax output layer (one
            probability per action). When ``None`` (the default) it is read
            from the notebook-level ``env.action_space.n``, which is what the
            class previously always did.
    """

    def __init__(self, hidden_units=128, num_actions=None):
        super().__init__()
        if num_actions is None:
            # Fall back to the global environment so PolicyNetwork() keeps
            # working exactly as before.
            num_actions = env.action_space.n
        self.dense1 = layers.Dense(hidden_units, activation='relu')
        self.dense2 = layers.Dense(num_actions, activation='softmax')

    def call(self, state):
        """Return the softmax action probabilities for ``state``."""
        hidden = self.dense1(state)
        return self.dense2(hidden)

In [28]:
# Instantiate the policy network and the Adam optimizer; train_step and the
# training loop read both names as notebook-level globals.
policy=PolicyNetwork()
optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [35]:
def compute_returns(rewards, gamma):
    """Compute the discounted return for every step of one trajectory.

    The return at step t is ``rewards[t] + gamma * return[t + 1]``, with the
    return after the final step taken as 0.

    Args:
        rewards: Sequence of per-step rewards, in time order.
        gamma: Discount factor applied to the return of the following step.

    Returns:
        A float32 numpy array shaped like ``rewards`` holding the discounted
        return of each step.
    """
    discounted = np.zeros_like(rewards, dtype=np.float32)
    future = 0

    # Walk backwards so each step can reuse the return already accumulated
    # for the step that follows it.
    for step in range(len(rewards) - 1, -1, -1):
        future = rewards[step] + gamma * future
        discounted[step] = future

    return discounted

In [36]:
def train_step(states,actions,returns):
    """Apply one policy-gradient update to the global ``policy``.

    The loss is ``-mean(log pi(action | state) * return)``; its gradient with
    respect to ``policy.trainable_variables`` is applied with the global
    ``optimizer``.

    Args:
        states: Batch of observations passed straight to ``policy``.
        actions: Integer action indices, one per row of ``states``.
        returns: Per-step weights for the log-probabilities, one per row of
            ``states`` (the training loop passes normalized discounted
            returns).

    Returns:
        The scalar loss tensor computed for this batch (callers may ignore it).
    """
    # Plain data conversion needs no gradient, so it stays outside the tape.
    action_indices = np.asarray(actions, dtype=np.int32)

    with tf.GradientTape() as tape:
        action_probs = policy(states)
        # Take the one-hot depth from the network output so the mask always
        # matches the number of probabilities the policy emits.
        action_mask = tf.one_hot(
            action_indices,
            depth=action_probs.shape[-1],
            dtype=action_probs.dtype,
        )
        chosen_probs = tf.reduce_sum(action_probs * action_mask, axis=1)

        # Clip before the log: a probability of exactly 0 would give -inf and
        # turn the gradients into NaN. Values already inside [1e-8, 1] pass
        # through unchanged.
        action_log_probs = tf.math.log(tf.clip_by_value(chosen_probs, 1e-8, 1.0))

        # Bring the returns to the network's dtype before multiplying.
        return_weights = tf.cast(returns, action_log_probs.dtype)
        loss = -tf.reduce_mean(action_log_probs * return_weights)

    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))
    return loss

In [None]:
# Training loop: play one full episode with the current policy, then apply a
# single train_step update computed from that episode.
for episode in range(num_episodes):
    state,_=env.reset()
    done=False
    # Per-episode trajectory buffers.
    states,actions,rewards=[],[],[]

    while not done:
        # Reshape the observation into a single-row batch for the network.
        state_input=np.array(state,dtype=np.float32).reshape(1,-1)
        # Flatten the network output to 1-D so it can be used as p= below.
        probs=policy(state_input).numpy().flatten()
        # Sample the action from the policy's probabilities.
        action=np.random.choice(env.action_space.n,p=probs)
        next_state,reward,terminated,truncated,_=env.step(action)
        # The episode is over as soon as either flag from env.step is set.
        done=terminated or truncated

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state=next_state
    returns=compute_returns(rewards,gamma)
    # Standardize the returns; the 1e-9 keeps the division defined when the
    # standard deviation is 0.
    returns=(returns-np.mean(returns))/(np.std(returns)+1e-9)

    states_batch=np.array(states,dtype=np.float32)
    train_step(states_batch,actions,returns)
    # Print a progress line on every 100th episode.
    if episode % 100 == 0 :
        print(f"Episode {episode}/{num_episodes} completed.")


Episode 0/1000 completed.
