# RL agent for Cartpole using Neural Network Policy using Policy Gradient Method

![NN](./images/NNPolicy.png)

### Dependencies

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import gymnasium as gym

In [74]:
env = gym.make("CartPole-v1")

In [75]:
print(tf.__version__)

2.17.0


## Create Neural Network Model

In [76]:
n_inputs = 4

model = keras.models.Sequential([
    keras.layers.Dense(5,activation="elu",input_shape=[n_inputs]),
    keras.layers.Dense(1,activation="sigmoid"),
])

### Play One Step

In [77]:
def play_one_step(env, obs, model, loss_fn): 
    with tf.GradientTape() as tape:
         left_proba = model(obs[np.newaxis])
         action = (tf.random.uniform([1, 1]) > left_proba)
         y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
         loss = tf.reduce_mean(loss_fn(y_target, left_proba))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, terminated,info = env.step(int(action[0, 0].numpy())) 
    return obs, reward, done, grads

In [78]:
np.newaxis
print(obs.shape)
print(obs[np.newaxis].shape)

(4,)
(1, 4)


In [79]:
with tf.GradientTape() as tape:
    left_proba = model(obs[np.newaxis])
    print(f"left_proba: {left_proba}")
    action = (tf.random.uniform([1, 1]) > left_proba)
    print(f"action: {action}")
    y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
    print(f"y_target: {y_target}")
    loss = tf.reduce_mean(loss_fn(y_target, left_proba))
    print(f"loss: {loss}")
grads = tape.gradient(loss, model.trainable_variables)
print(grads)

left_proba: [[0.72566235]]
action: [[ True]]
y_target: [[0.]]
loss: 1.293395757675171
[<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
array([[-0.4776177 ,  0.45922408,  0.52160054,  0.7995444 ,  0.29034433],
       [ 0.02009101, -0.01931729, -0.02194116, -0.03363288, -0.01221335],
       [ 0.01338782, -0.01287224, -0.01462067, -0.02241155, -0.00813847],
       [-0.12374416,  0.11897862,  0.1351395 ,  0.20715092,  0.07522421]],
      dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=
array([-0.39170828,  0.37662312,  0.42777988,  0.6557298 ,  0.2381199 ],
      dtype=float32)>, <tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[-0.29392672],
       [ 0.6520673 ],
       [-0.04958965],
       [ 0.05024721],
       [ 0.2643369 ]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.72566235], dtype=float32)>]


In [80]:
with tf.GradientTape() as tape:
    left_proba = model(obs[np.newaxis])
    print(f"left_proba: {left_proba}")
    action = (tf.random.uniform([1, 1]) > left_proba)
    print(f"action: {action}")
    y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
    print(f"y_target: {y_target}")
    loss = tf.reduce_mean(loss_fn(y_target, left_proba))
    print(f"loss: {loss}")
grads = tape.gradient(loss, model.trainable_variables)
print(grads)

left_proba: [[0.72566235]]
action: [[False]]
y_target: [[1.]]
loss: 0.3206704258918762
[<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
array([[ 0.18056405, -0.1736103 , -0.19719179, -0.30226883, -0.10976508],
       [-0.00759544,  0.00730293,  0.00829488,  0.01271496,  0.00461727],
       [-0.00506128,  0.00486637,  0.00552737,  0.00847272,  0.00307676],
       [ 0.04678165, -0.04498003, -0.05108967, -0.07831369, -0.02843861]],
      dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=
array([ 0.14808586, -0.14238289, -0.16172276, -0.24789953, -0.09002155],
      dtype=float32)>, <tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[ 0.1111194 ],
       [-0.24651493],
       [ 0.01874743],
       [-0.01899603],
       [-0.09993294]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.27433765], dtype=float32)>]


In [81]:
model.trainable_variables

[<KerasVariable shape=(4, 5), dtype=float32, path=sequential_1/dense_2/kernel>,
 <KerasVariable shape=(5,), dtype=float32, path=sequential_1/dense_2/bias>,
 <KerasVariable shape=(5, 1), dtype=float32, path=sequential_1/dense_3/kernel>,
 <KerasVariable shape=(1,), dtype=float32, path=sequential_1/dense_3/bias>]

### Play Multiple Episodes

In [82]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads = []
    for episode in range(n_episodes):
        current_rewards = []
        current_grads = []
        obs = env.reset()[0]
        for step in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env,obs,model,loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                break
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
    return all_rewards, all_grads

In [91]:
env = gym.make("CartPole-v1")
n_episodes=10
n_max_steps=500
loss_fn = keras.losses.binary_crossentropy

all_rewards, all_grads = play_multiple_episodes(env,n_episodes,n_max_steps,model,loss_fn)

len(all_grads[0])

19

In [111]:
len(all_rewards[0])

19

In [105]:
print(len(all_grads[0]))
all_grads[0]

19


[[<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
  array([[ 0.01386217, -0.00792973, -0.00966748, -0.01380628, -0.00501357],
         [ 0.01094549, -0.00626128, -0.00763339, -0.01090136, -0.00395869],
         [ 0.01707972, -0.00977031, -0.0119114 , -0.01701086, -0.00617727],
         [ 0.00563668, -0.00322442, -0.00393102, -0.00561395, -0.00203864]],
        dtype=float32)>,
  <tf.Tensor: shape=(5,), dtype=float32, numpy=
  array([ 0.44517198, -0.25465685, -0.31046304, -0.4433771 , -0.16100673],
        dtype=float32)>,
  <tf.Tensor: shape=(5, 1), dtype=float32, numpy=
  array([[-0.00579038],
         [-0.01664073],
         [-0.01892053],
         [-0.00054808],
         [-0.00755161]], dtype=float32)>,
  <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.4906626], dtype=float32)>],
 [<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
  array([[-0.0131865 ,  0.00931124,  0.01065102,  0.01456957,  0.00588703],
         [ 0.07131463, -0.05035663, -0.05760239, -0.07879448, -0.03183797

### Discounted Rewards (Action Return and Action Advantage)

![NN](./images/CreditAssignment.png)

In [112]:
#Action Return
def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_factor
    return discounted

#Action Advantage
def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(rewards, discount_factor) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]
    
                    

In [115]:
for index,step in enumerate(range(len(all_rewards[0]) - 2, -1, -1)):
    print(index,step)

0 17
1 16
2 15
3 14
4 13
5 12
6 11
7 10
8 9
9 8
10 7
11 6
12 5
13 4
14 3
15 2
16 1
17 0


In [116]:
discount_rewards(all_rewards[0],0.95)

array([12.45292795, 12.05571363, 11.6375933 , 11.19746663, 10.7341754 ,
       10.24650042,  9.73315833,  9.19279825,  8.62399815,  8.02526122,
        7.39501181,  6.73159137,  6.03325408,  5.29816219,  4.52438125,
        3.709875  ,  2.8525    ,  1.95      ,  1.        ])

In [118]:
discount_and_normalize_rewards(all_rewards,0.95)

[array([ 1.58228181,  1.46351027,  1.33848759,  1.20688477,  1.06835549,
         0.92253519,  0.76904014,  0.6074664 ,  0.43738879,  0.25835971,
         0.06990806, -0.1284621 , -0.3372728 , -0.55707353, -0.78844273,
        -1.03198925, -1.28835401, -1.55821165, -1.84227232]),
 array([ 1.20688477,  1.06835549,  0.92253519,  0.76904014,  0.6074664 ,
         0.43738879,  0.25835971,  0.06990806, -0.1284621 , -0.3372728 ,
        -0.55707353, -0.78844273, -1.03198925, -1.28835401, -1.55821165,
        -1.84227232])]

In [20]:
discount_rewards([10,0,-50],discount_factor=0.8)

array([-22, -40, -50])

In [21]:
discount_and_normalize_rewards([[10,0,-50],[10,20]],0.8)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([1.26665318, 1.0727777 ])]

### Training the Neural Network Model with Policy Gradient

In [85]:
n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 500
discount_factor = 0.95

In [90]:
optimizer = keras.optimizers.Adam(0.01)
loss_fn = keras.losses.binary_crossentropy
obs = env.reset()

In [91]:
obs[0][np.newaxis]

array([[-0.01779805, -0.04352461,  0.04807696, -0.04226135]],
      dtype=float32)

In [92]:
for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(
                env, n_episodes_per_update, n_max_steps, model, loss_fn)
    all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                               discount_factor)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
                mean_grads = tf.reduce_mean(
                    [final_reward * all_grads[episode_index][step][var_index]
                        for episode_index, final_rewards in enumerate(all_final_rewards) for step, final_reward in enumerate(final_rewards)], axis=0)
                all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables)) #Adjust trainable  variables such that the action with higher advantage
    #becomes more likely


In [120]:
len(model.trainable_variables)

4

In [59]:
np.mean(all_final_rewards[0])

0.10835950103632878

In [48]:
env.reset()[0]

array([ 0.03568713,  0.01827965, -0.00327339, -0.03763362], dtype=float32)

In [62]:
obs=env.reset()[0]
obs

array([-0.04675896,  0.03011083, -0.0117334 ,  0.03751784], dtype=float32)

## Test the model

In [12]:
#see the model in action
env = gym.make("CartPole-v1",render_mode="human")
totals = []
episode_rewards = 0
obs = env.reset()[0]
for step in range(500):
    #print(f"Step Num: {step}")
    left_proba = model.predict(obs.reshape(1,-1),verbose=0)
    action = int(np.random.rand() > left_proba)
    obs, reward, done, state, info = env.step(action)
    env.render()
    episode_rewards += reward
    if done:
        print(obs)
        break
totals.append(episode_rewards)
print(f"Steps: {step}")

  action = int(np.random.rand() > left_proba)


[ 2.4187033e+00  1.4947851e+00 -1.8951163e-03 -3.6807171e-01]
Steps: 491


### Save and Load Model

In [97]:
# Save the entire model as a `.keras` zip archive.
model.save('cartpole_500.keras')

In [2]:
model = tf.keras.models.load_model('cartpole_500.keras')