In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import gym
import warnings
warnings.filterwarnings('ignore')

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Softmax
from tensorflow.keras.optimizers import Adam

In [4]:
### Policy gradient
# - Let the NN play the game several times
# - At each step calculate gradients, but don't apply it
# - After running severl times, compute each action's advantage by
#    aggregating the results with a discount factor (0.9-0.99)
# - If overall action is positive, apply averge gradient
#    if overall action is negative, apply negative average gradient  

def play_one_step(env, obs, model, loss_fn):
    with tf.GradientTape() as tape:
        left_proba = model(obs[np.newaxis]) # calling the model with a single observation (obs has been reshaped to become a batch of size one)
        action = (tf.random.uniform([1, 1]) > left_proba)
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target, left_proba))
    
    # calcuating the gradient for losses over each action taken in given state during single episode.
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, info = env.step(int(action[0, 0].numpy()))
    return obs, reward, done, grads


def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads = []
    for episode in range(n_episodes):
        current_rewards = []
        current_grads = []
        obs = env.reset()
        for step in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                break
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
    return all_rewards, all_grads

def discount_rewards(rewards, discount_rate):
    discounted = np.array(rewards)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]


In [None]:
if __name__ == '__main__':
    
    # Set the initial values of the model
    n_iterations = 200
    n_episodes_per_update = 15
    n_max_steps = 1000
    discount_rate = 0.95

    optimizer = Adam(learning_rate=0.01)
    loss_fn = keras.losses.binary_crossentropy

    keras.backend.clear_session()
    env = gym.make("LunarLander-v2")
    env.seed(42);
    n_inputs = env.observation_space.shape[0]

    model = keras.models.Sequential([
        keras.layers.Dense(32, activation="relu", input_shape=[8,]),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(4, activation="softmax"),
    ])
    
    for iteration in range(n_iterations):
        all_rewards, all_grads = play_multiple_episodes(
            env, n_episodes_per_update, n_max_steps, model, loss_fn)
        total_rewards = sum(map(sum, all_rewards)) # aggregating the rewards                  
        print("\rIteration: {}, mean rewards: {:.1f}".format(          
            iteration, total_rewards / n_episodes_per_update), end="") 
        all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                           discount_rate)
        all_mean_grads = []
        for var_index in range(len(model.trainable_variables)):
            mean_grads = tf.reduce_mean(
                [final_reward * all_grads[episode_index][step][var_index]
                 for episode_index, final_rewards in enumerate(all_final_rewards)
                     for step, final_reward in enumerate(final_rewards)], axis=0)
            all_mean_grads.append(mean_grads)
        optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

    env.close()

Metal device set to: Apple M1


2021-12-11 17:29:25.171588: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-11 17:29:25.171677: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Iteration: 178, mean rewards: -157.2