In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [3]:
# Choose the 'Acrobot-v1' environment
environ = gym.make('Acrobot-v1')
environ.seed(1)
observe = environ.reset()
#action_probs
#x= np.squeeze(action_probs)
#print(x)
#action = np.random.choice(n_act, p=np.squeeze(action_probs))
#print(critic_value[0, 0])
#print(returns)
#import matplotlib.pyplot as pd
#pd.hist(-returns)
#print(retn)


In [4]:
keras.backend.clear_session()
tf.random.set_seed(1)
np.random.seed(1)

in_shape = environ.observation_space.shape[0]
n_act = environ.action_space.n


inputs = layers.Input(shape=(in_shape,))
common1 = layers.Dense(128, activation="relu")(inputs) #2^7=128 nodes for the hidden layer. Why? 
#One might ask what the benefit of another shared layer would be. I couldn't find a meaningful difference.
#common2 = layers.Dense(128, activation="relu")(common1) 
actor = layers.Dense(n_act, activation="softmax")(common1) # common2 for another shared layer
critic = layers.Dense(1)(common1) # common2 for another shared layer

model = keras.Model(inputs=inputs, outputs=[actor, critic])


2021-08-24 20:14:40.245272: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
eps = np.finfo(np.float32).eps.item() #eps wants to be the smallest real number not equal to zero. It tries.
discount_rate = 0.99 #
optimizer = keras.optimizers.Adam(learning_rate=1e-2) #
huber_loss = keras.losses.Huber() #
action_probs_history = []
critic_value_history = []
rewards_history = []
episode_count = 0
steps_per_epi = 500 #Somewhat arbitrarily set the steps per episode at 500
running_reward = -steps_per_epi +1
while True:  # This training paradigm is based on the actor-critic implementation to 'solve' cartpole-v0.
             # It can be found at the web address https://keras.io/examples/rl/actor_critic_cartpole/
    state = environ.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, steps_per_epi): 
            environ.render()
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)
            action_probs, critic_value = model(state)
            
            # The actor is tasked with predicting action probabilities
            # from the 'state' of the acrobot, provided by openai gym acrobot-v environment.
            # The critic is given the same input, and asked to predict future rewards.
            # We create histories for each here.
            
            action = np.random.choice(n_act, p=np.squeeze(action_probs))# Sample action from action probability distribution
            action_probs_history.append(tf.math.log(action_probs[0, action]))
            critic_value_history.append(critic_value[0, 0])

            
            state, reward, done, _ = environ.step(action) # Apply the sampled action in our environment
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break
        
        # print(critic_value_history) or print(action_probs_history) to get a feel for the dynamics of actor and critic.
        # Instead of initializing running_rewared try episode_reward = steps_per_epi + episode_reward

        # Update running reward to (hopefully) obtain condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward 

        # Calculate expected value from rewards using Bellman Optimality Equation.
        # At each timestep what was the total reward received after that timestep.
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]: # This is the main point in the code that differs from Nandan's example.
            discounted_sum = r + discount_rate * discounted_sum
            returns.append(discounted_sum)  # Nandan had returns.insert(0, discounted_sum), which I believe is a mistake.  
        #print(discounted_sum)

        # Normalize returns to be compatible with critic predictions. Also taken directly from Nandan's example.
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()
        #print(returns)

        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, retn in history:
            # print(log_prob,value, retn)
            #print(retn)
            # The critic estimates that we get a total reward = `value` in the future. 
            # We take an action with log probability of `log_prob` and end up recieving a total reward = `retn`.
            # The actor should be updated so that with high probability it predicts an action that leads to
            # high rewards, taking into account the critic's estimate.
            diff = value - retn 
            actor_losses.append(-log_prob * diff)  # actor loss. I wonder if the sign in front of log matters here.
                                                   # given that the sign of 'diff' seems to be arbitrary.

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(retn, 0)) 
            )

        # Backpropagation 
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Print human readable development
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))
        

    if running_reward > -150:  # Defined criterion for approximately optimal policy
        print("Solved at episode {}!".format(episode_count))
        break

2021-08-24 20:14:49.314 Python[9671:569010] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to (null)


running reward: -465.65 at episode 10
running reward: -354.49 at episode 20
running reward: -281.36 at episode 30
running reward: -240.48 at episode 40
running reward: -234.59 at episode 50
running reward: -236.77 at episode 60
running reward: -250.16 at episode 70
running reward: -265.81 at episode 80
running reward: -269.46 at episode 90
running reward: -274.79 at episode 100
running reward: -254.91 at episode 110
running reward: -246.95 at episode 120
running reward: -223.73 at episode 130
running reward: -199.93 at episode 140
running reward: -204.81 at episode 150
running reward: -189.55 at episode 160
running reward: -191.58 at episode 170
running reward: -185.94 at episode 180
running reward: -183.80 at episode 190
running reward: -194.05 at episode 200
running reward: -195.65 at episode 210
running reward: -208.92 at episode 220
running reward: -199.45 at episode 230
running reward: -190.39 at episode 240
running reward: -170.84 at episode 250
running reward: -161.36 at episode

In [18]:
# Create this function to test the agent below
def one_step(environ, state):
    state = tf.convert_to_tensor(state)
    state = tf.expand_dims(state, 0)
    action_probs, critic_value = model(state)
    action = np.random.choice(n_act, p=np.squeeze(action_probs))# Sample action from action probability distribution
    state, reward, done, _ = environ.step(action) # Apply the sampled action in our environment
    return state, reward, done, _


In [33]:
# Let's test the agent. Take 1. See 'Test_Run_1.gif' file in Deep_Q_Learning repository or READ_ME
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

98


In [34]:
# Test 2
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

111


In [35]:
# Test 3
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

130


In [36]:
# Test 4
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

137
