In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [2]:
# Choose the 'Acrobot-v1' environment
environ = gym.make('Acrobot-v1')
environ.seed(1)
observe = environ.reset()

In [3]:
keras.backend.clear_session()
tf.random.set_seed(1)
np.random.seed(1)

in_shape = environ.observation_space.shape[0]
n_act = environ.action_space.n


inputs = layers.Input(shape=(in_shape,))
common1 = layers.Dense(128, activation="relu")(inputs)  
actor = layers.Dense(n_act, activation="softmax")(common1)
critic = layers.Dense(1)(common1) 

model = keras.Model(inputs=inputs, outputs=[actor, critic])


2021-09-28 09:57:56.686467: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
eps = np.finfo(np.float32).eps.item() #eps wants to be the smallest positive real number not equal to zero. It tries.
discount_rate = 0.99 # .99 Discount Factor/Gamma
optimizer = keras.optimizers.Adam(learning_rate=1e-2) # Try ADAM optimizer
loss_fn = keras.losses.mean_absolute_error # loss function
action_probs_history = []
critic_value_history = []
rewards_history = []
episode_count = 0
steps_per_epi = 500 #max
running_reward = -steps_per_epi +1
while True:  # This training paradigm is based on the actor-critic implementation to 'solve' cartpole-v0.
             # It can be found at the web address https://keras.io/examples/rl/actor_critic_cartpole/
    state = environ.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, steps_per_epi): 
            environ.render()
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)
            action_probs, critic_value = model(state)
            
            # The actor is tasked with predicting action probabilities
            # from the 'state' of the acrobot, provided by openai gym acrobot-v environment.
            # The critic is given the same state, and asked to approximate the value function.
            # We create histories for each here.
            
            action = np.random.choice(n_act, p=np.squeeze(action_probs))# Sample action from action probability distribution
            action_probs_history.append(tf.math.log(action_probs[0, action]))
            critic_value_history.append(critic_value[0, 0])

            
            state, reward, done, _ = environ.step(action) # Apply the sampled action in our environment
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break
        

        # Update running reward to define condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward 


        # At each timestep what was the total reward received after that timestep.
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]: 
            discounted_sum = r + discount_rate * discounted_sum
            returns.append(discounted_sum)  # Not returns.insert(0, discounted_sum)

        # Normalize returns to be compatible with critic predictions. Also taken directly from Nandan's example.
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()
        #print(returns)

        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_score = []
        critic_losses = []
        for log_prob, value, retn in history:
            # The critic estimation of the state value function is `value`. 
            # We take an action determined by the actors policy structure and end up recieving a return `retn`.
            diff = retn - value # The state value function does not contribute in expectation
            actor_score.append(log_prob * diff)  # The actor score is thus the gradient of log_prob. 
                                                 # The policy gradient is the expect value of this gradient multiplied
                                                 # by diff. We approximate the policy gradient by sampling

            # The critic peforms policy evaluation on the policy chosen by the actor.
            critic_losses.append(
                loss_fn(tf.expand_dims(value, 0), tf.expand_dims(retn, 0)) 
            )

        # Backpropagation 
        loss_value = sum(actor_score) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Print human readable development
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))
        

    if running_reward > -115:  # Defined criterion for approximately optimal policy
        print("Solved at episode {}!".format(episode_count))
        break

2021-09-28 09:57:57.184 Python[5412:654874] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to (null)


running reward: -448.18 at episode 10
running reward: -348.69 at episode 20
running reward: -321.77 at episode 30
running reward: -286.32 at episode 40
running reward: -257.63 at episode 50
running reward: -211.34 at episode 60
running reward: -174.77 at episode 70
running reward: -149.82 at episode 80
running reward: -141.32 at episode 90
running reward: -132.54 at episode 100
running reward: -127.84 at episode 110
running reward: -129.59 at episode 120
running reward: -132.98 at episode 130
running reward: -138.36 at episode 140
running reward: -139.45 at episode 150
running reward: -133.11 at episode 160
running reward: -132.04 at episode 170
running reward: -141.34 at episode 180
running reward: -139.12 at episode 190
running reward: -143.84 at episode 200
running reward: -154.06 at episode 210
running reward: -152.80 at episode 220
running reward: -145.29 at episode 230
running reward: -133.94 at episode 240
running reward: -141.76 at episode 250
running reward: -134.78 at episode

In [5]:
# Create this function to test the agent below
def one_step(environ, state):
    state = tf.convert_to_tensor(state)
    state = tf.expand_dims(state, 0)
    action_probs, critic_value = model(state)
    action = np.random.choice(n_act, p=np.squeeze(action_probs))
    state, reward, done, _ = environ.step(action) 
    environ.render()
    return state, reward, done, _


In [6]:
# Let's test the agent. Take 1. See READ_ME file in the Deep Actor Critic repository
# Test 1
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

97


In [7]:
# Test 2
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

121


In [8]:
# Test 3
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

100


In [9]:
# Test 4
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step)

84


In [13]:
# Test 4
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step)

110
