In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [2]:
# Choose the 'Acrobot-v1' environment
environ = gym.make('Acrobot-v1')
environ.seed(1)
observe = environ.reset()

In [3]:
keras.backend.clear_session()
tf.random.set_seed(1)
np.random.seed(1)

in_shape = environ.observation_space.shape[0]
n_act = environ.action_space.n


inputs = layers.Input(shape=(in_shape,))
hid_act = layers.Dense(128, activation="relu")(inputs) 
hid_crit = layers.Dense(128, activation="relu")(inputs) 
actor = layers.Dense(n_act, activation="softmax")(hid_act)
critic = layers.Dense(1)(hid_crit) 

model = keras.Model(inputs=inputs, outputs=[actor, critic])


2021-10-06 12:17:12.294025: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
discount_rate = 0.99 # .99 Discount_Factor/Gamma
optimizer = keras.optimizers.Adam(learning_rate=1e-3) # Try ADAM optimizer
loss_fn = keras.losses.mean_absolute_error # loss function
episode_count = 0
steps_per_epi = 500 #max 
running_reward = -steps_per_epi +1
while True:  
    state = environ.reset()
    episode_reward = 0
    with tf.GradientTape(persistent=True) as tape:
        for timestep in range(1, steps_per_epi-1): 
            environ.render()
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)
            action_probs, critic_value = model(state)
            
            
            action = np.random.choice(n_act, p=np.squeeze(action_probs)) # The weights of the Neural Network
                                                                         # will modify the distribution determining
                                                                         # our policy

            next_state, reward, done, _ = environ.step(action)  # We take an action determined by the actors policy 
                                                                # structure and end up recieving a reward
            
            state_hold = next_state
            next_state = tf.convert_to_tensor(next_state)       
            next_state = tf.expand_dims(next_state, 0)
            
            
            next_action, next_critic_value = model(next_state)  # Sample an action from our estimation of the policy
                                                                # and value approximators
            
            TD_target = (reward + (1 - done) * discount_rate * next_critic_value) #TD target
            
            critic_loss = loss_fn(tf.expand_dims(critic_value, 0), tf.expand_dims(TD_target, 0)) # Critic loss
            
            diff = next_critic_value[0,0] - critic_value[0,0]
            actor_score = -tf.math.log(action_probs[0, action])*diff 
                                                                # The actual actor score is the gradient of 'actor_score'. 
                                                                # The policy gradient is the expect value of 
                                                                # this gradient multiplied by diff.
            loss_value = actor_score + critic_loss
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            
            state = state_hold

            episode_reward += reward

            if done:
                break
        


        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward 


 
    # Print human readable development
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))
        

    if running_reward > -115:  # Defined criterion for approximately optimal policy
        print("Solved at episode {}!".format(episode_count))
        break

2021-10-06 12:17:12.917 Python[22343:1652883] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to (null)


running reward: -498.60 at episode 10
running reward: -406.69 at episode 20
running reward: -273.48 at episode 30
running reward: -203.99 at episode 40
running reward: -157.23 at episode 50
running reward: -128.87 at episode 60
running reward: -117.04 at episode 70
Solved at episode 72!


In [5]:
# Create this function to test the agent below
def one_step(environ, state):
    state = tf.convert_to_tensor(state)
    state = tf.expand_dims(state, 0)
    action_probs, critic_value = model(state)
    action = np.random.choice(n_act, p=np.squeeze(action_probs))
    state, reward, done, _ = environ.step(action) 
    environ.render()
    return state, reward, done, _


In [6]:
# Let's test the agent. Take 1. See READ_ME file in the Deep Actor Critic repository
# Test 1
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

102


In [7]:
# Test 2
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

89


In [11]:
# Test 3
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step) 

76


In [9]:
# Test 4
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step)

83


In [10]:
# Test 4
observe = environ.reset()
steps_per_test = 500
for step in range(1,steps_per_test):
    observe, reward, done, info = one_step(environ, observe)
    if done:
        break
    environ.render()
print(step)

93
