In [1]:
import sys
import os
import numpy as np
import sklearn
import tensorflow as tf
from tensorflow import keras
import gym

In [2]:
# Choose the 'Acrobot-v1' environment in openai gym
environ = gym.make('Acrobot-v1')

In [3]:
environ.seed(1)
observe = environ.reset()
print(observe) # Represents the 2-d positions of joint-nodes and angular velocities 
print(environ.action_space) # 0,1 or 2. Torque left/right/none

[ 0.99811082  0.06143937  0.99999579  0.00290001 -0.06177637 -0.06262504]
Discrete(3)


In [4]:
action = environ.action_space.sample() # any discrete action has a reward of -1
observe, reward, done, info = environ.step(action)
print(reward)

-1.0


In [5]:
# A deep neural net that will perform online greedy policy improvement.
# I chose 2 hidden layers, corresponding to the behavior and target policies, just to get the ball rolling.
keras.backend.clear_session()
environ.seed(1)
tf.random.set_seed(1)
np.random.seed(1)

in_shape = [environ.observation_space.shape[0]]
n_out = environ.action_space.n

model = keras.models.Sequential([
    keras.layers.Dense(32, activation="relu", input_shape=in_shape),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(n_out, activation="softmax")
])

2021-09-22 10:48:24.393945: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Setting up epsilon greedy (off policy) Q-Learning by creating a function that takes a uniformly random action with 
# probability epsilon. Otherwise it takes the action determined by greedy policy improvement.
def epsilon_greedy(state, epsilon=0):
    if np.random.rand() < epsilon: 
        return np.random.randint(n_out)
    else:
        Q_values = model.predict(state[np.newaxis])
        return np.argmax(Q_values[0])

In [7]:
# Here we create a linked list, or a deque, which will store the experience of our agent. We will perform stochastic 
# gradient descent given a sample of a fixed size from this deque of experiences. We call this a 'replay buffer'
from collections import deque
rep_buff = deque(maxlen=2000)

In [8]:
# A function that samples from the deque of experiences. Experience replay! This 'batch method' avoids some 
# of the pitfalls of value function approximation of classical Q-Learning.
def sample_experiences(batch_size):
    indices = np.random.randint(len(rep_buff), size=batch_size)
    batch = [rep_buff[k] for k in indices]
    states, actions, rewards, next_states, dones = [
        np.array([experience[j] for experience in batch])
        for j in range(5)]
    return states, actions, rewards, next_states, dones

In [9]:
# A function that allows the agent to peform one step, using our previously defined epsilon_greedy function.
def one_step(environ, state, epsilon):
    action = epsilon_greedy(state, epsilon)
    next_state, reward, done, info = environ.step(action)
    rep_buff.append((state, action, reward, next_state, done))
    return next_state, reward, done, info

In [10]:
# We set the sample size of experiences at 32. We chose a discount_rate of .99 (the future is important for the acrobot!)
# Following Geron's notes, we chose a learning_rate of 1/100 for the Adam optimizer. 
batch_size = 32
discount_rate = 0.99
optimizer = keras.optimizers.Adam(learning_rate=1e-2)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)
    max_next_Q_values = np.max(model.predict(next_states), axis=1)
    target_Q_values = (rewards + (1 - dones) * discount_rate * max_next_Q_values)
    target_Q_values = target_Q_values.reshape(-1, 1)
    mask = tf.one_hot(actions, n_out)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [11]:
# Training phase. epsilon will start near 1 then about half-way through the episodes be around .01.
# I wanted each episode to have enough steps (N_steps) such that the agent had sufficient time to 'learn'.

N_steps = 300
N_episodes = 500
n_steps = N_steps # initialize n_steps
for episode in range(N_episodes):
    observe = environ.reset()
    for step in range(N_steps):
        epsilon = max(1 - 1.5*episode/N_episodes, 0.01)
        observe, reward, done, info = one_step(environ, observe, epsilon)
        if done:
            break
        environ.render()
    print("\rEpisode: {}, Steps: {}, epsilon: {:.3f}".format(episode, step + 1, epsilon), end="")
    if step < n_steps: 
        best_weights = model.get_weights() 
        n_steps = step
        best_epsilon = epsilon
    if done and step < N_steps - 1: 
        print(" It won!")
    if episode > 50:
        training_step(batch_size)

model.set_weights(best_weights)

2021-09-22 10:48:24.880 Python[11030:2830968] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to (null)


Episode: 2, Steps: 300, epsilon: 0.994

2021-09-22 10:48:42.628569: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Episode: 127, Steps: 260, epsilon: 0.619 It won!
Episode: 128, Steps: 267, epsilon: 0.616 It won!
Episode: 131, Steps: 268, epsilon: 0.607 It won!
Episode: 155, Steps: 226, epsilon: 0.535 It won!
Episode: 174, Steps: 208, epsilon: 0.478 It won!
Episode: 499, Steps: 300, epsilon: 0.010

In [33]:
# Let's test the trained policy. Take 1. See 'Test_Run_1.gif' file in repository or READ_ME
observe = environ.reset()
for step in range(N_steps):
    observe, reward, done, info = one_step(environ, observe, 0)
    if done:
        break
    environ.render()
print(step) 


170


In [34]:
# Take 2. See 2nd .gif file in repository
observe = environ.reset()
for step in range(N_steps):
    observe, reward, done, info = one_step(environ, observe, 0)
    if done:
        break
    environ.render()
print(step) 


197


In [35]:
# Take 3
observe = environ.reset()
for step in range(N_steps):
    observe, reward, done, info = one_step(environ, observe, 0)
    if done:
        break
    environ.render()
print(step) 


272


In [36]:
# Take 4
observe = environ.reset()
for step in range(400):
    observe, reward, done, info = one_step(environ, observe, 0)
    if done:
        break
    environ.render()
print(step) 


145
