# Tabular Q-learning using a linear model in TF2
## Christian Igel, 2019

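This example implements tabular Q-learning via a linear model and applies it to simple gridworlds. States are fed to the model as one-hot vectors, so the weight matrix of a single bias-free linear layer is exactly the Q-table (one row per state, one column per action). If you have suggestions for improvement, [let me know](mailto:igel@diku.dk).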

In [2]:
import tensorflow as tf

import gym
import gym_gridworlds  # pip install gym-gridworlds

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [4]:
# Pick the gridworld to solve (swap the comment to switch environments)
env_name = 'CliffWalking-v0'
# env_name = 'FrozenLake-v0'
env = gym.make(env_name)

# Show the grid layout
env.render()

# `.n` is the size of each space; these fix the input/output width of the model
number_of_states = env.observation_space.n
number_of_actions = env.action_space.n
print("|S| =", number_of_states)
print("|A| =", number_of_actions)

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

|S| = 48
|A| = 4


In [6]:
# Display the environment object's repr to see which gym class was instantiated
env

<gym.envs.toy_text.cliffwalking.CliffWalkingEnv at 0x1731b968278>

In [16]:
# Define linear model: the state is fed in as a one-hot vector, so a bias-free
# dense layer with |A| outputs acts as a lookup table -- row s of its kernel
# holds the Q-values of state s. Weights start as small values in [0, 0.01].
x = tf.keras.Input(shape=(number_of_states,))  # input state (one-hot)
y = tf.keras.layers.Dense(number_of_actions, activation=None, use_bias=False,
                          kernel_initializer=tf.keras.initializers.RandomUniform(0, 0.01))(x)

# Instantiate model
model = tf.keras.Model(inputs=x, outputs=y)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 48)]              0         
_________________________________________________________________
dense (Dense)                (None, 4)                 192       
Total params: 192
Trainable params: 192
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
eta = 0.1  # (initial) learning rate
optimizer = tf.keras.optimizers.SGD(learning_rate=eta)


@tf.function
def compute_gradient(x, Q_target):
    """Gradient of the mean squared error between model(x) and Q_target.

    Args:
        x: batch of model inputs (the training loop passes a single one-hot
            state row).
        Q_target: target values with the same shape as the output of model(x).

    Returns:
        The gradients of the loss with respect to `model.trainable_variables`,
        in the same order as that list.
    """
    # NOTE: the parameter `x` shadows the notebook-level Keras input tensor `x`.
    with tf.GradientTape() as tape:
        residual = model(x) - Q_target
        # The mean runs over all entries of the output, not only the action taken
        loss = tf.math.reduce_mean(tf.square(residual))
    return tape.gradient(loss, model.trainable_variables)

In [None]:
# Set learning parameters
gamma = 1. #.99  # discount factor
initial_epsilon = epsilon = 0.1  # epsilon for epsilon-greedy selection
number_of_episodes = 2000
max_number_of_steps = 100
T_list = []  # number of steps taken in each episode
R_list = []  # return (accumulated reward) of each episode


def one_hot(state):
    """Return the 1 x |S| float32 row vector with a single 1 at index `state`."""
    return np.eye(1, number_of_states, state, dtype=np.float32)


for i in tqdm(range(number_of_episodes)):
    s = env.reset()  # reset environment and get first state
    R = 0  # return (accumulated reward)
    for t in range(max_number_of_steps):  # maximum number of steps
        # Choose an action greedily (with epsilon chance of a random action)
        x_s = one_hot(s)  # encoded once, reused for the gradient step below
        Q = model(x_s)
        a = np.argmax(Q, 1)  # best action
        if np.random.rand() < epsilon:
            a[0] = env.action_space.sample()
        # Observe new state and reward from environment
        s_prime, r, d, info = env.step(a[0])
        # Compute Q' by feeding the new state into the network
        Q_prime = model(one_hot(s_prime))
        max_Q_prime = np.max(Q_prime)
        # A terminal state has no future return, so the target is just r there.
        # If a time-limit wrapper ended the episode and flags this in `info`,
        # s_prime is not really terminal and we keep bootstrapping.
        terminal = d and not info.get('TimeLimit.truncated', False)
        # Only the entry of the chosen action differs from the prediction, so
        # the squared error is non-zero for that action alone.
        Q_target = Q.numpy()
        Q_target[0, a[0]] = r if terminal else r + gamma * max_Q_prime
        # Train network using target and predicted Q values
        gradients = compute_gradient(x_s, Q_target)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        R += r
        s = s_prime
        if d:  # episode ended
            break
    # Reduce probability of random actions over time. Done after every episode
    # (not only those that reached a terminal state) so the schedule also
    # advances when an episode is cut off by max_number_of_steps.
    epsilon = 1./((i/50) + (1./initial_epsilon))
    T_list.append(t + 1)  # t is the 0-based index of the last step
    R_list.append(R)

In [92]:
# Scratch check of the target construction: copy the current Q-values and
# overwrite the chosen action's entry, using the literals 1 and 1 in place of
# the reward and discount. Needs Q, a and max_Q_prime to already exist in the
# kernel (they are set by the training loop / other scratch cells).
Q_target = Q.numpy()
Q_target[0, a[0]] = 1 + 1 * max_Q_prime

In [95]:
# Show the current Q-values as a NumPy array (needs Q from an earlier cell)
Q.numpy()

array([[0.00691037, 0.00763018, 0.0064079 , 0.00580551]], dtype=float32)

In [93]:
# Show the current contents of Q_target (needs Q_target from an earlier cell)
Q_target

array([[0.00691037, 1.0076302 , 0.0064079 , 0.00580551]], dtype=float32)

In [91]:
# Compare the successor-state Q-values and their maximum with the current
# Q-values (all three names must already exist in the kernel)
print(Q_prime)
print(max_Q_prime)
Q.numpy()

tf.Tensor([[0.00691037 0.00763018 0.0064079  0.00580551]], shape=(1, 4), dtype=float32)
0.0076301834


array([[0.00691037, 0.00763018, 0.0064079 , 0.00580551]], dtype=float32)

In [96]:
# Reset the environment (this overwrites `s`) and show the one-hot row vector
# that encodes the returned start state
s = env.reset()
np.eye(1, number_of_states, s, dtype=np.float32)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)

In [76]:
# Forward pass for the current state `s` (this overwrites `Q`); the result is
# a 1 x |A| tensor of Q-values
Q = model(np.eye(1, number_of_states, s, dtype=np.float32))
Q


<tf.Tensor: id=51, shape=(1, 4), dtype=float32, numpy=array([[0.00691037, 0.00763018, 0.0064079 , 0.00580551]], dtype=float32)>

In [50]:
# Draw one uniform random number; note the result is a length-1 array, not a scalar
np.random.rand(1)

array([0.25323255])

In [77]:
# Index of the largest Q-value along axis 1, i.e. the greedy action for the
# state that produced Q (this overwrites `a`); the result is a length-1 array
a = np.argmax(Q, 1)
a

array([1], dtype=int64)

In [None]:
if env_name == 'FrozenLake-v0':
    # Assumes a successful FrozenLake episode has return 1 and a failed one 0,
    # so sum(R_list) counts successes -- TODO confirm for the gym version used.
    # Scaled by 100 so the printed number matches the "Percent" label.
    print("Percent of successful episodes:", 100 * sum(R_list) / number_of_episodes)

# Return collected in each episode
fig, ax = plt.subplots()
ax.plot(R_list, 'g.')
ax.set(xlabel="episode", ylabel="return (R_list)", title="Return per episode")
plt.show()

# Length of each episode as recorded in T_list
fig, ax = plt.subplots()
ax.plot(T_list, 'b.')
ax.set(xlabel="episode", ylabel="episode length (T_list)", title="Episode length")
plt.show()

In [None]:
env.render()

# get_weights() returns a list; the model has a single bias-free layer, so its
# only entry is the |S| x |A| kernel, i.e. the learned Q-table.
weights = model.get_weights()
q_table = weights[0]

# Grid layout (rows, columns) used to arrange the per-state values for printing
grid_shapes = {'FrozenLake-v0': (4, 4), 'CliffWalking-v0': (4, 12)}

if env_name in grid_shapes:
    grid = grid_shapes[env_name]
    # State value = best Q-value in each row; greedy action = its column index
    print("V:\n", np.around(q_table.max(axis=1).reshape(grid), decimals=1))
    print("actions:\n", q_table.argmax(axis=1).reshape(grid))