# Temporal-difference Learning

In [1]:
import gym
import gym_gridworlds
import numpy
from collections import defaultdict
from matplotlib import pyplot

## Windy Gridworld

In [2]:
env = gym.make('WindyGridworld-v0')

# Roll out a single episode with a uniformly random policy and report how
# many environment steps it took to reach a terminal state.
observation = env.reset()
t = 0  # number of env.step() calls made so far
done = False
while not done:
    action = env.action_space.sample()  # take a random action
    observation, reward, done, info = env.step(action)
    # Count the step that was just taken, including the terminal one, so the
    # reported length matches the number of actions actually executed.
    t += 1
print('episode finished after {:>2} time steps'.format(t))

episode finished after 12742 time steps


## Sarsa

In [3]:
def epsilon_greedy_policy(env, S, Q, epsilon):
    """Pick an action for state ``S`` epsilon-greedily with respect to ``Q``.

    A uniform random number is drawn first; when it falls below ``epsilon``
    the action comes from ``env.action_space.sample()``. Otherwise the index
    of the largest ``Q[S, A]`` over ``A in range(env.action_space.n)`` is
    returned (``numpy.argmax`` picks the first index on ties).

    Parameters
    ----------
    env : object exposing ``action_space.n`` and ``action_space.sample()``.
    S : hashable state, used together with the action as the key into ``Q``.
    Q : mapping indexed as ``Q[S, A]``.
    epsilon : exploration threshold compared against ``numpy.random.rand()``.
    """
    explore = numpy.random.rand() < epsilon
    if explore:
        return env.action_space.sample()
    action_values = [Q[S, A] for A in range(env.action_space.n)]
    return numpy.argmax(action_values)

In [4]:
def sarsa(env, n_episodes, gamma=1.0, alpha=0.5, epsilon=0.1):
    """Learn action values with the Sarsa update rule.

    For each of ``n_episodes`` episodes the environment is reset and actions
    are chosen with ``epsilon_greedy_policy``. After every step the value of
    the visited pair is moved towards ``R + gamma * Q[S', A']`` where ``A'``
    is the action that will actually be taken next.

    Parameters
    ----------
    env : environment with ``reset()`` and a ``step(A)`` that returns a
        4-tuple ``(next_state, reward, done, info)``.
    n_episodes : number of episodes to run.
    gamma : discount factor applied to the next action value.
    alpha : step size of the update.
    epsilon : exploration rate passed to ``epsilon_greedy_policy``.

    Returns
    -------
    ``defaultdict(float)`` keyed by ``(state, action)``.
    """
    Q = defaultdict(float)
    for _ in range(n_episodes):
        S = env.reset()
        A = epsilon_greedy_policy(env, S, Q, epsilon)
        done = False
        while not done:
            S_prime, R, done, _ = env.step(A)
            # The next action is selected before updating, also on the final
            # step of the episode.
            A_prime = epsilon_greedy_policy(env, S_prime, Q, epsilon)
            current = Q[S, A]
            td_error = R + gamma * Q[S_prime, A_prime] - current
            Q[S, A] = current + alpha * td_error
            S, A = S_prime, A_prime
    return Q

# Train Sarsa on the windy gridworld for 1000 episodes with the default
# gamma, alpha and epsilon.
Q = sarsa(env, 1000)

In [5]:
def get_policy(env, Q):
    """Build the greedy policy grid for a gridworld from action values.

    Parameters
    ----------
    env : environment exposing ``height``, ``width`` and ``action_space.n``.
    Q : mapping indexed as ``Q[(i, j), A]`` for grid cell ``(i, j)`` and
        action ``A``.

    Returns
    -------
    Integer ``numpy`` array of shape ``(env.height, env.width)`` whose entry
    at ``(i, j)`` is the action index with the largest ``Q[(i, j), A]``
    (first index on ties).
    """
    # Builtin ``int`` replaces the ``numpy.int`` alias, which no longer exists
    # in current NumPy releases; the resulting dtype is the same.
    policy = numpy.zeros((env.height, env.width), dtype=int)
    for i in range(env.height):
        for j in range(env.width):
            S = i, j
            policy[S] = numpy.argmax([Q[S, A] for A in range(env.action_space.n)])
    return policy

# Extract the greedy action for every grid cell from the learned action values.
policy = get_policy(env, Q)
policy  # bare expression so the notebook displays the array

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
       [1, 1, 1, 1, 0, 1, 2, 2, 1, 2],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 2],
       [1, 2, 1, 1, 1, 1, 1, 0, 1, 2],
       [1, 1, 1, 1, 1, 1, 0, 2, 3, 3],
       [1, 1, 1, 0, 1, 0, 0, 2, 1, 3],
       [1, 0, 1, 1, 0, 0, 0, 0, 2, 3]])

In [6]:
# Follow the greedy policy from the start state, printing each transition
# together with the learned value of the action taken.
max_steps = 16  # step budget for the rollout
S = env.reset()
G = 0  # sum of rewards collected so far
for t in range(max_steps):
    A = policy[S]
    S_, R, done, info = env.step(A)
    G += R
    print('state {}, action {}, reward {}, action-value {}'.format(S, A, R, Q[S, A]))
    S = S_
    if done:
        print('finished after {:>2} time steps with total reward {}'.format(t + 1, G))
        break
else:
    # The budget ran out before a terminal state was reached; say so instead
    # of ending silently with a partial return.
    print('not finished after {:>2} time steps, total reward so far {}'.format(max_steps, G))

state (3, 0), action 1, reward -1, action-value -17.812312874384325
state (3, 1), action 2, reward -1, action-value -17.62016640750423
state (4, 1), action 1, reward -1, action-value -16.861268396427402
state (4, 2), action 1, reward -1, action-value -15.00778038879664
state (4, 3), action 1, reward -1, action-value -14.274361374235335
state (3, 4), action 1, reward -1, action-value -13.4915711445933
state (2, 5), action 1, reward -1, action-value -11.988767881309782
state (1, 6), action 2, reward -1, action-value -12.166049344610265
state (0, 6), action 1, reward -1, action-value -10.374973260394551
state (0, 7), action 1, reward -1, action-value -9.853977738865236
state (0, 8), action 1, reward -1, action-value -8.40435910993105
state (0, 9), action 2, reward -1, action-value -7.8481918978849565
state (1, 9), action 2, reward -1, action-value -5.10122422835186
state (2, 9), action 2, reward -1, action-value -4.003975206053289
state (3, 9), action 2, reward -1, action-value -3.0002978

## Q-learning

In [7]:
# Second environment, used by the Q-learning and Sarsa policy comparisons below.
cliff = gym.make('Cliff-v0')

In [8]:
def q_learning(env, n_episodes, gamma=1.0, alpha=0.5, epsilon=0.1):
    """Learn action values with the Q-learning update rule.

    Actions are chosen with ``epsilon_greedy_policy``, but each update moves
    ``Q[S, A]`` towards ``R + gamma * max_a Q[S', a]``, i.e. the target uses
    the best next action value rather than the action taken next.

    Parameters
    ----------
    env : environment with ``reset()``, ``action_space.n`` and a ``step(A)``
        that returns a 4-tuple ``(next_state, reward, done, info)``.
    n_episodes : number of episodes to run.
    gamma : discount factor applied to the best next action value.
    alpha : step size of the update.
    epsilon : exploration rate passed to ``epsilon_greedy_policy``.

    Returns
    -------
    ``defaultdict(float)`` keyed by ``(state, action)``.
    """
    Q = defaultdict(float)

    for _ in range(n_episodes):
        S = env.reset()
        done = False
        while not done:
            A = epsilon_greedy_policy(env, S, Q, epsilon)
            S_prime, R, done, _ = env.step(A)
            # A distinct loop name keeps the chosen action ``A`` unambiguous.
            next_values = [Q[S_prime, a] for a in range(env.action_space.n)]
            td_target = R + gamma * numpy.max(next_values)
            Q[S, A] += alpha * (td_target - Q[S, A])
            S = S_prime

    return Q

# Greedy policy grid after 200 episodes of Q-learning on the cliff environment.
get_policy(cliff, q_learning(cliff, 200))

array([[0, 2, 1, 1, 0, 2, 1, 1, 1, 1, 1, 2],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [9]:
# Greedy policy grid after 200 episodes of Sarsa on the same cliff environment.
get_policy(cliff, sarsa(cliff, 200))

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
       [1, 1, 1, 1, 1, 1, 0, 0, 1, 2, 1, 2],
       [0, 0, 1, 0, 0, 3, 3, 1, 0, 1, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [10]:
# TODO: plot the sum of rewards per episode
# (see figure 6.4 in Sutton's book)

In [11]:
# Learn action values on the windy gridworld with Q-learning (200 episodes)
# and derive the greedy policy grid from them.
Q = q_learning(env, 200)
policy = get_policy(env, Q)

# Follow the greedy policy from the start state, printing each transition
# together with the learned value of the action taken.
max_steps = 16  # step budget for the rollout
S = env.reset()
G = 0  # sum of rewards collected so far
for t in range(max_steps):
    A = policy[S]
    S_, R, done, info = env.step(A)
    G += R
    print('state {}, action {}, reward {}, action-value {}'.format(S, A, R, Q[S, A]))
    S = S_
    if done:
        print('finished after {:>2} time steps with total reward {}'.format(t + 1, G))
        break
else:
    # The budget ran out before a terminal state was reached; say so instead
    # of ending silently with a partial return.
    print('not finished after {:>2} time steps, total reward so far {}'.format(max_steps, G))

state (3, 0), action 1, reward -1, action-value -15.0
state (3, 1), action 1, reward -1, action-value -14.0
state (3, 2), action 1, reward -1, action-value -13.0
state (3, 3), action 1, reward -1, action-value -12.0
state (2, 4), action 1, reward -1, action-value -11.0
state (1, 5), action 1, reward -1, action-value -10.0
state (0, 6), action 1, reward -1, action-value -9.0
state (0, 7), action 1, reward -1, action-value -8.0
state (0, 8), action 1, reward -1, action-value -7.0
state (0, 9), action 2, reward -1, action-value -6.0
state (1, 9), action 2, reward -1, action-value -5.0
state (2, 9), action 2, reward -1, action-value -4.0
state (3, 9), action 2, reward -1, action-value -3.0
state (4, 9), action 3, reward -1, action-value -2.0
state (4, 8), action 3, reward -1, action-value -1.0
finished after 15 time steps with total reward -15
