# Temporal-difference Learning

In [1]:
import gym
import gym_gridworlds
import numpy
from matplotlib import pyplot
import temporal_difference

## Windy Gridworld

In [2]:
env = gym.make('WindyGridworld-v0')

# Baseline: act uniformly at random until the environment reports `done`,
# then print how many steps the episode took.
observation = env.reset()
t = 0  # number of environment steps taken so far
while True:
    action = env.action_space.sample()  # take a random action
    observation, reward, done, info = env.step(action)
    # Count the step that was just taken *before* checking for termination,
    # so the terminal step is included (matches the `t + 1` reporting used
    # by the policy rollouts further down).
    t += 1
    if done:
        print('episode finished after {:>2} time steps'.format(t))
        break

episode finished after 12742 time steps


In [3]:
# Display the environment's observation space and the size of its discrete action space.
env.observation_space, env.action_space.n

(Tuple(Discrete(7), Discrete(10)), 4)

## Sarsa

In [4]:
# Learn action values for the windy gridworld with Sarsa.
# NOTE(review): the second argument (1000) is presumably the number of
# training episodes — confirm against temporal_difference.sarsa.
Q = temporal_difference.sarsa(env, 1000)

In [5]:
# Turn the learned action values into a policy and display it. The rollout
# below indexes it by state to obtain the action to take.
# NOTE(review): presumably greedy with respect to Q — confirm in get_policy.
policy = temporal_difference.get_policy(env, Q)
policy

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
       [1, 1, 1, 1, 1, 1, 1, 1, 0, 2],
       [0, 1, 1, 0, 1, 1, 1, 1, 1, 2],
       [1, 1, 1, 1, 1, 1, 1, 0, 1, 2],
       [1, 2, 0, 1, 0, 1, 0, 2, 3, 3],
       [1, 1, 1, 1, 1, 0, 0, 2, 3, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 3, 3]])

In [6]:
# Follow `policy` from the state returned by env.reset() for at most
# `max_steps` steps, printing every transition together with Q[S, A].
max_steps = 16  # cap so that a policy that never terminates cannot loop forever
S = env.reset()
G = 0  # running (undiscounted) sum of rewards
for t in range(max_steps):
    A = policy[S]
    S_, R, done, info = env.step(A)
    G += R
    print('state {}, action {}, reward {}, action-value {}'.format(S, A, R, Q[S, A]))
    S = S_
    if done:
        print('finished after {:>2} time steps with total reward {}'.format(t + 1, G))
        break
else:
    # The loop ran out of steps without `done`: report the truncation
    # explicitly instead of ending the output without any summary line.
    print('not finished after {:>2} time steps, total reward so far {}'.format(max_steps, G))

state (3, 0), action 1, reward -1, action-value -17.806889784101024
state (3, 1), action 1, reward -1, action-value -16.361138575662025
state (3, 2), action 1, reward -1, action-value -14.8279874296743
state (3, 3), action 1, reward -1, action-value -13.488620801138975
state (2, 4), action 1, reward -1, action-value -12.310457060268552
state (1, 5), action 1, reward -1, action-value -11.129805517575994
state (0, 6), action 1, reward -1, action-value -11.220947653031317
state (0, 7), action 1, reward -1, action-value -9.325163649843983
state (0, 8), action 1, reward -1, action-value -7.368076200561912
state (0, 9), action 2, reward -1, action-value -6.390252478220537
state (1, 9), action 2, reward -1, action-value -5.34191090902227
state (2, 9), action 2, reward -1, action-value -4.1665858070673885
state (3, 9), action 2, reward -1, action-value -3.0331458939283564
state (4, 9), action 3, reward -1, action-value -2.0000000000014015
state (4, 8), action 3, reward -1, action-value -1.0
finished after 15 time steps with total reward -15

## Q-learning

In [7]:
# Second environment, used below to compare the policies found by Q-learning and Sarsa.
cliff = gym.make('Cliff-v0')

In [8]:
# Run Q-learning on the cliff environment, then display the policy derived
# from the resulting action values.
# NOTE(review): 200 is presumably the number of training episodes — confirm
# against temporal_difference.q_learning.
cliff_Q_q_learning = temporal_difference.q_learning(cliff, 200)
temporal_difference.get_policy(cliff, cliff_Q_q_learning)

array([[0, 1, 1, 0, 1, 3, 1, 1, 2, 1, 2, 2],
       [0, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [9]:
# Run Sarsa on the cliff environment, then display the policy derived from
# the resulting action values.
# NOTE(review): 200 is presumably the number of training episodes — confirm
# against temporal_difference.sarsa.
cliff_Q_sarsa = temporal_difference.sarsa(cliff, 200)
temporal_difference.get_policy(cliff, cliff_Q_sarsa)

array([[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2],
       [1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 2],
       [1, 0, 0, 0, 1, 1, 0, 3, 0, 1, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [10]:
# Re-learn the windy gridworld action values with Q-learning and derive a policy from them.
# NOTE(review): 200 is presumably the number of training episodes — confirm
# against temporal_difference.q_learning.
Q = temporal_difference.q_learning(env, 200)
policy = temporal_difference.get_policy(env, Q)

# Follow `policy` from the state returned by env.reset() for at most
# `max_steps` steps, printing every transition together with Q[S, A].
max_steps = 16  # cap so that a policy that never terminates cannot loop forever
S = env.reset()
G = 0  # running (undiscounted) sum of rewards
for t in range(max_steps):
    A = policy[S]
    S_, R, done, info = env.step(A)
    G += R
    print('state {}, action {}, reward {}, action-value {}'.format(S, A, R, Q[S, A]))
    S = S_
    if done:
        print('finished after {:>2} time steps with total reward {}'.format(t + 1, G))
        break
else:
    # The loop ran out of steps without `done`: report the truncation
    # explicitly instead of ending the output without any summary line.
    print('not finished after {:>2} time steps, total reward so far {}'.format(max_steps, G))

state (3, 0), action 1, reward -1, action-value -15.0
state (3, 1), action 1, reward -1, action-value -14.0
state (3, 2), action 1, reward -1, action-value -13.0
state (3, 3), action 1, reward -1, action-value -12.0
state (2, 4), action 1, reward -1, action-value -11.0
state (1, 5), action 1, reward -1, action-value -10.0
state (0, 6), action 1, reward -1, action-value -9.0
state (0, 7), action 1, reward -1, action-value -8.0
state (0, 8), action 1, reward -1, action-value -7.0
state (0, 9), action 2, reward -1, action-value -6.0
state (1, 9), action 2, reward -1, action-value -5.0
state (2, 9), action 2, reward -1, action-value -4.0
state (3, 9), action 2, reward -1, action-value -3.0
state (4, 9), action 3, reward -1, action-value -2.0
state (4, 8), action 3, reward -1, action-value -1.0
finished after 15 time steps with total reward -15
