In [1]:
import numpy as np
# MDP definition: a small chain of states that the agent walks left or right.
S = [0, 1, 2, 3]  # states
A = [0, 1]  # actions: 0 moves one state down the chain, 1 moves one state up
T = {}  # We'll assume T is deterministic for now: T[s, a] -> next state.
R = {0: -1, 1: 0, 2: 0, 3: 1}  # reward associated with each state

# Signed step taken by each action. Looking the action up here (instead of an
# if/elif without a fallback) makes an unknown action fail loudly with a
# KeyError rather than silently reusing the previous iteration's s_prime.
ACTION_STEP = {0: -1, 1: +1}

# Chain limits are derived from S so the table stays valid if S is resized.
s_min, s_max = min(S), max(S)

# Fill in the transition table; moves past either end are clamped, so the
# agent stays where it is when it walks into a boundary.
for s in S:
    for a in A:
        s_prime = min(s_max, max(s_min, s + ACTION_STEP[a]))
        T[s, a] = s_prime

In [2]:
import time

time_steps = 10
s0 = 0
s = s0
score = 0

# Simulate random agent.
for step in range(time_steps):
    time.sleep(0.5)  # slow the loop down so the trajectory can be watched live
    a = np.random.choice(A)
    s = T[s, a]
    # One-hot view of the chain marking the agent's current state. Named
    # `position` rather than `map` so the builtin map() is not shadowed for
    # the rest of the notebook.
    position = np.zeros(len(S))
    position[s] = 1
    score += R[s]  # reward is taken from the state the agent arrived in
    print(position, score)

[1. 0. 0. 0.] -1
[0. 1. 0. 0.] -1
[0. 0. 1. 0.] -1
[0. 0. 0. 1.] 0
[0. 0. 1. 0.] 0
[0. 1. 0. 0.] 0
[1. 0. 0. 0.] -1
[1. 0. 0. 0.] -2
[0. 1. 0. 0.] -2
[0. 0. 1. 0.] -2


In [3]:
# Value estimates, one float per state, all starting at zero.
V = np.zeros(shape=(len(S),), dtype=float)

def E(s):
    """Return the expected value of the state reached from ``s``.

    Every action in ``A`` is treated as equally likely. Reads the module-level
    ``A`` (actions), ``T`` (transition table keyed by ``(state, action)``) and
    ``V`` (current value estimates indexed by state).
    """
    # Uniform weight per action; equals the previously hard-coded 0.5 when
    # there are two actions.
    n_actions = len(A)
    e = 0
    # Iterate over the action labels themselves (not range(len(A))) so that
    # T is always looked up with a real action.
    for a in A:
        s_prime = T[s, a]
        e += V[s_prime] / n_actions
    return e

# Repeated value sweeps. Note: E(s) averages over the actions instead of
# taking a max, so this evaluates the uniformly random policy.
n_sweeps = 25
for sweep in range(n_sweeps):
    # V is updated in place, so states later in a sweep already see the
    # refreshed values of the states before them.
    for s in S:
        V[s] = R[s] + E(s)
    print(V)

[-1.    -0.5   -0.25   0.875]
[-1.75    -1.      -0.0625   1.40625]
[-2.375   -1.21875  0.09375  1.75   ]
[-2.796875   -1.3515625   0.19921875  1.97460938]
[-3.07421875 -1.4375      0.26855469  2.12158203]
[-3.25585938 -1.49365234  0.31396484  2.21777344]
[-3.37475586 -1.53039551  0.34368896  2.2807312 ]
[-3.45257568 -1.55444336  0.36314392  2.32193756]
[-3.50350952 -1.5701828   0.37587738  2.34890747]
[-3.53684616 -1.58048439  0.38421154  2.36655951]
[-3.55866528 -1.58722687  0.38966632  2.37811291]
[-3.57294607 -1.59163988  0.39323652  2.38567472]
[-3.58229297 -1.59452823  0.39557324  2.39062398]
[-3.5884106  -1.59641868  0.39710265  2.39386331]
[-3.59241464 -1.59765599  0.39810366  2.39598349]
[-3.59503532 -1.59846583  0.39875883  2.39737116]
[-3.59675057 -1.59899587  0.39918764  2.3982794 ]
[-3.59787322 -1.59934279  0.39946831  2.39887385]
[-3.59860801 -1.59956985  0.399652    2.39926293]
[-3.59908893 -1.59971846  0.39977223  2.39951758]
[-3.5994037  -1.59981573  0.39985092  2.3996

In [4]:
# Greedy policy: in each state pick the action whose successor state has the
# highest value under V.
pi = {0: 0, 1: 0, 2: 0, 3: 0}
for s in S:
    # max() returns the first maximal action, so ties resolve to the earliest
    # action in A, matching a strict ">" scan.
    pi[s] = max(A, key=lambda a: V[T[s, a]])
print(pi)


{0: 1, 1: 1, 2: 1, 3: 1}


In [None]:
# Roll out the policy `pi` from the start state and print the trajectory.
time_steps = 10
s0 = 0
s = s0
score = 0

for step in range(time_steps):
    time.sleep(0.5)  # slow the loop down so the trajectory can be watched live
    a = pi[s]
    s = T[s, a]
    # One-hot view of the chain marking the agent's current state. Named
    # `position` rather than `map` so the builtin map() is not shadowed.
    position = np.zeros(len(S))
    position[s] = 1
    score += R[s]  # reward is taken from the state the agent arrived in
    print(position, score)

[0. 1. 0. 0.] 0
[0. 0. 1. 0.] 0
[0. 0. 0. 1.] 1
[0. 0. 0. 1.] 2
[0. 0. 0. 1.] 3
[0. 0. 0. 1.] 4
