# GridWorld Problem

## Problem setup

In [25]:
import ReinforcementLearning.FiniteMarkovDecisionProcesses as MDP

In [26]:
function V_to_matrix_form(V)
    VV = Matrix{Float64}(undef, 4, 4)
    VV[1,1] = V[15]
    for i = 1:14
        k = div(i,4)
        ℓ = rem(i,4)
        VV[k+1, ℓ+1] = V[i]
    end
    VV[4,4] = VV[1,1]
    return VV
end

V_test = 1:15
V_to_matrix_form(V_test)

4×4 Matrix{Float64}:
 15.0   1.0   2.0   3.0
  4.0   5.0   6.0   7.0
  8.0   9.0  10.0  11.0
 12.0  13.0  14.0  15.0

In [27]:
function action_to_char(a)
    if a == 1 # up
        return '↑'
    elseif a == 2 # down
        return '↓'
    elseif a == 3 # right
        return '→'
    elseif a == 4 # left
        return '←'
    else
        return '?'
    end
end

function P_to_matrix_form(P)
    PP = Matrix{Char}(undef, 4, 4)
    PP[1,1] = '█'
    for i = 1:14
        k = div(i,4)
        ℓ = rem(i,4)
        PP[k+1, ℓ+1] = action_to_char(P[i])
    end
    PP[4,4] = '█'
    return PP
end

P_test = rand(1:4, 15)
P_to_matrix_form(P_test)

4×4 Matrix{Char}:
 '█'  '←'  '↑'  '←'
 '↓'  '→'  '←'  '←'
 '↑'  '↑'  '↑'  '↑'
 '→'  '↓'  '→'  '█'

In [28]:
next_states = [
    # 1 (up)  2 (down)  3 (right)  4 (left) 
      1       5         2         15         #  1
      2       6         3          1         #  2 
      3       7         3          2         #  3
     15       8         5          4         #  4
      1       9         6          4         #  5
      2      10         7          5         #  6
      3      11         7          6         #  7
      4      12         9          8         #  8
      5      13        10          8         #  9
      6      14        11          9         # 10
      7      15        11         10         # 11
      8      12        13         12         # 12
      9      13        14         12         # 13
     10      14        15         13         # 14
     15      15        15         15         # 15
]

rewards = -1*ones(size(next_states))
rewards[15, :] .= 0

mdp = MDP.DeterministicFiniteMDP((s,a)->next_states[s, a], (s,a)->rewards[s, a], 4, 15, 15);

## Dynamic Programming

### DP: Policy Evaluation

In [29]:
policy = 0.25*ones(size(next_states)...);  # Vector{Int64}(undef, size(next_states, 1))

In [30]:
V, Q = MDP.allocate_V_and_Q(mdp)
V[1] = 0

iters_no, converged = MDP.dp_evaluate_policy!(V, Q, mdp, policy, 1.0; tol = 1e-4, maxiter = 1000)

(173, true)

In [31]:
V_dp_random_policy = copy(V)
V_to_matrix_form(round.(V_dp_random_policy; digits=2))

4×4 Matrix{Float64}:
   0.0  -14.0  -20.0  -22.0
 -14.0  -18.0  -20.0  -20.0
 -20.0  -20.0  -18.0  -14.0
 -22.0  -20.0  -14.0    0.0

In [32]:
P = MDP.create_P_from_Q(Q)
MDP.P_from_Q!(P, Q)

P_to_matrix_form(P)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '←'
 '↑'  '↑'  '↓'  '↓'
 '↑'  '↑'  '→'  '↓'
 '↑'  '→'  '→'  '█'

### DP: Policy Optimization

In [33]:
P = rand(1:4, size(next_states, 1));
V, Q = MDP.allocate_V_and_Q(mdp)
V[1] = 0

converged = false
iters_no = 100
for i = 1:iters_no
    MDP.dp_evaluate_policy!(V, Q, mdp, P, 1.0; tol = 1e-4, maxiter = 5)
    modified = MDP.P_from_Q!(P, Q)
    if !modified
        converged = true
        iters_no = i
        break
    end
end

(iters_no, converged)

(4, true)

In [36]:
P_dp_optimal = copy(P)
P_to_matrix_form(P_dp_optimal)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '↓'
 '↑'  '↑'  '↑'  '↓'
 '↑'  '↑'  '↓'  '↓'
 '↑'  '→'  '→'  '█'

In [37]:
V_dp_optimal = copy(V)
V_to_matrix_form(round.(V; digits=2))

4×4 Matrix{Float64}:
  0.0  -1.0  -2.0  -3.0
 -1.0  -2.0  -3.0  -2.0
 -2.0  -3.0  -2.0  -1.0
 -3.0  -2.0  -1.0   0.0

## Monte Karlo

### MK: Policy Evaluation (Stochastic Policy)

In [38]:
P = 0.25 * ones(size(next_states)...)
simulator = MDP.create_simulator(mdp, P)
V, Q = MDP.allocate_V_and_Q(mdp)
MDP.mk_evaluate_policy!(Q, simulator, 1.0; maxiter = 10000)


In [39]:

MDP.V_from_Q!(V, Q, P)
V_mk_random_policy = copy(V)
V_to_matrix_form(round.(V_mk_random_policy; digits=2))

4×4 Matrix{Float64}:
 NaN     -14.98  -21.25  -23.43
 -15.06  -19.06  -20.96  -21.15
 -20.87  -20.83  -18.77  -15.08
 -22.89  -20.98  -14.93  NaN

In [43]:
V_to_matrix_form(round.(V_dp_random_policy .- V_mk_random_policy; digits=2))

4×4 Matrix{Float64}:
 NaN     0.98  1.25    1.43
   1.06  1.06  0.96    1.15
   0.87  0.83  0.77    1.08
   0.9   0.98  0.93  NaN

### MK: Policy Evaluation (Deterministic Policy)

In [51]:
𝐩 = rand(1:4, size(next_states, 1))
simulator = MDP.create_simulator(mdp, 𝐩, 0.1)
V, Q = MDP.allocate_V_and_Q(mdp)
MDP.mk_evaluate_policy!(Q, simulator, 1.0; maxiter = 10000)

In [52]:
MDP.V_from_Q!(V, Q, P)
V_to_matrix_form(round.(V; digits=2))

4×4 Matrix{Float64}:
 NaN     -69.44  -94.55  -90.47
 -70.26  -93.08  -94.0   -92.0
 -99.06  -97.24  -99.95  -69.62
 -96.04  -97.23  -71.78  NaN

### MK: Policy Optimization

In [60]:
𝐩 = rand(1:4, size(next_states, 1))
simulator = MDP.create_simulator(mdp, 𝐩, 0.05)
V, Q = MDP.allocate_V_and_Q(mdp)
converged = false
for i = 1:1000
    MDP.mk_evaluate_policy!(Q, simulator, 1.0; maxiter = 1000)
    MDP.P_from_Q!(𝐩, Q)
end

In [61]:
Q

15×4 Matrix{Float64}:
  -3.25   -4.4       -4.0    -2.0
  -4.0    -5.33333   -5.25   -3.05985
 NaN      -6.0       -5.0    -4.13793
  -2.0    -4.0       -4.0    -3.0
  -3.1   NaN         -5.0   NaN
 NaN     NaN         -6.0    -4.0
  -5.0   NaN        NaN     NaN
  -3.0   NaN        NaN     NaN
 NaN     NaN        NaN     NaN
 NaN     NaN        NaN     NaN
 NaN     NaN        NaN     NaN
 NaN     NaN        NaN     NaN
 NaN     NaN        NaN     NaN
 NaN     NaN        NaN     NaN
 NaN     NaN        NaN     NaN

In [62]:
MDP.V_from_Q!(V, Q, 𝐩)
V_to_matrix_form(round.(V; digits=2))

4×4 Matrix{Float64}:
 NaN     -2.0   -3.06   -4.14
  -2.0   -3.1   -4.0    -5.0
  -3.0  NaN    NaN     NaN
 NaN    NaN    NaN     NaN

In [63]:
simulator = MDP.create_simulator(mdp, 𝐩, 0.05)
V, Q = MDP.allocate_V_and_Q(mdp)
converged = false
for i = 1:1000
    MDP.mk_evaluate_policy!(Q, simulator, 1.0; maxiter = 10000)
    MDP.P_from_Q!(𝐩, Q)
end

In [65]:
Q

15×4 Matrix{Float64}:
  -3.125     -4.19091   -4.10753   -2.0
  -4.13115   -5.24      -5.14754   -3.06828
  -5.14286   -4.125     -5.14286   -4.14049
  -2.0       -4.08      -4.16667   -3.17143
  -3.1055    -5.0       -5.0       -3.0
 NaN         -6.0       -4.0       -4.2
  -5.0       -3.06061  NaN        NaN
  -3.0      NaN         -5.0      NaN
  -4.0      NaN        NaN        NaN
  -5.0      NaN        NaN        NaN
  -4.0       -2.0      NaN        NaN
 NaN        NaN        NaN        NaN
 NaN        NaN        NaN        NaN
 NaN        NaN        NaN        NaN
 NaN        NaN        NaN        NaN

In [69]:
MDP.V_from_Q!(V, Q, 𝐩)
V_to_matrix_form(round.(V; digits=2))

4×4 Matrix{Float64}:
 NaN     -2.0   -3.07   -4.12
  -2.0   -3.0   -4.0    -3.06
  -3.0   -4.0   -5.0    -2.0
 NaN    NaN    NaN     NaN

In [70]:
MDP.P_from_Q!(𝐩, Q)
P_to_matrix_form(𝐩)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '↓'
 '↑'  '←'  '→'  '↓'
 '↑'  '↑'  '↑'  '↓'
 '↑'  '↑'  '↑'  '█'

In [71]:
𝐩

15-element Vector{Int64}:
 4
 4
 2
 1
 4
 3
 2
 1
 1
 1
 2
 1
 1
 1
 1

In [72]:
Q

15×4 Matrix{Float64}:
  -3.125     -4.19091   -4.10753   -2.0
  -4.13115   -5.24      -5.14754   -3.06828
  -5.14286   -4.125     -5.14286   -4.14049
  -2.0       -4.08      -4.16667   -3.17143
  -3.1055    -5.0       -5.0       -3.0
 NaN         -6.0       -4.0       -4.2
  -5.0       -3.06061  NaN        NaN
  -3.0      NaN         -5.0      NaN
  -4.0      NaN        NaN        NaN
  -5.0      NaN        NaN        NaN
  -4.0       -2.0      NaN        NaN
 NaN        NaN        NaN        NaN
 NaN        NaN        NaN        NaN
 NaN        NaN        NaN        NaN
 NaN        NaN        NaN        NaN

In [73]:
P_to_matrix_form(P_dp_optimal)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '↓'
 '↑'  '↑'  '↑'  '↓'
 '↑'  '↑'  '↓'  '↓'
 '↑'  '→'  '→'  '█'

In [74]:
P_to_matrix_form(𝐩)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '↓'
 '↑'  '←'  '→'  '↓'
 '↑'  '↑'  '↑'  '↓'
 '↑'  '↑'  '↑'  '█'