# GridWorld Problem

## Problem setup

In [1]:
import ReinforcementLearning.FiniteMarkovDecisionProcesses as MDP

In [2]:
function V_to_matrix_form(V)
    VV = Matrix{Float64}(undef, 4, 4)
    VV[1,1] = V[15]
    for i = 1:14
        k = div(i,4)
        ℓ = rem(i,4)
        VV[k+1, ℓ+1] = V[i]
    end
    VV[4,4] = VV[1,1]
    return VV
end

V_test = 1:15
V_to_matrix_form(V_test)

4×4 Matrix{Float64}:
 15.0   1.0   2.0   3.0
  4.0   5.0   6.0   7.0
  8.0   9.0  10.0  11.0
 12.0  13.0  14.0  15.0

In [3]:
function action_to_char(a)
    if a == 1 # up
        return '↑'
    elseif a == 2 # down
        return '↓'
    elseif a == 3 # right
        return '→'
    elseif a == 4 # left
        return '←'
    else
        return '?'
    end
end

function P_to_matrix_form(P)
    PP = Matrix{Char}(undef, 4, 4)
    PP[1,1] = '█'
    for i = 1:14
        k = div(i,4)
        ℓ = rem(i,4)
        PP[k+1, ℓ+1] = action_to_char(P[i])
    end
    PP[4,4] = '█'
    return PP
end

P_test = rand(1:4, 15)
P_to_matrix_form(P_test)

4×4 Matrix{Char}:
 '█'  '↓'  '↑'  '←'
 '↑'  '↓'  '→'  '↑'
 '←'  '→'  '↓'  '→'
 '→'  '↓'  '→'  '█'

In [4]:
next_states = [
    # 1 (up)  2 (down)  3 (right)  4 (left) 
      1       5         2         15         #  1
      2       6         3          1         #  2 
      3       7         3          2         #  3
     15       8         5          4         #  4
      1       9         6          4         #  5
      2      10         7          5         #  6
      3      11         7          6         #  7
      4      12         9          8         #  8
      5      13        10          8         #  9
      6      14        11          9         # 10
      7      15        11         10         # 11
      8      12        13         12         # 12
      9      13        14         12         # 13
     10      14        15         13         # 14
     15      15        15         15         # 15
]

rewards = -1*ones(size(next_states))
rewards[15, :] .= 0

mdp = MDP.DeterministicFiniteMDP((s,a)->next_states[s, a], (s,a)->rewards[s, a], 4, 15, 15);

## Dynamic Programming

### DP: Policy Evaluation

In [5]:
policy = 0.25*ones(size(next_states)...);  # Vector{Int64}(undef, size(next_states, 1))

In [6]:
V, Q = MDP.allocate_V_and_Q(mdp)
V[1] = 0

iters_no, converged = MDP.dp_evaluate_policy!(V, Q, mdp, policy, 1.0; tol = 1e-4, maxiter = 1000)

(173, true)

In [7]:
V_dp_random_policy = copy(V)
V_to_matrix_form(round.(V_dp_random_policy; digits=2))

4×4 Matrix{Float64}:
   0.0  -14.0  -20.0  -22.0
 -14.0  -18.0  -20.0  -20.0
 -20.0  -20.0  -18.0  -14.0
 -22.0  -20.0  -14.0    0.0

In [8]:
P = MDP.create_P_from_Q(Q)
MDP.P_from_Q!(P, Q)

P_to_matrix_form(P)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '↓'
 '↑'  '↑'  '↓'  '↓'
 '↑'  '↑'  '→'  '↓'
 '↑'  '→'  '→'  '█'

### DP: Policy Optimization

In [9]:
P = rand(1:4, size(next_states, 1));
V, Q = MDP.allocate_V_and_Q(mdp)
V[1] = 0

converged = false
iters_no = 100
for i = 1:iters_no
    MDP.dp_evaluate_policy!(V, Q, mdp, P, 1.0; tol = 1e-4, maxiter = 5)
    modified = MDP.P_from_Q!(P, Q)
    if !modified
        converged = true
        iters_no = i
        break
    end
end

(iters_no, converged)

(4, true)

In [10]:
P_dp_optimal = copy(P)
P_to_matrix_form(P_dp_optimal)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '↓'
 '↑'  '↑'  '↑'  '↓'
 '↑'  '↑'  '↓'  '↓'
 '↑'  '→'  '→'  '█'

In [11]:
V_dp_optimal = copy(V)
V_to_matrix_form(round.(V; digits=2))

4×4 Matrix{Float64}:
  0.0  -1.0  -2.0  -3.0
 -1.0  -2.0  -3.0  -2.0
 -2.0  -3.0  -2.0  -1.0
 -3.0  -2.0  -1.0   0.0

## Monte Karlo

### MK: Policy Evaluation (Stochastic Policy)

In [12]:
P = 0.25 * ones(size(next_states)...)
simulator = MDP.create_simulator(mdp, P)
V, Q = MDP.allocate_V_and_Q(mdp)
MDP.mk_evaluate_policy!(Q, simulator, 1.0; maxiter = 10000)


In [13]:

MDP.V_from_Q!(V, Q, P)
V_mk_random_policy = copy(V)
V_to_matrix_form(round.(V_mk_random_policy; digits=2))

4×4 Matrix{Float64}:
 NaN     -15.02  -20.93  -22.98
 -14.68  -18.89  -21.05  -20.95
 -21.01  -20.92  -19.11  -15.24
 -23.0   -20.65  -15.03  NaN

In [14]:
V_to_matrix_form(round.(V_dp_random_policy .- V_mk_random_policy; digits=2))

4×4 Matrix{Float64}:
 NaN     1.02  0.94    0.98
   0.68  0.89  1.05    0.95
   1.01  0.92  1.11    1.24
   1.0   0.65  1.03  NaN

### MK: Policy Evaluation (Deterministic Policy)

In [15]:
𝐩 = rand(1:4, size(next_states, 1))
simulator = MDP.create_simulator(mdp, 𝐩, 0.1)
V, Q = MDP.allocate_V_and_Q(mdp)
MDP.mk_evaluate_policy!(Q, simulator, 1.0; maxiter = 10000)

In [16]:
MDP.V_from_Q!(V, Q, P)
V_to_matrix_form(round.(V; digits=2))

4×4 Matrix{Float64}:
  NaN     -152.36  -219.66  -226.95
 -164.27  -207.8   -216.77  -227.1
 -217.66  -162.35  -168.77  -187.86
 -171.99   -97.41   -77.13   NaN

### MK: Policy Optimization

In [17]:
𝐩 = rand(1:4, size(next_states, 1))
simulator = MDP.create_simulator(mdp, 𝐩, 0.05)
V, Q = MDP.allocate_V_and_Q(mdp)
converged = false
for i = 1:1000
    MDP.mk_evaluate_policy!(Q, simulator, 1.0; maxiter = 1000)
    MDP.P_from_Q!(𝐩, Q)
end

In [18]:
Q

15×4 Matrix{Float64}:
  -3.0       -4.0       -4.0       -2.0
 NaN         -5.0      NaN         -3.0137
  -5.0       -4.17568  NaN         -4.0
  -2.0       -4.0       -4.0       -3.0
  -3.07857   -5.0       -5.0       -3.0
  -4.0       -4.0      NaN         -4.10345
  -5.0       -3.06897  NaN         -5.0
  -3.06993   -5.0       -5.0      NaN
 NaN         -4.0       -4.0       -4.13514
  -5.0       -3.0       -3.15      -7.0
  -4.0       -2.0       -3.0       -4.0
  -4.0      NaN         -4.15714   -6.0
  -5.0      NaN         -3.06818   -5.0
  -4.0       -3.0       -2.0       -4.0
 NaN        NaN        NaN        NaN

In [19]:
MDP.V_from_Q!(V, Q, 𝐩)
V_to_matrix_form(round.(V; digits=2))

4×4 Matrix{Float64}:
 NaN     -2.0   -3.01   -4.0
  -2.0   -3.0   -4.0    -3.07
  -3.07  -4.0   -3.0    -2.0
  -4.0   -3.07  -2.0   NaN

In [20]:
simulator = MDP.create_simulator(mdp, 𝐩, 0.05)
V, Q = MDP.allocate_V_and_Q(mdp)
converged = false
for i = 1:1000
    MDP.mk_evaluate_policy!(Q, simulator, 1.0; maxiter = 10000)
    MDP.P_from_Q!(𝐩, Q)
end

In [21]:
Q

15×4 Matrix{Float64}:
  -3.0       -4.2069    -4.05556   -2.0
  -4.0       -5.06667   -5.2       -3.0571
  -5.09091   -4.13919   -5.21429   -4.0
  -2.0       -4.08824   -4.22449   -3.0566
  -3.11111   -5.15      -5.0       -3.06383
  -4.12342   -4.0       -4.14286   -4.1
  -5.08696   -3.06881   -4.16129   -5.08696
  -3.07317   -5.30769   -5.25      -4.41667
  -4.1317    -4.0       -4.4       -4.30769
  -5.0       -3.06739   -3.0       -5.0
  -4.11111   -2.0       -3.0       -4.0
  -4.14577   -5.2       -4.0       -5.4
  -5.0       -4.0       -3.05891   -5.42857
  -4.14286   -3.15625   -2.0       -4.07692
 NaN        NaN        NaN        NaN

In [22]:
MDP.V_from_Q!(V, Q, 𝐩)
V_to_matrix_form(round.(V; digits=2))

4×4 Matrix{Float64}:
 NaN     -2.0   -3.06   -4.0
  -2.0   -3.06  -4.0    -3.07
  -3.07  -4.0   -3.0    -2.0
  -4.0   -3.06  -2.0   NaN

In [23]:
MDP.P_from_Q!(𝐩, Q)
P_to_matrix_form(𝐩)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '←'
 '↑'  '←'  '↓'  '↓'
 '↑'  '↓'  '→'  '↓'
 '→'  '→'  '→'  '█'