In [1]:
import ReinforcementLearning.FiniteMarkovDecisionProcesses as MDP

In [22]:
"""
    V_to_matrix_form(V)

Lay out a 15-element collection of state values on a 4×4 `Matrix{Float64}`,
rounding every entry to two decimal digits.

States 1–14 are placed row by row starting at the second cell of the first row
(state `s` lands at row `s ÷ 4 + 1`, column `s % 4 + 1`). The value stored at
index 15 is shown in both the top-left and the bottom-right corner.
"""
function V_to_matrix_form(V)
    # Both corners display the value of state 15, so pre-fill the grid with it;
    # the loop below overwrites the 14 remaining cells.
    corner = convert(Float64, round(V[15]; digits=2))
    grid = fill(corner, 4, 4)
    for state in 1:14
        row, col = divrem(state, 4)
        grid[row + 1, col + 1] = round(V[state]; digits=2)
    end
    return grid
end

# Smoke test: with V = 1:15 every cell shows its own state number, and both
# corners show 15 (the value stored at index 15).
V_test = 1:15
V_to_matrix_form(V_test)

4×4 Matrix{Float64}:
 15.0   1.0   2.0   3.0
  4.0   5.0   6.0   7.0
  8.0   9.0  10.0  11.0
 12.0  13.0  14.0  15.0

In [3]:
"""
    action_to_char(a)

Return the arrow character for action code `a`:
1 → '↑' (up), 2 → '↓' (down), 3 → '→' (right), 4 → '←' (left).
Any other value yields '?'.
"""
function action_to_char(a)
    # The position in the tuple is the action code.
    for (code, arrow) in enumerate(('↑', '↓', '→', '←'))
        a == code && return arrow
    end
    return '?'
end

"""
    P_to_matrix_form(P)

Render a policy as a 4×4 `Matrix{Char}` of arrows.

`P[s]` is the action code chosen in state `s` for `s` in 1–14; it is converted
with `action_to_char` and placed at row `s ÷ 4 + 1`, column `s % 4 + 1`. The
top-left and bottom-right corners are drawn as '█'; `P[15]` is never read.
"""
function P_to_matrix_form(P)
    # Start with every cell blocked; the loop overwrites all but the two corners.
    grid = fill('█', 4, 4)
    for state in 1:14
        row, col = divrem(state, 4)
        grid[row + 1, col + 1] = action_to_char(P[state])
    end
    return grid
end

# Smoke test: draw a random action code (1-4) for each of the 15 states and
# render it as arrows.
P_test = rand(1:4, 15)
P_to_matrix_form(P_test)

4×4 Matrix{Char}:
 '█'  '→'  '↓'  '←'
 '←'  '→'  '←'  '→'
 '→'  '↑'  '↓'  '↓'
 '↓'  '→'  '↓'  '█'

In [4]:
# Deterministic transition table: next_states[s, a] is the state reached by
# taking action a in state s. A move that would leave the grid keeps the state
# unchanged, and state 15 is absorbing (every action leads back to 15).
next_states = [
    # 1=up  2=down  3=right  4=left
        1      5       2       15     # state  1
        2      6       3        1     # state  2
        3      7       3        2     # state  3
       15      8       5        4     # state  4
        1      9       6        4     # state  5
        2     10       7        5     # state  6
        3     11       7        6     # state  7
        4     12       9        8     # state  8
        5     13      10        8     # state  9
        6     14      11        9     # state 10
        7     15      11       10     # state 11
        8     12      13       12     # state 12
        9     13      14       12     # state 13
       10     14      15       13     # state 14
       15     15      15       15     # state 15
]

# Every transition has reward -1, except those out of the absorbing state 15,
# which have reward 0.
rewards = fill(-1.0, size(next_states))
rewards[15, :] .= 0

# Build the MDP from two lookup closures over the tables: (s, a) -> next state
# and (s, a) -> reward.
# NOTE(review): the trailing 4 and 15 look like the action and state counts
# (they match size(next_states)) — confirm against DeterministicFiniteMDP.
mdp = MDP.DeterministicFiniteMDP((s,a)->next_states[s, a], (s,a)->rewards[s, a], 4, 15);

In [5]:
# Policy table with the same shape as next_states and every entry equal to 0.25.
# NOTE(review): presumably policy[s, a] is the probability of taking action a in
# state s (uniform over the 4 actions) — confirm against MDP.evaluate_policy!.
policy = fill(0.25, size(next_states));

In [6]:
# Start from an all-zero value estimate, one entry per state. An `undef` vector
# holds arbitrary bits (possibly NaN or Inf) that would be read by the calls
# below. Zero-filling gives every state a defined starting value, including the
# absorbing state 15, whose transitions all return to itself with reward 0.
V = zeros(Float64, size(next_states, 1))
Q = MDP.create_Q_from_V(V, size(next_states, 2))
# Evaluate `policy` on `mdp`, updating V and Q in place.
# NOTE(review): the positional 1.0 is presumably the discount factor — confirm
# against MDP.evaluate_policy!.
iters_no, converged = MDP.evaluate_policy!(V, Q, mdp, policy, 1.0; tol = 1e-4, maxiter = 1000)

(173, true)

In [7]:
# Show the evaluated state values on the 4×4 grid. V_to_matrix_form already
# rounds to two digits, so the extra round. here does not change what is shown.
V_to_matrix_form(round.(V; digits=2))

4×4 Matrix{Float64}:
   0.0  -14.0  -20.0  -22.0
 -14.0  -18.0  -20.0  -20.0
 -20.0  -20.0  -18.0  -14.0
 -22.0  -20.0  -14.0    0.0

In [8]:
# Derive a policy P from the action values Q and show it as arrows on the grid.
# NOTE(review): presumably P_from_Q! stores the greedy (best-valued) action for
# each state in P — confirm in the MDP module.
P = MDP.create_P_from_Q(Q)
MDP.P_from_Q!(P, Q)

P_to_matrix_form(P)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '↓'
 '↑'  '↑'  '↓'  '↓'
 '↑'  '↑'  '→'  '↓'
 '↑'  '→'  '→'  '█'

In [25]:
# Start from a random action code (1-4) for every state and pass P to
# optimize_policy!, which also returns the resulting V and Q.
# NOTE(review): policy_evaluation_maxiter = 1 presumably limits each evaluation
# phase to a single sweep (value-iteration-like) — confirm in the MDP module.
P = rand(1:4, size(next_states, 1));
iters_no, converged, V, Q = MDP.optimize_policy!(P, mdp, 1.0; tol = 1e-4, maxiter = 1000, policy_evaluation_maxiter = 1);
iters_no, converged

(7, true)

In [26]:
# Show the policy P as arrows on the 4×4 grid ('█' marks the two corner cells).
P_to_matrix_form(P)

4×4 Matrix{Char}:
 '█'  '←'  '←'  '↓'
 '↑'  '↑'  '↑'  '↓'
 '↑'  '↑'  '↓'  '↓'
 '↑'  '→'  '→'  '█'

In [27]:
# Show the state values V on the 4×4 grid, rounded to two digits.
V_to_matrix_form(V)

4×4 Matrix{Float64}:
  0.0  -1.0  -2.0  -3.0
 -1.0  -2.0  -3.0  -2.0
 -2.0  -3.0  -2.0  -1.0
 -3.0  -2.0  -1.0   0.0