```
         Copyright Rein Halbersma 2020-2021.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
```

# Dynamic programming for the Frozen Lake

In [1]:
from itertools import product

import gym
import numpy as np
import pandas as pd

from doctrina.algorithms import dp
from doctrina.spaces import state_table

# Build the Frozen Lake environment. Later cells read its `nS`, `nA` and `P`
# attributes and attach extra attributes to it for the dp solvers.
env = gym.make('FrozenLake-v0')

In [2]:
# The states are laid out on a square grid: recover the side length from the
# number of states and record the grid shape on the environment.
H = int(np.sqrt(env.nS))
W = H
env.observation_shape = (H, W)

# Index of the extra absorbing state that is appended after the env.nS
# original states when the model tensors are built.
terminal = env.nS

# Possible reward values; a reward's position in this array is its index along
# the reward axis of the model tensor.
Reward = np.array([0.0, 1.0])
nR = Reward.size

In [3]:
# p(s', r | s, a): probability of moving to state s' and receiving the reward
# with index r, given state s and action a.
# Axes are (state, action, next state, reward index). One extra absorbing
# `terminal` state is appended after the env.nS original states.
P_tensor = np.zeros((env.nS + 1, env.nA, env.nS + 1, nR))
# The terminal state transitions to itself under every action with reward index 0.
P_tensor[terminal, :, terminal, 0] = 1
for s, a in product(range(env.nS), range(env.nA)):
    # Loop variables are named `next_state` / `step_reward` (not `next` / `reward`):
    # they live in the notebook's global namespace, where `next` would shadow the
    # builtin and `reward` is used for the expected-reward array in a later cell.
    for prob, next_state, step_reward, done in env.P[s][a]:
        # Transitions flagged `done` are redirected into the terminal state.
        # int(step_reward) is used directly as the reward index, which relies on
        # the rewards being 0 or 1 so that value and index coincide with `Reward`.
        P_tensor[s, a, terminal if done else next_state, int(step_reward)] += prob
# Sanity check: for every (s, a) the probabilities over (s', r) sum to one.
assert np.isclose(P_tensor.sum(axis=(2, 3)), 1).all()

In [4]:
# p(s' | s, a): state-transition probabilities, obtained by marginalising the
# joint model over its last (reward) axis.
transition = np.sum(P_tensor, axis=-1)
# Sanity check: each (s, a) row is a probability distribution over next states.
assert np.allclose(transition.sum(axis=-1), 1)

In [5]:
# r(s, a): expected immediate reward for taking action a in state s.
# Summing the joint model over the next-state axis leaves p(r | s, a); the
# product with the reward values then gives the expectation.
reward = np.matmul(P_tensor.sum(axis=2), Reward)

In [6]:
# Attach the augmented model to the environment (presumably these attributes
# are what the dp.* solvers below read -- confirm against doctrina).
# The state count is taken from the transition array (original states plus the
# terminal state) rather than incremented in place, so re-running this cell
# does not keep growing env.nS.
env.nS = transition.shape[0]
env.transition = transition
env.reward = reward

In [7]:
# Solve with dp.V_policy_iter; only the first two results (policy and state
# values) are used, the rest are discarded.
policy, V, *_ = dp.V_policy_iter(env)
# The last entry belongs to the added terminal state, so it is sliced off
# before the per-state tables are printed.
print(state_table(policy[:-1], env), state_table(V[:-1], env), sep='\n')

[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]
[[0.82352917 0.8235291  0.82352904 0.82352901]
 [0.82352919 0.         0.5294116  0.        ]
 [0.82352923 0.82352927 0.76470576 0.        ]
 [0.         0.88235284 0.94117642 0.        ]]


In [8]:
# Solve with dp.Q_policy_iter; only the first two results (policy and action
# values) are used, the rest are discarded.
policy, Q, *_ = dp.Q_policy_iter(env)
# State values are the maximum of Q along axis 1 (presumably the action axis).
V = np.max(Q, axis=1)
# The last entry belongs to the added terminal state, so it is sliced off
# before the per-state tables are printed.
print(state_table(policy[:-1], env), state_table(V[:-1], env), sep='\n')

[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]
[[0.82352917 0.8235291  0.82352904 0.82352901]
 [0.82352919 0.         0.5294116  0.        ]
 [0.82352923 0.82352927 0.76470576 0.        ]
 [0.         0.88235284 0.94117642 0.        ]]


In [9]:
# Solve with dp.V_value_iter; only the first two results (policy and state
# values) are used, the rest are discarded.
policy, V, *_ = dp.V_value_iter(env)
# The last entry belongs to the added terminal state, so it is sliced off
# before the per-state tables are printed.
print(state_table(policy[:-1], env), state_table(V[:-1], env), sep='\n')

[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]
[[0.82352918 0.8235291  0.82352904 0.82352901]
 [0.82352919 0.         0.5294116  0.        ]
 [0.82352923 0.82352927 0.76470576 0.        ]
 [0.         0.88235284 0.94117642 0.        ]]


In [10]:
# Solve with dp.Q_value_iter; only the first two results (policy and action
# values) are used, the rest are discarded.
policy, Q, *_ = dp.Q_value_iter(env)
# State values are the maximum of Q along axis 1 (presumably the action axis).
V = np.max(Q, axis=1)
# The last entry belongs to the added terminal state, so it is sliced off
# before the per-state tables are printed.
print(state_table(policy[:-1], env), state_table(V[:-1], env), sep='\n')

[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]
[[0.82352918 0.8235291  0.82352904 0.82352901]
 [0.82352919 0.         0.5294116  0.        ]
 [0.82352923 0.82352927 0.76470576 0.        ]
 [0.         0.88235284 0.94117642 0.        ]]


## Performance

In [11]:
# Time a complete dp.Q_value_iter run on the augmented environment; %timeit
# repeats the call and reports the mean and standard deviation per loop.
%timeit dp.Q_value_iter(env)

10.2 ms ± 85.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
