```
         Copyright Rein Halbersma 2020-2021.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
```

# Dynamic programming for the Frozen Lake
An implementation of the dynamic programming assignment of the [Udacity Deep Reinforcement Learning Nanodegree](https://github.com/udacity/deep-reinforcement-learning/blob/master/dynamic-programming/Dynamic_Programming_Solution.ipynb)

In [1]:
from itertools import product

import gym
import numpy as np

from doctrina.algorithms import dp

env = gym.make('FrozenLake-v0', is_slippery=True)

## Part -1: Preprocessing

### Infer the frozen lake's height and width
To plot value functions as 2D tables, we need to know the lake's height and width.

In [2]:
H = W = int(np.sqrt(env.nS))
env.observation_shape = (H, W)

### Avoid redundant computations (aka Once And Only Once)
To avoid redundant computations, we pre-compute a separate transition tensor and a reward matrix as discussed in Sutton & Barto (p.49).

In [3]:
# Equation (3.4) in Sutton & Barto (p.49):
# p(s'|s, a) = probability of transition to state s', from state s taking action a.
env.transition = np.zeros((env.nS, env.nA, env.nS))

# Equation (3.5) in Sutton & Barto (p.49):
# r(s, a) = expected immediate reward from state s after action a.
env.reward = np.zeros((env.nS, env.nA))

# Initialize the transition and reward tensors.
for s in env.P.keys():
    for a in env.P[s].keys():
        for prob, next, reward, done in env.P[s][a]:
            # Exclude transitions to the terminal state.
            if not done:
                env.transition[s, a, next] += prob
            env.reward[s, a] += prob * reward

## Part 0: Explore FrozenLakeEnv

In [4]:
# print the state space and action space
print(env.observation_space)
print(env.action_space)

# print the total number of states and actions
print(env.nS)
print(env.nA)

Discrete(16)
Discrete(4)
16
4


In [5]:
env.P[1][0]

[(0.3333333333333333, 1, 0.0, False),
 (0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 5, 0.0, True)]

## Part 1: Iterative Policy Evaluation

In [6]:
random_policy = dp.policy_init(env)
print(random_policy)
V, delta, iter = dp.V_policy_eval(env, random_policy)
print(delta, iter)
print(V.reshape(env.observation_shape).round(5))

[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
8.452946889322965e-09 57
[[0.01394 0.01163 0.02095 0.01048]
 [0.01625 0.      0.04075 0.     ]
 [0.03481 0.08817 0.14205 0.     ]
 [0.      0.17582 0.43929 0.     ]]


## Part 2: Obtain $q_\pi$ from $v_\pi$

In [7]:
Q = dp.Q_from_V(env, V)
print(Q)

[[0.0147094  0.01393978 0.01393978 0.01317015]
 [0.00852356 0.01163091 0.0108613  0.01550788]
 [0.02444514 0.02095298 0.02406033 0.01435346]
 [0.01047649 0.01047649 0.00698432 0.01396865]
 [0.02166487 0.01701828 0.01624865 0.01006281]
 [0.         0.         0.         0.        ]
 [0.05433538 0.04735105 0.05433538 0.00698432]
 [0.         0.         0.         0.        ]
 [0.01701828 0.04099204 0.03480619 0.04640826]
 [0.07020885 0.11755991 0.10595784 0.05895312]
 [0.18940421 0.17582037 0.16001424 0.04297382]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.08799677 0.20503718 0.23442716 0.17582037]
 [0.25238823 0.53837051 0.52711478 0.43929118]
 [0.         0.         0.         0.        ]]


## Part 3: Policy Improvement

In [8]:
policy = dp.V_policy_impr(env, V)
print(policy)

[[1.   0.   0.   0.  ]
 [0.   0.   0.   1.  ]
 [1.   0.   0.   0.  ]
 [0.   0.   0.   1.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.5  0.   0.5  0.  ]
 [0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   1.   0.   0.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.   0.   1.   0.  ]
 [0.   1.   0.   0.  ]
 [0.25 0.25 0.25 0.25]]


## Part 4: Policy Iteration

In [9]:
policy, V, info = dp.V_policy_iter(env)
print(info)
print(policy)
print(V.reshape(env.observation_shape).round(5))

{'delta': 9.670292233643352e-09, 'evaluations': 798, 'improvements': 3}
[[0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   0.   0.   1.  ]
 [0.   0.   0.   1.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.5  0.   0.5  0.  ]
 [0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   1.   0.   0.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.   0.   1.   0.  ]
 [0.   1.   0.   0.  ]
 [0.25 0.25 0.25 0.25]]
[[0.82353 0.82353 0.82353 0.82353]
 [0.82353 0.      0.52941 0.     ]
 [0.82353 0.82353 0.76471 0.     ]
 [0.      0.88235 0.94118 0.     ]]


## Part 5: Truncated Policy Iteration

In [10]:
policy, V, info = dp.V_policy_iter(env, maxiter=2)
print(info)
print(policy)
print(V.reshape(env.observation_shape).round(5))

{'delta': 9.980803183928799e-09, 'evaluations': 620, 'improvements': 310}
[[0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   0.   0.   1.  ]
 [0.   0.   0.   1.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.5  0.   0.5  0.  ]
 [0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   1.   0.   0.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.   0.   1.   0.  ]
 [0.   1.   0.   0.  ]
 [0.25 0.25 0.25 0.25]]
[[0.82353 0.82353 0.82353 0.82353]
 [0.82353 0.      0.52941 0.     ]
 [0.82353 0.82353 0.76471 0.     ]
 [0.      0.88235 0.94118 0.     ]]


## Part 6: Value Iteration

In [11]:
print(info)
policy, V, info = dp.V_value_iter(env)
print(policy)
print(V.reshape(env.observation_shape).round(5))

{'delta': 9.980803183928799e-09, 'evaluations': 620, 'improvements': 310}
[[0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   0.   0.   1.  ]
 [0.   0.   0.   1.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.5  0.   0.5  0.  ]
 [0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   1.   0.   0.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.   0.   1.   0.  ]
 [0.   1.   0.   0.  ]
 [0.25 0.25 0.25 0.25]]
[[0.82353 0.82353 0.82353 0.82353]
 [0.82353 0.      0.52941 0.     ]
 [0.82353 0.82353 0.76471 0.     ]
 [0.      0.88235 0.94118 0.     ]]


## Part 7: Time to solution (aka The Need for Speed)

In [12]:
for method in ('async', 'sync', 'solve'):
    for format in ('stoch', 'deter'):
        print(f'V-policy iteration with format={format}, method={method}: ', end='')
        %timeit dp.V_policy_iter(env, format=format, method=method)
        print(f'Q-policy iteration with format={format}, method={method}: ', end='')
        %timeit dp.Q_policy_iter(env, format=format, method=method)

V-policy iteration with format=stoch, method=async: 166 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Q-policy iteration with format=stoch, method=async: 642 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
V-policy iteration with format=deter, method=async: 116 ms ± 397 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Q-policy iteration with format=deter, method=async: 1.18 s ± 20.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
V-policy iteration with format=stoch, method=sync: 19.4 ms ± 386 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Q-policy iteration with format=stoch, method=sync: 19.7 ms ± 147 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
V-policy iteration with format=deter, method=sync: 37.3 ms ± 312 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Q-policy iteration with format=deter, method=sync: 37.5 ms ± 803 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
V-policy iteration with format=sto

In [13]:
for method in ('async', 'sync'):
    for format in ('stoch', 'deter'):
        print(f'V-value iteration with format={format}, method={method}: ', end='')
        %timeit dp.V_value_iter(env, format=format, method=method)
        print(f'Q-value iteration with format={format}, method={method}: ', end='')
        %timeit dp.Q_value_iter(env, format=format, method=method)

V-value iteration with format=stoch, method=async: 85 ms ± 253 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Q-value iteration with format=stoch, method=async: 370 ms ± 3.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
V-value iteration with format=deter, method=async: 87.2 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Q-value iteration with format=deter, method=async: 379 ms ± 2.56 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
V-value iteration with format=stoch, method=sync: 10.8 ms ± 55.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Q-value iteration with format=stoch, method=sync: 11 ms ± 54.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
V-value iteration with format=deter, method=sync: 10.6 ms ± 108 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Q-value iteration with format=deter, method=sync: 10.7 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
