```
         Copyright Rein Halbersma 2021.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
```

# Dynamic Programming for Jack's Car Rental Environment

In [1]:
import gym
import numpy as np
import pandas as pd
import plotnine as p9

import gym_jcr

from doctrina.algorithms import dp

In [2]:
# Instantiate version 0 of the Jack's Car Rental environment.
# NOTE(review): the 'JacksCarRental-*' ids are presumably registered by `import gym_jcr` — confirm.
env0 = gym.make('JacksCarRental-v0')




In [3]:
# Instantiate version 1 of the same environment; a later cell asserts that both
# versions lead to the same policy.
env1 = gym.make('JacksCarRental-v1')

## Policy iteration

In [4]:
# Solve env0 with policy iteration. The call returns the policy, the state values
# and an info dict (printed below: final delta plus evaluation/improvement counts).
# gamma=0.9 is presumably the discount factor — confirm against doctrina's docs.
policy0, V0, info = dp.V_policy_iter(env0, gamma=0.9)
print(info)

{'delta': 9.793780009204056e-09, 'evaluations': 821, 'improvements': 5}


In [5]:
# Solve env1 with the same algorithm and settings as env0 so the results can be compared.
# Note that `info` is overwritten here; the env0 diagnostics are only kept in the printed output.
policy1, V1, info = dp.V_policy_iter(env1, gamma=0.9)
print(info)

{'delta': 9.835162018134724e-09, 'evaluations': 814, 'improvements': 5}


In [6]:
# The policies computed for env0 and env1 must be identical. assert_array_equal also
# verifies that the shapes agree and, on failure, reports which entries differ
# instead of raising a bare AssertionError.
np.testing.assert_array_equal(policy0, policy1)

## Optimal policy

In [None]:
# Long-format table of the env1 policy: one row per (loc_1, loc_2) cell of the
# 21 x 21 grid, ready for plotting as a heat map.
# NOTE(review): subtracting 5 presumably converts the action index into the signed
# number of cars moved — confirm against gym_jcr's action encoding.
df_policy = (
    pd.DataFrame(policy1.reshape(21, 21) - 5)
    .rename_axis(index='loc_1')
    .reset_index()
    .melt(id_vars=['loc_1'], var_name='loc_2', value_name='cars_moved')
    # melt() turns the column labels into values; make sure they are integers.
    .astype({'loc_2': 'int64'})
)

In [None]:
# Heat map of the policy: one tile per (loc_2, loc_1) cell, coloured by cars_moved.
(
    p9.ggplot(df_policy, p9.aes(x='loc_2', y='loc_1'))
    + p9.geom_tile(p9.aes(fill='cars_moved'))
    + p9.labs(x='#Cars at second location', y='#Cars at first location')
)

## State value function

In [None]:
# Long-format table of the env1 state values: one row per (loc_1, loc_2) cell of
# the 21 x 21 grid, ready for plotting as a heat map.
df_V = (
    pd.DataFrame(V1.reshape(21, 21))
    .rename_axis(index='loc_1')
    .reset_index()
    .melt(id_vars=['loc_1'], var_name='loc_2', value_name='expected_value')
    # melt() turns the column labels into values; make sure they are integers.
    .astype({'loc_2': 'int64'})
)

In [None]:
# Heat map of the state values: one tile per (loc_2, loc_1) cell, coloured by expected_value.
(
    p9.ggplot(df_V, p9.aes(x='loc_2', y='loc_1'))
    + p9.geom_tile(p9.aes(fill='expected_value'))
    + p9.labs(x='#Cars at second location', y='#Cars at first location')
)

## Time to solution (aka The Need for Speed)

In [None]:
# Benchmark a complete dp.V_policy_iter solve of env1 (same arguments as the cell that produced policy1).
%timeit dp.V_policy_iter(env1, gamma=0.9)

In [None]:
# Benchmark dp.V_value_iter on the same environment and gamma for comparison with policy iteration.
%timeit dp.V_value_iter(env1, gamma=0.9)

## Check timing of manual simulation

In [None]:
from tqdm import tqdm

In [None]:
# Measure raw environment throughput: reset once, then take one million steps with a
# fixed action while tqdm displays progress and the iteration rate.
# NOTE(review): action 5 presumably means "move no cars" (the policy table subtracts 5
# from the action index to obtain cars_moved) — confirm against gym_jcr.
env1.reset()
for _ in tqdm(range(1_000_000)):
    env1.step(5)

**TODO:** Apply temporal-difference (TD) methods and verify the DP results. (Monte Carlo methods don't apply to continuing tasks, but TD(1) is similar.)