# Car rental problem

The problem is described in Example 4.2 of Richard S. Sutton and Andrew G. Barto, *Reinforcement Learning: An Introduction* (A Bradford Book, Cambridge, MA, USA, 2018).

**TODO:** add a description of the problem.

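**TODO:** add solution details. In outline, the notebook builds the average-reward and state-transition-probability tensors, computes the value function with value iteration, and derives the optimal policy from it.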

In [1]:
from pathlib import Path
import pickle
from omegaconf import OmegaConf, DictConfig

import numpy as np
from scipy import linalg, stats

import plotly.express as px
import plotly.graph_objects as go

from components import reward_prob_tensor, value_iteration, get_optimal_policy

In [None]:
# locate the directory that holds the problem's config and generated artifacts
problem_dir = Path("sutton")
# the path is used as a directory later on, so a plain file with this name must
# be rejected as well; the message shows the resolved path to make a wrong
# working directory easy to spot
assert problem_dir.is_dir(), f"problem directory not found: {problem_dir.resolve()}"

In [3]:
# convergence tolerance for value iteration
tol = 1e-6

# load the problem settings from the YAML file inside the problem directory
problem_config_path = problem_dir / "config.yaml"
with problem_config_path.open() as config_file:
    config: DictConfig = OmegaConf.load(config_file)

In [4]:
# construct the task's average rewards and state-transition probabilities
# (the last axis of prob_t is expected to sum to 1; probs_test asserts this)
reward_t, prob_t = reward_prob_tensor(config)

Output()

Output()

In [5]:
def probs_test(probs=None):
    """Check that a transition-probability tensor holds valid distributions.

    Args:
        probs: Array whose last axis holds the probabilities of one
            distribution. Defaults to the notebook-level ``prob_t`` so the
            original no-argument call keeps working.

    Raises:
        AssertionError: If any last-axis slice does not sum to 1 (within
            NumPy's default closeness tolerances) or any entry is negative.
    """
    if probs is None:
        probs = prob_t
    summed_probs = probs.sum(axis=-1)
    # check that probs add up to 1.
    assert np.allclose(summed_probs, 1.0), (
        "probabilities do not sum to 1 along the last axis "
        f"(sums range over [{summed_probs.min()}, {summed_probs.max()}])"
    )
    assert np.all(probs >= 0.), "negative probabilities found"

# run the check now so an invalid transition tensor is caught before it is used
probs_test()

In [6]:
# directory for the pickled tensors; exist_ok=True makes re-running this cell safe
tensors_dir = problem_dir / "tensors"
tensors_dir.mkdir(exist_ok=True)

In [7]:
# pickle each tensor into the tensors directory under its own file name
for pkl_name, tensor in (("reward.pkl", reward_t), ("prob.pkl", prob_t)):
    with open(tensors_dir / pkl_name, "bw") as pkl_file:
        pickle.dump(tensor, pkl_file)

In [8]:
# initial value function: all zeros, one entry per (cars, cars) pair plus one
# extra trailing entry
# NOTE(review): the trailing entry is presumably a terminal state; confirm in `components`
n_car_levels = config.max_cars_location + 1
v0 = np.zeros(n_car_levels ** 2 + 1)

In [9]:
# run value iteration starting from v0, with tol as the convergence tolerance
# NOTE(review): max_steps=1000 presumably caps the number of iterations; confirm in `components`
v = value_iteration(
    reward_t, prob_t, config.gamma, v0,
    tol, max_steps=1000
)

Output()

In [10]:
# derive the policy from the value function computed by value iteration
policy = get_optimal_policy(reward_t, prob_t, config.gamma, v)

In [11]:
# inspect the value of the extra last entry, which is dropped before plotting
v[-1]

np.float64(0.0)

In [12]:
# Turn the flat per-state vectors into (cars, cars) grids, dropping the extra
# trailing entry. The names `v` and `policy` are rebound on purpose because the
# plotting cells read them, so the conversion is written to be idempotent:
# re-running this cell on the already reshaped grids leaves them unchanged
# instead of slicing off a row and failing in reshape.
grid_side = config.max_cars_location + 1


def _as_grid(values):
    """Return `values` as a (grid_side, grid_side) array without the extra entry.

    Accepts the flat vector that still carries the trailing extra entry or an
    array that already has exactly grid_side ** 2 elements. Any other size
    makes `reshape` raise ValueError, as the original slicing code did.
    """
    flat = np.ravel(values)
    if flat.size == grid_side ** 2 + 1:
        # first run: the last element is the extra state, not a grid cell
        flat = flat[:-1]
    return flat.reshape(grid_side, grid_side)


v = _as_grid(v)
policy = _as_grid(policy)

In [13]:
# make and save figures: the exported HTML plots go into this directory
# (exist_ok=True makes re-running this cell safe)
img_folder = problem_dir / "assets"
img_folder.mkdir(exist_ok=True)

In [18]:
# Contour plot of the value function over the car counts at both locations.
# go.Contour draws z[i][j] at (x[j], y[i]): rows of `v` run along the y axis and
# columns along the x axis. The axis titles therefore use the same row/column
# naming as the heatmaps of `v` and `policy` (rows: num_cars0, columns: num_cars1).
# NOTE(review): which location indexes the rows is decided in `components`; confirm.
car_counts = np.arange(0, config.max_cars_location + 1)
value_fig = go.Figure(
    data=go.Contour(
        z=v,
        x=car_counts,
        y=car_counts,
        colorbar=dict(
            title=dict(text='Value function', side='right'),
        ),
    ),
    layout={
        "xaxis_title": "num_cars1",
        "yaxis_title": "num_cars0",
    }
)
# keep an interactive copy on disk, then render inline
value_fig.write_html(img_folder / "value.html")
value_fig.show()

In [19]:
# heatmap of the value function; rows are num_cars0, columns are num_cars1
# (pass text_auto=True to print the number inside every cell)
car_counts = np.arange(config.max_cars_location + 1)
value_heatmap_labels = {"x": "num_cars1", "y": "num_cars0", "color": "Value function"}
# the figure is the cell's last expression, so it is displayed but not saved
px.imshow(
    v,
    x=car_counts,
    y=car_counts,
    labels=value_heatmap_labels,
    origin="lower",
)

In [20]:
# heatmap of the policy (cars moved); rows are num_cars0, columns are num_cars1
# (pass text_auto=True to print the number inside every cell)
car_counts = np.arange(config.max_cars_location + 1)
policy_heatmap_labels = {"x": "num_cars1", "y": "num_cars0", "color": "Cars moved"}
policy_fig = px.imshow(
    policy,
    x=car_counts,
    y=car_counts,
    labels=policy_heatmap_labels,
    origin="lower",
)
# keep an interactive copy on disk, then render inline
policy_fig.write_html(img_folder / "policy.html")
policy_fig.show()

## Links

- [Sutton and Barto book's site](http://incompleteideas.net/book/the-book-2nd.html)
- [Another implementation in Python](https://github.com/ShangtongZhang/reinforcement-learning-an-introduction)