In [130]:
import numpy as np
import itertools
import rl.iterate as iterate
from pprint import pprint
from rl.distribution import Constant, Categorical
from typing import Iterator, Iterable, Mapping
from rl.markov_process import S

Question 1: (Tabular MC Prediction)

In [131]:
from rl.returns import returns
from rl.markov_process import TransitionStep
from rl.function_approx import X

def mc_tabular_prediction(
    traces: Iterable[Iterable[TransitionStep[S]]],
    γ: float,
    episode_length_tolerance: float = 1e-6
) -> Iterator[Mapping[X, float]]:
    """Every-visit tabular Monte Carlo prediction.

    Each trace is passed through ``returns(trace, γ, episode_length_tolerance)``
    and every resulting step (read via its ``.state`` and ``.return_``
    attributes) updates a per-state running average of the observed returns.

    Args:
        traces: iterable of episodes, each an iterable of transition steps.
        γ: discount factor, forwarded to ``returns``.
        episode_length_tolerance: forwarded unchanged to ``returns``
            (presumably bounds how much of a long trace is used — see
            ``rl.returns.returns`` to confirm).

    Yields:
        A state -> value-estimate dict: first the empty initial table, then
        one table after each processed episode. Every yielded dict is an
        independent snapshot, so collecting several of them (e.g. with
        ``list(itertools.islice(...))``) keeps the history intact.
    """
    values_map = {}   # state -> current estimate of the expected return
    counts_map = {}   # state -> number of returns averaged so far

    yield dict(values_map)

    for trace in traces:
        for step in returns(trace, γ, episode_length_tolerance):
            # Bump the visit count *before* deriving the step size so the
            # n-th sample gets weight 1/n. Weighting by the previous count
            # would give the 2nd sample weight 1 and silently discard the 1st.
            visits = counts_map.get(step.state, 0) + 1
            counts_map[step.state] = visits
            estimate = values_map.get(step.state, 0.)
            values_map[step.state] = \
                estimate + (step.return_ - estimate) / visits
        # Copy so later updates do not retroactively change what was yielded.
        yield dict(values_map)

Question 2: (Tabular TD Prediction)

In [132]:
def td_tabular_prediction(
    transitions: Iterable[TransitionStep[S]],
    γ: float,
    alph: float = 0.03,
    beta: float = 0.5,
    H: float = 1000
) -> Iterator[Mapping[X, float]]:
    """Tabular TD(0) prediction over a stream of transitions.

    For each transition (read via ``.state``, ``.reward`` and ``.next_state``)
    the estimate of ``state`` is moved towards the one-step target
    ``reward + γ * V(next_state)``. States that have never been updated
    (which includes any state that only ever appears as ``next_state``)
    are treated as having value 0.

    The step size for the n-th update is
    ``alph / (1 + ((n - 1) / H) ** beta)``.

    NOTE(review): ``n`` counts *all* updates made so far, across every state;
    it is not a per-state visit count. Confirm this is the intended schedule
    when comparing against a reference that decays per state.

    Args:
        transitions: iterable of transition steps.
        γ: discount factor applied to the next state's value.
        alph: initial step size.
        beta: exponent of the step-size decay.
        H: scale (in number of updates) of the step-size decay; must be
            non-zero.

    Yields:
        A state -> value-estimate dict: first the empty initial table, then
        one table after every transition. Every yielded dict is an
        independent snapshot, so earlier results are not overwritten by
        later updates.
    """
    values_map = {}   # state -> current value estimate

    yield dict(values_map)

    for updates, transition in enumerate(transitions, start=1):
        step_size = alph / (1 + ((updates - 1) / H) ** beta)
        current = values_map.get(transition.state, 0.)
        # One-step bootstrapped target; unseen next states count as 0.
        target = transition.reward \
            + γ * values_map.get(transition.next_state, 0.)
        values_map[transition.state] = current + step_size * (target - current)
        # Copy so later updates do not retroactively change what was yielded.
        yield dict(values_map)

Question 3: (Test the new implementations of the MC and TD algorithms using the MRP from rl/chapter2/simple_inventory_mrp.py)

In [133]:
# Construct testing process
from rl.markov_process import NonTerminal
from rl.chapter2.simple_inventory_mrp import InventoryState
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from rl.markov_process import FiniteMarkovRewardProcess

## Set parameters
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0
user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)
# Rebuilt from the transition-reward map; below it is only used to enumerate
# the non-terminal states whose values get printed.
Test_Process = FiniteMarkovRewardProcess(
        si_mrp.get_transition_reward_map()
    )

# test using MC in the BOOK
from rl.function_approx import Tabular, FunctionApprox
from rl.monte_carlo import mc_prediction
from rl.approximate_dynamic_programming import ValueFunctionApprox
from rl.markov_process import TransitionStep

user_episode_length = 100
user_num_episodes = 100
Initial_InventoryState = NonTerminal(InventoryState(0,0))

# Reference result: the library's MC prediction with a tabular approximation.
pred_mc: Iterator[ValueFunctionApprox[S]] = mc_prediction(
         traces = si_mrp.reward_traces(Constant(Initial_InventoryState)),
         approx_0 = Tabular(),
         γ = user_gamma
)
pred_mc_tabular: ValueFunctionApprox[S] \
    = iterate.last(itertools.islice(pred_mc, user_num_episodes))

print("Tabular MC in book")
pprint({s: pred_mc_tabular(s) for s in Test_Process.non_terminal_states})

# Self-made implementation on freshly simulated traces from the same start
# state. Its first yielded item is the empty initial table, so taking
# `user_num_episodes` items covers `user_num_episodes - 1` episodes.
pred_mc_test: Iterator[Mapping[X, float]] = mc_tabular_prediction(
              traces = si_mrp.reward_traces(Constant(Initial_InventoryState)),
              γ = user_gamma
)
pred_mc_tabular_test: Mapping[X, float] \
    = iterate.last(itertools.islice(pred_mc_test, user_num_episodes))

print("Self-made Tabular MC")
# float(...) accepts both NumPy scalars and plain Python floats (`.item()`
# exists only on the former); states never visited are reported as 0.
pprint({s: float(pred_mc_tabular_test.get(s, 0.))
        for s in Test_Process.non_terminal_states})

Tabular MC in book
{NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.379297688171324,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.88618045547448,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.921721037184003,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.63595660535119,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.48367471271226,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.280296765256875}
Self-made Tabular MC
{NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.54749029271724,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -29.064033740000188,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -28.071514157450466,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.62943132343653,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.503610819721416,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.517518528757908}


In [129]:
# test using TD in the BOOK
from rl.td import td_prediction

user_episode_length = 100
user_num_episodes = 1000
# Total number of generator items consumed from each TD run below.
num_transitions = user_episode_length * user_num_episodes

# parameters in learning rate
alph = 0.03
beta = 0.5
H = 1000

# Reference result: the library's TD prediction with a tabular approximation
# using the decaying step size defined by alph / beta / H.
pred_td: Iterator[ValueFunctionApprox[S]] = td_prediction(
         transitions = si_mrp.simulate_reward(Constant(Initial_InventoryState)),
         approx_0 = Tabular(count_to_weight_func = lambda n: alph/(1+((n-1)/H)**beta)),
         γ = user_gamma)
pred_td_tabular: ValueFunctionApprox[S] \
    = iterate.last(itertools.islice(pred_td, num_transitions))

print("Tabular TD in book")
pprint({s: pred_td_tabular(s) for s in Test_Process.non_terminal_states})

# Self-made implementation. The learning-rate parameters are passed explicitly
# so both runs keep using the same alph / beta / H if the values above are
# edited, instead of relying on the function defaults happening to match.
pred_td_test: Iterator[Mapping[X, float]] = td_tabular_prediction(
              transitions = si_mrp.simulate_reward(Constant(Initial_InventoryState)),
              γ = user_gamma,
              alph = alph,
              beta = beta,
              H = H)
pred_td_tabular_test: Mapping[X, float] \
    = iterate.last(itertools.islice(pred_td_test, num_transitions))

print("Self-made Tabular TD")
# float(...) accepts both NumPy scalars and plain Python floats (`.item()`
# exists only on the former); states never updated are reported as 0.
pprint({s: float(pred_td_tabular_test.get(s, 0.))
        for s in Test_Process.non_terminal_states})

Tabular TD in book
{NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.622525584801767,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.736294972889755,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.852091192324906,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.33290263519442,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.108707084693147,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.272061703808046}
Self-made Tabular TD
{NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.872969704985923,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.881735923742305,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.228925351270824,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.49517804806315,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.37348401033258,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.336314112690246}
