# CME 241 Assignment 11

## Shaan Patel

### Question 1

In [2]:
from typing import Iterable, Iterator, TypeVar, Mapping
import rl.markov_process as mp
from rl.returns import returns
from collections import defaultdict

In [15]:
S = TypeVar('S')


def tabular_mc(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    approx_0: Mapping[S, float],
    gamma: float,
    episode_length_tolerance: float = 1e-6
) -> Iterator[Mapping[S, float]]:
    """Tabular Monte Carlo prediction using running-average updates.

    Each trace is converted to return steps with ``returns(trace, gamma,
    episode_length_tolerance)``. Every return step in an episode then moves
    the estimate for its state towards the observed return with step size
    ``1 / n``, where ``n`` is the number of updates applied to that state so
    far by this call.

    Args:
        traces: Episodes, each an iterable of transition steps.
        approx_0: Value table to update. It is modified IN PLACE, so it must
            support item assignment (e.g. ``dict`` / ``defaultdict``).
            States missing from a plain ``dict`` are treated as 0.0.
        gamma: Discount factor forwarded to ``returns``.
        episode_length_tolerance: Forwarded unchanged to ``returns``.

    Yields:
        A shallow-copy snapshot of the value table: first the initial table,
        then one snapshot after each processed episode. Snapshots are
        independent of each other and of ``approx_0``.
    """
    # Local import so the cell does not depend on an extra top-level import.
    import copy

    visits = defaultdict(int)

    episodes: Iterator[Iterator[mp.ReturnStep[S]]] = (
        returns(trace, gamma, episode_length_tolerance) for trace in traces
    )
    values = approx_0
    # Yield copies: yielding `values` itself would make every collected
    # element alias the same (final) table.
    yield copy.copy(values)

    for episode in episodes:
        for step in episode:
            state = step.state
            visits[state] += 1
            try:
                current = values[state]
            except KeyError:
                # Plain dict without an entry for this state yet.
                current = 0.0
            values[state] = current + (1 / visits[state]) * (step.return_ - current)
        yield copy.copy(values)



### Question 2

In [4]:
def tabular_td(
    transitions: Iterable[mp.TransitionStep[S]],
    approx_0: Mapping[S, float],
    gamma: float
) -> Iterator[Mapping[S, float]]:
    """Tabular TD(0) prediction with per-state ``1 / n`` step sizes.

    For each transition the estimate of ``state`` is moved towards the
    one-step target ``reward + gamma * V(next_state)`` with step size
    ``1 / n``, where ``n`` counts the updates applied to ``state`` so far by
    this call.

    Args:
        transitions: Transition steps exposing ``state``, ``reward`` and
            ``next_state`` attributes.
        approx_0: Value table to update. It is modified IN PLACE, so it must
            support item assignment (e.g. ``dict`` / ``defaultdict``).
            States missing from a plain ``dict`` are treated as 0.0.
        gamma: Discount factor applied to the next state's value.

    Yields:
        A shallow-copy snapshot of the value table: first the initial table,
        then one snapshot after each processed transition. Snapshots are
        independent of each other and of ``approx_0``.
    """
    # Local import so the cell does not depend on an extra top-level import.
    import copy

    def lookup(table, key):
        # Indexing (rather than .get) keeps defaultdict's own default; plain
        # mappings fall back to 0.0 instead of raising KeyError.
        try:
            return table[key]
        except KeyError:
            return 0.0

    visits = defaultdict(int)
    values = approx_0
    # Yield copies: yielding `values` itself would make every collected
    # element alias the same (final) table.
    yield copy.copy(values)

    for t in transitions:
        visits[t.state] += 1
        current = lookup(values, t.state)
        bootstrap = lookup(values, t.next_state)
        values[t.state] = current + (1 / visits[t.state]) * (
            t.reward + gamma * bootstrap - current
        )
        yield copy.copy(values)

### Question 3

In [12]:
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from rl.distribution import Choose
import itertools
from pprint import pprint

In [6]:
# Zero-initialised value table (unseen states default to 0).
approx_0 = defaultdict(int)

# Parameters of the finite simple-inventory MRP.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# Discount factor shared by the reference solution and the experiments.
user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost,
)

# Display the model's own value function as a reference for the estimates.
si_mrp.display_value_function(gamma=user_gamma)


{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.511,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.932,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.345,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.932,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.345,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.345}


In [19]:
# Reward traces whose start state is drawn via Choose over the
# non-terminal states of the MRP.
traces = si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))

approx = defaultdict(lambda: 0)

it = tabular_mc(traces, approx, user_gamma)

num_traces = 2000

# tabular_mc yields the initial table before consuming any trace, so one
# extra item is needed for exactly `num_traces` traces to be processed.
val_list = list(itertools.islice(it, num_traces + 1))

last = val_list[-1]

# Report the final yielded table rather than relying on `approx` having
# been mutated in place.
pprint({s: last[s] for s in si_mrp.non_terminal_states})

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.5035978132024,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.942053750561858,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.345518409109868,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.9437094000773,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.356687017040663,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.352151921795816}
