In [31]:
from typing import Iterator, Iterable, Mapping, Callable, Sequence
import numpy as np
from rl.markov_process import NonTerminal, TransitionStep
from rl.approximate_dynamic_programming import S,A

Question 1. (LSTD, as covered in class)

In [116]:
def lstd_prediction(
    transitions: Iterable[TransitionStep[S]],
    feature_functions: Sequence[Callable[[NonTerminal[S]], float]],
    gamma: float,
    epsilon: float
) -> Callable[[NonTerminal[S]], float]:
    """Batch Least-Squares TD (LSTD) prediction with linear features.

    Starting from A = epsilon * I and b = 0, every transition (s, r, s')
    contributes

        A += phi(s) (phi(s) - gamma * phi(s'))^T
        b += phi(s) * r

    where phi(x) is the vector of all feature functions evaluated at x.
    When s' is not a NonTerminal, the gamma * phi(s') term is dropped.
    The weights w are then obtained by solving A w = b.

    Args:
        transitions: transitions to learn from; each one is read through
            its `state`, `next_state` and `reward` attributes. The iterable
            is consumed exactly once.
        feature_functions: one callable per feature, mapping a non-terminal
            state to a number.
        gamma: discount factor applied to the next state's features.
        epsilon: scale of the identity matrix used to initialise A.

    Returns:
        A callable mapping a non-terminal state to the dot product of its
        feature vector with the fitted weights.

    Raises:
        numpy.linalg.LinAlgError: if the accumulated matrix A is singular.
    """
    def features_of(state: NonTerminal[S]) -> np.ndarray:
        # Feature vector phi(state), in the order of feature_functions.
        return np.array([f(state) for f in feature_functions])

    num_features: int = len(feature_functions)
    a_mat: np.ndarray = np.eye(num_features) * epsilon
    b_vec: np.ndarray = np.zeros(num_features)

    for tr in transitions:
        phi: np.ndarray = features_of(tr.state)
        if isinstance(tr.next_state, NonTerminal):
            phi_diff: np.ndarray = phi - gamma * features_of(tr.next_state)
        else:
            # No bootstrapping from a next state that is not NonTerminal.
            phi_diff = phi
        a_mat += np.outer(phi, phi_diff)
        b_vec += phi * tr.reward

    # Solve A w = b directly rather than forming A^{-1} explicitly: it is
    # cheaper and loses less precision when A is poorly conditioned.
    opt_wts: np.ndarray = np.linalg.solve(a_mat, b_vec)

    return lambda s: np.dot(features_of(s), opt_wts)

In [117]:
#construct test process
from rl.chapter10.random_walk_mrp import RandomWalkMRP
from rl.iterate import iterate
import itertools
from rl.approximate_dynamic_programming import NTStateDistribution
from rl.distribution import Choose

# Constants for the test process and for the size of the experiment.
this_barrier: int = 20
this_p: float = 0.55
gamma = 1.0
num_transitions: int = 10000

random_walk: RandomWalkMRP = RandomWalkMRP(barrier=this_barrier, p=this_p)
# Value function obtained from the process itself for this gamma.
true_vf: np.ndarray = random_walk.get_value_function_vec(gamma=gamma)

# Start states are drawn (via Choose) from the process's non-terminal states.
nt_states: Sequence[NonTerminal[int]] = random_walk.non_terminal_states
start_distribution: NTStateDistribution[int] = Choose(set(nt_states))

# Flatten the stream of traces into a single stream of transitions and keep
# only the first num_transitions of them. The result is a lazy, single-use
# iterator.
traces: Iterable[Iterable[TransitionStep[int]]] = (
    random_walk.reward_traces(start_distribution)
)
transitions: Iterable[TransitionStep[int]] = (
    itertools.chain.from_iterable(traces)
)
lstd_transitions: Iterable[TransitionStep[int]] = itertools.islice(
    transitions, num_transitions
)

In [None]:
#simple test
from rl.chapter12.laguerre import laguerre_state_features
from rl.function_approx import LinearFunctionApprox

num_polynomials:int = 5
features: Sequence[Callable[[NonTerminal[int]], float]] = \
    laguerre_state_features(num_polynomials)
epsilon: float = 1e-4

# lstd_prediction returns a plain callable from state to value, so annotate
# it as such. Module-level annotations are evaluated at run time, so the
# annotation must only use names that are actually in scope.
# NOTE: lstd_transitions is a single-use iterator; re-run the cell that
# builds it before re-running this one, otherwise no transitions are seen.
lstd_func_test: Callable[[NonTerminal[int]], float] = \
    lstd_prediction(
        transitions = lstd_transitions,
        feature_functions = features,
        gamma = gamma,
        epsilon = epsilon
    )
# Evaluate the fitted function on every non-terminal state, in the same
# order as nt_states, so it lines up with x_vals below.
lstd_vf_test: np.ndarray = np.array([lstd_func_test(s) for s in nt_states])

x_vals: Sequence[int] = [s.state for s in nt_states]

from rl.gen_utils.plot_funcs import plot_list_of_curves

plot_list_of_curves(
    [x_vals, x_vals],
    [true_vf, lstd_vf_test],
    ["b", "g"],
    ["True Value Function", "self-made LSTD Function"],
    x_label="States",
    y_label="Value Function",
    title="LSTD and self-made LSTD versus True Value Function"
)

Question 3. (LSPI customized for American Options Pricing)