# CME 241 Assignment 14

## Shaan Patel

### Question 2

In [7]:
from typing import Iterable, Iterator, TypeVar, Tuple, Sequence, Callable, Dict
import rl.markov_decision_process as mp
import rl.distribution as ds
from rl.policy import DeterministicPolicy
from rl.approximate_dynamic_programming import QValueFunctionApprox
from rl.function_approx import LinearFunctionApprox, Weights
from collections import defaultdict
import numpy as np

In [2]:
# Generic type variables used by the annotations in this notebook:
# S parameterises state types and A parameterises action types.
S = TypeVar('S')
A = TypeVar('A')

def greedy_policy_from_qvf(
    q: QValueFunctionApprox[S,A],
    actions: Callable[[mp.NonTerminal[S]], Iterable[A]]
) -> DeterministicPolicy[S,A]:
    """Build the deterministic policy that is greedy with respect to ``q``.

    Args:
        q: Q-value approximation whose ``argmax`` is evaluated on
            ``(non-terminal state, action)`` pairs.
        actions: Returns the candidate actions for a non-terminal state.

    Returns:
        A ``DeterministicPolicy`` wrapping a function from a raw state to
        the action selected by ``q.argmax`` among ``actions`` for that state.
    """
    def optimal_action(s: S) -> A:
        # The policy is handed a raw state; q and `actions` both work on the
        # NonTerminal wrapper, so wrap once and reuse it.
        wrapped = mp.NonTerminal(s)
        candidates = ((wrapped, choice) for choice in actions(wrapped))
        _, best_action = q.argmax(candidates)
        return best_action

    return DeterministicPolicy(optimal_action)


def LSTD(
    transitions: Iterable[mp.TransitionStep[S,A]],
    feature_functions: Sequence[Callable[[Tuple[mp.NonTerminal[S],A]],float]],
    approx_pol: DeterministicPolicy[S,A],
    gamma: float,
    epsilon: float
) -> LinearFunctionApprox[Tuple[mp.NonTerminal[S],A]]:
    """Fit linear Q-value weights for ``approx_pol`` by least-squares TD.

    Accumulates ``A = epsilon * I + sum(phi1 * phi2^T)`` and
    ``b = sum(phi1 * reward)`` over the transitions, keeping ``A^{-1}``
    up to date with a rank-one (Sherman-Morrison) update per transition,
    and returns the approximation with weights ``A^{-1} b``.

    Args:
        transitions: Finite iterable of transition steps; it is consumed once.
        feature_functions: Feature functions, each called with a single
            ``(non-terminal state, action)`` tuple. The same functions are
            handed to the returned ``LinearFunctionApprox``.
        approx_pol: Policy being evaluated; queried with the raw
            (unwrapped) next state to pick the next action.
        gamma: Discount factor applied to the next-state features.
        epsilon: Scale of the initial matrix ``epsilon * I``; must be
            non-zero because the initial inverse is ``I / epsilon``.

    Returns:
        A ``LinearFunctionApprox`` over ``(state, action)`` pairs using
        ``feature_functions`` and the solved weights.
    """
    num_features: int = len(feature_functions)
    a_inv: np.ndarray = np.eye(num_features) / epsilon
    b_vec: np.ndarray = np.zeros(num_features)

    for tr in transitions:
        # Features are evaluated on one (state, action) tuple, the same form
        # used for the next-state features below and for the returned
        # approximation's inputs.
        phi1: np.ndarray = np.array(
            [f((tr.state, tr.action)) for f in feature_functions]
        )
        if isinstance(tr.next_state, mp.NonTerminal):
            # The policy is deterministic, so look the next action up once
            # instead of once per feature function.
            next_action = approx_pol.action_for(tr.next_state.state)
            phi2 = phi1 - gamma * np.array(
                [f((tr.next_state, next_action)) for f in feature_functions]
            )
        else:
            # Terminal next state: no bootstrapped term.
            phi2 = phi1
        # Sherman-Morrison update of A^{-1} for A <- A + phi1 * phi2^T.
        temp: np.ndarray = a_inv.T.dot(phi2)
        a_inv = a_inv - np.outer(a_inv.dot(phi1), temp) / (1 + phi1.dot(temp))
        b_vec += phi1 * tr.reward

    opt_wts: np.ndarray = a_inv.dot(b_vec)
    return LinearFunctionApprox.create(
        feature_functions=feature_functions,
        weights=Weights.create(opt_wts)
    )

def LSPI(
    transitions: Iterable[mp.TransitionStep[S,A]],
    actions: Callable[[mp.NonTerminal[S]], Iterable[A]],
    feature_functions: Sequence[Callable[[Tuple[mp.NonTerminal[S],A]],float]],
    approx_pol: DeterministicPolicy[S,A],
    gamma: float,
    epsilon: float
) -> Iterator[LinearFunctionApprox[Tuple[mp.NonTerminal[S], A]]]:
    """Yield successive Q-value approximations from least-squares policy iteration.

    Each iteration evaluates the current policy with ``LSTD`` on the same
    batch of transitions, switches to the policy that is greedy with
    respect to the result, and yields that result.

    Args:
        transitions: Finite iterable of transition steps; materialised into
            a list up front so every iteration can reuse it.
        actions: Returns the candidate actions for a non-terminal state.
        feature_functions: Feature functions evaluated on
            ``(non-terminal state, action)`` tuples.
        approx_pol: Policy used for the first evaluation.
        gamma: Discount factor passed to ``LSTD``.
        epsilon: Initial-matrix scale passed to ``LSTD``.

    Yields:
        The ``LinearFunctionApprox`` produced by each evaluation. The
        generator never terminates on its own; the caller decides how many
        iterations to take.
    """
    pi: DeterministicPolicy[S,A] = approx_pol
    transition_seq: Sequence[mp.TransitionStep[S,A]] = list(transitions)
    while True:
        q: LinearFunctionApprox[Tuple[mp.NonTerminal[S], A]] = LSTD(
            transitions=transition_seq,
            feature_functions=feature_functions,
            approx_pol=pi,
            gamma=gamma,
            epsilon=epsilon
        )
        # Improve the policy before yielding so the next evaluation uses it.
        pi = greedy_policy_from_qvf(q, actions)
        yield q


    

### Question 3

In [9]:
class OptionsMDP(mp.MarkovDecisionProcess[int, str]):
    """Option-exercise MDP with an integer state and actions 'e' / 'c'.

    The state is an integer that ``step`` increments by one. The underlying
    price is held on the instance rather than in the state. No ``__init__``
    is defined here, so ``price``, ``strike`` and ``iscall`` must be assigned
    on the instance before ``get_payoff`` or ``step`` is used.
    """

    # Current price of the underlying; mutated by `step` on action 'c'.
    price: float
    # Strike price of the option.
    strike: float
    # True selects the call payoff, False the put payoff.
    iscall: bool

    def get_payoff(self) -> float:
        """Return the exercise payoff at the current price.

        ``max(0, price - strike)`` when ``iscall`` is true, otherwise
        ``max(0, strike - price)``.
        """
        if self.iscall:
            return max(0.0, self.price - self.strike)
        return max(0.0, self.strike - self.price)

    def actions(self, state: mp.NonTerminal[int]) -> Iterable[str]:
        """Return the available actions: 'e' and 'c' for every state."""
        return ['e', 'c']

    def step(
        self,
        state: mp.NonTerminal[int],
        action: str,
    ) -> ds.Distribution[Tuple[mp.State[int], float]]:
        """Return the (next state, reward) distribution for ``action``.

        Args:
            state: Current non-terminal state.
            action: 'e' pays ``get_payoff()``; any other value is treated
                as 'c', which moves the price and pays 0.

        Returns:
            A ``Constant`` distribution over one ``(next state, reward)``
            pair, where the next state's value is ``state.state + 1``.
        """
        # Unwrap the state before incrementing; the NonTerminal wrapper
        # itself does not support `+ 1`.
        next_time = state.state + 1

        if action == 'e':
            # Exercising ends the episode, so the successor is terminal and
            # the payoff cannot be collected again.
            return ds.Constant((mp.Terminal(next_time), self.get_payoff()))

        # NOTE(review): the price moves when `step` is called, not when the
        # returned distribution is sampled, and it is not part of the state.
        # Confirm this is intended.
        self.price += np.random.randn()
        return ds.Constant((mp.NonTerminal(next_time), 0.0))

            
    