In [1]:
from dataclasses import dataclass
from typing import Tuple, Iterator, Dict, Mapping, Sequence, Optional, Generic, TypeVar
from abc import ABC, abstractmethod
from dataclasses import dataclass
import itertools
import numpy as np
from scipy.stats import poisson
import os

def checkPath():
    for i in range(5):
        path = os.getcwd()
        
        if path[-7:] == 'RL-book':
            break
        else:
            os.chdir('../')      
    print(os.getcwd())

if __name__ == '__main__':
    checkPath()

C:\Users\lunar\PycharmProjects\RL-book


![nn](image-9.jpg)

![nn](image-10.jpg)

In [2]:
#도영님 파트

$P_{R}$ : transition probability function of the MRP / state-reward transition probability function of the MDP.

$MRP: \mathcal{P}_{R}\left(s, r, s^{\prime}\right)=\mathbb{P}\left[\left(R_{t+1}=r, S_{t+1}=s^{\prime}\right) \mid S_{t}=s\right] $


$MDP: \mathcal{P}_{R}\left(s, a, r, s^{\prime}\right)=\mathbb{P}\left[\left(R_{t+1}=r, S_{t+1}=s^{\prime}\right) \mid\left(S_{t}=s, A_{t}=a\right)\right]$

$P$ : transition probability function of the Markov Process implicit in the MRP / state transition probability function of the MDP.

$MRP: \mathcal{P}\left(s, s^{\prime}\right)=\sum_{r \in \mathcal{D}} \mathcal{P}_{R}\left(s, r, s^{\prime}\right)$

$MDP: \mathcal{P}\left(s, a, s^{\prime}\right)=\sum_{r \in \mathcal{D}} \mathcal{P}_{R}\left(s, a, r, s^{\prime}\right)$

$R_{T}$ : reward transition function of the MRP / reward transition function of the MDP.

$MRP: \mathcal{R}_{T}\left(s, s^{\prime}\right)=\mathbb{E}\left[R_{t+1} \mid S_{t+1}=s^{\prime}, S_{t}=s\right]$

$MDP: \mathcal{R}_{T}\left(s, a, s^{\prime}\right)=\mathbb{E}\left[R_{t+1} \mid\left(S_{t+1}=s^{\prime}, S_{t}=s, A_{t}=a\right)\right]$

$R$ : reward function of the MRP / reward function of the MDP

$MRP: \mathcal{R}(s)=\mathbb{E}\left[R_{t+1} \mid S_{t}=s\right]$

$MDP: \mathcal{R}(s, a)=\mathbb{E}\left[R_{t+1} \mid\left(S_{t}=s, A_{t}=a\right)\right]$


$$
\mathcal{P}_{R}^{\pi}\left(s, r, s^{\prime}\right)=\sum_{a \in \mathcal{A}} \pi(s, a) \cdot \mathcal{P}_{R}\left(s, a, r, s^{\prime}\right)
$$
Likewise,
$$
\begin{aligned}
\mathcal{P}^{\pi}\left(s, s^{\prime}\right) &=\sum_{a \in \mathcal{A}} \pi(s, a) \cdot \mathcal{P}\left(s, a, s^{\prime}\right) \\
\mathcal{R}_{T}^{\pi}\left(s, s^{\prime}\right) &=\sum_{a \in \mathcal{A}} \pi(s, a) \cdot \mathcal{R}_{T}\left(s, a, s^{\prime}\right) \\
\mathcal{R}^{\pi}(s) &=\sum_{a \in \mathcal{A}} \pi(s, a) \cdot \mathcal{R}(s, a)
\end{aligned}
$$

In [11]:
from rl.distribution import Distribution
from rl.markov_process import MarkovRewardProcess
from typing import Iterable

A = TypeVar('A')
S = TypeVar('S')

class Policy(ABC, Generic[S, A]):
    
    @abstractmethod
    def act(self, state: S) -> Optional[Distribution[A]]:
        pass


@dataclass(frozen=True)
class TransitionStep(Generic[S, A]):
    state: S
    action: A
    next_state: S
    reward: float
        
class MarkovDecisionProcess(ABC, Generic[S, A]):
    
    #  input: state: S, and produces as output an Iterable[A] to represent the set of actions allowable for the input state
    @abstractmethod
    def actions(self, state: S) -> Iterable[A]:
        pass

    # It is means to specify the distribution of pairs of next state and reward, given a state and action.
    @abstractmethod
    def step(self, state: S, action: A) -> Optional[Distribution[Tuple[S, float]]]:
        pass
    
    # input: a policy: Policy[S, A] and returns MRP
    def apply_policy(self, policy: Policy[S, A]) -> MarkovRewardProcess[S]:
        mdp = self
        
        class RewardProcess(MarkovRewardProcess[S]):
            def transition_reward(self, state: S) -> \
            Optional[Distribution[Tuple[S, float]]]:
                actions: Optional[Distribution[A]] = policy.act(state)
                if actions is None:
                    return None
                return actions.apply(lambda a: mdp.step(state, a))
        
        return RewardProcess()
    
    # input state: S and returns a bool signifying whether state is a terminal state or not.
    # Since the actions method can returns the Iterbale type of allowable actions, 
    # the only way to check is by checking that next method triggers error.
    def is_terminal(self, state: S) -> bool:
        try:
            next(iter(self.actions(state)))
            return False
        except StopIteration:
            return True
        
    def simulate_actions(self, start_states: Distribution[S], policy: Policy[S, A]) -> \
    Iterable[TransitionStep[S, A]]:
        state: S = start_states.sample()
        reward: float = 0
            
        while True:
            # action distribution from given policy and state
            action_distribution = policy.act(state)
            if action_distribution is None:
                return
            
            # the distribution of paris of next state and reward, given a state and action.
            action = action_distribution.sample()
            next_distribution = self.step(state, action)
            if next_distribution is None:
                return
            
            # after sampling, yield TransitionStep
            next_state, reward = next_distribution.sample()
            yield TransitionStep(state, action, next_state, reward)
            state = next_state

In [4]:
from dataclasses import dataclass
from typing import Tuple, Iterator
import itertools
import numpy as np
from scipy.stats import poisson
import random

from rl.markov_decision_process import MarkovDecisionProcess
from rl.markov_process import MarkovRewardProcess, NonTerminal, State
from rl.policy import Policy, DeterministicPolicy
from rl.distribution import Constant, SampledDistribution


@dataclass(frozen=True)
class InventoryState:
    on_hand: int
    on_order: int

    def inventory_position(self) -> int:
        return self.on_hand + self.on_order


@dataclass(frozen=True)
class SimpleInventoryMDPNoCap(MarkovDecisionProcess[InventoryState, int]):
    # hyperparameters
    poisson_lambda: float
    holding_cost: float
    stockout_cost: float

    def step(
        self,
        state: NonTerminal[InventoryState],
        order: int
    ) -> SampledDistribution[Tuple[State[InventoryState], float]]:

        def sample_next_state_reward(
            state=state,
            order=order
        ) -> Tuple[State[InventoryState], float]:
            demand_sample: int = np.random.poisson(self.poisson_lambda)
            ip: int = state.state.inventory_position()
            next_state: InventoryState = InventoryState(
                max(ip - demand_sample, 0),
                order
            )
            reward: float = - self.holding_cost * state.state.on_hand\
                - self.stockout_cost * max(demand_sample - ip, 0)
            return NonTerminal(next_state), reward

        return SampledDistribution(sample_next_state_reward)

    def actions(self, state: NonTerminal[InventoryState]) -> Iterator[int]:
        return itertools.count(start=0, step=1)

    def fraction_of_days_oos(
        self,
        policy: Policy[InventoryState, int],
        time_steps: int,
        num_traces: int
    ) -> float:
        impl_mrp: MarkovRewardProcess[InventoryState] =\
            self.apply_policy(policy)
        count: int = 0
        high_fractile: int = int(poisson(self.poisson_lambda).ppf(0.98))
        start: InventoryState = random.choice(
            [InventoryState(i, 0) for i in range(high_fractile + 1)])

        for _ in range(num_traces):
            steps = itertools.islice(
                impl_mrp.simulate_reward(Constant(NonTerminal(start))),
                time_steps
            )
            for step in steps:
                if step.reward < -self.holding_cost * step.state.state.on_hand:
                    count += 1

        return float(count) / (time_steps * num_traces)


class SimpleInventoryDeterministicPolicy(
        DeterministicPolicy[InventoryState, int]
):
    def __init__(self, reorder_point: int):
        self.reorder_point: int = reorder_point

        def action_for(s: InventoryState) -> int:
            return max(self.reorder_point - s.inventory_position(), 0)

        super().__init__(action_for)


class SimpleInventoryStochasticPolicy(Policy[InventoryState, int]):
    def __init__(self, reorder_point_poisson_mean: float):
        self.reorder_point_poisson_mean: float = reorder_point_poisson_mean

    def act(self, state: NonTerminal[InventoryState]) -> \
            SampledDistribution[int]:
        def action_func(state=state) -> int:
            reorder_point_sample: int = \
                np.random.poisson(self.reorder_point_poisson_mean)
            return max(
                reorder_point_sample - state.state.inventory_position(),
                0
            )
        return SampledDistribution(action_func)


if __name__ == '__main__':
    user_poisson_lambda = 2.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0
# hyperparameters
    user_reorder_point = 8
    user_reorder_point_poisson_mean = 8.0

    user_time_steps = 1000
    user_num_traces = 1000

    si_mdp_nocap = SimpleInventoryMDPNoCap(poisson_lambda=user_poisson_lambda,
                                           holding_cost=user_holding_cost,
                                           stockout_cost=user_stockout_cost)

    si_dp = SimpleInventoryDeterministicPolicy(
        reorder_point=user_reorder_point
    )

    oos_frac_dp = si_mdp_nocap.fraction_of_days_oos(policy=si_dp,
                                                    time_steps=user_time_steps,
                                                    num_traces=user_num_traces)
    print(
        f"Deterministic Policy yields {oos_frac_dp * 100:.2f}%"
        + " of Out-Of-Stock days"
    )

    si_sp = SimpleInventoryStochasticPolicy(
        reorder_point_poisson_mean=user_reorder_point_poisson_mean)

    oos_frac_sp = si_mdp_nocap.fraction_of_days_oos(policy=si_sp,
                                                    time_steps=user_time_steps,
                                                    num_traces=user_num_traces)
    print(
        f"Stochastic Policy yields {oos_frac_sp * 100:.2f}%"
        + " of Out-Of-Stock days"
    )


Deterministic Policy yields 1.88% of Out-Of-Stock days
Stochastic Policy yields 2.94% of Out-Of-Stock days


**Finite Markov Decision Process**
![nn](image-1.png)

In [5]:
# %load C:\Users\14ZD\RL-book\rl\markov_decision_process.py
from __future__ import annotations
import sys
import os
sys.path.append('C:/Users/14ZD/RL-book')
import numpy
import graphviz
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
from typing import (DefaultDict, Dict, Iterable, Generic, Mapping,
                    Tuple, Sequence, TypeVar, Set)

from rl.distribution import (Categorical, Distribution, FiniteDistribution)

from rl.markov_process import (
    FiniteMarkovRewardProcess, MarkovRewardProcess, StateReward, State,
    NonTerminal, Terminal)
from rl.policy import FinitePolicy, Policy

A = TypeVar('A')
S = TypeVar('S')

#print("start")
@dataclass(frozen=True)
class TransitionStep(Generic[S, A]):
    '''A single step in the simulation of an MDP, containing:

    state -- the state we start from
    action -- the action we took at that state
    next_state -- the state we ended up in after the action
    reward -- the instantaneous reward we got for this transition
    '''
    state: NonTerminal[S]
    action: A
    next_state: State[S]
    reward: float

    def add_return(self, γ: float, return_: float) -> ReturnStep[S, A]:
        '''Given a γ and the return from 'next_state', this annotates the
        transition with a return for 'state'.

        '''
        return ReturnStep(
            self.state,
            self.action,
            self.next_state,
            self.reward,
            return_=self.reward + γ * return_
        )


@dataclass(frozen=True)
class ReturnStep(TransitionStep[S, A]):
    '''A Transition that also contains the total *return* for its starting
    state.

    '''
    return_: float


class MarkovDecisionProcess(ABC, Generic[S, A]):
    def apply_policy(self, policy: Policy[S, A]) -> MarkovRewardProcess[S]:
        mdp = self

        class RewardProcess(MarkovRewardProcess[S]):
            def transition_reward(
                self,
                state: NonTerminal[S]
            ) -> Distribution[Tuple[State[S], float]]:
                actions: Distribution[A] = policy.act(state)
                return actions.apply(lambda a: mdp.step(state, a))

        return RewardProcess()

    @abstractmethod
    def actions(self, state: NonTerminal[S]) -> Iterable[A]:
        pass

    @abstractmethod
    def step(
        self,
        state: NonTerminal[S],
        action: A
    ) -> Distribution[Tuple[State[S], float]]:
        pass

    def simulate_actions(
            self,
            start_states: Distribution[NonTerminal[S]],
            policy: Policy[S, A]
    ) -> Iterable[TransitionStep[S, A]]:
        '''Simulate this MDP with the given policy, yielding the
        sequence of (states, action, next state, reward) 4-tuples
        encountered in the simulation trace.

        '''
        state: State[S] = start_states.sample()

        while isinstance(state, NonTerminal):
            action_distribution = policy.act(state)

            action = action_distribution.sample()
            next_distribution = self.step(state, action)

            next_state, reward = next_distribution.sample()
            yield TransitionStep(state, action, next_state, reward)
            state = next_state

    def action_traces(
            self,
            start_states: Distribution[NonTerminal[S]],
            policy: Policy[S, A]
    ) -> Iterable[Iterable[TransitionStep[S, A]]]:
        '''Yield an infinite number of traces as returned by
        simulate_actions.

        '''
        while True:
            yield self.simulate_actions(start_states, policy)


ActionMapping = Mapping[A, StateReward[S]]
StateActionMapping = Mapping[NonTerminal[S], ActionMapping[A, S]]



class FiniteMarkovDecisionProcess(MarkovDecisionProcess[S, A]):
    '''A Markov Decision Process with finite state and action spaces.

    '''

    mapping: StateActionMapping[S, A]
    non_terminal_states: Sequence[NonTerminal[S]]

    def __init__(
        self,
        mapping: Mapping[S, Mapping[A, FiniteDistribution[Tuple[S, float]]]]
    ):
        non_terminals: Set[S] = set(mapping.keys())
        self.mapping = {NonTerminal(s): {a: Categorical(
            {(NonTerminal(s1) if s1 in non_terminals else Terminal(s1), r): p
             for (s1, r), p in v.table().items()}
        ) for a, v in d.items()} for s, d in mapping.items()}
        self.non_terminal_states = list(self.mapping.keys())

    def __repr__(self) -> str:
        display = ""
        for s, d in self.mapping.items():
            display += f"From State {s.state}:\n"
            for a, d1 in d.items():
                display += f"  With Action {a}:\n"
                for (s1, r), p in d1:
                    opt = "Terminal " if isinstance(s1, Terminal) else ""
                    display += f"    To [{opt}State {s1.state} and "\
                        + f"Reward {r:.3f}] with Probability {p:.3f}\n"
        return display

    def step(self, state: NonTerminal[S], action: A) -> StateReward[S]:
        action_map: ActionMapping[A, S] = self.mapping[state]
        return action_map[action]

    def apply_finite_policy(self, policy: FinitePolicy[S, A])\
            -> FiniteMarkovRewardProcess[S]:

        transition_mapping: Dict[S, FiniteDistribution[Tuple[S, float]]] = {}

        for state in self.mapping:
            action_map: ActionMapping[A, S] = self.mapping[state]
            outcomes: DefaultDict[Tuple[S, float], float]\
                = defaultdict(float)
            actions = policy.act(state)
            for action, p_action in actions:
                for (s1, r), p in action_map[action].table().items():
                    outcomes[(s1.state, r)] += p_action * p

            transition_mapping[state.state] = Categorical(outcomes)

        return FiniteMarkovRewardProcess(transition_mapping)

    def actions(self, state: NonTerminal[S]) -> Iterable[A]:
        '''All the actions allowed for the given state.

        This will be empty for terminal states.

        '''
        return self.mapping[state].keys()

#print("Done")

In [6]:
class FiniteMarkovDecisionProcess(MarkovDecisionProcess[S, A]):
    '''A Markov Decision Process with finite state and action spaces.

    '''

    mapping: StateActionMapping[S, A]
    non_terminal_states: Sequence[NonTerminal[S]]
    
    #input: mapping
    #Maps Non-terminal state to an action map
    #Terminal state to None
    #Maps each action to a finite probability distributino of pairs (next state, reward)
    def __init__(
        self,
        mapping: Mapping[S, Mapping[A, FiniteDistribution[Tuple[S, float]]]]
    ):
        non_terminals: Set[S] = set(mapping.keys())
        self.mapping = {NonTerminal(s): {a: Categorical(
            {(NonTerminal(s1) if s1 in non_terminals else Terminal(s1), r): p
             for (s1, r), p in v.table().items()}
        ) for a, v in d.items()} for s, d in mapping.items()}
        self.non_terminal_states = list(self.mapping.keys())

    def __repr__(self) -> str:
        display = ""
        for s, d in self.mapping.items():
            display += f"From State {s.state}:\n"
            for a, d1 in d.items():
                display += f"  With Action {a}:\n"
                for (s1, r), p in d1:
                    opt = "Terminal " if isinstance(s1, Terminal) else ""
                    display += f"    To [{opt}State {s1.state} and "\
                        + f"Reward {r:.3f}] with Probability {p:.3f}\n"
        return display
    
    
    #step() returns finite probability distribution of (next state, reward)
    #if self.mapping[state] == terminal, returns None
    def step(self, state: NonTerminal[S], action: A) -> StateReward[S]:
        action_map: ActionMapping[A, S] = self.mapping[state]
        return action_map[action]
    
    
    #Input: FinitePolicy[S,A] which maps non-terminal state to probability distribution over a finite set of actions
    #returns a FiniteMRP
    def apply_finite_policy(self, policy: FinitePolicy[S, A])\
            -> FiniteMarkovRewardProcess[S]:

        transition_mapping: Dict[S, FiniteDistribution[Tuple[S, float]]] = {}

        for state in self.mapping:
            action_map: ActionMapping[A, S] = self.mapping[state]
            outcomes: DefaultDict[Tuple[S, float], float]\
                = defaultdict(float)
            actions = policy.act(state)
            for action, p_action in actions:
                for (s1, r), p in action_map[action].table().items():
                    outcomes[(s1.state, r)] += p_action * p

            transition_mapping[state.state] = Categorical(outcomes)

        return FiniteMarkovRewardProcess(transition_mapping)
    
    
    #Iterable한 actions들을 mapping[state].keys()로 return
    def actions(self, state: NonTerminal[S]) -> Iterable[A]:
        '''All the actions allowed for the given state.

        This will be empty for terminal states.

        '''
        return self.mapping[state].keys()

$\mathcal{N} \rightarrow(\mathcal{A} \rightarrow(\mathcal{S} \times \mathcal{D} \rightarrow[0,1]))$

$\pi : \mathcal{N} \rightarrow(\mathcal{A} \rightarrow[0,1])$

In [7]:
# state x reward
StateReward = FiniteDistribution[Tuple[S, float]]
# action X (state x reward)
ActionMapping = Mapping[A, StateReward[S]]
# state X (action X (state X reward))
StateActionMapping = Mapping[S, Optional[ActionMapping[A, S]]]

In [8]:
from rl.distribution import SampledDistribution
from collections import defaultdict
from rl.distribution import Categorical

class FiniteMarkovDecisionProcess(MarkovDecisionProcess[S, A]):
    
    # state X (action X (state X reward))
    mapping: StateActionMapping[S, A]
    non_terminal_states: Sequence[S]
    
    def __init__(self, mapping: StateActionMapping[S, A]):
        self.mapping = mapping
        self.non_terminal_states = [s for s, v in mapping.items()]
        
    # Create display string using nested for statements
    def __repr__(self) -> str:
        display = ""
        for s, d in self.mapping.items():
            if d is None:
                display += f"{s} is Terminal State\n"
            else:
                display += f"From State {s}:\n"
                for a, d1 in d.items():
                    display += f"  With Action {a}:\n"
                    for (s1, r), p in d1.table():
                        display += f"    To [State {s1} and "\
                            + f"Reward {r:.3f} with Probability {p:.3f}\n"
                        
        return display
    
    def apply_finite_policy(self, policy: FinitePolicy[S, A])\
        -> FiniteMarkovRewardProcess[S]:
        
        # Create dict: state X (state X reward)
        transition_mapping: Dict[S, Optional[StateReward[S]]] = {}
            
        for state in self.mapping:
            # Create ActionMapping: action X (state X reward)
            action_map: Optional[ActionMapping[A, S]] = self.mapping[state]
            if action_map is None:
                transition_mapping[state] = None
                
            # Create outcomes: Tuple(state X reward), prob
            else:
                outcomes: DefaultDict[Tuple[S, float], float]\
                    = defaultdict(float)
                # actions: action X prob
                actions = policy.act(state)
                if actions is not None:
                    for action, p_action in actions:
                        for outcome, p_state_reward in action_map[action]:
                            # state X (sum of all reward*action_prob)
                            outcomes[outcome] += p_action * p_state_reward
                 
                # state X (next_state X reward)
                transition_mapping[state] = Categorical(outcomes)
                
        return FiniteMarkovRewardProcess(transition_mapping)    
    
    # return probability
    def step(self, sate: S, action: A) -> Optional[StateReward]:
        action_map: Optional[ActionMapping[A, S]] = self.mapping[state]
        if action_map is None:
            return None
        return action_map[action]
    
    # return actions that are possible in the current state
    def actions(self, state: S) -> Iterable[A]:
        actions = self.mapping[state]
        return iter([]) if actions is None else actions.keys()
    
    # return ActionMapping
    def action_mapping(self, state: S) -> Optional[ActionMapping[A, S]]:
        return self.mapping[state]
    
    # return state key
    def states(self) -> Iterable[S]:
        return self.mapping.keys()
    
class FinitePolicy(Policy[S, A]):
    # state X distribution
    policy_map: Mapping[S, Optional[FiniteDistribution[A]]]
        
    def __init__(self, policy_map: Mapping[S, Optional[FiniteDistribution[A]]]):
        self.policy_map = policy_map
    
    # create display string using nested for statement
    def __repr__(self) -> str:
        display = ""
        for s, d in self.policy_map.items():
            if d is None:
                display += f"{s} is a Terminal State\n"
            else:
                display += f"For State {s}:\n"
                for a, p in d:
                    display += f" Do Action {a} with Probability {p:.3f}\n"
        return display

    # return action distribution
    def act(self, state: S) -> Optional[FiniteDistribution[A]]:
        return self.policy_map[state]
    
    # return state keys
    def states(self) -> Iterable[S]:
        return self.policy_map.keys()

In [9]:
from scipy.stats import poisson
from rl.distribution import Categorical

@dataclass(frozen=True)
class InventoryState:
    on_hand: int
    on_order: int
        
    def inventory_position(self) -> int:
        return self.on_hand + self.on_order
    
    
InvOrderMapping = StateActionMapping[InventoryState, int]

class SimpleInventoryMDPCap(FiniteMarkovDecisionProcess[InventoryState, int]):
    
    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_cost: float,
        stockout_cost: float
    ):
        self.capacity: int = capacity
        self.poisson_lambda: float = poisson_lambda
        self.holding_cost: float = holding_cost
        self.stockout_cost: float = stockout_cost
            
        self.poisson_distr = poisson(poisson_lambda)
        super().__init__(self.get_action_transition_reward_map())
        
    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d: Dict[InventoryState, Dict[int, Categorical[Tuple[InventoryState, float]]]] = {}
        
        for alpha in range(self.capacity + 1):
            for beta in range(self.capacity +1 - alpha):
                state: InventoryState = InventoryState(alpha, beta)
                ip: int = state. inventory_position()
                base_reward: float = -self.holding_cost * alpha
                d1: Dict[int, Categorical[Tuple[InventoryState, float]]] = {}
                    
                for order in range(self.capacity - ip + 1):
                    sr_probs_dict: Dict[Tuple[InventoryState, float], float] =\
                        {(InventoryState(ip - i, order), base_reward):
                        self.poisson_distr.pmf(i) for i in range(ip)}
                        
                    probability: float = 1 - self.poisson_distr.cdf(ip - 1)
                    reward: float = base_reward - self.stockout_cost *\
                        (probability * (self.poisson_lambda - ip) +
                        ip * self.poisson_distr.pmf(ip))
                    sr_probs_dict[(InventoryState(0, order), reward)] = \
                        probability
                    d1[order] = Categorical(sr_probs_dict)
                    
            d[state] = d1
        return d

In [10]:
from rl.distribution import Constant

user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
    SimpleInventoryMDPCap(
        capacity=user_capacity,
        poisson_lambda=user_poisson_lambda,
        holding_cost=user_holding_cost,
        stockout_cost=user_stockout_cost
    )
    
fdp: FinitePolicy[InventoryState, int] = FinitePolicy(
    {InventoryState(alpha, beta):
    Constant(user_capacity - (alpha + beta)) for alpha in
    range(user_capacity + 1) for beta in range(user_capacity + 1 - alpha)}
)
    
implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
    si_mdp.apply_finite_policy(fdp)
    
print(fdp)
print(implied_mrp)

For State InventoryState(on_hand=0, on_order=0):
 Do Action 2 with Probability 1.000
For State InventoryState(on_hand=0, on_order=1):
 Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=2):
 Do Action 0 with Probability 1.000
For State InventoryState(on_hand=1, on_order=0):
 Do Action 1 with Probability 1.000
For State InventoryState(on_hand=1, on_order=1):
 Do Action 0 with Probability 1.000
For State InventoryState(on_hand=2, on_order=0):
 Do Action 0 with Probability 1.000

From State InventoryState(on_hand=0, on_order=2):
  To [State InventoryState(on_hand=2, on_order=0) and Reward -0.000] with Probability 0.368
  To [Terminal State InventoryState(on_hand=1, on_order=0) and Reward -0.000] with Probability 0.368
  To [Terminal State InventoryState(on_hand=0, on_order=0) and Reward -1.036] with Probability 0.264
From State InventoryState(on_hand=1, on_order=1):
  To [State InventoryState(on_hand=2, on_order=0) and Reward -1.000] with Probability 0.368
  T

MDP Value Function for a fixed policy
![nn](image-2.png)
![nn](image-3.png)
![nn](image-4.png)

V(s):
![nn](image-5.png)

Action-value function which maps (state, action) pair to expected return
![nn](image-6.png)

State-value function = weighted average of action-value funciton 
![nn](image-7.png)


$V^{*}(s)=\max _{a \in \mathcal{A}} Q^{*}(s, a)$ for all $s \in \mathcal{N}$

$$
Q^{*}(s, a)=\mathcal{R}(s, a)+\gamma \cdot \sum_{s^{\prime} \in \mathcal{N}} \mathcal{P}\left(s, a, s^{\prime}\right) \cdot V^{*}\left(s^{\prime}\right) \text { for all } s \in \mathcal{N}, a \in \mathcal{A}
$$
Substituting for $Q^{*}(s, a)$ from Equation (2.6) in Equation (2.5) gives:
$$
V^{*}(s)=\max _{a \in \mathcal{A}}\left\{\mathcal{R}(s, a)+\gamma \cdot \sum_{s^{\prime} \in \mathcal{N}} \mathcal{P}\left(s, a, s^{\prime}\right) \cdot V^{*}\left(s^{\prime}\right)\right\} \text { for all } s \in \mathcal{N}
$$

![nn](image-8.png)

**MDP State-Value Function Bellman Optimality Equation**

$V^{*}(s)=\max _{a \in \mathcal{A}}\left\{\mathcal{R}(s, a)+\gamma \cdot \sum_{s^{\prime} \in \mathcal{N}} \mathcal{P}\left(s, a, s^{\prime}\right) \cdot V^{*}\left(s^{\prime}\right)\right\}$ for all $s \in \mathcal{N}$

**MDP Action-Value Function Bellman Optimality Equation**

$Q^{*}(s, a)=\mathcal{R}(s, a)+\gamma \cdot \sum_{s^{\prime} \in \mathcal{N}} \mathcal{P}\left(s, a, s^{\prime}\right) \cdot \max _{a^{\prime} \in \mathcal{A}} Q^{*}\left(s^{\prime}, a^{\prime}\right)$ for all $s \in \mathcal{N}, a \in \mathcal{A}$


MDP Bellman Optimality Equations address the ultimate purpose of Markov Decision Processes

- to identify the Optimal Value Function and the associated policy/policies
- enabling us to solve the MDP Control problem

**Optimal Policy as one that dominates all others**

$\pi^{*} \in \Pi$ is an Optimal Policy if $V^{\pi^{*}}(s) \geq V^{\pi}(s)$ for all $\pi \in \Pi$ and for all states $s \in \mathcal{N}$

**Theorem 2.0.1.** 

For any (discrete-time, countable-states, stationary) MDP:
- There exists an Optimal Policy $\pi^{*} \in \Pi$, i.e., there exists a Policy $\pi^{*} \in \Pi$ such that $V^{\pi^{*}}(s) \geq V^{\pi}(s)$ for all policies $\pi \in \Pi$ and for all states $s \in \mathcal{N}$
- All Optimal Policies achieve the Optimal Value Function, i.e. $V^{\pi^{*}}(s)=V^{*}(s)$ for all $s \in \mathcal{N}$, for all Optimal Policies $\pi^{*}$
- All Optimal Policies achieve the Optimal Action-Value Function, i.e. $Q^{\pi^{*}}(s, a)=$ $Q^{*}(s, a)$ for all $s \in \mathcal{N}$, for all $a \in \mathcal{A}$, for all Optimal Policies $\pi^{*}$

**Lemma 2.0.2**. 

For any two Optimal Policies $\pi_{1}^{*}$ and $\pi_{2}^{*}, V^{\pi_{1}^{*}}(s)=V^{\pi_{2}^{*}}(s)$ for all $s \in \mathcal{N}$ 

**Lemma 2.0.2 Proof.** 

Since $\pi_{1}^{*}$ is an Optimal Policy, from the Optimal Policy definition, we have: $V^{\pi_{1}^{*}}(s) \geq V^{\pi_{2}^{*}}(s)$ for all $s \in \mathcal{N}$. Likewise, since $\pi_{2}^{*}$ is an Optimal Policy, from the Optimal Policy definition, we have: $V^{\pi_{2}^{*}}(s) \geq V^{\pi_{1}^{2}}(s)$ for all $s \in \mathcal{N}$. This implies: $V^{\pi_{1}^{*}}(s)=V^{\pi_{2}^{*}}(s)$ for all $s \in \mathcal{N}$.

$$
\pi_{D}^{*}(s)=\underset{a \in \mathcal{A}}{\arg \max } Q^{*}(s, a) \text { for all } s \in \mathcal{N}
$$


we show that $\pi_{D}^{*}$ achieves the Optimal Value Functions $V^{*}$ and $Q^{*}$. 

Since $\pi_{D}^{*}(s)=\arg \max _{a \in \mathcal{A}} Q^{*}(s, a)$ and $V^{*}(s)=\max _{a \in \mathcal{A}} Q^{*}(s, a)$ for all $s \in \mathcal{N}$, we can infer for all $s \in \mathcal{N}$ that:
$$
V^{*}(s)=Q^{*}\left(s, \pi_{D}^{*}(s)\right)
$$

$$
\begin{gathered}
V^{\pi_{D}^{*}}(s)=V^{*}(s) \text { for all } s \in \mathcal{N} \\
Q^{\pi_{D}^{*}}(s, a)=Q^{*}(s, a) \text { for all } s \in \mathcal{N}, \text { for all } a \in \mathcal{A}
\end{gathered}
$$