In [23]:
import sys 
import os
import numpy as np
import itertools
from dataclasses import dataclass
from matplotlib import pyplot as plt
from typing import Optional, Mapping, Dict, Tuple
from scipy.stats import poisson

sys.path.append(os.path.abspath("/Users/quinnhollister/RL-book/rl/markov_process.py"))
from rl import markov_process
from rl.distribution import Categorical, Constant, Categorical, FiniteDistribution
from rl.markov_process import FiniteMarkovProcess, NonTerminal, MarkovRewardProcess, FiniteMarkovRewardProcess
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy



@dataclass(frozen=True)
class TwoStores:
    on_hand_A: int
    on_hand_B: int
    on_order_A: int
    on_order_B: int

    def inventory_position(self) -> Tuple[int,int]:
        intA = self.on_hand_A + self.on_order_A
        intB = self.on_hand_B + self.on_order_B
        return (intA, intB)

InvOrderMapping = Mapping[TwoStores,                
                         Mapping[Tuple[int, int, int], 
                         Categorical[Tuple[TwoStores, float]]]
                         ]

#Mapping takes us from current state, to action, to next possible state and the reward associated

class TwoStoresMDP(FiniteMarkovDecisionProcess[TwoStores, int]):
    
    def __init__(self, capacityA: int, capacityB: int, poisson_A: float, poisson_B: float, 
                 holding_cost_A: float, holding_cost_B: float, stockout_cost_A: float, 
                 stockout_cost_B: float, supplier_cost: float, transfer_cost: float):
        
        self.capacity_A: int = capacity_A
        self.capacity_B: int = capacity_B
        self.poisson_A: float = poisson_A
        self.poisson_B: float = poisson_B
        self.holding_cost_A: float = holding_cost_A
        self.holding_cost_B: float = holding_cost_B
        self.stockout_cost_A: float = stockout_cost_A
        self.stockout_cost_B: float = stockout_cost_B
        self.supplier_cost: float = supplier_cost
        self.transfer_cost: float = transfer_cost
            
        self.poisson_distr_A = poisson(poisson_A)
        self.poisson_distr_B = poisson(poisson_B)
        super().__init__(self.get_action_transition_reward_map())
        
        
        
        
        
        
    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d: Dict[TwoStores, Dict[Tuple[int,int,int], Categorical[Tuple[TwoStores, float]]]] = {}
        
        #State space is essentially a 4-tuple so we need to do 4-inner loops to represent all possible
        #permutations
        for alpha_A in range(self.capacity_A + 1):
            for alpha_B in range(self.capacity_B + 1):
                for beta_A in range(self.capacity_A - alpha_A + 1):
                    for beta_B in range(self.capacity_B - alpha_B + 1):
                        
                        state: TwoStores = TwoStores(alpha_A, alpha_B, beta_A, beta_B)
                        ip_A, ip_B = state.inventory_position()
                        base_reward = -(self.holding_cost_A * alpha_A) 
                        base_reward -= self.holding_cost_B * alpha_B
                        d1: Dict[Tuple[int,int,int], Categorical[Tuple[TwoStates, float]]] = {}
                            
                        #Now we'll list out actions    
                        for swap in range(-1*alpha_A, alpha_B + 1):
                            for order_A in range(self.capacity_A - ip_A + swap + 1):
                                for order_B in range(self.capacity_B - ip_B - swap + 1):
                                    
                                    
                                    base_reward -= self.transfer_cost*swap
                                    base_reward -= self.supplier_cost*(order_A + order_B)
                                    
                                    sr_probs_dict: Dict[Tuple[TwoStore, float], float] = {}
                                        
                                    #Now list out next possible states given action and current state  
                                    #Either neither store sells out, or one of them sells out, or both sell out
                                    #Leaving us with four different possibilities
                                    for demand_A in range(ip_A + swap):
                                        for demand_B in range(ip_B - swap):
                                            
                                            prob = self.poisson_distr_A.pmf(demand_A)
                                            prob *= self.poisson_distr_B.pmf(demand_B)
                                            sr_probs_dict[(TwoStores(ip_A - demand_A + swap, order_A, ip_B - demand_B - swap, order_B), 
                                                          base_reward)] = prob
                                            
                                    #We've covered when both stores don't stockout, now well do when they both do
                                    
                                    probability_A: float = (1 - self.poisson_distr_A.cdf(ip_A-1))
                                    probability_B: float = (1 - self.poisson_distr_B.cdf(ip_B-1))
                                        
                                    stockout_A = self.stockout_cost_A * (probability_A*(self.poisson_A - ip_A) +\
                                                                        (ip_A)*(self.poisson_distr_A.pmf(ip_A)))
                                    stockout_B = self.stockout_cost_B * (probability_B*(self.poisson_B - ip_B) +\
                                                                        (ip_B)*(self.poisson_distr_B.pmf(ip_B)))
                                        
                                    reward: float = base_reward - stockout_A - stockout_B
                                        
                                    sr_probs_dict[(TwoStores(0, order_A, 0, order_B), reward)] = probability_A * probability_B
                                    
                                    #Now cover when A goes stockout
                                    
                                    for demand_B in range(ip_B - swap):
                                        prob = probability_A * self.poisson_distr_B.pmf(demand_B)
                                        sr_probs_dict[(TwoStores(0, order_A, ip_B - demand_B - swap, order_B), reward + stockout_B)] = prob
                                    
                                    
                                    #Now cover when B goes stockout
                                    for demand_A in range(ip_A + swap):
                                        prob = probability_B * self.poisson_distr_A.pmf(demand_A)
                                        sr_probs_dict[(TwoStores(ip_A - demand_A + swap, order_A, 0, order_B), reward + stockout_A)] = prob
                                    
                                    
                                    #We've filled the statespace, now add the dictionaries
                                    d1[(swap, order_A, order_B)] = Categorical(sr_probs_dict)
                                    
                            d[state] = d1

        return d
                            
                        

In [37]:
#Now lets create an instance of our class
from rl.dynamic_programming import value_iteration_result


capacity_A = 3
capacity_B = 3
lambda_A = 1
lambda_B = 1
holding_A = 1.0
holding_B = 1.0 
stockout_A = 10.0
stockout_B = 1000.0
supplier_cost = 3.0
transfer_cost = 0.5

user_gamma = 0.9


two_store_mdp: FiniteMarkovDecisionProcess[TwoStores, int] =\
    TwoStoresMDP(
        capacityA = capacity_A,
        capacityB = capacity_B,
        poisson_A = lambda_A,
        poisson_B = lambda_B,
        holding_cost_A = holding_A,
        holding_cost_B = holding_B,
        stockout_cost_A = stockout_A,
        stockout_cost_B = stockout_B,
        supplier_cost = supplier_cost,
        transfer_cost = transfer_cost
    )
    

In [38]:
opt_vf_vi, opt_policy_vi = value_iteration_result(two_store_mdp, gamma = user_gamma)

print(opt_policy_vi)


For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=0, on_order_B=0): Do Action (0, 1, 3)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=0, on_order_B=1): Do Action (0, 2, 2)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=0, on_order_B=2): Do Action (0, 3, 1)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=0, on_order_B=3): Do Action (0, 2, 0)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=1, on_order_B=0): Do Action (0, 1, 3)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=1, on_order_B=1): Do Action (0, 2, 2)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=1, on_order_B=2): Do Action (0, 1, 1)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=1, on_order_B=3): Do Action (0, 2, 0)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=2, on_order_B=0): Do Action (0, 1, 3)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_order_A=2, on_order_B=1): Do Action (0, 0, 2)
For State TwoStores(on_hand_A=0, on_hand_B=0, on_o