In [1]:
import numpy as np 
from dataclasses import dataclass

In [15]:
@dataclass
class Process1:
    @dataclass
    class State:
        price: int

    level_param: int  # level to which price mean-reverts
    alpha1: float = 0.25  # strength of mean-reversion (non-negative value)

    def up_prob(self, state: State) -> float:
        return 1. / (1+np.exp(-self.alpha1*(state.price - self.level_param)))

    def next_state(self, state: State) -> State:
        up_move: int = np.random.binomial(1, self.up_prob(state), 1)[0]
        return Process1.State(price=state.price + up_move * 2 - 1)


In [16]:
def simulation(process, start_state):
  state = start_state 
  while True: 
    yield state 
    state = process.next_state(state)

In [17]:
import itertools 

def process1_price_traces(
  start_price: int, 
  level_param: int, 
  alpha1: float, 
  time_steps: int, 
  num_traces: int 
) -> np.ndarray: 
  process = Process1(level_param=level_param, alpha1=alpha1)
  start_state = Process1.State(price=start_price)
  return np.vstack([
    np.fromiter((s.price for s in itertools.islice(
      simulation(process, start_state), 
      time_steps + 1
    )), float) for _ in range(num_traces)
  ])

In [18]:
start_price: int = 100
level_param: int = 100
alpha1: float = 0.25
alpha2: float = 0.75
alpha3: float = 1.0
time_steps: int = 100
num_traces: int = 1000

process1_traces: np.ndarray = process1_price_traces(
    start_price=start_price,
    level_param=level_param,
    alpha1=alpha1,
    time_steps=time_steps,
    num_traces=num_traces
)

In [19]:
print(process1_traces)

[[100. 101. 100. ... 186. 187. 188.]
 [100. 101. 102. ...  32.  31.  30.]
 [100.  99. 100. ...   8.   7.   6.]
 ...
 [100.  99.  98. ...   6.   5.   4.]
 [100.  99.  98. ... 192. 193. 194.]
 [100.  99.  98. ...   6.   5.   4.]]


In [20]:
from dataclasses import dataclass
from typing import Tuple, Dict, Mapping
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical
from scipy.stats import poisson


@dataclass(frozen=True)
class InventoryState:
    on_hand: int
    on_order: int

    def inventory_position(self) -> int:
        return self.on_hand + self.on_order


InvOrderMapping = Mapping[
    InventoryState,
    Mapping[int, Categorical[Tuple[InventoryState, float]]]
]


class SimpleInventoryMDPCap(FiniteMarkovDecisionProcess[InventoryState, int]):

    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_cost: float,
        stockout_cost: float
    ):
        self.capacity: int = capacity
        self.poisson_lambda: float = poisson_lambda
        self.holding_cost: float = holding_cost
        self.stockout_cost: float = stockout_cost

        self.poisson_distr = poisson(poisson_lambda)
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d: Dict[InventoryState, Dict[int, Categorical[Tuple[InventoryState,
                                                            float]]]] = {}

        for alpha in range(self.capacity + 1):
            for beta in range(self.capacity + 1 - alpha):
                state: InventoryState = InventoryState(alpha, beta)
                ip: int = state.inventory_position()
                base_reward: float = - self.holding_cost * alpha
                d1: Dict[int, Categorical[Tuple[InventoryState, float]]] = {}

                for order in range(self.capacity - ip + 1):
                    sr_probs_dict: Dict[Tuple[InventoryState, float], float] =\
                        {(InventoryState(ip - i, order), base_reward):
                        self.poisson_distr.pmf(i) for i in range(ip)}

                    probability: float = 1 - self.poisson_distr.cdf(ip - 1)
                    reward: float = base_reward - self.stockout_cost * \
                        (self.poisson_lambda - ip * 
                        (1 - self.poisson_distr.pmf(ip) / probability))
                    sr_probs_dict[(InventoryState(0, order), reward)] = \
                        probability
                    d1[order] = Categorical(sr_probs_dict)

                d[state] = d1
        return d


if __name__ == '__main__':
    from pprint import pprint

    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0

    user_gamma = 0.9

    si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        )

    print("MDP Transition Map")
    print("------------------")
    print(si_mdp)

    fdp: FiniteDeterministicPolicy[InventoryState, int] = \
        FiniteDeterministicPolicy(
            {InventoryState(alpha, beta): user_capacity - (alpha + beta)
            for alpha in range(user_capacity + 1)
            for beta in range(user_capacity + 1 - alpha)}
    )

    print("Deterministic Policy Map")
    print("------------------------")
    print(fdp)

    implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
        si_mdp.apply_finite_policy(fdp)
    print("Implied MP Transition Map")
    print("--------------")
    print(FiniteMarkovProcess(
        {s.state: Categorical({s1.state: p for s1, p in v.table().items()})
        for s, v in implied_mrp.transition_map.items()}
    ))

    print("Implied MRP Transition Reward Map")
    print("---------------------")
    print(implied_mrp)

    print("Implied MP Stationary Distribution")
    print("-----------------------")
    implied_mrp.display_stationary_distribution()
    print()

    print("Implied MRP Reward Function")
    print("---------------")
    implied_mrp.display_reward_function()
    print()

    print("Implied MRP Value Function")
    print("--------------")
    implied_mrp.display_value_function(gamma=user_gamma)
    print()

    from rl.dynamic_programming import evaluate_mrp_result
    from rl.dynamic_programming import policy_iteration_result
    from rl.dynamic_programming import value_iteration_result

    print("Implied MRP Policy Evaluation Value Function")
    print("--------------")
    pprint(evaluate_mrp_result(implied_mrp, gamma=user_gamma))
    print()

    print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_pi, opt_policy_pi = policy_iteration_result(
        si_mdp,
        gamma=user_gamma
    )
    pprint(opt_vf_pi)
    print(opt_policy_pi)
    print()

    print("MDP Value Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
    pprint(opt_vf_vi)
    print(opt_policy_vi)
    print()

MDP Transition Map
------------------
From State InventoryState(on_hand=0, on_order=0):
  With Action 0:
    To [State InventoryState(on_hand=0, on_order=0) and Reward -10.000] with Probability 1.000
  With Action 1:
    To [State InventoryState(on_hand=0, on_order=1) and Reward -10.000] with Probability 1.000
  With Action 2:
    To [State InventoryState(on_hand=0, on_order=2) and Reward -10.000] with Probability 1.000
From State InventoryState(on_hand=0, on_order=1):
  With Action 0:
    To [State InventoryState(on_hand=1, on_order=0) and Reward -0.000] with Probability 0.368
    To [State InventoryState(on_hand=0, on_order=0) and Reward -5.820] with Probability 0.632
  With Action 1:
    To [State InventoryState(on_hand=1, on_order=1) and Reward -0.000] with Probability 0.368
    To [State InventoryState(on_hand=0, on_order=1) and Reward -5.820] with Probability 0.632
From State InventoryState(on_hand=0, on_order=2):
  With Action 0:
    To [State InventoryState(on_hand=2, on_order=

Below is one acceptable answer. One possible solution was to “lift‐and‐shift” the single–store code into a new MDP whose state now is a pair of inventory states (one for each store) and whose actions are a triple giving (order₁, order₂, transfer) where the “order” components are subject to the usual capacity constraints and the “transfer” decision (positive meaning “from store 1 to store 2” and negative meaning “from store 2 to store 1”) is chosen from the physically feasible set (you cannot send more than a store’s current on–hand inventory). In addition, fixed ordering cost (K₁) is incurred at a store if its order is nonzero and a fixed transfer cost (K₂) is incurred whenever a nonzero transfer is made. (The cost‐penalties for holding and for stockouts remain “local” to each store.) One acceptable solution is shown below.

---

```python
"""
TwoStoresInventoryMDP.py

We model a two–store inventory control problem as a Finite MDP. Each store has its
own capacity, Poisson demand (with its own mean), holding cost and stockout cost.
At “6pm” each store may order inventory (subject to the constraint that the store’s
inventory position, defined as on_hand + on_order, does not exceed capacity) and
an inventory transfer between the stores is also possible (with a fixed cost K₂ if
a nonzero transfer occurs). Orders incur a fixed cost K₁. (All costs are taken as penalties.)
We then solve for the optimal value function and policy using standard DP methods.
"""

from dataclasses import dataclass
from typing import Tuple, Dict, Mapping
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.distribution import Categorical
from scipy.stats import poisson

# We re–use the same InventoryState as in the single–store example.
@dataclass(frozen=True)
class InventoryState:
    on_hand: int
    on_order: int

    def inventory_position(self) -> int:
        return self.on_hand + self.on_order

# The state of the two–store system is just a pair of InventoryStates.
@dataclass(frozen=True)
class TwoStoreInventoryState:
    store1: InventoryState
    store2: InventoryState

    def __str__(self):
        return f"(Store1: {self.store1}, Store2: {self.store2})"

# For clarity we define our action type to be a triple:
# (order1, order2, transfer)
# where transfer > 0 means transferring that many units from store 1 to store 2,
# transfer < 0 means transferring |transfer| units from store 2 to store 1.
TwoStoreAction = Tuple[int, int, int]

# Type alias for the mapping (from states to action maps).
TwoStoreMapping = Mapping[
    TwoStoreInventoryState,
    Mapping[TwoStoreAction, Categorical[Tuple[TwoStoreInventoryState, float]]]
]

class TwoStoresInventoryMDP(FiniteMarkovDecisionProcess[TwoStoreInventoryState, TwoStoreAction]):
    def __init__(
        self,
        capacity1: int,
        capacity2: int,
        poisson_lambda1: float,
        poisson_lambda2: float,
        holding_cost1: float,
        holding_cost2: float,
        stockout_cost1: float,
        stockout_cost2: float,
        fixed_order_cost: float,
        fixed_transfer_cost: float
    ):
        self.capacity1 = capacity1
        self.capacity2 = capacity2
        self.poisson_lambda1 = poisson_lambda1
        self.poisson_lambda2 = poisson_lambda2
        self.holding_cost1 = holding_cost1
        self.holding_cost2 = holding_cost2
        self.stockout_cost1 = stockout_cost1
        self.stockout_cost2 = stockout_cost2
        self.fixed_order_cost = fixed_order_cost
        self.fixed_transfer_cost = fixed_transfer_cost

        # Pre‐compute the Poisson distributions.
        self.poisson1 = poisson(poisson_lambda1)
        self.poisson2 = poisson(poisson_lambda2)
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> TwoStoreMapping:
        mapping: Dict[TwoStoreInventoryState,
                      Dict[TwoStoreAction, Categorical[Tuple[TwoStoreInventoryState, float]]]] = {}

        # Enumerate over all possible states for store 1 and store 2.
        for on_hand1 in range(self.capacity1 + 1):
            for on_order1 in range(self.capacity1 - on_hand1 + 1):
                state1 = InventoryState(on_hand1, on_order1)
                for on_hand2 in range(self.capacity2 + 1):
                    for on_order2 in range(self.capacity2 - on_hand2 + 1):
                        state2 = InventoryState(on_hand2, on_order2)
                        state = TwoStoreInventoryState(state1, state2)

                        # Determine feasible orders.
                        max_order1 = self.capacity1 - state1.inventory_position()
                        max_order2 = self.capacity2 - state2.inventory_position()
                        orders1 = list(range(max_order1 + 1))
                        orders2 = list(range(max_order2 + 1))

                        # Determine feasible transfers.
                        # We allow transfer from store 1 to store 2 up to state1.on_hand,
                        # or from store 2 to store 1 up to state2.on_hand.
                        transfer_set = {0}
                        for t in range(1, state1.on_hand + 1):
                            transfer_set.add(t)
                        for t in range(1, state2.on_hand + 1):
                            transfer_set.add(-t)

                        action_dict: Dict[TwoStoreAction, Categorical[Tuple[TwoStoreInventoryState, float]]] = {}
                        for order1 in orders1:
                            for order2 in orders2:
                                for transfer in transfer_set:
                                    action: TwoStoreAction = (order1, order2, transfer)
                                    # Compute fixed costs.
                                    cost = 0.0
                                    if order1 > 0:
                                        cost += self.fixed_order_cost
                                    if order2 > 0:
                                        cost += self.fixed_order_cost
                                    if transfer != 0:
                                        cost += self.fixed_transfer_cost
                                    # Also, incur holding costs on the current on‐hand levels.
                                    cost += self.holding_cost1 * state1.on_hand
                                    cost += self.holding_cost2 * state2.on_hand
                                    base_reward = -cost

                                    # The available inventory to meet demand is the store’s inventory position
                                    # adjusted by any transfer.
                                    ip1 = state1.inventory_position()
                                    ip2 = state2.inventory_position()
                                    if transfer >= 0:
                                        # transferring from store1 to store2
                                        ip1_prime = ip1 - transfer
                                        ip2_prime = ip2 + transfer
                                    else:
                                        t_abs = -transfer
                                        ip1_prime = ip1 + t_abs
                                        ip2_prime = ip2 - t_abs

                                    # For each store, we mimic the single–store dynamics:
                                    # (i) For demand d < available inventory, new on–hand is (available – d);
                                    # (ii) For demand d >= available inventory, we lump the tail with an extra penalty.
                                    outcomes1: Dict[Tuple[int, float], float] = {}
                                    for d1 in range(ip1_prime):
                                        new_on_hand1 = ip1_prime - d1
                                        outcomes1[(new_on_hand1, 0.0)] = self.poisson1.pmf(d1)
                                    tail_prob1 = 1 - self.poisson1.cdf(ip1_prime - 1) if ip1_prime > 0 else 1.0
                                    if tail_prob1 > 0:
                                        # Following the same “aggregation” used in the single–store code.
                                        adj1 = - self.stockout_cost1 * (
                                            self.poisson_lambda1 - ip1_prime * (1 - self.poisson1.pmf(ip1_prime) / tail_prob1)
                                        )
                                        outcomes1[(0, adj1)] = tail_prob1

                                    outcomes2: Dict[Tuple[int, float], float] = {}
                                    for d2 in range(ip2_prime):
                                        new_on_hand2 = ip2_prime - d2
                                        outcomes2[(new_on_hand2, 0.0)] = self.poisson2.pmf(d2)
                                    tail_prob2 = 1 - self.poisson2.cdf(ip2_prime - 1) if ip2_prime > 0 else 1.0
                                    if tail_prob2 > 0:
                                        adj2 = - self.stockout_cost2 * (
                                            self.poisson_lambda2 - ip2_prime * (1 - self.poisson2.pmf(ip2_prime) / tail_prob2)
                                        )
                                        outcomes2[(0, adj2)] = tail_prob2

                                    # Combine outcomes for store1 and store2 (assuming independent demands).
                                    sr_probs: Dict[Tuple[TwoStoreInventoryState, float], float] = {}
                                    for (new_on_hand1, adj1), prob1 in outcomes1.items():
                                        for (new_on_hand2, adj2), prob2 in outcomes2.items():
                                            next_state = TwoStoreInventoryState(
                                                InventoryState(new_on_hand1, order1),
                                                InventoryState(new_on_hand2, order2)
                                            )
                                            total_reward = base_reward + adj1 + adj2
                                            key = (next_state, total_reward)
                                            sr_probs[key] = sr_probs.get(key, 0.0) + prob1 * prob2

                                    action_dict[action] = Categorical(sr_probs)
                        mapping[state] = action_dict
        return mapping

# A simple test and policy computation using value iteration.
if __name__ == '__main__':
    from rl.dynamic_programming import value_iteration_result

    # Discount factor.
    gamma = 0.9

    # Example problem parameters.
    capacity1 = 3
    capacity2 = 3
    poisson_lambda1 = 1.0
    poisson_lambda2 = 2.0
    holding_cost1 = 1.0
    holding_cost2 = 1.0
    stockout_cost1 = 10.0
    stockout_cost2 = 10.0
    fixed_order_cost = 5.0
    fixed_transfer_cost = 2.0

    mdp = TwoStoresInventoryMDP(
        capacity1, capacity2,
        poisson_lambda1, poisson_lambda2,
        holding_cost1, holding_cost2,
        stockout_cost1, stockout_cost2,
        fixed_order_cost, fixed_transfer_cost
    )

    opt_vf, opt_policy = value_iteration_result(mdp, gamma=gamma)
    print("Optimal Value Function:")
    for state, value in opt_vf.items():
        print(f"{state}: {value}")

    print("\nOptimal Policy:")
    print(opt_policy)
```

---

### Explanation and Analysis

1. **State and Action spaces:**  
   The new state is a pair  
   $$
   s = \bigl(\texttt{InventoryState(on\_hand₁, on\_order₁)},\,\texttt{InventoryState(on\_hand₂, on\_order₂)}\bigr),
   $$
   where (on_hand, on_order) for each store satisfy the capacity constraint  
   $$
   \text{on\_hand} + \text{on\_order} \le \text{Capacity}.
   $$
   The actions are triples $(\text{order₁}, \text{order₂}, \text{transfer})$ where the ordering decisions obey  
   $$
   \text{order}_i \in \{0,1,\dots, \text{Capacity}_i - (\text{on\_hand}_i+\text{on\_order}_i)\},
   $$
   and the transfer decision is chosen from the feasible set: a store can send at most its available on–hand inventory (so, if store 1 has on_hand = a₁ then it may send any t in 1,…,a₁; similarly, store 2 may send up to its on–hand inventory in the negative direction).

2. **Transitions and Rewards:**  
   Once an action is chosen the “available inventory” for demand is computed as the store’s current inventory position (on_hand + on_order) adjusted by the transfer. Then, independent Poisson demands are realized at each store. (For computational efficiency the tail of the demand distribution is lumped as in the single–store example.) In addition to the usual “stockout” penalties and holding costs, fixed ordering and transfer costs are subtracted immediately when the corresponding (nonzero) action is taken.

3. **Optimal Policy:**  
   When the problem is solved (for example by calling the provided `value_iteration_result`), one typically finds that:  
   - **Ordering:** A store with low current inventory (or low inventory position) tends to order more in order to “top–up” its inventory, but the fixed ordering cost makes it undesirable to order a little at a time.  
   - **Transfers:** When one store has a surplus relative to its demand distribution while the other is relatively “short,” the optimal policy may call for transferring inventory rather than placing an order. (In our example the transfer cost is lower than the ordering cost, so a moderate transfer is often optimal.)  
   - **Parameter Sensitivity:** As the fixed order cost increases, the policy tends to delay ordering or rely more on transfers if possible. Conversely, if the demand mean is high (or if stockout penalties are steep), the policy is more aggressive in ordering (and perhaps transferring) to avoid stockouts.

This behavior is entirely intuitive. In many real–world settings, managers must balance the trade–offs between incurring fixed order (or shipping) costs versus the risk and cost of stockouts, and sharing inventory between stores is a natural way to do so. The above implementation and its computed optimal policy illustrate these trade–offs in a formal dynamic–programming model.

---

Any solution that produces a similar MDP model and obtains an optimal policy with these properties is acceptable.