In [1]:
import cvxpy as cp
import numpy as np
import dccp
import numpy.typing as npt
from enum import Enum
from multireward_ope.tabular.mdp import MDP
from multireward_ope.tabular.characteristic_time import CharacteristicTimeSolver
from multireward_ope.tabular.reward_set import RewardSet, RewardSetCircle, RewardSetType, RewardSetRewardFree, RewardSetBox
from typing import NamedTuple


# Demo: solve the characteristic-time problem on one small random MDP for
# three different reward sets and print each result.

# Seed NumPy's global RNG so that the randomly generated MDP (and therefore the
# printed results) is the same on every Restart & Run All.
# NOTE(review): assumes MDP.generate_random_mdp draws from np.random's global
# state -- confirm; if it uses its own generator this call is a harmless no-op.
np.random.seed(0)

# First argument of solver.solve(); presumably the discount factor -- TODO confirm.
GAMMA = 0.9

mdp = MDP.generate_random_mdp(3, 2)
# One action index per state; the policy has as many entries as the first
# argument passed to generate_random_mdp above.
policy = np.array([0, 1, 0], dtype=np.long)

# The three reward sets to compare, all sized by mdp.dim_state.
rewards = RewardSetCircle(mdp.dim_state, np.zeros(mdp.dim_state), radius=1, p=2)
rbox = RewardSetBox(mdp.dim_state, np.zeros(mdp.dim_state), np.ones(mdp.dim_state))
rfree = RewardSetRewardFree(mdp.dim_state)

solver = CharacteristicTimeSolver(mdp.dim_state, mdp.dim_action)

# The same solver instance is rebuilt for each reward set and then solved on
# the same MDP/policy pair; the order (circle, reward-free, box) matches the
# order of the printed results.
for reward_set in (rewards, rfree, rbox):
    solver.build_problem(reward_set)
    print(solver.solve(GAMMA, mdp, policy))
        


BoundResult(value=5.1823334656784485, w=array([[0.28083245, 0.06877288],
       [0.        , 0.25322662],
       [0.31439408, 0.08277397]]))
BoundResult(value=5.182333482488856, w=array([[0.28083245, 0.06877288],
       [0.        , 0.25322662],
       [0.31439408, 0.08277397]]))
BoundResult(value=5.182333480276443, w=array([[0.28083245, 0.06877288],
       [0.        , 0.25322662],
       [0.31439408, 0.08277397]]))


In [None]:
from multireward_ope.tabular.envs.riverswim import RiverSwim
import matplotlib.pyplot as plt


# Sweep over RiverSwim instances of increasing size and record the value
# returned by the characteristic-time solver for each of three reward sets,
# then plot the three curves against the size parameter.

# Single source of truth for the x-axis: used both by the loop and by every
# plot call, so the x and y sequences always have the same length.
state_counts = range(2, 10)

results_circle = []
results_rfree = []
results_box = []
np.random.seed(0)
for num_states in state_counts:
    mdp = RiverSwim(num_states)
    # Policy that selects action index 1 in every state.
    policy = np.ones(mdp.dim_state, dtype=np.long)
    rewards = RewardSetCircle(mdp.dim_state, np.zeros(mdp.dim_state), radius=1, p=2)
    rbox = RewardSetBox(mdp.dim_state, np.zeros(mdp.dim_state), np.ones(mdp.dim_state))
    rfree = RewardSetRewardFree(mdp.dim_state)
    solver = CharacteristicTimeSolver(mdp.dim_state, mdp.dim_action)

    # Rebuild and solve once per reward set (circle, reward-free, box), storing
    # only the scalar `.value` of each result in the matching list.
    for reward_set, results in (
        (rewards, results_circle),
        (rfree, results_rfree),
        (rbox, results_box),
    ):
        solver.build_problem(reward_set)
        results.append(solver.solve(0.9, mdp, policy).value)

# Raw strings keep the LaTeX backslashes literal without relying on Python's
# deprecated handling of unrecognised escape sequences.
fig, ax = plt.subplots()
ax.plot(state_counts, results_circle, label=r'$\|r\|_2 \leq 1, r\in [0,1]$')
ax.plot(state_counts, results_rfree, label='All rewards in [0,1] (closed-form)')
ax.plot(state_counts, results_box, label='All rewards in [0,1]')
ax.legend()
ax.set_xlabel('Num states')
ax.set_ylabel(r'Characteristic time $T^\star$')
plt.show()

[[1.07928891e-10 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]]
[[6.65545904e-10 9.99999999e-01]
 [0.00000000e+00 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]]
[[0.         1.        ]
 [0.18933516 0.81066484]
 [0.         1.        ]
 [0.         1.        ]]
[[0.         1.        ]
 [0.27796938 0.72203062]
 [0.11222818 0.88777182]
 [0.         1.        ]
 [0.         1.        ]]
[[0.         1.        ]
 [0.22417685 0.77582315]
 [0.27065485 0.72934515]
 [0.0705685  0.9294315 ]
 [0.0017292  0.9982708 ]
 [0.         1.        ]]
[[0.         1.        ]
 [0.21909058 0.78090942]
 [0.23245887 0.76754113]
 [0.14626051 0.85373949]
 [0.11233193 0.88766807]
 [0.0049633  0.9950367 ]
 [0.         1.        ]]
