diff --git a/obp/dataset/real.py b/obp/dataset/real.py index 7f081bb7..aa4eda17 100644 --- a/obp/dataset/real.py +++ b/obp/dataset/real.py @@ -155,7 +155,7 @@ def calc_on_policy_policy_value_estimate( Returns --------- on_policy_policy_value_estimate: float - Policy value of the behavior policy estimated by on-policy estimation, i.e., :math:`\\mathbb{E}_{\\mathcal{D}} [r_t]`. + Policy value of the behavior policy estimated by on-policy estimation, i.e., :math:`\\mathbb{E}_{\\mathcal{D}} [r_i]`. where :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. This parameter is used as a ground-truth policy value in the evaluation of OPE estimators. @@ -297,7 +297,7 @@ def sample_bootstrap_bandit_feedback( ----------- sample_size: int, default=None Number of data sampled by bootstrap. - When None is given, the original data size (n_rounds) is used as `sample_size`. + If None is given, the original data size (n_rounds) is used as `sample_size`. The value must be smaller than the original data size. test_size: float, default=0.3 diff --git a/obp/dataset/synthetic.py b/obp/dataset/synthetic.py index f2047092..9a0686d6 100644 --- a/obp/dataset/synthetic.py +++ b/obp/dataset/synthetic.py @@ -340,7 +340,7 @@ def calc_ground_truth_policy_value( This is often the expected_reward of the test set of logged bandit feedback data. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. Returns ---------- diff --git a/obp/dataset/synthetic_slate.py b/obp/dataset/synthetic_slate.py index 4f9a638b..b761492e 100644 --- a/obp/dataset/synthetic_slate.py +++ b/obp/dataset/synthetic_slate.py @@ -77,13 +77,13 @@ class SyntheticSlateBanditDataset(BaseBanditDataset): click_model: str, default=None Type of click model, which must be one of None, 'pbm', or 'cascade'. - When None is given, reward at each slot is sampled based on the original expected rewards. + If None is given, reward at each slot is sampled based on the original expected rewards. When 'pbm' is given, reward at each slot is sampled based on the position-based model. When 'cascade' is given, reward at each slot is sampled based on the cascade model. When using some click model, 'continuous' reward type is unavailable. eta: float, default=1.0 - A hyperparameter to define the click models. + Hyperparameter to define the click models. When click_model='pbm', then eta defines the examination probabilities of the position-based model. For example, when eta=0.5, then the examination probability at position `k` is :math:`\\theta (k) = (1/k)^{0.5}`. When click_model='cascade', then eta defines the position-dependent attractiveness parameters of the dependent click model @@ -420,7 +420,7 @@ def obtain_pscore_given_evaluation_policy_logit( clip_logit_value: Optional[float], default=None A float parameter to clip logit value (<= `700.`). - When None is given, we calculate softmax values without clipping to obtain `pscore_item_position`. + If None is given, we calculate softmax values without clipping to obtain `pscore_item_position`. When a float value is given, we clip logit values to calculate softmax values to obtain `pscore_item_position`. When n_actions and len_list are large, giving None to this parameter may lead to a large computational time. 
@@ -436,7 +436,7 @@ def obtain_pscore_given_evaluation_policy_logit( or evaluation_policy_logit_.shape[1] != self.n_unique_action ): raise ValueError( - "the shape of action and evaluation_policy_logit_ must be (n_rounds * len_list, )" + "the shape of `action` and `evaluation_policy_logit_` must be (n_rounds * len_list, ) " "and (n_rounds, n_unique_action) respectively" ) @@ -540,7 +540,7 @@ def sample_action_and_obtain_pscore( clip_logit_value: Optional[float], default=None A float parameter to clip logit value (<= `700.`). - When None is given, we calculate softmax values without clipping to obtain `pscore_item_position`. + If None is given, we calculate softmax values without clipping to obtain `pscore_item_position`. When a float value is given, we clip logit values to calculate softmax values to obtain `pscore_item_position`. When n_actions and len_list are large, giving None to this parameter may lead to a large computational time. @@ -550,17 +550,17 @@ def sample_action_and_obtain_pscore( Actions sampled by a behavior policy. Action list of slate `i` is stored in action[`i` * `len_list`: (`i + 1`) * `len_list`] - pscore_cascade: array-like, shape (n_rounds * len_list) - Joint action choice probabilities above the slot (:math:`k`) in each slate given context (:math:`x`). - i.e., :math:`\\pi_k: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A}^{k})`. + pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of behavior policy selecting actions :math:`a_{1:k}` (actions presented at positions (slots) `1` to `k`). + Each probability of behavior policy selecting action :math:`a_k` (the action presented at position (slot) `k`) is conditioned on the previously chosen actions (presented at positions `1` to `k-1`), + i.e., :math:`\\pi_b(a_i(k) | x_i, a_i(1), \\ldots, a_i(k-1))`. - pscore: array-like, shape (n_rounds * len_list) - Joint action choice probabilities of the slate given context (:math:`x`). - i.e., :math:`\\pi: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A}^{\\text{len_list}})`. + pscore: array-like, shape (n_rounds * len_list,) + Joint probabilities of behavior policy selecting a slate action, i.e., :math:`\\pi_b(a_i|x_i)`. + The value is shared across all positions within the same slate. - pscore_item_position: array-like, shape (n_rounds * len_list) - Marginal action choice probabilities of each slot given context (:math:`x`). - i.e., :math:`\\pi: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A})`. + pscore_item_position: array-like, shape (n_rounds * len_list,) + Marginal probabilities of behavior policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_b(a_i(k) | x_i)`. """ action = np.zeros(n_rounds * self.len_list, dtype=int) @@ -732,7 +732,7 @@ def obtain_batch_bandit_feedback( clip_logit_value: Optional[float], default=None A float parameter to clip logit value. - When None is given, we calculate softmax values without clipping to obtain `pscore_item_position`. + If None is given, we calculate softmax values without clipping to obtain `pscore_item_position`. When a float value is given, we clip logit values to calculate softmax values to obtain `pscore_item_position`. When n_actions and len_list are large, giving None to this parameter may lead to a large computational time. @@ -1026,7 +1026,7 @@ def generate_evaluation_policy_pscore( Type of evaluation policy, which must be one of 'optimal', 'anti-optimal', or 'random'.
When 'optimal' is given, we sort actions based on the base expected rewards (outputs of `base_reward_function`) and extract top-L actions (L=`len_list`) for each slate. When 'anti-optimal' is given, we sort actions based on the base expected rewards (outputs of `base_reward_function`) and extract bottom-L actions (L=`len_list`) for each slate. - We calculate the three variants of the propensity scores (pscore, pscore_item_position, and pscore_cascade) of the epsilon-greedy policy when either 'optimal' or 'anti-optimal' is given. + We calculate the three variants of the propensity scores (`pscore`, `pscore_item_position`, and `pscore_cascade`) of the epsilon-greedy policy when either 'optimal' or 'anti-optimal' is given. When 'random' is given, we calculate the three variants of the propensity scores of the uniform random policy. context: array-like, shape (n_rounds, dim_context) @@ -1043,17 +1043,17 @@ def generate_evaluation_policy_pscore( Returns ---------- - pscore: array-like, shape (n_unique_action * len_list) - Joint action choice probabilities of the slate given context (:math:`x`). - i.e., :math:`\\pi: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A}^{\\text{len_list}})`. + evaluation_policy_pscore: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + The value is shared across all positions within the same slate. - pscore_item_position: array-like, shape (n_unique_action * len_list) - Marginal action choice probabilities of each slot given context (:math:`x`). - i.e., :math:`\\pi: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A})`. + evaluation_policy_pscore_item_position: array-like, shape (n_rounds * len_list,) + Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_i(k) | x_i)`. - pscore_cascade: array-like, shape (n_unique_action * len_list) - Joint action choice probabilities above the slot (:math:`k`) in each slate given context (:math:`x`). - i.e., :math:`\\pi_k: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A}^{k})`. + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting actions :math:`a_{1:k}` (actions presented at positions (slots) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (the action presented at position (slot) `k`) is conditioned on the previously chosen actions (presented at positions `1` to `k-1`), + i.e., :math:`\\pi_e(a_i(k) | x_i, a_i(1), \\ldots, a_i(k-1))`. """ check_array(array=context, name="context", expected_dim=2) @@ -1137,6 +1137,70 @@ def generate_evaluation_policy_pscore( ) return pscore, pscore_item_position, pscore_cascade + def calc_evaluation_policy_action_dist( + self, + action: np.ndarray, + evaluation_policy_logit_: np.ndarray, + ): + """Calculate action distribution at each slot from a given evaluation policy logit. + + Parameters + ---------- + action: array-like, shape (n_rounds * len_list, ) + Action chosen by behavior policy. + + evaluation_policy_logit_: array-like, shape (n_rounds, n_unique_action) + Logit values of evaluation policy given context (:math:`x`), i.e., :math:`f: \\mathcal{X} \\rightarrow \\mathbb{R}^{\\mathcal{A}}`. + + Returns + ---------- + evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action, ) + Plackett-Luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices).
+ , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + + """ + check_array(action, name="action", expected_dim=1) + check_array( + evaluation_policy_logit_, name="evaluation_policy_logit_", expected_dim=2 + ) + if evaluation_policy_logit_.shape[1] != self.n_unique_action: + raise ValueError( + "Expected `evaluation_policy_logit_.shape[1] == n_unique_action`, but found it False" + ) + if len(action) != evaluation_policy_logit_.shape[0] * self.len_list: + raise ValueError( + "Expected `len(action) == evaluation_policy_logit_.shape[0] * len_list`, but found it False" + ) + n_rounds = evaluation_policy_logit_.shape[0] + + # (n_rounds * len_list, ) -> (n_rounds, len_list) + action = action.reshape((n_rounds, self.len_list)) + # (n_rounds, n_unique_action) -> (n_rounds, len_list, n_unique_action) + evaluation_policy_logit_ = np.array( + [ + [evaluation_policy_logit_[i] for _ in range(self.len_list)] + for i in range(n_rounds) + ] + ) + # calculate action probabilities for all the counterfactual actions at the position + # (n_rounds, len_list, n_unique_action) + evaluation_policy_action_dist = [] + for i in range(n_rounds): + if not self.is_factorizable: + for position_ in range(self.len_list - 1): + action_ = action[i][position_] + # mask action choice probability of the previously chosen action + # to avoid overflow in softmax function, set -1e4 instead of -np.inf + # (make action choice probability 0 for the previously chosen action by softmax) + evaluation_policy_logit_[i, position_ + 1 :, action_] = -1e4 + # (len_list, n_unique_action) + evaluation_policy_action_dist.append(softmax(evaluation_policy_logit_[i])) + # (n_rounds, len_list, n_unique_action) -> (n_rounds * len_list * n_unique_action, ) + evaluation_policy_action_dist = np.array( + evaluation_policy_action_dist + ).flatten() + return evaluation_policy_action_dist + def _calc_epsilon_greedy_pscore( self, epsilon: float, @@ -1175,17 +1239,17 @@ def _calc_epsilon_greedy_pscore( Returns ---------- - pscore: array-like, shape (n_unique_action * len_list) - Joint action choice probabilities of the slate given context (:math:`x`). - i.e., :math:`\\pi: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A}^{\\text{len_list}})`. + evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate. - pscore_item_position: array-like, shape (n_unique_action * len_list) - Marginal action choice probabilities of each slot given context (:math:`x`). - i.e., :math:`\\pi: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A})`. + evaluation_policy_pscore_item_position: array-like, shape (<= n_rounds * len_list,) + Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. - pscore_cascade: array-like, shape (n_unique_action * len_list) - Joint action choice probabilities above the slot (:math:`k`) in each slate given context (:math:`x`). - i.e., :math:`\\pi_k: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A}^{k})`. + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). 
+ Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. """ check_array(array=action_2d, name="action_2d", expected_dim=2) diff --git a/obp/ope/__init__.py b/obp/ope/__init__.py index 289b18f2..e3a09049 100644 --- a/obp/ope/__init__.py +++ b/obp/ope/__init__.py @@ -1,3 +1,6 @@ +from obp.ope.classification_model import ImportanceWeightEstimator +from obp.ope.classification_model import PropensityScoreEstimator +from obp.ope.estimators import BalancedInverseProbabilityWeighting from obp.ope.estimators import BaseOffPolicyEstimator from obp.ope.estimators import DirectMethod from obp.ope.estimators import DoublyRobust @@ -6,8 +9,9 @@ from obp.ope.estimators import ReplayMethod from obp.ope.estimators import SelfNormalizedDoublyRobust from obp.ope.estimators import SelfNormalizedInverseProbabilityWeighting +from obp.ope.estimators import SubGaussianDoublyRobust +from obp.ope.estimators import SubGaussianInverseProbabilityWeighting from obp.ope.estimators import SwitchDoublyRobust -from obp.ope.estimators import BalancedInverseProbabilityWeighting from obp.ope.estimators_continuous import ( KernelizedSelfNormalizedInverseProbabilityWeighting, ) @@ -21,19 +25,21 @@ from obp.ope.estimators_slate import SelfNormalizedSlateIndependentIPS from obp.ope.estimators_slate import SelfNormalizedSlateRewardInteractionIPS from obp.ope.estimators_slate import SelfNormalizedSlateStandardIPS +from obp.ope.estimators_slate import SlateCascadeDoublyRobust from obp.ope.estimators_slate import SlateIndependentIPS from obp.ope.estimators_slate import SlateRewardInteractionIPS from obp.ope.estimators_slate import SlateStandardIPS from obp.ope.estimators_tuning import DoublyRobustTuning from obp.ope.estimators_tuning import DoublyRobustWithShrinkageTuning from obp.ope.estimators_tuning import InverseProbabilityWeightingTuning +from obp.ope.estimators_tuning import SubGaussianDoublyRobustTuning +from obp.ope.estimators_tuning import SubGaussianInverseProbabilityWeightingTuning from obp.ope.estimators_tuning import SwitchDoublyRobustTuning from obp.ope.meta import OffPolicyEvaluation from obp.ope.meta_continuous import ContinuousOffPolicyEvaluation from obp.ope.meta_slate import SlateOffPolicyEvaluation from obp.ope.regression_model import RegressionModel -from obp.ope.classification_model import ImportanceWeightEstimator -from obp.ope.classification_model import PropensityScoreEstimator +from obp.ope.regression_model_slate import SlateRegressionModel __all__ = [ @@ -46,17 +52,23 @@ "SelfNormalizedDoublyRobust", "SwitchDoublyRobust", "DoublyRobustWithShrinkage", + "SubGaussianInverseProbabilityWeighting", + "SubGaussianDoublyRobust", "InverseProbabilityWeightingTuning", "DoublyRobustTuning", "SwitchDoublyRobustTuning", "DoublyRobustWithShrinkageTuning", + "SubGaussianInverseProbabilityWeightingTuning", + "SubGaussianDoublyRobustTuning", "OffPolicyEvaluation", "SlateOffPolicyEvaluation", "ContinuousOffPolicyEvaluation", "RegressionModel", + "SlateRegressionModel", "SlateStandardIPS", "SlateIndependentIPS", "SlateRewardInteractionIPS", + "SlateCascadeDoublyRobust", "SelfNormalizedSlateRewardInteractionIPS", "SelfNormalizedSlateIndependentIPS", "SelfNormalizedSlateStandardIPS", @@ -82,6 +94,8 @@ "DoublyRobustWithShrinkage", "SwitchDoublyRobust", "SelfNormalizedDoublyRobust", + "SubGaussianInverseProbabilityWeighting", + 
"SubGaussianDoublyRobust", "BalancedInverseProbabilityWeighting", ] @@ -92,3 +106,9 @@ "SwitchDoublyRobustTuning", "DoublyRobustWithShrinkageTuning", ] + + +__all_estimators_tuning_sg__ = [ + "SubGaussianInverseProbabilityWeightingTuning", + "SubGaussianDoublyRobustTuning", +] diff --git a/obp/ope/classification_model.py b/obp/ope/classification_model.py index b6bd397e..44e8d7d5 100644 --- a/obp/ope/classification_model.py +++ b/obp/ope/classification_model.py @@ -8,12 +8,14 @@ import numpy as np from sklearn.base import BaseEstimator from sklearn.base import clone +from sklearn.calibration import CalibratedClassifierCV from sklearn.model_selection import KFold from sklearn.utils import check_random_state from sklearn.utils import check_scalar -from sklearn.calibration import CalibratedClassifierCV -from ..utils import check_array, sample_action_fast, check_bandit_feedback_inputs +from ..utils import check_array +from ..utils import check_bandit_feedback_inputs +from ..utils import sample_action_fast @dataclass @@ -29,12 +31,12 @@ class ImportanceWeightEstimator(BaseEstimator): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender inferface, slate size. When Open Bandit Dataset is used, 3 should be set. action_context: array-like, shape (n_actions, dim_action_context), default=None Context vector characterizing action (i.e., vector representation of each action). - If not given, one-hot encoding of the action variable is used as default. + If None, one-hot encoding of the action variable is used as default. If fitting_method is 'raw', one-hot encoding will be used as action_context. fitting_method: str, default='sample' @@ -76,7 +78,7 @@ def __post_init__(self) -> None: ) if not isinstance(self.base_model, BaseEstimator): raise ValueError( - "base_model must be BaseEstimator or a child class of BaseEstimator" + "`base_model` must be BaseEstimator or a child class of BaseEstimator" ) if self.calibration_cv > 1: @@ -103,7 +105,7 @@ def fit( position: Optional[np.ndarray] = None, random_state: Optional[int] = None, ) -> None: - """Fit the classification model on given logged bandit feedback data. + """Fit the classification model on given logged bandit data. Parameters ---------- @@ -111,13 +113,13 @@ def fit( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a classification model assumes that there is only a single position in a recommendation interface. When `len_list` > 1, this position argument has to be set. 
@@ -141,14 +143,14 @@ def fit( check_array(array=position, name="position", expected_dim=1) if position.max() >= self.len_list: raise ValueError( - f"position elements must be smaller than len_list, but the maximum value is {position.max()} (>= {self.len_list})" + f"`position` elements must be smaller than `len_list`, but the maximum value is {position.max()} (>= {self.len_list})" ) if action_dist.shape != (n_rounds, self.n_actions, self.len_list): raise ValueError( - f"shape of action_dist must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}" + f"shape of `action_dist` must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}" ) if not np.allclose(action_dist.sum(axis=1), 1): - raise ValueError("action_dist must be a probability distribution") + raise ValueError("`action_dist` must be a probability distribution") # If self.fitting_method != "sample", `sampled_action` has no information sampled_action = np.zeros(n_rounds, dtype=int) @@ -188,10 +190,10 @@ def predict( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds_of_new_data,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. position: array-like, shape (n_rounds_of_new_data,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a classification model assumes that there is only a single position in a recommendation interface. When `len_list` > 1, this position argument has to be set. @@ -224,7 +226,7 @@ def fit_predict( random_state: Optional[int] = None, evaluate_model_performance: bool = False, ) -> np.ndarray: - """Fit the classification model on given logged bandit feedback data and predict the importance weights on the same data, possibly using cross-fitting to avoid over-fitting. + """Fit the classification model on given logged bandit data and predict the importance weights on the same data, possibly using cross-fitting to avoid over-fitting. Note ------ @@ -236,13 +238,13 @@ def fit_predict( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a classification model assumes that there is only a single position in a recommendation interface. When `len_list` > 1, this position argument has to be set. 
@@ -281,7 +283,7 @@ def fit_predict( else: if position.max() >= self.len_list: raise ValueError( - f"position elements must be smaller than len_list, but the maximum value is {position.max()} (>= {self.len_list})" + f"`position` elements must be smaller than `len_list`, but the maximum value is {position.max()} (>= {self.len_list})" ) check_scalar(n_folds, "n_folds", int, min_val=1) @@ -289,10 +291,10 @@ if action_dist.shape != (n_rounds, self.n_actions, self.len_list): raise ValueError( - f"shape of action_dist must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}" + f"shape of `action_dist` must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}" ) if not np.allclose(action_dist.sum(axis=1), 1): - raise ValueError("action_dist must be a probability distribution") + raise ValueError("`action_dist` must be a probability distribution") if n_folds == 1: self.fit( @@ -371,11 +373,11 @@ def _pre_process_for_clf_model( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist_at_position: array-like, shape (n_rounds, n_actions,) - Action choice probabilities of evaluation policy of each position (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy of each position (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. sampled_action_at_position: array-like, shape (n_rounds, n_actions,) Actions sampled by evaluation policy for each data at each position. @@ -404,13 +406,13 @@ class PropensityScoreEstimator(BaseEstimator): Parameters ------------ base_model: BaseEstimator - A machine learning model used to estimate the mean reward function. + A machine learning model used to estimate the behavior policy (propensity scores). n_actions: int Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommendation interface, i.e., slate size. When Open Bandit Dataset is used, 3 should be set. calibration_cv: int, default=2 @@ -436,7 +438,7 @@ def __post_init__(self) -> None: check_scalar(self.calibration_cv, "calibration_cv", int) if not isinstance(self.base_model, BaseEstimator): raise ValueError( - "base_model must be BaseEstimator or a child class of BaseEstimator" + "`base_model` must be BaseEstimator or a child class of BaseEstimator" ) if self.calibration_cv > 1: @@ -459,7 +461,7 @@ def fit( action: np.ndarray, position: Optional[np.ndarray] = None, ) -> None: - """Fit the classification model on given logged bandit feedback data. + """Fit the classification model on given logged bandit data. Parameters ---------- @@ -467,10 +469,10 @@ def fit( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented.
If None is given, a classification model assumes that there is only a single position in a recommendation interface. When `len_list` > 1, this position argument has to be set. @@ -488,7 +490,7 @@ def fit( else: if position.max() >= self.len_list: raise ValueError( - f"position elements must be smaller than len_list, but the maximum value is {position.max()} (>= {self.len_list})" + f"`position` elements must be smaller than `len_list`, but the maximum value is {position.max()} (>= {self.len_list})" ) for position_ in np.arange(self.len_list): @@ -511,10 +513,10 @@ def predict( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds_of_new_data,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. position: array-like, shape (n_rounds_of_new_data,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a classification model assumes that there is only a single position in a recommendation interface. When `len_list` > 1, this position argument has to be set. @@ -543,7 +545,7 @@ def fit_predict( random_state: Optional[int] = None, evaluate_model_performance: bool = False, ) -> np.ndarray: - """Fit the classification model on given logged bandit feedback data and predict the propensity score on the same data, possibly using the cross-fitting procedure to avoid over-fitting. + """Fit the classification model on given logged bandit data and predict the propensity score on the same data, possibly using the cross-fitting procedure to avoid over-fitting. Note ------ @@ -555,10 +557,10 @@ def fit_predict( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a classification model assumes that there is only a single position. When `len_list` > 1, this position argument has to be set. @@ -596,7 +598,7 @@ def fit_predict( else: if position.max() >= self.len_list: raise ValueError( - f"position elements must be smaller than len_list, but the maximum value is {position.max()} (>= {self.len_list})" + f"`position` elements must be smaller than `len_list`, but the maximum value is {position.max()} (>= {self.len_list})" ) check_scalar(n_folds, "n_folds", int, min_val=1) diff --git a/obp/ope/estimators.py b/obp/ope/estimators.py index 559ea30c..239a244d 100644 --- a/obp/ope/estimators.py +++ b/obp/ope/estimators.py @@ -44,17 +44,17 @@ class ReplayMethod(BaseOffPolicyEstimator): Note ------- - Replay Method (RM) estimates the policy value of evaluation policy :math:`\\pi_e` by + RM estimates the policy value of evaluation policy :math:`\\pi_e` as .. 
math:: \\hat{V}_{\\mathrm{RM}} (\\pi_e; \\mathcal{D}) := - \\frac{\\mathbb{E}_{\\mathcal{D}}[\\mathbb{I} \\{ \\pi_e (x_t) = a_t \\} r_t ]}{\\mathbb{E}_{\\mathcal{D}}[\\mathbb{I} \\{ \\pi_e (x_t) = a_t \\}]}, + \\frac{\\mathbb{E}_{n}[\\mathbb{I} \\{ \\pi_e(x_t)=a_t \\} r_t]}{\\mathbb{E}_{n}[\\mathbb{I} \\{ \\pi_e(x_t)=a_t \\}]}, - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by - a behavior policy :math:`\\pi_b`. :math:`\\pi_e: \\mathcal{X} \\rightarrow \\mathcal{A}` is the function + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by + behavior policy :math:`\\pi_b`. :math:`\\pi_e: \\mathcal{X} \\rightarrow \\mathcal{A}` is the function representing action choices by the evaluation policy realized during offline bandit simulation. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. Parameters ---------- @@ -83,23 +83,23 @@ def _estimate_round_rewards( Parameters ------------ reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by the Replay Method. + Estimated rewards for each observation. """ if position is None: @@ -125,23 +125,23 @@ def estimate_policy_value( Parameters ------------ reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. 
position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_array(array=reward, name="reward", expected_dim=1) @@ -175,18 +175,18 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) alpha: float, default=0.05 Significance level. @@ -231,17 +231,17 @@ class InverseProbabilityWeighting(BaseOffPolicyEstimator): Note ------- - Inverse Probability Weighting (IPW) estimates the policy value of evaluation policy :math:`\\pi_e` by + IPW estimates the policy value of evaluation policy :math:`\\pi_e` as .. math:: - \\hat{V}_{\\mathrm{IPW}} (\\pi_e; \\mathcal{D}) := \\mathbb{E}_{\\mathcal{D}} [ w(x_t,a_t) r_t], + \\hat{V}_{\\mathrm{IPW}} (\\pi_e; \\mathcal{D}) := \\mathbb{E}_{n} [ w(x_i,a_i) r_i], - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by - a behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. - When the weight-clipping is applied, a large importance weight is clipped as :math:`\\hat{w}(x,a) := \\min \\{ \\lambda, w(x,a) \\}` - where :math:`\\lambda (>0)` is a hyperparameter that decides a maximum allowed importance weight. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by + behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. 
+ :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. + When the clipping is applied, a large importance weight is clipped as :math:`\\hat{w}(x,a) := \\min \\{ \\lambda, w(x,a) \\}` + where :math:`\\lambda (>0)` is a hyperparameter to specify a maximum allowed importance weight. IPW re-weights the rewards by the ratio of the evaluation policy and behavior policy (importance weight). When the behavior policy is known, IPW is unbiased and consistent for the true policy value. @@ -253,12 +253,12 @@ class InverseProbabilityWeighting(BaseOffPolicyEstimator): A maximum possible value of the importance weight. When a positive finite value is given, importance weights larger than `lambda_` will be clipped. + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. + estimator_name: str, default='ipw'. Name of the estimator. - use_estimated_pscore: bool, default=False. - If True, estimated_pscore is used to estimate the policy value, otherwise, pscore (the true propensity scores) is used. - References ------------ Alex Strehl, John Langford, Lihong Li, and Sham M Kakade. @@ -273,8 +273,8 @@ class InverseProbabilityWeighting(BaseOffPolicyEstimator): """ lambda_: float = np.inf - estimator_name: str = "ipw" use_estimated_pscore: bool = False + estimator_name: str = "ipw" def __post_init__(self) -> None: """Initialize Class.""" @@ -285,7 +285,7 @@ def __post_init__(self) -> None: min_val=0.0, ) if self.lambda_ != self.lambda_: - raise ValueError("lambda_ must not be nan") + raise ValueError("`lambda_` must not be nan") if not isinstance(self.use_estimated_pscore, bool): raise TypeError( f"`use_estimated_pscore` must be a bool, but {type(self.use_estimated_pscore)} is given" @@ -305,26 +305,26 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by IPW. 
+ Estimated rewards for each observation. """ if position is None: @@ -350,31 +350,31 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_array(array=reward, name="reward", expected_dim=1) @@ -422,26 +422,26 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. 
position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. alpha: float, default=0.05 Significance level. @@ -507,28 +507,28 @@ def _estimate_mse_score( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) use_bias_upper_bound: bool, default=True Whether to use bias upper bound in hyperparameter tuning. - If False, direct bias estimator is used to estimate the MSE. + If False, the direct bias estimator is used to estimate the MSE. See Su et al.(2020) for details. delta: float, default=0.05 - A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality. + A confidence delta to construct a high probability upper bound based on Bernstein inequality. Returns ---------- @@ -575,16 +575,16 @@ class SelfNormalizedInverseProbabilityWeighting(InverseProbabilityWeighting): Note ------- - Self-Normalized Inverse Probability Weighting (SNIPW) estimates the policy value of evaluation policy :math:`\\pi_e` by + SNIPW estimates the policy value of evaluation policy :math:`\\pi_e` as .. 
math:: \\hat{V}_{\\mathrm{SNIPW}} (\\pi_e; \\mathcal{D}) := - \\frac{\\mathbb{E}_{\\mathcal{D}} [w(x_t,a_t) r_t]}{ \\mathbb{E}_{\\mathcal{D}} [w(x_t,a_t)]}, + \\frac{\\mathbb{E}_{n} [w(x_i,a_i) r_i]}{ \\mathbb{E}_{n} [w(x_i,a_i)]}, - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by - a behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by + behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. SNIPW re-weights the observed rewards by the self-normalized importance weihgt. This estimator is not unbiased even when the behavior policy is known. @@ -593,6 +593,9 @@ class SelfNormalizedInverseProbabilityWeighting(InverseProbabilityWeighting): Parameters ---------- + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. + estimator_name: str, default='snipw'. Name of the estimator. @@ -622,24 +625,26 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by the SNIPW estimator. + Estimated rewards for each observation. """ if position is None: @@ -654,25 +659,25 @@ class DirectMethod(BaseOffPolicyEstimator): Note ------- - DM first learns a supervised machine learning model, such as ridge regression and gradient boosting, - to estimate the mean reward function (:math:`q(x,a) = \\mathbb{E}[r|x,a]`). - It then uses it to estimate the policy value as follows. + DM first trains a supervised ML model, such as ridge regression and gradient boosting, + to estimate the reward function (:math:`q(x,a) = \\mathbb{E}[r|x,a]`). 
+ It then uses the estimated rewards to estimate the policy value as follows. .. math:: \\hat{V}_{\\mathrm{DM}} (\\pi_e; \\mathcal{D}, \\hat{q}) - &:= \\mathbb{E}_{\\mathcal{D}} \\left[ \\sum_{a \\in \\mathcal{A}} \\hat{q} (x_t,a) \\pi_e(a|x_t) \\right], \\\\ - & = \\mathbb{E}_{\\mathcal{D}}[\\hat{q} (x_t,\\pi_e)], + &:= \\mathbb{E}_{n} \\left[ \\sum_{a \\in \\mathcal{A}} \\hat{q} (x_i,a) \\pi_e(a|x_i) \\right], \\\\ + & = \\mathbb{E}_{n}[\\hat{q} (x_i,\\pi_e)], - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by - a behavior policy :math:`\\pi_b`. :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. - :math:`\\hat{q} (x,a)` is an estimated expected reward given :math:`x` and :math:`a`. - :math:`\\hat{q} (x_t,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. - To estimate the mean reward function, please use `obp.ope.regression_model.RegressionModel`, which supports several fitting methods specific to OPE. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by + behavior policy :math:`\\pi_b`. :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. + :math:`\\hat{q} (x,a)` is the estimated expected reward given :math:`x` and :math:`a`. + :math:`\\hat{q} (x_i,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. + To estimate the reward function, please use `obp.ope.regression_model.RegressionModel`, which supports several fitting methods specific to OPE (such as cross-fitting). If the regression model (:math:`\\hat{q}`) is a good approximation to the true mean reward function, this estimator accurately estimates the policy value of the evaluation policy. - If the regression function fails to approximate the mean reward function well, + If the regression function fails to approximate the reward function well, however, the final estimator is no longer consistent. Parameters @@ -704,20 +709,20 @@ def _estimate_round_rewards( Parameters ---------- action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) 
Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by the DM estimator. + Estimated rewards for each observation. """ if position is None: @@ -735,7 +740,7 @@ def _estimate_round_rewards( axis=1, ) else: - raise ValueError("action must be 1D array") + raise ValueError("`action` must be 1D array") def estimate_policy_value( self, @@ -749,20 +754,20 @@ def estimate_policy_value( Parameters ---------- action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_array( @@ -799,15 +804,15 @@ def estimate_interval( Parameters ---------- action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) alpha: float, default=0.05 Significance level. @@ -856,25 +861,24 @@ class DoublyRobust(BaseOffPolicyEstimator): Note ------- - Similar to DM, DR first learns a supervised machine learning model, such as ridge regression and gradient boosting, - to estimate the mean reward function (:math:`q(x,a) = \\mathbb{E}[r|x,a]`). - It then uses it to estimate the policy value as follows. + Similar to DM, DR estimates the reward function (:math:`q(x,a) = \\mathbb{E}[r|x,a]`). 
+ It then uses the estimated rewards to estimate the policy value as follows. .. math:: \\hat{V}_{\\mathrm{DR}} (\\pi_e; \\mathcal{D}, \\hat{q}) - := \\mathbb{E}_{\\mathcal{D}}[\\hat{q}(x_t,\\pi_e) + w(x_t,a_t) (r_t - \\hat{q}(x_t,a_t))], + := \\mathbb{E}_{n}[\\hat{q}(x_i,\\pi_e) + w(x_i,a_i) (r_i - \\hat{q}(x_i,a_i))], - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by - a behavior policy :math:`\\pi_b`. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by + behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. - :math:`\\hat{q} (x,a)` is an estimated expected reward given :math:`x` and :math:`a`. - :math:`\\hat{q} (x_t,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. - When the weight-clipping is applied, a large importance weight is clipped as :math:`\\hat{w}(x,a) := \\min \\{ \\lambda, w(x,a) \\}` - where :math:`\\lambda (>0)` is a hyperparameter that decides a maximum allowed importance weight. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. + :math:`\\hat{q} (x,a)` is the estimated expected reward given :math:`x` and :math:`a`. + :math:`\\hat{q} (x_i,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. + When the clipping is applied, a large importance weight is clipped as :math:`\\hat{w}(x,a) := \\min \\{ \\lambda, w(x,a) \\}` + where :math:`\\lambda (>0)` is a hyperparameter to specify a maximum allowed importance weight. - To estimate the mean reward function, please use `obp.ope.regression_model.RegressionModel`, + To estimate the reward function, please use `obp.ope.regression_model.RegressionModel`, which supports several fitting methods specific to OPE such as *more robust doubly robust*. DR mimics IPW to use a weighted version of rewards, but DR also uses the estimated mean reward @@ -888,14 +892,15 @@ class DoublyRobust(BaseOffPolicyEstimator): lambda_: float, default=np.inf A maximum possible value of the importance weight. When a positive finite value is given, importance weights larger than `lambda_` will be clipped. - DoublyRobust with a finite positive `lambda_` corresponds to Doubly Robust with Pessimistic Shrinkage of Su et al.(2020) or CAB-DR of Su et al.(2019). + DoublyRobust with a finite positive `lambda_` corresponds to DR with Pessimistic Shrinkage of Su et al.(2020) + or CAB-DR of Su et al.(2019). + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. estimator_name: str, default='dr'. Name of the estimator. - use_estimated_pscore: bool, default=False. - If True, estimated_pscore is used to estimate the policy value, otherwise, pscore (the true propensity scores) is used. - References ---------- Miroslav Dudík, Dumitru Erhan, John Langford, and Lihong Li. 
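For reference, a minimal NumPy sketch of the DR formula above on synthetic arrays (single slot); all names are illustrative placeholders and the clipping threshold `lambda_` is optional:

import numpy as np

rng = np.random.default_rng(0)
n_rounds, n_actions = 1000, 5
pi_e = rng.dirichlet(np.ones(n_actions), size=n_rounds)     # \pi_e(a|x_i)
pi_b = rng.dirichlet(np.ones(n_actions), size=n_rounds)     # \pi_b(a|x_i)
action = np.array([rng.choice(n_actions, p=p) for p in pi_b])
reward = rng.binomial(1, 0.5, size=n_rounds).astype(float)  # r_i
q_hat = rng.uniform(size=(n_rounds, n_actions))             # \hat{q}(x_i, a)

idx = np.arange(n_rounds)
iw = pi_e[idx, action] / pi_b[idx, action]                  # w(x_i, a_i) = pi_e / pi_b
iw = np.minimum(iw, np.inf)                                 # optional clipping: min{lambda_, w}
baseline = np.average(q_hat, weights=pi_e, axis=1)          # \hat{q}(x_i, \pi_e)
V_dr = (baseline + iw * (reward - q_hat[idx, action])).mean()
print(V_dr)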
@@ -913,8 +918,8 @@ class DoublyRobust(BaseOffPolicyEstimator): """ lambda_: float = np.inf - estimator_name: str = "dr" use_estimated_pscore: bool = False + estimator_name: str = "dr" def __post_init__(self) -> None: """Initialize Class.""" @@ -925,7 +930,7 @@ def __post_init__(self) -> None: min_val=0.0, ) if self.lambda_ != self.lambda_: - raise ValueError("lambda_ must not be nan") + raise ValueError("`lambda_` must not be nan") if not isinstance(self.use_estimated_pscore, bool): raise TypeError( f"`use_estimated_pscore` must be a bool, but {type(self.use_estimated_pscore)} is given" @@ -946,29 +951,29 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by the DR estimator. + Estimated rewards for each observation. """ if position is None: @@ -993,7 +998,7 @@ def _estimate_round_rewards( axis=1, ) else: - raise ValueError("reward must be 1D array") + raise ValueError("`reward` must be 1D array") estimated_rewards += iw * (reward - q_hat_factual) return estimated_rewards @@ -1014,34 +1019,34 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. 
action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. Returns ---------- V_hat: float - Policy value estimated by the DR estimator. + Estimated policy value of evaluation policy. """ check_array( @@ -1096,29 +1101,29 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. 
position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. alpha: float, default=0.05 Significance level. @@ -1190,31 +1195,31 @@ def _estimate_mse_score( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. use_bias_upper_bound: bool, default=True Whether to use bias upper bound in hyperparameter tuning. - If False, direct bias estimator is used to estimate the MSE. + If False, the direct bias estimator is used to estimate the MSE. See Su et al.(2020) for details. delta: float, default=0.05 - A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality. + A confidence delta to construct a high probability upper bound based on Bernstein inequality. 
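For reference, a rough sketch of the tuning objective governed by `use_bias_upper_bound` and `delta`: the MSE of a candidate hyperparameter is approximated by the estimator's sample variance plus a squared bias estimate. This assumes the direct bias estimator of Su et al.(2020); the helpers called in the patch (`estimate_bias_in_ope`, `estimate_high_probability_upper_bound_bias`) may differ in detail:

import numpy as np

def mse_score_sketch(reward, iw, iw_hat, q_hat_factual, per_round_estimates):
    """Sum of the estimator's sample variance and a squared (direct) bias estimate.

    per_round_estimates are the round-wise estimated rewards of the candidate estimator;
    the bias is approximated by |E_n[(w - w_hat)(r - q_hat)]|.
    """
    n = reward.shape[0]
    sample_variance = np.var(per_round_estimates) / n
    bias = np.abs(np.mean((iw - iw_hat) * (reward - q_hat_factual)))
    return sample_variance + bias ** 2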
Returns ---------- @@ -1271,25 +1276,27 @@ class SelfNormalizedDoublyRobust(DoublyRobust): Note ------- - Self-Normalized Doubly Robust estimates the policy value of evaluation policy :math:`\\pi_e` by + SNDR estimates the policy value of evaluation policy :math:`\\pi_e` as .. math:: \\hat{V}_{\\mathrm{SNDR}} (\\pi_e; \\mathcal{D}, \\hat{q}) := - \\mathbb{E}_{\\mathcal{D}} \\left[\\hat{q}(x_t,\\pi_e) + \\frac{w(x_t,a_t) (r_t - \\hat{q}(x_t,a_t))}{\\mathbb{E}_{\\mathcal{D}}[ w(x_t,a_t) ]} \\right], + \\mathbb{E}_{n} \\left[\\hat{q}(x_i,\\pi_e) + \\frac{w(x_i,a_i) (r_i - \\hat{q}(x_i,a_i))}{\\mathbb{E}_{n}[ w(x_i,a_i) ]} \\right], - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by - a behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. - :math:`\\hat{q} (x,a)` is an estimated expected reward given :math:`x` and :math:`a`. - :math:`\\hat{q} (x_t,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. - To estimate the mean reward function, please use `obp.ope.regression_model.RegressionModel`. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by + behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. + :math:`\\hat{q} (x,a)` is the estimated expected reward given :math:`x` and :math:`a`. + :math:`\\hat{q} (x_i,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. + To estimate the reward function, please use `obp.ope.regression_model.RegressionModel`. - Similar to Self-Normalized Inverse Probability Weighting, SNDR estimator applies the self-normalized importance weighting technique to - increase the stability of the original Doubly Robust estimator. + Similar to SNIPW, SNDR estimator applies the self-normalized importance weighting technique to gain some stability. Parameters ---------- + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. + estimator_name: str, default='sndr'. Name of the estimator. @@ -1320,29 +1327,29 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. 
+ Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by the SNDR estimator. + Estimated rewards for each observation. """ n_rounds = action.shape[0] @@ -1359,7 +1366,7 @@ def _estimate_round_rewards( axis=1, ) else: - raise ValueError("reward must be 1D array") + raise ValueError("`reward` must be 1D array") q_hat_factual = estimated_rewards_by_reg_model[ np.arange(n_rounds), action, position @@ -1375,26 +1382,29 @@ class SwitchDoublyRobust(DoublyRobust): Note ------- Switch-DR aims to reduce the variance of the DR estimator by using direct method when the importance weight is large. - This estimator estimates the policy value of evaluation policy :math:`\\pi_e` by + This estimator estimates the policy value of evaluation policy :math:`\\pi_e` as .. math:: \\hat{V}_{\\mathrm{SwitchDR}} (\\pi_e; \\mathcal{D}, \\hat{q}, \\lambda) - := \\mathbb{E}_{\\mathcal{D}} [\\hat{q}(x_t,\\pi_e) + w(x_t,a_t) (r_t - \\hat{q}(x_t,a_t)) \\mathbb{I} \\{ w(x_t,a_t) \\le \\lambda \\}], + := \\mathbb{E}_{n} [\\hat{q}(x_i,\\pi_e) + w(x_i,a_i) (r_i - \\hat{q}(x_i,a_i)) \\mathbb{I} \\{ w(x_i,a_i) \\le \\lambda \\}], - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by - a behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by + behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. :math:`\\lambda (\\ge 0)` is a switching hyperparameter, which decides the threshold for the importance weight. - :math:`\\hat{q} (x,a)` is an estimated expected reward given :math:`x` and :math:`a`. - :math:`\\hat{q} (x_t,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. - To estimate the mean reward function, please use `obp.ope.regression_model.RegressionModel`. + :math:`\\hat{q} (x,a)` is the estimated expected reward given :math:`x` and :math:`a`. 
+ :math:`\\hat{q} (x_i,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. + To estimate the reward function, please use `obp.ope.regression_model.RegressionModel`. Parameters ---------- lambda_: float, default=np.inf Switching hyperparameter. When importance weight is larger than this parameter, DM is applied, otherwise DR is used. - This hyperparameter should be larger than or equal to 0., otherwise it is meaningless. + Should be larger than or equal to 0., otherwise it is meaningless. + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. estimator_name: str, default='switch-dr'. Name of the estimator. @@ -1424,7 +1434,7 @@ def __post_init__(self) -> None: min_val=0.0, ) if self.lambda_ != self.lambda_: - raise ValueError("lambda_ must not be nan") + raise ValueError("`lambda_` must not be nan") if not isinstance(self.use_estimated_pscore, bool): raise TypeError( f"`use_estimated_pscore` must be a bool, but {type(self.use_estimated_pscore)} is given" @@ -1445,29 +1455,29 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by the Switch-DR estimator. + Estimated rewards for each observation. """ n_rounds = action.shape[0] @@ -1504,31 +1514,31 @@ def _estimate_mse_score( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. 
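A minimal sketch of the Switch-DR formula above (single slot, placeholder names rather than the estimator's API):

import numpy as np

def switch_dr_sketch(reward, action, pscore, pi_e, q_hat, lambda_=10.0):
    """Illustrative Switch-DR value; pi_e and q_hat are (n_rounds, n_actions) placeholders."""
    idx = np.arange(reward.shape[0])
    iw = pi_e[idx, action] / pscore                       # w(x_i, a_i)
    keep_iw = (iw <= lambda_).astype(float)               # indicator I{w(x_i, a_i) <= lambda}
    baseline = np.average(q_hat, weights=pi_e, axis=1)    # \hat{q}(x_i, \pi_e)
    return float(np.mean(baseline + iw * keep_iw * (reward - q_hat[idx, action])))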
action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) use_bias_upper_bound: bool, default=True Whether to use bias upper bound in hyperparameter tuning. - If False, direct bias estimator is used to estimate the MSE. + If False, the direct bias estimator is used to estimate the MSE. See Su et al.(2020) for details. delta: float, default=0.05 - A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality. + A confidence delta to construct a high probability upper bound based on Bernstein inequality. Returns ---------- @@ -1585,36 +1595,31 @@ class DoublyRobustWithShrinkage(DoublyRobust): Note ------ - DR with (optimistic) shrinkage replaces the importance weight in the original DR estimator with a new weight mapping - found by directly optimizing sharp bounds on the resulting MSE. + DRos shrinks the importance weight in the vanilla DR by directly optimizing sharp bounds on the resulting MSE. .. math:: \\hat{V}_{\\mathrm{DRos}} (\\pi_e; \\mathcal{D}, \\hat{q}, \\lambda) - := \\mathbb{E}_{\\mathcal{D}} [\\hat{q}(x_t,\\pi_e) + w_o(x_t,a_t;\\lambda) (r_t - \\hat{q}(x_t,a_t))], + := \\mathbb{E}_{n} [\\hat{q}(x_i,\\pi_e) + \\frac{\\lambda w(x_i,a_i)}{w^2(x_i,a_i) + \\lambda} (r_i - \\hat{q}(x_i,a_i))], - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by - a behavior policy :math:`\\pi_b`. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by + behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`. - :math:`\\hat{q} (x_t,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`.
- :math:`\\hat{q} (x,a)` is an estimated expected reward given :math:`x` and :math:`a`. - To estimate the mean reward function, please use `obp.ope.regression_model.RegressionModel`. - - :math:`w_{o} (x_t,a_t;\\lambda)` is a new weight by the shrinkage technique which is defined as + :math:`\\hat{q} (x_i,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. + :math:`\\hat{q} (x,a)` is the estimated expected reward given :math:`x` and :math:`a`. + To estimate the reward function, please use `obp.ope.regression_model.RegressionModel`. - .. math:: - - w_{o} (x_t,a_t;\\lambda) := \\frac{\\lambda}{w^2(x_t,a_t) + \\lambda} w(x_t,a_t). - - When :math:`\\lambda=0`, we have :math:`w_{o} (x,a;\\lambda)=0` corresponding to the DM estimator. - In contrast, as :math:`\\lambda \\rightarrow \\infty`, :math:`w_{o} (x,a;\\lambda)` increases and in the limit becomes equal to the original importance weight, corresponding to the standard DR estimator. + When :math:`\\lambda=0`, we have :math:`\\hat{w} (x,a;\\lambda)=0` corresponding to DM. + In contrast, as :math:`\\lambda \\rightarrow \\infty`, :math:`\\hat{w} (x,a;\\lambda)` increases and in the limit becomes equal to the original importance weight, corresponding to the standard DR estimator. Parameters ---------- lambda_: float - Shrinkage hyperparameter. - This hyperparameter should be larger than or equal to 0., otherwise it is meaningless. + Hyperparameter to shrink the importance weights. Should be larger than or equal to 0., otherwise it is meaningless. + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. estimator_name: str, default='dr-os'. Name of the estimator. @@ -1641,7 +1646,7 @@ def __post_init__(self) -> None: min_val=0.0, ) if self.lambda_ != self.lambda_: - raise ValueError("lambda_ must not be nan") + raise ValueError("`lambda_` must not be nan") if not isinstance(self.use_estimated_pscore, bool): raise TypeError( f"`use_estimated_pscore` must be a bool, but {type(self.use_estimated_pscore)} is given" @@ -1662,29 +1667,29 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. 
+ Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by the DRos estimator. + Estimated rewards for each observation. """ n_rounds = action.shape[0] @@ -1708,7 +1713,7 @@ def _estimate_round_rewards( axis=1, ) else: - raise ValueError("reward must be 1D array") + raise ValueError("`reward` must be 1D array") estimated_rewards += iw_hat * (reward - q_hat_factual) return estimated_rewards @@ -1723,35 +1728,36 @@ def _estimate_mse_score( position: Optional[np.ndarray] = None, use_bias_upper_bound: bool = False, delta: float = 0.05, + **kwargs, ) -> float: """Estimate the MSE score of a given shrinkage hyperparameter to conduct hyperparameter tuning. Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. use_bias_upper_bound: bool, default=True Whether to use bias upper bound in hyperparameter tuning. - If False, direct bias estimator is used to estimate the MSE. + If False, the direct bias estimator is used to estimate the MSE. See Su et al.(2020) for details. delta: float, default=0.05 - A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality. + A confidence delta to construct a high probability upper bound based on Bernstein inequality. 
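A minimal sketch of the DRos estimate with the shrunk weight lambda * w / (w^2 + lambda) (single slot, placeholder names rather than the estimator's API):

import numpy as np

def dr_os_sketch(reward, action, pscore, pi_e, q_hat, lambda_=100.0):
    """Illustrative DRos value; pi_e and q_hat are (n_rounds, n_actions) placeholders."""
    idx = np.arange(reward.shape[0])
    iw = pi_e[idx, action] / pscore
    iw_hat = lambda_ * iw / (iw ** 2 + lambda_)           # 0 when lambda=0 (DM); approaches w as lambda grows (DR)
    baseline = np.average(q_hat, weights=pi_e, axis=1)
    return float(np.mean(baseline + iw_hat * (reward - q_hat[idx, action])))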
Returns ---------- @@ -1790,7 +1796,403 @@ def _estimate_mse_score( q_hat=estimated_rewards_by_reg_model[ np.arange(n_rounds), action, position ], - delta=0.05, + delta=delta, + ) + else: + bias_term = estimate_bias_in_ope( + reward=reward, + iw=iw, + iw_hat=iw_hat, + q_hat=estimated_rewards_by_reg_model[ + np.arange(n_rounds), action, position + ], + ) + estimated_mse_score = sample_variance + (bias_term ** 2) + + return estimated_mse_score + + +@dataclass +class SubGaussianInverseProbabilityWeighting(InverseProbabilityWeighting): + """Sub-Gaussian Inverse Probability Weighting (SG-IPW) Estimator. + + Note + ------ + Sub-Gaussian IPW replaces the importance weights in the vanilla IPW by applying the power mean as follows. + + .. math:: + + \\hat{V}_{\\mathrm{SGIPW}} (\\pi_e; \\mathcal{D}, \\hat{q}, \\lambda) + := \\mathbb{E}_{n} [\\frac{w(x_i,a_i)}{1 - \\lambda + \\lambda \cdot w(x_i,a_i)} r_i ], + + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations + collected by behavior policy :math:`\\pi_b`. + :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the true importance weight given :math:`x` and :math:`a`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. + + Parameters + ---------- + lambda_: float + Hyperparameter to shrink the importance weights. Should be within the range of [0.0, 1.0]. + When `lambda_=0`, the estimator is identical to the vanilla IPW. + When `lambda_=1`, the importance weights will be uniform. + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. + + estimator_name: str, default='sg-ipw'. + Name of the estimator. + + References + ---------- + Alberto Maria Metelli, Alessio Russo, and Marcello Restelli. + "Subgaussian and Differentiable Importance Sampling for Off-Policy Evaluation and Learning.", 2021. + + """ + + lambda_: float = 0.0 + estimator_name: str = "sg-ipw" + + def __post_init__(self) -> None: + """Initialize Class.""" + check_scalar( + self.lambda_, + name="lambda_", + target_type=(int, float), + min_val=0.0, + max_val=1.0, + ) + if self.lambda_ != self.lambda_: + raise ValueError("`lambda_` must not be nan") + if not isinstance(self.use_estimated_pscore, bool): + raise TypeError( + f"`use_estimated_pscore` must be a bool, but {type(self.use_estimated_pscore)} is given" + ) + + def _estimate_round_rewards( + self, + reward: np.ndarray, + action: np.ndarray, + pscore: np.ndarray, + action_dist: np.ndarray, + position: Optional[np.ndarray] = None, + **kwargs, + ) -> np.ndarray: + """Estimate round-wise (or sample-wise) rewards. + + Parameters + ---------- + reward: array-like or Tensor, shape (n_rounds,) + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. + + action: array-like or Tensor, shape (n_rounds,) + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. + + pscore: array-like or Tensor, shape (n_rounds,) + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + + action_dist: array-like or Tensor, shape (n_rounds, n_actions, len_list) + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. + + position: array-like or Tensor, shape (n_rounds,), default=None + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored.
+ (If only a single action is chosen for each data, you can just ignore this argument.) + + Returns + ---------- + estimated_rewards: array-like or Tensor, shape (n_rounds,) + Estimated rewards for each observation. + + """ + n_rounds = action.shape[0] + iw = action_dist[np.arange(n_rounds), action, position] / pscore + iw_hat = iw / (1 - self.lambda_ + self.lambda_ * iw) + estimated_rewards = iw_hat * reward + + return estimated_rewards + + def _estimate_mse_score( + self, + reward: np.ndarray, + action: np.ndarray, + pscore: np.ndarray, + action_dist: np.ndarray, + position: Optional[np.ndarray] = None, + use_bias_upper_bound: bool = False, + delta: float = 0.05, + **kwargs, + ) -> float: + """Estimate the MSE score of a given shrinkage hyperparameter to conduct hyperparameter tuning. + + Parameters + ---------- + reward: array-like, shape (n_rounds,) + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. + + action: array-like, shape (n_rounds,) + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. + + pscore: array-like, shape (n_rounds,) + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + + action_dist: array-like, shape (n_rounds, n_actions, len_list) + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. + + estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. + + position: array-like, shape (n_rounds,), default=None + Position in a recommendation interface where the action was presented. + + use_bias_upper_bound: bool, default=True + Whether to use bias upper bound in hyperparameter tuning. + If False, the direct bias estimator is used to estimate the MSE. See Su et al.(2020) for details. + + delta: float, default=0.05 + A confidence delta to construct a high probability upper bound based on Bernstein inequality. + + Returns + ---------- + estimated_mse_score: float + Estimated MSE score of a given shrinkage hyperparameter `lambda_`. + MSE score is the sum of (high probability) upper bound of bias and the sample variance. + This is estimated using the automatic hyperparameter tuning procedure + based on Section 5 of Su et al.(2020). + + """ + n_rounds = reward.shape[0] + # estimate the sample variance of DRos + sample_variance = np.var( + self._estimate_round_rewards( + reward=reward, + action=action, + pscore=pscore, + action_dist=action_dist, + position=position, + ) + ) + sample_variance /= n_rounds + + # estimate the (high probability) upper bound of the bias of SGIPW + iw = action_dist[np.arange(n_rounds), action, position] / pscore + iw_hat = iw / (1 - self.lambda_ + self.lambda_ * iw) + if use_bias_upper_bound: + bias_term = estimate_high_probability_upper_bound_bias( + reward=reward, + iw=iw, + iw_hat=iw_hat, + delta=delta, + ) + else: + bias_term = estimate_bias_in_ope( + reward=reward, + iw=iw, + iw_hat=iw_hat, + ) + estimated_mse_score = sample_variance + (bias_term ** 2) + + return estimated_mse_score + + +@dataclass +class SubGaussianDoublyRobust(DoublyRobust): + """Sub-Gaussian Doubly Robust (SG-DR) Estimator. + + Note + ------ + Sub-Gaussian DR replaces the importance weights in the vanilla DR by applying the power mean as follows. + + .. 
math:: + + \\hat{V}_{\\mathrm{SGDR}} (\\pi_e; \\mathcal{D}, \\hat{q}, \\lambda) + := \\mathbb{E}_{n} [\\hat{q}(x_i,\\pi_e) + \\frac{w(x_i,a_i)}{1 - \\lambda + \\lambda \cdot w(x_i,a_i)} (r_i - \\hat{q}(x_i,a_i))], + + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by behavior policy :math:`\\pi_b`. + :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the true importance weight given :math:`x` and :math:`a`. + :math:`\\hat{q} (x_i,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. + :math:`\\hat{q} (x,a)` is the estimated expected reward given :math:`x` and :math:`a`. + To estimate the reward function, please use `obp.ope.regression_model.RegressionModel`. + + Parameters + ---------- + lambda_: float + Hyperparameter to shrink the importance weights. Should be within the range of [0.0, 1.0]. + When `lambda_=0`, the estimator is identical to the vanilla DR. + When `lambda_=1`, the importance weights will be uniform. + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. + + estimator_name: str, default='sg-dr'. + Name of the estimator. + + References + ---------- + Alberto Maria Metelli, Alessio Russo, and Marcello Restelli. + "Subgaussian and Differentiable Importance Sampling for Off-Policy Evaluation and Learning.", 2021. + + """ + + lambda_: float = 0.0 + estimator_name: str = "sg-dr" + + def __post_init__(self) -> None: + """Initialize Class.""" + check_scalar( + self.lambda_, + name="lambda_", + target_type=(int, float), + min_val=0.0, + max_val=1.0, + ) + if self.lambda_ != self.lambda_: + raise ValueError("`lambda_` must not be nan") + if not isinstance(self.use_estimated_pscore, bool): + raise TypeError( + f"`use_estimated_pscore` must be a bool, but {type(self.use_estimated_pscore)} is given" + ) + + def _estimate_round_rewards( + self, + reward: np.ndarray, + action: np.ndarray, + pscore: np.ndarray, + action_dist: np.ndarray, + estimated_rewards_by_reg_model: np.ndarray, + position: Optional[np.ndarray] = None, + **kwargs, + ) -> np.ndarray: + """Estimate round-wise (or sample-wise) rewards. + + Parameters + ---------- + reward: array-like or Tensor, shape (n_rounds,) + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. + + action: array-like or Tensor, shape (n_rounds,) + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. + + pscore: array-like or Tensor, shape (n_rounds,) + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + + action_dist: array-like or Tensor, shape (n_rounds, n_actions, len_list) + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. + + estimated_rewards_by_reg_model: array-like or Tensor, shape (n_rounds, n_actions, len_list) + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. + + position: array-like or Tensor, shape (n_rounds,), default=None + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) 
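For reference, a small sketch of the power-mean weight that both SG-IPW and SG-DR apply, matching `iw_hat = iw / (1 - self.lambda_ + self.lambda_ * iw)` in the added code; the surrounding function uses placeholder names rather than the class API:

import numpy as np

def power_mean_weight(iw, lambda_):
    """Interpolates between the raw weight (lambda_=0) and a constant weight of 1 (lambda_=1)."""
    return iw / (1.0 - lambda_ + lambda_ * iw)

def sg_dr_sketch(reward, action, pscore, pi_e, q_hat, lambda_=0.1):
    """Illustrative SG-DR value for a single slot; pi_e and q_hat are (n_rounds, n_actions) placeholders."""
    idx = np.arange(reward.shape[0])
    iw_hat = power_mean_weight(pi_e[idx, action] / pscore, lambda_)
    baseline = np.average(q_hat, weights=pi_e, axis=1)
    return float(np.mean(baseline + iw_hat * (reward - q_hat[idx, action])))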
+ + Returns + ---------- + estimated_rewards: array-like or Tensor, shape (n_rounds,) + Estimated rewards for each observation. + + """ + n_rounds = action.shape[0] + iw = action_dist[np.arange(n_rounds), action, position] / pscore + iw_hat = iw / (1 - self.lambda_ + self.lambda_ * iw) + q_hat_at_position = estimated_rewards_by_reg_model[ + np.arange(n_rounds), :, position + ] + q_hat_factual = estimated_rewards_by_reg_model[ + np.arange(n_rounds), action, position + ] + pi_e_at_position = action_dist[np.arange(n_rounds), :, position] + + if isinstance(reward, np.ndarray): + estimated_rewards = np.average( + q_hat_at_position, + weights=pi_e_at_position, + axis=1, + ) + else: + raise ValueError("`reward` must be 1D array") + + estimated_rewards += iw_hat * (reward - q_hat_factual) + return estimated_rewards + + def _estimate_mse_score( + self, + reward: np.ndarray, + action: np.ndarray, + pscore: np.ndarray, + action_dist: np.ndarray, + estimated_rewards_by_reg_model: np.ndarray, + position: Optional[np.ndarray] = None, + use_bias_upper_bound: bool = False, + delta: float = 0.05, + **kwargs, + ) -> float: + """Estimate the MSE score of a given shrinkage hyperparameter to conduct hyperparameter tuning. + + Parameters + ---------- + reward: array-like, shape (n_rounds,) + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. + + action: array-like, shape (n_rounds,) + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. + + pscore: array-like, shape (n_rounds,) + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + + action_dist: array-like, shape (n_rounds, n_actions, len_list) + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. + + estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. + + position: array-like, shape (n_rounds,), default=None + Position in a recommendation interface where the action was presented. + + use_bias_upper_bound: bool, default=True + Whether to use bias upper bound in hyperparameter tuning. + If False, the direct bias estimator is used to estimate the MSE. See Su et al.(2020) for details. + + delta: float, default=0.05 + A confidence delta to construct a high probability upper bound based on Bernstein inequality. + + Returns + ---------- + estimated_mse_score: float + Estimated MSE score of a given shrinkage hyperparameter `lambda_`. + MSE score is the sum of (high probability) upper bound of bias and the sample variance. + This is estimated using the automatic hyperparameter tuning procedure + based on Section 5 of Su et al.(2020). 
+ + """ + n_rounds = reward.shape[0] + # estimate the sample variance of DRos + sample_variance = np.var( + self._estimate_round_rewards( + reward=reward, + action=action, + pscore=pscore, + action_dist=action_dist, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + position=position, + ) + ) + sample_variance /= n_rounds + + # estimate the (high probability) upper bound of the bias of SGDR + iw = action_dist[np.arange(n_rounds), action, position] / pscore + iw_hat = iw / (1 - self.lambda_ + self.lambda_ * iw) + if use_bias_upper_bound: + bias_term = estimate_high_probability_upper_bound_bias( + reward=reward, + iw=iw, + iw_hat=iw_hat, + q_hat=estimated_rewards_by_reg_model[ + np.arange(n_rounds), action, position + ], + delta=delta, ) else: bias_term = estimate_bias_in_ope( @@ -1812,17 +2214,17 @@ class BalancedInverseProbabilityWeighting(BaseOffPolicyEstimator): Note ------- - Balanced Inverse Probability Weighting (B-IPW) estimates the policy value of evaluation policy :math:`\\pi_e` by + B-IPW estimates the policy value of evaluation policy :math:`\\pi_e` as .. math:: - \\hat{V}_{\\mathrm{B-IPW}} (\\pi_e; \\mathcal{D}) := \\frac{\\mathbb{E}_{\\mathcal{D}} [\\hat{w}(x_t,a_t) r_t]}{\\mathbb{E}_{\\mathcal{D}} [\\hat{w}(x_t,a_t)}, + \\hat{V}_{\\mathrm{B-IPW}} (\\pi_e; \\mathcal{D}) := \\frac{\\mathbb{E}_{\\mathcal{D}} [\\hat{w}(x_i,a_i) r_i]}{\\mathbb{E}_{\\mathcal{D}} [\\hat{w}(x_i,a_i)}, - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_t)\\}_{t=1}^{T}` is logged bandit data with :math:`n` observations collected by a behavior policy :math:`\\pi_b`. :math:`\\hat{w}(x,a):=\\Pr[C=1|x,a] / \\Pr[C=0|x,a]`, where :math:`\\Pr[C=1|x,a]` is the probability that the data coming from the evaluation policy given action :math:`a` and :math:`x`. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. - When the weight-clipping is applied, large importance weights are clipped as :math:`\\hat{w_c}(x,a) := \\min \\{ \\lambda, \\hat{w}(x,a) \\}` + :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. + When the clipping is applied, large importance weights are clipped as :math:`\\hat{w_c}(x,a) := \\min \\{ \\lambda, \\hat{w}(x,a) \\}` where :math:`\\lambda (>0)` is a hyperparameter to define a maximum value allowed for importance weights. B-IPW re-weights the rewards by the importance weights estimated via a supervised classification procedure, and thus can be used even when the behavior policy (or the propensity score of the behavior policy) is not known. `obp.ope.ImportanceWeightEstimator` can be used to estimate the importance weights for B-IPW. @@ -1832,7 +2234,7 @@ class BalancedInverseProbabilityWeighting(BaseOffPolicyEstimator): .. math:: - \\hat{V}_{\\mathrm{B-IPW}} (\\pi_e; \\mathcal{D}) := \\frac{\\mathbb{E}_{\\mathcal{D}} [ \\hat{w}(x_t,\\pi_e (x_t)) r_t]}{\\mathbb{E}_{\\mathcal{D}} [ \\hat{w}(x_t,\\pi_e (x_t))}, + \\hat{V}_{\\mathrm{B-IPW}} (\\pi_e; \\mathcal{D}) := \\frac{\\mathbb{E}_{\\mathcal{D}} [ \\hat{w}(x_t,\\pi_e (x_t)) r_i]}{\\mathbb{E}_{\\mathcal{D}} [ \\hat{w}(x_t,\\pi_e (x_t))}, where :math:`\\pi_e` is a deterministic evaluation policy. We modify this original definition to adjust to stochastic evaluation policies. 
@@ -1864,7 +2266,7 @@ def __post_init__(self) -> None: min_val=0.0, ) if self.lambda_ != self.lambda_: - raise ValueError("lambda_ must not be nan") + raise ValueError("`lambda_` must not be nan") def _estimate_round_rewards( self, @@ -1880,26 +2282,26 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. estimated_importance_weights: array-like, shape (n_rounds,) - Importance weights estimated via supervised classification using `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification using `obp.ope.ImportanceWeightEstimator`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only one action is chosen and there is no posion, then you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by Balanced IPW. + Estimated rewards for each observation. """ if position is None: @@ -1924,26 +2326,26 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. estimated_importance_weights: array-like, shape (n_rounds,) - Importance weights estimated via supervised classification using `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification using `obp.ope.ImportanceWeightEstimator`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. 
+ If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_array(array=reward, name="reward", expected_dim=1) @@ -1987,21 +2389,21 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. estimated_importance_weights: array-like, shape (n_rounds,) - Importance weights estimated via supervised classification using `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification using `obp.ope.ImportanceWeightEstimator`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) alpha: float, default=0.05 Significance level. diff --git a/obp/ope/estimators_continuous.py b/obp/ope/estimators_continuous.py index d4bf3345..768d2204 100644 --- a/obp/ope/estimators_continuous.py +++ b/obp/ope/estimators_continuous.py @@ -76,20 +76,19 @@ class KernelizedInverseProbabilityWeighting(BaseContinuousOffPolicyEstimator): Note ------- - Kernelized Inverse Probability Weighting (KernelizedIPW) - estimates the policy value of a given (deterministic) evaluation policy :math:`\\pi_e` by + Kernel IPW estimates the policy value of a given (deterministic) evaluation policy :math:`\\pi_e` as .. math:: \\hat{V}_{\\mathrm{Kernel-IPW}} (\\pi_e; \\mathcal{D}) - := \\mathbb{E}_{\\mathcal{D}} \\left[ \\frac{1}{h} K \\left( \\frac{\pi_e(x_t) - a_t}{h} \\right) \\frac{r_t}{q_t} \\right], + := \\mathbb{E}_{n} \\left[ \\frac{1}{h} K \\left( \\frac{\\pi_e(x_i) - a_i}{h} \\right) \\frac{r_i}{q_i} \\right], - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by behavior policy. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by behavior policy. Note that each action :math:`a_t` in the logged bandit data is a continuous variable. - :math:`q_t` is a generalized propensity score that is defined as the conditional probability density of the behavior policy. + :math:`q_i` is the generalized propensity score, which is defined as the conditional probability density of the behavior policy. 
:math:`K(\cdot)` is a kernel function such as the gaussian kernel, and :math:`h` is a bandwidth hyperparameter. :math:`\\pi_e (x)` is a deterministic evaluation policy that maps :math:`x` to a continuous action value. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. Parameters ------------ @@ -98,7 +97,7 @@ class KernelizedInverseProbabilityWeighting(BaseContinuousOffPolicyEstimator): Must be one of "gaussian", "epanechnikov", "triangular", or "cosine". bandwidth: float - A bandwidth hyperparameter. + Bandwidth hyperparameter. A larger value increases bias instead of reducing variance. A smaller value increases variance instead of reducing bias. @@ -138,14 +137,14 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) Probability densities of the continuous action values sampled by behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_evaluation_policy: array-like, shape (n_rounds,) Continuous action values given by evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. @@ -153,7 +152,7 @@ def _estimate_round_rewards( Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by KernelizedIPW. + Estimated rewards for each observation. """ kernel_func = kernel_functions[self.kernel] @@ -176,14 +175,14 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) Probability densities of the continuous action values sampled by behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_evaluation_policy: array-like, shape (n_rounds,) Continuous action values given by evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. @@ -191,7 +190,7 @@ def estimate_policy_value( Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_array(array=reward, name="reward", expected_dim=1) @@ -231,14 +230,14 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. 
action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) Probability densities of the continuous action values sampled by behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_evaluation_policy: array-like, shape (n_rounds,) Continuous action values given by evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. @@ -295,20 +294,19 @@ class KernelizedSelfNormalizedInverseProbabilityWeighting( Note ------- - Kernelized Self-Normalized Inverse Probability Weighting (KernelizedSNIPW) - estimates the policy value of a given (deterministic) evaluation policy :math:`\\pi_e` by + Kernel SNIPW estimates the policy value of a given (deterministic) evaluation policy :math:`\\pi_e` as .. math:: \\hat{V}_{\\mathrm{Kernel-SNIPW}} (\\pi_e; \\mathcal{D}) - := \\frac{\\mathbb{E}_{\\mathcal{D}} \\left[ K \\left( \\frac{\pi_e(x_t) - a_t}{h} \\right) \\frac{r_t}{q_t} \\right]}{\\mathbb{E}_{\\mathcal{D}} \\left[ K \\left( \\frac{\pi_e(x_t) - a_t}{h} \\right) \\frac{r_t}{q_t}}, + := \\frac{\\mathbb{E}_{n} \\left[ K \\left( \\frac{\\pi_e(x_i) - a_i}{h} \\right) \\frac{r_i}{q_i} \\right]}{\\mathbb{E}_{n} \\left[ K \\left( \\frac{\\pi_e(x_i) - a_i}{h} \\right) \\frac{1}{q_i} \\right]}, - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by behavior policy. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by behavior policy. Note that each action :math:`a_t` in the logged bandit data is a continuous variable. - :math:`q_t` is a generalized propensity score that is defined as the conditional probability density of the behavior policy. + :math:`q_i` is the generalized propensity score, which is defined as the conditional probability density of the behavior policy. :math:`K(\cdot)` is a kernel function such as the gaussian kernel, and :math:`h` is a bandwidth hyperparameter. :math:`\\pi_e (x)` is a deterministic evaluation policy that maps :math:`x` to a continuous action value. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. Parameters ------------ @@ -317,7 +315,7 @@ class KernelizedSelfNormalizedInverseProbabilityWeighting( Must be one of "gaussian", "epanechnikov", "triangular", or "cosine". bandwidth: float - A bandwidth hyperparameter. + Bandwidth hyperparameter. A larger value increases bias instead of reducing variance. A smaller value increases variance instead of reducing bias. @@ -357,14 +355,14 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
+ Continuous action values sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) Probability densities of the continuous action values sampled by behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_evaluation_policy: array-like, shape (n_rounds,) Continuous action values given by evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. @@ -372,7 +370,7 @@ def _estimate_round_rewards( Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by KernelizedSNIPW. + Estimated rewards for each observation. """ check_array(array=reward, name="reward", expected_dim=1) @@ -403,14 +401,14 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) Probability densities of the continuous action values sampled by behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_evaluation_policy: array-like, shape (n_rounds,) Continuous action values given by evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. @@ -418,7 +416,7 @@ def estimate_policy_value( Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_array(array=reward, name="reward", expected_dim=1) @@ -458,14 +456,14 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) Probability densities of the continuous action values sampled by behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_evaluation_policy: array-like, shape (n_rounds,) Continuous action values given by evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. @@ -520,20 +518,20 @@ class KernelizedDoublyRobust(BaseContinuousOffPolicyEstimator): Note ------- - Kernelized Doubly Robust (KernelizedDR) estimates the policy value of a given (deterministic) evaluation policy :math:`\\pi_e` by + Kernel DR estimates the policy value of a given (deterministic) evaluation policy :math:`\\pi_e` as .. 
math:: \\hat{V}_{\\mathrm{Kernel-DR}} (\\pi_e; \\mathcal{D}) - := \\mathbb{E}_{\\mathcal{D}} \\left[ \\frac{1}{h} K \\left( \\frac{\pi_e(x_t) - a_t}{h} \\right) \\frac{(r_t - \\hat{q}(x_t, \\pi_e(x_t)))}{q_t} + \\hat{q}(x_t, \\pi_e(x_t)) \\right], + := \\mathbb{E}_{n} \\left[ \\frac{1}{h} K \\left( \\frac{\\pi_e(x_i) - a_i}{h} \\right) \\frac{(r_i - \\hat{q}(x_i, \\pi_e(x_i)))}{q_i} + \\hat{q}(x_i, \\pi_e(x_i)) \\right], - where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by behavior policy. + where :math:`\\mathcal{D}=\\{(x_i,a_i,r_i)\\}_{i=1}^{n}` is logged bandit data with :math:`n` observations collected by behavior policy. Note that each action :math:`a_t` in the logged bandit data is a continuous variable. - :math:`q_t` is a generalized propensity score that is defined as the conditional probability density of the behavior policy. + :math:`q_i` is the generalized propensity score, which is defined as the conditional probability density of the behavior policy. :math:`K(\cdot)` is a kernel function such as the gaussian kernel, and :math:`h` is a bandwidth hyperparameter. :math:`\\pi_e (x)` is a deterministic evaluation policy that maps :math:`x` to a continuous action value. - :math:`\\hat{q} (x,a)` is an estimated expected reward given :math:`x` and :math:`a`. - :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`. + :math:`\\hat{q} (x,a)` is the estimated expected reward given :math:`x` and :math:`a`. + :math:`\\mathbb{E}_{n}[\\cdot]` is the empirical average over :math:`n` observations in :math:`\\mathcal{D}`. Parameters ------------ @@ -542,7 +540,7 @@ class KernelizedDoublyRobust(BaseContinuousOffPolicyEstimator): Must be one of "gaussian", "epanechnikov", "triangular", or "cosine". bandwidth: float - A bandwidth hyperparameter. + Bandwidth hyperparameter. A larger value increases bias instead of reducing variance. A smaller value increases variance instead of reducing bias. @@ -583,25 +581,25 @@ def _estimate_round_rewards( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) Probability densities of the continuous action values sampled by behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_evaluation_policy: array-like, shape (n_rounds,) Continuous action values given by evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds,) - Expected rewards given context and action estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context and action estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. Returns ---------- estimated_rewards: array-like, shape (n_rounds,) - Rewards of each round estimated by KernelizedDR. + Estimated rewards for each observation.
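As a rough, self-contained illustration of the Kernel-DR formula above (gaussian kernel; the function and argument names are illustrative, not the library's API), the per-observation terms can be computed as follows:

import numpy as np


def gaussian_kernel(u: np.ndarray) -> np.ndarray:
    # standard normal density used as the smoothing kernel K
    return np.exp(-(u**2) / 2) / np.sqrt(2 * np.pi)


def kernel_dr_round_rewards(
    reward: np.ndarray,  # r_i
    action_by_behavior_policy: np.ndarray,  # a_i (continuous)
    pscore: np.ndarray,  # q_i, generalized propensity score
    action_by_evaluation_policy: np.ndarray,  # pi_e(x_i)
    estimated_rewards_by_reg_model: np.ndarray,  # q_hat(x_i, pi_e(x_i))
    bandwidth: float,  # h
) -> np.ndarray:
    # kernel importance weight: (1 / h) * K((pi_e(x_i) - a_i) / h) / q_i
    u = (action_by_evaluation_policy - action_by_behavior_policy) / bandwidth
    kernel_weight = gaussian_kernel(u) / (bandwidth * pscore)
    # doubly robust combination: reweighted residual plus the regression prediction
    return (
        kernel_weight * (reward - estimated_rewards_by_reg_model)
        + estimated_rewards_by_reg_model
    )

Averaging these per-observation terms over the :math:`n` observations gives the Kernel-DR estimate of the policy value.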
""" kernel_func = kernel_functions[self.kernel] @@ -628,25 +626,25 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) Probability densities of the continuous action values sampled by behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_evaluation_policy: array-like, shape (n_rounds,) Continuous action values given by evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds,) - Expected rewards given context and action estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context and action estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_array( @@ -694,20 +692,20 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,) Probability densities of the continuous action values sampled by behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_evaluation_policy: array-like, shape (n_rounds,) Continuous action values given by evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds,) - Expected rewards given context and action estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context and action estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. alpha: float, default=0.05 Significance level. diff --git a/obp/ope/estimators_slate.py b/obp/ope/estimators_slate.py index c58171fd..ac7faf69 100644 --- a/obp/ope/estimators_slate.py +++ b/obp/ope/estimators_slate.py @@ -9,7 +9,9 @@ from typing import Optional import numpy as np +from sklearn.utils import check_scalar +from ..utils import check_cascade_dr_inputs from ..utils import check_iips_inputs from ..utils import check_rips_inputs from ..utils import check_sips_inputs @@ -41,7 +43,7 @@ class BaseSlateInverseProbabilityWeighting(BaseSlateOffPolicyEstimator): """Base Class of Inverse Probability Weighting Estimators for the slate contextual bandit setting. len_list: int (> 1) - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender inferface, slate size. 
When Open Bandit Dataset is used, `len_list=3`. """ @@ -67,10 +69,12 @@ def _estimate_round_rewards( IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. behavior_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Marginal probabilities of behavior policy selecting an action (propensity scores) at each position (slot) `k` or + joint probabilities of behavior policy selecting a set of actions. evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Marginal probabilities of evaluation policy selecting an action at each position (slot) `k` or + joint probabilities of evaluation policy selecting a set of actions. Returns ---------- @@ -176,16 +180,17 @@ def estimate_policy_value( IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Joint probabilities of behavior policy selecting a slate action (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. This parameter must be unique in each slate. evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate. Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_sips_inputs( @@ -231,11 +236,12 @@ def estimate_interval( IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Joint probabilities of behavior policy selecting a slate action (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. This parameter must be unique in each slate. evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate. alpha: float, default=0.05 Significance level. @@ -324,15 +330,15 @@ def estimate_policy_value( IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Probabilities that behavior policy selects each action :math:`a` at position (slot) :math:`k` given context :math:`x`, i.e., :math:`\\pi_b(a_{t}(k) |x_t)`. + Marginal probabilities of behavior policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_b(a_{t}(k) |x_t)`. evaluation_policy_pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Probabilities that evaluation policy selects each action :math:`a` at position (slot) :math:`k` given context :math:`x`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. + Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. 
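To make the marginal (item-position) weighting above concrete, here is a minimal sketch of the per-slot IIPS terms; the helper name and the final aggregation step are illustrative assumptions, not the library's internals:

import numpy as np


def iips_slot_rewards(
    reward: np.ndarray,  # r_t(k), flattened over slates and slots
    pscore_item_position: np.ndarray,  # pi_b(a_t(k) | x_t)
    evaluation_policy_pscore_item_position: np.ndarray,  # pi_e(a_t(k) | x_t)
) -> np.ndarray:
    # reweight each slot-level reward by its marginal importance weight
    iw = evaluation_policy_pscore_item_position / pscore_item_position
    return reward * iw

Summing these slot-level terms within each slate (grouped by slate_id) and then averaging over slates yields the IIPS estimate of the policy value.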
Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_iips_inputs( @@ -378,10 +384,10 @@ def estimate_interval( IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Marginal action choice probabilities of the slot (:math:`k`) by a behavior policy (propensity scores), i.e., :math:`\\pi_b(a_{t, k}|x_t)`. + Marginal probabilities of behavior policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_b(a_{t}(k) |x_t)`. evaluation_policy_pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Marginal action choice probabilities of the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(a_{t, k}|x_t)`. + Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. alpha: float, default=0.05 Significance level. @@ -467,17 +473,19 @@ def estimate_policy_value( IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Probabilities that behavior policy selects action :math:`a` at position (slot) `k` conditional on the previous actions (presented at position `1` to `k-1`) - , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \ldots, a_t(k-1))`. + Joint probabilities of behavior policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of behavior policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Probabilities that evaluation policy selects action :math:`a` at position (slot) `k` conditional on the previous actions (presented at position `1` to `k-1`) - , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \ldots, a_t(k-1))`. + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ @@ -524,10 +532,14 @@ def estimate_interval( IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities above the slot (:math:`k`) by a behavior policy (propensity scores), i.e., :math:`\\pi_b(\\{a_{t, j}\\}_{j \\le k}|x_t)`. + Joint probabilities of behavior policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of behavior policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`.
evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities above the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(\\{a_{t, j}\\}_{j \\le k}|x_t)`. + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. alpha: float, default=0.05 Significance level. @@ -566,6 +578,343 @@ def estimate_interval( ) +@dataclass +class SlateCascadeDoublyRobust(BaseSlateOffPolicyEstimator): + """Cascade Doubly Robust (Cascade-DR) Estimator. + + Note + ------- + Cascade Doubly Robust (Cascade-DR) estimates the policy value of evaluation (ranking) policy :math:`\\pi_e` + assuming the cascade click model (users interact with actions from the top position to the bottom in a slate). + It also uses reward prediction :math:`\\hat{Q}_k` as a control variate, which is derived using `obp.ope.SlateRegressionModel`. + Please refer to Section 3.1 of Kiyohara et al.(2022) for details. + + Parameters + ---------- + len_list: int + Length of a list of actions recommended in each impression (slate size). + When Open Bandit Dataset is used, `len_list=3`. + + n_unique_action: int + Number of unique actions. + + estimator_name: str, default='cascade-dr'. + Name of the estimator. + + References + ------------ + Haruka Kiyohara, Yuta Saito, Tatsuya Matsuhiro, Yusuke Narita, Nobuyuki Shimizu, and Yasuo Yamamoto. + "Doubly Robust Off-Policy Evaluation for Ranking Policies under the Cascade Behavior Model.", 2022. + + """ + + len_list: int + n_unique_action: int + estimator_name: str = "cascade-dr" + + def __post_init__(self): + """Initialize Class.""" + check_scalar(self.n_unique_action, "n_unique_action", int, min_val=1) + + def _estimate_round_rewards( + self, + action: np.ndarray, + reward: np.ndarray, + position: np.ndarray, + behavior_policy_pscore: np.ndarray, + evaluation_policy_pscore: np.ndarray, + q_hat: np.ndarray, + evaluation_policy_action_dist: np.ndarray, + **kwargs, + ) -> np.ndarray: + """Estimate rewards given round (slate_id) and slot (position). + + Parameters + ---------- + action: array-like, shape (n_rounds * len_list,) + Action observed at each slot in each round of the logged bandit feedback, i.e., :math:`a_{t}(k)`, + which is chosen by the behavior policy :math:`\\pi_b`. + + reward: array-like, shape (n_rounds * len_list,) + Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. + + position: array-like, shape (n_rounds * len_list,) + IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. + + pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of behavior policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of behavior policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`).
+ Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + q_hat: array-like, shape (n_rounds * len_list * n_unique_action,) + :math:`\\hat{Q}_k` for all unique actions + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. + + evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action,) + Plackett-Luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + + Returns + ---------- + estimated_rewards: array-like, shape (n_rounds * len_list,) + Rewards estimated by Cascade-DR given round (slate_id) and slot (position). + + """ + # (n_rounds_ * len_list * n_unique_action, ) -> (n_rounds_, len_list, n_unique_action) + q_hat_3d = q_hat.reshape((-1, self.len_list, self.n_unique_action)) + # the estimated Q functions for the action taken by the behavior policy + # (n_rounds_, len_list, n_unique_action) -> (n_rounds_ * len_list, ) + q_hat_for_observed_action = [] + for i in range(self.n_rounds_): + for position_ in range(self.len_list): + q_hat_for_observed_action.append( + q_hat_3d[i, position_, action[i * self.len_list + position_]] + ) + q_hat_for_observed_action = np.array(q_hat_for_observed_action) + # the expected Q function under the evaluation policy + # (n_rounds_ * len_list * n_unique_action, ) -> (n_rounds_, len_list, n_unique_action) -> (n_rounds_, len_list) -> (n_rounds_ * len_list, ) + expected_q_hat_under_eval_policy = ( + (evaluation_policy_action_dist * q_hat) + .reshape((-1, self.len_list, self.n_unique_action)) + .sum(axis=2) + .flatten() + ) + # importance weights + # (n_rounds * len_list, ) + iw = evaluation_policy_pscore / behavior_policy_pscore + iw_prev = np.roll(iw, 1) + iw_prev[np.array([i * self.len_list for i in range(self.n_rounds_)])] = 1 + # estimate policy value given each round and slot in a doubly robust manner + estimated_rewards = ( + iw * (reward - q_hat_for_observed_action) + + iw_prev * expected_q_hat_under_eval_policy + ) + return estimated_rewards + + def estimate_policy_value( + self, + slate_id: np.ndarray, + action: np.ndarray, + reward: np.ndarray, + position: np.ndarray, + pscore_cascade: np.ndarray, + evaluation_policy_pscore_cascade: np.ndarray, + q_hat: np.ndarray, + evaluation_policy_action_dist: np.ndarray, + **kwargs, + ) -> float: + """Estimate the policy value of evaluation policy. + + Parameters + ---------- + slate_id: array-like, shape (n_rounds * len_list,) + IDs to differentiate slates (i.e., rounds or lists of actions). + + action: array-like, shape (n_rounds * len_list,) + Action observed at each slot in each round of the logged bandit feedback, i.e., :math:`a_{t}(k)`, + which is chosen by the behavior policy :math:`\\pi_b`. + + reward: array-like, shape (n_rounds * len_list,) + Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. + + position: array-like, shape (n_rounds * len_list,) + IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate.
+ + pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of behavior policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of behavior policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + q_hat: array-like, shape (n_rounds * len_list * n_unique_action,) + :math:`\\hat{Q}_k` for all unique actions + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. + + evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action,) + Plackett-Luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + + Returns + ---------- + V_hat: float + Estimated policy value of evaluation policy. + + """ + check_cascade_dr_inputs( + n_unique_action=self.n_unique_action, + slate_id=slate_id, + action=action, + reward=reward, + position=position, + pscore_cascade=pscore_cascade, + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + self.n_rounds_ = np.unique(slate_id).shape[0] + return ( + self._estimate_round_rewards( + action=action, + reward=reward, + position=position, + behavior_policy_pscore=pscore_cascade, + evaluation_policy_pscore=evaluation_policy_pscore_cascade, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ).sum() + / self.n_rounds_ + ) + + def estimate_interval( + self, + slate_id: np.ndarray, + action: np.ndarray, + reward: np.ndarray, + position: np.ndarray, + pscore_cascade: np.ndarray, + evaluation_policy_pscore_cascade: np.ndarray, + q_hat: np.ndarray, + evaluation_policy_action_dist: np.ndarray, + alpha: float = 0.05, + n_bootstrap_samples: int = 10000, + random_state: Optional[int] = None, + **kwargs, + ) -> Dict[str, float]: + """Estimate confidence interval of policy value by nonparametric bootstrap procedure. + + Parameters + ---------- + slate_id: array-like, shape (n_rounds * len_list,) + IDs to differentiate slates (i.e., rounds or lists of actions). + + action: array-like, shape (n_rounds * len_list,) + Action observed at each slot in each round of the logged bandit feedback, i.e., :math:`a_{t}(k)`, + which is chosen by the behavior policy :math:`\\pi_b`. + + reward: array-like, shape (n_rounds * len_list,) + Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. + + position: array-like, shape (n_rounds * len_list,) + IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate.
+ + pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of behavior policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of behavior policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + q_hat: array-like, shape (n_rounds * len_list * n_unique_action,) + :math:`\\hat{Q}_k` for all unique actions + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. + + evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action,) + Plackett-Luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + + alpha: float, default=0.05 + Significance level. + + n_bootstrap_samples: int, default=10000 + Number of resampling performed in the bootstrap procedure. + + random_state: int, default=None + Controls the random seed in bootstrap sampling. + + Returns + ---------- + estimated_confidence_interval: Dict[str, float] + Dictionary storing the estimated mean and upper-lower confidence bounds. + + """ + check_cascade_dr_inputs( + n_unique_action=self.n_unique_action, + slate_id=slate_id, + action=action, + reward=reward, + position=position, + pscore_cascade=pscore_cascade, + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + self.n_rounds_ = np.unique(slate_id).shape[0] + estimated_rewards = self._estimate_round_rewards( + action=action, + reward=reward, + position=position, + behavior_policy_pscore=pscore_cascade, + evaluation_policy_pscore=evaluation_policy_pscore_cascade, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + return self._estimate_slate_confidence_interval_by_bootstrap( + slate_id=slate_id, + estimated_rewards=estimated_rewards, + alpha=alpha, + n_bootstrap_samples=n_bootstrap_samples, + random_state=random_state, + ) + + def _estimate_slate_confidence_interval_by_bootstrap( + self, + slate_id: np.ndarray, + estimated_rewards: np.ndarray, + alpha: float = 0.05, + n_bootstrap_samples: int = 10000, + random_state: Optional[int] = None, + ) -> Dict[str, float]: + """Estimate confidence interval of policy value by nonparametric bootstrap-like procedure. + + Parameters + ---------- + slate_id: array-like, shape (<= n_rounds * len_list,) + IDs to differentiate slates (i.e., rounds or lists of actions). + + estimated_rewards: array-like, shape (<= n_rounds * len_list,) + Rewards estimated by Cascade-DR given round (slate_id) and slot (position). + + alpha: float, default=0.05 + Significance level. + + n_bootstrap_samples: int, default=10000 + Number of resampling performed in the bootstrap procedure.
+ + random_state: int, default=None + Controls the random seed in bootstrap sampling. + + Returns + ---------- + estimated_confidence_interval: Dict[str, float] + Dictionary storing the estimated mean and upper-lower confidence bounds. + + """ + unique_slate = np.unique(slate_id) + # sum estimated_rewards in each slate + estimated_round_rewards = list() + for slate in unique_slate: + estimated_round_rewards.append(estimated_rewards[slate_id == slate].sum()) + estimated_round_rewards = np.array(estimated_round_rewards) + return estimate_confidence_interval_by_bootstrap( + samples=estimated_round_rewards, + alpha=alpha, + n_bootstrap_samples=n_bootstrap_samples, + random_state=random_state, + ) + + @dataclass class BaseSlateSelfNormalizedInverseProbabilityWeighting( BaseSlateInverseProbabilityWeighting @@ -573,7 +922,7 @@ class BaseSlateSelfNormalizedInverseProbabilityWeighting( """Base Class of Self-Normalized Inverse Probability Weighting Estimators for the slate contextual bandit setting. len_list: int (> 1) - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface (i.e., slate size). When Open Bandit Dataset is used, `len_list=3`. """ @@ -599,10 +948,12 @@ def _estimate_round_rewards( IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. behavior_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Marginal probabilities of behavior policy selecting an action (propensity scores) at each position (slot) `k` or + joint probabilities of behavior policy selecting a set of actions. evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Marginal probabilities of evaluation policy selecting an action at each position (slot) `k` or + joint probabilities of evaluation policy selecting a set of actions. Returns ---------- @@ -674,11 +1025,13 @@ def _estimate_round_rewards( position: array-like, shape (<= n_rounds * len_list,) IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. - behavior_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + pscore: array-like, shape (<= n_rounds * len_list,) + Joint probabilities of behavior policy selecting a slate action (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + This parameter must be unique in each slate. evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate.
Returns ---------- diff --git a/obp/ope/estimators_tuning.py b/obp/ope/estimators_tuning.py index eaf9e989..a5127617 100644 --- a/obp/ope/estimators_tuning.py +++ b/obp/ope/estimators_tuning.py @@ -17,7 +17,10 @@ from .estimators import DoublyRobust from .estimators import DoublyRobustWithShrinkage from .estimators import InverseProbabilityWeighting +from .estimators import SubGaussianDoublyRobust +from .estimators import SubGaussianInverseProbabilityWeighting from .estimators import SwitchDoublyRobust +from .helper import estimate_student_t_lower_bound @dataclass @@ -31,12 +34,21 @@ class BaseOffPolicyEstimatorTuning: lambdas: List[float] A list of candidate hyperparameter values. + tuning_method: str, default="slope". + A method used to tune the hyperparameter of an OPE estimator. + Must be either of "slope" or "mse". + Note that the implementation of "slope" is based on SLOPE++ proposed by Tucker and Lee.(2021), + which improves the original SLOPE proposed by Su et al.(2020). + use_bias_upper_bound: bool, default=True Whether to use bias upper bound in hyperparameter tuning. - If False, direct bias estimator is used to estimate the MSE. + If False, the direct bias estimator is used to estimate the MSE. See Su et al.(2020) for details. delta: float, default=0.05 - A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality. + A confidence delta to construct a high probability upper bound based on Bernstein inequality. + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. References ---------- @@ -46,10 +58,17 @@ class BaseOffPolicyEstimatorTuning: Yi Su, Maria Dimakopoulou, Akshay Krishnamurthy, and Miroslav Dudik. "Doubly Robust Off-Policy Evaluation with Shrinkage.", 2020. + Yi Su, Pavithra Srinath, and Akshay Krishnamurthy. + "Adaptive Estimator Selection for Off-Policy Evaluation.", 2020. + + George Tucker and Jonathan Lee. + "Improved Estimator Selection for Off-Policy Evaluation.", 2021. 
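A minimal usage sketch of the tuning interface documented above; the candidate lambdas and the toy data below are illustrative placeholders, not values recommended by the library:

import numpy as np
from obp.ope import DoublyRobustTuning

rng = np.random.default_rng(12345)
n_rounds, n_actions, len_list = 1000, 5, 1

# toy logged bandit data with the shapes documented above (values are random placeholders)
reward = rng.binomial(1, 0.3, size=n_rounds)
action = rng.integers(n_actions, size=n_rounds)
pscore = np.full(n_rounds, 1.0 / n_actions)  # uniform behavior policy
action_dist = rng.dirichlet(np.ones(n_actions), size=n_rounds)[:, :, np.newaxis]
estimated_rewards_by_reg_model = rng.uniform(size=(n_rounds, n_actions, len_list))

# candidate clipping hyperparameters; the best value is chosen from the logged data,
# either by SLOPE++ (tuning_method="slope") or by the MSE-based procedure (tuning_method="mse")
dr_tuned = DoublyRobustTuning(
    lambdas=[10.0, 50.0, 100.0, np.inf],
    tuning_method="slope",
)
estimated_value = dr_tuned.estimate_policy_value(
    reward=reward,
    action=action,
    pscore=pscore,
    action_dist=action_dist,
    estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
)

Note that "slope" selects a candidate using only confidence intervals of the candidate estimates, whereas "mse" relies on the bias-variance based MSE estimate described in Su et al.(2020).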
+ """ base_ope_estimator: BaseOffPolicyEstimator = field(init=False) lambdas: List[float] = None + tuning_method: str = "slope" use_bias_upper_bound: bool = True delta: float = 0.05 use_estimated_pscore: bool = False @@ -58,7 +77,7 @@ def __new__(cls, *args, **kwargs): dataclass(cls) return super().__new__(cls) - def _check_lambdas(self) -> None: + def _check_lambdas(self, min_val: float = 0.0, max_val: float = np.inf) -> None: """Check type and value of lambdas.""" if isinstance(self.lambdas, list): if len(self.lambdas) == 0: @@ -68,7 +87,8 @@ def _check_lambdas(self) -> None: hyperparam_, name="an element of lambdas", target_type=(int, float), - min_val=0.0, + min_val=min_val, + max_val=max_val, ) if hyperparam_ != hyperparam_: raise ValueError("an element of lambdas must not be nan") @@ -77,10 +97,15 @@ def _check_lambdas(self) -> None: def _check_init_inputs(self) -> None: """Initialize Class.""" + if self.tuning_method not in ["slope", "mse"]: + raise ValueError( + "`tuning_method` must be either 'slope' or 'mse'" + f", but {self.tuning_method} is given" + ) if not isinstance(self.use_bias_upper_bound, bool): raise TypeError( "`use_bias_upper_bound` must be a bool" - ", but {type(self.use_bias_upper_bound)} is given" + f", but {type(self.use_bias_upper_bound)} is given" ) check_scalar(self.delta, "delta", (float), min_val=0.0, max_val=1.0) if not isinstance(self.use_estimated_pscore, bool): @@ -88,7 +113,7 @@ def _check_init_inputs(self) -> None: f"`use_estimated_pscore` must be a bool, but {type(self.use_estimated_pscore)} is given" ) - def _tune_hyperparam( + def _tune_hyperparam_with_mse( self, reward: np.ndarray, action: np.ndarray, @@ -96,8 +121,8 @@ def _tune_hyperparam( action_dist: np.ndarray, estimated_rewards_by_reg_model: Optional[np.ndarray] = None, position: Optional[np.ndarray] = None, - ) -> None: - """Find the best hyperparameter value from the given candidate set.""" + ) -> float: + """Find the best hyperparameter value from the candidate set by estimating the mse.""" self.estimated_mse_score_dict = dict() for hyperparam_ in self.lambdas: estimated_mse_score = self.base_ope_estimator( @@ -113,9 +138,55 @@ def _tune_hyperparam( delta=self.delta, ) self.estimated_mse_score_dict[hyperparam_] = estimated_mse_score - self.best_hyperparam = min( - self.estimated_mse_score_dict.items(), key=lambda x: x[1] - )[0] + return min(self.estimated_mse_score_dict.items(), key=lambda x: x[1])[0] + + def _tune_hyperparam_with_slope( + self, + reward: np.ndarray, + action: np.ndarray, + pscore: np.ndarray, + action_dist: np.ndarray, + estimated_rewards_by_reg_model: Optional[np.ndarray] = None, + position: Optional[np.ndarray] = None, + ) -> float: + """Find the best hyperparameter value from the candidate set by SLOPE.""" + C = np.sqrt(6) - 1 + theta_list, cnf_list = [], [] + theta_list_for_sort, cnf_list_for_sort = [], [] + for hyperparam_ in self.lambdas: + estimated_round_rewards = self.base_ope_estimator( + hyperparam_ + )._estimate_round_rewards( + reward=reward, + action=action, + pscore=pscore, + action_dist=action_dist, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + position=position, + ) + theta_list_for_sort.append(estimated_round_rewards.mean()) + cnf = estimated_round_rewards.mean() + cnf -= estimate_student_t_lower_bound( + x=estimated_round_rewards, + delta=self.delta, + ) + cnf_list_for_sort.append(cnf) + + sorted_idx_list = np.argsort(cnf_list_for_sort)[::-1] + for i, idx in enumerate(sorted_idx_list): + cnf_i = cnf_list_for_sort[idx] + theta_i = 
theta_list_for_sort[idx] + if len(theta_list) < 1: + theta_list.append(theta_i), cnf_list.append(cnf_i) + else: + theta_j, cnf_j = np.array(theta_list), np.array(cnf_list) + if (np.abs(theta_j - theta_i) <= cnf_i + C * cnf_j).all(): + theta_list.append(theta_i), cnf_list.append(cnf_i) + else: + best_idx = sorted_idx_list[i - 1] + return self.lambdas[best_idx] + + return self.lambdas[sorted_idx_list[-1]] def estimate_policy_value_with_tuning( self, @@ -132,34 +203,34 @@ def estimate_policy_value_with_tuning( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. Returns ---------- V_hat: float - Policy value estimated by the DR estimator. + Estimated policy value of evaluation policy. 
""" if self.use_estimated_pscore: @@ -170,14 +241,24 @@ def estimate_policy_value_with_tuning( pscore_ = pscore # tune hyperparameter if necessary if not hasattr(self, "best_hyperparam"): - self._tune_hyperparam( - reward=reward, - action=action, - pscore=pscore_, - action_dist=action_dist, - estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, - position=position, - ) + if self.tuning_method == "mse": + self.best_hyperparam = self._tune_hyperparam_with_mse( + reward=reward, + action=action, + pscore=pscore_, + action_dist=action_dist, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + position=position, + ) + elif self.tuning_method == "slope": + self.best_hyperparam = self._tune_hyperparam_with_slope( + reward=reward, + action=action, + pscore=pscore_, + action_dist=action_dist, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + position=position, + ) return self.base_ope_estimator( lambda_=self.best_hyperparam, use_estimated_pscore=self.use_estimated_pscore @@ -185,7 +266,7 @@ def estimate_policy_value_with_tuning( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, estimated_pscore=estimated_pscore, action_dist=action_dist, estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, @@ -210,29 +291,29 @@ def estimate_interval_with_tuning( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. 
+ Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. alpha: float, default=0.05 Significance level. @@ -257,20 +338,30 @@ def estimate_interval_with_tuning( pscore_ = pscore # tune hyperparameter if necessary if not hasattr(self, "best_hyperparam"): - self._tune_hyperparam( - reward=reward, - action=action, - pscore=pscore_, - action_dist=action_dist, - estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, - position=position, - ) + if self.tuning_method == "mse": + self.best_hyperparam = self._tune_hyperparam_with_mse( + reward=reward, + action=action, + pscore=pscore_, + action_dist=action_dist, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + position=position, + ) + elif self.tuning_method == "slope": + self.best_hyperparam = self._tune_hyperparam_with_slope( + reward=reward, + action=action, + pscore=pscore_, + action_dist=action_dist, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + position=position, + ) return self.base_ope_estimator(self.best_hyperparam).estimate_interval( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, estimated_pscore=estimated_pscore, action_dist=action_dist, estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, @@ -287,15 +378,24 @@ class InverseProbabilityWeightingTuning(BaseOffPolicyEstimatorTuning): ---------- lambdas: List[float] A list of candidate clipping hyperparameters. - The automatic hyperparameter tuning proposed by Su et al.(2020) - will choose the best hyperparameter value from the data. + The automatic hyperparameter tuning procedure proposed by Su et al.(2020) + or Tucker and Lee.(2021) will choose the best hyperparameter value from the logged data. + + tuning_method: str, default="slope". + A method used to tune the hyperparameter of an OPE estimator. + Must be either of "slope" or "mse". + Note that the implementation of "slope" is based on SLOPE++ proposed by Tucker and Lee.(2021), + which improves the original SLOPE proposed by Su et al.(2020). use_bias_upper_bound: bool, default=True Whether to use bias upper bound in hyperparameter tuning. - If False, direct bias estimator is used to estimate the MSE. + If False, the direct bias estimator is used to estimate the MSE. See Su et al.(2020) for details. delta: float, default=0.05 - A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality. + A confidence delta to construct a high probability upper bound based on Bernstein inequality. + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. estimator_name: str, default='ipw'. Name of the estimator. @@ -333,31 +433,31 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. 
+ Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. Returns ---------- V_hat: float - Estimated policy value (performance) of a given evaluation policy. + Estimated policy value of evaluation policy. """ check_array(array=reward, name="reward", expected_dim=1) @@ -382,7 +482,7 @@ def estimate_policy_value( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, action_dist=action_dist, estimated_pscore=estimated_pscore, ) @@ -405,27 +505,27 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) Action choice probabilities - by the evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + by the evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) 
estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. alpha: float, default=0.05 Significance level. @@ -464,7 +564,7 @@ def estimate_interval( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, estimated_pscore=estimated_pscore, action_dist=action_dist, alpha=alpha, @@ -481,8 +581,17 @@ class DoublyRobustTuning(BaseOffPolicyEstimatorTuning): ---------- lambdas: List[float] A list of candidate clipping hyperparameters. - The automatic hyperparameter tuning proposed by Su et al.(2020) - will choose the best hyperparameter value from the data. + The automatic hyperparameter tuning procedure proposed by Su et al.(2020) + or Tucker and Lee.(2021) will choose the best hyperparameter value from the logged data. + + tuning_method: str, default="slope". + A method used to tune the hyperparameter of an OPE estimator. + Must be either of "slope" or "mse". + Note that the implementation of "slope" is based on SLOPE++ proposed by Tucker and Lee.(2021), + which improves the original SLOPE proposed by Su et al.(2020). + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. estimator_name: str, default='dr'. Name of the estimator. @@ -522,34 +631,34 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. 
+ (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. Returns ---------- V_hat: float - Policy value estimated by the DR estimator. + Estimated policy value of evaluation policy. """ check_array( @@ -580,7 +689,7 @@ def estimate_policy_value( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, estimated_pscore=estimated_pscore, action_dist=action_dist, estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, @@ -605,29 +714,29 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. alpha: float, default=0.05 Significance level. 
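As a rough usage sketch of the tuning interface documented above (not part of the diff; it assumes this branch of obp is installed and uses a synthetic dataset and a uniform-random evaluation policy purely for illustration), `tuning_method` selects between the MSE-based procedure of Su et al. (2020) and the SLOPE++ procedure of Tucker and Lee (2021):

import numpy as np
from sklearn.linear_model import LogisticRegression

from obp.dataset import SyntheticBanditDataset, logistic_reward_function
from obp.ope import DoublyRobustTuning, InverseProbabilityWeightingTuning, RegressionModel

# synthetic logged bandit data (illustrative only)
dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    reward_function=logistic_reward_function,
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10000)

# a uniform-random evaluation policy, shape (n_rounds, n_actions, len_list=1)
n_rounds, n_actions = bandit_feedback["n_rounds"], dataset.n_actions
action_dist = np.full((n_rounds, n_actions, 1), 1.0 / n_actions)

# reward model \hat{q}(x, a) needed by the DR estimator
regression_model = RegressionModel(n_actions=n_actions, base_model=LogisticRegression(max_iter=1000))
estimated_rewards = regression_model.fit_predict(
    context=bandit_feedback["context"],
    action=bandit_feedback["action"],
    reward=bandit_feedback["reward"],
    n_folds=3,
    random_state=12345,
)

# the tuning procedure picks a clipping hyperparameter from `lambdas`
ipw_tuned = InverseProbabilityWeightingTuning(
    lambdas=[10.0, 50.0, 100.0, 500.0, 1000.0],
    tuning_method="slope",
)
dr_tuned = DoublyRobustTuning(
    lambdas=[10.0, 50.0, 100.0, 500.0, 1000.0],
    tuning_method="mse",
)

v_ipw = ipw_tuned.estimate_policy_value(
    reward=bandit_feedback["reward"],
    action=bandit_feedback["action"],
    pscore=bandit_feedback["pscore"],
    action_dist=action_dist,
)
v_dr = dr_tuned.estimate_policy_value(
    reward=bandit_feedback["reward"],
    action=bandit_feedback["action"],
    pscore=bandit_feedback["pscore"],
    action_dist=action_dist,
    estimated_rewards_by_reg_model=estimated_rewards,
)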
@@ -672,7 +781,7 @@ def estimate_interval( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, estimated_pscore=estimated_pscore, action_dist=action_dist, estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, @@ -690,8 +799,17 @@ class SwitchDoublyRobustTuning(BaseOffPolicyEstimatorTuning): ---------- lambdas: List[float] A list of candidate switching hyperparameters. - The automatic hyperparameter tuning proposed by Su et al.(2020) - will choose the best hyperparameter value from the data. + The automatic hyperparameter tuning procedure proposed by Su et al.(2020) + or Tucker and Lee.(2021) will choose the best hyperparameter value from the logged data. + + tuning_method: str, default="slope". + A method used to tune the hyperparameter of an OPE estimator. + Must be either of "slope" or "mse". + Note that the implementation of "slope" is based on SLOPE++ proposed by Tucker and Lee.(2021), + which improves the original SLOPE proposed by Su et al.(2020). + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. estimator_name: str, default='switch-dr'. Name of the estimator. @@ -730,34 +848,34 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. 
+ If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. Returns ---------- V_hat: float - Policy value estimated by the DR estimator. + Estimated policy value of evaluation policy. """ check_array( @@ -788,7 +906,7 @@ def estimate_policy_value( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, estimated_pscore=estimated_pscore, action_dist=action_dist, estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, @@ -813,29 +931,29 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. alpha: float, default=0.05 Significance level. @@ -880,7 +998,7 @@ def estimate_interval( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, estimated_pscore=estimated_pscore, action_dist=action_dist, estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, @@ -898,8 +1016,17 @@ class DoublyRobustWithShrinkageTuning(BaseOffPolicyEstimatorTuning): ---------- lambdas: List[float] A list of candidate shrinkage hyperparameters. - The automatic hyperparameter tuning proposed by Su et al.(2020) - will choose the best hyperparameter value from the data. 
+ The automatic hyperparameter tuning procedure proposed by Su et al.(2020) + or Tucker and Lee.(2021) will choose the best hyperparameter value from the logged data. + + tuning_method: str, default="slope". + A method used to tune the hyperparameter of an OPE estimator. + Must be either of "slope" or "mse". + Note that the implementation of "slope" is based on SLOPE++ proposed by Tucker and Lee.(2021), + which improves the original SLOPE proposed by Su et al.(2020). + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. estimator_name: str, default='dr-os'. Name of the estimator. @@ -938,34 +1065,34 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. Returns ---------- V_hat: float - Policy value estimated by the DR estimator. + Estimated policy value of evaluation policy. 
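The `use_estimated_pscore` flag added to these tuning classes is meant for settings where the true propensities are not logged. A minimal sketch continuing the synthetic example above; the multinomial logistic regression here is just a stand-in propensity model, not a choice prescribed by the library:

import numpy as np
from sklearn.linear_model import LogisticRegression

from obp.ope import DoublyRobustWithShrinkageTuning

context = bandit_feedback["context"]
action = bandit_feedback["action"]
reward = bandit_feedback["reward"]

# stand-in propensity model: \hat{\pi}_b(a|x) via multinomial logistic regression
propensity_model = LogisticRegression(max_iter=1000)
propensity_model.fit(context, action)
proba = propensity_model.predict_proba(context)  # shape (n_rounds, n_classes)
# map logged actions to the classifier's column order before indexing
action_col = np.searchsorted(propensity_model.classes_, action)
estimated_pscore = proba[np.arange(action.shape[0]), action_col]

dr_os_tuned = DoublyRobustWithShrinkageTuning(
    lambdas=[10.0, 100.0, 1000.0, 10000.0],
    tuning_method="slope",
    use_estimated_pscore=True,  # consume `estimated_pscore` instead of the true `pscore`
)
v_hat = dr_os_tuned.estimate_policy_value(
    reward=reward,
    action=action,
    action_dist=action_dist,
    estimated_rewards_by_reg_model=estimated_rewards,
    estimated_pscore=estimated_pscore,
)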
""" check_array( @@ -996,7 +1123,7 @@ def estimate_policy_value( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, estimated_pscore=estimated_pscore, action_dist=action_dist, estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, @@ -1021,29 +1148,29 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. If self.use_estimated_pscore is False, pscore must be given. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. - When None is given, the effect of position on the reward will be ignored. - (If only a single action is chosen at each round, you can just ignore this argument.) + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. - If self.use_estimated_pscore is True, estimated_pscore must be given. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. alpha: float, default=0.05 Significance level. @@ -1088,7 +1215,7 @@ def estimate_interval( reward=reward, action=action, position=position, - pscore=pscore, + pscore=pscore_, estimated_pscore=estimated_pscore, action_dist=action_dist, estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, @@ -1096,3 +1223,423 @@ def estimate_interval( n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) + + +class SubGaussianInverseProbabilityWeightingTuning(BaseOffPolicyEstimatorTuning): + """Sub-Gaussian Inverse Probability Weighting (SG-IPW) with built-in hyperparameter tuning. + + Parameters + ---------- + lambdas: List[float] + A list of candidate hyperparameter values, which should be in the range of [0.0, 1.0]. 
+ The automatic hyperparameter tuning procedure proposed by Su et al.(2020) + or Tucker and Lee.(2021) will choose the best hyperparameter value from the logged data. + + tuning_method: str, default="slope". + A method used to tune the hyperparameter of an OPE estimator. + Must be either of "slope" or "mse". + Note that the implementation of "slope" is based on SLOPE++ proposed by Tucker and Lee.(2021), + which improves the original SLOPE proposed by Su et al.(2020). + + use_bias_upper_bound: bool, default=True + Whether to use bias upper bound in hyperparameter tuning. + If False, the direct bias estimator is used to estimate the MSE. See Su et al.(2020) for details. + + delta: float, default=0.05 + A confidence delta to construct a high probability upper bound based on Bernstein inequality. + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. + + estimator_name: str, default='sg-ipw'. + Name of the estimator. + + References + ---------- + Miroslav Dudík, Dumitru Erhan, John Langford, and Lihong Li. + "Doubly Robust Policy Evaluation and Optimization.", 2014. + + Yi Su, Maria Dimakopoulou, Akshay Krishnamurthy, and Miroslav Dudik. + "Doubly Robust Off-Policy Evaluation with Shrinkage.", 2020. + + Alberto Maria Metelli, Alessio Russo, and Marcello Restelli. + "Subgaussian and Differentiable Importance Sampling for Off-Policy Evaluation and Learning.", 2021. + + """ + + estimator_name: str = "sg-ipw" + + def __post_init__(self) -> None: + """Initialize Class.""" + self.base_ope_estimator = SubGaussianInverseProbabilityWeighting + super()._check_lambdas(max_val=1.0) + super()._check_init_inputs() + + def estimate_policy_value( + self, + reward: np.ndarray, + action: np.ndarray, + action_dist: np.ndarray, + pscore: Optional[np.ndarray] = None, + position: Optional[np.ndarray] = None, + estimated_pscore: Optional[np.ndarray] = None, + **kwargs, + ) -> float: + """Estimate the policy value of evaluation policy with a tuned hyperparameter. + + Parameters + ---------- + reward: array-like, shape (n_rounds,) + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. + + action: array-like, shape (n_rounds,) + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. + + action_dist: array-like, shape (n_rounds, n_actions, len_list) + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. + + pscore: array-like, shape (n_rounds,), default=None + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + If self.use_estimated_pscore is False, pscore must be given. + + position: array-like, shape (n_rounds,), default=None + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) + + estimated_pscore: array-like, shape (n_rounds,), default=None + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. + + Returns + ---------- + V_hat: float + Estimated policy value. 
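For intuition about the `lambdas` range of [0.0, 1.0] documented above, the SG-IPW family shrinks the vanilla importance weights with the harmonic correction of Metelli et al. (2021). The following standalone numpy snippet illustrates that correction (it is an illustration of the formula from the paper, not the library's own code) and shows the two extremes of the hyperparameter range:

import numpy as np

def sg_correction(iw: np.ndarray, lambda_: float) -> np.ndarray:
    """Harmonic sub-Gaussian correction of importance weights (Metelli et al., 2021)."""
    return iw / ((1.0 - lambda_) + lambda_ * iw)

iw = np.array([0.1, 0.5, 1.0, 5.0, 50.0])  # vanilla importance weights pi_e / pi_b
print(sg_correction(iw, lambda_=0.0))   # lambda = 0 recovers the vanilla weights
print(sg_correction(iw, lambda_=0.3))   # intermediate values shrink the large weights most
print(sg_correction(iw, lambda_=1.0))   # lambda = 1 flattens every weight to 1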
+ + """ + check_array(array=reward, name="reward", expected_dim=1) + check_array(array=action, name="action", expected_dim=1) + if self.use_estimated_pscore: + check_array(array=estimated_pscore, name="estimated_pscore", expected_dim=1) + pscore_ = estimated_pscore + else: + check_array(array=pscore, name="pscore", expected_dim=1) + pscore_ = pscore + check_ope_inputs( + action_dist=action_dist, + position=position, + action=action, + reward=reward, + pscore=pscore_, + ) + if position is None: + position = np.zeros(action_dist.shape[0], dtype=int) + + return super().estimate_policy_value_with_tuning( + reward=reward, + action=action, + position=position, + pscore=pscore_, + action_dist=action_dist, + ) + + def estimate_interval( + self, + reward: np.ndarray, + action: np.ndarray, + action_dist: np.ndarray, + pscore: Optional[np.ndarray] = None, + position: Optional[np.ndarray] = None, + estimated_pscore: Optional[np.ndarray] = None, + alpha: float = 0.05, + n_bootstrap_samples: int = 10000, + random_state: Optional[int] = None, + **kwargs, + ) -> Dict[str, float]: + """Estimate confidence interval of policy value by nonparametric bootstrap procedure. + + Parameters + ---------- + reward: array-like, shape (n_rounds,) + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. + + action: array-like, shape (n_rounds,) + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. + + action_dist: array-like, shape (n_rounds, n_actions, len_list) + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. + + pscore: array-like, shape (n_rounds,), default=None + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + If self.use_estimated_pscore is False, pscore must be given. + + position: array-like, shape (n_rounds,), default=None + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) + + estimated_pscore: array-like, shape (n_rounds,), default=None + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. + + alpha: float, default=0.05 + Significance level. + + n_bootstrap_samples: int, default=10000 + Number of resampling performed in the bootstrap procedure. + + random_state: int, default=None + Controls the random seed in bootstrap sampling. + + Returns + ---------- + estimated_confidence_interval: Dict[str, float] + Dictionary storing the estimated mean and upper-lower confidence bounds. 
+ + """ + check_array(array=reward, name="reward", expected_dim=1) + check_array(array=action, name="action", expected_dim=1) + if self.use_estimated_pscore: + check_array(array=estimated_pscore, name="estimated_pscore", expected_dim=1) + pscore_ = estimated_pscore + else: + check_array(array=pscore, name="pscore", expected_dim=1) + pscore_ = pscore + check_ope_inputs( + action_dist=action_dist, + position=position, + action=action, + reward=reward, + pscore=pscore_, + ) + if position is None: + position = np.zeros(action_dist.shape[0], dtype=int) + + return super().estimate_interval_with_tuning( + reward=reward, + action=action, + position=position, + pscore=pscore_, + action_dist=action_dist, + alpha=alpha, + n_bootstrap_samples=n_bootstrap_samples, + random_state=random_state, + ) + + +@dataclass +class SubGaussianDoublyRobustTuning(BaseOffPolicyEstimatorTuning): + """Sub-Gaussian Doubly Robust (SG-DR) with built-in hyperparameter tuning. + + Parameters + ---------- + lambdas: List[float] + A list of candidate hyperparameter values, which should be in the range of [0.0, 1.0]. + The automatic hyperparameter tuning procedure proposed by Su et al.(2020) + or Tucker and Lee.(2021) will choose the best hyperparameter value from the logged data. + + tuning_method: str, default="slope". + A method used to tune the hyperparameter of an OPE estimator. + Must be either of "slope" or "mse". + Note that the implementation of "slope" is based on SLOPE++ proposed by Tucker and Lee.(2021), + which improves the original SLOPE proposed by Su et al.(2020). + + use_estimated_pscore: bool, default=False. + If True, `estimated_pscore` is used, otherwise, `pscore` (the true propensity scores) is used. + + estimator_name: str, default='sg-dr'. + Name of the estimator. + + References + ---------- + Miroslav Dudík, Dumitru Erhan, John Langford, and Lihong Li. + "Doubly Robust Policy Evaluation and Optimization.", 2014. + + Yi Su, Maria Dimakopoulou, Akshay Krishnamurthy, and Miroslav Dudik. + "Doubly Robust Off-Policy Evaluation with Shrinkage.", 2020. + + Alberto Maria Metelli, Alessio Russo, and Marcello Restelli. + "Subgaussian and Differentiable Importance Sampling for Off-Policy Evaluation and Learning.", 2021. + + """ + + estimator_name: str = "sg-dr" + + def __post_init__(self) -> None: + """Initialize Class.""" + self.base_ope_estimator = SubGaussianDoublyRobust + super()._check_lambdas(max_val=1.0) + super()._check_init_inputs() + + def estimate_policy_value( + self, + reward: np.ndarray, + action: np.ndarray, + action_dist: np.ndarray, + estimated_rewards_by_reg_model: np.ndarray, + pscore: Optional[np.ndarray] = None, + position: Optional[np.ndarray] = None, + estimated_pscore: Optional[np.ndarray] = None, + **kwargs, + ) -> float: + """Estimate the policy value of evaluation policy with a tuned hyperparameter. + + Parameters + ---------- + reward: array-like, shape (n_rounds,) + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. + + action: array-like, shape (n_rounds,) + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. + + action_dist: array-like, shape (n_rounds, n_actions, len_list) + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. + + estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. 
+ + pscore: array-like, shape (n_rounds,), default=None + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + If self.use_estimated_pscore is False, pscore must be given. + + position: array-like, shape (n_rounds,), default=None + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) + + estimated_pscore: array-like, shape (n_rounds,), default=None + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. + + Returns + ---------- + V_hat: float + Estimated policy value of evaluation policy. + + """ + check_array( + array=estimated_rewards_by_reg_model, + name="estimated_rewards_by_reg_model", + expected_dim=3, + ) + check_array(array=reward, name="reward", expected_dim=1) + check_array(array=action, name="action", expected_dim=1) + if self.use_estimated_pscore: + check_array(array=estimated_pscore, name="estimated_pscore", expected_dim=1) + pscore_ = estimated_pscore + else: + check_array(array=pscore, name="pscore", expected_dim=1) + pscore_ = pscore + check_ope_inputs( + action_dist=action_dist, + position=position, + action=action, + reward=reward, + pscore=pscore_, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + ) + if position is None: + position = np.zeros(action_dist.shape[0], dtype=int) + + return super().estimate_policy_value_with_tuning( + reward=reward, + action=action, + position=position, + pscore=pscore_, + action_dist=action_dist, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + ) + + def estimate_interval( + self, + reward: np.ndarray, + action: np.ndarray, + action_dist: np.ndarray, + estimated_rewards_by_reg_model: np.ndarray, + pscore: Optional[np.ndarray] = None, + position: Optional[np.ndarray] = None, + estimated_pscore: Optional[np.ndarray] = None, + alpha: float = 0.05, + n_bootstrap_samples: int = 10000, + random_state: Optional[int] = None, + **kwargs, + ) -> Dict[str, float]: + """Estimate confidence interval of policy value by nonparametric bootstrap procedure. + + Parameters + ---------- + reward: array-like, shape (n_rounds,) + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. + + action: array-like, shape (n_rounds,) + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. + + action_dist: array-like, shape (n_rounds, n_actions, len_list) + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. + + estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. + + pscore: array-like, shape (n_rounds,), default=None + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + If self.use_estimated_pscore is False, pscore must be given. + + position: array-like, shape (n_rounds,), default=None + Position in a recommendation interface where the action was presented. + If None is given, the effect of position on the reward will be ignored. + (If only a single action is chosen for each data, you can just ignore this argument.) 
+ + estimated_pscore: array-like, shape (n_rounds,), default=None + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. + If `self.use_estimated_pscore` is True, `estimated_pscore` must be given. + + alpha: float, default=0.05 + Significance level. + + n_bootstrap_samples: int, default=10000 + Number of resampling performed in the bootstrap procedure. + + random_state: int, default=None + Controls the random seed in bootstrap sampling. + + Returns + ---------- + estimated_confidence_interval: Dict[str, float] + Dictionary storing the estimated mean and upper-lower confidence bounds. + + """ + check_array( + array=estimated_rewards_by_reg_model, + name="estimated_rewards_by_reg_model", + expected_dim=3, + ) + check_array(array=reward, name="reward", expected_dim=1) + check_array(array=action, name="action", expected_dim=1) + if self.use_estimated_pscore: + check_array(array=estimated_pscore, name="estimated_pscore", expected_dim=1) + pscore_ = estimated_pscore + else: + check_array(array=pscore, name="pscore", expected_dim=1) + pscore_ = pscore + check_ope_inputs( + action_dist=action_dist, + position=position, + action=action, + reward=reward, + pscore=pscore_, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + ) + if position is None: + position = np.zeros(action_dist.shape[0], dtype=int) + + return super().estimate_interval_with_tuning( + reward=reward, + action=action, + position=position, + pscore=pscore_, + action_dist=action_dist, + estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, + alpha=alpha, + n_bootstrap_samples=n_bootstrap_samples, + random_state=random_state, + ) diff --git a/obp/ope/helper.py b/obp/ope/helper.py index 6cd0176f..61031e1e 100644 --- a/obp/ope/helper.py +++ b/obp/ope/helper.py @@ -4,6 +4,10 @@ from typing import Optional import numpy as np +from numpy import log +from numpy import sqrt +from numpy import var +from scipy import stats from sklearn.utils import check_scalar @@ -18,10 +22,10 @@ def estimate_bias_in_ope( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_t`. iw: array-like, shape (n_rounds,) - Importance weight in each round of the logged bandit feedback, i.e., :math:`w(x,a)=\\pi_e(a|x)/ \\pi_b(a|x)`. + Importance weight for each data in logged bandit data, i.e., :math:`w(x,a)=\\pi_e(a|x)/ \\pi_b(a|x)`. iw_hat: array-like, shape (n_rounds,) Importance weight (IW) modified by a hyparpareter. How IW is modified depends on the estimator as follows. @@ -31,7 +35,7 @@ def estimate_bias_in_ope( where :math:`\\lambda` is a hyperparameter value. q_hat: array-like, shape (n_rounds,), default=None - Estimated expected reward given context :math:`x_t` and action :math:`a_t`. + Estimated expected reward given context :math:`x_i` and action :math:`a_i`. Returns ---------- @@ -45,9 +49,8 @@ def estimate_bias_in_ope( "Doubly Robust Off-Policy Evaluation with Shrinkage.", 2020. """ - n_rounds = reward.shape[0] if q_hat is None: - q_hat = np.zeros(n_rounds) + q_hat = np.zeros(reward.shape[0]) estimated_bias_arr = (iw - iw_hat) * (reward - q_hat) estimated_bias = np.abs(estimated_bias_arr.mean()) @@ -66,10 +69,10 @@ def estimate_high_probability_upper_bound_bias( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. 
+ Reward observed for each data in logged bandit data, i.e., :math:`r_t`. iw: array-like, shape (n_rounds,) - Importance weight in each round of the logged bandit feedback, i.e., :math:`w(x,a)=\\pi_e(a|x)/ \\pi_b(a|x)`. + Importance weight for each data in logged bandit data, i.e., :math:`w(x,a)=\\pi_e(a|x)/ \\pi_b(a|x)`. iw_hat: array-like, shape (n_rounds,) Importance weight (IW) modified by a hyparpareter. How IW is modified depends on the estimator as follows. @@ -79,10 +82,10 @@ def estimate_high_probability_upper_bound_bias( where :math:`\\lambda` and :math:`\\lambda` are hyperparameters. q_hat: array-like, shape (n_rounds,), default=None - Estimated expected reward given context :math:`x_t` and action :math:`a_t`. + Estimated expected reward given context :math:`x_i` and action :math:`a_i`. delta: float, default=0.05 - A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality. + A confidence delta to construct a high probability upper bound based on Bernstein inequality. Returns ---------- @@ -99,14 +102,132 @@ def estimate_high_probability_upper_bound_bias( """ check_scalar(delta, "delta", (int, float), min_val=0.0, max_val=1.0) - bias_upper_bound = estimate_bias_in_ope( + estimated_bias = estimate_bias_in_ope( reward=reward, iw=iw, iw_hat=iw_hat, q_hat=q_hat, ) - n_rounds = reward.shape[0] - bias_upper_bound += np.sqrt((2 * (iw ** 2).mean() * np.log(2 / delta)) / n_rounds) - bias_upper_bound += (2 * iw.max() * np.log(2 / delta)) / (3 * n_rounds) + n = reward.shape[0] + bias_upper_bound = estimated_bias + bias_upper_bound += sqrt((2 * (iw ** 2).mean() * log(2 / delta)) / n) + bias_upper_bound += (2 * iw.max() * log(2 / delta)) / (3 * n) return bias_upper_bound + + +def estimate_hoeffding_lower_bound( + x: np.ndarray, x_max: Optional[float] = None, delta: float = 0.05 +) -> float: + """Estimate a high probability lower bound of mean of random variables by Hoeffding Inequality. + + Parameters + ---------- + x: array-like, shape (n,) + Size n of independent real-valued bounded random variables of interest. + + x_max: float, default=None. + A maximum value of random variable `x`. + If None is given, this is estimated from the given samples. + + delta: float, default=0.05 + A confidence delta to construct a high probability lower bound. + + Returns + ---------- + lower_bound_estimate: float + A high probability lower bound of mean of random variables `x` estimated by Hoeffding Inequality. + See page 3 of Thomas et al.(2015) for details. + + References + ---------- + Philip S. Thomas, Georgios Theocharous, and Mohammad Ghavamzadeh. + "High Confidence Off-Policy Evaluation.", 2015. + + """ + if x_max is None: + x_max = x.max() + else: + check_scalar(x_max, "x_max", (int, float), min_val=x.max()) + check_scalar(delta, "delta", (int, float), min_val=0.0, max_val=1.0) + + n = x.shape[0] + ci = x_max * sqrt(log(1.0 / delta) / (2 * n)) + lower_bound_estimate = x.mean() - ci + + return lower_bound_estimate + + +def estimate_bernstein_lower_bound( + x: np.ndarray, x_max: Optional[float], delta: float = 0.05 +) -> float: + """Estimate a high probability lower bound of mean of random variables by empirical Bernstein Inequality. + + Parameters + ---------- + x: array-like, shape (n, ) + Size n of independent real-valued bounded random variables of interest. + + x_max: float, default=None. + A maximum value of random variable `x`. + If None is given, this is estimated from the given samples. 
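The high-confidence lower-bound helpers added to obp/ope/helper.py (Hoeffding, empirical Bernstein, and Student t; see the function bodies above and below) can be sanity-checked numerically. A small sketch, assuming this branch is installed so that the new functions are importable from obp.ope.helper:

import numpy as np

from obp.ope.helper import (
    estimate_bernstein_lower_bound,
    estimate_hoeffding_lower_bound,
    estimate_student_t_lower_bound,
)

rng = np.random.default_rng(12345)
x = rng.binomial(n=1, p=0.3, size=1000).astype(float)  # bounded rewards in {0, 1}, true mean 0.3

# each helper returns a lower bound on E[x] that holds with probability at least 1 - delta
print(estimate_hoeffding_lower_bound(x, x_max=1.0, delta=0.05))
print(estimate_bernstein_lower_bound(x, x_max=1.0, delta=0.05))
print(estimate_student_t_lower_bound(x, delta=0.05))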
+ + delta: float, default=0.05 + A confidence delta to construct a high probability lower bound. + + Returns + ---------- + lower_bound_estimate: float + A high probability lower bound of mean of random variables `x` estimated by Hoeffding Inequality. + See page 3 of Thomas et al.(2015) for details. + + References + ---------- + Philip S. Thomas, Georgios Theocharous, and Mohammad Ghavamzadeh. + "High Confidence Off-Policy Evaluation.", 2015. + + """ + if x_max is None: + x_max = x.max() + else: + check_scalar(x_max, "x_max", (int, float), min_val=x.max()) + check_scalar(delta, "delta", (int, float), min_val=0.0, max_val=1.0) + + n = x.shape[0] + ci1 = 7 * x_max * log(2.0 / delta) / (3 * (n - 1)) + ci2 = sqrt(2 * log(2.0 / delta) * var(x) / (n - 1)) + lower_bound_estimate = x.mean() - ci1 - ci2 + + return lower_bound_estimate + + +def estimate_student_t_lower_bound(x: np.ndarray, delta: float = 0.05) -> float: + """Estimate a high probability lower bound of mean of random variables based on Student t distribution. + + Parameters + ---------- + x: array-like, shape (n, ) + Size n of independent real-valued bounded random variables of interest. + + delta: float, default=0.05 + A confidence delta to construct a high probability lower bound. + + Returns + ---------- + lower_bound_estimate: float + A high probability lower bound of mean of random variables `x` estimated based on Student t distribution. + See Section 2.4 of Thomas et al.(2015) for details. + + References + ---------- + Philip S. Thomas, Georgios Theocharous, and Mohammad Ghavamzadeh. + "High Confidence Off-Policy Improvement.", 2015. + + """ + check_scalar(delta, "delta", (int, float), min_val=0.0, max_val=1.0) + + n = x.shape[0] + ci = sqrt(var(x) / (n - 1)) * stats.t(n - 1).ppf(1.0 - delta) + lower_bound_estimate = x.mean() - ci + + return lower_bound_estimate diff --git a/obp/ope/meta.py b/obp/ope/meta.py index 78455e4e..430661ff 100644 --- a/obp/ope/meta.py +++ b/obp/ope/meta.py @@ -30,12 +30,12 @@ @dataclass class OffPolicyEvaluation: - """Class to conduct OPE by multiple estimators simultaneously. + """Class to conduct OPE with multiple estimators simultaneously. Parameters ----------- bandit_feedback: BanditFeedback - Logged bandit feedback data used to conduct OPE. + Logged bandit data used to conduct OPE. ope_estimators: List[BaseOffPolicyEstimator] List of OPE estimators used to evaluate the policy value of evaluation policy. @@ -216,29 +216,28 @@ def estimate_policy_values( Parameters ------------ action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given each round, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given each round, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. 
+ If None, model-dependent estimators such as DM and DR cannot be used. estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. estimated_importance_weights: array-like, shape (n_rounds,) or Dict[str, array-like], default=None - Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. - Returns ---------- policy_value_dict: Dict[str, float] - Dictionary containing estimated policy values by OPE estimators. + Dictionary containing the policy values estimated by OPE estimators. """ if self.is_model_dependent: @@ -279,21 +278,21 @@ def estimate_intervals( Parameters ------------ action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. + If None, model-dependent estimators such as DM and DR cannot be used. estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. estimated_importance_weights: array-like, shape (n_rounds,) or Dict[str, array-like], default=None - Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. 
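To make the array-vs-dict convention in the docstrings above concrete, here is a rough sketch continuing the synthetic example from earlier: an array-like input is shared by all estimators, while a dict keyed by `estimator_name` applies only to the named estimators.

from obp.ope import (
    DirectMethod,
    DoublyRobust,
    InverseProbabilityWeighting,
    OffPolicyEvaluation,
)

ope = OffPolicyEvaluation(
    bandit_feedback=bandit_feedback,
    ope_estimators=[
        InverseProbabilityWeighting(estimator_name="ipw"),
        DirectMethod(estimator_name="dm"),
        DoublyRobust(estimator_name="dr"),
    ],
)

# dict input: only "dm" and "dr" receive the reward-model predictions; "ipw" does not need them
policy_values = ope.estimate_policy_values(
    action_dist=action_dist,
    estimated_rewards_by_reg_model={"dm": estimated_rewards, "dr": estimated_rewards},
)

# array input: shared by every estimator that uses it
intervals = ope.estimate_intervals(
    action_dist=action_dist,
    estimated_rewards_by_reg_model=estimated_rewards,
    alpha=0.05,
    n_bootstrap_samples=1000,
    random_state=12345,
)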
@@ -360,23 +359,23 @@ def summarize_off_policy_estimates( Parameters ------------ action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given each round, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given each round, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. + If None, model-dependent estimators such as DM and DR cannot be used. estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. estimated_importance_weights: array-like, shape (n_rounds,) or Dict[str, array-like], default=None - Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. alpha: float, default=0.05 Significance level. @@ -390,7 +389,7 @@ def summarize_off_policy_estimates( Returns ---------- (policy_value_df, policy_value_interval_df): Tuple[DataFrame, DataFrame] - Policy values and their confidence intervals Estimated by OPE estimators. + Policy values and their confidence intervals estimated by OPE estimators. """ policy_value_df = DataFrame( @@ -448,23 +447,23 @@ def visualize_off_policy_estimates( Parameters ---------- action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. 
- When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. + If None, model-dependent estimators such as DM and DR cannot be used. estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. estimated_importance_weights: array-like, shape (n_rounds,) or Dict[str, array-like], default=None - Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. alpha: float, default=0.05 Significance level. @@ -541,13 +540,13 @@ def evaluate_performance_of_estimators( estimated_importance_weights: Optional[ Union[np.ndarray, Dict[str, np.ndarray]] ] = None, - metric: str = "relative-ee", + metric: str = "se", ) -> Dict[str, float]: - """Evaluate estimation performance of OPE estimators. + """Evaluate the accuracy of OPE estimators. Note ------ - Evaluate the estimation performance of OPE estimators by relative estimation error (relative-EE) or squared error (SE): + Evaluate the estimation performance of OPE estimators with relative estimation error (relative-EE) or squared error (SE): .. math :: @@ -558,7 +557,7 @@ def evaluate_performance_of_estimators( \\text{SE} (\\hat{V}; \\mathcal{D}) = \\left(\\hat{V}(\\pi; \\mathcal{D}) - V(\\pi) \\right)^2, where :math:`V({\\pi})` is the ground-truth policy value of the evalation policy :math:`\\pi_e` (often estimated using on-policy estimation). - :math:`\\hat{V}(\\pi; \\mathcal{D})` is an estimated policy value by an OPE estimator :math:`\\hat{V}` and logged bandit feedback :math:`\\mathcal{D}`. + :math:`\\hat{V}(\\pi; \\mathcal{D})` is the policy value estimated by an OPE estimator :math:`\\hat{V}` and logged bandit feedback :math:`\\mathcal{D}`. Parameters ---------- @@ -567,32 +566,32 @@ def evaluate_performance_of_estimators( With Open Bandit Dataset, we use an on-policy estimate of the policy value as its ground-truth. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. 
+ Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. + If None, model-dependent estimators such as DM and DR cannot be used. estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. estimated_importance_weights: array-like, shape (n_rounds,) or Dict[str, array-like], default=None - Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. - metric: str, default="relative-ee" + metric: str, default="se" Evaluation metric used to evaluate and compare the estimation performance of OPE estimators. - Must be "relative-ee" or "se". + Must be either "relative-ee" or "se". Returns ---------- eval_metric_ope_dict: Dict[str, float] - Dictionary containing evaluation metric for evaluating the estimation performance of OPE estimators. + Dictionary containing the value of evaluation metric for the estimation performance of OPE estimators. """ check_scalar( @@ -602,11 +601,11 @@ def evaluate_performance_of_estimators( ) if metric not in ["relative-ee", "se"]: raise ValueError( - f"metric must be either 'relative-ee' or 'se', but {metric} is given" + f"`metric` must be either 'relative-ee' or 'se', but {metric} is given" ) if metric == "relative-ee" and ground_truth_policy_value == 0.0: raise ValueError( - "ground_truth_policy_value must be non-zero when metric is relative-ee" + "`ground_truth_policy_value` must be non-zero when metric is relative-ee" ) eval_metric_ope_dict = dict() @@ -640,7 +639,7 @@ def summarize_estimators_comparison( estimated_importance_weights: Optional[ Union[np.ndarray, Dict[str, np.ndarray]] ] = None, - metric: str = "relative-ee", + metric: str = "se", ) -> DataFrame: """Summarize performance comparisons of OPE estimators. @@ -651,23 +650,23 @@ def summarize_estimators_comparison( With Open Bandit Dataset, we use an on-policy estimate of the policy value as ground-truth. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. - When it is not given, model-dependent estimators such as DM and DR cannot be used. 
+ Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. + If None, model-dependent estimators such as DM and DR cannot be used. estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. estimated_importance_weights: array-like, shape (n_rounds,) or Dict[str, array-like], default=None - Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. - metric: str, default="relative-ee" + metric: str, default="se" Evaluation metric used to evaluate and compare the estimation performance of OPE estimators. Must be either "relative-ee" or "se". @@ -716,21 +715,21 @@ def visualize_off_policy_estimates_of_multiple_policies( List of the names of evaluation policies. action_dist_list: List[array-like, shape (n_rounds, n_actions, len_list)] - List of action choice probabilities by the evaluation policies (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + List of action choice probabilities of evaluation policies (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. When a dict is given, if the dict has the name of an estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + If None, model-dependent estimators such as DM and DR cannot be used. estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. estimated_importance_weights: array-like, shape (n_rounds,) or Dict[str, array-like], default=None - Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`, i.e., :math:`\\hat{w}(x_t, a_t)`. + Importance weights estimated via supervised classification implemented by `obp.ope.ImportanceWeightEstimator`. When an array-like is given, all OPE estimators use it. When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. 
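A hedged usage sketch for the `metric` handling documented above: it drives `evaluate_performance_of_estimators` with the new `"se"` default on synthetic data. The dataset settings and the `LogisticRegression` base learner are illustrative assumptions made only to keep the example self-contained, not part of this diff.

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    from obp.dataset import SyntheticBanditDataset
    from obp.ope import OffPolicyEvaluation, RegressionModel
    from obp.ope import DirectMethod, DoublyRobust, InverseProbabilityWeighting

    # synthetic logged bandit data (assumed setup; any bandit_feedback dict works)
    dataset = SyntheticBanditDataset(n_actions=10, dim_context=5, random_state=12345)
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10000)

    # a uniformly random evaluation policy, just to obtain some action_dist
    action_dist = np.full(
        (bandit_feedback["n_rounds"], dataset.n_actions, 1), 1.0 / dataset.n_actions
    )
    # ground-truth policy value of the evaluation policy on the synthetic data
    ground_truth = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback["expected_reward"], action_dist=action_dist
    )
    # q_hat for the model-dependent estimators (DM and DR)
    q_hat = RegressionModel(
        base_model=LogisticRegression(), n_actions=dataset.n_actions
    ).fit_predict(
        context=bandit_feedback["context"],
        action=bandit_feedback["action"],
        reward=bandit_feedback["reward"],
        n_folds=3,
        random_state=12345,
    )

    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=[InverseProbabilityWeighting(), DirectMethod(), DoublyRobust()],
    )
    squared_errors = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=q_hat,
        metric="se",  # the new default; "relative-ee" is still accepted
    )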
diff --git a/obp/ope/meta_continuous.py b/obp/ope/meta_continuous.py index 1a3e6e09..5b265330 100644 --- a/obp/ope/meta_continuous.py +++ b/obp/ope/meta_continuous.py @@ -34,7 +34,7 @@ class ContinuousOffPolicyEvaluation: Parameters ----------- bandit_feedback: BanditFeedback - Logged bandit feedback data with continuous actions used to conduct OPE. + Logged bandit data with continuous actions used to conduct OPE. ope_estimators: List[BaseOffPolicyEstimator] List of OPE estimators used to evaluate the policy value of evaluation policy. @@ -183,15 +183,15 @@ def estimate_policy_values( Continuous action values given by evaluation policy, i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds,) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. + If None, model-dependent estimators such as DM and DR cannot be used. Returns ---------- policy_value_dict: Dict[str, float] - Dictionary containing estimated policy values by OPE estimators. + Dictionary containing the policy values estimated by OPE estimators. """ if self.is_model_dependent: @@ -230,10 +230,10 @@ def estimate_intervals( Continuous action values given by the (deterministic) evaluation policy, i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. + If None, model-dependent estimators such as DM and DR cannot be used. alpha: float, default=0.05 Significance level. @@ -295,10 +295,10 @@ def summarize_off_policy_estimates( Continuous action values given by the (deterministic) evaluation policy, i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. 
+ If None, model-dependent estimators such as DM and DR cannot be used. alpha: float, default=0.05 Significance level. @@ -365,10 +365,10 @@ def visualize_off_policy_estimates( Continuous action values given by the (deterministic) evaluation policy, i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. + If None, model-dependent estimators such as DM and DR cannot be used. alpha: float, default=0.05 Significance level. @@ -441,11 +441,11 @@ def evaluate_performance_of_estimators( ] = None, metric: str = "relative-ee", ) -> Dict[str, float]: - """Evaluate estimation performance of OPE estimators. + """Evaluate the accuracy of OPE estimators. Note ------ - Evaluate the estimation performance of OPE estimators by relative estimation error (relative-EE) or squared error (SE): + Evaluate the estimation performance of OPE estimators with relative estimation error (relative-EE) or squared error (SE): .. math :: \\text{Relative-EE} (\\hat{V}; \\mathcal{D}) = \\left| \\frac{\\hat{V}(\\pi; \\mathcal{D}) - V(\\pi)}{V(\\pi)} \\right|, @@ -454,7 +454,7 @@ def evaluate_performance_of_estimators( \\text{SE} (\\hat{V}; \\mathcal{D}) = \\left(\\hat{V}(\\pi; \\mathcal{D}) - V(\\pi) \\right)^2, where :math:`V({\\pi})` is the ground-truth policy value of the evalation policy :math:`\\pi_e` (often estimated using on-policy estimation). - :math:`\\hat{V}(\\pi; \\mathcal{D})` is an estimated policy value by an OPE estimator :math:`\\hat{V}` and logged bandit feedback :math:`\\mathcal{D}`. + :math:`\\hat{V}(\\pi; \\mathcal{D})` is the policy value estimated by an OPE estimator :math:`\\hat{V}` and logged bandit feedback :math:`\\mathcal{D}`. Parameters ---------- @@ -466,19 +466,19 @@ def evaluate_performance_of_estimators( Continuous action values given by the (deterministic) evaluation policy, i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. - When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + When a dict with an estimator's name as its key is given, the corresponding value is used for the estimator. + If None, model-dependent estimators such as DM and DR cannot be used. metric: str, default="relative-ee" Evaluation metric to evaluate and compare the estimation performance of OPE estimators. - Must be "relative-ee" or "se". + Must be either "relative-ee" or "se". 
Returns ---------- eval_metric_ope_dict: Dict[str, float] - Dictionary containing evaluation metric for evaluating the estimation performance of OPE estimators. + Dictionary containing the value of evaluation metric for the estimation performance of OPE estimators. """ @@ -489,11 +489,11 @@ def evaluate_performance_of_estimators( ) if metric not in ["relative-ee", "se"]: raise ValueError( - f"metric must be either 'relative-ee' or 'se', but {metric} is given" + f"`metric` must be either 'relative-ee' or 'se', but {metric} is given" ) if metric == "relative-ee" and ground_truth_policy_value == 0.0: raise ValueError( - "ground_truth_policy_value must be non-zero when metric is relative-ee" + "`ground_truth_policy_value` must be non-zero when metric is relative-ee" ) eval_metric_ope_dict = dict() @@ -535,8 +535,8 @@ def summarize_estimators_comparison( Continuous action values given by the (deterministic) evaluation policy, i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. + If None, model-dependent estimators such as DM and DR cannot be used. metric: str, default="relative-ee" Evaluation metric to evaluate and compare the estimation performance of OPE estimators. @@ -584,10 +584,10 @@ def visualize_off_policy_estimates_of_multiple_policies( List of action values given by the (deterministic) evaluation policies, i.e., :math:`\\pi_e(x_t)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. When an array-like is given, all OPE estimators use it. When a dict is given, if the dict has the name of an estimator as a key, the corresponding value is used. - When it is not given, model-dependent estimators such as DM and DR cannot be used. + If None, model-dependent estimators such as DM and DR cannot be used. alpha: float, default=0.05 Significance level. diff --git a/obp/ope/meta_slate.py b/obp/ope/meta_slate.py index 27f4563a..a1fc3593 100644 --- a/obp/ope/meta_slate.py +++ b/obp/ope/meta_slate.py @@ -19,6 +19,7 @@ from ..types import BanditFeedback from ..utils import check_confidence_interval_arguments from .estimators_slate import BaseSlateOffPolicyEstimator +from .estimators_slate import SlateCascadeDoublyRobust as CascadeDR logger = getLogger(__name__) @@ -31,7 +32,7 @@ class SlateOffPolicyEvaluation: Parameters ----------- bandit_feedback: BanditFeedback - Logged bandit feedback data used for off-policy evaluation for the slate recommendation setting. + Logged bandit data used for off-policy evaluation for the slate recommendation setting. ope_estimators: List[BaseSlateOffPolicyEstimator] List of OPE estimators used to evaluate the policy value of evaluation policy. 
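For reference while reading the metric-related wording changes in these OPE meta classes, a minimal sketch of what the two supported metrics compute for a single estimator (variable names are illustrative). SE stays well defined when the ground-truth value is zero, which is exactly the case the `ValueError` above guards against for relative-EE.

    import numpy as np

    def relative_ee(v_hat: float, v: float) -> float:
        # relative estimation error: |(V_hat - V) / V|; undefined when V == 0
        return float(np.abs((v_hat - v) / v))

    def squared_error(v_hat: float, v: float) -> float:
        # squared error: (V_hat - V)^2; well defined even when V == 0
        return float((v_hat - v) ** 2)

    # e.g., an estimate of 0.55 against a ground-truth policy value of 0.5
    assert np.isclose(relative_ee(0.55, 0.5), 0.1)
    assert np.isclose(squared_error(0.55, 0.5), 0.0025)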
@@ -115,18 +116,30 @@ class SlateOffPolicyEvaluation: def __post_init__(self) -> None: """Initialize class.""" - for key_ in ["slate_id", "position", "reward"]: + for key_ in [ + "slate_id", + "context", + "action", + "reward", + "position", + ]: if key_ not in self.bandit_feedback: raise RuntimeError(f"Missing key of {key_} in 'bandit_feedback'.") + self.ope_estimators_ = dict() + self.use_cascade_dr = False for estimator in self.ope_estimators: self.ope_estimators_[estimator.estimator_name] = estimator + if isinstance(estimator, CascadeDR): + self.use_cascade_dr = True def _create_estimator_inputs( self, evaluation_policy_pscore: Optional[np.ndarray] = None, evaluation_policy_pscore_item_position: Optional[np.ndarray] = None, evaluation_policy_pscore_cascade: Optional[np.ndarray] = None, + evaluation_policy_action_dist: Optional[np.ndarray] = None, + q_hat: Optional[np.ndarray] = None, ) -> Dict[str, np.ndarray]: """Create input dictionary to estimate policy value by subclasses of `BaseSlateOffPolicyEstimator`""" if ( @@ -137,11 +150,18 @@ def _create_estimator_inputs( raise ValueError( "one of evaluation_policy_pscore, evaluation_policy_pscore_item_position, or evaluation_policy_pscore_cascade must be given" ) + if self.use_cascade_dr and evaluation_policy_action_dist is None: + raise ValueError( + "evaluation_policy_action_dist must be given when using SlateCascadeDoublyRobust" + ) + if self.use_cascade_dr and q_hat is None: + raise ValueError("q_hat must be given when using SlateCascadeDoublyRobust") estimator_inputs = { input_: self.bandit_feedback[input_] for input_ in [ "slate_id", + "action", "reward", "position", "pscore", @@ -157,6 +177,10 @@ def _create_estimator_inputs( estimator_inputs[ "evaluation_policy_pscore_cascade" ] = evaluation_policy_pscore_cascade + estimator_inputs[ + "evaluation_policy_action_dist" + ] = evaluation_policy_action_dist + estimator_inputs["q_hat"] = q_hat return estimator_inputs @@ -165,24 +189,39 @@ def estimate_policy_values( evaluation_policy_pscore: Optional[np.ndarray] = None, evaluation_policy_pscore_item_position: Optional[np.ndarray] = None, evaluation_policy_pscore_cascade: Optional[np.ndarray] = None, + evaluation_policy_action_dist: Optional[np.ndarray] = None, + q_hat: Optional[np.ndarray] = None, ) -> Dict[str, float]: """Estimate the policy value of evaluation policy. Parameters ------------ evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate. evaluation_policy_pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Marginal action choice probabilities of the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(a_{t, k}|x_t)`. + Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. 
- evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities above the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(\\{a_{t, j}\\}_{j \\le k}|x_t)`. + evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action, ) + Plackett-luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices). + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + Required when using SlateCascadeDoublyRobust. + + q_hat: array-like (n_rounds * len_list * n_unique_actions, ) + :math:`\\hat{Q}_k` for all unique actions + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. + Required when using SlateCascadeDoublyRobust. Returns ---------- policy_value_dict: Dict[str, float] - Dictionary containing estimated policy values by OPE estimators. + Dictionary containing the policy values estimated by OPE estimators. """ policy_value_dict = dict() @@ -190,6 +229,8 @@ def estimate_policy_values( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) for estimator_name, estimator in self.ope_estimators_.items(): policy_value_dict[estimator_name] = estimator.estimate_policy_value( @@ -203,6 +244,8 @@ def estimate_intervals( evaluation_policy_pscore: Optional[np.ndarray] = None, evaluation_policy_pscore_item_position: Optional[np.ndarray] = None, evaluation_policy_pscore_cascade: Optional[np.ndarray] = None, + evaluation_policy_action_dist: Optional[np.ndarray] = None, + q_hat: Optional[np.ndarray] = None, alpha: float = 0.05, n_bootstrap_samples: int = 100, random_state: Optional[int] = None, @@ -212,13 +255,25 @@ def estimate_intervals( Parameters ------------ evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate. evaluation_policy_pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Marginal action choice probabilities of the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(a_{t, k}|x_t)`. + Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. - evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities above the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(\\{a_{t, j}\\}_{j \\le k}|x_t)`. 
+ evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action, ) + Plackett-luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices). + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + + q_hat: array-like (n_rounds * len_list * n_unique_actions, ) + :math:`\\hat{Q}_k` for all unique actions + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. + Required when using SlateCascadeDoublyRobust. alpha: float, default=0.05 Significance level. @@ -246,6 +301,8 @@ def estimate_intervals( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) for estimator_name, estimator in self.ope_estimators_.items(): policy_value_interval_dict[estimator_name] = estimator.estimate_interval( @@ -262,6 +319,8 @@ def summarize_off_policy_estimates( evaluation_policy_pscore: Optional[np.ndarray] = None, evaluation_policy_pscore_item_position: Optional[np.ndarray] = None, evaluation_policy_pscore_cascade: Optional[np.ndarray] = None, + evaluation_policy_action_dist: Optional[np.ndarray] = None, + q_hat: Optional[np.ndarray] = None, alpha: float = 0.05, n_bootstrap_samples: int = 100, random_state: Optional[int] = None, @@ -271,13 +330,25 @@ def summarize_off_policy_estimates( Parameters ------------ evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate. evaluation_policy_pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Marginal action choice probabilities of the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(a_{t, k}|x_t)`. + Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. - evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities above the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(\\{a_{t, j}\\}_{j \\le k}|x_t)`. + evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action, ) + Plackett-luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices). + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + + q_hat: array-like (n_rounds * len_list * n_unique_actions, ) + :math:`\\hat{Q}_k` for all unique actions + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. 
+ Required when using SlateCascadeDoublyRobust. alpha: float, default=0.05 Significance level. @@ -291,7 +362,7 @@ def summarize_off_policy_estimates( Returns ---------- (policy_value_df, policy_value_interval_df): Tuple[DataFrame, DataFrame] - Policy values and their confidence intervals Estimated by OPE estimators. + Policy values and their confidence intervals estimated by OPE estimators. """ policy_value_df = DataFrame( @@ -299,6 +370,8 @@ def summarize_off_policy_estimates( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ), index=["estimated_policy_value"], ) @@ -307,6 +380,8 @@ def summarize_off_policy_estimates( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, @@ -333,6 +408,8 @@ def visualize_off_policy_estimates( evaluation_policy_pscore: Optional[np.ndarray] = None, evaluation_policy_pscore_item_position: Optional[np.ndarray] = None, evaluation_policy_pscore_cascade: Optional[np.ndarray] = None, + evaluation_policy_action_dist: Optional[np.ndarray] = None, + q_hat: Optional[np.ndarray] = None, alpha: float = 0.05, is_relative: bool = False, n_bootstrap_samples: int = 100, @@ -345,13 +422,25 @@ def visualize_off_policy_estimates( Parameters ---------- evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate. evaluation_policy_pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Marginal action choice probabilities of the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(a_{t, k}|x_t)`. + Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. - evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities above the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(\\{a_{t, j}\\}_{j \\le k}|x_t)`. + evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action, ) + Plackett-luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices). + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. 
+ + q_hat: array-like (n_rounds * len_list * n_unique_actions, ) + :math:`\\hat{Q}_k` for all unique actions + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. + Required when using SlateCascadeDoublyRobust. alpha: float, default=0.05 Significance level. @@ -383,6 +472,8 @@ def visualize_off_policy_estimates( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, @@ -428,13 +519,15 @@ def evaluate_performance_of_estimators( evaluation_policy_pscore: Optional[np.ndarray] = None, evaluation_policy_pscore_item_position: Optional[np.ndarray] = None, evaluation_policy_pscore_cascade: Optional[np.ndarray] = None, + evaluation_policy_action_dist: Optional[np.ndarray] = None, + q_hat: Optional[np.ndarray] = None, metric: str = "relative-ee", ) -> Dict[str, float]: - """Evaluate estimation performance of OPE estimators. + """Evaluate the accuracy of OPE estimators. Note ------ - Evaluate the estimation performance of OPE estimators by relative estimation error (relative-EE) or squared error (SE): + Evaluate the estimation performance of OPE estimators with relative estimation error (relative-EE) or squared error (SE): .. math :: @@ -445,7 +538,7 @@ def evaluate_performance_of_estimators( \\text{SE} (\\hat{V}; \\mathcal{D}) = \\left(\\hat{V}(\\pi; \\mathcal{D}) - V(\\pi) \\right)^2, where :math:`V({\\pi})` is the ground-truth policy value of the evalation policy :math:`\\pi_e` (often estimated using on-policy estimation). - :math:`\\hat{V}(\\pi; \\mathcal{D})` is an estimated policy value by an OPE estimator :math:`\\hat{V}` and logged bandit feedback :math:`\\mathcal{D}`. + :math:`\\hat{V}(\\pi; \\mathcal{D})` is the policy value estimated by an OPE estimator :math:`\\hat{V}` and logged bandit feedback :math:`\\mathcal{D}`. Parameters ---------- @@ -454,32 +547,44 @@ def evaluate_performance_of_estimators( With Open Bandit Dataset, in general, we use an on-policy estimate of the policy value as its ground-truth. evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate. evaluation_policy_pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Marginal action choice probabilities of the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(a_{t, k}|x_t)`. + Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. 
- evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities above the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(\\{a_{t, j}\\}_{j \\le k}|x_t)`. + evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action, ) + Plackett-luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices). + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + + q_hat: array-like (n_rounds * len_list * n_unique_actions, ) + :math:`\\hat{Q}_k` for all unique actions + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. + Required when using SlateCascadeDoublyRobust. metric: str, default="relative-ee" Evaluation metric used to evaluate and compare the estimation performance of OPE estimators. - Must be "relative-ee" or "se". + Must be either "relative-ee" or "se". Returns ---------- eval_metric_ope_dict: Dict[str, float] - Dictionary containing evaluation metric for evaluating the estimation performance of OPE estimators. + Dictionary containing the value of evaluation metric for the estimation performance of OPE estimators. """ check_scalar(ground_truth_policy_value, "ground_truth_policy_value", float) if metric not in ["relative-ee", "se"]: raise ValueError( - f"metric must be either 'relative-ee' or 'se', but {metric} is given" + f"`metric` must be either 'relative-ee' or 'se', but {metric} is given" ) if metric == "relative-ee" and ground_truth_policy_value == 0.0: raise ValueError( - "ground_truth_policy_value must be non-zero when metric is relative-ee" + "`ground_truth_policy_value` must be non-zero when metric is relative-ee" ) eval_metric_ope_dict = dict() @@ -487,6 +592,8 @@ def evaluate_performance_of_estimators( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) for estimator_name, estimator in self.ope_estimators_.items(): estimated_policy_value = estimator.estimate_policy_value(**estimator_inputs) @@ -505,6 +612,8 @@ def summarize_estimators_comparison( evaluation_policy_pscore: Optional[np.ndarray] = None, evaluation_policy_pscore_item_position: Optional[np.ndarray] = None, evaluation_policy_pscore_cascade: Optional[np.ndarray] = None, + evaluation_policy_action_dist: Optional[np.ndarray] = None, + q_hat: Optional[np.ndarray] = None, metric: str = "relative-ee", ) -> DataFrame: """Summarize performance comparisons of OPE estimators. @@ -516,13 +625,25 @@ def summarize_estimators_comparison( With Open Bandit Dataset, in general, we use an on-policy estimate of the policy value as ground-truth. evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Joint probabilities of evaluation policy selecting a slate action, i.e., :math:`\\pi_e(a_i|x_i)`. + This parameter must be unique in each slate. evaluation_policy_pscore_item_position: array-like, shape (<= n_rounds * len_list,) - Marginal action choice probabilities of the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(a_{t, k}|x_t)`. 
+ Marginal probabilities of evaluation policy selecting each action :math:`a` at position (slot) :math:`k`, i.e., :math:`\\pi_e(a_{t}(k) |x_t)`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + evaluation_policy_action_dist: array-like, shape (n_rounds * len_list * n_unique_action, ) + Plackett-Luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices). + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. - evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities above the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(\\{a_{t, j}\\}_{j \\le k}|x_t)`. + q_hat: array-like (n_rounds * len_list * n_unique_actions, ) + :math:`\\hat{Q}_k` for all unique actions + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. + Required when using SlateCascadeDoublyRobust. metric: str, default="relative-ee" Evaluation metric used to evaluate and compare the estimation performance of OPE estimators. @@ -540,6 +661,8 @@ def summarize_estimators_comparison( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, metric=metric, ), index=[metric], diff --git a/obp/ope/regression_model.py b/obp/ope/regression_model.py index d14f9427..83798f82 100644 --- a/obp/ope/regression_model.py +++ b/obp/ope/regression_model.py @@ -18,7 +18,7 @@ @dataclass class RegressionModel(BaseEstimator): - """Machine learning model to estimate the mean reward function (:math:`q(x,a):= \\mathbb{E}[r|x,a]`). + """Machine learning model to estimate the reward function (:math:`q(x,a):= \\mathbb{E}[r|x,a]`). Note ------- @@ -27,18 +27,18 @@ class RegressionModel(BaseEstimator): Parameters ------------ base_model: BaseEstimator - A machine learning model used to estimate the mean reward function. + A machine learning model used to estimate the reward function. n_actions: int Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. action_context: array-like, shape (n_actions, dim_action_context), default=None Context vector characterizing action (i.e., vector representation of each action). - If not given, one-hot encoding of the action variable is used as default. + If None, one-hot encoding of the action variable is used as default. fitting_method: str, default='normal' Method to fit the regression model.
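Because the `fitting_method` and `action_dist` requirements reworded in the hunks that follow come up repeatedly, here is a hedged sketch of the call pattern that satisfies them when 'iw' (or 'mrdr') is selected; the toy arrays are illustrative and only need to respect the documented shapes, with `action_dist` summing to one over the action axis.

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from obp.ope import RegressionModel

    n_rounds, n_actions, dim_context = 1000, 5, 4
    rng = np.random.default_rng(0)

    # toy logged data; in practice these come from bandit_feedback
    context = rng.normal(size=(n_rounds, dim_context))
    action = rng.integers(n_actions, size=n_rounds)
    reward = rng.binomial(1, 0.5, size=n_rounds)
    pscore = np.full(n_rounds, 1.0 / n_actions)  # uniform behavior policy

    # evaluation policy; shape (n_rounds, n_actions, len_list), sums to one over axis=1
    action_dist = np.full((n_rounds, n_actions, 1), 1.0 / n_actions)

    q_hat = RegressionModel(
        base_model=LogisticRegression(),
        n_actions=n_actions,
        fitting_method="iw",  # importance weighting; 'mrdr' has the same requirement
    ).fit_predict(
        context=context,
        action=action,
        reward=reward,
        pscore=pscore,
        action_dist=action_dist,  # required whenever fitting_method is 'iw' or 'mrdr'
        n_folds=2,
        random_state=0,
    )
    # q_hat has shape (n_rounds, n_actions, len_list)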
@@ -73,11 +73,11 @@ def __post_init__(self) -> None: and self.fitting_method in ["normal", "iw", "mrdr"] ): raise ValueError( - f"fitting_method must be one of 'normal', 'iw', or 'mrdr', but {self.fitting_method} is given" + f"`fitting_method` must be one of 'normal', 'iw', or 'mrdr', but {self.fitting_method} is given" ) if not isinstance(self.base_model, BaseEstimator): raise ValueError( - "base_model must be BaseEstimator or a child class of BaseEstimator" + "`base_model` must be BaseEstimator or a child class of BaseEstimator" ) self.base_model_list = [ @@ -95,7 +95,7 @@ def fit( position: Optional[np.ndarray] = None, action_dist: Optional[np.ndarray] = None, ) -> None: - """Fit the regression model on given logged bandit feedback data. + """Fit the regression model on given logged bandit data. Parameters ---------- @@ -103,23 +103,23 @@ def fit( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. - When None is given, behavior policy is assumed to be uniform. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. + If None is given, behavior policy is assumed to be uniform. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a regression model assumes that there is only one position. When `len_list` > 1, this position argument has to be set. action_dist: array-like, shape (n_rounds, n_actions, len_list), default=None - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. - When either of 'iw' or 'mrdr' is used as the 'fitting_method' argument, then `action_dist` must be given. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. + When either of 'iw' or 'mrdr' is used as `fitting_method`, `action_dist` must be given. 
""" check_bandit_feedback_inputs( @@ -137,19 +137,19 @@ def fit( else: if position.max() >= self.len_list: raise ValueError( - f"position elements must be smaller than len_list, but the maximum value is {position.max()} (>= {self.len_list})" + f"`position` elements must be smaller than `len_list`, but the maximum value is {position.max()} (>= {self.len_list})" ) if self.fitting_method in ["iw", "mrdr"]: if not (isinstance(action_dist, np.ndarray) and action_dist.ndim == 3): raise ValueError( - "when fitting_method is either 'iw' or 'mrdr', action_dist (a 3-dimensional ndarray) must be given" + "when `fitting_method` is either 'iw' or 'mrdr', `action_dist` (a 3-dimensional ndarray) must be given" ) if action_dist.shape != (n_rounds, self.n_actions, self.len_list): raise ValueError( - f"shape of action_dist must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}" + f"shape of `action_dist` must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}" ) if not np.allclose(action_dist.sum(axis=1), 1): - raise ValueError("action_dist must be a probability distribution") + raise ValueError("`action_dist` must be a probability distribution") if pscore is None: pscore = np.ones_like(action) / self.n_actions @@ -185,7 +185,7 @@ def fit( ) def predict(self, context: np.ndarray) -> np.ndarray: - """Predict the mean reward function. + """Predict the reward function. Parameters ----------- @@ -233,7 +233,7 @@ def fit_predict( n_folds: int = 1, random_state: Optional[int] = None, ) -> np.ndarray: - """Fit the regression model on given logged bandit feedback data and predict the reward function of the same data. + """Fit the regression model on given logged bandit data and predict the reward function of the same data. Note ------ @@ -246,24 +246,24 @@ def fit_predict( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,), default=None Action choice probabilities (propensity score) of a behavior policy in the training logged bandit feedback. - When None is given, the the behavior policy is assumed to be a uniform one. + If None is given, the the behavior policy is assumed to be a uniform one. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a regression model assumes that there is only one position. When `len_list` > 1, this position argument has to be set. action_dist: array-like, shape (n_rounds, n_actions, len_list), default=None - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. - When either of 'iw' or 'mrdr' is used as the 'fitting_method' argument, then `action_dist` must be given. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. 
+ When either of 'iw' or 'mrdr' is used as `fitting_method`, `action_dist` must be given. n_folds: int, default=1 Number of folds in the cross-fitting procedure. @@ -298,16 +298,16 @@ def fit_predict( else: if position.max() >= self.len_list: raise ValueError( - f"position elements must be smaller than len_list, but the maximum value is {position.max()} (>= {self.len_list})" + f"`position` elements must be smaller than `len_list`, but the maximum value is {position.max()} (>= {self.len_list})" ) if self.fitting_method in ["iw", "mrdr"]: if not (isinstance(action_dist, np.ndarray) and action_dist.ndim == 3): raise ValueError( - "when fitting_method is either 'iw' or 'mrdr', action_dist (a 3-dimensional ndarray) must be given" + "when `fitting_method` is either 'iw' or 'mrdr', `action_dist` (a 3-dimensional ndarray) must be given" ) if action_dist.shape != (n_rounds, self.n_actions, self.len_list): raise ValueError( - f"shape of action_dist must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}" + f"shape of `action_dist` must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}" ) if pscore is None: pscore = np.ones_like(action) / self.n_actions @@ -364,7 +364,7 @@ def _pre_process_for_reg_model( Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. action_context: array-like, shape shape (n_actions, dim_action_context) Context vector characterizing action (i.e., vector representation of each action). diff --git a/obp/ope/regression_model_slate.py b/obp/ope/regression_model_slate.py new file mode 100644 index 00000000..6b390e34 --- /dev/null +++ b/obp/ope/regression_model_slate.py @@ -0,0 +1,395 @@ +# Copyright (c) Yuta Saito, Yusuke Narita, and ZOZO Technologies, Inc. All rights reserved. +# Licensed under the Apache 2.0 License. + +"""Regression Model Class for Estimating the Q functions in Cascade-DR.""" +from dataclasses import dataclass + +import numpy as np +from sklearn.base import BaseEstimator +from sklearn.base import clone +from sklearn.base import is_classifier +from sklearn.utils import check_scalar + +from obp.utils import check_array + + +@dataclass +class SlateRegressionModel(BaseEstimator): + """Machine learning model to estimate the Q functions + + Note + ------- + Q function at position :math:`k` is defined as + :math:`\\hat{Q}_k := \\hat{Q}_k(x, a(1), \\ldots, a(k)) \\approx \\mathbb{E}[ \sum_{k'=k}^K \\alpha_{k'} r(k') | x, a(1), \\ldots, a(k)]`). + + Q function is estimated recursively, and then used to construct Cascade-DR. + Please refer to Section 3.1 of Kiyohara et al.(2022) for the detail. + + Parameters + ------------ + base_model: BaseEstimator + A machine learning model used to estimate the Q function. + + len_list: int + Length of a list of actions recommended in each impression (slate size). + When Open Bandit Dataset is used, 3 should be set. + + n_unique_action: int + Number of unique actions. + + fitting_method: str, default='normal' + Method to fit the regression model. + Must be either of ['normal', 'iw'] where 'iw' stands for importance weighting. + + Reference + ------------ + Haruka Kiyohara, Yuta Saito, Tatsuya Matsuhiro, Yusuke Narita, Nobuyuki Shimizu, and Yasuo Yamamoto. 
+ "Doubly Robust Off-Policy Evaluation for Ranking Policies under the Cascade Behavior Model.", 2022. + + """ + + base_model: BaseEstimator + len_list: int + n_unique_action: int + fitting_method: str = "normal" + + def __post_init__(self): + """Initialize Class.""" + check_scalar(self.n_unique_action, "n_unique_action", int, min_val=2) + check_scalar(self.len_list, "len_list", int, min_val=1) + if not ( + isinstance(self.fitting_method, str) + and self.fitting_method in ["normal", "iw"] + ): + raise ValueError( + f"fitting_method must be either 'normal' or 'iw', but {self.fitting_method} is given" + ) + if not isinstance(self.base_model, BaseEstimator): + raise ValueError( + "`base_model` must be BaseEstimator or a child class of BaseEstimator" + ) + if is_classifier(self.base_model): + raise ValueError("`base_model` must be a regressor, not a classifier") + self.base_model_list = [clone(self.base_model) for _ in range(self.len_list)] + self.action_context = np.eye(self.n_unique_action) + + def fit( + self, + context: np.ndarray, + action: np.ndarray, + reward: np.ndarray, + pscore_cascade: np.ndarray, + evaluation_policy_pscore_cascade: np.ndarray, + evaluation_policy_action_dist: np.ndarray, + ): + """Fit the regression model on given logged bandit data. + + Parameters + ---------- + context: array-like, shape (n_rounds, dim_context) + Context vectors observed for each data, i.e., :math:`x_i`. + + action: array-like, (n_rounds * len_list,) + Action observed at each slot in each round of the logged bandit feedback, i.e., :math:`a_{t}(k)`, + which is chosen by the behavior policy :math:`\\pi_b`. + + reward: array-like, shape (n_rounds * len_list,) + Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. + + pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of behavior policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of behavior policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + evaluation_policy_action_dist: array-like (n_rounds * len_list * n_unique_actions, ) + Plackett-luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices). + , i.e., :math:`\\pi_e({a'}_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall {a'}_t(k) \\in \\mathcal{A}`. 
+ + """ + check_array(array=context, name="context", expected_dim=2) + check_array(array=action, name="action", expected_dim=1) + check_array(array=reward, name="reward", expected_dim=1) + check_array(array=pscore_cascade, name="pscore_cascade", expected_dim=1) + check_array( + array=evaluation_policy_pscore_cascade, + name="evaluation_policy_pscore_cascade", + expected_dim=1, + ) + check_array( + array=evaluation_policy_action_dist, + name="evaluation_policy_action_dist", + expected_dim=1, + ) + if not ( + action.shape + == reward.shape + == pscore_cascade.shape + == evaluation_policy_pscore_cascade.shape + == (context.shape[0] * self.len_list,) + ): + raise ValueError( + "Expected `action.shape == reward.shape == pscore_cascade.shape == evaluation_policy_pscore_cascade.shape" + " == (context.shape[0] * len_list, )`" + ", but found it False" + ) + if evaluation_policy_action_dist.shape != ( + context.shape[0] * self.len_list * self.n_unique_action, + ): + raise ValueError( + "Expected `evaluation_policy_action_dist.shape == (context.shape[0] * len_list * n_unique_action, )`" + ", but found it False" + ) + if not ( + np.issubdtype(action.dtype, np.integer) + and action.min() >= 0 + and action.max() < self.n_unique_action + ): + raise ValueError( + "`action` elements must be integers in the range of [0, n_unique_action)" + ) + if np.any(pscore_cascade <= 0) or np.any(pscore_cascade > 1): + raise ValueError("`pscore_cascade` must be in the range of (0, 1]") + if np.any(evaluation_policy_pscore_cascade <= 0) or np.any( + evaluation_policy_pscore_cascade > 1 + ): + raise ValueError( + "`evaluation_policy_pscore_cascade` must be in the range of (0, 1]" + ) + if not np.allclose( + np.ones( + evaluation_policy_action_dist.reshape((-1, self.n_unique_action)).shape[ + 0 + ] + ), + evaluation_policy_action_dist.reshape((-1, self.n_unique_action)).sum( + axis=1 + ), + ): + raise ValueError( + "`evaluation_policy_action_dist[i * n_unique_action : (i+1) * n_unique_action]` " + "must sum up to one for all i." + ) + # (n_rounds_ * len_list, ) -> (n_rounds_, len_list) + action = action.reshape((-1, self.len_list)) + reward = reward.reshape((-1, self.len_list)) + iw = (evaluation_policy_pscore_cascade / pscore_cascade).reshape( + (-1, self.len_list) + ) + + # (n_rounds_, ) + n_rounds_ = len(action) + sample_weight = np.ones(n_rounds_) + + for position_ in range(self.len_list)[::-1]: + X, y = self._preprocess_for_reg_model( + context=context, + action=action, + reward=reward, + evaluation_policy_action_dist=evaluation_policy_action_dist, + position_=position_, + ) + + if self.fitting_method == "iw": + sample_weight = iw[:, position_] + + self.base_model_list[position_].fit(X, y, sample_weight=sample_weight) + + def predict( + self, + context: np.ndarray, + action: np.ndarray, + ): + """Predict the Q functions. + + Parameters + ----------- + context: array-like, shape (n_rounds_of_new_data, dim_context) + Context vectors for new data. + + action: array-like, shape (n_rounds_of_new_data * len_list, ) + Action vectors for new data. + + Returns + ----------- + q_hat: array-like, shape (n_rounds_of_new_data * len_list * n_unique_action, ) + Estimated Q functions for new data. + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. 
+ + """ + check_array(array=context, name="context", expected_dim=2) + check_array(array=action, name="action", expected_dim=1) + if action.shape != (context.shape[0] * self.len_list,): + raise ValueError( + "Expected `action.shape == (context.shape[0] * len_list, )`" + ", but found it False" + ) + n_rounds_of_new_data = len(context) + # (n_rounds_of_new_data * len_list, ) -> (n_rounds_of_new_data, len_list) + action = action.reshape((-1, self.len_list)) + # (n_rounds_, len_list, n_unique_action, ) + q_hat = np.zeros((n_rounds_of_new_data, self.len_list, self.n_unique_action)) + for position_ in range(self.len_list)[::-1]: + # the action vector shrinks every time as the position_ decreases + # (n_rounds_of_new_data, position_ - 1) + action = action[:, :position_] + # (n_rounds_of_new_data, dim_context) -> (n_rounds_of_new_data * n_unique_action, dim_context) + context_ = [] + # (n_rounds_of_new_data, position_) -> (n_rounds_of_new_data * n_unique_action, position_) + action_ = [] + for i in range(n_rounds_of_new_data): + for a_ in range(self.n_unique_action): + context_.append(context[i]) + action_.append(np.append(action[i], a_)) + # (n_rounds_of_new_data * n_unique_action, dim_context + position_) + X = np.concatenate([context_, action_], axis=1) + # (n_rounds_of_new_data * n_unique_action, ) -> (n_rounds_of_new_data, n_unique_action) + q_hat[:, position_, :] = ( + self.base_model_list[position_] + .predict(X) + .reshape((-1, self.n_unique_action)) + ) + # (n_rounds_of_new_data * len_list * n_unique_action, ) + return q_hat.flatten() + + def fit_predict( + self, + context: np.ndarray, + action: np.ndarray, + reward: np.ndarray, + pscore_cascade: np.ndarray, + evaluation_policy_pscore_cascade: np.ndarray, + evaluation_policy_action_dist: np.ndarray, + ): + """Fit the regression model on given logged bandit data and predict the Q functions of the same data. + + Parameters + ---------- + context: array-like, shape (n_rounds, dim_context) + Context vectors observed for each data, i.e., :math:`x_i`. + + action: array-like, (n_rounds * len_list,) + Action observed at each slot in each round of the logged bandit feedback, i.e., :math:`a_{t}(k)`, + which is chosen by the behavior policy :math:`\\pi_b`. + + reward: array-like, shape (n_rounds * len_list,) + Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. + + pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of behavior policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of behavior policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + evaluation_policy_pscore_cascade: array-like, shape (n_rounds * len_list,) + Joint probabilities of evaluation policy selecting action :math:`a_{1:k}` (actions presented at position (slot) `1` to `k`). + Each probability of evaluation policy selecting action :math:`a_k` (action presented at position (slot) `k`) is conditioned on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + evaluation_policy_action_dist: array-like (n_rounds * len_list * n_unique_actions, ) + Plackett-luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices). 
+ , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + + Returns + ----------- + q_hat: array-like, shape (n_rounds_of_new_data * len_list * n_unique_action, ) + Estimated Q functions for new data by the regression model. + + """ + self.fit( + context=context, + action=action, + reward=reward, + pscore_cascade=pscore_cascade, + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + # (n_rounds * len_list * n_unique_action, ) + return self.predict(context=context, action=action) + + def _preprocess_for_reg_model( + self, + context: np.ndarray, + action: np.ndarray, + reward: np.ndarray, + evaluation_policy_action_dist: np.ndarray, + position_: int, + ): + """Preprocess feature vectors and estimation target to train a given regression model. + + Note + ----- + Please override this method if you want to use another feature engineering method + for training the regression model. + + Parameters + ----------- + context: array-like, shape (n_rounds_, dim_context) + Context vectors in the training logged bandit feedback. + + action: array-like, shape (n_rounds_ * len_list, ) + Action observed at each slot in each round of the logged bandit feedback, i.e., :math:`a_{t}(k)`, + which is chosen by the behavior policy :math:`\\pi_b`. + + reward: array-like, shape (n_rounds_ * len_list, ) + Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. + + evaluation_policy_action_dist: array-like (n_rounds_ * len_list * n_unique_actions, ) + Plackett-Luce style action distribution induced by evaluation policy (action choice probabilities at each slot given previous action choices). + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. + + position_: int + Position id (slot) in a slate. + + Returns + ----------- + X: array-like, shape (n_rounds_, dim_context + position_ + 1) + Feature vectors (context concatenated with the action prefix up to `position_`) used to train the regression model. + + y: array-like, shape (n_rounds_, ) + Regression targets, i.e., the reward observed at `position_` plus the estimated Q value at the next position averaged over the evaluation policy.
+ + """ + n_rounds_ = len(context) + # (n_rounds_, len_list) -> (n_rounds_, position_) + action = action[:, : position_ + 1] + # (n_rounds_, len_list) -> (n_rounds_, ) + reward = reward[:, position_] + # estimator input + X = np.concatenate([context, action], axis=1) + # estimate the Q function at the next position + # (n_rounds_, ) + if position_ + 1 == self.len_list: + q_hat_at_next_position = np.zeros(n_rounds_) + else: + # (n_rounds_ * len_list * n_unique_action, ) -> (n_rounds_, len_list, n_unique_action) -> (n_rounds_, len_list) -> (n_rounds_ * n_unique_action, ) + evaluation_policy_action_dist_at_next_position = ( + evaluation_policy_action_dist.reshape( + (-1, self.len_list, self.n_unique_action) + )[:, position_ + 1, :] + ).flatten() + # (n_rounds_, dim_context) -> (n_rounds_ * n_unique_action, dim_context) + context_ = [] + # (n_rounds_, position_ + 1) -> (n_rounds_ * n_unique_action, position_ + 1) + action_ = [] + for i in range(n_rounds_): + for a_ in range(self.n_unique_action): + context_.append(context[i]) + action_.append(np.append(action[i], a_)) + X_ = np.concatenate([context_, action_], axis=1) + # (n_rounds_ * n_unique_action, ) -> (n_rounds_, ) + q_hat_at_next_position = self.base_model_list[position_ + 1].predict(X_) + # the expected Q function under the evaluation policy + # (n_rounds_ * n_unique_action, ) -> (n_rounds_, n_unique_action) -> (n_rounds_, ) + q_hat_at_next_position = ( + ( + evaluation_policy_action_dist_at_next_position + * q_hat_at_next_position + ) + .reshape((-1, self.n_unique_action)) + .sum(axis=1) + ) + # (n_rounds_, ) + y = reward + q_hat_at_next_position + # (n_rounds_, dim_context + position_), (n_rounds_, ) + return X, y diff --git a/obp/policy/base.py b/obp/policy/base.py index 52d33848..fcdff1e3 100644 --- a/obp/policy/base.py +++ b/obp/policy/base.py @@ -24,7 +24,7 @@ class BaseContextFreePolicy(metaclass=ABCMeta): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender inferface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 @@ -90,7 +90,7 @@ class BaseContextualPolicy(metaclass=ABCMeta): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender inferface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 @@ -153,7 +153,7 @@ class BaseOfflinePolicyLearner(metaclass=ABCMeta): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender inferface, slate size. When Open Bandit Dataset is used, 3 should be set. """ diff --git a/obp/policy/contextfree.py b/obp/policy/contextfree.py index 848d75a7..2940a278 100644 --- a/obp/policy/contextfree.py +++ b/obp/policy/contextfree.py @@ -31,7 +31,7 @@ class EpsilonGreedy(BaseContextFreePolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender inferface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 @@ -103,7 +103,7 @@ class Random(EpsilonGreedy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender inferface, slate size. When Open Bandit Dataset is used, 3 should be set. 
batch_size: int, default=1 @@ -156,7 +156,7 @@ class BernoulliTS(BaseContextFreePolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 diff --git a/obp/policy/linear.py b/obp/policy/linear.py index ad6f25c7..56edb807 100644 --- a/obp/policy/linear.py +++ b/obp/policy/linear.py @@ -24,7 +24,7 @@ class BaseLinPolicy(BaseContextualPolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 @@ -98,7 +98,7 @@ class LinEpsilonGreedy(BaseLinPolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 @@ -177,7 +177,7 @@ class LinUCB(BaseLinPolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 @@ -255,7 +255,7 @@ class LinTS(BaseLinPolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 diff --git a/obp/policy/logistic.py b/obp/policy/logistic.py index 58abdb35..916c03da 100644 --- a/obp/policy/logistic.py +++ b/obp/policy/logistic.py @@ -27,7 +27,7 @@ class BaseLogisticPolicy(BaseContextualPolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 @@ -113,7 +113,7 @@ class LogisticEpsilonGreedy(BaseLogisticPolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 @@ -180,7 +180,7 @@ class LogisticUCB(BaseLogisticPolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 @@ -254,7 +254,7 @@ class LogisticTS(BaseLogisticPolicy): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. batch_size: int, default=1 diff --git a/obp/policy/offline.py b/obp/policy/offline.py index c245bafd..34901e0c 100644 --- a/obp/policy/offline.py +++ b/obp/policy/offline.py @@ -43,7 +43,7 @@ class IPWLearner(BaseOfflinePolicyLearner): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size.
When Open Bandit Dataset is used, 3 should be set. base_classifier: ClassifierMixin @@ -85,13 +85,13 @@ def _create_train_data_for_opl( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,), default=None Propensity scores, the probability of selecting each action by behavior policy, in the given logged bandit data. @@ -124,29 +124,29 @@ def fit( \\hat{\\pi} & \\in \\arg \\max_{\\pi \\in \\Pi} \\hat{V}_{\\mathrm{IPW}} (\\pi ; \\mathcal{D}) \\\\ - & = \\arg \\max_{\\pi \\in \\Pi} \\mathbb{E}_{\\mathcal{D}} \\left[\\frac{\\mathbb{I} \\{\\pi (x_{i})=a_{i} \\}}{\\pi_{b}(a_{i} | x_{i})} r_{i} \\right] \\\\ - & = \\arg \\min_{\\pi \\in \\Pi} \\mathbb{E}_{\\mathcal{D}} \\left[\\frac{r_i}{\\pi_{b}(a_{i} | x_{i})} \\mathbb{I} \\{\\pi (x_{i}) \\neq a_{i} \\} \\right], + & = \\arg \\max_{\\pi \\in \\Pi} \\mathbb{E}_{n} \\left[\\frac{\\mathbb{I} \\{\\pi (x_{i})=a_{i} \\}}{\\pi_{b}(a_{i} | x_{i})} r_{i} \\right] \\\\ + & = \\arg \\min_{\\pi \\in \\Pi} \\mathbb{E}_{n} \\left[\\frac{r_i}{\\pi_{b}(a_{i} | x_{i})} \\mathbb{I} \\{\\pi (x_{i}) \\neq a_{i} \\} \\right], - where :math:`\\mathbb{E}_{\\mathcal{D}} [\cdot]` is the empirical average over observations in :math:`\\mathcal{D}`. + where :math:`\\mathbb{E}_{n} [\cdot]` is the empirical average over observations in :math:`\\mathcal{D}`. See the reference for the details. Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a learner assumes that there is only one position. When `len_list` > 1, position has to be set. @@ -361,7 +361,7 @@ class QLearner(BaseOfflinePolicyLearner): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. base_model: BaseEstimator @@ -411,19 +411,19 @@ def fit( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`.
+ Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a learner assumes that there is only one position. When `len_list` > 1, position has to be set. @@ -631,7 +631,7 @@ class NNPolicyLearner(BaseOfflinePolicyLearner): Number of actions. len_list: int, default=1 - Length of a list of actions recommended in each impression. + Length of a list of actions in a recommender interface, slate size. When Open Bandit Dataset is used, 3 should be set. dim_context: int @@ -918,20 +918,20 @@ def _create_train_data_for_opl( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,), default=None Propensity scores, the probability of selecting each action by behavior policy, in the given logged bandit data. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a learner assumes that there is only one position. Returns @@ -1009,19 +1009,19 @@ def fit( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`.
position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. If None is given, a learner assumes that there is only one position. When `len_list` > 1, position has to be set. Currently, this feature is not supported. @@ -1169,24 +1169,24 @@ def _estimate_policy_value( Parameters ----------- context: array-like, shape (batch_size, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (batch_size,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (batch_size,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (batch_size,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (batch_size, n_actions, len_list) - Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. Returns ---------- estimated_policy_value_arr: array-like, shape (batch_size,) - Rewards of each round estimated by an OPE estimator. + Rewards of each data estimated by an OPE estimator. """ if self.off_policy_objective == "dm": @@ -1234,13 +1234,13 @@ def _estimate_policy_constraint( Parameters ----------- action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. """ idx_tensor = torch.arange(action.shape[0], dtype=torch.long) @@ -1616,13 +1616,13 @@ def _create_train_data_for_q_func_estimation( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. 
Returns -------- @@ -1685,13 +1685,13 @@ def fit( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. """ check_bandit_feedback_inputs( @@ -1795,7 +1795,7 @@ def predict( Context vectors for new data. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (must be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. Returns ----------- diff --git a/obp/policy/offline_continuous.py b/obp/policy/offline_continuous.py index 5b2f7bd5..067a32a5 100644 --- a/obp/policy/offline_continuous.py +++ b/obp/policy/offline_continuous.py @@ -315,13 +315,13 @@ def _create_train_data_for_opl( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by a behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,), default=None Propensity scores, the probability of selecting each action by behavior policy in the given logged bandit data. @@ -399,16 +399,16 @@ def fit( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by a behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,), default=None - Action choice probabilities by a behavior policy (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities by a behavior policy (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. """ check_continuous_bandit_feedback_inputs( @@ -539,16 +539,16 @@ def _estimate_policy_value( Parameters ----------- context: Tensor, shape (batch_size, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: Tensor, shape (batch_size,) - Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. 
+ Continuous action values sampled by a behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: Tensor, shape (batch_size,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: Tensor, shape (batch_size,) - Action choice probabilities of a behavior policy (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of a behavior policy (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_by_current_policy: Tensor, shape (batch_size,) Continuous action values given by the current policy. @@ -556,7 +556,7 @@ def _estimate_policy_value( Returns ---------- estimated_policy_value_arr: array-like, shape (batch_size,) - Rewards of each round estimated by an OPE estimator. + Rewards of each data estimated by an OPE estimator. """ @@ -861,13 +861,13 @@ def _create_train_data_for_q_func_estimation( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by a behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. Returns -------- @@ -931,13 +931,13 @@ def fit( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by a behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. """ check_continuous_bandit_feedback_inputs( diff --git a/obp/simulator/simulator.py b/obp/simulator/simulator.py index 937cb959..b4c798eb 100755 --- a/obp/simulator/simulator.py +++ b/obp/simulator/simulator.py @@ -29,7 +29,7 @@ def run_bandit_simulation( Parameters ---------- bandit_feedback: BanditFeedback - Logged bandit feedback data used in offline bandit simulation. + Logged bandit data used in offline bandit simulation. policy: BanditPolicy Online bandit policy evaluated in offline bandit simulation (i.e., evaluation policy). @@ -105,7 +105,7 @@ def calc_ground_truth_policy_value( Parameters ---------- bandit_feedback: BanditFeedback - Logged bandit feedback data used in offline bandit simulation. + Logged bandit data used in offline bandit simulation. It must contain "expected_rewards". reward_sampler: Callable[[np.ndarray, np.ndarray], np.ndarray] diff --git a/obp/utils.py b/obp/utils.py index 90f9481e..0478f3c1 100755 --- a/obp/utils.py +++ b/obp/utils.py @@ -126,8 +126,8 @@ def convert_to_action_dist( Number of actions. selected_actions: array-like, shape (n_rounds, len_list) - Sequence of actions selected by evaluation policy - at each round in offline bandit simulation. 
+ Sequence of actions selected by evaluation policy + at each round in offline bandit simulation. Returns ---------- @@ -167,10 +167,12 @@ def check_array( """ if not isinstance(array, np.ndarray): - raise ValueError(f"{name} must be {expected_dim}D array, but got {type(array)}") + raise ValueError( + f"`{name}` must be {expected_dim}D array, but got {type(array)}" + ) if array.ndim != expected_dim: raise ValueError( - f"{name} must be {expected_dim}D array, but got {array.ndim}D array" + f"`{name}` must be {expected_dim}D array, but got {array.ndim}D array" ) @@ -195,11 +197,11 @@ def check_tensor( """ if not isinstance(tensor, torch.Tensor): raise ValueError( - f"{name} must be {expected_dim}D tensor, but got {type(tensor)}" + f"`{name}` must be {expected_dim}D tensor, but got {type(tensor)}" ) if tensor.ndim != expected_dim: raise ValueError( - f"{name} must be {expected_dim}D tensor, but got {tensor.ndim}D tensor" + f"`{name}` must be {expected_dim}D tensor, but got {tensor.ndim}D tensor" ) @@ -217,23 +219,22 @@ def check_bandit_feedback_inputs( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. expected_reward: array-like, shape (n_rounds, n_actions), default=None - Expected rewards (or outcome) in each round, i.e., :math:`\\mathbb{E}[r_t]`. + Expected reward of each data, i.e., :math:`\\mathbb{E}[r_i|x_i,a_i]`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. - pscore: array-like, shape (n_rounds,), default=None - Propensity scores, the probability of selecting each action by behavior policy, - in the given logged bandit data. + pscore: array-like, shape (n_rounds,) + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_context: array-like, shape (n_actions, dim_action_context) Context vectors characterizing each action. 
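To make the new message style above concrete, here is a minimal sketch of how a dimension mismatch is now reported; it assumes only that `check_array` is importable from `obp.utils`, as this diff shows.

    import numpy as np
    from obp.utils import check_array

    # a 1D array where a 2D context matrix is expected raises the backticked message
    try:
        check_array(array=np.ones(5), name="context", expected_dim=2)
    except ValueError as e:
        print(e)  # prints: `context` must be 2D array, but got 1D array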
@@ -242,9 +243,6 @@ def check_bandit_feedback_inputs( check_array(array=context, name="context", expected_dim=2) check_array(array=action, name="action", expected_dim=1) check_array(array=reward, name="reward", expected_dim=1) - if not (np.issubdtype(action.dtype, np.integer) and action.min() >= 0): - raise ValueError("action elements must be non-negative integers") - if expected_reward is not None: check_array(array=expected_reward, name="expected_reward", expected_dim=2) if not ( @@ -257,10 +255,17 @@ def check_bandit_feedback_inputs( "Expected `context.shape[0] == action.shape[0] == reward.shape[0] == expected_reward.shape[0]`" ", but found it False" ) - if action.max() >= expected_reward.shape[1]: + if not ( + np.issubdtype(action.dtype, np.integer) + and action.min() >= 0 + and action.max() < expected_reward.shape[1] + ): raise ValueError( - "action elements must be smaller than `expected_reward.shape[1]`" + "`action` elements must be integers in the range of [0, `expected_reward.shape[1]`)" ) + else: + if not (np.issubdtype(action.dtype, np.integer) and action.min() >= 0): + raise ValueError("`action` elements must be non-negative integers") if pscore is not None: check_array(array=pscore, name="pscore", expected_dim=1) if not ( @@ -271,7 +276,7 @@ def check_bandit_feedback_inputs( ", but found it False" ) if np.any(pscore <= 0): - raise ValueError("pscore must be positive") + raise ValueError("`pscore` must be positive") if position is not None: check_array(array=position, name="position", expected_dim=1) @@ -283,7 +288,7 @@ def check_bandit_feedback_inputs( ", but found it False" ) if not (np.issubdtype(position.dtype, np.integer) and position.min() >= 0): - raise ValueError("position elements must be non-negative integers") + raise ValueError("`position` elements must be non-negative integers") else: if not (context.shape[0] == action.shape[0] == reward.shape[0]): raise ValueError( @@ -292,10 +297,17 @@ def check_bandit_feedback_inputs( ) if action_context is not None: check_array(array=action_context, name="action_context", expected_dim=2) - if action.max() >= action_context.shape[0]: + if not ( + np.issubdtype(action.dtype, np.integer) + and action.min() >= 0 + and action.max() < action_context.shape[0] + ): raise ValueError( - "action elements must be smaller than `action_context.shape[0]`" + "`action` elements must be integers in the range of [0, `action_context.shape[0]`)" ) + else: + if not (np.issubdtype(action.dtype, np.integer) and action.min() >= 0): + raise ValueError("`action` elements must be non-negative integers") def check_ope_inputs( @@ -312,23 +324,22 @@ def check_ope_inputs( Parameters ----------- action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. position: array-like, shape (n_rounds,), default=None - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. action: array-like, shape (n_rounds,), default=None - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,), default=None - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. 
+ Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. - pscore: array-like, shape (n_rounds,), default=None - Propensity scores, the probability of selecting each action by behavior policy, - in the given logged bandit data. + pscore: array-like, shape (n_rounds,) + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. estimated_importance_weights: array-like, shape (n_rounds,), default=None Importance weights estimated via supervised classification, i.e., :math:`\\hat{w}(x_t, a_t)`. @@ -336,7 +347,7 @@ def check_ope_inputs( # action_dist check_array(array=action_dist, name="action_dist", expected_dim=3) if not np.allclose(action_dist.sum(axis=1), 1): - raise ValueError("action_dist must be a probability distribution") + raise ValueError("`action_dist` must be a probability distribution") # position if position is not None: @@ -346,14 +357,14 @@ def check_ope_inputs( "Expected `position.shape[0] == action_dist.shape[0]`, but found it False" ) if not (np.issubdtype(position.dtype, np.integer) and position.min() >= 0): - raise ValueError("position elements must be non-negative integers") + raise ValueError("`position` elements must be non-negative integers") if position.max() >= action_dist.shape[2]: raise ValueError( - "position elements must be smaller than `action_dist.shape[2]`" + "`position` elements must be smaller than `action_dist.shape[2]`" ) elif action_dist.shape[2] > 1: raise ValueError( - "position elements must be given when `action_dist.shape[2] > 1`" + "`position` elements must be given when `action_dist.shape[2] > 1`" ) # estimated_rewards_by_reg_model @@ -379,11 +390,13 @@ def check_ope_inputs( raise ValueError( "Expected `action.shape[0] == reward.shape[0]`, but found it False" ) - if not (np.issubdtype(action.dtype, np.integer) and action.min() >= 0): - raise ValueError("action elements must be non-negative integers") - if action.max() >= action_dist.shape[1]: + if not ( + np.issubdtype(action.dtype, np.integer) + and action.min() >= 0 + and action.max() < action_dist.shape[1] + ): raise ValueError( - "action elements must be smaller than `action_dist.shape[1]`" + "`action` elements must be integers in the range of [0, `action_dist.shape[1]`)" ) # pscore @@ -395,7 +408,7 @@ def check_ope_inputs( "Expected `action.shape[0] == reward.shape[0] == pscore.shape[0]`, but found it False" ) if np.any(pscore <= 0): - raise ValueError("pscore must be positive") + raise ValueError("`pscore` must be positive") def check_continuous_bandit_feedback_inputs( @@ -410,20 +423,20 @@ def check_continuous_bandit_feedback_inputs( Parameters ----------- context: array-like, shape (n_rounds, dim_context) - Context vectors in each round, i.e., :math:`x_t`. + Context vectors observed for each data, i.e., :math:`x_i`. action_by_behavior_policy: array-like, shape (n_rounds,) - Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by a behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,) - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. 
+ Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. expected_reward: array-like, shape (n_rounds, n_actions), default=None - Expected rewards (or outcome) in each round, i.e., :math:`\\mathbb{E}[r_t]`. + Expected reward of each data, i.e., :math:`\\mathbb{E}[r_i|x_i,a_i]`. pscore: array-like, shape (n_rounds,), default=None Probability densities of the continuous action values sampled by a behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. """ check_array(array=context, name="context", expected_dim=2) @@ -459,7 +472,7 @@ def check_continuous_bandit_feedback_inputs( "== reward.shape[0] == pscore.shape[0]`, but found it False" ) if np.any(pscore <= 0): - raise ValueError("pscore must be positive") + raise ValueError("`pscore` must be positive") def check_continuous_ope_inputs( @@ -477,17 +490,17 @@ def check_continuous_ope_inputs( Continuous action values given by the evaluation policy (can be deterministic), i.e., :math:`\\pi_e(x_t)`. action_by_behavior_policy: array-like, shape (n_rounds,), default=None - Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Continuous action values sampled by a behavior policy for each data in logged bandit data, i.e., :math:`a_i`. reward: array-like, shape (n_rounds,), default=None - Observed rewards (or outcome) in each round, i.e., :math:`r_t`. + Rewards observed for each data in logged bandit data, i.e., :math:`r_i`. pscore: array-like, shape (n_rounds,), default=None Probability densities of the continuous action values sampled by a behavior policy - (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + (generalized propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds,), default=None - Expected rewards given context and action estimated by a regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context and action estimated by a regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. """ # action_by_evaluation_policy @@ -545,7 +558,7 @@ def check_continuous_ope_inputs( ", but found it False" ) if np.any(pscore <= 0): - raise ValueError("pscore must be positive") + raise ValueError("`pscore` must be positive") def _check_slate_ope_inputs( @@ -561,13 +574,13 @@ def _check_slate_ope_inputs( Parameters ----------- slate_id: array-like, shape (<= n_rounds * len_list,) - Slate id observed in each round of the logged bandit feedback. + Slate id observed for each data in logged bandit data. reward: array-like, shape (<= n_rounds * len_list,) Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. position: array-like, shape (<= n_rounds * len_list,) - Positions of each round and slot in the given logged bandit data. + Position in a recommendation interface where the action was presented. pscore: array-like, shape (<= n_rounds * len_list,) Action choice probabilities of behavior policy (propensity scores). 
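The `(<= n_rounds * len_list,)` shapes documented above refer to slate logs flattened over slots. The following toy layout (illustrative numbers only, not taken from the library) shows how `slate_id`, `position`, `reward`, and a slate-level `pscore` line up after flattening:

    import numpy as np

    # toy slate log: n_rounds=2, len_list=3, flattened over slots into length-6 arrays
    slate_id = np.repeat(np.arange(2), 3)   # [0 0 0 1 1 1]
    position = np.tile(np.arange(3), 2)     # [0 1 2 0 1 2]
    reward = np.array([0, 1, 0, 1, 0, 0])
    # propensity of the whole slate, repeated over its slots: it must lie in (0, 1]
    # and stay constant within each slate
    pscore = np.repeat([0.2, 0.5], 3)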
@@ -582,7 +595,7 @@ def _check_slate_ope_inputs( # position check_array(array=position, name="position", expected_dim=1) if not (position.dtype == int and position.min() >= 0): - raise ValueError("position elements must be non-negative integers") + raise ValueError("`position` elements must be non-negative integers") # reward check_array(array=reward, name="reward", expected_dim=1) @@ -590,7 +603,7 @@ def _check_slate_ope_inputs( # pscore check_array(array=pscore, name=f"{pscore_type}", expected_dim=1) if np.any(pscore <= 0) or np.any(pscore > 1): - raise ValueError(f"{pscore_type} must be in the range of (0, 1]") + raise ValueError(f"`{pscore_type}` must be in the range of (0, 1]") # evaluation_policy_pscore check_array( @@ -600,7 +613,7 @@ def _check_slate_ope_inputs( ) if np.any(evaluation_policy_pscore < 0) or np.any(evaluation_policy_pscore > 1): raise ValueError( - f"evaluation_policy_{pscore_type} must be in the range of [0, 1]" + f"`evaluation_policy_{pscore_type}` must be in the range of [0, 1]" ) # slate id @@ -615,7 +628,7 @@ def _check_slate_ope_inputs( == evaluation_policy_pscore.shape[0] ): raise ValueError( - f"slate_id, position, reward, {pscore_type}, and evaluation_policy_{pscore_type} " + f"`slate_id`, `position`, `reward`, `{pscore_type}`, and `evaluation_policy_{pscore_type}` " "must have the same number of samples." ) @@ -632,19 +645,19 @@ def check_sips_inputs( Parameters ----------- slate_id: array-like, shape (<= n_rounds * len_list,) - Slate id observed in each round of the logged bandit feedback. + Slate id observed for each data in logged bandit data. reward: array-like, shape (<= n_rounds * len_list,) Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. position: array-like, shape (<= n_rounds * len_list,) - Positions of each round and slot in the given logged bandit data. + Position in a recommendation interface where the action was presented. pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. evaluation_policy_pscore: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy, i.e., :math:`\\pi_e(a_i|x_i)`. 
""" _check_slate_ope_inputs( @@ -664,19 +677,19 @@ def check_sips_inputs( bandit_feedback_df["evaluation_policy_pscore"] = evaluation_policy_pscore # check uniqueness if bandit_feedback_df.duplicated(["slate_id", "position"]).sum() > 0: - raise ValueError("position must not be duplicated in each slate") + raise ValueError("`position` must not be duplicated in each slate") # check pscore uniqueness distinct_count_pscore_in_slate = bandit_feedback_df.groupby("slate_id").apply( lambda x: x["pscore"].unique().shape[0] ) if (distinct_count_pscore_in_slate != 1).sum() > 0: - raise ValueError("pscore must be unique in each slate") + raise ValueError("`pscore` must be unique in each slate") # check pscore uniqueness of evaluation policy distinct_count_evaluation_policy_pscore_in_slate = bandit_feedback_df.groupby( "slate_id" ).apply(lambda x: x["evaluation_policy_pscore"].unique().shape[0]) if (distinct_count_evaluation_policy_pscore_in_slate != 1).sum() > 0: - raise ValueError("evaluation_policy_pscore must be unique in each slate") + raise ValueError("`evaluation_policy_pscore` must be unique in each slate") def check_iips_inputs( @@ -691,13 +704,13 @@ def check_iips_inputs( Parameters ----------- slate_id: array-like, shape (<= n_rounds * len_list,) - Slate id observed in each round of the logged bandit feedback. + Slate id observed for each data in logged bandit data. reward: array-like, shape (<= n_rounds * len_list,) Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. position: array-like, shape (<= n_rounds * len_list,) - Positions of each round and slot in the given logged bandit data. + Position in a recommendation interface where the action was presented. pscore_item_position: array-like, shape (<= n_rounds * len_list,) Marginal action choice probabilities of the slot (:math:`k`) by a behavior policy (propensity scores), i.e., :math:`\\pi_b(a_{t}(k) |x_t)`. @@ -720,7 +733,7 @@ def check_iips_inputs( bandit_feedback_df["position"] = position # check uniqueness if bandit_feedback_df.duplicated(["slate_id", "position"]).sum() > 0: - raise ValueError("position must not be duplicated in each slate") + raise ValueError("`position` must not be duplicated in each slate") def check_rips_inputs( @@ -735,16 +748,16 @@ def check_rips_inputs( Parameters ----------- slate_id: array-like, shape (<= n_rounds * len_list,) - Slate id observed in each round of the logged bandit feedback. + Slate id observed for each data in logged bandit data. reward: array-like, shape (<= n_rounds * len_list,) Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. position: array-like, shape (<= n_rounds * len_list,) - Positions of each round and slot in the given logged bandit data. + Position in a recommendation interface where the action was presented. pscore_cascade: array-like, shape (<= n_rounds * len_list,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) Action choice probabilities above the slot (:math:`k`) by the evaluation policy, i.e., :math:`\\pi_e(\\{a_{t, j}\\}_{j \\le k}|x_t)`. 
@@ -775,7 +788,7 @@ def check_rips_inputs( ) # check uniqueness if bandit_feedback_df.duplicated(["slate_id", "position"]).sum() > 0: - raise ValueError("position must not be duplicated in each slate") + raise ValueError("`position` must not be duplicated in each slate") # check pscore_cascade structure previous_minimum_pscore_cascade = ( bandit_feedback_df.groupby("slate_id")["pscore_cascade"] @@ -786,7 +799,9 @@ def check_rips_inputs( if ( previous_minimum_pscore_cascade < bandit_feedback_df["pscore_cascade"] ).sum() > 0: - raise ValueError("pscore_cascade must be non-increasing sequence in each slate") + raise ValueError( + "`pscore_cascade` must be non-increasing sequence in each slate" + ) # check pscore_cascade structure of evaluation policy previous_minimum_evaluation_policy_pscore_cascade = ( bandit_feedback_df.groupby("slate_id")["evaluation_policy_pscore_cascade"] @@ -799,7 +814,105 @@ def check_rips_inputs( < bandit_feedback_df["evaluation_policy_pscore_cascade"] ).sum() > 0: raise ValueError( - "evaluation_policy_pscore_cascade must be non-increasing sequence in each slate" + "`evaluation_policy_pscore_cascade` must be non-increasing sequence in each slate" + ) + + +def check_cascade_dr_inputs( + n_unique_action: int, + slate_id: np.ndarray, + action: np.ndarray, + reward: np.ndarray, + position: np.ndarray, + pscore_cascade: np.ndarray, + evaluation_policy_pscore_cascade: np.ndarray, + q_hat: np.ndarray, + evaluation_policy_action_dist: np.ndarray, +) -> Optional[ValueError]: + """Check inputs of SlateCascadeDoublyRobust. + + Parameters + ----------- + n_unique_action: int + Number of unique actions. + + slate_id: array-like, shape (<= n_rounds * len_list,) + IDs to differentiate slates (i.e., rounds or lists of actions). + + action: array-like, (<= n_rounds * len_list,) + Action observed at each slot in each round of the logged bandit feedback, i.e., :math:`a_{t}(k)`, + which is chosen by the behavior policy :math:`\\pi_b`. + + reward: array-like, shape (<= n_rounds * len_list,) + Reward observed at each slot in each round of the logged bandit feedback, i.e., :math:`r_{t}(k)`. + + position: array-like, shape (<= n_rounds * len_list,) + IDs to differentiate slot (i.e., position in recommendation/ranking interface) in each slate. + + pscore_cascade: array-like, shape (<= n_rounds * len_list,) + Probabilities of behavior policy selecting action :math:`a` at position (slot) `k` conditional on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_b(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + evaluation_policy_pscore_cascade: array-like, shape (<= n_rounds * len_list,) + Probabilities of evaluation policy selecting action :math:`a` at position (slot) `k` conditional on the previous actions (presented at position `1` to `k-1`) + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1))`. + + q_hat: array-like (<= n_rounds * len_list * n_unique_actions, ) + :math:`\\hat{Q}_k` used in Cascade-DR. + , i.e., :math:`\\hat{Q}_{t, k}(x_t, a_t(1), \\ldots, a_t(k-1), a_t(k)) \\forall a_t(k) \\in \\mathcal{A}`. + + evaluation_policy_action_dist: array-like (<= n_rounds * len_list * n_unique_actions, ) + Action choice probabilities of evaluation policy for all possible actions + , i.e., :math:`\\pi_e(a_t(k) | x_t, a_t(1), \\ldots, a_t(k-1)) \\forall a_t(k) \\in \\mathcal{A}`. 
+ + """ + check_rips_inputs( + slate_id=slate_id, + reward=reward, + position=position, + pscore_cascade=pscore_cascade, + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + ) + check_array(array=action, name="action", expected_dim=1) + check_array( + array=q_hat, + name="q_hat", + expected_dim=1, + ) + check_array( + array=evaluation_policy_action_dist, + name="evaluation_policy_action_dist", + expected_dim=1, + ) + if not ( + np.issubdtype(action.dtype, np.integer) + and action.min() >= 0 + and action.max() < n_unique_action + ): + raise ValueError( + "`action` elements must be integers in the range of [0, n_unique_action)" + ) + if not ( + slate_id.shape[0] + == action.shape[0] + == q_hat.shape[0] // n_unique_action + == evaluation_policy_action_dist.shape[0] // n_unique_action + ): + raise ValueError( + "Expected `slate_id.shape[0] == action.shape[0] == " + "q_hat.shape[0] // n_unique_action == evaluation_policy_action_dist.shape[0] // n_unique_action`, " + "but found it False" + ) + evaluation_policy_action_dist_ = evaluation_policy_action_dist.reshape( + (-1, n_unique_action) + ) + if not np.allclose( + np.ones(evaluation_policy_action_dist_.shape[0]), + evaluation_policy_action_dist_.sum(axis=1), + ): + raise ValueError( + "`evaluation_policy_action_dist[i * n_unique_action : (i+1) * n_unique_action]` " + "must sum up to one for all i." ) diff --git a/tests/dataset/test_synthetic.py b/tests/dataset/test_synthetic.py index cd0cb8cb..6ed0d713 100644 --- a/tests/dataset/test_synthetic.py +++ b/tests/dataset/test_synthetic.py @@ -76,20 +76,20 @@ def test_synthetic_init(): # context, action, description invalid_input_of_sample_reward = [ - ("3", np.ones(2, dtype=int), "context must be 2D array"), - (None, np.ones(2, dtype=int), "context must be 2D array"), - (np.ones((2, 3)), "3", "action must be 1D array"), - (np.ones((2, 3)), None, "action must be 1D array"), + ("3", np.ones(2, dtype=int), "`context` must be 2D array"), + (None, np.ones(2, dtype=int), "`context` must be 2D array"), + (np.ones((2, 3)), "3", "`action` must be 1D array"), + (np.ones((2, 3)), None, "`action` must be 1D array"), ( np.ones((2, 3)), np.ones(2, dtype=np.float32), "the dtype of action must be a subdtype of int", ), - (np.ones(2), np.ones(2, dtype=int), "context must be 2D array"), + (np.ones(2), np.ones(2, dtype=int), "`context` must be 2D array"), ( np.ones((2, 3)), np.ones((2, 3), dtype=int), - "action must be 1D array", + "`action` must be 1D array", ), ( np.ones((2, 3)), @@ -195,11 +195,11 @@ def test_synthetic_obtain_batch_bandit_feedback(): np.ones((2, 2, 3)), "Expected `expected_reward.shape[1]", ), - ("3", np.ones((2, 2, 3)), "expected_reward must be 2D array"), - (None, np.ones((2, 2, 3)), "expected_reward must be 2D array"), - (np.ones((2, 3)), np.ones((2, 3)), "action_dist must be 3D array"), - (np.ones((2, 3)), "3", "action_dist must be 3D array"), - (np.ones((2, 3)), None, "action_dist must be 3D array"), + ("3", np.ones((2, 2, 3)), "`expected_reward` must be 2D array"), + (None, np.ones((2, 2, 3)), "`expected_reward` must be 2D array"), + (np.ones((2, 3)), np.ones((2, 3)), "`action_dist` must be 3D array"), + (np.ones((2, 3)), "3", "`action_dist` must be 3D array"), + (np.ones((2, 3)), None, "`action_dist` must be 3D array"), ] valid_input_of_calc_policy_value = [ diff --git a/tests/dataset/test_synthetic_continuous.py b/tests/dataset/test_synthetic_continuous.py index 877bd774..fe01fc8e 100644 --- a/tests/dataset/test_synthetic_continuous.py +++ 
b/tests/dataset/test_synthetic_continuous.py @@ -301,15 +301,15 @@ def test_synthetic_continuous_obtain_batch_bandit_feedback(): np.ones(4), "Expected `context.shape[1]", ), - ("3", np.ones(4), "context must be 2D array"), - (None, np.ones(4), "context must be 2D array"), + ("3", np.ones(4), "`context` must be 2D array"), + (None, np.ones(4), "`context` must be 2D array"), ( np.ones((4, 1)), np.ones((4, 1)), # - "action must be 1D array", + "`action` must be 1D array", ), - (np.ones((4, 1)), "3", "action must be 1D array"), - (np.ones((4, 1)), None, "action must be 1D array"), + (np.ones((4, 1)), "3", "`action` must be 1D array"), + (np.ones((4, 1)), None, "`action` must be 1D array"), ] diff --git a/tests/dataset/test_synthetic_slate.py b/tests/dataset/test_synthetic_slate.py index a31c8b2a..d4702369 100644 --- a/tests/dataset/test_synthetic_slate.py +++ b/tests/dataset/test_synthetic_slate.py @@ -259,7 +259,7 @@ def check_slate_bandit_feedback( # check uniqueness assert ( bandit_feedback_df.duplicated(["slate_id", "position"]).sum() == 0 - ), "position must not be duplicated in each slate" + ), "`position` must not be duplicated in each slate" assert ( bandit_feedback_df.duplicated(["slate_id", "action"]).sum() == 0 if not is_factorizable @@ -302,7 +302,7 @@ def check_slate_bandit_feedback( ) assert ( count_pscore_in_expression != 1 - ).sum() == 0, "pscore must be unique in each slate" + ).sum() == 0, "`pscore` must be unique in each slate" if "pscore" in pscore_columns and "pscore_cascade" in pscore_columns: last_slot_feedback_df = bandit_feedback_df.drop_duplicates( "slate_id", keep="last" @@ -1088,22 +1088,22 @@ def test_synthetic_slate_using_valid_inputs( ( np.repeat(np.arange(n_rounds), len_list), "4", # - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), np.zeros((n_rounds, len_list), dtype=int), # - "reward must be 1D array", + "`reward` must be 1D array", ), ( "4", # np.zeros(n_rounds * len_list, dtype=int), - "slate_id must be 1D array", + "`slate_id` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list).reshape((n_rounds, len_list)), # np.zeros(n_rounds * len_list, dtype=int), - "slate_id must be 1D array", + "`slate_id` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -1196,7 +1196,7 @@ def test_calc_on_policy_policy_value_using_valid_input_data( np.array([5, 2]), # np.tile(np.arange(3), 5), ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( "optimal", @@ -1204,7 +1204,7 @@ def test_calc_on_policy_policy_value_using_valid_input_data( np.ones([5, 2]), np.ones([5, 2]), # ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( "optimal", @@ -1212,7 +1212,7 @@ def test_calc_on_policy_policy_value_using_valid_input_data( np.ones([5, 2]), np.random.choice(5), # ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( "optimal", @@ -1427,7 +1427,7 @@ def test_generate_evaluation_policy_pscore_using_valid_input_data( ) assert ( count_pscore_in_expression != 1 - ).sum() == 0, "pscore must be unique in each slate" + ).sum() == 0, "`pscore` must be unique in each slate" last_slot_feedback_df = bandit_feedback_df.drop_duplicates("slate_id", keep="last") assert np.allclose( last_slot_feedback_df["pscore"], last_slot_feedback_df["pscore_cascade"] @@ -1529,7 +1529,7 @@ def test_calc_epsilon_greedy_pscore_using_valid_input_data( np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2, 3]]).flatten(), np.ones((3, 2)), ValueError, - 
"evaluation_policy_logit_ must be 2D array", + "`evaluation_policy_logit_` must be 2D array", ), ( 3, @@ -1979,31 +1979,31 @@ def test_calc_ground_truth_policy_value_value_check_with_eta( np.ones((n_rounds, len_list)), np.ones((n_rounds, n_unique_action)), ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( np.ones((n_rounds * len_list)), np.ones((n_rounds * n_unique_action)), ValueError, - "evaluation_policy_logit_ must be 2D array", + "`evaluation_policy_logit_` must be 2D array", ), ( np.ones((n_rounds * len_list + 1)), np.ones((n_rounds, n_unique_action)), ValueError, - "the shape of action and evaluation_policy_logit_ must be", + "the shape of `action` and `evaluation_policy_logit_` must be", ), ( np.ones((n_rounds * len_list)), np.ones((n_rounds, n_unique_action + 1)), ValueError, - "the shape of action and evaluation_policy_logit_ must be", + "the shape of `action` and `evaluation_policy_logit_` must be", ), ( np.ones((n_rounds * len_list)), np.ones((n_rounds + 1, n_unique_action)), ValueError, - "the shape of action and evaluation_policy_logit_ must be", + "the shape of `action` and `evaluation_policy_logit_` must be", ), ] @@ -2287,3 +2287,161 @@ def test_obtain_pscore_given_evaluation_policy_logit_using_mock_input_data( assert np.allclose( true_pscores_item_position, evaluation_policy_pscore_item_position ) + + +# action, evaluation_policy_logit_, err, description +invalid_input_of_calc_evaluation_policy_action_dist = [ + ( + np.ones((10, 3)), # + np.ones((10, 3)), + ValueError, + "`action` must be 1D array", + ), + ( + np.ones((10 * 3 + 1)), # + np.ones((10, 3)), + ValueError, + "Expected `len(action)", + ), + ( + np.ones((10 * 3)), + np.ones((10, 2)), # + ValueError, + "Expected `evaluation_policy_logit_.shape[1]", + ), + ( + np.ones((10 * 3)), + np.ones((15, 3)), # + ValueError, + "Expected `len(action)", + ), + ( + np.ones((10 * 3)), + np.ones((10 * 3)), # + ValueError, + "`evaluation_policy_logit_` must be 2D array", + ), +] + + +@pytest.mark.parametrize( + "action, evaluation_policy_logit_, err, description", + invalid_input_of_calc_evaluation_policy_action_dist, +) +def test_calc_evaluation_policy_action_dist_using_invalid_input_data( + action, + evaluation_policy_logit_, + err, + description, +): + # set parameters + n_unique_action = 3 + len_list = 3 + dim_context = 2 + reward_type = "binary" + is_factorizable = True + random_state = 12345 + dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + random_state=random_state, + is_factorizable=is_factorizable, + base_reward_function=logistic_reward_function, + ) + with pytest.raises(err, match=f"{description}*"): + dataset.calc_evaluation_policy_action_dist( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + + +# action, evaluation_policy_logit_, description +valid_input_of_calc_evaluation_policy_action_dist = [ + ( + np.ones((10 * 3), dtype=int), + np.ones((10, 3)) * 10, + "", + ), +] + + +@pytest.mark.parametrize( + "action, evaluation_policy_logit_, description", + valid_input_of_calc_evaluation_policy_action_dist, +) +def test_calc_evaluation_policy_action_dist_using_valid_input_data_factorizable_case( + action, + evaluation_policy_logit_, + description, +): + # set parameters + n_unique_action = 3 + len_list = 3 + dim_context = 2 + reward_type = "binary" + is_factorizable = True + random_state = 12345 + dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, 
+ len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + random_state=random_state, + is_factorizable=is_factorizable, + base_reward_function=logistic_reward_function, + ) + evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + assert len(evaluation_policy_action_dist) == n_rounds * len_list * n_unique_action + assert np.allclose( + evaluation_policy_action_dist.reshape((-1, n_unique_action)).sum(axis=1), + np.ones((n_rounds * len_list,)), + ) + assert np.allclose( + evaluation_policy_action_dist, + np.ones_like(evaluation_policy_action_dist) / n_unique_action, + ) + + +@pytest.mark.parametrize( + "action, evaluation_policy_logit_, description", + valid_input_of_calc_evaluation_policy_action_dist, +) +def test_calc_evaluation_policy_action_dist_using_valid_input_data_non_factorizable_case( + action, + evaluation_policy_logit_, + description, +): + # set parameters + n_unique_action = 3 + len_list = 3 + dim_context = 2 + reward_type = "binary" + is_factorizable = False + random_state = 12345 + dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + random_state=random_state, + is_factorizable=is_factorizable, + base_reward_function=logistic_reward_function, + ) + evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + assert len(evaluation_policy_action_dist) == n_rounds * len_list * n_unique_action + assert np.allclose( + evaluation_policy_action_dist.reshape((-1, n_unique_action)).sum(axis=1), + np.ones((n_rounds * len_list,)), + ) + assert not np.allclose( + evaluation_policy_action_dist, + np.ones_like(evaluation_policy_action_dist) / n_unique_action, + ) diff --git a/tests/dataset/test_synthetic_slate_functions.py b/tests/dataset/test_synthetic_slate_functions.py index 90d09cf9..b1802e5b 100644 --- a/tests/dataset/test_synthetic_slate_functions.py +++ b/tests/dataset/test_synthetic_slate_functions.py @@ -20,28 +20,28 @@ def test_generate_symmetric_matrix(): np.ones([2, 2]), None, ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( [1.0, 1.0], np.ones([2, 2]), None, ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( np.ones([2, 2]), np.array([1.0, 1.0]), None, ValueError, - "action_context must be 2D array", + "`action_context` must be 2D array", ), ( np.ones([2, 2]), [1.0, 1.0], None, ValueError, - "action_context must be 2D array", + "`action_context` must be 2D array", ), (np.ones([2, 2]), np.ones([2, 2]), np.array([1]), TypeError, ""), (np.ones([2, 2]), np.ones([2, 2]), -1, ValueError, ""), @@ -100,7 +100,7 @@ def test_linear_behavior_policy_logit_using_valid_input( False, 1, ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( np.ones([5, 2]), @@ -114,7 +114,7 @@ def test_linear_behavior_policy_logit_using_valid_input( False, 1, ValueError, - "action_context must be 2D array", + "`action_context` must be 2D array", ), ( np.ones([5, 2]), @@ -128,7 +128,7 @@ def test_linear_behavior_policy_logit_using_valid_input( False, 1, ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( np.ones([5, 2]), diff --git a/tests/ope/hyperparams.yaml b/tests/ope/hyperparams.yaml index ebaba64c..f76d5ad3 100644 --- a/tests/ope/hyperparams.yaml +++ b/tests/ope/hyperparams.yaml @@ -15,3 +15,4 @@ 
random_forest: random_state: 12345 ridge: alpha: 0.2 + random_state: 12345 diff --git a/tests/ope/hyperparams_slate.yaml b/tests/ope/hyperparams_slate.yaml new file mode 100644 index 00000000..c303b009 --- /dev/null +++ b/tests/ope/hyperparams_slate.yaml @@ -0,0 +1,14 @@ +lightgbm: + n_estimators: 100 + learning_rate: 0.01 + max_depth: 5 + min_samples_leaf: 10 + random_state: 12345 +random_forest: + n_estimators: 100 + max_depth: 5 + min_samples_leaf: 10 + random_state: 12345 +ridge: + alpha: 0.2 + random_state: 12345 diff --git a/tests/ope/test_all_estimators.py b/tests/ope/test_all_estimators.py index f1b86fd7..762654e0 100644 --- a/tests/ope/test_all_estimators.py +++ b/tests/ope/test_all_estimators.py @@ -20,7 +20,7 @@ np.zeros((5, 4, 3)), np.ones(5), np.ones(5), - "action_dist must be 3D array", + "`action_dist` must be 3D array", ), ( generate_action_dist(5, 4, 1)[:, :, 0], # @@ -31,7 +31,7 @@ np.zeros((5, 4, 1)), np.ones(5), np.ones(5), - "action_dist must be 3D array", + "`action_dist` must be 3D array", ), ( np.ones((5, 4, 3)), # @@ -42,7 +42,7 @@ np.zeros((5, 4, 3)), np.ones(5), np.ones(5), - "action_dist must be a probability distribution", + "`action_dist` must be a probability distribution", ), ( generate_action_dist(5, 4, 3), @@ -53,7 +53,7 @@ np.zeros((5, 4, 3)), np.ones(5), np.ones(5), - "position must be 1D array", + "`position` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -64,7 +64,7 @@ np.zeros((5, 4, 3)), np.ones(5), np.ones(5), - "position must be 1D array", + "`position` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -75,7 +75,7 @@ np.zeros((5, 4, 3)), np.ones(5), np.ones(5), - "position elements must be non-negative integers", + "`position` elements must be non-negative integers", ), ( generate_action_dist(5, 4, 3), @@ -86,7 +86,7 @@ np.zeros((5, 4, 3)), np.ones(5), np.ones(5), - "position elements must be non-negative integers", + "`position` elements must be non-negative integers", ), ( generate_action_dist(5, 4, 3), @@ -108,7 +108,7 @@ np.zeros((5, 4, 3)), np.ones(5), np.ones(5), - "position elements must be smaller than", + "`position` elements must be smaller than", ), ( generate_action_dist(5, 4, 3), @@ -119,7 +119,7 @@ np.zeros((5, 4, 3)), np.ones(5), np.ones(5), - "position elements must be given when", + "`position` elements must be given when", ), ] @@ -181,9 +181,23 @@ def test_estimation_of_all_estimators_using_invalid_input_data( ] all_estimators_tuning = ope.__all_estimators_tuning__ estimators_tuning = [ - getattr(ope.estimators_tuning, estimator_name)([1, 100, 10000, np.inf]) + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[1, 100, 10000, np.inf], + tuning_method=tuning_method, + ) for estimator_name in all_estimators_tuning + for tuning_method in ["slope", "mse"] + ] + all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__ + estimators_tuning_sg = [ + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[0.001, 0.01, 0.1, 1.0], + tuning_method=tuning_method, + ) + for estimator_name in all_estimators_tuning_sg + for tuning_method in ["slope", "mse"] ] + estimators_tuning = estimators_tuning + estimators_tuning_sg # estimate_intervals function raises ValueError of all estimators for estimator in estimators: with pytest.raises(ValueError, match=f"{description}*"): @@ -225,9 +239,10 @@ def test_estimation_of_all_estimators_using_invalid_input_data( assert hasattr( estimator_tuning, "best_hyperparam" ), "estimator_tuning should have `best_hyperparam` attr" - assert hasattr( - estimator_tuning, 
"estimated_mse_score_dict" - ), "estimator_tuning should have `estimated_mse_score_dict` attr" + if estimator_tuning.tuning_method == "mse": + assert hasattr( + estimator_tuning, "estimated_mse_score_dict" + ), "estimator_tuning should have `estimated_mse_score_dict` attr" with pytest.raises(ValueError, match=f"{description}*"): _ = estimator_tuning.estimate_interval( action_dist=action_dist, @@ -241,9 +256,10 @@ def test_estimation_of_all_estimators_using_invalid_input_data( assert hasattr( estimator_tuning, "best_hyperparam" ), "estimator_tuning should have `best_hyperparam` attr" - assert hasattr( - estimator_tuning, "estimated_mse_score_dict" - ), "estimator_tuning should have `estimated_mse_score_dict` attr" + if estimator_tuning.tuning_method == "mse": + assert hasattr( + estimator_tuning, "estimated_mse_score_dict" + ), "estimator_tuning should have `estimated_mse_score_dict` attr" @pytest.mark.parametrize( @@ -267,9 +283,23 @@ def test_estimation_of_all_estimators_using_valid_input_data( ] all_estimators_tuning = ope.__all_estimators_tuning__ estimators_tuning = [ - getattr(ope.estimators_tuning, estimator_name)([1, 100, 10000, np.inf]) + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[1, 100, 10000, np.inf], + tuning_method=tuning_method, + ) for estimator_name in all_estimators_tuning + for tuning_method in ["slope", "mse"] + ] + all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__ + estimators_tuning_sg = [ + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[0.001, 0.01, 0.1, 1.0], + tuning_method=tuning_method, + ) + for estimator_name in all_estimators_tuning_sg + for tuning_method in ["slope", "mse"] ] + estimators_tuning = estimators_tuning + estimators_tuning_sg # estimate_intervals function raises ValueError of all estimators for estimator in estimators: _ = estimator.estimate_policy_value( @@ -373,10 +403,23 @@ def test_estimate_intervals_of_all_estimators_using_invalid_input_data( ] all_estimators_tuning = ope.__all_estimators_tuning__ estimators_tuning = [ - getattr(ope.estimators_tuning, estimator_name)([1, 100, 10000, np.inf]) + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[1, 100, 10000, np.inf], + tuning_method=tuning_method, + ) for estimator_name in all_estimators_tuning + for tuning_method in ["slope", "mse"] + ] + all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__ + estimators_tuning_sg = [ + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[0.001, 0.01, 0.1, 1.0], + tuning_method=tuning_method, + ) + for estimator_name in all_estimators_tuning_sg + for tuning_method in ["slope", "mse"] ] - # TODO + estimators_tuning = estimators_tuning + estimators_tuning_sg estimated_pscore = None estimated_importance_weights = np.ones(bandit_feedback["action"].shape[0]) # estimate_intervals function raises ValueError of all estimators @@ -436,10 +479,23 @@ def test_estimate_intervals_of_all_estimators_using_valid_input_data( ] all_estimators_tuning = ope.__all_estimators_tuning__ estimators_tuning = [ - getattr(ope.estimators_tuning, estimator_name)([1, 100, 10000, np.inf]) + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[1, 100, 10000, np.inf], + tuning_method=tuning_method, + ) for estimator_name in all_estimators_tuning + for tuning_method in ["slope", "mse"] ] - # TODO + all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__ + estimators_tuning_sg = [ + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[0.001, 0.01, 0.1, 1.0], + tuning_method=tuning_method, + ) + for estimator_name in 
all_estimators_tuning_sg + for tuning_method in ["slope", "mse"] + ] + estimators_tuning = estimators_tuning + estimators_tuning_sg estimated_pscore = None estimated_importance_weights = np.ones(bandit_feedback["action"].shape[0]) # estimate_intervals function raises ValueError of all estimators @@ -510,10 +566,23 @@ def test_performance_of_ope_estimators_using_random_evaluation_policy( ] all_estimators_tuning = ope.__all_estimators_tuning__ estimators_tuning = [ - getattr(ope.estimators_tuning, estimator_name)([1, 100, 10000, np.inf]) + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[1, 100, 10000, np.inf], + tuning_method=tuning_method, + ) for estimator_name in all_estimators_tuning + for tuning_method in ["slope", "mse"] ] - estimators = estimators_standard + estimators_tuning + all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__ + estimators_tuning_sg = [ + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[0.001, 0.01, 0.1, 1.0], + tuning_method=tuning_method, + ) + for estimator_name in all_estimators_tuning_sg + for tuning_method in ["slope", "mse"] + ] + estimators = estimators_standard + estimators_tuning + estimators_tuning_sg # skip estimation estimated_pscore = None estimated_importance_weights = ( @@ -563,10 +632,23 @@ def test_response_format_of_ope_estimators_using_random_evaluation_policy( ] all_estimators_tuning = ope.__all_estimators_tuning__ estimators_tuning = [ - getattr(ope.estimators_tuning, estimator_name)([1, 100, 10000, np.inf]) + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[1, 100, 10000, np.inf], + tuning_method=tuning_method, + ) for estimator_name in all_estimators_tuning + for tuning_method in ["slope", "mse"] + ] + all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__ + estimators_tuning_sg = [ + getattr(ope.estimators_tuning, estimator_name)( + lambdas=[0.001, 0.01, 0.1, 1.0], + tuning_method=tuning_method, + ) + for estimator_name in all_estimators_tuning_sg + for tuning_method in ["slope", "mse"] ] - estimators = estimators_standard + estimators_tuning + estimators = estimators_standard + estimators_tuning + estimators_tuning_sg # skip estimation estimated_pscore = None estimated_importance_weights = ( diff --git a/tests/ope/test_bipw_estimators.py b/tests/ope/test_bipw_estimators.py index c773b817..bb4d7853 100644 --- a/tests/ope/test_bipw_estimators.py +++ b/tests/ope/test_bipw_estimators.py @@ -21,7 +21,7 @@ r"`lambda_` must be an instance of \(, \), not .", ), (-1.0, ValueError, "`lambda_`= -1.0, must be >= 0.0."), - (np.nan, ValueError, "lambda_ must not be nan"), + (np.nan, ValueError, "`lambda_` must not be nan"), ] @@ -51,7 +51,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), np.ones(5), - "action must be 1D array", + "`action` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -59,7 +59,7 @@ def test_bipw_init_using_invalid_inputs( None, # np.random.choice(3, size=5), np.ones(5), - "reward must be 1D array", + "`reward` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -67,7 +67,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), None, # - "estimated_importance_weights must be 1D array", + "`estimated_importance_weights` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -75,7 +75,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), np.ones(5), - "action elements must be non-negative integers", + "`action` elements must be integers in 
the range of", ), ( generate_action_dist(5, 4, 3), @@ -83,7 +83,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), np.ones(5), - "action elements must be non-negative integers", + "`action` elements must be integers in the range of", ), ( generate_action_dist(5, 4, 3), @@ -91,7 +91,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), np.ones(5), - "action must be 1D array", + "`action` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -99,7 +99,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), np.ones(5), - "action must be 1D array", + "`action` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -107,7 +107,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), np.ones(5), - r"action elements must be smaller than`", + r"`action` elements must be integers in the range of`", ), ( generate_action_dist(5, 4, 3), @@ -115,7 +115,7 @@ def test_bipw_init_using_invalid_inputs( "4", # np.random.choice(3, size=5), np.ones(5), - "reward must be 1D array", + "`reward` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -123,7 +123,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros((3, 2), dtype=int), # np.random.choice(3, size=5), np.ones(5), - "reward must be 1D array", + "`reward` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -139,7 +139,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), "4", # - "estimated_importance_weights must be 1D array", + "`estimated_importance_weights` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -147,7 +147,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), np.ones((5, 3)), # - "estimated_importance_weights must be 1D array", + "`estimated_importance_weights` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -171,7 +171,7 @@ def test_bipw_init_using_invalid_inputs( np.zeros(5, dtype=int), np.random.choice(3, size=5), None, # - "estimated_importance_weights must be 1D array", + "`estimated_importance_weights` must be 1D array", ), ] diff --git a/tests/ope/test_dm_estimators.py b/tests/ope/test_dm_estimators.py index f49eddd6..9ba79035 100644 --- a/tests/ope/test_dm_estimators.py +++ b/tests/ope/test_dm_estimators.py @@ -20,13 +20,13 @@ generate_action_dist(5, 4, 3), np.zeros(5, dtype=int), None, # - "estimated_rewards_by_reg_model must be 3D array", + "`estimated_rewards_by_reg_model` must be 3D array", ), ( generate_action_dist(5, 4, 3), np.zeros(5, dtype=int), "4", # - "estimated_rewards_by_reg_model must be 3D array", + "`estimated_rewards_by_reg_model` must be 3D array", ), ] diff --git a/tests/ope/test_dr_estimators.py b/tests/ope/test_dr_estimators.py index 5225e069..6f3469d6 100644 --- a/tests/ope/test_dr_estimators.py +++ b/tests/ope/test_dr_estimators.py @@ -10,6 +10,8 @@ from obp.ope import DoublyRobustWithShrinkage from obp.ope import DoublyRobustWithShrinkageTuning from obp.ope import SelfNormalizedDoublyRobust +from obp.ope import SubGaussianDoublyRobust +from obp.ope import SubGaussianDoublyRobustTuning from obp.ope import SwitchDoublyRobust from obp.ope import SwitchDoublyRobustTuning from obp.types import BanditFeedback @@ -30,7 +32,7 @@ r"`lambda_` must be an instance of \(, \), not .", ), (-1.0, False, ValueError, "`lambda_`= -1.0, must be >= 0.0."), - (np.nan, False, ValueError, "lambda_ must not be nan"), + (np.nan, 
False, ValueError, "`lambda_` must not be nan"), ( 1.0, "s", @@ -68,6 +70,7 @@ def test_dr_init_using_invalid_inputs( invalid_input_of_dr_tuning_init = [ ( "", # + "mse", True, 0.05, False, @@ -76,6 +79,7 @@ def test_dr_init_using_invalid_inputs( ), ( None, # + "slope", True, 0.05, False, @@ -84,6 +88,7 @@ def test_dr_init_using_invalid_inputs( ), ( [""], # + "mse", True, 0.05, False, @@ -92,6 +97,7 @@ def test_dr_init_using_invalid_inputs( ), ( [None], # + "slope", True, 0.05, False, @@ -100,6 +106,7 @@ def test_dr_init_using_invalid_inputs( ), ( [], # + "mse", True, 0.05, False, @@ -108,15 +115,34 @@ def test_dr_init_using_invalid_inputs( ), ( [-1.0], # + "slope", True, 0.05, False, ValueError, "`an element of lambdas`= -1.0, must be >= 0.0.", ), - ([np.nan], True, 0.05, False, ValueError, "an element of lambdas must not be nan"), + ( + [np.nan], + "mse", + True, + 0.05, + False, + ValueError, + "an element of lambdas must not be nan", + ), + ( + [1], + "", # + True, + 0.05, + False, + ValueError, + "`tuning_method` must be either 'slope' or 'mse'", + ), ( [1], + "mse", "", # 0.05, False, @@ -125,6 +151,7 @@ def test_dr_init_using_invalid_inputs( ), ( [1], + "slope", None, # 0.05, False, @@ -133,6 +160,7 @@ def test_dr_init_using_invalid_inputs( ), ( [1], + "mse", True, "", # False, @@ -141,6 +169,7 @@ def test_dr_init_using_invalid_inputs( ), ( [1], + "slope", True, None, # False, @@ -149,6 +178,7 @@ def test_dr_init_using_invalid_inputs( ), ( [1], + "mse", True, -1.0, # False, @@ -157,6 +187,7 @@ def test_dr_init_using_invalid_inputs( ), ( [1], + "slope", True, 1.1, # False, @@ -165,6 +196,7 @@ def test_dr_init_using_invalid_inputs( ), ( [1], + "slope", True, 1.0, "s", # @@ -175,11 +207,12 @@ def test_dr_init_using_invalid_inputs( @pytest.mark.parametrize( - "lambdas, use_bias_upper_bound, delta, use_estimated_pscore, err, description", + "lambdas, tuning_method, use_bias_upper_bound, delta, use_estimated_pscore, err, description", invalid_input_of_dr_tuning_init, ) def test_dr_tuning_init_using_invalid_inputs( lambdas, + tuning_method, use_bias_upper_bound, delta, use_estimated_pscore, @@ -191,6 +224,7 @@ def test_dr_tuning_init_using_invalid_inputs( use_bias_upper_bound=use_bias_upper_bound, delta=delta, lambdas=lambdas, + tuning_method=tuning_method, use_estimated_pscore=use_estimated_pscore, ) @@ -199,6 +233,7 @@ def test_dr_tuning_init_using_invalid_inputs( use_bias_upper_bound=use_bias_upper_bound, delta=delta, lambdas=lambdas, + tuning_method=tuning_method, use_estimated_pscore=use_estimated_pscore, ) @@ -207,14 +242,24 @@ def test_dr_tuning_init_using_invalid_inputs( use_bias_upper_bound=use_bias_upper_bound, delta=delta, lambdas=lambdas, + tuning_method=tuning_method, + use_estimated_pscore=use_estimated_pscore, + ) + + with pytest.raises(err, match=f"{description}*"): + _ = SubGaussianDoublyRobustTuning( + use_bias_upper_bound=use_bias_upper_bound, + delta=delta, + lambdas=lambdas, + tuning_method=tuning_method, use_estimated_pscore=use_estimated_pscore, ) valid_input_of_dr_init = [ (np.inf, "infinite lambda_"), - (3.0, "float lambda_"), - (2, "integer lambda_"), + (0.3, "float lambda_"), + (1, "integer lambda_"), ] @@ -226,43 +271,70 @@ def test_dr_init_using_valid_input_data(lambda_: float, description: str) -> Non _ = DoublyRobust(lambda_=lambda_) _ = DoublyRobustWithShrinkage(lambda_=lambda_) _ = SwitchDoublyRobust(lambda_=lambda_) + if lambda_ < np.inf: + _ = SubGaussianDoublyRobust(lambda_=lambda_) valid_input_of_dr_tuning_init = [ - ([3.0, np.inf, 100.0], "float 
lambda_"), - ([2], "integer lambda_"), + ([0.3, 0.001], "slope", "float lambda_"), + ([1], "mse", "integer lambda_"), ] @pytest.mark.parametrize( - "lambdas, description", + "lambdas, tuning_method, description", valid_input_of_dr_tuning_init, ) -def test_dr_tuning_init_using_valid_input_data(lambdas, description): - _ = DoublyRobustTuning(lambdas=lambdas) +def test_dr_tuning_init_using_valid_input_data(lambdas, tuning_method, description): + _ = DoublyRobustTuning(lambdas=lambdas, tuning_method=tuning_method) _ = DoublyRobustWithShrinkageTuning( lambdas=lambdas, + tuning_method=tuning_method, ) _ = SwitchDoublyRobustTuning( lambdas=lambdas, + tuning_method=tuning_method, + ) + _ = SubGaussianDoublyRobustTuning( + lambdas=lambdas, + tuning_method=tuning_method, ) # prepare instances dm = DirectMethod() dr = DoublyRobust() -dr_tuning = DoublyRobustTuning(lambdas=[1, 100], estimator_name="dr_tuning") +dr_tuning_mse = DoublyRobustTuning( + lambdas=[1, 100], tuning_method="mse", estimator_name="dr_tuning_mse" +) +dr_tuning_slope = DoublyRobustTuning( + lambdas=[1, 100], tuning_method="slope", estimator_name="dr_tuning_slope" +) dr_os_0 = DoublyRobustWithShrinkage(lambda_=0.0) -dr_os_tuning = DoublyRobustWithShrinkageTuning( - lambdas=[1, 100], estimator_name="dr_os_tuning" +dr_os_tuning_mse = DoublyRobustWithShrinkageTuning( + lambdas=[1, 100], tuning_method="mse", estimator_name="dr_os_tuning_mse" +) +dr_os_tuning_slope = DoublyRobustWithShrinkageTuning( + lambdas=[1, 100], tuning_method="slope", estimator_name="dr_os_tuning_slope" ) dr_os_max = DoublyRobustWithShrinkage(lambda_=np.inf) sndr = SelfNormalizedDoublyRobust() switch_dr_0 = SwitchDoublyRobust(lambda_=0.0) -switch_dr_tuning = SwitchDoublyRobustTuning( - lambdas=[1, 100], estimator_name="switch_dr_tuning" +switch_dr_tuning_mse = SwitchDoublyRobustTuning( + lambdas=[1, 100], tuning_method="mse", estimator_name="switch_dr_tuning_mse" +) +switch_dr_tuning_slope = SwitchDoublyRobustTuning( + lambdas=[1, 100], tuning_method="slope", estimator_name="switch_dr_tuning_slope" ) switch_dr_max = SwitchDoublyRobust(lambda_=np.inf) +sg_dr_0 = SubGaussianDoublyRobust(lambda_=0.0) +sg_dr_tuning_mse = SubGaussianDoublyRobustTuning( + lambdas=[0.01, 0.1], tuning_method="mse", estimator_name="sg_dr_tuning_mse" +) +sg_dr_tuning_slope = SubGaussianDoublyRobustTuning( + lambdas=[0.01, 0.1], tuning_method="slope", estimator_name="sg_dr_tuning_slope" +) +sg_dr_max = SubGaussianDoublyRobust(lambda_=1.0) # estimated pscore dr_estimated_pscore = DoublyRobust(use_estimated_pscore=True) dr_os_estimated_pscore = DoublyRobustWithShrinkage(use_estimated_pscore=True) @@ -287,12 +359,21 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): dr_estimators = [ dr, - dr_tuning, + dr_tuning_mse, + dr_tuning_slope, dr_os_0, - dr_os_tuning, + dr_os_tuning_mse, + dr_os_tuning_slope, + dr_os_max, sndr, switch_dr_0, - switch_dr_tuning, + switch_dr_tuning_mse, + switch_dr_tuning_slope, + switch_dr_max, + sg_dr_0, + sg_dr_tuning_mse, + sg_dr_tuning_slope, + sg_dr_max, dr_estimated_pscore, dr_os_estimated_pscore, dr_tuning_estimated_pscore, @@ -315,7 +396,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "action must be 1D array", + "`action` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -326,7 +407,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "reward must be 1D array", + "`reward` must be 1D array", ), ( 
generate_action_dist(5, 4, 3), @@ -337,7 +418,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -348,7 +429,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): None, # False, None, - "estimated_rewards_by_reg_model must be 3D array", + "`estimated_rewards_by_reg_model` must be 3D array", ), ( generate_action_dist(5, 4, 3), @@ -359,7 +440,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "action elements must be non-negative integers", + "`action` elements must be integers in the range of", ), ( generate_action_dist(5, 4, 3), @@ -370,7 +451,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "action elements must be non-negative integers", + "`action` elements must be integers in the range of", ), ( generate_action_dist(5, 4, 3), @@ -381,7 +462,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "action must be 1D array", + "`action` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -392,7 +473,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "action must be 1D array", + "`action` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -403,7 +484,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - r"action elements must be smaller than`", + r"`action` elements must be integers in the range of`", ), ( generate_action_dist(5, 4, 3), @@ -414,7 +495,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "reward must be 1D array", + "`reward` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -425,7 +506,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "reward must be 1D array", + "`reward` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -447,7 +528,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -458,7 +539,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -480,7 +561,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), False, None, - "pscore must be positive", + "`pscore` must be positive", ), ( generate_action_dist(5, 4, 3), @@ -502,7 +583,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): "4", # False, None, - "estimated_rewards_by_reg_model must be 3D array", + "`estimated_rewards_by_reg_model` must be 3D array", ), ( generate_action_dist(5, 4, 3), @@ -513,7 +594,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), True, None, # - "estimated_pscore must be 1D array", + "`estimated_pscore` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -524,7 +605,7 @@ def test_dr_tuning_init_using_valid_input_data(lambdas, description): np.zeros((5, 4, 3)), True, np.arange(5), # - "pscore must be positive", + "`pscore` must be positive", ), ] @@ -608,12 +689,31 
@@ def test_dr_variants_using_valid_input_data( ) -> None: # check dr variants switch_dr = SwitchDoublyRobust(lambda_=hyperparameter) - switch_dr_tuning = SwitchDoublyRobustTuning( - lambdas=[hyperparameter, hyperparameter * 10] + switch_dr_tuning_mse = SwitchDoublyRobustTuning( + lambdas=[hyperparameter, hyperparameter * 10], + tuning_method="mse", + ) + switch_dr_tuning_slope = SwitchDoublyRobustTuning( + lambdas=[hyperparameter, hyperparameter * 10], + tuning_method="slope", ) dr_os = DoublyRobustWithShrinkage(lambda_=hyperparameter) - dr_os_tuning = DoublyRobustWithShrinkageTuning( - lambdas=[hyperparameter, hyperparameter * 10] + dr_os_tuning_mse = DoublyRobustWithShrinkageTuning( + lambdas=[hyperparameter, hyperparameter * 10], + tuning_method="mse", + ) + dr_os_tuning_slope = DoublyRobustWithShrinkageTuning( + lambdas=[hyperparameter, hyperparameter * 10], + tuning_method="slope", + ) + sg_dr = SubGaussianDoublyRobust(lambda_=hyperparameter) + sg_dr_tuning_mse = SubGaussianDoublyRobustTuning( + lambdas=[hyperparameter, hyperparameter / 10], + tuning_method="mse", + ) + sg_dr_tuning_slope = SubGaussianDoublyRobustTuning( + lambdas=[hyperparameter, hyperparameter / 10], + tuning_method="slope", ) switch_dr_estimated_pscore = SwitchDoublyRobust( lambda_=hyperparameter, use_estimated_pscore=True @@ -628,12 +728,17 @@ def test_dr_variants_using_valid_input_data( lambdas=[hyperparameter, hyperparameter * 10], use_estimated_pscore=True ) for estimator in [ + sg_dr, + sg_dr_tuning_mse, + sg_dr_tuning_slope, switch_dr, - switch_dr_tuning, - dr_os, - dr_os_tuning, + switch_dr_tuning_mse, + switch_dr_tuning_slope, switch_dr_estimated_pscore, switch_dr_tuning_estimated_pscore, + dr_os, + dr_os_tuning_mse, + dr_os_tuning_slope, dr_os_estimated_pscore, dr_os_tuning_estimated_pscore, ]: @@ -710,7 +815,7 @@ def test_boundedness_of_sndr_using_random_evaluation_policy( ), f"estimated policy value of sndr should be smaller than or equal to 2 (because of its 2-boundedness), but the value is: {estimated_policy_value}" -def test_dr_osage_using_random_evaluation_policy( +def test_dr_os_using_random_evaluation_policy( synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray ) -> None: """ @@ -764,6 +869,29 @@ def test_switch_dr_using_random_evaluation_policy( assert ( dr_value == switch_dr_max_value ), "SwitchDR (lambda=1e10) should be the same as DoublyRobust" + + +def test_sg_dr_using_random_evaluation_policy( + synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray +) -> None: + """ + Test the sg_dr using synthetic bandit data and random evaluation policy + """ + expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis] + action_dist = random_action_dist + # prepare input dict + input_dict = { + k: v + for k, v in synthetic_bandit_feedback.items() + if k in ["reward", "action", "pscore", "position"] + } + input_dict["action_dist"] = action_dist + input_dict["estimated_rewards_by_reg_model"] = expected_reward + dr_value = dr.estimate_policy_value(**input_dict) + sg_dr_0_value = sg_dr_0.estimate_policy_value(**input_dict) + assert ( + dr_value == sg_dr_0_value + ), "SG-DR (lambda=0) should be the same as DoublyRobust" input_dict["estimated_pscore"] = input_dict["pscore"] del input_dict["pscore"] dr_value_estimated_pscore = dr_estimated_pscore.estimate_policy_value(**input_dict) diff --git a/tests/ope/test_dr_estimators_continuous.py b/tests/ope/test_dr_estimators_continuous.py index 30983f2b..7bc33032 100644 ---
a/tests/ope/test_dr_estimators_continuous.py +++ b/tests/ope/test_dr_estimators_continuous.py @@ -40,7 +40,7 @@ def test_synthetic_init(): np.ones(5), np.ones(5), np.random.uniform(size=5), - "action_by_evaluation_policy must be 1D array", + "`action_by_evaluation_policy` must be 1D array", ), ( np.ones((5, 1)), # @@ -48,7 +48,7 @@ def test_synthetic_init(): np.ones(5), np.ones(5), np.random.uniform(size=5), - "action_by_evaluation_policy must be 1D array", + "`action_by_evaluation_policy` must be 1D array", ), ( np.ones(5), @@ -56,7 +56,7 @@ def test_synthetic_init(): np.ones(5), np.ones(5), np.random.uniform(size=5), - "estimated_rewards_by_reg_model must be 1D array", + "`estimated_rewards_by_reg_model` must be 1D array", ), ( np.ones(5), @@ -64,7 +64,7 @@ def test_synthetic_init(): np.ones(5), np.ones(5), np.random.uniform(size=5), - "estimated_rewards_by_reg_model must be 1D array", + "`estimated_rewards_by_reg_model` must be 1D array", ), ( np.ones(5), # @@ -80,7 +80,7 @@ def test_synthetic_init(): None, # np.ones(5), np.random.uniform(size=5), - "action_by_behavior_policy must be 1D array", + "`action_by_behavior_policy` must be 1D array", ), ( np.ones(5), @@ -88,7 +88,7 @@ def test_synthetic_init(): np.ones((5, 1)), # np.ones(5), np.random.uniform(size=5), - "action_by_behavior_policy must be 1D array", + "`action_by_behavior_policy` must be 1D array", ), ( np.ones(5), @@ -96,7 +96,7 @@ def test_synthetic_init(): np.ones(5), None, # np.random.uniform(size=5), - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.ones(5), @@ -104,7 +104,7 @@ def test_synthetic_init(): np.ones(5), np.ones((5, 1)), # np.random.uniform(size=5), - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.ones(5), @@ -128,7 +128,7 @@ def test_synthetic_init(): np.ones(5), np.ones(5), None, # - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.ones(5), @@ -136,7 +136,7 @@ def test_synthetic_init(): np.ones(5), np.ones(5), np.random.uniform(size=(5, 1)), # - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.ones(5), @@ -152,7 +152,7 @@ def test_synthetic_init(): np.ones(5), np.ones(5), np.arange(5), # - "pscore must be positive", + "`pscore` must be positive", ), ] diff --git a/tests/ope/test_dr_estimators_slate.py b/tests/ope/test_dr_estimators_slate.py new file mode 100644 index 00000000..cc00b6cf --- /dev/null +++ b/tests/ope/test_dr_estimators_slate.py @@ -0,0 +1,1010 @@ +import numpy as np +import pytest +from sklearn.tree import DecisionTreeRegressor + +from obp.dataset import linear_behavior_policy_logit +from obp.dataset import logistic_reward_function +from obp.dataset import SyntheticSlateBanditDataset +from obp.ope import SlateCascadeDoublyRobust +from obp.ope import SlateRegressionModel +from obp.ope import SlateRewardInteractionIPS + + +# setting +len_list = 3 +n_unique_action = 10 +rips = SlateRewardInteractionIPS(len_list=len_list) +dr = SlateCascadeDoublyRobust(len_list=len_list, n_unique_action=n_unique_action) +n_rounds = 5 + +# --- invalid --- +# slate_id, action, reward, pscore, position, evaluation_policy_pscore, q_hat, evaluation_policy_action_dist, description +invalid_input_of_slate_estimators = [ + ( + "4", # + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`slate_id` 
must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list).reshape((n_rounds, len_list)), # + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`slate_id` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list) - 1, # + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "slate_id elements must be non-negative integers", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + "4", # + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`action` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros((n_rounds, len_list), dtype=int), # + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`action` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int) - 1, # + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`action` elements must be integers in the range of", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=float), # + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`action` elements must be integers in the range of", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.ones(n_rounds * len_list, dtype=int) * 10, # + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`action` elements must be integers in the range of", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + "4", # + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`reward` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros((n_rounds, len_list), dtype=int), # + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * 
n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`reward` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros((n_rounds * len_list), dtype=int), + "4", # + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`pscore_cascade` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros((n_rounds * len_list), dtype=int), + np.ones((n_rounds, len_list)), # + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`pscore_cascade` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros((n_rounds * len_list), dtype=int), + np.ones(n_rounds * len_list) + 1, # + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`pscore_cascade` must be in the range of", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros((n_rounds * len_list), dtype=int), + np.ones(n_rounds * len_list) - 1, # + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`pscore_cascade` must be in the range of", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros((n_rounds * len_list), dtype=int), + np.hstack([[0.2], np.ones(n_rounds * len_list - 1)]), # + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`pscore_cascade` must be non-increasing sequence in each slate", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros((n_rounds * len_list), dtype=int), + np.ones(n_rounds * len_list - 1), # + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`slate_id`, `position`, `reward`, `pscore_cascade`, and `evaluation_policy_pscore_cascade` must have the same number of samples", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + "4", # + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`position` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds).reshape((n_rounds, len_list)), # + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`position` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), 
len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds) - 1, # + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`position` elements must be non-negative integers", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.repeat(np.arange(n_rounds), len_list), # + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`position` must not be duplicated in each slate", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + "4", # + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`evaluation_policy_pscore_cascade` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones((n_rounds, len_list)), # + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`evaluation_policy_pscore_cascade` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list) + 1, # + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`evaluation_policy_pscore_cascade` must be in the range of", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list) - 1.1, # + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`evaluation_policy_pscore_cascade` must be in the range of", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.hstack([[0.2], np.ones(n_rounds * len_list - 1)]), # + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`evaluation_policy_pscore_cascade` must be non-increasing sequence in each slate", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + None, # + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`q_hat` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + 
np.ones(n_rounds * len_list), + "4", # + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`q_hat` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones((n_rounds, len_list, n_unique_action)), # + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`q_hat` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones((n_rounds * len_list, n_unique_action)), # + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "`q_hat` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + "4", # + "`evaluation_policy_action_dist` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones((n_rounds, len_list, n_unique_action)) / n_unique_action, # + "`evaluation_policy_action_dist` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones((n_rounds * len_list, n_unique_action)) / n_unique_action, # + "`evaluation_policy_action_dist` must be 1D array", + ), + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action), # + "evaluation_policy_action_dist[i * n_unique_action : (i+1) * n_unique_action]", + ), +] + + +@pytest.mark.parametrize( + "slate_id, action, reward, pscore, position, evaluation_policy_pscore, q_hat, evaluation_policy_action_dist, description", + invalid_input_of_slate_estimators, +) +def test_estimate_policy_value_using_invalid_input_data( + slate_id, + action, + reward, + pscore, + position, + evaluation_policy_pscore, + q_hat, + evaluation_policy_action_dist, + description, +) -> None: + with pytest.raises(ValueError, match=f"{description}*"): + _ = dr.estimate_policy_value( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + _ = dr.estimate_interval( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=q_hat, + 
evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + + +# --- valid --- +valid_input_of_slate_estimators = [ + ( + np.repeat(np.arange(n_rounds), len_list), + np.zeros(n_rounds * len_list, dtype=int), + np.zeros(n_rounds * len_list, dtype=int), + np.ones(n_rounds * len_list), + np.tile(np.arange(len_list), n_rounds), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "each slate has data of 3 (len_list) positions", + ), +] + + +@pytest.mark.parametrize( + "slate_id, action, reward, pscore, position, evaluation_policy_pscore, q_hat, evaluation_policy_action_dist, description", + valid_input_of_slate_estimators, +) +def test_cascade_dr_using_valid_input_data( + slate_id, + action, + reward, + pscore, + position, + evaluation_policy_pscore, + q_hat, + evaluation_policy_action_dist, + description, +) -> None: + _ = dr.estimate_policy_value( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + _ = dr.estimate_interval( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + + +# --- confidence intervals --- +# alpha, n_bootstrap_samples, random_state, err, description +invalid_input_of_estimate_intervals = [ + ( + 0.05, + 100, + "s", + ValueError, + "'s' cannot be used to seed a numpy.random.RandomState instance", + ), + (0.05, -1, 1, ValueError, "`n_bootstrap_samples`= -1, must be >= 1"), + ( + 0.05, + "s", + 1, + TypeError, + "`n_bootstrap_samples` must be an instance of , not ", + ), + (-1.0, 1, 1, ValueError, "`alpha`= -1.0, must be >= 0.0"), + (2.0, 1, 1, ValueError, "`alpha`= 2.0, must be <= 1.0"), + ( + "0", + 1, + 1, + TypeError, + "`alpha` must be an instance of , not ", + ), +] + +valid_input_of_estimate_intervals = [ + (0.05, 100, 1, "random_state is 1"), + (0.05, 1, 1, "n_bootstrap_samples is 1"), +] + + +@pytest.mark.parametrize( + "slate_id, action, reward, pscore, position, evaluation_policy_pscore, q_hat, evaluation_policy_action_dist, description_1", + valid_input_of_slate_estimators, +) +@pytest.mark.parametrize( + "alpha, n_bootstrap_samples, random_state, err, description_2", + invalid_input_of_estimate_intervals, +) +def test_estimate_interval_using_invalid_input_data( + slate_id, + action, + reward, + pscore, + position, + evaluation_policy_pscore, + q_hat, + evaluation_policy_action_dist, + description_1, + alpha, + n_bootstrap_samples, + random_state, + err, + description_2, +) -> None: + with pytest.raises(err, match=f"{description_2}*"): + _ = dr.estimate_interval( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + alpha=alpha, + n_bootstrap_samples=n_bootstrap_samples, + random_state=random_state, + ) + + +@pytest.mark.parametrize( + "slate_id, action, reward, pscore, position, evaluation_policy_pscore, q_hat, evaluation_policy_action_dist, description_1", + valid_input_of_slate_estimators, +) +@pytest.mark.parametrize( + "alpha, n_bootstrap_samples, random_state, description_2", + 
valid_input_of_estimate_intervals, +) +def test_estimate_interval_using_valid_input_data( + slate_id, + action, + reward, + pscore, + position, + evaluation_policy_pscore, + q_hat, + evaluation_policy_action_dist, + description_1, + alpha, + n_bootstrap_samples, + random_state, + description_2, +) -> None: + _ = dr.estimate_interval( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + alpha=alpha, + n_bootstrap_samples=n_bootstrap_samples, + random_state=random_state, + ) + + +def test_slate_ope_performance_using_cascade_additive_log(): + # set parameters + n_unique_action = 10 + len_list = 3 + dim_context = 2 + reward_type = "binary" + random_state = 12345 + n_rounds = 1000 + reward_structure = "cascade_additive" + click_model = None + behavior_policy_function = linear_behavior_policy_logit + reward_function = logistic_reward_function + dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + reward_structure=reward_structure, + click_model=click_model, + random_state=random_state, + behavior_policy_function=behavior_policy_function, + base_reward_function=reward_function, + ) + random_behavior_dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + reward_structure=reward_structure, + click_model=click_model, + random_state=random_state, + behavior_policy_function=None, + base_reward_function=reward_function, + ) + # obtain feedback + bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds) + slate_id = bandit_feedback["slate_id"] + context = bandit_feedback["context"] + action = bandit_feedback["action"] + reward = bandit_feedback["reward"] + pscore = bandit_feedback["pscore_cascade"] + position = bandit_feedback["position"] + + # obtain random behavior feedback + random_behavior_feedback = random_behavior_dataset.obtain_batch_bandit_feedback( + n_rounds=n_rounds + ) + evaluation_policy_logit_ = np.ones((n_rounds, n_unique_action)) / n_unique_action + evaluation_policy_action_dist = ( + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action + ) + ( + _, + _, + evaluation_policy_pscore, + ) = dataset.obtain_pscore_given_evaluation_policy_logit( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + return_pscore_item_position=False, + ) + evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + + # obtain q_hat + base_regression_model = SlateRegressionModel( + base_model=DecisionTreeRegressor(max_depth=3, random_state=12345), + len_list=len_list, + n_unique_action=n_unique_action, + fitting_method="iw", + ) + q_hat = base_regression_model.fit_predict( + context=context, + action=action, + reward=reward, + pscore_cascade=pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + + # estimate the policy value of the evaluation policy with cascade-dr using the fitted q_hat + cascade_dr_estimated_policy_value = dr.estimate_policy_value( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=q_hat, +
evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + # compute statistics of ground truth policy value + q_pi_e = ( + random_behavior_feedback["reward"] + .reshape((n_rounds, dataset.len_list)) + .sum(axis=1) + ) + gt_mean = q_pi_e.mean() + gt_std = q_pi_e.std(ddof=1) + print("Cascade additive") + # check the performance of OPE + ci_bound = gt_std * 3 / np.sqrt(q_pi_e.shape[0]) + print(f"gt_mean: {gt_mean}, 3 * gt_std / sqrt(n): {ci_bound}") + estimated_policy_value = { + "cascade-dr": cascade_dr_estimated_policy_value, + } + for key in estimated_policy_value: + print( + f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, " + ) + # test the performance of each estimator + assert ( + np.abs(gt_mean - estimated_policy_value[key]) <= ci_bound + ), f"OPE of {key} did not work well (absolute error is greater than 3*sigma)" + + # check if q_hat = 0 case of cascade-dr coincides with rips + cascade_dr_estimated_policy_value_ = dr.estimate_policy_value( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=np.zeros_like(q_hat), + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + rips_estimated_policy_value = rips.estimate_policy_value( + slate_id=slate_id, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + ) + assert np.allclose( + np.array([cascade_dr_estimated_policy_value_]), + np.array([rips_estimated_policy_value]), + ) + + +def test_slate_ope_performance_using_independent_log(): + # set parameters + n_unique_action = 10 + len_list = 3 + dim_context = 2 + reward_type = "binary" + random_state = 12345 + n_rounds = 1000 + reward_structure = "independent" + click_model = None + behavior_policy_function = linear_behavior_policy_logit + reward_function = logistic_reward_function + dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + reward_structure=reward_structure, + click_model=click_model, + random_state=random_state, + behavior_policy_function=behavior_policy_function, + base_reward_function=reward_function, + ) + random_behavior_dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + reward_structure=reward_structure, + click_model=click_model, + random_state=random_state, + behavior_policy_function=None, + base_reward_function=reward_function, + ) + # obtain feedback + bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds) + slate_id = bandit_feedback["slate_id"] + context = bandit_feedback["context"] + action = bandit_feedback["action"] + reward = bandit_feedback["reward"] + pscore = bandit_feedback["pscore_cascade"] + position = bandit_feedback["position"] + + # obtain random behavior feedback + random_behavior_feedback = random_behavior_dataset.obtain_batch_bandit_feedback( + n_rounds=n_rounds + ) + evaluation_policy_logit_ = np.ones((n_rounds, n_unique_action)) / n_unique_action + evaluation_policy_action_dist = ( + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action + ) + ( + _, + _, + evaluation_policy_pscore, + ) = dataset.obtain_pscore_given_evaluation_policy_logit( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + return_pscore_item_position=False, + ) + evaluation_policy_action_dist = 
dataset.calc_evaluation_policy_action_dist( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + + # obtain q_hat + base_regression_model = SlateRegressionModel( + base_model=DecisionTreeRegressor(max_depth=3, random_state=12345), + len_list=len_list, + n_unique_action=n_unique_action, + fitting_method="iw", + ) + q_hat = base_regression_model.fit_predict( + context=context, + action=action, + reward=reward, + pscore_cascade=pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + + # check if q_hat=0 case coincides with rips + cascade_dr_estimated_policy_value = dr.estimate_policy_value( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + # compute statistics of ground truth policy value + q_pi_e = ( + random_behavior_feedback["reward"] + .reshape((n_rounds, dataset.len_list)) + .sum(axis=1) + ) + gt_mean = q_pi_e.mean() + gt_std = q_pi_e.std(ddof=1) + print("Independent") + # check the performance of OPE + ci_bound = gt_std * 3 / np.sqrt(q_pi_e.shape[0]) + print(f"gt_mean: {gt_mean}, 3 * gt_std / sqrt(n): {ci_bound}") + estimated_policy_value = { + "cascade-dr": cascade_dr_estimated_policy_value, + } + for key in estimated_policy_value: + print( + f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, " + ) + # test the performance of each estimator + assert ( + np.abs(gt_mean - estimated_policy_value[key]) <= ci_bound + ), f"OPE of {key} did not work well (absolute error is greater than 3*sigma)" + + # check if q_hat = 0 case of cascade-dr coincides with rips + cascade_dr_estimated_policy_value_ = dr.estimate_policy_value( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=np.zeros_like(q_hat), + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + rips_estimated_policy_value = rips.estimate_policy_value( + slate_id=slate_id, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + ) + assert np.allclose( + np.array([cascade_dr_estimated_policy_value_]), + np.array([rips_estimated_policy_value]), + ) + + +def test_slate_ope_performance_using_standard_additive_log(): + # set parameters + n_unique_action = 10 + len_list = 3 + dim_context = 2 + reward_type = "binary" + random_state = 12345 + n_rounds = 1000 + reward_structure = "standard_additive" + click_model = None + behavior_policy_function = linear_behavior_policy_logit + reward_function = logistic_reward_function + dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + reward_structure=reward_structure, + click_model=click_model, + random_state=random_state, + behavior_policy_function=behavior_policy_function, + base_reward_function=reward_function, + ) + random_behavior_dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + reward_structure=reward_structure, + click_model=click_model, + random_state=random_state, + behavior_policy_function=None, + base_reward_function=reward_function, + ) + # obtain feedback + bandit_feedback = 
dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds) + slate_id = bandit_feedback["slate_id"] + context = bandit_feedback["context"] + action = bandit_feedback["action"] + reward = bandit_feedback["reward"] + pscore = bandit_feedback["pscore_cascade"] + position = bandit_feedback["position"] + + # obtain random behavior feedback + random_behavior_feedback = random_behavior_dataset.obtain_batch_bandit_feedback( + n_rounds=n_rounds + ) + evaluation_policy_logit_ = np.ones((n_rounds, n_unique_action)) / n_unique_action + evaluation_policy_action_dist = ( + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action + ) + ( + _, + _, + evaluation_policy_pscore, + ) = dataset.obtain_pscore_given_evaluation_policy_logit( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + return_pscore_item_position=False, + ) + evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + + # obtain q_hat + base_regression_model = SlateRegressionModel( + base_model=DecisionTreeRegressor(max_depth=3, random_state=12345), + len_list=len_list, + n_unique_action=n_unique_action, + fitting_method="iw", + ) + q_hat = base_regression_model.fit_predict( + context=context, + action=action, + reward=reward, + pscore_cascade=pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + + # check if q_hat=0 case coincides with rips + cascade_dr_estimated_policy_value = dr.estimate_policy_value( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=q_hat, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + # compute statistics of ground truth policy value + q_pi_e = ( + random_behavior_feedback["reward"] + .reshape((n_rounds, dataset.len_list)) + .sum(axis=1) + ) + gt_mean = q_pi_e.mean() + gt_std = q_pi_e.std(ddof=1) + print("Standard additive") + # check the performance of OPE + ci_bound = gt_std * 3 / np.sqrt(q_pi_e.shape[0]) + print(f"gt_mean: {gt_mean}, 3 * gt_std / sqrt(n): {ci_bound}") + estimated_policy_value = { + "cascade-dr": cascade_dr_estimated_policy_value, + } + for key in estimated_policy_value: + print( + f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, " + ) + # test the performance of each estimator + assert ( + np.abs(gt_mean - estimated_policy_value[key]) <= ci_bound + ), f"OPE of {key} did not work well (absolute error is greater than 3*sigma)" + + # check if q_hat = 0 case of cascade-dr coincides with rips + cascade_dr_estimated_policy_value_ = dr.estimate_policy_value( + slate_id=slate_id, + action=action, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + q_hat=np.zeros_like(q_hat), + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + rips_estimated_policy_value = rips.estimate_policy_value( + slate_id=slate_id, + reward=reward, + pscore_cascade=pscore, + position=position, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + ) + assert np.allclose( + np.array([cascade_dr_estimated_policy_value_]), + np.array([rips_estimated_policy_value]), + ) diff --git a/tests/ope/test_importance_weight_estimator.py b/tests/ope/test_importance_weight_estimator.py index 81e836c8..efedc835 100644 --- a/tests/ope/test_importance_weight_estimator.py +++ 
b/tests/ope/test_importance_weight_estimator.py @@ -103,7 +103,7 @@ "RandomForest", # 2, ValueError, - "base_model must be BaseEstimator or a child class of BaseEstimator", + "`base_model` must be BaseEstimator or a child class of BaseEstimator", ), ( np.random.uniform(size=(n_actions, 8)), @@ -134,7 +134,7 @@ 1, 2, ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -150,7 +150,7 @@ 1, 2, ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7, 3)), # @@ -166,7 +166,7 @@ 1, 2, ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -182,7 +182,7 @@ 1, 2, ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -198,7 +198,7 @@ 1, 2, ValueError, - "action elements must be non-negative integers", + "`action` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -214,7 +214,7 @@ 1, 2, ValueError, - "action elements must be non-negative integers", + "`action` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -230,7 +230,7 @@ 1, 2, ValueError, - "position must be 1D array", + "`position` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -246,7 +246,7 @@ 1, 2, ValueError, - "position must be 1D array", + "`position` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -278,7 +278,7 @@ 1, 2, ValueError, - "position elements must be non-negative integers", + "`position` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -294,7 +294,7 @@ 1, 2, ValueError, - "position elements must be non-negative integers", + "`position` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -342,7 +342,7 @@ 1, 2, ValueError, - "action_context must be 2D array", + "`action_context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -358,7 +358,7 @@ 1, 2, ValueError, - "action_context must be 2D array", + "`action_context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -374,7 +374,7 @@ 1, 2, ValueError, - r"action elements must be smaller than", + r"`action` elements must be integers in the range of", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -390,7 +390,7 @@ 1, 2, ValueError, - "position must be 1D array", + "`position` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -406,7 +406,7 @@ 1, 2, ValueError, - "position elements must be smaller than len_list", + "`position` elements must be smaller than `len_list`", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -422,7 +422,7 @@ 1, 2, ValueError, - "action_dist must be 3D array", + "`action_dist` must be 3D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -438,7 +438,7 @@ 1, 2, ValueError, - "shape of action_dist must be (n_rounds, n_actions, len_list)", + "shape of `action_dist` must be (n_rounds, n_actions, len_list)", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -454,7 +454,7 @@ 1, 2, ValueError, - "action_dist must be a probability distribution", + "`action_dist` must be a probability distribution", ), ( np.random.uniform(size=(n_rounds, 7)), diff --git a/tests/ope/test_ipw_estimators.py b/tests/ope/test_ipw_estimators.py index 994d38e8..df0897a4 100644 --- a/tests/ope/test_ipw_estimators.py +++ b/tests/ope/test_ipw_estimators.py @@ -7,6 +7,8 @@ from obp.ope import 
InverseProbabilityWeighting from obp.ope import InverseProbabilityWeightingTuning from obp.ope import SelfNormalizedInverseProbabilityWeighting +from obp.ope import SubGaussianInverseProbabilityWeighting +from obp.ope import SubGaussianInverseProbabilityWeightingTuning from obp.types import BanditFeedback @@ -25,7 +27,7 @@ r"`lambda_` must be an instance of \(, \), not .", ), (-1.0, False, ValueError, "`lambda_`= -1.0, must be >= 0.0."), - (np.nan, False, ValueError, "lambda_ must not be nan"), + (np.nan, False, ValueError, "`lambda_` must not be nan"), ( 1.0, "s", @@ -55,6 +57,7 @@ def test_ipw_init_using_invalid_inputs( invalid_input_of_ipw_tuning_init = [ ( "", # + "mse", True, 0.05, False, @@ -63,6 +66,7 @@ def test_ipw_init_using_invalid_inputs( ), ( None, # + "slope", True, 0.05, False, @@ -71,6 +75,7 @@ def test_ipw_init_using_invalid_inputs( ), ( [""], # + "mse", True, 0.05, False, @@ -79,6 +84,7 @@ def test_ipw_init_using_invalid_inputs( ), ( [None], # + "slope", True, 0.05, False, @@ -87,6 +93,7 @@ def test_ipw_init_using_invalid_inputs( ), ( [], # + "mse", True, 0.05, False, @@ -95,15 +102,34 @@ def test_ipw_init_using_invalid_inputs( ), ( [-1.0], # + "slope", True, 0.05, False, ValueError, "`an element of lambdas`= -1.0, must be >= 0.0.", ), - ([np.nan], True, 0.05, False, ValueError, "an element of lambdas must not be nan"), + ( + [np.nan], + "mse", + True, + 0.05, + False, + ValueError, + "an element of lambdas must not be nan", + ), + ( + [1], + "", # + True, + 0.05, + False, + ValueError, + "`tuning_method` must be either 'slope' or 'mse'", + ), ( [1], + "mse", "", # 0.05, False, @@ -112,6 +138,7 @@ def test_ipw_init_using_invalid_inputs( ), ( [1], + "slope", None, # 0.05, False, @@ -120,6 +147,7 @@ def test_ipw_init_using_invalid_inputs( ), ( [1], + "mse", True, "", # False, @@ -128,6 +156,7 @@ def test_ipw_init_using_invalid_inputs( ), ( [1], + "slope", True, None, # False, @@ -136,6 +165,7 @@ def test_ipw_init_using_invalid_inputs( ), ( [1], + "mse", True, -1.0, # False, @@ -144,6 +174,7 @@ def test_ipw_init_using_invalid_inputs( ), ( [1], + "slope", True, 1.1, # False, @@ -152,6 +183,7 @@ def test_ipw_init_using_invalid_inputs( ), ( [1], + "slope", True, 1.0, "s", # @@ -162,11 +194,12 @@ def test_ipw_init_using_invalid_inputs( @pytest.mark.parametrize( - "lambdas, use_bias_upper_bound, delta, use_estimated_pscore, err, description", + "lambdas, tuning_method, use_bias_upper_bound, delta, use_estimated_pscore, err, description", invalid_input_of_ipw_tuning_init, ) def test_ipw_tuning_init_using_invalid_inputs( lambdas, + tuning_method, use_bias_upper_bound, delta, use_estimated_pscore, @@ -178,6 +211,7 @@ def test_ipw_tuning_init_using_invalid_inputs( use_bias_upper_bound=use_bias_upper_bound, delta=delta, lambdas=lambdas, + tuning_method=tuning_method, use_estimated_pscore=use_estimated_pscore, ) @@ -185,7 +219,18 @@ def test_ipw_tuning_init_using_invalid_inputs( # prepare ipw instances ipw = InverseProbabilityWeighting() snipw = SelfNormalizedInverseProbabilityWeighting() -ipw_tuning = InverseProbabilityWeightingTuning(lambdas=[10, 1000]) +ipw_tuning_mse = InverseProbabilityWeightingTuning( + lambdas=[10, 1000], tuning_method="mse" +) +ipw_tuning_slope = InverseProbabilityWeightingTuning( + lambdas=[10, 1000], tuning_method="slope" +) +sgipw_tuning_mse = SubGaussianInverseProbabilityWeightingTuning( + lambdas=[0.01, 0.1], tuning_method="mse" +) +sgipw_tuning_slope = SubGaussianInverseProbabilityWeightingTuning( + lambdas=[0.01, 0.1], tuning_method="slope" +) # 
action_dist, action, reward, pscore, position, use_estimated_pscore, estimated_pscore, description @@ -198,7 +243,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "action must be 1D array", + "`action` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -208,7 +253,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "reward must be 1D array", + "`reward` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -218,7 +263,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -228,7 +273,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "action elements must be non-negative integers", + "`action` elements must be integers in the range of", ), ( generate_action_dist(5, 4, 3), @@ -238,7 +283,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "action elements must be non-negative integers", + "`action` elements must be integers in the range of", ), ( generate_action_dist(5, 4, 3), @@ -248,7 +293,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "action must be 1D array", + "`action` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -258,7 +303,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "action must be 1D array", + "`action` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -268,7 +313,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - r"action elements must be smaller than`", + r"`action` elements must be integers in the range of`", ), ( generate_action_dist(5, 4, 3), @@ -278,7 +323,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "reward must be 1D array", + "`reward` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -288,7 +333,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "reward must be 1D array", + "`reward` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -308,7 +353,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -318,7 +363,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -338,7 +383,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), False, None, - "pscore must be positive", + "`pscore` must be positive", ), ( generate_action_dist(5, 4, 3), @@ -348,7 +393,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), True, None, # - "estimated_pscore must be 1D array", + "`estimated_pscore` must be 1D array", ), ( generate_action_dist(5, 4, 3), @@ -358,7 +403,7 @@ def test_ipw_tuning_init_using_invalid_inputs( np.random.choice(3, size=5), True, np.arange(5), # - "pscore must be positive", + "`pscore` must be positive", ), ] @@ -382,9 +427,15 @@ def test_ipw_using_invalid_input_data( snipw = SelfNormalizedInverseProbabilityWeighting( use_estimated_pscore=use_estimated_pscore ) + sgipw = SubGaussianInverseProbabilityWeighting( + 
use_estimated_pscore=use_estimated_pscore + ) ipw_tuning = InverseProbabilityWeightingTuning( lambdas=[10, 1000], use_estimated_pscore=use_estimated_pscore ) + sgipw_tuning = SubGaussianInverseProbabilityWeightingTuning( + lambdas=[0.01, 0.1], use_estimated_pscore=use_estimated_pscore + ) with pytest.raises(ValueError, match=f"{description}*"): _ = ipw.estimate_policy_value( action_dist=action_dist, @@ -439,6 +490,42 @@ def test_ipw_using_invalid_input_data( position=position, estimated_pscore=estimated_pscore, ) + with pytest.raises(ValueError, match=f"{description}*"): + _ = sgipw.estimate_policy_value( + action_dist=action_dist, + action=action, + reward=reward, + pscore=pscore, + position=position, + estimated_pscore=estimated_pscore, + ) + with pytest.raises(ValueError, match=f"{description}*"): + _ = sgipw.estimate_interval( + action_dist=action_dist, + action=action, + reward=reward, + pscore=pscore, + position=position, + estimated_pscore=estimated_pscore, + ) + with pytest.raises(ValueError, match=f"{description}*"): + _ = sgipw_tuning.estimate_policy_value( + action_dist=action_dist, + action=action, + reward=reward, + pscore=pscore, + position=position, + estimated_pscore=estimated_pscore, + ) + with pytest.raises(ValueError, match=f"{description}*"): + _ = sgipw_tuning.estimate_interval( + action_dist=action_dist, + action=action, + reward=reward, + pscore=pscore, + position=position, + estimated_pscore=estimated_pscore, + ) def test_ipw_using_random_evaluation_policy( @@ -456,7 +543,7 @@ def test_ipw_using_random_evaluation_policy( } input_dict["action_dist"] = action_dist # ipw estimators can be used without estimated_rewards_by_reg_model - for estimator in [ipw, snipw, ipw_tuning]: + for estimator in [ipw, snipw, ipw_tuning_mse, ipw_tuning_slope]: estimated_policy_value = estimator.estimate_policy_value(**input_dict) assert isinstance( estimated_policy_value, float diff --git a/tests/ope/test_ipw_estimators_continuous.py b/tests/ope/test_ipw_estimators_continuous.py index 6d73b5e1..93c01ca6 100644 --- a/tests/ope/test_ipw_estimators_continuous.py +++ b/tests/ope/test_ipw_estimators_continuous.py @@ -64,42 +64,42 @@ def test_synthetic_init(): np.ones(5), np.ones(5), np.random.uniform(size=5), - "action_by_evaluation_policy must be 1D array", + "`action_by_evaluation_policy` must be 1D array", ), ( np.ones((5, 1)), # np.ones(5), np.ones(5), np.random.uniform(size=5), - "action_by_evaluation_policy must be 1D array", + "`action_by_evaluation_policy` must be 1D array", ), ( np.ones(5), None, # np.ones(5), np.random.uniform(size=5), - "action_by_behavior_policy must be 1D array", + "`action_by_behavior_policy` must be 1D array", ), ( np.ones(5), np.ones((5, 1)), # np.ones(5), np.random.uniform(size=5), - "action_by_behavior_policy must be 1D array", + "`action_by_behavior_policy` must be 1D array", ), ( np.ones(5), np.ones(5), None, # np.random.uniform(size=5), - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.ones(5), np.ones(5), np.ones((5, 1)), # np.random.uniform(size=5), - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.ones(5), @@ -120,14 +120,14 @@ def test_synthetic_init(): np.ones(5), np.ones(5), None, # - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.ones(5), np.ones(5), np.ones(5), np.random.uniform(size=(5, 1)), # - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.ones(5), @@ -141,7 +141,7 @@ def test_synthetic_init(): np.ones(5), np.ones(5), np.arange(5), # - "pscore must be positive", + 
"`pscore` must be positive", ), ] diff --git a/tests/ope/test_ipw_estimators_slate.py b/tests/ope/test_ipw_estimators_slate.py index d947c535..66c73e71 100644 --- a/tests/ope/test_ipw_estimators_slate.py +++ b/tests/ope/test_ipw_estimators_slate.py @@ -33,7 +33,7 @@ np.ones(n_rounds * len_list), "4", # np.ones(n_rounds * len_list), - "position must be 1D array", + "`position` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -41,7 +41,7 @@ np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds).reshape((n_rounds, len_list)), # np.ones(n_rounds * len_list), - "position must be 1D array", + "`position` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -49,7 +49,7 @@ np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds) - 1, # np.ones(n_rounds * len_list), - "position elements must be non-negative integers", + "`position` elements must be non-negative integers", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -57,7 +57,7 @@ np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -65,7 +65,7 @@ np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "reward must be 1D array", + "`reward` must be 1D array", ), ( "4", # @@ -73,7 +73,7 @@ np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "slate_id must be 1D array", + "`slate_id` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list).reshape((n_rounds, len_list)), # @@ -81,7 +81,7 @@ np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "slate_id must be 1D array", + "`slate_id` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list) - 1, # @@ -97,7 +97,7 @@ np.ones(n_rounds * len_list), np.repeat(np.arange(n_rounds), len_list), # np.ones(n_rounds * len_list), - "position must not be duplicated in each slate", + "`position` must not be duplicated in each slate", ), ] @@ -321,7 +321,7 @@ def test_slate_estimators_using_valid_input_data( "4", # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -329,7 +329,7 @@ def test_slate_estimators_using_valid_input_data( np.ones((n_rounds, len_list)), # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -337,7 +337,7 @@ def test_slate_estimators_using_valid_input_data( np.ones(n_rounds * len_list) + 1, # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore must be in the range of", + "`pscore` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -345,7 +345,7 @@ def test_slate_estimators_using_valid_input_data( np.ones(n_rounds * len_list) - 1, # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore must be in the range of", + "`pscore` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -353,7 +353,7 @@ def test_slate_estimators_using_valid_input_data( np.ones(n_rounds * len_list - 1), # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "slate_id, position, reward, pscore, and evaluation_policy_pscore must have the same number of samples", + "`slate_id`, 
`position`, `reward`, `pscore`, and `evaluation_policy_pscore` must have the same number of samples", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -361,7 +361,7 @@ def test_slate_estimators_using_valid_input_data( np.hstack([np.ones(n_rounds * len_list - 1), [0.2]]), # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore must be unique in each slate", + "`pscore` must be unique in each slate", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -369,7 +369,7 @@ def test_slate_estimators_using_valid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), "4", # - "evaluation_policy_pscore must be 1D array", + "`evaluation_policy_pscore` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -377,7 +377,7 @@ def test_slate_estimators_using_valid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones((n_rounds, len_list)), # - "evaluation_policy_pscore must be 1D array", + "`evaluation_policy_pscore` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -385,7 +385,7 @@ def test_slate_estimators_using_valid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list) + 1, # - "evaluation_policy_pscore must be in the range of", + "`evaluation_policy_pscore` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -393,7 +393,7 @@ def test_slate_estimators_using_valid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list) - 1.1, # - "evaluation_policy_pscore must be in the range of", + "`evaluation_policy_pscore` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -401,7 +401,7 @@ def test_slate_estimators_using_valid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.hstack([np.ones(n_rounds * len_list - 1), [0.2]]), # - "evaluation_policy_pscore must be unique in each slate", + "`evaluation_policy_pscore` must be unique in each slate", ), ] @@ -453,7 +453,7 @@ def test_sips_using_invalid_input_data( "4", # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore_item_position must be 1D array", + "`pscore_item_position` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -461,7 +461,7 @@ def test_sips_using_invalid_input_data( np.ones((n_rounds, len_list)), # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore_item_position must be 1D array", + "`pscore_item_position` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -469,7 +469,7 @@ def test_sips_using_invalid_input_data( np.ones(n_rounds * len_list) + 1, # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore_item_position must be in the range of", + "`pscore_item_position` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -477,7 +477,7 @@ def test_sips_using_invalid_input_data( np.ones(n_rounds * len_list) - 1, # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore_item_position must be in the range of", + "`pscore_item_position` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -485,7 +485,7 @@ def test_sips_using_invalid_input_data( np.ones(n_rounds * len_list - 1), # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "slate_id, position, reward, pscore_item_position, and evaluation_policy_pscore_item_position must have the same 
number of samples", + "`slate_id`, `position`, `reward`, `pscore_item_position`, and `evaluation_policy_pscore_item_position` must have the same number of samples", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -493,7 +493,7 @@ def test_sips_using_invalid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), "4", # - "evaluation_policy_pscore_item_position must be 1D array", + "`evaluation_policy_pscore_item_position` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -501,7 +501,7 @@ def test_sips_using_invalid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones((n_rounds, len_list)), # - "evaluation_policy_pscore_item_position must be 1D array", + "`evaluation_policy_pscore_item_position` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -509,7 +509,7 @@ def test_sips_using_invalid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list) + 1, # - "evaluation_policy_pscore_item_position must be in the range of", + "`evaluation_policy_pscore_item_position` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -517,7 +517,7 @@ def test_sips_using_invalid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list) - 1.1, # - "evaluation_policy_pscore_item_position must be in the range of", + "`evaluation_policy_pscore_item_position` must be in the range of", ), ] @@ -574,7 +574,7 @@ def test_iips_using_invalid_input_data( "4", # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore_cascade must be 1D array", + "`pscore_cascade` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -582,7 +582,7 @@ def test_iips_using_invalid_input_data( np.ones((n_rounds, len_list)), # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore_cascade must be 1D array", + "`pscore_cascade` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -590,7 +590,7 @@ def test_iips_using_invalid_input_data( np.ones(n_rounds * len_list) + 1, # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore_cascade must be in the range of", + "`pscore_cascade` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -598,7 +598,7 @@ def test_iips_using_invalid_input_data( np.ones(n_rounds * len_list) - 1, # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore_cascade must be in the range of", + "`pscore_cascade` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -606,7 +606,7 @@ def test_iips_using_invalid_input_data( np.ones(n_rounds * len_list - 1), # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "slate_id, position, reward, pscore_cascade, and evaluation_policy_pscore_cascade must have the same number of samples", + "`slate_id`, `position`, `reward`, `pscore_cascade`, and `evaluation_policy_pscore_cascade` must have the same number of samples", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -614,7 +614,7 @@ def test_iips_using_invalid_input_data( np.hstack([[0.2], np.ones(n_rounds * len_list - 1)]), # np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list), - "pscore_cascade must be non-increasing sequence in each slate", + "`pscore_cascade` must be non-increasing sequence in each slate", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -622,7 +622,7 @@ def 
test_iips_using_invalid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), "4", # - "evaluation_policy_pscore_cascade must be 1D array", + "`evaluation_policy_pscore_cascade` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -630,7 +630,7 @@ def test_iips_using_invalid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones((n_rounds, len_list)), # - "evaluation_policy_pscore_cascade must be 1D array", + "`evaluation_policy_pscore_cascade` must be 1D array", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -638,7 +638,7 @@ def test_iips_using_invalid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list) + 1, # - "evaluation_policy_pscore_cascade must be in the range of", + "`evaluation_policy_pscore_cascade` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -646,7 +646,7 @@ def test_iips_using_invalid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.ones(n_rounds * len_list) - 1.1, # - "evaluation_policy_pscore_cascade must be in the range of", + "`evaluation_policy_pscore_cascade` must be in the range of", ), ( np.repeat(np.arange(n_rounds), len_list), @@ -654,7 +654,7 @@ def test_iips_using_invalid_input_data( np.ones(n_rounds * len_list), np.tile(np.arange(len_list), n_rounds), np.hstack([[0.2], np.ones(n_rounds * len_list - 1)]), # - "evaluation_policy_pscore_cascade must be non-increasing sequence in each slate", + "`evaluation_policy_pscore_cascade` must be non-increasing sequence in each slate", ), ] diff --git a/tests/ope/test_meta.py b/tests/ope/test_meta.py index 1cb804f4..a73dc5e7 100644 --- a/tests/ope/test_meta.py +++ b/tests/ope/test_meta.py @@ -51,13 +51,13 @@ def estimate_policy_value( Parameters ---------- position: array-like, shape (n_rounds,) - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. + Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. Returns ---------- @@ -80,13 +80,13 @@ def estimate_interval( Parameters ---------- position: array-like, shape (n_rounds,) - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) - Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`. 
+ Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_i,a_i)`. alpha: float, default=0.05 Significance level. @@ -141,22 +141,22 @@ def estimate_policy_value( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. position: array-like, shape (n_rounds,) - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) - Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. Returns ---------- @@ -183,23 +183,23 @@ def estimate_interval( Parameters ---------- reward: array-like, shape (n_rounds,) - Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`. + Reward observed for each data in logged bandit data, i.e., :math:`r_i`. action: array-like, shape (n_rounds,) - Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`. + Action sampled by behavior policy for each data in logged bandit data, i.e., :math:`a_i`. position: array-like, shape (n_rounds,) - Position of recommendation interface where action was presented in each round of the given logged bandit data. + Position in a recommendation interface where the action was presented. pscore: array-like, shape (n_rounds,) - Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`. + Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_i|x_i)`. action_dist: array-like, shape (n_rounds, n_actions, len_list) Action choice probabilities - by the evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`. + by the evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_i|x_i)`. estimated_pscore: array-like, shape (n_rounds,), default=None - Estimated action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_t|x_t)`. + Estimated behavior policy (propensity scores), i.e., :math:`\\hat{\\pi}_b(a_i|x_i)`. alpha: float, default=0.05 Significance level. 
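The docstring hunks above settle on per-round notation: reward r_i, action a_i, behavior-policy propensity pscore = pi_b(a_i|x_i), and evaluation-policy distribution action_dist = pi_e(a_i|x_i). A minimal NumPy sketch of the importance-weighted value that the real (non-mock) IPW-style estimators compute from these inputs is given below; the toy sizes and random inputs are assumptions made only for illustration, and the two quantities roughly mirror what InverseProbabilityWeighting and SelfNormalizedInverseProbabilityWeighting return with default settings (no weight clipping).

import numpy as np

rng = np.random.default_rng(12345)
n_rounds, n_actions, len_list = 5, 4, 1          # toy sizes, assumed for illustration
action = rng.integers(n_actions, size=n_rounds)  # a_i sampled by the behavior policy
reward = rng.binomial(1, 0.5, size=n_rounds)     # observed reward r_i
position = np.zeros(n_rounds, dtype=int)         # single-slot recommendation interface
pscore = rng.uniform(0.2, 1.0, size=n_rounds)    # pi_b(a_i|x_i)
# pi_e(a|x_i): a probability distribution over actions for each round and slot,
# shape (n_rounds, n_actions, len_list) as in the docstrings
action_dist = rng.dirichlet(np.ones(n_actions), size=n_rounds)[:, :, np.newaxis]

# importance weight w_i = pi_e(a_i|x_i) / pi_b(a_i|x_i)
iw = action_dist[np.arange(n_rounds), action, position] / pscore
ipw_value = (iw * reward).mean()              # vanilla IPW estimate of the policy value
snipw_value = (iw * reward).sum() / iw.sum()  # self-normalized variant
print(ipw_value, snipw_value)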
@@ -298,11 +298,11 @@ def test_meta_estimated_rewards_by_reg_model_inputs( ( np.zeros((2, 3, 4)), {"dm": None}, - r"estimated_rewards_by_reg_model\[dm\] must be 3D array", + r"`estimated_rewards_by_reg_model\[dm\]` must be 3D array", ), - (np.zeros((2, 3)), None, "action_dist must be 3D array"), - ("3", None, "action_dist must be 3D array"), - (None, None, "action_dist must be 3D array"), + (np.zeros((2, 3)), None, "`action_dist` must be 3D array"), + ("3", None, "`action_dist` must be 3D array"), + (None, None, "`action_dist` must be 3D array"), ] valid_input_of_create_estimator_inputs = [ @@ -316,7 +316,7 @@ def test_meta_estimated_rewards_by_reg_model_inputs( {"dm": np.zeros((2, 3, 4))}, "same shape", ), - (np.zeros((2, 3, 1)), None, "estimated_rewards_by_reg_model is None"), + (np.zeros((2, 3, 1)), None, "`estimated_rewards_by_reg_model` is None"), ] @@ -671,7 +671,7 @@ def test_meta_summarize_off_policy_estimates( invalid_input_of_evaluation_performance_of_estimators = [ - ("foo", 0.3, ValueError, "metric must be either 'relative-ee' or 'se'"), + ("foo", 0.3, ValueError, "`metric` must be either 'relative-ee' or 'se'"), ( "se", 1, @@ -688,7 +688,7 @@ def test_meta_summarize_off_policy_estimates( "relative-ee", 0.0, ValueError, - "ground_truth_policy_value must be non-zero when metric is relative-ee", + "`ground_truth_policy_value` must be non-zero when metric is relative-ee", ), ] diff --git a/tests/ope/test_meta_continuous.py b/tests/ope/test_meta_continuous.py index f14894c4..d14baa78 100644 --- a/tests/ope/test_meta_continuous.py +++ b/tests/ope/test_meta_continuous.py @@ -227,15 +227,15 @@ def test_meta_estimated_rewards_by_reg_model_inputs( ( np.zeros(5), {"dr": None}, - r"estimated_rewards_by_reg_model\[dr\] must be 1D array", + r"`estimated_rewards_by_reg_model\[dr\]` must be 1D array", ), ( np.zeros((2, 3)), None, - "action_by_evaluation_policy must be 1D array", + "`action_by_evaluation_policy` must be 1D array", ), - ("3", None, "action_by_evaluation_policy must be 1D array"), - (None, None, "action_by_evaluation_policy must be 1D array"), + ("3", None, "`action_by_evaluation_policy` must be 1D array"), + (None, None, "`action_by_evaluation_policy` must be 1D array"), ] valid_input_of_create_estimator_inputs = [ @@ -249,7 +249,7 @@ def test_meta_estimated_rewards_by_reg_model_inputs( {"dr": np.zeros(5)}, "same shape", ), - (np.zeros(5), None, "estimated_rewards_by_reg_model is None"), + (np.zeros(5), None, "`estimated_rewards_by_reg_model` is None"), ] @@ -601,7 +601,7 @@ def test_meta_summarize_off_policy_estimates( invalid_input_of_evaluation_performance_of_estimators = [ - ("foo", 0.3, ValueError, "metric must be either 'relative-ee' or 'se'"), + ("foo", 0.3, ValueError, "`metric` must be either 'relative-ee' or 'se'"), ( "se", 1, @@ -618,7 +618,7 @@ def test_meta_summarize_off_policy_estimates( "relative-ee", 0.0, ValueError, - "ground_truth_policy_value must be non-zero when metric is relative-ee", + "`ground_truth_policy_value` must be non-zero when metric is relative-ee", ), ] diff --git a/tests/ope/test_meta_slate.py b/tests/ope/test_meta_slate.py index 0ed30c27..308b8710 100644 --- a/tests/ope/test_meta_slate.py +++ b/tests/ope/test_meta_slate.py @@ -10,6 +10,7 @@ from pandas.testing import assert_frame_equal import pytest +from obp.ope import SlateCascadeDoublyRobust from obp.ope import SlateIndependentIPS from obp.ope import SlateOffPolicyEvaluation from obp.ope import SlateRewardInteractionIPS @@ -133,7 +134,7 @@ def estimate_interval( @dataclass class 
SlateRewardInteractionIPSMock(SlateRewardInteractionIPS): - """Slate Recursive Inverse Propensity Scoring (RIPS) Mock.""" + """Slate Reward Interaction Inverse Propensity Scoring (RIPS) Mock.""" estimator_name: str = "rips" @@ -183,12 +184,75 @@ def estimate_interval( return {k: v for k, v in mock_confidence_interval.items()} +@dataclass +class SlateCascadeDoublyRobustMock(SlateCascadeDoublyRobust): + """Slate Cascade Doubly Robust (Cascade-DR) Mock.""" + + estimator_name: str = "cascade-dr" + + def estimate_policy_value( + self, + slate_id: np.ndarray, + action: np.ndarray, + reward: np.ndarray, + position: np.ndarray, + pscore_cascade: np.ndarray, + evaluation_policy_pscore_cascade: np.ndarray, + evaluation_policy_action_dist: np.ndarray, + q_hat: np.ndarray, + **kwargs, + ) -> float: + """Estimate the policy value of evaluation policy. + + Returns + ---------- + mock_policy_value: float + + """ + return mock_policy_value + + def estimate_interval( + self, + slate_id: np.ndarray, + action: np.ndarray, + reward: np.ndarray, + position: np.ndarray, + pscore_cascade: np.ndarray, + evaluation_policy_pscore_cascade: np.ndarray, + evaluation_policy_action_dist: np.ndarray, + q_hat: np.ndarray, + alpha: float = 0.05, + n_bootstrap_samples: int = 10000, + random_state: Optional[int] = None, + **kwargs, + ) -> Dict[str, float]: + """Estimate confidence interval of policy value by nonparametric bootstrap procedure. + + Returns + ---------- + mock_confidence_interval: Dict[str, float] + Dictionary storing the estimated mean and upper-lower confidence bounds. + + """ + check_confidence_interval_arguments( + alpha=alpha, + n_bootstrap_samples=n_bootstrap_samples, + random_state=random_state, + ) + return {k: v for k, v in mock_confidence_interval.items()} + + # define Mock instances -sips = SlateStandardIPSMock(len_list=3) -sips2 = SlateStandardIPSMock(len_list=3, eps=0.02) -sips3 = SlateStandardIPSMock(len_list=3, estimator_name="sips3") -iips = SlateIndependentIPSMock(len_list=3) -rips = SlateRewardInteractionIPSMock(len_list=3) +len_list = 3 +n_unique_action = 10 +sips = SlateStandardIPSMock(len_list=len_list) +sips2 = SlateStandardIPSMock(len_list=len_list, eps=0.02) +sips3 = SlateStandardIPSMock(len_list=len_list, estimator_name="sips3") +iips = SlateIndependentIPSMock(len_list=len_list) +rips = SlateRewardInteractionIPSMock(len_list=len_list) +cascade_dr = SlateCascadeDoublyRobustMock( + len_list=len_list, n_unique_action=n_unique_action +) def test_meta_post_init(synthetic_slate_bandit_feedback: BanditFeedback) -> None: @@ -211,7 +275,13 @@ def test_meta_post_init(synthetic_slate_bandit_feedback: BanditFeedback) -> None "sips3": sips3, }, "__post_init__ returns a wrong value" # __post__init__ raises RuntimeError when necessary_keys are not included in the bandit_feedback - necessary_keys = ["slate_id", "position", "reward"] + necessary_keys = [ + "slate_id", + "context", + "action", + "reward", + "position", + ] for i in range(len(necessary_keys)): for deleted_keys in itertools.combinations(necessary_keys, i + 1): invalid_bandit_feedback_dict = {key: "_" for key in necessary_keys} @@ -224,31 +294,49 @@ def test_meta_post_init(synthetic_slate_bandit_feedback: BanditFeedback) -> None ) -# evaluation_policy_pscore, description +# evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description invalid_input_of_create_estimator_inputs = [ ( None, + np.ones(100 * len_list * n_unique_action) / n_unique_action, + np.ones(100 * len_list * n_unique_action), "one of 
evaluation_policy_pscore, evaluation_policy_pscore_item_position, or evaluation_policy_pscore_cascade must be given", ), + ( + np.ones(300), + None, + np.ones(100 * len_list * n_unique_action), + "evaluation_policy_action_dist must be given", + ), + ( + np.ones(300), + np.ones(100 * len_list * n_unique_action) / n_unique_action, + None, + "q_hat must be given", + ), ] -# evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, description +# evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description valid_input_of_create_estimator_inputs = [ ( np.ones(300), np.ones(300), np.ones(300), - "deterministic evaluation policy", + np.ones(300 * n_unique_action) / n_unique_action, + np.ones(100 * len_list * n_unique_action), + "evaluation policy", ), ] @pytest.mark.parametrize( - "evaluation_policy_pscore, description", + "evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description", invalid_input_of_create_estimator_inputs, ) def test_meta_create_estimator_inputs_using_invalid_input_data( - evaluation_policy_pscore, + evaluation_policy_pscore_cascade, + evaluation_policy_action_dist, + q_hat, description: str, synthetic_slate_bandit_feedback: BanditFeedback, ) -> None: @@ -256,44 +344,61 @@ def test_meta_create_estimator_inputs_using_invalid_input_data( Test the _create_estimator_inputs using valid data and a sips estimator """ ope_ = SlateOffPolicyEvaluation( - bandit_feedback=synthetic_slate_bandit_feedback, ope_estimators=[sips] + bandit_feedback=synthetic_slate_bandit_feedback, + ope_estimators=[cascade_dr], ) # raise ValueError when the shape of two arrays are different with pytest.raises(ValueError, match=f"{description}*"): _ = ope_._create_estimator_inputs( - evaluation_policy_pscore=evaluation_policy_pscore + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) # _create_estimator_inputs function is called in the following functions with pytest.raises(ValueError, match=f"{description}*"): _ = ope_.estimate_policy_values( - evaluation_policy_pscore=evaluation_policy_pscore + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) with pytest.raises(ValueError, match=f"{description}*"): - _ = ope_.estimate_intervals(evaluation_policy_pscore=evaluation_policy_pscore) + _ = ope_.estimate_intervals( + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, + ) with pytest.raises(ValueError, match=f"{description}*"): _ = ope_.summarize_off_policy_estimates( - evaluation_policy_pscore=evaluation_policy_pscore + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) with pytest.raises(ValueError, match=f"{description}*"): _ = ope_.evaluate_performance_of_estimators( ground_truth_policy_value=0.1, - evaluation_policy_pscore=evaluation_policy_pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) with pytest.raises(ValueError, match=f"{description}*"): _ = ope_.summarize_estimators_comparison( ground_truth_policy_value=0.1, - evaluation_policy_pscore=evaluation_policy_pscore, + 
evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) @pytest.mark.parametrize( - "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, description", + "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description", valid_input_of_create_estimator_inputs, ) def test_meta_create_estimator_inputs_using_valid_input_data( evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, + evaluation_policy_action_dist, + q_hat, description: str, synthetic_slate_bandit_feedback: BanditFeedback, ) -> None: @@ -308,6 +413,8 @@ def test_meta_create_estimator_inputs_using_valid_input_data( ) assert set(estimator_inputs.keys()) == set( [ + "slate_id", + "action", "reward", "pscore", "pscore_item_position", @@ -317,7 +424,9 @@ def test_meta_create_estimator_inputs_using_valid_input_data( "evaluation_policy_pscore_item_position", "evaluation_policy_pscore_cascade", "slate_id", - ] + "evaluation_policy_action_dist", + "q_hat", + ], ), f"Invalid response of _create_estimator_inputs (test case: {description})" # _create_estimator_inputs function is called in the following functions _ = ope_.estimate_policy_values(evaluation_policy_pscore=evaluation_policy_pscore) @@ -331,16 +440,30 @@ def test_meta_create_estimator_inputs_using_valid_input_data( _ = ope_.summarize_estimators_comparison( ground_truth_policy_value=0.1, evaluation_policy_pscore=evaluation_policy_pscore ) + # check if the valid values are returned when using cascade-dr + ope_ = SlateOffPolicyEvaluation( + bandit_feedback=synthetic_slate_bandit_feedback, + ope_estimators=[cascade_dr], + ) + estimator_inputs = ope_._create_estimator_inputs( + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, + ) + assert estimator_inputs["evaluation_policy_action_dist"] is not None + assert estimator_inputs["q_hat"] is not None @pytest.mark.parametrize( - "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, description", + "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description", valid_input_of_create_estimator_inputs, ) def test_meta_estimate_policy_values_using_valid_input_data( evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, + evaluation_policy_action_dist, + q_hat, description: str, synthetic_slate_bandit_feedback: BanditFeedback, ) -> None: @@ -359,44 +482,42 @@ def test_meta_estimate_policy_values_using_valid_input_data( # multiple ope estimators ope_ = SlateOffPolicyEvaluation( bandit_feedback=synthetic_slate_bandit_feedback, - ope_estimators=[iips, sips, rips], + ope_estimators=[iips, sips, rips, cascade_dr], ) assert ope_.estimate_policy_values( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) == { "iips": mock_policy_value, "sips": mock_policy_value + sips.eps, "rips": mock_policy_value, - }, "SlateOffPolicyEvaluation.estimate_policy_values ([IIPS, SIPS, RIPS]) returns a wrong value" 
+ "cascade-dr": mock_policy_value, + }, "SlateOffPolicyEvaluation.estimate_policy_values ([IIPS, SIPS, RIPS, Cascade-DR]) returns a wrong value" @pytest.mark.parametrize( - "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, description", + "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description", valid_input_of_create_estimator_inputs, ) def test_meta_estimate_policy_values_using_various_pscores( evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, + evaluation_policy_action_dist, + q_hat, description: str, synthetic_slate_bandit_feedback: BanditFeedback, ) -> None: - necessary_keys = [ - "reward", - "position", - "evaluation_policy_pscore", - "evaluation_policy_pscore_item_position", - "evaluation_policy_pscore_cascade" "slate_id", - ] pscore_keys = [ "pscore", "pscore_item_position", "pscore_cascade", ] # TypeError must be raised when required positional arguments are missing - for i in range(len(necessary_keys)): + for i in range(len(pscore_keys)): for deleted_keys in itertools.combinations(pscore_keys, i + 1): copied_feedback = deepcopy(synthetic_slate_bandit_feedback) # delete @@ -408,23 +529,38 @@ def test_meta_estimate_policy_values_using_various_pscores( ): ope_ = SlateOffPolicyEvaluation( bandit_feedback=copied_feedback, - ope_estimators=[sips, iips, rips], + ope_estimators=[sips, iips, rips, cascade_dr], ) _ = ope_.estimate_policy_values( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, ) + # pscore_item_position and evaluation_policy_pscore_item_position are not necessary when iips is not evaluated copied_feedback = deepcopy(synthetic_slate_bandit_feedback) del copied_feedback["pscore_item_position"] ope_ = SlateOffPolicyEvaluation( bandit_feedback=copied_feedback, - ope_estimators=[sips, rips], + ope_estimators=[sips, rips, cascade_dr], ) _ = ope_.estimate_policy_values( evaluation_policy_pscore=evaluation_policy_pscore, evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, + ) + # evaluation_policy_action_dist and q_hat are not necessary when cascade-dr is not used + ope_ = SlateOffPolicyEvaluation( + bandit_feedback=synthetic_slate_bandit_feedback, + ope_estimators=[sips, rips, iips], + ) + _ = ope_.estimate_policy_values( + evaluation_policy_pscore=evaluation_policy_pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, + evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, ) @@ -463,7 +599,7 @@ def test_meta_estimate_policy_values_using_various_pscores( @pytest.mark.parametrize( - "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, description_1", + "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description_1", valid_input_of_create_estimator_inputs, ) @pytest.mark.parametrize( @@ -474,6 +610,8 @@ def test_meta_estimate_intervals_using_invalid_input_data( evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, + evaluation_policy_action_dist, + 
q_hat, description_1: str, alpha, n_bootstrap_samples, @@ -506,7 +644,7 @@ def test_meta_estimate_intervals_using_invalid_input_data( @pytest.mark.parametrize( - "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, description_1", + "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description_1", valid_input_of_create_estimator_inputs, ) @pytest.mark.parametrize( @@ -517,6 +655,8 @@ def test_meta_estimate_intervals_using_valid_input_data( evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, + evaluation_policy_action_dist, + q_hat, description_1: str, alpha: float, n_bootstrap_samples: int, @@ -541,22 +681,28 @@ def test_meta_estimate_intervals_using_valid_input_data( }, "SlateOffPolicyEvaluation.estimate_intervals ([IIPS]) returns a wrong value" # multiple ope estimators ope_ = SlateOffPolicyEvaluation( - bandit_feedback=synthetic_slate_bandit_feedback, ope_estimators=[iips, sips] + bandit_feedback=synthetic_slate_bandit_feedback, + ope_estimators=[iips, rips, cascade_dr, sips], ) assert ope_.estimate_intervals( evaluation_policy_pscore=evaluation_policy_pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade, evaluation_policy_pscore_item_position=evaluation_policy_pscore_item_position, + evaluation_policy_action_dist=evaluation_policy_action_dist, + q_hat=q_hat, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) == { "iips": mock_confidence_interval, + "rips": mock_confidence_interval, + "cascade-dr": mock_confidence_interval, "sips": {k: v + sips.eps for k, v in mock_confidence_interval.items()}, }, "SlateOffPolicyEvaluation.estimate_intervals ([IIPS, SIPS]) returns a wrong value" @pytest.mark.parametrize( - "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, description_1", + "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description_1", valid_input_of_create_estimator_inputs, ) @pytest.mark.parametrize( @@ -567,6 +713,8 @@ def test_meta_summarize_off_policy_estimates( evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, + evaluation_policy_action_dist, + q_hat, description_1: str, alpha: float, n_bootstrap_samples: int, @@ -633,7 +781,7 @@ def test_meta_summarize_off_policy_estimates( invalid_input_of_evaluation_performance_of_estimators = [ - ("foo", 0.3, ValueError, "metric must be either 'relative-ee' or 'se'"), + ("foo", 0.3, ValueError, "`metric` must be either 'relative-ee' or 'se'"), ( "se", 1, @@ -650,7 +798,7 @@ def test_meta_summarize_off_policy_estimates( "relative-ee", 0.0, ValueError, - "ground_truth_policy_value must be non-zero when metric is relative-ee", + "`ground_truth_policy_value` must be non-zero when metric is relative-ee", ), ] @@ -661,7 +809,7 @@ def test_meta_summarize_off_policy_estimates( @pytest.mark.parametrize( - "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, description_1", + "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description_1", valid_input_of_create_estimator_inputs, ) @pytest.mark.parametrize( @@ -672,6 +820,8 @@ def 
test_meta_evaluate_performance_of_estimators_using_invalid_input_data( evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, + evaluation_policy_action_dist, + q_hat, description_1: str, metric, ground_truth_policy_value, @@ -701,7 +851,7 @@ def test_meta_evaluate_performance_of_estimators_using_invalid_input_data( @pytest.mark.parametrize( - "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, description_1", + "evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, evaluation_policy_action_dist, q_hat, description_1", valid_input_of_create_estimator_inputs, ) @pytest.mark.parametrize( @@ -712,6 +862,8 @@ def test_meta_evaluate_performance_of_estimators_using_valid_input_data( evaluation_policy_pscore, evaluation_policy_pscore_item_position, evaluation_policy_pscore_cascade, + evaluation_policy_action_dist, + q_hat, description_1: str, metric, ground_truth_policy_value, diff --git a/tests/ope/test_offline_estimation_performance.py b/tests/ope/test_offline_estimation_performance.py index 33f7022e..971b31b3 100644 --- a/tests/ope/test_offline_estimation_performance.py +++ b/tests/ope/test_offline_estimation_performance.py @@ -16,23 +16,22 @@ from obp.dataset import SyntheticBanditDataset from obp.ope import BalancedInverseProbabilityWeighting from obp.ope import DirectMethod -from obp.ope import DoublyRobust from obp.ope import DoublyRobustTuning -from obp.ope import DoublyRobustWithShrinkage from obp.ope import DoublyRobustWithShrinkageTuning -from obp.ope import InverseProbabilityWeighting -from obp.ope import InverseProbabilityWeightingTuning from obp.ope import ImportanceWeightEstimator +from obp.ope import InverseProbabilityWeightingTuning from obp.ope import OffPolicyEvaluation from obp.ope import PropensityScoreEstimator from obp.ope import RegressionModel from obp.ope import SelfNormalizedDoublyRobust from obp.ope import SelfNormalizedInverseProbabilityWeighting -from obp.ope import SwitchDoublyRobust +from obp.ope import SubGaussianDoublyRobustTuning +from obp.ope import SubGaussianInverseProbabilityWeightingTuning from obp.ope import SwitchDoublyRobustTuning from obp.ope.estimators import BaseOffPolicyEstimator from obp.policy import IPWLearner + # hyperparameters of the regression model used in model dependent OPE estimators hyperparams = { "lightgbm": { @@ -106,18 +105,10 @@ ] bipw_model_configurations = { - "bipw (random_forest raw)": dict( - fitting_method="raw", - base_model=RandomForestClassifier(**hyperparams["random_forest"]), - ), "bipw (random_forest sample)": dict( fitting_method="sample", base_model=RandomForestClassifier(**hyperparams["random_forest"]), ), - "bipw (svc raw)": dict( - fitting_method="raw", - base_model=SVC(**hyperparams["svc"]), - ), "bipw (svc sample)": dict( fitting_method="sample", base_model=SVC(**hyperparams["svc"]), @@ -157,49 +148,86 @@ def estimate_interval(self) -> Dict[str, float]: ope_estimators = [ NaiveEstimator(), DirectMethod(), - InverseProbabilityWeighting(), InverseProbabilityWeightingTuning( - lambdas=[100, 1000, np.inf], estimator_name="ipw (tuning)" + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + tuning_method="mse", + estimator_name="ipw (tuning-mse)", + ), + InverseProbabilityWeightingTuning( + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + tuning_method="slope", + estimator_name="ipw (tuning-slope)", + ), + SubGaussianInverseProbabilityWeightingTuning( + lambdas=[0.0001, 0.01], + 
tuning_method="mse", + estimator_name="sg-ipw (tuning-mse)", ), SelfNormalizedInverseProbabilityWeighting(), - DoublyRobust(), - DoublyRobustTuning(lambdas=[100, 1000, np.inf], estimator_name="dr (tuning)"), + DoublyRobustTuning( + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + tuning_method="mse", + estimator_name="dr (tuning-mse)", + ), + DoublyRobustTuning( + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + tuning_method="slope", + estimator_name="dr (tuning-slope)", + ), SelfNormalizedDoublyRobust(), - SwitchDoublyRobust(lambda_=1.0, estimator_name="switch-dr (lambda=1)"), - SwitchDoublyRobust(lambda_=100.0, estimator_name="switch-dr (lambda=100)"), SwitchDoublyRobustTuning( - lambdas=[100, 1000, np.inf], estimator_name="switch-dr (tuning)" + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + tuning_method="mse", + estimator_name="switch-dr (tuning-mse)", + ), + SwitchDoublyRobustTuning( + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + tuning_method="slope", + estimator_name="switch-dr (tuning-slope)", + ), + DoublyRobustWithShrinkageTuning( + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + tuning_method="mse", + estimator_name="dr-os (tuning-mse)", ), - DoublyRobustWithShrinkage(lambda_=1.0, estimator_name="dr-os (lambda=1)"), - DoublyRobustWithShrinkage(lambda_=100.0, estimator_name="dr-os (lambda=100)"), DoublyRobustWithShrinkageTuning( - lambdas=[100, 1000, np.inf], estimator_name="dr-os (tuning)" + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + tuning_method="slope", + estimator_name="dr-os (tuning-slope)", + ), + SubGaussianDoublyRobustTuning( + lambdas=[0.005, 0.01, 0.05, 0.1, 0.5], + tuning_method="mse", + estimator_name="sg-dr (tuning-mse)", ), - InverseProbabilityWeighting( - lambda_=100, + SubGaussianDoublyRobustTuning( + lambdas=[0.005, 0.01, 0.05, 0.1, 0.5], + tuning_method="slope", + estimator_name="sg-dr (tuning-slope)", + ), + InverseProbabilityWeightingTuning( + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], estimator_name="cipw (estimated pscore)", use_estimated_pscore=True, ), SelfNormalizedInverseProbabilityWeighting( estimator_name="snipw (estimated pscore)", use_estimated_pscore=True ), - DoublyRobust(estimator_name="dr (estimated pscore)", use_estimated_pscore=True), - DoublyRobustWithShrinkage( - lambda_=500, - estimator_name="dr-os (estimated pscore)", + DoublyRobustTuning( + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + estimator_name="dr (estimated pscore)", use_estimated_pscore=True, ), - BalancedInverseProbabilityWeighting( - estimator_name="bipw (svc raw)", lambda_=np.inf - ), - BalancedInverseProbabilityWeighting( - estimator_name="bipw (svc sample)", lambda_=np.inf + DoublyRobustWithShrinkageTuning( + lambdas=[10, 50, 100, 500, 1000, 5000, np.inf], + estimator_name="dr-os (estimated pscore)", + use_estimated_pscore=True, ), BalancedInverseProbabilityWeighting( - estimator_name="bipw (random_forest raw)", lambda_=np.inf + estimator_name="bipw (svc sample)", lambda_=100 ), BalancedInverseProbabilityWeighting( - estimator_name="bipw (random_forest sample)", lambda_=np.inf + estimator_name="bipw (random_forest sample)", lambda_=100 ), ] @@ -221,7 +249,7 @@ def process(i: int): dataset = SyntheticBanditDataset( n_actions=n_actions, dim_context=dim_context, - beta=-2.0, + beta=3.0, reward_function=logistic_reward_function, random_state=i, ) @@ -243,10 +271,10 @@ def process(i: int): pscore=bandit_feedback_train["pscore"], ) # predict the action decisions for the test set of the synthetic logged bandit feedback - action_dist = 
evaluation_policy.predict( + action_dist = evaluation_policy.predict_proba( context=bandit_feedback_test["context"], ) - # estimate the mean reward function of the test set of synthetic bandit feedback with ML model + # estimate the reward function of the test set of synthetic bandit feedback with ML model regression_model = RegressionModel( n_actions=dataset.n_actions, action_context=dataset.action_context, @@ -258,11 +286,11 @@ def process(i: int): context=bandit_feedback_test["context"], action=bandit_feedback_test["action"], reward=bandit_feedback_test["reward"], - n_folds=3, # 3-fold cross-fitting + n_folds=2, random_state=12345, ) # fit propensity score estimators - classification_model_for_action = PropensityScoreEstimator( + pscore_estimator = PropensityScoreEstimator( len_list=1, n_actions=n_actions, base_model=base_model_dict[base_model_for_pscore_estimator]( @@ -270,12 +298,11 @@ def process(i: int): ), calibration_cv=2, ) - estimated_pscore = classification_model_for_action.fit_predict( + estimated_pscore = pscore_estimator.fit_predict( action=bandit_feedback_test["action"], position=bandit_feedback_test["position"], context=bandit_feedback_test["context"], - n_folds=3, - evaluate_model_performance=True, + n_folds=2, random_state=12345, ) # fit importance weight estimators @@ -310,6 +337,7 @@ def process(i: int): estimated_rewards_by_reg_model=estimated_rewards_by_reg_model, estimated_pscore=estimated_pscore, estimated_importance_weights=estimated_importance_weights_dict, + metric="relative-ee", ) return relative_ee_i @@ -329,33 +357,29 @@ def process(i: int): relative_ee_df = DataFrame(relative_ee_dict).describe().T.round(6) relative_ee_df_mean = relative_ee_df["mean"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["dm"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["ipw"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["ipw (tuning)"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["snipw"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["dr"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["dr (tuning)"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["sndr"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["switch-dr (lambda=1)"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["switch-dr (lambda=100)"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["switch-dr (tuning)"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["dr-os (lambda=1)"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["dr-os (lambda=100)"] - assert relative_ee_df_mean["naive"] > relative_ee_df_mean["dr-os (tuning)"] - # test estimated_pscore and bipw - estimated_pscore_and_bipw_estimators = [ + tested_estimators = [ + "dm", + "ipw (tuning-mse)", + "ipw (tuning-slope)", + "sg-ipw (tuning-mse)", + "snipw", + "dr (tuning-mse)", + "dr (tuning-slope)", + "sndr", + "switch-dr (tuning-mse)", + "switch-dr (tuning-slope)", + "dr-os (tuning-mse)", + "dr-os (tuning-slope)", + "sg-dr (tuning-mse)", + "sg-dr (tuning-slope)", "cipw (estimated pscore)", "snipw (estimated pscore)", "dr (estimated pscore)", "dr-os (estimated pscore)", - "bipw (svc raw)", "bipw (svc sample)", - "bipw (random_forest raw)", "bipw (random_forest sample)", ] - for estimator_name in estimated_pscore_and_bipw_estimators: + for estimator_name in tested_estimators: assert ( - relative_ee_df_mean["naive"] > relative_ee_df_mean[estimator_name] - ), f"{estimator_name} is worse than naive estimator" - # 
print(estimator_name, relative_ee_df_mean[estimator_name]) - # print(relative_ee_df_mean["naive"]) + relative_ee_df_mean[estimator_name] / relative_ee_df_mean["naive"] < 1.25 + ), f"{estimator_name} is significantly worse than naive (on-policy) estimator" diff --git a/tests/ope/test_propensity_score_estimator.py b/tests/ope/test_propensity_score_estimator.py index 6e04e06e..84be5952 100644 --- a/tests/ope/test_propensity_score_estimator.py +++ b/tests/ope/test_propensity_score_estimator.py @@ -72,7 +72,7 @@ "RandomForest", # 2, ValueError, - "base_model must be BaseEstimator or a child class of BaseEstimator", + "`base_model` must be BaseEstimator or a child class of BaseEstimator", ), ( n_actions, @@ -98,7 +98,7 @@ 1, 2, ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -111,7 +111,7 @@ 1, 2, ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7, 3)), # @@ -124,7 +124,7 @@ 1, 2, ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -137,7 +137,7 @@ 1, 2, ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -150,7 +150,7 @@ 1, 2, ValueError, - "action elements must be non-negative integers", + "`action` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -163,7 +163,7 @@ 1, 2, ValueError, - "action elements must be non-negative integers", + "`action` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -176,7 +176,7 @@ 1, 2, ValueError, - "position must be 1D array", + "`position` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -189,7 +189,7 @@ 1, 2, ValueError, - "position must be 1D array", + "`position` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -215,7 +215,7 @@ 1, 2, ValueError, - "position elements must be non-negative integers", + "`position` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -228,7 +228,7 @@ 1, 2, ValueError, - "position elements must be non-negative integers", + "`position` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -267,7 +267,7 @@ 1, 2, ValueError, - r"action elements must be smaller than", + r"`action` elements must be integers in the range of", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -280,7 +280,7 @@ 1, 2, ValueError, - "position must be 1D array", + "`position` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -293,7 +293,7 @@ 1, 2, ValueError, - "position elements must be smaller than len_list", + "`position` elements must be smaller than `len_list`", ), ( np.random.uniform(size=(n_rounds, 7)), diff --git a/tests/ope/test_regression_models.py b/tests/ope/test_regression_models.py index c38b30c1..b93ce819 100644 --- a/tests/ope/test_regression_models.py +++ b/tests/ope/test_regression_models.py @@ -79,7 +79,7 @@ 1, # Ridge(**hyperparams["ridge"]), ValueError, - "fitting_method must be one of", + "`fitting_method` must be one of", ), ( np.random.uniform(size=(n_actions, 8)), @@ -88,7 +88,7 @@ "awesome", # Ridge(**hyperparams["ridge"]), ValueError, - "fitting_method must be one of", + "`fitting_method` must be one of", ), ( np.random.uniform(size=(n_actions, 8)), @@ -97,7 +97,7 @@ "normal", "RandomForest", # ValueError, - "base_model must be BaseEstimator or a child class of BaseEstimator", 
+ "`base_model` must be BaseEstimator or a child class of BaseEstimator", ), ] @@ -119,7 +119,7 @@ 3, 1, ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -136,7 +136,7 @@ 3, 1, ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -153,7 +153,7 @@ 3, 1, ValueError, - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7, 3)), # @@ -170,7 +170,7 @@ 3, 1, ValueError, - "context must be 2D array", + "`context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -187,7 +187,7 @@ 3, 1, ValueError, - "action must be 1D array", + "`action` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -204,7 +204,7 @@ 3, 1, ValueError, - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -221,7 +221,7 @@ 3, 1, ValueError, - "action elements must be non-negative integers", + "`action` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -238,7 +238,7 @@ 3, 1, ValueError, - "action elements must be non-negative integers", + "`action` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -255,7 +255,7 @@ 3, 1, ValueError, - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -272,7 +272,7 @@ 3, 1, ValueError, - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -306,7 +306,7 @@ 3, 1, ValueError, - "pscore must be positive", + "`pscore` must be positive", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -323,7 +323,7 @@ 3, 1, ValueError, - "position must be 1D array", + "`position` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -340,7 +340,7 @@ 3, 1, ValueError, - "position must be 1D array", + "`position` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -374,7 +374,7 @@ 3, 1, ValueError, - "position elements must be non-negative integers", + "`position` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -391,7 +391,7 @@ 3, 1, ValueError, - "position elements must be non-negative integers", + "`position` elements must be non-negative integers", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -442,7 +442,7 @@ 3, 1, ValueError, - "action_context must be 2D array", + "`action_context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -459,7 +459,7 @@ 3, 1, ValueError, - "action_context must be 2D array", + "`action_context` must be 2D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -476,7 +476,7 @@ 3, 1, ValueError, - r"action elements must be smaller than", + "`action` elements must be integers in the range of", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -493,7 +493,7 @@ 3, 1, ValueError, - "position must be 1D array", + "`position` must be 1D array", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -510,7 +510,7 @@ 3, 1, ValueError, - "position elements must be smaller than len_list", + "`position` elements must be smaller than `len_list`", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -527,7 +527,7 @@ 3, 1, ValueError, - "when fitting_method is either", + "when `fitting_method` is either", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -544,7 +544,7 @@ 3, 1, ValueError, - "when fitting_method is either", + "when `fitting_method` is either", ), ( np.random.uniform(size=(n_rounds, 7)), 
@@ -561,7 +561,7 @@ 3, 1, ValueError, - "shape of action_dist must be (n_rounds, n_actions, len_list)", + "shape of `action_dist` must be (n_rounds, n_actions, len_list)", ), ( np.random.uniform(size=(n_rounds, 7)), @@ -578,7 +578,7 @@ 3, 1, ValueError, - "action_dist must be a probability distribution", + "`action_dist` must be a probability distribution", ), ( np.random.uniform(size=(n_rounds, 7)), diff --git a/tests/ope/test_regression_models_slate.py b/tests/ope/test_regression_models_slate.py new file mode 100644 index 00000000..aeaff157 --- /dev/null +++ b/tests/ope/test_regression_models_slate.py @@ -0,0 +1,825 @@ +from itertools import permutations +from itertools import product +from pathlib import Path + +import numpy as np +import pytest +from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +import yaml + +from obp.dataset import linear_behavior_policy_logit +from obp.dataset import logistic_reward_function +from obp.dataset import SyntheticSlateBanditDataset +from obp.ope import SlateCascadeDoublyRobust +from obp.ope import SlateRegressionModel +from obp.ope import SlateRewardInteractionIPS +from obp.utils import softmax + + +np.random.seed(1) + +model_dict = dict( + ridge=Ridge, + lightgbm=GradientBoostingRegressor, + random_forest=RandomForestRegressor, +) + +# hyperparameter settings for the base ML model in regression model +cd_path = Path(__file__).parent.resolve() +with open(cd_path / "hyperparams_slate.yaml", "rb") as f: + hyperparams = yaml.safe_load(f) + + +n_rounds = 1000 +n_unique_action = 3 +len_list = 3 +rips = SlateRewardInteractionIPS(len_list=len_list) +dr = SlateCascadeDoublyRobust(len_list=len_list, n_unique_action=n_unique_action) + +# n_unique_action, len_list, fitting_method, base_model, err, description +invalid_input_of_initializing_regression_models = [ + ( + "a", # + len_list, + "normal", + Ridge(**hyperparams["ridge"]), + TypeError, + "`n_unique_action` must be an instance of <class 'int'>, not <class 'str'>.", + ), + ( + 1, # + len_list, + "normal", + Ridge(**hyperparams["ridge"]), + ValueError, + "`n_unique_action`= 1, must be >= 2", + ), + ( + n_unique_action, + "a", # + "normal", + Ridge(**hyperparams["ridge"]), + TypeError, + "`len_list` must be an instance of <class 'int'>, not <class 'str'>.", + ), + ( + n_unique_action, + 0, # + "normal", + Ridge(**hyperparams["ridge"]), + ValueError, + "`len_list`= 0, must be >= 1", + ), + ( + n_unique_action, + len_list, + 1, # + Ridge(**hyperparams["ridge"]), + ValueError, + "fitting_method must be either", + ), + ( + n_unique_action, + len_list, + "awesome", # + Ridge(**hyperparams["ridge"]), + ValueError, + "fitting_method must be either", + ), + ( + n_unique_action, + len_list, + "normal", + "RandomForest", # + ValueError, + "`base_model` must be BaseEstimator or a child class of BaseEstimator", + ), +] + + +# context, action, reward, pscore, evaluation_policy_pscore, evaluation_policy_action_dist, err, description +invalid_input_of_fitting_regression_models = [ + ( + None, # + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`context` must be 2D array", + ), + ( + np.random.uniform(size=(n_rounds, 7, 3)), # + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), +
np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`context` must be 2D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + None, # + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`action` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=(n_rounds, len_list)), # + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`action` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice([-1, -3], size=n_rounds * len_list), # + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`action` elements must be integers in the range of", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + (np.arange(n_rounds * len_list) % n_unique_action) + 1, # + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`action` elements must be integers in the range of", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list - 1), # + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "Expected `action.shape ==", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + None, # + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`reward` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=(n_rounds, len_list)), # + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`reward` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + "3", # + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`pscore_cascade` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones((n_rounds, len_list)), # + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`pscore_cascade` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + 
np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list - 1), # + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "Expected `action.shape ==", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.arange(n_rounds * len_list), # + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`pscore_cascade` must be in the range of", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + "3", # + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`evaluation_policy_pscore_cascade` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones((n_rounds, len_list)), # + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`evaluation_policy_pscore_cascade` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list - 1), # + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "Expected `action.shape ==", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.arange(n_rounds * len_list), # + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + ValueError, + "`evaluation_policy_pscore_cascade` must be in the range of", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + "4", # + ValueError, + "`evaluation_policy_action_dist` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones((n_rounds, len_list, n_unique_action)) / n_unique_action, # + ValueError, + "`evaluation_policy_action_dist` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones((n_rounds * len_list, n_unique_action)) / n_unique_action, # + ValueError, + "`evaluation_policy_action_dist` must be 1D array", + ), + ( + np.random.uniform(size=(n_rounds, 7)), + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.random.uniform(size=n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action), # + ValueError, + 
"evaluation_policy_action_dist[i * n_unique_action : (i+1) * n_unique_action]", + ), +] + + +valid_input_of_regression_models = [ + ( + np.random.uniform(size=(n_rounds, 7)), # + np.random.choice(n_unique_action, size=n_rounds * len_list), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list), + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action, + "", + ), +] + + +@pytest.mark.parametrize( + "n_unique_action, len_list, fitting_method, base_model, err, description", + invalid_input_of_initializing_regression_models, +) +def test_initializing_regression_models_using_invalid_input_data( + n_unique_action, + len_list, + fitting_method, + base_model, + err, + description, +) -> None: + # initialization raises ValueError + with pytest.raises(err, match=f"{description}*"): + _ = SlateRegressionModel( + n_unique_action=n_unique_action, + len_list=len_list, + base_model=base_model, + fitting_method=fitting_method, + ) + + +@pytest.mark.parametrize( + "context, action, reward, pscore, evaluation_policy_pscore, evaluation_policy_action_dist, err, description", + invalid_input_of_fitting_regression_models, +) +def test_fitting_regression_models_using_invalid_input_data( + context, + action, + reward, + pscore, + evaluation_policy_pscore, + evaluation_policy_action_dist, + err, + description, +) -> None: + # fit_predict function raises ValueError + with pytest.raises(err, match=f"{description}*"): + regression_model = SlateRegressionModel( + n_unique_action=n_unique_action, + len_list=len_list, + base_model=Ridge(**hyperparams["ridge"]), + fitting_method="normal", + ) + _ = regression_model.fit_predict( + context=context, + action=action, + reward=reward, + pscore_cascade=pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + + +@pytest.mark.parametrize( + "context, action, reward, pscore, evaluation_policy_pscore, evaluation_policy_action_dist, description", + valid_input_of_regression_models, +) +def test_regression_models_using_valid_input_data( + context, + action, + reward, + pscore, + evaluation_policy_pscore, + evaluation_policy_action_dist, + description, +) -> None: + # fit_predict + for fitting_method in ["normal", "iw"]: + regression_model = SlateRegressionModel( + n_unique_action=n_unique_action, + len_list=len_list, + base_model=Ridge(**hyperparams["ridge"]), + fitting_method=fitting_method, + ) + _ = regression_model.fit_predict( + context=context, + action=action, + reward=reward, + pscore_cascade=pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + + +def test_cascade_dr_criterion_using_cascade_additive_log(): + # set parameters + n_unique_action = 3 + len_list = 3 + dim_context = 2 + reward_type = "binary" + random_state = 12345 + n_rounds = 1000 + reward_structure = "cascade_additive" + click_model = None + behavior_policy_function = linear_behavior_policy_logit + reward_function = logistic_reward_function + dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + reward_structure=reward_structure, + click_model=click_model, + random_state=random_state, + behavior_policy_function=behavior_policy_function, + base_reward_function=reward_function, + ) + # obtain feedback + bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds) + context = 
bandit_feedback["context"] + action = bandit_feedback["action"] + reward = bandit_feedback["reward"] + pscore = bandit_feedback["pscore_cascade"] + + # random evaluation policy + evaluation_policy_logit_ = np.ones((n_rounds, n_unique_action)) / n_unique_action + evaluation_policy_action_dist = ( + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action + ) + ( + _, + _, + evaluation_policy_pscore, + ) = dataset.obtain_pscore_given_evaluation_policy_logit( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + return_pscore_item_position=False, + ) + evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + q_expected = calc_ground_truth_mean_reward_function( + dataset=dataset, + context=context, + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + + # obtain q_hat and check if q_hat is effective + cascade_dr_criterion_pass_rate = 0.7 + for fitting_method in ["normal", "iw"]: + for model_name, model in model_dict.items(): + base_regression_model = SlateRegressionModel( + base_model=model(**hyperparams[model_name]), + len_list=len_list, + n_unique_action=n_unique_action, + fitting_method=fitting_method, + ) + q_hat = base_regression_model.fit_predict( + context=context, + action=action, + reward=reward, + pscore_cascade=pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + # compare dr criterion + cascade_dr_criterion = np.abs((q_expected - q_hat)) - np.abs(q_hat) + print( + f"Dr criterion is satisfied with probability {np.mean(cascade_dr_criterion <= 0)} ------ model: {model_name} ({fitting_method})," + ) + assert ( + np.mean(cascade_dr_criterion <= 0) >= cascade_dr_criterion_pass_rate + ), f" should be satisfied with a probability at least {cascade_dr_criterion_pass_rate}" + + +def test_cascade_dr_criterion_using_independent_log(): + # set parameters + n_unique_action = 3 + len_list = 3 + dim_context = 2 + reward_type = "binary" + random_state = 12345 + n_rounds = 1000 + reward_structure = "independent" + click_model = None + behavior_policy_function = linear_behavior_policy_logit + reward_function = logistic_reward_function + dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + reward_structure=reward_structure, + click_model=click_model, + random_state=random_state, + behavior_policy_function=behavior_policy_function, + base_reward_function=reward_function, + ) + # obtain feedback + bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds) + context = bandit_feedback["context"] + action = bandit_feedback["action"] + reward = bandit_feedback["reward"] + pscore = bandit_feedback["pscore_cascade"] + + # random evaluation policy + evaluation_policy_logit_ = np.ones((n_rounds, n_unique_action)) / n_unique_action + evaluation_policy_action_dist = ( + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action + ) + ( + _, + _, + evaluation_policy_pscore, + ) = dataset.obtain_pscore_given_evaluation_policy_logit( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + return_pscore_item_position=False, + ) + evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + q_expected = calc_ground_truth_mean_reward_function( + dataset=dataset, + 
context=context, + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + + # obtain q_hat and check if q_hat is effective + cascade_dr_criterion_pass_rate = 0.7 + for fitting_method in ["normal", "iw"]: + for model_name, model in model_dict.items(): + base_regression_model = SlateRegressionModel( + base_model=model(**hyperparams[model_name]), + len_list=len_list, + n_unique_action=n_unique_action, + fitting_method=fitting_method, + ) + q_hat = base_regression_model.fit_predict( + context=context, + action=action, + reward=reward, + pscore_cascade=pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + # compare dr criterion + cascade_dr_criterion = np.abs((q_expected - q_hat)) - np.abs(q_hat) + print( + f"Dr criterion is satisfied with probability {np.mean(cascade_dr_criterion <= 0)} ------ model: {model_name} ({fitting_method})," + ) + assert ( + np.mean(cascade_dr_criterion <= 0) >= cascade_dr_criterion_pass_rate + ), f" should be satisfied with a probability at least {cascade_dr_criterion_pass_rate}" + + +def test_cascade_dr_criterion_using_standard_additive_log(): + # set parameters + n_unique_action = 3 + len_list = 3 + dim_context = 2 + reward_type = "binary" + random_state = 12345 + n_rounds = 1000 + reward_structure = "standard_additive" + click_model = None + behavior_policy_function = linear_behavior_policy_logit + reward_function = logistic_reward_function + dataset = SyntheticSlateBanditDataset( + n_unique_action=n_unique_action, + len_list=len_list, + dim_context=dim_context, + reward_type=reward_type, + reward_structure=reward_structure, + click_model=click_model, + random_state=random_state, + behavior_policy_function=behavior_policy_function, + base_reward_function=reward_function, + ) + # obtain feedback + bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds) + context = bandit_feedback["context"] + action = bandit_feedback["action"] + reward = bandit_feedback["reward"] + pscore = bandit_feedback["pscore_cascade"] + + # random evaluation policy + evaluation_policy_logit_ = np.ones((n_rounds, n_unique_action)) / n_unique_action + evaluation_policy_action_dist = ( + np.ones(n_rounds * len_list * n_unique_action) / n_unique_action + ) + ( + _, + _, + evaluation_policy_pscore, + ) = dataset.obtain_pscore_given_evaluation_policy_logit( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + return_pscore_item_position=False, + ) + evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist( + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + q_expected = calc_ground_truth_mean_reward_function( + dataset=dataset, + context=context, + action=action, + evaluation_policy_logit_=evaluation_policy_logit_, + ) + + # obtain q_hat and check if q_hat is effective + cascade_dr_criterion_pass_rate = 0.7 + for fitting_method in ["normal", "iw"]: + for model_name, model in model_dict.items(): + base_regression_model = SlateRegressionModel( + base_model=model(**hyperparams[model_name]), + len_list=len_list, + n_unique_action=n_unique_action, + fitting_method=fitting_method, + ) + q_hat = base_regression_model.fit_predict( + context=context, + action=action, + reward=reward, + pscore_cascade=pscore, + evaluation_policy_pscore_cascade=evaluation_policy_pscore, + evaluation_policy_action_dist=evaluation_policy_action_dist, + ) + # compare dr criterion + cascade_dr_criterion = np.abs((q_expected - q_hat)) - np.abs(q_hat) + 
print( + f"Dr criterion is satisfied with probability {np.mean(cascade_dr_criterion <= 0)} ------ model: {model_name} ({fitting_method})," + ) + assert ( + np.mean(cascade_dr_criterion <= 0) >= cascade_dr_criterion_pass_rate + ), f" should be satisfied with a probability at least {cascade_dr_criterion_pass_rate}" + + +def calc_ground_truth_mean_reward_function( + dataset, + context: np.ndarray, + action: np.ndarray, + evaluation_policy_logit_: np.ndarray, +): + n_rounds = len(context) + action = action.reshape((n_rounds, dataset.len_list)) + ground_truth_mean_reward_function = np.zeros( + (n_rounds, dataset.len_list, dataset.n_unique_action), dtype=float + ) + + for position in range(dataset.len_list): + if position != dataset.len_list - 1: + if dataset.is_factorizable: + enumerated_slate_actions = [ + _ + for _ in product( + np.arange(dataset.n_unique_action), + repeat=dataset.len_list - position - 1, + ) + ] + else: + enumerated_slate_actions = [ + _ + for _ in permutations( + np.arange(dataset.n_unique_action), + dataset.len_list - position - 1, + ) + ] + enumerated_slate_actions = np.array(enumerated_slate_actions).astype("int8") + n_enumerated_slate_actions = len(enumerated_slate_actions) + + for i in range(n_rounds): + if position != dataset.len_list - 1: + action_ = np.tile( + action[i][: position + 1], (n_enumerated_slate_actions, 1) + ) + for a_ in range(dataset.n_unique_action): + action__ = action_.copy() + action__[:, position] = a_ + enumerated_slate_actions_ = np.concatenate( + [action_, enumerated_slate_actions], axis=1 + ) + ground_truth_mean_reward_function[ + i, position, a_ + ] = calc_ground_truth_mean_reward_function_given_enumerated_slate_actions( + dataset=dataset, + context=context, + evaluation_policy_logit_=evaluation_policy_logit_, + enumerated_slate_actions=enumerated_slate_actions_, + i=i, + position=position, + ) + + else: + action_ = action[i].reshape((1, dataset.len_list)) + for a_ in range(dataset.n_unique_action): + action__ = action_.copy() + action__[:, position] = a_ + enumerated_slate_actions_ = action__ + n_enumerated_slate_actions = 1 + ground_truth_mean_reward_function[ + i, position, a_ + ] = calc_ground_truth_mean_reward_function_given_enumerated_slate_actions( + dataset=dataset, + context=context, + evaluation_policy_logit_=evaluation_policy_logit_, + enumerated_slate_actions=enumerated_slate_actions_, + i=i, + position=position, + ) + + return ground_truth_mean_reward_function.flatten() + + +def calc_ground_truth_mean_reward_function_given_enumerated_slate_actions( + dataset, + context: np.ndarray, + evaluation_policy_logit_: np.ndarray, + enumerated_slate_actions: np.ndarray, + i: int, + position: int, +): + pscores = [] + evaluation_policy_logit_i = evaluation_policy_logit_[i].reshape( + (1, dataset.n_unique_action) + ) + n_enumerated_slate_actions = len(enumerated_slate_actions) + + if dataset.is_factorizable: + action_dist = softmax(evaluation_policy_logit_i)[0] + + for action_list in enumerated_slate_actions: + pscore = 1 + + for position in range(dataset.len_list): + pscore *= action_dist[action_list[position]] + + pscores.append(pscore) + + else: + for action_list in enumerated_slate_actions: + pscore = 1 + evaluation_policy_logit_i_ = evaluation_policy_logit_i.copy() + + for position in range(dataset.len_list): + action_dist = softmax(evaluation_policy_logit_i_)[0] + pscore *= action_dist[action_list[position]] + evaluation_policy_logit_i_[0][action_list[position]] = -1e10 + + pscores.append(pscore) + + pscores = np.array(pscores) + + # 
calculate expected slate-level reward for each combinatorial set of items (i.e., slate actions) + if dataset.base_reward_function is None: + expected_slot_reward = dataset.sample_contextfree_expected_reward( + random_state=dataset.random_state + ) + expected_slot_reward_tile = np.tile( + expected_slot_reward, (n_enumerated_slate_actions, 1, 1) + ) + expected_slate_rewards = np.array( + [ + expected_slot_reward_tile[ + np.arange(n_enumerated_slate_actions) % n_enumerated_slate_actions, + np.array(enumerated_slate_actions)[:, position_], + position_, + ] + for position_ in np.arange(dataset.len_list) + ] + ).T + else: + expected_slate_rewards = dataset.reward_function( + context=context[i].reshape((1, -1)), + action_context=dataset.action_context, + action=enumerated_slate_actions.flatten(), + action_interaction_weight_matrix=dataset.action_interaction_weight_matrix, + base_reward_function=dataset.base_reward_function, + reward_type=dataset.reward_type, + reward_structure=dataset.reward_structure, + len_list=dataset.len_list, + is_enumerated=True, + random_state=dataset.random_state, + ) + # click models based on expected reward + expected_slate_rewards *= dataset.exam_weight + if dataset.reward_type == "binary": + discount_factors = np.ones(expected_slate_rewards.shape[0]) + previous_slot_expected_reward = np.zeros(expected_slate_rewards.shape[0]) + for position_ in np.arange(dataset.len_list): + discount_factors *= ( + previous_slot_expected_reward * dataset.attractiveness[position_] + + (1 - previous_slot_expected_reward) + ) + expected_slate_rewards[:, position_] = ( + discount_factors * expected_slate_rewards[:, position_] + ) + previous_slot_expected_reward = expected_slate_rewards[:, position_] + + return (pscores * expected_slate_rewards.sum(axis=1)).sum() diff --git a/tests/policy/test_offline.py b/tests/policy/test_offline.py index aff7b94b..02a8d0d1 100644 --- a/tests/policy/test_offline.py +++ b/tests/policy/test_offline.py @@ -150,7 +150,7 @@ def test_ipw_learner_fit(): learner.fit(context=context, action=action, reward=reward) # position must be non-negative - desc = "position elements must be non-negative integers" + desc = "`position` elements must be non-negative integers" with pytest.raises(ValueError, match=f"{desc}*"): negative_position = position - 1 learner = IPWLearner(n_actions=n_actions, len_list=len_list) @@ -173,7 +173,7 @@ def test_ipw_learner_predict(): len_list = 1 # shape error - desc = "context must be 2D array" + desc = "`context` must be 2D array" with pytest.raises(ValueError, match=f"{desc}*"): context = np.array([1.0, 1.0]) learner = IPWLearner(n_actions=n_actions, len_list=len_list) @@ -208,7 +208,7 @@ def test_ipw_learner_sample_action(): learner = IPWLearner(n_actions=n_actions, len_list=len_list) learner.fit(context=context, action=action, reward=reward, position=position) - desc = "context must be 2D array" + desc = "`context` must be 2D array" with pytest.raises(ValueError, match=f"{desc}*"): invalid_type_context = [1.0, 2.0] learner.sample_action(context=invalid_type_context) @@ -249,13 +249,13 @@ def test_ipw_learner_sample_action(): "normal", "`len_list`= 20, must be <= 10", ), - (10, 1, "base_regressor", "normal", "base_model must be BaseEstimator"), # + (10, 1, "base_regressor", "normal", "`base_model` must be BaseEstimator"), # ( 10, 1, base_classifier, "None", # - "fitting_method must be one of 'normal', 'iw', or 'mrdr', but", + "`fitting_method` must be one of 'normal', 'iw', or 'mrdr', but", ), ] @@ -358,7 +358,7 @@ def 
test_q_learner_fit(): learner.fit(context=context, action=action, reward=reward) # position must be non-negative - desc = "position elements must be non-negative integers" + desc = "`position` elements must be non-negative integers" with pytest.raises(ValueError, match=f"{desc}*"): negative_position = position - 1 learner = QLearner( @@ -374,7 +374,7 @@ def test_q_learner_predict(): len_list = 1 # shape error - desc = "context must be 2D array" + desc = "`context` must be 2D array" with pytest.raises(ValueError, match=f"{desc}*"): context = np.array([1.0, 1.0]) learner = QLearner( @@ -415,7 +415,7 @@ def test_q_learner_sample_action(): ) learner.fit(context=context, action=action, reward=reward, position=position) - desc = "context must be 2D array" + desc = "`context` must be 2D array" with pytest.raises(ValueError, match=f"{desc}*"): invalid_type_context = [1.0, 2.0] learner.sample_action(context=invalid_type_context) @@ -1537,7 +1537,7 @@ def test_nn_policy_learner_predict(): pscore = np.array([0.5] * 100, dtype=np.float32) # shape error - desc = "context must be 2D array" + desc = "`context` must be 2D array" with pytest.raises(ValueError, match=f"{desc}*"): learner = NNPolicyLearner( n_actions=n_actions, @@ -1590,7 +1590,7 @@ def test_nn_policy_learner_sample_action(): pscore = np.array([0.5] * 100, dtype=np.float32) # shape error - desc = "context must be 2D array" + desc = "`context` must be 2D array" with pytest.raises(ValueError, match=f"{desc}*"): learner = NNPolicyLearner( n_actions=n_actions, @@ -1641,7 +1641,7 @@ def test_nn_policy_learner_predict_proba(): pscore = np.array([0.5] * 100, dtype=np.float32) # shape error - desc = "context must be 2D array" + desc = "`context` must be 2D array" with pytest.raises(ValueError, match=f"{desc}*"): learner = NNPolicyLearner( n_actions=n_actions, diff --git a/tests/policy/test_offline_continuous.py b/tests/policy/test_offline_continuous.py index a3cbd9c0..8b1205e8 100644 --- a/tests/policy/test_offline_continuous.py +++ b/tests/policy/test_offline_continuous.py @@ -874,56 +874,56 @@ def test_nn_policy_learner_create_train_data_for_opl(): np.ones(5), np.ones(5), np.ones(5) * 0.5, - "context must be 2D array", + "`context` must be 2D array", ), ( np.ones(5), # np.ones(5), np.ones(5), np.ones(5) * 0.5, - "context must be 2D array", + "`context` must be 2D array", ), ( np.ones((5, 2)), 5, # np.ones(5), np.ones(5) * 0.5, - "action_by_behavior_policy must be 1D array", + "`action_by_behavior_policy` must be 1D array", ), ( np.ones((5, 2)), np.ones((5, 2)), # np.ones(5), np.ones(5) * 0.5, - "action_by_behavior_policy must be 1D array", + "`action_by_behavior_policy` must be 1D array", ), ( np.ones((5, 2)), np.ones(5), 5, # np.ones(5) * 0.5, - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.ones((5, 2)), np.ones(5), np.ones((5, 2)), # np.ones(5) * 0.5, - "reward must be 1D array", + "`reward` must be 1D array", ), ( np.ones((5, 2)), np.ones(5), np.ones(5), 0.5, # - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.ones((5, 2)), np.ones(5), np.ones(5), np.ones((5, 2)) * 0.5, # - "pscore must be 1D array", + "`pscore` must be 1D array", ), ( np.ones((4, 2)), # @@ -951,7 +951,7 @@ def test_nn_policy_learner_create_train_data_for_opl(): np.ones(5), np.ones(5), np.arange(5) * 0.1, # - "pscore must be positive", + "`pscore` must be positive", ), ( np.ones((5, 3)), # @@ -1049,10 +1049,10 @@ def test_nn_policy_learner_predict(): ) # shape error - with pytest.raises(ValueError, match="context must be 2D array"): + with 
pytest.raises(ValueError, match="`context` must be 2D array"): learner.predict(context=np.ones(5)) - with pytest.raises(ValueError, match="context must be 2D array"): + with pytest.raises(ValueError, match="`context` must be 2D array"): learner.predict(context="np.ones(5)") # inconsistency between dim_context and context