
Merge pull request #20 from st-tech/feat/update-version-to-0.3.2
update version to 0.3.2
usaito committed Nov 7, 2020
2 parents 1e1eb9f + f09f218 commit 0667738
Showing 16 changed files with 177 additions and 171 deletions.
1 change: 1 addition & 0 deletions docs/obp.rst
@@ -31,6 +31,7 @@ dataset module
obp.dataset.base
obp.dataset.real
obp.dataset.synthetic
obp.dataset.multiclass


simulator module
65 changes: 34 additions & 31 deletions examples/quickstart/quickstart.ipynb

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions examples/quickstart/quickstart_synthetic.ipynb

Large diffs are not rendered by default.

24 changes: 13 additions & 11 deletions obp/dataset/multiclass.py
@@ -24,16 +24,16 @@ class MultiClassToBanditReduction(BaseSyntheticBanditDataset):
A machine learning classifier such as logistic regression is used to construct behavior and evaluation policies as follows.
1. Split the original data into training (:math:`\\mathcal{D}_{\\mathrm{tr}}`) and evaluation (:math:`\\mathcal{D}_{\\mathrm{ev}}`) sets.
2. Train classifiers on :math:`\\mathcal{D}_{\\mathrm{tr}}` and regard them as base deterministic policies :math:`\\pi_{\\mathrm{det},b}` and :math:`\\pi_{\\mathrm{det},e}`.
3. Construct behavior (:math:`\\pi_{b}`) and evaluation (:math:`\\pi_{e}`) policies based on :math:`\\pi_{\\mathrm{det}}` as
2. Train classifiers on :math:`\\mathcal{D}_{\\mathrm{tr}}` and obtain base deterministic policies :math:`\\pi_{\\mathrm{det},b}` and :math:`\\pi_{\\mathrm{det},e}`.
3. Construct behavior (:math:`\\pi_{b}`) and evaluation (:math:`\\pi_{e}`) policies based on :math:`\\pi_{\\mathrm{det},b}` and :math:`\\pi_{\\mathrm{det},e}` as
.. math::
\\pi_b (a | x) := \\alpha_b \\pi_{\\mathrm{det},b} (a|x) + (1.0 - \\alpha_b) \\pi_{u} (a|x)
\\pi_b (a | x) := \\alpha_b \\cdot \\pi_{\\mathrm{det},b} (a|x) + (1.0 - \\alpha_b) \\cdot \\pi_{u} (a|x)
.. math::
\\pi_e (a | x) := \\alpha_e \\pi_{\\mathrm{det},e} (a|x) + (1.0 - \\alpha_e) \\pi_{u} (a|x)
\\pi_e (a | x) := \\alpha_e \\cdot \\pi_{\\mathrm{det},e} (a|x) + (1.0 - \\alpha_e) \\cdot \\pi_{u} (a|x)
where :math:`\\pi_{u}` is a uniform random policy and :math:`\\alpha_b` and :math:`\\alpha_e` are set by the user.
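For concreteness, here is a minimal NumPy sketch of the mixing step in the formulas above; `mix_with_uniform`, `clf`, and `X_ev` are illustrative names (not part of the library), and the classifier is assumed to be already fitted on the training split with class labels 0, ..., n_actions-1.

```python
import numpy as np

# Sketch of step 3: mix a deterministic classifier policy with a uniform random policy.
def mix_with_uniform(clf, X_ev, n_actions, alpha):
    n_rounds = X_ev.shape[0]
    pi_det = np.zeros((n_rounds, n_actions))
    pi_det[np.arange(n_rounds), clf.predict(X_ev)] = 1.0      # deterministic policy pi_det(a|x)
    pi_unif = np.full((n_rounds, n_actions), 1.0 / n_actions)  # uniform random policy pi_u(a|x)
    return alpha * pi_det + (1.0 - alpha) * pi_unif            # pi(a|x) as in the formulas above
```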
@@ -60,11 +60,11 @@ class MultiClassToBanditReduction(BaseSyntheticBanditDataset):
base_classifier_b: ClassifierMixin
Machine learning classifier used to construct a behavior policy.
alpha_b: float, default: 0.9
alpha_b: float, default=0.9
Ratio of a uniform random policy when constructing a **behavior** policy.
Must be in the [0, 1) interval to make the behavior policy a stochastic one.
dataset_name: str, default: None
dataset_name: str, default=None
Name of the dataset.
Examples
@@ -187,7 +187,7 @@ def split_train_eval(
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the evaluation split.
If int, represents the absolute number of test samples.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in train-evaluation split.
"""
@@ -213,12 +213,12 @@ def obtain_batch_bandit_feedback(
Please call `self.split_train_eval()` before calling this method.
Parameters
----------
-----------
eval_size: float or int, default=0.25
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
If int, represents the absolute number of test samples.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling actions.
Returns
@@ -261,10 +261,12 @@ def obtain_action_dist_by_eval_policy(
) -> np.ndarray:
"""Obtain action choice probabilities by an evaluation policy.
base_classifier_e: ClassifierMixin, default: None
Parameters
-----------
base_classifier_e: ClassifierMixin, default=None
Machine learning classifier used to construct an evaluation policy.
alpha_e: float, default: 1.0
alpha_e: float, default=1.0
Ratio of a uniform random policy when constructing an **evaluation** policy.
Must be in the [0, 1] interval (evaluation policy can be deterministic).
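Putting the methods touched in this file together, a hedged usage sketch; argument values are illustrative, and the constructor signature is inferred from the parameters documented above (the import path follows the `obp.dataset.multiclass` docs entry added in this commit).

```python
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from obp.dataset import MultiClassToBanditReduction

X, y = load_digits(return_X_y=True)
# convert the multiclass data into logged bandit feedback
dataset = MultiClassToBanditReduction(
    X=X, y=y, base_classifier_b=LogisticRegression(max_iter=1000), alpha_b=0.8
)
dataset.split_train_eval(eval_size=0.7, random_state=12345)
bandit_feedback = dataset.obtain_batch_bandit_feedback(random_state=12345)
# action choice probabilities of an evaluation policy, mixed with alpha_e as above
action_dist = dataset.obtain_action_dist_by_eval_policy(
    base_classifier_e=LogisticRegression(C=100, max_iter=1000), alpha_e=0.9
)
```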
14 changes: 7 additions & 7 deletions obp/dataset/real.py
@@ -32,10 +32,10 @@ class OpenBanditDataset(BaseRealBanditDataset):
campaign: str
One of the three possible campaigns considered in ZOZOTOWN, "all", "men", and "women".
data_path: Path, default: Path('./obd')
data_path: Path, default=Path('./obd')
Path that stores Open Bandit Dataset.
dataset_name: str, default: 'obd'
dataset_name: str, default='obd'
Name of the dataset.
References
@@ -109,13 +109,13 @@ def calc_on_policy_policy_value_estimate(
campaign: str
One of the three possible campaigns considered in ZOZOTOWN (i.e., "all", "men", and "women").
data_path: Path, default: Path('./obd')
data_path: Path, default=Path('./obd')
Path that stores Open Bandit Dataset.
test_size: float, default=0.3
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
is_timeseries_split: bool, default: False
is_timeseries_split: bool, default=False
If true, split the original logged bandit feedback data by time series.
Returns
@@ -178,7 +178,7 @@ def obtain_batch_bandit_feedback(
test_size: float, default=0.3
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
is_timeseries_split: bool, default: False
is_timeseries_split: bool, default=False
If true, split the original logged bandit feedback data by time series.
Returns
@@ -233,10 +233,10 @@ def sample_bootstrap_bandit_feedback(
test_size: float, default=0.3
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
is_timeseries_split: bool, default: False
is_timeseries_split: bool, default=False
If true, split the original logged bandit feedback data by time series.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling logged bandit dataset.
Returns
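A short usage sketch of the methods edited in this file. It assumes the Open Bandit Dataset files are available under `./obd`, and that `behavior_policy` (not shown in these hunks) is a required constructor argument with `"random"` as one valid value.

```python
from pathlib import Path
from obp.dataset import OpenBanditDataset

dataset = OpenBanditDataset(behavior_policy="random", campaign="all", data_path=Path("./obd"))
bandit_feedback = dataset.obtain_batch_bandit_feedback(test_size=0.3, is_timeseries_split=False)
bootstrap_feedback = dataset.sample_bootstrap_bandit_feedback(random_state=12345)
# on-policy estimate of the behavior policy's value, often used as ground truth in OPE experiments
ground_truth = OpenBanditDataset.calc_on_policy_policy_value_estimate(
    behavior_policy="random", campaign="all", data_path=Path("./obd")
)
```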
18 changes: 9 additions & 9 deletions obp/dataset/synthetic.py
@@ -31,29 +31,29 @@ class SyntheticBanditDataset(BaseSyntheticBanditDataset):
n_actions: int
Number of actions.
dim_context: int, default: 1
dim_context: int, default=1
Number of dimensions of context vectors.
reward_type: str, default: 'binary'
reward_type: str, default='binary'
Type of reward variable, must be either 'binary' or 'continuous'.
When 'binary' is given, rewards are sampled from the Bernoulli distribution.
When 'continuous' is given, rewards are sampled from the truncated Normal distribution with `scale=1`.
reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray]], default: None
reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray]], default=None
Function generating expected reward with context and action context vectors,
i.e., :math:`\\mu: \\mathcal{X} \\times \\mathcal{A} \\rightarrow \\mathbb{R}`.
If None is set, context **independent** expected reward for each action will be
sampled from the uniform distribution automatically.
behavior_policy_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default: None
behavior_policy_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default=None
Function generating probability distribution over action space,
i.e., :math:`\\pi: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A})`.
If None is set, context **independent** uniform distribution will be used (uniform random behavior policy).
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling synthetic bandit dataset.
dataset_name: str, default: 'synthetic_bandit_dataset'
dataset_name: str, default='synthetic_bandit_dataset'
Name of the dataset.
Examples
@@ -252,7 +252,7 @@ def logistic_reward_function(
action_context: array-like, shape (n_actions, dim_action_context)
Vector representation for each action.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling dataset.
Returns
@@ -292,7 +292,7 @@ def linear_reward_function(
action_context: array-like, shape (n_actions, dim_action_context)
Vector representation for each action.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling dataset.
Returns
@@ -332,7 +332,7 @@ def linear_behavior_policy(
action_context: array-like, shape (n_actions, dim_action_context)
Vector representation for each action.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling dataset.
Returns
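A hedged usage sketch combining the pieces documented in this file: the `SyntheticBanditDataset` parameters above together with `logistic_reward_function` and `linear_behavior_policy`. `n_rounds` is not shown in these hunks but is assumed to be the sample-size argument of `obtain_batch_bandit_feedback`.

```python
from obp.dataset import (
    SyntheticBanditDataset,
    logistic_reward_function,
    linear_behavior_policy,
)

dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    reward_type="binary",                      # rewards drawn from a Bernoulli distribution
    reward_function=logistic_reward_function,  # context-dependent expected reward
    behavior_policy_function=linear_behavior_policy,
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10000)
```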
46 changes: 23 additions & 23 deletions obp/ope/estimators.py
@@ -49,7 +49,7 @@ class ReplayMethod(BaseOffPolicyEstimator):
Parameters
----------
estimator_name: str, default: 'rm'.
estimator_name: str, default='rm'.
Name of off-policy estimator.
References
@@ -151,13 +151,13 @@ def estimate_interval(
position: array-like, shape (n_rounds,)
Positions of each round in the given logged bandit feedback.
alpha: float, default: 0.05
alpha: float, default=0.05
Significance level of the resulting confidence interval.
n_bootstrap_samples: int, default: 10000
n_bootstrap_samples: int, default=10000
Number of resamplings performed in the bootstrap procedure.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in bootstrap sampling.
Returns
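As a rough illustration (not the library's exact implementation) of how `alpha`, `n_bootstrap_samples`, and `random_state` interact in `estimate_interval`: resample the round-wise rewards with replacement, recompute the mean each time, and take percentiles of the bootstrap means. The function and argument names below are illustrative.

```python
import numpy as np

def bootstrap_interval(round_rewards, alpha=0.05, n_bootstrap_samples=10000, random_state=12345):
    """Percentile bootstrap interval of the mean of round-wise (weighted) rewards."""
    rng = np.random.default_rng(random_state)
    boot_means = np.array([
        rng.choice(round_rewards, size=round_rewards.shape[0], replace=True).mean()
        for _ in range(n_bootstrap_samples)
    ])
    return {
        "mean": float(boot_means.mean()),
        "lower": float(np.percentile(boot_means, 100 * (alpha / 2))),
        "upper": float(np.percentile(boot_means, 100 * (1 - alpha / 2))),
    }
```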
@@ -197,7 +197,7 @@ class InverseProbabilityWeighting(BaseOffPolicyEstimator):
Parameters
------------
estimator_name: str, default: 'ipw'.
estimator_name: str, default='ipw'.
Name of off-policy estimator.
References
@@ -320,13 +320,13 @@ def estimate_interval(
Distribution over actions or the action choice probabilities
by the evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a|x)`.
alpha: float, default: 0.05
alpha: float, default=0.05
Significance level of the resulting confidence interval.
n_bootstrap_samples: int, default: 10000
n_bootstrap_samples: int, default=10000
Number of resamplings performed in the bootstrap procedure.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in bootstrap sampling.
Returns
@@ -372,7 +372,7 @@ class SelfNormalizedInverseProbabilityWeighting(InverseProbabilityWeighting):
Parameters
----------
estimator_name: str, default: 'snipw'.
estimator_name: str, default='snipw'.
Name of off-policy estimator.
References
@@ -451,7 +451,7 @@ class DirectMethod(BaseOffPolicyEstimator):
Parameters
----------
estimator_name: str, default: 'dm'.
estimator_name: str, default='dm'.
Name of off-policy estimator.
References
@@ -554,13 +554,13 @@ def estimate_interval(
estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
Estimated rewards for each round, action, and position by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
alpha: float, default: 0.05
alpha: float, default=0.05
Significance level of the resulting confidence interval.
n_bootstrap_samples: int, default: 10000
n_bootstrap_samples: int, default=10000
Number of resamplings performed in the bootstrap procedure.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in bootstrap sampling.
Returns
@@ -611,7 +611,7 @@ class DoublyRobust(InverseProbabilityWeighting):
Parameters
----------
estimator_name: str, default: 'dr'.
estimator_name: str, default='dr'.
Name of off-policy estimator.
References
@@ -760,13 +760,13 @@ def estimate_interval(
estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
Estimated rewards for each round, action, and position by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
alpha: float, default: 0.05
alpha: float, default=0.05
Significance level of the resulting confidence interval.
n_bootstrap_samples: int, default: 10000
n_bootstrap_samples: int, default=10000
Number of resamplings performed in the bootstrap procedure.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in bootstrap sampling.
Returns
@@ -815,7 +815,7 @@ class SelfNormalizedDoublyRobust(DoublyRobust):
Parameters
----------
estimator_name: str, default: 'sndr'.
estimator_name: str, default='sndr'.
Name of off-policy estimator.
References
@@ -906,11 +906,11 @@ class SwitchInverseProbabilityWeighting(DoublyRobust):
Parameters
----------
tau: float, default: 1
tau: float, default=1
Switching hyperparameter. When importance weight is larger than this parameter, the DM estimator is applied, otherwise the IPW estimator is applied.
This hyperparameter should be larger than 1., otherwise it is meaningless.
estimator_name: str, default: 'switch-ipw'.
estimator_name: str, default='switch-ipw'.
Name of off-policy estimator.
References
@@ -1007,11 +1007,11 @@ class SwitchDoublyRobust(DoublyRobust):
Parameters
----------
tau: float, default: 1
tau: float, default=1
Switching hyperparameter. When importance weight is larger than this parameter, the DM estimator is applied, otherwise the DR estimator is applied.
This hyperparameter should be larger than 0., otherwise it is meaningless.
estimator_name: str, default: 'switch-dr'.
estimator_name: str, default='switch-dr'.
Name of off-policy estimator.
References
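To make the role of `tau` concrete, a minimal sketch of the round-wise switching rule described for Switch-DR above (illustrative only, not the library's code). Here `iw` is the importance weight, `q_hat_factual` the regression model's estimate at the observed action, and `q_hat_expected` its expectation under the evaluation policy; all names are illustrative.

```python
import numpy as np

def switch_dr_round_rewards(reward, iw, q_hat_factual, q_hat_expected, tau=1.0):
    dr_term = q_hat_expected + iw * (reward - q_hat_factual)  # doubly robust estimate per round
    dm_term = q_hat_expected                                  # model-based (DM) estimate per round
    return np.where(iw <= tau, dr_term, dm_term)              # fall back to DM when iw exceeds tau
```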
@@ -1127,7 +1127,7 @@ class DoublyRobustWithShrinkage(DoublyRobust):
lambda_: float
Shrinkage hyperparameter. This hyperparameter should be larger than 0., otherwise it is meaningless.
estimator_name: str, default: 'dr-os'.
estimator_name: str, default='dr-os'.
Name of off-policy estimator.
References