Update Package Versions #189

Open · wants to merge 7 commits into master

2 changes: 1 addition & 1 deletion .github/workflows/lints.yml
@@ -17,7 +17,7 @@ jobs:
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
-         python-version: 3.7
+         python-version: 3.8

      - name: Black
        uses: psf/black@stable

2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -12,7 +12,7 @@ jobs:

    strategy:
      matrix:
-       python-version: [3.7, 3.8, 3.9]
+       python-version: [3.8, 3.9, 3.10]

    # Not intended for forks.
    if: github.repository == 'st-tech/zr-obp'

56 changes: 39 additions & 17 deletions obp/dataset/synthetic.py
@@ -3,8 +3,9 @@

"""Class for Generating Synthetic Logged Bandit Data."""
from dataclasses import dataclass
-from typing import Callable, Tuple
+from typing import Callable
from typing import Optional
+from typing import Tuple

import numpy as np
from scipy.stats import truncnorm
@@ -20,6 +21,7 @@
from .base import BaseBanditDataset
from .reward_type import RewardType

+
coef_func_signature = Callable[
    [np.ndarray, np.ndarray, np.random.RandomState],
    Tuple[np.ndarray, np.ndarray, np.ndarray],
@@ -77,6 +79,11 @@ class SyntheticBanditDataset(BaseBanditDataset):
        A larger value leads to a noisier reward distribution.
        This argument is valid only when `reward_type="continuous"`.

+    reward_noise_distribution: str, default='normal'
+        The distribution from which reward noise is sampled; must be either 'normal' or 'truncated_normal'.
+        If 'truncated_normal' is given, the logged dataset contains no negative reward realizations.
+        This argument is valid only when `reward_type="continuous"`.
+
    action_context: np.ndarray, default=None
        Vector representation of (discrete) actions.
        If None, one-hot representation will be used.
@@ -177,6 +184,7 @@ class SyntheticBanditDataset(BaseBanditDataset):
    reward_type: str = RewardType.BINARY.value
    reward_function: Optional[Callable[[np.ndarray, np.ndarray], np.ndarray]] = None
    reward_std: float = 1.0
+    reward_noise_distribution: str = "normal"
    action_context: Optional[np.ndarray] = None
    behavior_policy_function: Optional[
        Callable[[np.ndarray, np.ndarray], np.ndarray]
@@ -211,6 +219,12 @@ def __post_init__(self) -> None:
                f"`reward_type` must be either '{RewardType.BINARY.value}' or '{RewardType.CONTINUOUS.value}',"
                f"but {self.reward_type} is given.'"
            )
+        if self.reward_noise_distribution not in ["normal", "truncated_normal"]:
+            raise ValueError(
+                f"`reward_noise_distribution` must be either 'normal' or 'truncated_normal',"
+                f"but {self.reward_noise_distribution} is given.'"
+            )
+
        check_scalar(self.reward_std, "reward_std", (int, float), min_val=0)
        if self.reward_function is None:
            self.expected_reward = self.sample_contextfree_expected_reward()
@@ -263,16 +277,23 @@ def sample_reward_given_expected_reward(
        if RewardType(self.reward_type) == RewardType.BINARY:
            reward = self.random_.binomial(n=1, p=expected_reward_factual)
        elif RewardType(self.reward_type) == RewardType.CONTINUOUS:
-            mean = expected_reward_factual
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            reward = truncnorm.rvs(
-                a=a,
-                b=b,
-                loc=mean,
-                scale=self.reward_std,
-                random_state=self.random_state,
-            )
+            if self.reward_noise_distribution == "normal":
+                reward = self.random_.normal(
+                    loc=expected_reward_factual,
+                    scale=self.reward_std,
+                    size=action.shape,
+                )
+            elif self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_factual
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                reward = truncnorm.rvs(
+                    a=a,
+                    b=b,
+                    loc=mean,
+                    scale=self.reward_std,
+                    random_state=self.random_state,
+                )
        else:
            raise NotImplementedError

@@ -329,12 +350,13 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
        expected_reward_ = self.calc_expected_reward(contexts)
        if RewardType(self.reward_type) == RewardType.CONTINUOUS:
            # correct expected_reward_, as we use truncated normal distribution here
-            mean = expected_reward_
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            expected_reward_ = truncnorm.stats(
-                a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
-            )
+            if self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                expected_reward_ = truncnorm.stats(
+                    a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
+                )

        # calculate the action choice probabilities of the behavior policy
        if self.behavior_policy_function is None:

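For context, a minimal usage sketch of the new `reward_noise_distribution` argument (illustrative only, not part of the diff; the numeric values are hypothetical, and the argument names follow the constructor shown above):

# Sketch: contrasting the two reward-noise options (hypothetical values).
from obp.dataset import SyntheticBanditDataset, linear_reward_function

dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    reward_type="continuous",
    reward_function=linear_reward_function,
    reward_std=1.0,
    reward_noise_distribution="normal",  # new argument; default is "normal"
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10_000)

# With "normal", Gaussian noise is unbounded, so negative rewards can occur;
# with "truncated_normal", realizations stay within [reward_min, reward_max].
print(bandit_feedback["reward"].min())
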
18 changes: 12 additions & 6 deletions obp/dataset/synthetic_multi.py
@@ -74,6 +74,11 @@ class SyntheticMultiLoggersBanditDataset(SyntheticBanditDataset):
        A larger value leads to a noisier reward distribution.
        This argument is valid only when `reward_type="continuous"`.

+    reward_noise_distribution: str, default='normal'
+        The distribution from which reward noise is sampled; must be either 'normal' or 'truncated_normal'.
+        If 'truncated_normal' is given, the logged dataset contains no negative reward realizations.
+        This argument is valid only when `reward_type="continuous"`.
+
    action_context: np.ndarray, default=None
        Vector representation of (discrete) actions.
        If None, one-hot representation will be used.
@@ -272,12 +277,13 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
        expected_reward_ = self.calc_expected_reward(contexts)
        if RewardType(self.reward_type) == RewardType.CONTINUOUS:
            # correct expected_reward_, as we use truncated normal distribution here
-            mean = expected_reward_
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            expected_reward_ = truncnorm.stats(
-                a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
-            )
+            if self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                expected_reward_ = truncnorm.stats(
+                    a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
+                )

        # calculate the action choice probabilities of the behavior policy
        pi_b_logits = expected_reward_

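A small standalone sketch (not part of the PR; bounds and means below are hypothetical) of why `expected_reward_` is re-computed with `truncnorm.stats` only in the truncated-normal case: truncation shifts the mean of the reward distribution away from `loc`, whereas plain Gaussian noise leaves the expected value unchanged.

# Sketch: the mean of a truncated normal differs from its `loc` parameter.
import numpy as np
from scipy.stats import truncnorm

reward_min, reward_max, reward_std = 0.0, 5.0, 1.0  # hypothetical bounds
mean = np.array([0.2, 0.5, 0.8])  # hypothetical expected rewards

a = (reward_min - mean) / reward_std
b = (reward_max - mean) / reward_std
corrected = truncnorm.stats(a=a, b=b, loc=mean, scale=reward_std, moments="m")
print(corrected)  # larger than `mean`, since mass below reward_min is cut off

# Empirical check: sampled rewards match the corrected mean, not `mean` itself.
samples = truncnorm.rvs(
    a=a, b=b, loc=mean, scale=reward_std, size=(100_000, 3), random_state=0
)
print(samples.mean(axis=0))
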
1 change: 1 addition & 0 deletions obp/policy/offline.py
@@ -441,6 +441,7 @@ def fit(
            raise ValueError("When `self.len_list > 1`, `position` must be given.")

        unif_action_dist = np.ones((context.shape[0], self.n_actions, self.len_list))
+        unif_action_dist /= self.n_actions
        self.q_estimator.fit(
            context=context,
            action=action,

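An illustrative sketch (not from the PR; shapes are hypothetical) of what the added normalization fixes: `np.ones` alone produces rows summing to `n_actions`, while dividing by `n_actions` turns them into a proper uniform distribution over actions.

# Sketch: why the one-line normalization above is needed.
import numpy as np

n_rounds, n_actions, len_list = 4, 3, 1
unif_action_dist = np.ones((n_rounds, n_actions, len_list))
print(unif_action_dist.sum(axis=1).ravel())  # [3. 3. 3. 3.] -- rows do not sum to 1

unif_action_dist /= n_actions
print(unif_action_dist.sum(axis=1).ravel())  # [1. 1. 1. 1.] -- proper uniform distribution
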
4 changes: 3 additions & 1 deletion obp/simulator/coefficient_drifter.py
@@ -1,6 +1,8 @@
from collections import deque
from dataclasses import dataclass
-from typing import Optional, Tuple, List
+from typing import List
+from typing import Optional
+from typing import Tuple

import numpy as np
from sklearn.utils import check_random_state

3 changes: 2 additions & 1 deletion obp/simulator/replay.py
@@ -4,7 +4,8 @@
from obp.policy.policy_type import PolicyType
from obp.simulator.simulator import BanditPolicy
from obp.types import BanditFeedback
-from obp.utils import check_bandit_feedback_inputs, convert_to_action_dist
+from obp.utils import check_bandit_feedback_inputs
+from obp.utils import convert_to_action_dist


def run_bandit_replay(

12 changes: 9 additions & 3 deletions obp/simulator/simulator.py
@@ -5,11 +5,16 @@
from collections import defaultdict
from copy import deepcopy
from dataclasses import dataclass
-from typing import Callable, Tuple, Optional, List, Any
+from typing import Any
+from typing import Callable
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union

import numpy as np
-from sklearn.utils import check_random_state, check_scalar
+from sklearn.utils import check_random_state
+from sklearn.utils import check_scalar
from tqdm import tqdm

from ..dataset.reward_type import RewardType
@@ -18,7 +23,8 @@
from ..policy import BaseContextualPolicy
from ..policy.policy_type import PolicyType
from ..types import BanditFeedback
-from ..utils import check_bandit_feedback_inputs, check_array
+from ..utils import check_array
+from ..utils import check_bandit_feedback_inputs


# bandit policy type