Update Package Versions #189

Open · wants to merge 7 commits into master

2 changes: 1 addition & 1 deletion .github/workflows/lints.yml
@@ -17,7 +17,7 @@ jobs:
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
-         python-version: 3.7
+         python-version: 3.8

      - name: Black
        uses: psf/black@stable

2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -12,7 +12,7 @@ jobs:

    strategy:
      matrix:
-       python-version: [3.7, 3.8, 3.9]
+       python-version: [3.8, 3.9, 3.10]

    # Not intended for forks.
    if: github.repository == 'st-tech/zr-obp'

56 changes: 39 additions & 17 deletions obp/dataset/synthetic.py
@@ -3,8 +3,9 @@

"""Class for Generating Synthetic Logged Bandit Data."""
from dataclasses import dataclass
-from typing import Callable, Tuple
+from typing import Callable
from typing import Optional
+from typing import Tuple

import numpy as np
from scipy.stats import truncnorm
@@ -20,6 +21,7 @@
from .base import BaseBanditDataset
from .reward_type import RewardType

+
coef_func_signature = Callable[
    [np.ndarray, np.ndarray, np.random.RandomState],
    Tuple[np.ndarray, np.ndarray, np.ndarray],
@@ -77,6 +79,11 @@ class SyntheticBanditDataset(BaseBanditDataset):
        A larger value leads to a noisier reward distribution.
        This argument is valid only when `reward_type="continuous"`.

+    reward_noise_distribution: str, default='normal'
+        The distribution from which reward noise is sampled; must be either 'normal' or 'truncated_normal'.
+        If 'truncated_normal' is given, the logged dataset contains no negative reward realizations.
+        This argument is valid only when `reward_type="continuous"`.
+
    action_context: np.ndarray, default=None
        Vector representation of (discrete) actions.
        If None, one-hot representation will be used.
@@ -177,6 +184,7 @@ class SyntheticBanditDataset(BaseBanditDataset):
    reward_type: str = RewardType.BINARY.value
    reward_function: Optional[Callable[[np.ndarray, np.ndarray], np.ndarray]] = None
    reward_std: float = 1.0
+    reward_noise_distribution: str = "normal"
    action_context: Optional[np.ndarray] = None
    behavior_policy_function: Optional[
        Callable[[np.ndarray, np.ndarray], np.ndarray]
@@ -211,6 +219,12 @@ def __post_init__(self) -> None:
                f"`reward_type` must be either '{RewardType.BINARY.value}' or '{RewardType.CONTINUOUS.value}',"
                f"but {self.reward_type} is given.'"
            )
+        if self.reward_noise_distribution not in ["normal", "truncated_normal"]:
+            raise ValueError(
+                f"`reward_noise_distribution` must be either 'normal' or 'truncated_normal',"
+                f"but {self.reward_noise_distribution} is given.'"
+            )
+
        check_scalar(self.reward_std, "reward_std", (int, float), min_val=0)
        if self.reward_function is None:
            self.expected_reward = self.sample_contextfree_expected_reward()
@@ -263,16 +277,23 @@ def sample_reward_given_expected_reward(
        if RewardType(self.reward_type) == RewardType.BINARY:
            reward = self.random_.binomial(n=1, p=expected_reward_factual)
        elif RewardType(self.reward_type) == RewardType.CONTINUOUS:
-            mean = expected_reward_factual
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            reward = truncnorm.rvs(
-                a=a,
-                b=b,
-                loc=mean,
-                scale=self.reward_std,
-                random_state=self.random_state,
-            )
+            if self.reward_noise_distribution == "normal":
+                reward = self.random_.normal(
+                    loc=expected_reward_factual,
+                    scale=self.reward_std,
+                    size=action.shape,
+                )
+            elif self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_factual
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                reward = truncnorm.rvs(
+                    a=a,
+                    b=b,
+                    loc=mean,
+                    scale=self.reward_std,
+                    random_state=self.random_state,
+                )
        else:
            raise NotImplementedError

@@ -329,12 +350,13 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
        expected_reward_ = self.calc_expected_reward(contexts)
        if RewardType(self.reward_type) == RewardType.CONTINUOUS:
            # correct expected_reward_, as we use truncated normal distribution here
-            mean = expected_reward_
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            expected_reward_ = truncnorm.stats(
-                a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
-            )
+            if self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                expected_reward_ = truncnorm.stats(
+                    a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
+                )

        # calculate the action choice probabilities of the behavior policy
        if self.behavior_policy_function is None:

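For context, a minimal usage sketch of the new `reward_noise_distribution` argument (illustrative only, not part of the diff; the numeric values are hypothetical, and the argument names follow the constructor shown above):

# Sketch: contrasting the two reward-noise options (hypothetical values).
from obp.dataset import SyntheticBanditDataset, linear_reward_function

dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    reward_type="continuous",
    reward_function=linear_reward_function,
    reward_std=1.0,
    reward_noise_distribution="normal",  # new argument; default is "normal"
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10_000)

# With "normal", Gaussian noise is unbounded, so negative rewards can occur;
# with "truncated_normal", realizations stay within [reward_min, reward_max].
print(bandit_feedback["reward"].min())
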
18 changes: 12 additions & 6 deletions obp/dataset/synthetic_multi.py
@@ -74,6 +74,11 @@ class SyntheticMultiLoggersBanditDataset(SyntheticBanditDataset):
        A larger value leads to a noisier reward distribution.
        This argument is valid only when `reward_type="continuous"`.

+    reward_noise_distribution: str, default='normal'
+        The distribution from which reward noise is sampled; must be either 'normal' or 'truncated_normal'.
+        If 'truncated_normal' is given, the logged dataset contains no negative reward realizations.
+        This argument is valid only when `reward_type="continuous"`.
+
    action_context: np.ndarray, default=None
        Vector representation of (discrete) actions.
        If None, one-hot representation will be used.
@@ -272,12 +277,13 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
        expected_reward_ = self.calc_expected_reward(contexts)
        if RewardType(self.reward_type) == RewardType.CONTINUOUS:
            # correct expected_reward_, as we use truncated normal distribution here
-            mean = expected_reward_
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            expected_reward_ = truncnorm.stats(
-                a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
-            )
+            if self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                expected_reward_ = truncnorm.stats(
+                    a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
+                )

        # calculate the action choice probabilities of the behavior policy
        pi_b_logits = expected_reward_

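A small standalone sketch (not part of the PR; bounds and means below are hypothetical) of why `expected_reward_` is re-computed with `truncnorm.stats` only in the truncated-normal case: truncation shifts the mean of the reward distribution away from `loc`, whereas plain Gaussian noise leaves the expected value unchanged.

# Sketch: the mean of a truncated normal differs from its `loc` parameter.
import numpy as np
from scipy.stats import truncnorm

reward_min, reward_max, reward_std = 0.0, 5.0, 1.0  # hypothetical bounds
mean = np.array([0.2, 0.5, 0.8])  # hypothetical expected rewards

a = (reward_min - mean) / reward_std
b = (reward_max - mean) / reward_std
corrected = truncnorm.stats(a=a, b=b, loc=mean, scale=reward_std, moments="m")
print(corrected)  # larger than `mean`, since mass below reward_min is cut off

# Empirical check: sampled rewards match the corrected mean, not `mean` itself.
samples = truncnorm.rvs(
    a=a, b=b, loc=mean, scale=reward_std, size=(100_000, 3), random_state=0
)
print(samples.mean(axis=0))
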
1 change: 1 addition & 0 deletions obp/policy/offline.py
@@ -441,6 +441,7 @@ def fit(
            raise ValueError("When `self.len_list > 1`, `position` must be given.")

        unif_action_dist = np.ones((context.shape[0], self.n_actions, self.len_list))
+        unif_action_dist /= self.n_actions
        self.q_estimator.fit(
            context=context,
            action=action,

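An illustrative sketch (not from the PR; shapes are hypothetical) of what the added normalization fixes: `np.ones` alone produces rows summing to `n_actions`, while dividing by `n_actions` turns them into a proper uniform distribution over actions.

# Sketch: why the one-line normalization above is needed.
import numpy as np

n_rounds, n_actions, len_list = 4, 3, 1
unif_action_dist = np.ones((n_rounds, n_actions, len_list))
print(unif_action_dist.sum(axis=1).ravel())  # [3. 3. 3. 3.] -- rows do not sum to 1

unif_action_dist /= n_actions
print(unif_action_dist.sum(axis=1).ravel())  # [1. 1. 1. 1.] -- proper uniform distribution
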
4 changes: 3 additions & 1 deletion obp/simulator/coefficient_drifter.py
@@ -1,6 +1,8 @@
from collections import deque
from dataclasses import dataclass
-from typing import Optional, Tuple, List
+from typing import List
+from typing import Optional
+from typing import Tuple

import numpy as np
from sklearn.utils import check_random_state

3 changes: 2 additions & 1 deletion obp/simulator/replay.py
@@ -4,7 +4,8 @@
from obp.policy.policy_type import PolicyType
from obp.simulator.simulator import BanditPolicy
from obp.types import BanditFeedback
-from obp.utils import check_bandit_feedback_inputs, convert_to_action_dist
+from obp.utils import check_bandit_feedback_inputs
+from obp.utils import convert_to_action_dist


def run_bandit_replay(

12 changes: 9 additions & 3 deletions obp/simulator/simulator.py
@@ -5,11 +5,16 @@
from collections import defaultdict
from copy import deepcopy
from dataclasses import dataclass
-from typing import Callable, Tuple, Optional, List, Any
+from typing import Any
+from typing import Callable
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union

import numpy as np
-from sklearn.utils import check_random_state, check_scalar
+from sklearn.utils import check_random_state
+from sklearn.utils import check_scalar
from tqdm import tqdm

from ..dataset.reward_type import RewardType
@@ -18,7 +23,8 @@
from ..policy import BaseContextualPolicy
from ..policy.policy_type import PolicyType
from ..types import BanditFeedback
-from ..utils import check_bandit_feedback_inputs, check_array
+from ..utils import check_array
+from ..utils import check_bandit_feedback_inputs


# bandit policy type