24 changes: 24 additions & 0 deletions rllib/BUILD
@@ -2457,3 +2457,27 @@ py_test(
srcs = ["contrib/bandits/examples/simple_context_bandit.py"],
args = ["--as-test", "--stop-reward=10", "--run=contrib/LinUCB"],
)

+py_test(
+    name = "contrib/bandits/examples/lin_ts_train_wheel_env",
+    main = "contrib/bandits/examples/LinTS_train_wheel_env.py",
+    tags = ["examples", "examples_U"],
+    size = "small",
+    srcs = ["contrib/bandits/examples/LinTS_train_wheel_env.py"],
+)
+
+py_test(
+    name = "contrib/bandits/examples/tune_lin_ts_train_wheel_env",
+    main = "contrib/bandits/examples/tune_LinTS_train_wheel_env.py",
+    tags = ["examples", "examples_U"],
+    size = "small",
+    srcs = ["contrib/bandits/examples/tune_LinTS_train_wheel_env.py"],
+)
+
+py_test(
+    name = "contrib/bandits/examples/tune_lin_ucb_train_recommendation",
+    main = "contrib/bandits/examples/tune_LinUCB_train_recommendation.py",
+    tags = ["examples", "examples_U"],
+    size = "small",
+    srcs = ["contrib/bandits/examples/tune_LinUCB_train_recommendation.py"],
+)
5 changes: 3 additions & 2 deletions rllib/contrib/bandits/examples/LinTS_train_wheel_env.py
@@ -4,6 +4,7 @@

import numpy as np
from matplotlib import pyplot as plt

from ray.rllib.contrib.bandits.agents import LinTSTrainer
from ray.rllib.contrib.bandits.envs import WheelBanditEnv

@@ -28,7 +29,7 @@ def plot_model_weights(means, covs):


if __name__ == "__main__":
-    num_iter = 20
+    num_iter = 10
print("Running training for %s time steps" % num_iter)
trainer = LinTSTrainer(env=WheelBanditEnv)

@@ -42,7 +43,7 @@ def plot_model_weights(means, covs):
trainer.train()

info = trainer.train()
-    print(info["learner"])
+    print(info["info"]["learner"])

# Get model parameters
means = [model.arms[i].theta.numpy() for i in range(5)]
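For context, learner stats sit one level deeper in the result dict returned by `trainer.train()`, which is what this fix reflects. A minimal sketch of the new access pattern (mirrors the example above):

```python
from ray.rllib.contrib.bandits.agents import LinTSTrainer
from ray.rllib.contrib.bandits.envs import WheelBanditEnv

trainer = LinTSTrainer(env=WheelBanditEnv)
result = trainer.train()  # one training iteration

# Learner stats now live under the nested "info" key.
print(result["info"]["learner"])
```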
12 changes: 6 additions & 6 deletions rllib/contrib/bandits/examples/tune_LinTS_train_wheel_env.py
@@ -33,9 +33,9 @@ def plot_model_weights(means, covs, ax):
if __name__ == "__main__":
TS_CONFIG["env"] = WheelBanditEnv

-    # Actual training_iterations will be 20 * timesteps_per_iteration
-    # (100 by default) = 2,000
-    training_iterations = 20
+    # Actual training_iterations will be 10 * timesteps_per_iteration
+    # (100 by default) = 1,000
+    training_iterations = 10

print("Running training for %s time steps" % training_iterations)

@@ -49,19 +49,19 @@ def plot_model_weights(means, covs, ax):

print("The trials took", time.time() - start_time, "seconds\n")

-    # Analyze cumulative regrets of the trials
+    # Analyze mean episode rewards of the trials.
frame = pd.DataFrame()
for key, df in analysis.trial_dataframes.items():
frame = frame.append(df, ignore_index=True)

-    x = frame.groupby("num_steps_trained")[
-        "learner/cumulative_regret"].aggregate(["mean", "max", "min", "std"])
+    x = frame.groupby("agent_timesteps_total")[
+        "episode_reward_mean"].aggregate(["mean", "max", "min", "std"])

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))

ax1.plot(x["mean"])

-    ax1.set_title("Cumulative Regret")
+    ax1.set_title("Episode reward mean")
ax1.set_xlabel("Training steps")

# Restore trainer from checkpoint
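The tail of this script (cut off above) restores a trainer from one of the Tune checkpoints. A rough sketch of that pattern; the `get_best_trial`/`get_best_checkpoint` calls are generic Tune APIs used here for illustration, not code from this file, and `analysis`/`TS_CONFIG` come from the script:

```python
from ray.rllib.contrib.bandits.agents import LinTSTrainer

# Illustrative restore pattern: locate the best trial's checkpoint
# by mean episode reward, then rebuild the trainer from it.
best_trial = analysis.get_best_trial("episode_reward_mean", mode="max")
best_ckpt = analysis.get_best_checkpoint(
    best_trial, metric="episode_reward_mean", mode="max")

trainer = LinTSTrainer(config=TS_CONFIG)
trainer.restore(best_ckpt)
```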
rllib/contrib/bandits/examples/tune_LinUCB_train_recommendation.py
@@ -12,15 +12,14 @@
from ray.rllib.contrib.bandits.envs import ParametricItemRecoEnv

if __name__ == "__main__":

# Temp fix to avoid OMP conflict
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

UCB_CONFIG["env"] = ParametricItemRecoEnv

-    # Actual training_iterations will be 20 * timesteps_per_iteration
-    # (100 by default) = 2,000
-    training_iterations = 20
+    # Actual training_iterations will be 10 * timesteps_per_iteration
+    # (100 by default) = 1,000
+    training_iterations = 10

print("Running training for %s time steps" % training_iterations)

@@ -29,7 +28,7 @@
"contrib/LinUCB",
config=UCB_CONFIG,
stop={"training_iteration": training_iterations},
-        num_samples=5,
+        num_samples=2,
checkpoint_at_end=False)

print("The trials took", time.time() - start_time, "seconds\n")
@@ -38,8 +37,8 @@
frame = pd.DataFrame()
for key, df in analysis.trial_dataframes.items():
frame = frame.append(df, ignore_index=True)
-    x = frame.groupby("num_steps_trained")[
-        "learner/cumulative_regret"].aggregate(["mean", "max", "min", "std"])
+    x = frame.groupby("agent_timesteps_total")[
+        "episode_reward_mean"].aggregate(["mean", "max", "min", "std"])

plt.plot(x["mean"])
plt.fill_between(
@@ -48,6 +47,6 @@
x["mean"] + x["std"],
color="b",
alpha=0.2)
-    plt.title("Cumulative Regret")
+    plt.title("Episode reward mean")
plt.xlabel("Training steps")
plt.show()
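Pieced together, the post-training analysis now aggregates `episode_reward_mean` over `agent_timesteps_total` and plots a mean curve with a one-std band. A self-contained sketch, assuming `analysis` is the `tune.run(...)` result above (using `pd.concat`, since `DataFrame.append` is deprecated in current pandas):

```python
import pandas as pd
from matplotlib import pyplot as plt

# Stack all trial progress frames, then aggregate the episode reward
# across trials at each total-timestep mark.
frame = pd.concat(analysis.trial_dataframes.values(), ignore_index=True)
x = frame.groupby("agent_timesteps_total")[
    "episode_reward_mean"].aggregate(["mean", "max", "min", "std"])

# Mean curve with a +/- one-std band across the sampled trials.
plt.plot(x["mean"])
plt.fill_between(
    x.index, x["mean"] - x["std"], x["mean"] + x["std"],
    color="b", alpha=0.2)
plt.title("Episode reward mean")
plt.xlabel("Training steps")
plt.show()
```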
8 changes: 4 additions & 4 deletions rllib/contrib/bandits/exploration.py
@@ -20,10 +20,10 @@ def get_exploration_action(self,

def _get_torch_exploration_action(self, action_dist, explore):
if explore:
-            return action_dist.inputs.argmax(dim=1), None
+            return action_dist.inputs.argmax(dim=-1), None
else:
scores = self.model.predict(self.model.current_obs())
-            return scores.argmax(dim=1), None
+            return scores.argmax(dim=-1), None


class UCB(Exploration):
@@ -40,7 +40,7 @@ def get_exploration_action(self,

def _get_torch_exploration_action(self, action_dist, explore):
if explore:
-            return action_dist.inputs.argmax(dim=1), None
+            return action_dist.inputs.argmax(dim=-1), None
else:
scores = self.model.value_function()
-            return scores.argmax(dim=1), None
+            return scores.argmax(dim=-1), None
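For reference, switching from `dim=1` to `dim=-1` makes the argmax work for both batched and unbatched score tensors; a quick torch sketch:

```python
import torch

scores_1d = torch.tensor([0.1, 0.9, 0.3])    # unbatched action scores
scores_2d = torch.tensor([[0.1, 0.9, 0.3]])  # batch of one

# dim=1 would raise an IndexError on the 1-D tensor; dim=-1 always
# selects over the trailing (action) dimension, whatever the rank.
print(scores_1d.argmax(dim=-1))  # -> tensor(1)
print(scores_2d.argmax(dim=-1))  # -> tensor([1])
```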
22 changes: 14 additions & 8 deletions rllib/contrib/bandits/models/linear_regression.py
@@ -33,7 +33,7 @@ def _init_params(self):

def partial_fit(self, x, y):
# TODO: Handle batch of data rather than individual points
-        self._check_inputs(x, y)
+        x, y = self._check_inputs(x, y)
x = x.squeeze(0)
y = y.item()
self.time += 1
@@ -77,7 +77,7 @@ def forward(self, x, sample_theta=False):
posterior distribution to perform Thompson Sampling as per
http://proceedings.mlr.press/v28/agrawal13.pdf .
"""
-        self._check_inputs(x)
+        x = self._check_inputs(x)
theta = self.sample_theta() if sample_theta else self.theta
scores = x @ theta
return scores
@@ -94,6 +94,7 @@ def _check_inputs(self, x, y=None):
"Target should be a tensor;" \
"Only online learning with a batch size of 1 is " \
"supported for now!"
+        return x if y is None else (x, y)
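Note that `_check_inputs` now returns the validated inputs rather than only asserting, so every call site reassigns the result (see the `forward` changes below). A standalone sketch of the contract, with the validation details elided:

```python
import torch

def check_inputs(x, y=None):
    # Sketch of the new contract: validate, then hand the inputs back
    # so call sites can reassign them (validation details elided).
    if y is not None:
        assert torch.is_tensor(y), \
            "Target should be a tensor; only online learning with a " \
            "batch size of 1 is supported for now!"
    return x if y is None else (x, y)

x = torch.ones(1, 4)
x = check_inputs(x)                          # forward() path
x, y = check_inputs(x, torch.tensor([1.0]))  # partial_fit() path
```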


class DiscreteLinearModel(TorchModelV2, nn.Module):
@@ -189,14 +190,19 @@ def __init__(self, obs_space, action_space, num_outputs, model_config,
self._cur_ctx = None

def _check_inputs(self, x):
-        if x.ndim == 3:
-            assert x.size()[
-                0] == 1, "Only batch size of 1 is supported for now."
+        if x.ndim == 3 and x.size()[0] != 1:
+            # Just a test batch, slice to index 0.
+            if torch.all(x == 0.0):
+                x = x[0:1]
+            # An actual batch -> Error.
+            else:
+                raise ValueError("Only batch size of 1 is supported for now.")
+        return x

@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
x = input_dict["obs"]["item"]
-        self._check_inputs(x)
+        x = self._check_inputs(x)
x.squeeze_(dim=0) # Remove the batch dimension
scores = self.predict(x)
scores.unsqueeze_(dim=0) # Add the batch dimension
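The practical effect of the new `_check_inputs`: RLlib's all-zeros dummy batches (sent through the model when it is first constructed) now pass the check by being sliced down to a single row, while genuine multi-row batches still raise. A standalone sketch of the behavior:

```python
import torch

def check_inputs(x):
    # An all-zeros 3-D "test" batch with batch size > 1 is sliced down
    # to a single row; any real batch of that shape still errors out.
    if x.ndim == 3 and x.size()[0] != 1:
        if torch.all(x == 0.0):
            x = x[0:1]
        else:
            raise ValueError("Only batch size of 1 is supported for now.")
    return x

dummy = torch.zeros(32, 5, 4)      # e.g. a dummy init batch
print(check_inputs(dummy).shape)   # -> torch.Size([1, 5, 4])

try:
    check_inputs(torch.rand(2, 5, 4))  # a real multi-row batch
except ValueError as e:
    print(e)  # Only batch size of 1 is supported for now.
```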
@@ -230,7 +236,7 @@ def current_obs(self):
class ParametricLinearModelUCB(ParametricLinearModel):
def forward(self, input_dict, state, seq_lens):
x = input_dict["obs"]["item"]
-        self._check_inputs(x)
+        x = self._check_inputs(x)
x.squeeze_(dim=0) # Remove the batch dimension
scores = super(ParametricLinearModelUCB, self).predict(
x, sample_theta=False, use_ucb=True)
@@ -241,7 +247,7 @@ def forward(self, input_dict, state, seq_lens):
class ParametricLinearModelThompsonSampling(ParametricLinearModel):
def forward(self, input_dict, state, seq_lens):
x = input_dict["obs"]["item"]
-        self._check_inputs(x)
+        x = self._check_inputs(x)
x.squeeze_(dim=0) # Remove the batch dimension
scores = super(ParametricLinearModelThompsonSampling, self).predict(
x, sample_theta=True, use_ucb=False)