24 changes: 24 additions & 0 deletions rllib/BUILD
@@ -2457,3 +2457,27 @@ py_test(
srcs = ["contrib/bandits/examples/simple_context_bandit.py"],
args = ["--as-test", "--stop-reward=10", "--run=contrib/LinUCB"],
)

+py_test(
+    name = "contrib/bandits/examples/lin_ts_train_wheel_env",
+    main = "contrib/bandits/examples/LinTS_train_wheel_env.py",
+    tags = ["examples", "examples_U"],
+    size = "small",
+    srcs = ["contrib/bandits/examples/LinTS_train_wheel_env.py"],
+)
+
+py_test(
+    name = "contrib/bandits/examples/tune_lin_ts_train_wheel_env",
+    main = "contrib/bandits/examples/tune_LinTS_train_wheel_env.py",
+    tags = ["examples", "examples_U"],
+    size = "small",
+    srcs = ["contrib/bandits/examples/tune_LinTS_train_wheel_env.py"],
+)
+
+py_test(
+    name = "contrib/bandits/examples/tune_lin_ucb_train_recommendation",
+    main = "contrib/bandits/examples/tune_LinUCB_train_recommendation.py",
+    tags = ["examples", "examples_U"],
+    size = "small",
+    srcs = ["contrib/bandits/examples/tune_LinUCB_train_recommendation.py"],
+)
5 changes: 3 additions & 2 deletions rllib/contrib/bandits/examples/LinTS_train_wheel_env.py
@@ -4,6 +4,7 @@

import numpy as np
from matplotlib import pyplot as plt

from ray.rllib.contrib.bandits.agents import LinTSTrainer
from ray.rllib.contrib.bandits.envs import WheelBanditEnv

@@ -28,7 +29,7 @@ def plot_model_weights(means, covs):


if __name__ == "__main__":
-    num_iter = 20
+    num_iter = 10
print("Running training for %s time steps" % num_iter)
trainer = LinTSTrainer(env=WheelBanditEnv)

@@ -42,7 +43,7 @@ def plot_model_weights(means, covs):
trainer.train()

info = trainer.train()
-    print(info["learner"])
+    print(info["info"]["learner"])

# Get model parameters
means = [model.arms[i].theta.numpy() for i in range(5)]
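For context, learner stats sit one level deeper in the result dict returned by `trainer.train()`, which is what this fix reflects. A minimal sketch of the new access pattern (mirrors the example above):

```python
from ray.rllib.contrib.bandits.agents import LinTSTrainer
from ray.rllib.contrib.bandits.envs import WheelBanditEnv

trainer = LinTSTrainer(env=WheelBanditEnv)
result = trainer.train()  # one training iteration

# Learner stats now live under the nested "info" key.
print(result["info"]["learner"])
```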
12 changes: 6 additions & 6 deletions rllib/contrib/bandits/examples/tune_LinTS_train_wheel_env.py
@@ -33,9 +33,9 @@ def plot_model_weights(means, covs, ax):
if __name__ == "__main__":
TS_CONFIG["env"] = WheelBanditEnv

-    # Actual training_iterations will be 20 * timesteps_per_iteration
-    # (100 by default) = 2,000
-    training_iterations = 20
+    # Actual training_iterations will be 10 * timesteps_per_iteration
+    # (100 by default) = 1,000
+    training_iterations = 10

print("Running training for %s time steps" % training_iterations)

@@ -49,19 +49,19 @@ def plot_model_weights(means, covs, ax):

print("The trials took", time.time() - start_time, "seconds\n")

-    # Analyze cumulative regrets of the trials
+    # Analyze mean episode rewards of the trials.
frame = pd.DataFrame()
for key, df in analysis.trial_dataframes.items():
frame = frame.append(df, ignore_index=True)

-    x = frame.groupby("num_steps_trained")[
-        "learner/cumulative_regret"].aggregate(["mean", "max", "min", "std"])
+    x = frame.groupby("agent_timesteps_total")[
+        "episode_reward_mean"].aggregate(["mean", "max", "min", "std"])

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))

ax1.plot(x["mean"])

-    ax1.set_title("Cumulative Regret")
+    ax1.set_title("Episode reward mean")
ax1.set_xlabel("Training steps")

# Restore trainer from checkpoint
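The tail of this script (cut off above) restores a trainer from one of the Tune checkpoints. A rough sketch of that pattern; the `get_best_trial`/`get_best_checkpoint` calls are generic Tune APIs used here for illustration, not code from this file, and `analysis`/`TS_CONFIG` come from the script:

```python
from ray.rllib.contrib.bandits.agents import LinTSTrainer

# Illustrative restore pattern: locate the best trial's checkpoint
# by mean episode reward, then rebuild the trainer from it.
best_trial = analysis.get_best_trial("episode_reward_mean", mode="max")
best_ckpt = analysis.get_best_checkpoint(
    best_trial, metric="episode_reward_mean", mode="max")

trainer = LinTSTrainer(config=TS_CONFIG)
trainer.restore(best_ckpt)
```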
rllib/contrib/bandits/examples/tune_LinUCB_train_recommendation.py
@@ -12,15 +12,14 @@
from ray.rllib.contrib.bandits.envs import ParametricItemRecoEnv

if __name__ == "__main__":

# Temp fix to avoid OMP conflict
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

UCB_CONFIG["env"] = ParametricItemRecoEnv

-    # Actual training_iterations will be 20 * timesteps_per_iteration
-    # (100 by default) = 2,000
-    training_iterations = 20
+    # Actual training_iterations will be 10 * timesteps_per_iteration
+    # (100 by default) = 1,000
+    training_iterations = 10

print("Running training for %s time steps" % training_iterations)

@@ -29,7 +28,7 @@
"contrib/LinUCB",
config=UCB_CONFIG,
stop={"training_iteration": training_iterations},
-        num_samples=5,
+        num_samples=2,
checkpoint_at_end=False)

print("The trials took", time.time() - start_time, "seconds\n")
@@ -38,8 +37,8 @@
frame = pd.DataFrame()
for key, df in analysis.trial_dataframes.items():
frame = frame.append(df, ignore_index=True)
-    x = frame.groupby("num_steps_trained")[
-        "learner/cumulative_regret"].aggregate(["mean", "max", "min", "std"])
+    x = frame.groupby("agent_timesteps_total")[
+        "episode_reward_mean"].aggregate(["mean", "max", "min", "std"])

plt.plot(x["mean"])
plt.fill_between(
@@ -48,6 +47,6 @@
x["mean"] + x["std"],
color="b",
alpha=0.2)
-    plt.title("Cumulative Regret")
+    plt.title("Episode reward mean")
plt.xlabel("Training steps")
plt.show()
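Pieced together, the post-training analysis now aggregates `episode_reward_mean` over `agent_timesteps_total` and plots a mean curve with a one-std band. A self-contained sketch, assuming `analysis` is the `tune.run(...)` result above (using `pd.concat`, since `DataFrame.append` is deprecated in current pandas):

```python
import pandas as pd
from matplotlib import pyplot as plt

# Stack all trial progress frames, then aggregate the episode reward
# across trials at each total-timestep mark.
frame = pd.concat(analysis.trial_dataframes.values(), ignore_index=True)
x = frame.groupby("agent_timesteps_total")[
    "episode_reward_mean"].aggregate(["mean", "max", "min", "std"])

# Mean curve with a +/- one-std band across the sampled trials.
plt.plot(x["mean"])
plt.fill_between(
    x.index, x["mean"] - x["std"], x["mean"] + x["std"],
    color="b", alpha=0.2)
plt.title("Episode reward mean")
plt.xlabel("Training steps")
plt.show()
```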
8 changes: 4 additions & 4 deletions rllib/contrib/bandits/exploration.py
@@ -20,10 +20,10 @@ def get_exploration_action(self,

def _get_torch_exploration_action(self, action_dist, explore):
if explore:
-            return action_dist.inputs.argmax(dim=1), None
+            return action_dist.inputs.argmax(dim=-1), None
else:
scores = self.model.predict(self.model.current_obs())
-            return scores.argmax(dim=1), None
+            return scores.argmax(dim=-1), None


class UCB(Exploration):
@@ -40,7 +40,7 @@ def get_exploration_action(self,

def _get_torch_exploration_action(self, action_dist, explore):
if explore:
-            return action_dist.inputs.argmax(dim=1), None
+            return action_dist.inputs.argmax(dim=-1), None
else:
scores = self.model.value_function()
-            return scores.argmax(dim=1), None
+            return scores.argmax(dim=-1), None
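For reference, switching from `dim=1` to `dim=-1` makes the argmax work for both batched and unbatched score tensors; a quick torch sketch:

```python
import torch

scores_1d = torch.tensor([0.1, 0.9, 0.3])    # unbatched action scores
scores_2d = torch.tensor([[0.1, 0.9, 0.3]])  # batch of one

# dim=1 would raise an IndexError on the 1-D tensor; dim=-1 always
# selects over the trailing (action) dimension, whatever the rank.
print(scores_1d.argmax(dim=-1))  # -> tensor(1)
print(scores_2d.argmax(dim=-1))  # -> tensor([1])
```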
22 changes: 14 additions & 8 deletions rllib/contrib/bandits/models/linear_regression.py
@@ -33,7 +33,7 @@ def _init_params(self):

def partial_fit(self, x, y):
# TODO: Handle batch of data rather than individual points
-        self._check_inputs(x, y)
+        x, y = self._check_inputs(x, y)
x = x.squeeze(0)
y = y.item()
self.time += 1
@@ -77,7 +77,7 @@ def forward(self, x, sample_theta=False):
posterior distribution to perform Thompson Sampling as per
http://proceedings.mlr.press/v28/agrawal13.pdf .
"""
-        self._check_inputs(x)
+        x = self._check_inputs(x)
theta = self.sample_theta() if sample_theta else self.theta
scores = x @ theta
return scores
@@ -94,6 +94,7 @@ def _check_inputs(self, x, y=None):
"Target should be a tensor;" \
"Only online learning with a batch size of 1 is " \
"supported for now!"
+        return x if y is None else (x, y)
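Note that `_check_inputs` now returns the validated inputs rather than only asserting, so every call site reassigns the result (see the `forward` changes below). A standalone sketch of the contract, with the validation details elided:

```python
import torch

def check_inputs(x, y=None):
    # Sketch of the new contract: validate, then hand the inputs back
    # so call sites can reassign them (validation details elided).
    if y is not None:
        assert torch.is_tensor(y), \
            "Target should be a tensor; only online learning with a " \
            "batch size of 1 is supported for now!"
    return x if y is None else (x, y)

x = torch.ones(1, 4)
x = check_inputs(x)                          # forward() path
x, y = check_inputs(x, torch.tensor([1.0]))  # partial_fit() path
```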


class DiscreteLinearModel(TorchModelV2, nn.Module):
@@ -189,14 +190,19 @@ def __init__(self, obs_space, action_space, num_outputs, model_config,
self._cur_ctx = None

def _check_inputs(self, x):
-        if x.ndim == 3:
-            assert x.size()[
-                0] == 1, "Only batch size of 1 is supported for now."
+        if x.ndim == 3 and x.size()[0] != 1:
+            # Just a test batch, slice to index 0.
+            if torch.all(x == 0.0):
+                x = x[0:1]
+            # An actual batch -> Error.
+            else:
+                raise ValueError("Only batch size of 1 is supported for now.")
+        return x

@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
x = input_dict["obs"]["item"]
-        self._check_inputs(x)
+        x = self._check_inputs(x)
x.squeeze_(dim=0) # Remove the batch dimension
scores = self.predict(x)
scores.unsqueeze_(dim=0) # Add the batch dimension
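The practical effect of the new `_check_inputs`: RLlib's all-zeros dummy batches (sent through the model when it is first constructed) now pass the check by being sliced down to a single row, while genuine multi-row batches still raise. A standalone sketch of the behavior:

```python
import torch

def check_inputs(x):
    # An all-zeros 3-D "test" batch with batch size > 1 is sliced down
    # to a single row; any real batch of that shape still errors out.
    if x.ndim == 3 and x.size()[0] != 1:
        if torch.all(x == 0.0):
            x = x[0:1]
        else:
            raise ValueError("Only batch size of 1 is supported for now.")
    return x

dummy = torch.zeros(32, 5, 4)      # e.g. a dummy init batch
print(check_inputs(dummy).shape)   # -> torch.Size([1, 5, 4])

try:
    check_inputs(torch.rand(2, 5, 4))  # a real multi-row batch
except ValueError as e:
    print(e)  # Only batch size of 1 is supported for now.
```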
@@ -230,7 +236,7 @@ def current_obs(self):
class ParametricLinearModelUCB(ParametricLinearModel):
def forward(self, input_dict, state, seq_lens):
x = input_dict["obs"]["item"]
-        self._check_inputs(x)
+        x = self._check_inputs(x)
x.squeeze_(dim=0) # Remove the batch dimension
scores = super(ParametricLinearModelUCB, self).predict(
x, sample_theta=False, use_ucb=True)
@@ -241,7 +247,7 @@ def forward(self, input_dict, state, seq_lens):
class ParametricLinearModelThompsonSampling(ParametricLinearModel):
def forward(self, input_dict, state, seq_lens):
x = input_dict["obs"]["item"]
-        self._check_inputs(x)
+        x = self._check_inputs(x)
x.squeeze_(dim=0) # Remove the batch dimension
scores = super(ParametricLinearModelThompsonSampling, self).predict(
x, sample_theta=True, use_ucb=False)