[RLlib-contrib] SlateQ #36710
Merged · 7 commits · Oct 4, 2023
10 changes: 9 additions & 1 deletion .buildkite/pipeline.ml.yml
@@ -564,6 +564,14 @@
- ./ci/env/env_info.sh
- pytest rllib_contrib/maml/tests/test_maml.py

- label: ":exploding_death_star: RLlib Contrib: SlateQ Tests"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_CONTRIB_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- (cd rllib_contrib/slate_q && pip install -r requirements.txt && pip install -e .)
- ./ci/env/env_info.sh
- pytest rllib_contrib/slate_q/tests/
- python rllib_contrib/slate_q/examples/recommender_system_with_recsim_and_slateq.py --run-as-test

- label: ":exploding_death_star: RLlib Contrib: DDPG Tests"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_CONTRIB_AFFECTED"]
@@ -572,4 +580,4 @@
- (cd rllib_contrib/ddpg && pip install -r requirements.txt && pip install -e .)
- ./ci/env/env_info.sh
- pytest rllib_contrib/ddpg/tests/
- python rllib_contrib/ddpg/examples/ddpg_pendulum_v1.py --run-as-test
- python rllib_contrib/ddpg/examples/ddpg_pendulum_v1.py --run-as-test
17 changes: 17 additions & 0 deletions rllib_contrib/slate_q/README.md
@@ -0,0 +1,17 @@
# SlateQ

[SlateQ](https://storage.googleapis.com/pub-tools-public-publication-data/pdf/9f91de1fa0ac351ecb12e4062a37afb896aa1463.pdf) is a model-free RL method that builds on top of DQN and generates recommendation slates for recommender-system environments. Because these environments come with very large combinatorial action spaces, SlateQ decomposes the slate Q-value into single-item Q-values and solves the decomposed objective via mixed-integer programming and deep-learning optimization. SlateQ can be evaluated on Google's RecSim environment.


## Installation

```
conda create -n rllib-slateq python=3.10
conda activate rllib-slateq
pip install -r requirements.txt
pip install -e '.[development]'
```

## Usage

[SlateQ Example](examples/recommender_system_with_recsim_and_slateq.py)
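
A minimal training sketch, mirroring `examples/recommender_system_with_recsim_and_slateq.py` from this repository (the `env_config` keys below are the ones that script passes to the RecSim wrapper; adjust them for your own setup):

```
from ray import air, tune
from ray.rllib.examples.env.recommender_system_envs_with_recsim import (
    InterestEvolutionRecSimEnv,
)
from rllib_slate_q.slate_q import SlateQ, SlateQConfig

config = (
    SlateQConfig()
    .environment(
        InterestEvolutionRecSimEnv,
        env_config={
            "num_candidates": 100,  # candidate docs to pick from each timestep
            "slate_size": 2,        # docs recommended per slate
            "resample_documents": True,
            "seed": 0,
            "convert_to_discrete_action_space": False,
        },
    )
    .framework("torch")
    .rollouts(num_rollout_workers=7)
)

tune.Tuner(
    SlateQ,
    run_config=air.RunConfig(stop={"training_iteration": 200}),
    param_space=config,
).fit()
```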
163 changes: 163 additions & 0 deletions rllib_contrib/slate_q/examples/recommender_system_with_recsim_and_slateq.py
@@ -0,0 +1,163 @@
"""Using an RLlib-ready RecSim environment and the SlateQ algorithm
for solving recommendation system problems.

This example supports three different RecSim (RLlib-ready) environments,
configured via the --env option:
- "long-term-satisfaction"
- "interest-exploration"
- "interest-evolution"
"""

import argparse

import numpy as np
from rllib_slate_q.slate_q import SlateQ, SlateQConfig
from scipy.stats import sem

import ray
from ray import air, tune
from ray.rllib.examples.env.recommender_system_envs_with_recsim import (
InterestEvolutionRecSimEnv,
InterestExplorationRecSimEnv,
LongTermSatisfactionRecSimEnv,
)

parser = argparse.ArgumentParser()
parser.add_argument(
"--framework",
choices=["tf", "tf2", "torch"],
default="torch",
help="The DL framework specifier.",
)
parser.add_argument(
"--env",
type=str,
default="interest-evolution",
choices=["interest-evolution", "interest-exploration", "long-term-satisfaction"],
help=("Select the RecSim env to use."),
)

parser.add_argument(
"--random-test-episodes",
type=int,
default=0,
help="The number of test episodes to run with a random agent to figure out "
"up front what the random baseline reward is.",
)

parser.add_argument("--tune-num-samples", type=int, default=1)

parser.add_argument(
"--env-num-candidates",
type=int,
default=100,
help="The number of candidates that the agent has to pick "
"`--env-slate-size` from each timestep. These candidates will be "
"sampled by the environment's built-in document sampler model.",
)

parser.add_argument(
"--num-steps-sampled-before-learning_starts",
type=int,
default=20000,
help="Number of timesteps to collect from rollout workers before we start "
"sampling from replay buffers for learning..",
)

parser.add_argument(
"--env-slate-size",
type=int,
default=2,
help="The size of the slate to recommend (from out of "
"`--env-num-candidates` sampled docs) each timestep.",
)
parser.add_argument(
"--env-dont-resample-documents",
action="store_true",
help="Whether to NOT resample `--env-num-candidates` docs "
"each timestep. If set, the env will only sample `--env-num-candidates`"
" once at the beginning and the agent always has to pick "
"`--env-slate-size` docs from this sample.",
)

parser.add_argument("--run-as-test", action="store_true")


def main():
args = parser.parse_args()
ray.init()

env_config = {
"num_candidates": args.env_num_candidates,
"resample_documents": not args.env_dont_resample_documents,
"slate_size": args.env_slate_size,
"seed": 0,
"convert_to_discrete_action_space": False,
}

config = (
SlateQConfig()
.environment(
InterestEvolutionRecSimEnv
if args.env == "interest-evolution"
else InterestExplorationRecSimEnv
if args.env == "interest-exploration"
else LongTermSatisfactionRecSimEnv,
env_config=env_config,
)
.framework(args.framework)
.rollouts(num_rollout_workers=7)
.resources()
)

config.num_steps_sampled_before_learning_starts = (
args.num_steps_sampled_before_learning_starts
)

# Perform a test run on the env with a random agent to see what
# the random baseline reward is.
if args.random_test_episodes:
print(
f"Running {args.random_test_episodes} episodes to get a random "
"agent's baseline reward ..."
)
env = config["env"](config=env_config)
env.reset()
num_episodes = 0
episode_rewards = []
episode_reward = 0.0
while num_episodes < args.random_test_episodes:
action = env.action_space.sample()
_, r, d, t, _ = env.step(action)
episode_reward += r
# The episode is over when it is either terminated (d) or truncated (t).
if d or t:
num_episodes += 1
episode_rewards.append(episode_reward)
episode_reward = 0.0
env.reset()
print(
f"Ran {args.random_test_episodes} episodes with a random agent "
"reaching a mean episode return of "
f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}."
)

if args.run_as_test:
stop = {"training_iteration": 1}
else:
stop = {
"training_iteration": 200,
"timesteps_total": 150000,
"episode_reward_mean": 160,
}

tune.Tuner(
SlateQ,
run_config=air.RunConfig(
stop=stop,
),
param_space=config,
).fit()


if __name__ == "__main__":
main()
18 changes: 18 additions & 0 deletions rllib_contrib/slate_q/pyproject.toml
@@ -0,0 +1,18 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["src"]

[project]
name = "rllib-slate-q"
authors = [{name = "Anyscale Inc."}]
version = "0.1.0"
description = ""
readme = "README.md"
requires-python = ">=3.7, <3.11"
dependencies = ["gym==0.26.2", "recsim==0.2.4", "gymnasium[mujoco]==0.26.3", "ray[rllib]==2.5.1"]

[project.optional-dependencies]
development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "torch==1.12.0"]
2 changes: 2 additions & 0 deletions rllib_contrib/slate_q/requirements.txt
@@ -0,0 +1,2 @@
tensorflow==2.11.0
torch==1.12.0
11 changes: 11 additions & 0 deletions rllib_contrib/slate_q/src/rllib_slate_q/env/__init__.py
@@ -0,0 +1,11 @@
from rllib_slate_q.env.recommender_system_envs_with_recsim import (
InterestEvolutionRecSimEnv,
InterestExplorationRecSimEnv,
LongTermSatisfactionRecSimEnv,
)

__all__ = [
"InterestExplorationRecSimEnv",
"LongTermSatisfactionRecSimEnv",
"InterestEvolutionRecSimEnv",
]
106 changes: 106 additions & 0 deletions rllib_contrib/slate_q/src/rllib_slate_q/env/recommender_system_envs_with_recsim.py
@@ -0,0 +1,106 @@
"""Examples for RecSim envs ready to be used by RLlib Trainers

RecSim is a configurable recommender systems simulation platform.
Source: https://github.com/google-research/recsim
"""

from recsim import choice_model
from recsim.environments import interest_evolution as iev
from recsim.environments import interest_exploration as iex
from recsim.environments import long_term_satisfaction as lts

from ray.rllib.env.wrappers.recsim import make_recsim_env
from ray.tune import register_env

# Some built-in RecSim envs to test with.
# ---------------------------------------

# Long-term satisfaction env: User has to pick from items that are either
# a) unhealthy, but taste good, or b) healthy, but have bad taste.
# Best strategy is to pick a mix of both to ensure long-term
# engagement.


def lts_user_model_creator(env_ctx):
return lts.LTSUserModel(
env_ctx["slate_size"],
user_state_ctor=lts.LTSUserState,
response_model_ctor=lts.LTSResponse,
)


def lts_document_sampler_creator(env_ctx):
return lts.LTSDocumentSampler()


LongTermSatisfactionRecSimEnv = make_recsim_env(
recsim_user_model_creator=lts_user_model_creator,
recsim_document_sampler_creator=lts_document_sampler_creator,
reward_aggregator=lts.clicked_engagement_reward,
)


# Interest exploration env: Models the problem of active exploration
# of user interests. It is meant to illustrate popularity bias in
# recommender systems, where myopic maximization of engagement leads
# to bias towards documents that have wider appeal,
# whereas niche user interests remain unexplored.
def iex_user_model_creator(env_ctx):
return iex.IEUserModel(
env_ctx["slate_size"],
user_state_ctor=iex.IEUserState,
response_model_ctor=iex.IEResponse,
seed=env_ctx["seed"],
)


def iex_document_sampler_creator(env_ctx):
return iex.IETopicDocumentSampler(seed=env_ctx["seed"])


InterestExplorationRecSimEnv = make_recsim_env(
recsim_user_model_creator=iex_user_model_creator,
recsim_document_sampler_creator=iex_document_sampler_creator,
reward_aggregator=iex.total_clicks_reward,
)


# Interest evolution env: See https://github.com/google-research/recsim
# for more information.
def iev_user_model_creator(env_ctx):
return iev.IEvUserModel(
env_ctx["slate_size"],
choice_model_ctor=choice_model.MultinomialProportionalChoiceModel,
response_model_ctor=iev.IEvResponse,
user_state_ctor=iev.IEvUserState,
seed=env_ctx["seed"],
)


# Extend IEvVideo to fix a bug caused by None cluster_ids.
class SingleClusterIEvVideo(iev.IEvVideo):
def __init__(self, doc_id, features, video_length=None, quality=None):
super(SingleClusterIEvVideo, self).__init__(
doc_id=doc_id,
features=features,
cluster_id=0, # single cluster.
video_length=video_length,
quality=quality,
)


def iev_document_sampler_creator(env_ctx):
return iev.UtilityModelVideoSampler(doc_ctor=iev.IEvVideo, seed=env_ctx["seed"])


InterestEvolutionRecSimEnv = make_recsim_env(
recsim_user_model_creator=iev_user_model_creator,
recsim_document_sampler_creator=iev_document_sampler_creator,
reward_aggregator=iev.clicked_watchtime_reward,
)


# Backward compatibility.
register_env(
name="RecSim-v1", env_creator=lambda env_ctx: InterestEvolutionRecSimEnv(env_ctx)
)
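
The wrapped classes above are ordinary Gym-style environments and can also be constructed directly from an `env_config` dict. A small sketch, assuming the same config keys used by the example script in this PR:

```
from rllib_slate_q.env import InterestEvolutionRecSimEnv

# Config keys mirror examples/recommender_system_with_recsim_and_slateq.py.
env = InterestEvolutionRecSimEnv(
    config={
        "num_candidates": 10,
        "slate_size": 2,
        "resample_documents": True,
        "seed": 0,
        "convert_to_discrete_action_space": False,
    }
)

env.reset()
# Actions are slates of document indices; sample a random slate and step once.
action = env.action_space.sample()
obs, reward, terminated, truncated, info = env.step(action)
```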
9 changes: 9 additions & 0 deletions rllib_contrib/slate_q/src/rllib_slate_q/slate_q/__init__.py
@@ -0,0 +1,9 @@
from rllib_slate_q.slate_q.slateq import SlateQ, SlateQConfig
from rllib_slate_q.slate_q.slateq_tf_policy import SlateQTFPolicy
from rllib_slate_q.slate_q.slateq_torch_policy import SlateQTorchPolicy

from ray.tune.registry import register_trainable

__all__ = ["SlateQConfig", "SlateQ", "SlateQTFPolicy", "SlateQTorchPolicy"]

register_trainable("rllib-contrib-slate-q", SlateQ)