ray-project · krfricke · Jul 22, 2021 · Jul 22, 2021
diff --git a/run_ci_examples.sh b/run_ci_examples.sh
@@ -27,6 +27,7 @@ echo "running simple.py" && python simple.py --smoke-test
 echo "running simple_predict.py" && python simple_predict.py
 echo "running simple_dask.py" && python simple_dask.py --smoke-test
 echo "running simple_modin.py" && python simple_modin.py --smoke-test
+echo "running simple_objectstore.py" && python simple_objectstore.py --smoke-test
 
 if [ "$TUNE" = "1" ]; then
   echo "running simple_tune.py" && python simple_tune.py --smoke-test

diff --git a/xgboost_ray/data_sources/object_store.py b/xgboost_ray/data_sources/object_store.py
@@ -15,6 +15,8 @@ class ObjectStore(DataSource):
     @staticmethod
     def is_data_type(data: Any,
                      filetype: Optional[RayFileType] = None) -> bool:
+        """Check whether ``data`` is an object ref or a sequence of them.
+
+        Args:
+            data: Candidate data object.
+            filetype: Not used by this check.
+
+        Returns:
+            True if ``data`` is an ``ObjectRef`` or a non-empty sequence
+            consisting only of ``ObjectRef`` instances, False otherwise.
+        """
+        if isinstance(data, Sequence):
+            # ``all()`` is vacuously true for an empty sequence (this also
+            # covers the empty string), so require at least one element.
+            return len(data) > 0 and all(
+                isinstance(d, ObjectRef) for d in data)
         return isinstance(data, ObjectRef)
 
     @staticmethod

diff --git a/xgboost_ray/examples/simple_objectstore.py b/xgboost_ray/examples/simple_objectstore.py
@@ -0,0 +1,91 @@
+import argparse
+
+import numpy as np
+import pandas as pd
+
+import ray
+
+from xgboost_ray import RayDMatrix, train, RayParams
+
+
+def main(cpus_per_actor, num_actors):
+    """Train XGBoost on data partitions stored in the Ray object store.
+
+    A small synthetic dataset is built, split into four pandas partitions,
+    and each partition is put into the object store. The resulting list of
+    object references is passed to ``RayDMatrix`` as the training data.
+    The trained model is saved to ``simple_objectstore.xgb`` and the final
+    training error is printed.
+
+    Args:
+        cpus_per_actor: Number of CPUs for each training actor.
+        num_actors: Number of training actors to use.
+    """
+    # Generate dataset: 32 rows x 4 features, values 0..7
+    x = np.repeat(range(8), 16).reshape((32, 4))
+    # Even numbers --> 0, odd numbers --> 1
+    y = np.tile(np.repeat(range(2), 4), 4)
+
+    # Flip some bits to reduce max accuracy
+    bits_to_flip = np.random.choice(32, size=6, replace=False)
+    y[bits_to_flip] = 1 - y[bits_to_flip]
+
+    data = pd.DataFrame(x)
+    data["label"] = y
+
+    # Split into 4 equally sized partitions. The split is computed on
+    # positional row indices and applied with ``iloc`` because calling
+    # ``np.split`` directly on a DataFrame relies on the deprecated
+    # ``DataFrame.swapaxes``.
+    row_groups = np.split(np.arange(len(data)), 4)
+    partitions = [ray.put(data.iloc[rows]) for rows in row_groups]
+
+    # A list of object refs is accepted as a data source
+    train_set = RayDMatrix(partitions, "label")
+
+    evals_result = {}
+    # Set XGBoost config.
+    xgboost_params = {
+        "tree_method": "approx",
+        "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+    }
+
+    # Train the classifier
+    bst = train(
+        params=xgboost_params,
+        dtrain=train_set,
+        evals=[(train_set, "train")],
+        evals_result=evals_result,
+        ray_params=RayParams(
+            max_actor_restarts=0,
+            gpus_per_actor=0,
+            cpus_per_actor=cpus_per_actor,
+            num_actors=num_actors),
+        verbose_eval=False,
+        num_boost_round=10)
+
+    # Use a file name specific to this example so the output of other
+    # examples run from the same directory is not overwritten.
+    model_path = "simple_objectstore.xgb"
+    bst.save_model(model_path)
+    print("Final training error: {:.4f}".format(
+        evals_result["train"]["error"][-1]))
+
+
+if __name__ == "__main__":
+    # Command line entry point: parse options, connect to Ray, run training.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--address",
+        required=False,
+        type=str,
+        help="the address to use for Ray")
+    parser.add_argument(
+        "--server-address",
+        required=False,
+        type=str,
+        help="Address of the remote server if using Ray Client.")
+    parser.add_argument(
+        "--cpus-per-actor",
+        type=int,
+        default=1,
+        help="Sets number of CPUs per xgboost training worker.")
+    parser.add_argument(
+        "--num-actors",
+        type=int,
+        default=4,
+        help="Sets number of xgboost workers to use.")
+    parser.add_argument(
+        "--smoke-test",
+        action="store_true",
+        default=False,
+        help="Start a local Ray instance sized for a quick test run.")
+
+    # Unknown arguments are ignored rather than rejected.
+    args, _ = parser.parse_known_args()
+
+    if args.smoke_test:
+        # Local Ray instance with one CPU more than the number of actors
+        ray.init(num_cpus=args.num_actors + 1)
+    elif args.server_address:
+        # Connect through Ray Client
+        ray.util.connect(args.server_address)
+    else:
+        # ``args.address`` is None when --address was not given
+        ray.init(address=args.address)
+
+    main(args.cpus_per_actor, args.num_actors)