In [1]:
import ray

# Read the breast-cancer CSV from the public example bucket into a Ray Dataset.
dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")

# Hold out 30% of the rows for validation; the rest is used for training.
split_datasets = dataset.train_test_split(test_size=0.3)
train_dataset, valid_dataset = split_datasets

# Build the test set from the validation rows, minus the "target" label column.
test_dataset = valid_dataset.drop_columns(cols=["target"])

2023-07-07 16:05:29,590	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8266

Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode


Read progress 0:   0%|          | 0/1 [00:00<?, ?it/s]

Read progress 0:   0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
# Show the dataset summary (block count, row count, and column schema).
dataset

Dataset(
   num_blocks=1,
   num_rows=569,
   schema={
      mean radius: double,
      mean texture: double,
      mean perimeter: double,
      mean area: double,
      mean smoothness: double,
      mean compactness: double,
      mean concavity: double,
      mean concave points: double,
      mean symmetry: double,
      mean fractal dimension: double,
      radius error: double,
      texture error: double,
      perimeter error: double,
      area error: double,
      smoothness error: double,
      compactness error: double,
      concavity error: double,
      concave points error: double,
      symmetry error: double,
      fractal dimension error: double,
      worst radius: double,
      worst texture: double,
      worst perimeter: double,
      worst area: double,
      worst smoothness: double,
      worst compactness: double,
      worst concavity: double,
      worst concave points: double,
      worst symmetry: double,
      worst fractal dimension: double,
      targ

In [3]:
# Create a preprocessor to scale some columns.
from ray.data.preprocessors import StandardScaler

# Only these two feature columns are handed to the scaler.
columns_to_scale = ["mean radius", "mean texture"]
preprocessor = StandardScaler(columns=columns_to_scale)

In [5]:
%pip install xgboost-ray

Collecting xgboost-ray
  Downloading xgboost_ray-0.1.16-py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 2.9 MB/s eta 0:00:01
Collecting xgboost>=0.90
  Downloading xgboost-1.7.6-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 6.8 MB/s eta 0:00:01
Installing collected packages: xgboost, xgboost-ray
Successfully installed xgboost-1.7.6 xgboost-ray-0.1.16
You should consider upgrading via the '/Users/cx/Work/python/chatkg/venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

# NOTE(review): the output recorded below ends in TrainingFailedError; the
# underlying worker error is not visible in the captured logs.

# Resources for the distributed training run.
scaling_config = ScalingConfig(
    # Number of workers to use for data parallelism.
    num_workers=2,
    # Whether to use GPU acceleration.
    use_gpu=False,
    # Make sure to leave some CPUs free for Ray Data operations.
    _max_cpu_fraction_per_node=0.9,
)

# XGBoost specific params
xgboost_params = {
    "objective": "binary:logistic",
    # "tree_method": "gpu_hist",  # uncomment this to use GPUs.
    "eval_metric": ["logloss", "error"],
}

trainer = XGBoostTrainer(
    scaling_config=scaling_config,
    label_column="target",
    num_boost_round=20,
    params=xgboost_params,
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
)

# Run training and print the metrics attached to the returned result.
best_result = trainer.fit()
print(best_result.metrics)

(pid=42583) - Aggregate 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=42583) SortSample 2:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=42583) ShuffleMap 3:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=42583) ShuffleReduce 4:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=42583) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(XGBoostTrainer pid=42583)[0m 2023-07-07 16:09:11,322	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(XGBoostTrainer pid=42583)[0m 2023-07-07 16:09:11,322	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(XGBoostTrainer pid=42583)[0m 2023-07-07 16:09:11,322	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(XGBoostTrainer pid=42583)[0m 2023-07-07 16:09:11,322	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(XGBoostTrainer pid=42583)[0m 2023-07-07 16:09:11,322	INFO streaming_executor.py:94 -- Tip: For detailed progress repo

Trial name,date,hostname,node_ip,pid,timestamp,trial_id
XGBoostTrainer_d6ec1_00000,2023-07-07_16-09-11,chenxiaodeMBP,127.0.0.1,42583,1688738951,d6ec1_00000


2023-07-07 16:09:11,552	ERROR tune.py:1107 -- Trials did not complete: [XGBoostTrainer_d6ec1_00000]
2023-07-07 16:09:11,555	INFO tune.py:1111 -- Total run time: 3.09 seconds (3.07 seconds for the tuning loop).
- /Users/cx/ray_results/XGBoostTrainer_2023-07-07_16-09-08/XGBoostTrainer_d6ec1_00000_0_2023-07-07_16-09-08


TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = XGBoostTrainer.restore("/Users/cx/ray_results/XGBoostTrainer_2023-07-07_16-09-08")`.
To start a new run that will retry on training failures, set `air.RunConfig(failure_config=air.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.

In [1]:
import ray

# Reuse the running Ray instance if one was already started in this kernel
# (e.g. implicitly by an earlier Ray Data call); a bare ray.init() raises
# RuntimeError in that case. On a fresh kernel this starts Ray as before.
ray.init(ignore_reinit_error=True)

2023-07-07 16:13:11,018	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8266


0,1
Python version:,3.9.17
Ray version:,2.5.1
Dashboard:,http://127.0.0.1:8266


In [2]:
# Remote task that squares its argument.
@ray.remote
def square(x):
    """Return ``x`` multiplied by itself."""
    product = x * x
    return product

# Submit one square task for each input in 0..3 and keep the returned futures.
futures = list(map(square.remote, range(4)))

# Resolve all futures, then show the results.
squared_values = ray.get(futures)
print(squared_values)
# -> [0, 1, 4, 9]

[0, 1, 4, 9]


In [3]:
# Actor that keeps a running total as its state.
@ray.remote
class Counter:
    """Stateful counter that starts at zero."""

    def __init__(self):
        # Running total; within this class it is changed only by incr().
        self.i = 0

    def get(self):
        """Return the current total."""
        return self.i

    def incr(self, value):
        """Add ``value`` to the total."""
        self.i += value

# Start one Counter actor and keep a handle to it.
c = Counter.remote()

# Queue ten increments of 1. These calls run asynchronously but in
# submission order on the remote actor process.
for _ in range(10):
    c.incr.remote(1)

# Fetch the actor's state after the queued increments.
final_count_ref = c.get.remote()
print(ray.get(final_count_ref))
# -> 10

10


In [4]:
import numpy as np

# Remote task that adds up every element of a matrix.
@ray.remote
def sum_matrix(matrix):
    """Return ``np.sum(matrix)``, the sum over all elements."""
    total = np.sum(matrix)
    return total

# Pass the array inline as the task argument.
small_sum_ref = sum_matrix.remote(np.ones((100, 100)))
print(ray.get(small_sum_ref))
# -> 10000.0

# Store a larger array in the object store and keep its reference.
matrix_ref = ray.put(np.ones((1000, 1000)))

# Pass the object reference instead of the array itself.
large_sum_ref = sum_matrix.remote(matrix_ref)
print(ray.get(large_sum_ref))
# -> 1000000.0

10000.0
1000000.0
