[2/n] Lightweight Ray AIR API refactor #37123

Merged

Commits (80)
8d36b16  Lightweight Ray AIR API refactor (pcmoritz, Jun 22, 2023)
6cfdd91  shuffle (pcmoritz, Jun 25, 2023)
6f2f095  Merge branch 'master' into lightweight-ray-air-api-refactor (pcmoritz, Jun 25, 2023)
6bdbe14  update API (pcmoritz, Jun 25, 2023)
9d74cc2  compat (pcmoritz, Jun 25, 2023)
aa73331  update (pcmoritz, Jun 26, 2023)
3009ae9  move to internal (pcmoritz, Jun 26, 2023)
c26abb1  update (pcmoritz, Jun 26, 2023)
74be132  lint (pcmoritz, Jun 26, 2023)
6685d1f  update (pcmoritz, Jun 27, 2023)
9709afd  update (pcmoritz, Jun 28, 2023)
c525695  Merge branch 'master' into lightweight-ray-air-api-refactor (pcmoritz, Jun 28, 2023)
1e32b82  update (pcmoritz, Jun 28, 2023)
2c21498  add context (pcmoritz, Jul 1, 2023)
0b00002  Merge branch 'master' into lightweight-ray-air-api-refactor (pcmoritz, Jul 1, 2023)
90dc9e4  fix (pcmoritz, Jul 1, 2023)
561f74c  fix errors in docstrings (pcmoritz, Jul 1, 2023)
dcf0332  merge (pcmoritz, Jul 4, 2023)
995ac6a  update (pcmoritz, Jul 4, 2023)
7e2dfe3  fix (pcmoritz, Jul 4, 2023)
4ba674e  lint (pcmoritz, Jul 4, 2023)
d399bfa  update (pcmoritz, Jul 5, 2023)
2b2c683  lint (pcmoritz, Jul 5, 2023)
41173d5  save (pcmoritz, Jul 5, 2023)
a039d85  update (pcmoritz, Jul 6, 2023)
52dcee6  update (pcmoritz, Jul 6, 2023)
65a761e  update (pcmoritz, Jul 6, 2023)
27315c5  update (pcmoritz, Jul 6, 2023)
4264ff2  update (pcmoritz, Jul 6, 2023)
f37e59b  update (pcmoritz, Jul 6, 2023)
620a1d5  update (pcmoritz, Jul 6, 2023)
c79512a  fix (pcmoritz, Jul 6, 2023)
bd8d8a4  fix (pcmoritz, Jul 6, 2023)
50e83e7  update (pcmoritz, Jul 6, 2023)
36aae0e  update (pcmoritz, Jul 6, 2023)
660587d  update (pcmoritz, Jul 8, 2023)
cee3539  Merge branch 'master' into lightweight-ray-air-api-refactor-examples (pcmoritz, Jul 8, 2023)
9199f95  update (pcmoritz, Jul 8, 2023)
bb7b8bc  update (pcmoritz, Jul 8, 2023)
7d2f66b  fix (pcmoritz, Jul 9, 2023)
d7bb36e  update (pcmoritz, Jul 9, 2023)
e172b64  update (pcmoritz, Jul 9, 2023)
d2f9a0f  fixes (pcmoritz, Jul 10, 2023)
7a79946  update (pcmoritz, Jul 10, 2023)
d482d42  update (pcmoritz, Jul 10, 2023)
2cd94c0  update (pcmoritz, Jul 11, 2023)
e20910d  Merge branch 'master' into lightweight-ray-air-api-refactor-examples (pcmoritz, Jul 11, 2023)
b3d4cd5  update (pcmoritz, Jul 11, 2023)
36c4b34  Merge branch 'lightweight-ray-air-api-refactor-examples' of github.co… (pcmoritz, Jul 11, 2023)
956bee2  update (pcmoritz, Jul 12, 2023)
903df1b  update (pcmoritz, Jul 12, 2023)
6ea02ef  Merge branch 'master' into lightweight-ray-air-api-refactor-examples (pcmoritz, Jul 13, 2023)
0997ff5  fix (pcmoritz, Jul 13, 2023)
0cc9b1e  update (pcmoritz, Jul 13, 2023)
110176f  update (pcmoritz, Jul 14, 2023)
8514a8a  update (pcmoritz, Jul 15, 2023)
674bbe6  update (pcmoritz, Jul 15, 2023)
2721800  update (pcmoritz, Jul 15, 2023)
6b14c22  update (pcmoritz, Jul 15, 2023)
921c166  Merge branch 'master' into lightweight-ray-air-api-refactor-examples (pcmoritz, Aug 1, 2023)
ed1617e  update (pcmoritz, Aug 1, 2023)
10891a7  update (pcmoritz, Aug 1, 2023)
7a89de4  API update (pcmoritz, Aug 1, 2023)
ecff3eb  update (pcmoritz, Aug 1, 2023)
2926ca9  fix tests (pcmoritz, Aug 1, 2023)
64ee636  update (pcmoritz, Aug 1, 2023)
1469970  fixes (pcmoritz, Aug 1, 2023)
ff5064f  update (pcmoritz, Aug 2, 2023)
7fe6d8f  fix (pcmoritz, Aug 2, 2023)
f952741  Update doc/source/train/dl_guide.rst (pcmoritz, Aug 2, 2023)
497f559  Update doc/source/train/dl_guide.rst (pcmoritz, Aug 2, 2023)
c41be34  Update python/ray/train/mosaic/_mosaic_utils.py (pcmoritz, Aug 2, 2023)
329715c  Update python/ray/tune/experiment/experiment.py (pcmoritz, Aug 2, 2023)
9121156  Update python/ray/tune/impl/tuner_internal.py (pcmoritz, Aug 2, 2023)
b0040e2  Update python/ray/tune/impl/tuner_internal.py (pcmoritz, Aug 2, 2023)
8eee519  update (pcmoritz, Aug 2, 2023)
6c821e2  Merge branch 'master' into lightweight-ray-air-api-refactor-examples (pcmoritz, Aug 2, 2023)
99757d4  lint (pcmoritz, Aug 2, 2023)
4386a8a  Merge branch 'master' into lightweight-ray-air-api-refactor-examples (pcmoritz, Aug 2, 2023)
8ae82af  update (pcmoritz, Aug 2, 2023)
Files changed (changes from all commits)
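Taken together, these commits apply one mechanical migration of the lightweight Ray AIR API surface into ray.train. The mapping below is compiled from the hunks that follow; every entry appears in at least one diff in this PR:

# Old Ray AIR API                              New Ray Train API
# from ray.air import session              ->  from ray import train
# from ray.air import Checkpoint           ->  from ray.train import Checkpoint
# from ray.air.config import ScalingConfig ->  from ray.train import ScalingConfig
# session.report(...)                      ->  train.report(...)
# session.get_checkpoint()                 ->  train.get_checkpoint()
# session.get_dataset_shard("train")       ->  train.get_dataset_shard("train")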
@@ -49,8 +49,8 @@
 from torch.utils.data import random_split
 import torchvision
 import torchvision.transforms as transforms
-from ray import tune
-from ray.air import Checkpoint, session
+from ray import train, tune
+from ray.train import Checkpoint
 from ray.tune.schedulers import ASHAScheduler

@@ -125,7 +125,7 @@ def forward(self, x):
 #
 # net = Net(config["l1"], config["l2"])
 #
-# checkpoint = session.get_checkpoint()
+# checkpoint = ray.train.get_checkpoint()
 #
 # if checkpoint:
 # checkpoint_state = checkpoint.to_dict()

@@ -189,7 +189,7 @@ def forward(self, x):
 # }
 # checkpoint = Checkpoint.from_dict(checkpoint_data)
 #
-# session.report(
+# ray.train.report(
 # {"loss": val_loss / val_steps, "accuracy": correct / total},
 # checkpoint=checkpoint,
 # )

@@ -226,7 +226,7 @@ def train_cifar(config, data_dir=None):
     criterion = nn.CrossEntropyLoss()
     optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

-    checkpoint = session.get_checkpoint()
+    checkpoint = train.get_checkpoint()

     if checkpoint:
         checkpoint_state = checkpoint.to_dict()

@@ -303,7 +303,7 @@ def train_cifar(config, data_dir=None):
     }
     checkpoint = Checkpoint.from_dict(checkpoint_data)

-    session.report(
+    train.report(
        {"loss": val_loss / val_steps, "accuracy": correct / total},
        checkpoint=checkpoint,
    )
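For readers following the migration, here is a minimal sketch of the checkpoint-resume-and-report pattern the hunks above move to. The function name, the "epochs" config key, and the stand-in loss are illustrative, not part of this PR; the Checkpoint.from_dict/to_dict and train.report calls mirror the diff:

from ray import train
from ray.train import Checkpoint

def train_fn(config):  # hypothetical training function
    # Resume from a prior checkpoint if the session provides one.
    checkpoint = train.get_checkpoint()
    start_epoch = checkpoint.to_dict()["epoch"] + 1 if checkpoint else 0
    for epoch in range(start_epoch, config["epochs"]):
        loss = 1.0 / (epoch + 1)  # stand-in for a real training step
        # Report metrics and attach a checkpoint, as the diff does.
        train.report(
            {"loss": loss},
            checkpoint=Checkpoint.from_dict({"epoch": epoch}),
        )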
doc/source/data/doc_code/preprocessors.py (2 changes: 1 addition & 1 deletion)

@@ -50,7 +50,7 @@

 from ray.data.preprocessors import MinMaxScaler
 from ray.train.xgboost import XGBoostTrainer
-from ray.air.config import ScalingConfig
+from ray.train import ScalingConfig

 train_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(0, 32, 3)])
 valid_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(1, 32, 3)])
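As a usage sketch, the updated import slots into trainer construction like this. Only the imports and the two dataset lines come from the diff; the label_column and params values are illustrative assumptions:

import ray
from ray.train import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

train_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(0, 32, 3)])
valid_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(1, 32, 3)])

trainer = XGBoostTrainer(
    label_column="y",                          # illustrative
    params={"objective": "reg:squarederror"},  # illustrative
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_dataset, "valid": valid_dataset},
)
result = trainer.fit()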
doc/source/data/working-with-pytorch.rst (5 changes: 3 additions & 2 deletions)

@@ -47,7 +47,8 @@ Ray Data integrates with :ref:`Ray Train <train-docs>` for easy data ingest for
 import torch
 from torch import nn
 import ray
-from ray.air import session, ScalingConfig
+from ray import train
+from ray.train import ScalingConfig
 from ray.train.torch import TorchTrainer

 def train_func(config):

@@ -56,7 +57,7 @@ Ray Data integrates with :ref:`Ray Train <train-docs>` for easy data ingest for
     optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

     # Datasets can be accessed in your train_func via ``get_dataset_shard``.
-    train_data_shard = session.get_dataset_shard("train")
+    train_data_shard = train.get_dataset_shard("train")

     for epoch_idx in range(2):
         for batch in train_data_shard.iter_torch_batches(batch_size=128, dtypes=torch.float32):
doc/source/index.md (4 changes: 2 additions & 2 deletions)

@@ -132,7 +132,7 @@ predictions.show(limit=1)
 </div>
 <div class="tab-pane fade" id="v-pills-training" role="tabpanel" aria-labelledby="v-pills-training-tab" style="user-select:none;">
 <pre style="margin:0;"><code class="language-python not-selectable">
-from ray.air.config import ScalingConfig
+from ray.train import ScalingConfig
 from ray.train.torch import TorchTrainer

 # Step 1: setup PyTorch model training as you normally would

@@ -166,7 +166,7 @@ result = trainer.fit()
 <div class="tab-pane fade" id="v-pills-tuning" role="tabpanel" aria-labelledby="v-pills-tuning-tab" style="user-select:none;" style="user-select:none;">
 <pre style="margin:0;"><code class="language-python not-selectable">
 from ray import tune
-from ray.air.config import ScalingConfig
+from ray.train import ScalingConfig
 from ray.train.lightgbm import LightGBMTrainer

 train_dataset, eval_dataset = ...
doc/source/ray-air/computer-vision.rst (6 changes: 3 additions & 3 deletions)

@@ -207,13 +207,13 @@ Training vision models
 Creating checkpoints
 --------------------

-:class:`Checkpoints <ray.air.checkpoint.Checkpoint>` are required for batch inference and model
+:class:`Checkpoints <ray.train.Checkpoint>` are required for batch inference and model
 serving. They contain model state and optionally a preprocessor.

 If you're going from training to prediction, don't create a new checkpoint.
 :meth:`Trainer.fit() <ray.train.trainer.BaseTrainer.fit>` returns a
-:class:`~ray.air.result.Result` object. Use
-:attr:`Result.checkpoint <ray.air.result.Result.checkpoint>` instead.
+:class:`~ray.train.Result` object. Use
+:attr:`Result.checkpoint <ray.train.Result.checkpoint>` instead.

 .. tab-set::
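A minimal sketch of that advice, assuming trainer is an already-configured Ray Train trainer:

result = trainer.fit()          # returns a ray.train.Result
checkpoint = result.checkpoint  # reuse this checkpoint for batch inference
                                # or serving instead of creating a new one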
doc/source/ray-air/doc_code/computer_vision.py (13 changes: 6 additions & 7 deletions)

@@ -186,12 +186,11 @@ def train_torch_model(dataset, preprocessor, per_epoch_preprocessor):
     from torchvision import models

     from ray import train
-    from ray.air import session
-    from ray.air.config import ScalingConfig
+    from ray.train import ScalingConfig
     from ray.train.torch import TorchCheckpoint, TorchTrainer

     def train_one_epoch(model, *, criterion, optimizer, batch_size, epoch):
-        dataset_shard = session.get_dataset_shard("train")
+        dataset_shard = train.get_dataset_shard("train")

         running_loss = 0
         for i, batch in enumerate(

@@ -210,7 +209,7 @@ def train_one_epoch(model, *, criterion, optimizer, batch_size, epoch):

             running_loss += loss.item()
             if i % 2000 == 1999:
-                session.report(
+                train.report(
                     metrics={
                         "epoch": epoch,
                         "batch": i,

@@ -254,13 +253,13 @@ def train_tensorflow_model(dataset, preprocessor, per_epoch_preprocessor):
     # __tensorflow_training_loop_start__
     import tensorflow as tf

-    from ray.air import session
+    from ray import train
     from ray.air.integrations.keras import ReportCheckpointCallback

     def train_loop_per_worker(config):
         strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

-        train_shard = session.get_dataset_shard("train")
+        train_shard = train.get_dataset_shard("train")
         train_dataset = train_shard.to_tf(
             "image",
             "label",

@@ -286,7 +285,7 @@ def train_loop_per_worker(config):
     # __tensorflow_training_loop_stop__

     # __tensorflow_trainer_start__
-    from ray.air import ScalingConfig
+    from ray.train import ScalingConfig
     from ray.train.tensorflow import TensorflowTrainer

     # The following transform operation is lazy.
doc/source/ray-air/doc_code/hvd_trainer.py (13 changes: 6 additions & 7 deletions)

@@ -1,12 +1,11 @@
+import horovod.torch as hvd
 import ray
-import ray.train as train
+from ray import train
+from ray.train import Checkpoint, ScalingConfig
 import ray.train.torch  # Need this to use `train.torch.get_device()`
-import horovod.torch as hvd
+from ray.train.horovod import HorovodTrainer
 import torch
 import torch.nn as nn
-from ray.air import session, Checkpoint
-from ray.train.horovod import HorovodTrainer
-from ray.air.config import ScalingConfig

 # If using GPUs, set this to True.
 use_gpu = False

@@ -31,7 +30,7 @@ def forward(self, input):

 def train_loop_per_worker():
     hvd.init()
-    dataset_shard = session.get_dataset_shard("train")
+    dataset_shard = train.get_dataset_shard("train")
     model = NeuralNetwork()
     device = train.torch.get_device()
     model.to(device)

@@ -56,7 +55,7 @@ def train_loop_per_worker():
         loss.backward()
         optimizer.step()
         print(f"epoch: {epoch}, loss: {loss.item()}")
-        session.report(
+        train.report(
             {},
             checkpoint=Checkpoint.from_dict(dict(model=model.state_dict())),
         )
@@ -4,9 +4,8 @@
 # __air_session_start__

 import tensorflow as tf
-from ray.air import session
-from ray.air.checkpoint import Checkpoint
-from ray.air.config import ScalingConfig
+from ray import train
+from ray.train import Checkpoint, ScalingConfig
 from ray.train.tensorflow import TensorflowTrainer


@@ -22,7 +21,7 @@ def build_model() -> tf.keras.Model:


 def train_func():
-    ckpt = session.get_checkpoint()
+    ckpt = train.get_checkpoint()
     if ckpt:
         with ckpt.as_directory() as loaded_checkpoint_dir:
             import tensorflow as tf

@@ -32,9 +31,7 @@ def train_func():
     model = build_model()

     model.save("my_model", overwrite=True)
-    session.report(
-        metrics={"iter": 1}, checkpoint=Checkpoint.from_directory("my_model")
-    )
+    train.report(metrics={"iter": 1}, checkpoint=Checkpoint.from_directory("my_model"))


 scaling_config = ScalingConfig(num_workers=2)
doc/source/ray-air/doc_code/tf_starter.py (6 changes: 3 additions & 3 deletions)

@@ -5,10 +5,10 @@
 import ray
 import tensorflow as tf

-from ray.air import session
+from ray import train
+from ray.train import ScalingConfig
 from ray.air.integrations.keras import ReportCheckpointCallback
 from ray.train.tensorflow import TensorflowTrainer
-from ray.air.config import ScalingConfig


 # If using GPUs, set this to True.

@@ -46,7 +46,7 @@ def train_func(config: dict):
         metrics=[tf.keras.metrics.mean_squared_error],
     )

-    dataset = session.get_dataset_shard("train")
+    dataset = train.get_dataset_shard("train")

     results = []
     for _ in range(epochs):
doc/source/ray-air/doc_code/torch_trainer.py (7 changes: 3 additions & 4 deletions)

@@ -3,9 +3,8 @@

 import ray
 from ray import train
-from ray.air import session, Checkpoint
+from ray.train import Checkpoint, ScalingConfig
 from ray.train.torch import TorchTrainer
-from ray.air.config import ScalingConfig


 # If using GPUs, set this to True.

@@ -30,7 +29,7 @@ def forward(self, input):


 def train_loop_per_worker():
-    dataset_shard = session.get_dataset_shard("train")
+    dataset_shard = train.get_dataset_shard("train")
     model = NeuralNetwork()
     loss_fn = nn.MSELoss()
     optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

@@ -49,7 +48,7 @@ def train_loop_per_worker():
         optimizer.step()
         print(f"epoch: {epoch}, loss: {loss.item()}")

-        session.report(
+        train.report(
             {},
             checkpoint=Checkpoint.from_dict(
                 dict(epoch=epoch, model=model.state_dict())
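The hunks above end before the trainer is constructed. As a sketch of how the renamed pieces are typically wired together, under the assumption that train_loop_per_worker, use_gpu, and a "train" dataset are defined in the surrounding file (the worker count is also an assumption, not shown in this diff):

from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

# train_loop_per_worker, use_gpu, and train_dataset come from the
# surrounding file; num_workers=2 is an assumed value.
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=use_gpu),
    datasets={"train": train_dataset},
)
result = trainer.fit()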