## Training

Define the training workload by specifying the:
- experiment and model parameters
- compute scaling configuration
- forward pass for batches of training and validation data
- train loop for each epoch of data and checkpointing

<img src="https://raw.githubusercontent.com/anyscale/multimodal-ai/refs/heads/main/images/trainer.png" width=700>

In [None]:
# Train loop config: experiment identity plus the model, optimizer and
# LR-scheduler hyperparameters that are handed to every training worker.
experiment_name = "doggos"
train_loop_config = dict(
    # Experiment tracking.
    model_registry=model_registry,
    experiment_name=experiment_name,
    # Model architecture.
    embedding_dim=512,
    hidden_dim=256,
    dropout_p=0.3,
    # Optimizer and learning-rate schedule.
    lr=1e-3,
    lr_factor=0.8,
    lr_patience=3,
    # Loop sizing.
    num_epochs=20,
    batch_size=256,
)


In [None]:
# Scaling config: 4 GPU training workers, each reserving 8 CPUs and 2 GPUs
# on T4 accelerators.
# NOTE(review): each worker reserves 2 GPUs; confirm the training loop
# actually uses both, otherwise one GPU per worker sits idle.
num_workers = 4
scaling_config = ray.train.ScalingConfig(
    use_gpu=True,
    accelerator_type="T4",
    num_workers=num_workers,
    resources_per_worker=dict(CPU=8, GPU=2),
)


In [None]:
import tempfile
import mlflow
import numpy as np
from ray.train.torch import TorchTrainer


In [None]:
def train_epoch(ds, batch_size, model, num_classes, loss_fn, optimizer):
    """Run one optimization pass over ``ds`` and return the mean batch loss.

    The model is switched to training mode, then for every batch produced by
    ``ds.iter_torch_batches`` the gradients are reset, the loss between the
    model output and the one-hot encoded ``batch["label"]`` is computed and
    back-propagated, and the optimizer takes a step.

    Args:
        ds: Dataset (shard) exposing ``iter_torch_batches``.
        batch_size: Number of rows requested per batch.
        model: Callable model; invoked directly on each batch.
        num_classes: Width of the one-hot target encoding.
        loss_fn: Callable taking ``(model_output, one_hot_targets)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the update.

    Returns:
        Running mean of the per-batch loss values as a Python float
        (``0.0`` if the dataset yields no batches).
    """
    model.train()
    running_loss = 0.0
    batches = ds.iter_torch_batches(batch_size=batch_size, collate_fn=collate_fn)
    for step, batch in enumerate(batches, start=1):
        optimizer.zero_grad()
        logits = model(batch)
        one_hot_targets = F.one_hot(batch["label"], num_classes=num_classes).float()
        batch_loss = loss_fn(logits, one_hot_targets)
        batch_loss.backward()
        optimizer.step()
        # Incremental mean: fold this batch's loss into the average so far.
        running_loss += (batch_loss.detach().item() - running_loss) / step
    return running_loss


In [None]:
def eval_epoch(ds, batch_size, model, num_classes, loss_fn):
    """Evaluate ``model`` over ``ds`` without gradient tracking.

    The model is switched to eval mode and every batch is scored inside
    ``torch.inference_mode()``. Predictions are the argmax of the model
    output along dimension 1.

    Args:
        ds: Dataset (shard) exposing ``iter_torch_batches``.
        batch_size: Number of rows requested per batch.
        model: Callable model; invoked directly on each batch.
        num_classes: Width of the one-hot target encoding used for the loss.
        loss_fn: Callable taking ``(model_output, one_hot_targets)``.

    Returns:
        Tuple ``(mean_loss, y_trues, y_preds)``: the running mean of the
        per-batch loss as a float, and the collected labels and predicted
        class indices, each stacked with ``np.vstack``.

    Note:
        If the dataset yields no batches the collected lists are empty and
        ``np.vstack`` is still called on them.
    """
    model.eval()
    mean_loss = 0.0
    labels, predictions = [], []
    batches = ds.iter_torch_batches(batch_size=batch_size, collate_fn=collate_fn)
    with torch.inference_mode():
        for step, batch in enumerate(batches, start=1):
            logits = model(batch)
            # The loss function is fed one-hot targets rather than class indices.
            one_hot_targets = F.one_hot(batch["label"], num_classes=num_classes).float()
            batch_loss = loss_fn(logits, one_hot_targets).item()
            # Incremental mean over the batches seen so far.
            mean_loss += (batch_loss - mean_loss) / step
            labels.extend(batch["label"].cpu().numpy())
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return mean_loss, np.vstack(labels), np.vstack(predictions)


In [None]:
def train_loop_per_worker(config):
    """Training function executed by each Ray Train worker.

    Builds the model and training components from ``config``, trains for
    ``num_epochs`` epochs on this worker's ``"train"`` dataset shard,
    evaluates on the ``"val"`` shard after every epoch, and steps the
    learning-rate scheduler on the validation loss.

    Experiment tracking happens only on world rank 0: the MLflow run is
    started before training, per-epoch metrics are logged, and the saved
    model plus ``class_to_label.json`` are logged as artifacts whenever the
    validation loss improves. The run is always closed, with status
    ``"FAILED"`` if training raised and ``"FINISHED"`` otherwise.

    Args:
        config: Dict with the keys ``model_registry``, ``experiment_name``,
            ``embedding_dim``, ``hidden_dim``, ``dropout_p``, ``lr``,
            ``lr_factor``, ``lr_patience``, ``num_epochs``, ``batch_size``,
            ``num_classes`` and ``class_to_label``.

    Raises:
        KeyError: If a required key is missing from ``config``.
    """
    # Hyperparameters.
    model_registry = config["model_registry"]
    experiment_name = config["experiment_name"]
    embedding_dim = config["embedding_dim"]
    hidden_dim = config["hidden_dim"]
    dropout_p = config["dropout_p"]
    lr = config["lr"]
    lr_factor = config["lr_factor"]
    lr_patience = config["lr_patience"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_classes = config["num_classes"]

    # Only world rank 0 talks to MLflow; look the rank up once and reuse it.
    is_main_worker = ray.train.get_context().get_world_rank() == 0

    # Experiment tracking.
    if is_main_worker:
        mlflow.set_tracking_uri(f"file:{model_registry}")
        mlflow.set_experiment(experiment_name)
        mlflow.start_run()

    # Assume failure until the loop completes, so that an exception anywhere
    # below closes the run as FAILED instead of leaving it open.
    run_status = "FAILED"
    try:
        if is_main_worker:
            mlflow.log_params(config)

        # Datasets.
        train_ds = ray.train.get_dataset_shard("train")
        val_ds = ray.train.get_dataset_shard("val")

        # Model.
        model = ClassificationModel(
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            dropout_p=dropout_p,
            num_classes=num_classes,
        )
        model = ray.train.torch.prepare_model(model)
        # `prepare_model` may hand the model back without a distributed
        # wrapper (e.g. a single-worker run), in which case there is no
        # `.module` attribute; fall back to the model itself.
        unwrapped_model = getattr(model, "module", model)

        # Training components.
        loss_fn = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode="min",
            factor=lr_factor,
            patience=lr_patience,
        )

        # Training.
        best_val_loss = float("inf")
        for epoch in range(num_epochs):
            # Steps
            train_loss = train_epoch(
                train_ds, batch_size, model, num_classes, loss_fn, optimizer
            )
            val_loss, _, _ = eval_epoch(
                val_ds, batch_size, model, num_classes, loss_fn
            )
            scheduler.step(val_loss)

            # Checkpoint (metrics, preprocessor and model artifacts).
            # NOTE(review): every worker writes the artifacts to its own temp
            # dir but only rank 0 uploads them; if `save` is a purely local
            # write, this block could be restricted to rank 0 — confirm.
            with tempfile.TemporaryDirectory() as dp:
                unwrapped_model.save(dp=dp)
                metrics = dict(
                    lr=optimizer.param_groups[0]["lr"],
                    train_loss=train_loss,
                    val_loss=val_loss,
                )
                with open(os.path.join(dp, "class_to_label.json"), "w") as fp:
                    json.dump(config["class_to_label"], fp, indent=4)
                if is_main_worker:  # only on main worker 0
                    mlflow.log_metrics(metrics, step=epoch)
                    # Upload artifacts only when validation loss improves.
                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        mlflow.log_artifacts(dp)

        run_status = "FINISHED"
    finally:
        # End experiment tracking.
        if is_main_worker:
            mlflow.end_run(status=run_status)


<div class="alert alert-block"> <b>Minimal change to your training code</b>

Notice that there isn't much new Ray Train code on top of the base PyTorch code. You specified how you want to scale out the training workload, loaded the Ray datasets, and checkpointed on the main worker node, and that's it. See these guides ([PyTorch](https://docs.ray.io/en/latest/train/getting-started-pytorch.html), [PyTorch Lightning](https://docs.ray.io/en/latest/train/getting-started-pytorch-lightning.html), [Hugging Face Transformers](https://docs.ray.io/en/latest/train/getting-started-transformers.html)) for the minimal code changes needed to distribute your training workloads. Also see this extensive list of [Ray Train user guides](https://docs.ray.io/en/latest/train/user-guides.html).

</div>

In [None]:
# Load the preprocessed train and validation splits from their Parquet
# locations (train first, then validation).
preprocessed_train_ds, preprocessed_val_ds = [
    ray.data.read_parquet(path)
    for path in (preprocessed_train_path, preprocessed_val_path)
]


In [None]:
# Trainer: add the label mapping and class count to the loop config, then
# bind the per-worker loop, its config, the scaling config and both
# datasets into a TorchTrainer.
train_loop_config.update(
    class_to_label=preprocessor.class_to_label,
    num_classes=len(preprocessor.class_to_label),
)
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=train_loop_config,
    scaling_config=scaling_config,
    datasets=dict(train=preprocessed_train_ds, val=preprocessed_val_ds),
)


In [None]:
# Train: run the configured trainer; `results` holds whatever `fit()` returns.
results = trainer.fit()


[36m(TrainController pid=125066)[0m [State Transition] INITIALIZING -> SCHEDULING.
[36m(TrainController pid=125066)[0m Attempting to start training worker group of size 4 with the following resources: [{'CPU': 8, 'GPU': 2, 'accelerator_type:T4': 0.001}] * 4
[36m(TrainController pid=125066)[0m Using blocking ray.get inside async actor. This blocks the event loop. Please use `await` on object ref with asyncio.gather if you want to yield execution to the event loop instead.


[36m(autoscaler +3m40s)[0m [autoscaler] [4xT4:48CPU-192GB] Attempting to add 1 node to the cluster (increasing from 1 to 2).
[36m(autoscaler +3m40s)[0m [autoscaler] [4xT4:48CPU-192GB|g4dn.12xlarge] [us-west-2a] [on-demand] Launched 1 instance.
[36m(autoscaler +3m45s)[0m [autoscaler] Cluster upscaled to {112 CPU, 8 GPU}.


[36m(TrainController pid=125066)[0m Retrying the launch of the training worker group. The previous launch attempt encountered the following failure:
[36m(TrainController pid=125066)[0m [State Transition] SCHEDULING -> RESCHEDULING.
[36m(TrainController pid=125066)[0m [State Transition] RESCHEDULING -> SCHEDULING.
[36m(TrainController pid=125066)[0m Attempting to start training worker group of size 4 with the following resources: [{'CPU': 8, 'GPU': 2, 'accelerator_type:T4': 0.001}] * 4


[36m(autoscaler +4m30s)[0m [autoscaler] Cluster upscaled to {160 CPU, 12 GPU}.


[36m(RayTrainWorker pid=3319, ip=10.0.34.27)[0m Setting up process group for: env:// [rank=0, world_size=4]
[36m(RayTrainWorker pid=16056, ip=10.0.4.102)[0m Moving model to device: cuda:0
[36m(TrainController pid=125066)[0m Started training worker group of size 4: 
[36m(TrainController pid=125066)[0m - (ip=10.0.34.27, pid=3319) world_rank=0, local_rank=0, node_rank=0
[36m(TrainController pid=125066)[0m - (ip=10.0.34.27, pid=3320) world_rank=1, local_rank=1, node_rank=0
[36m(TrainController pid=125066)[0m - (ip=10.0.4.102, pid=16056) world_rank=2, local_rank=0, node_rank=1
[36m(TrainController pid=125066)[0m - (ip=10.0.4.102, pid=16055) world_rank=3, local_rank=1, node_rank=1
[36m(TrainController pid=125066)[0m [State Transition] SCHEDULING -> RUNNING.
[36m(RayTrainWorker pid=3319, ip=10.0.34.27)[0m 2025/08/22 00:32:11 INFO mlflow.tracking.fluent: Experiment with name 'doggos' does not exist. Creating a new experiment.
[36m(RayTrainWorker pid=16056, ip=10.0.4.102)[0m

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

[36m(SplitCoordinator pid=125821)[0m Registered dataset logger for dataset train_88_0
[36m(SplitCoordinator pid=125821)[0m Starting execution of Dataset train_88_0. Full logs are in /tmp/ray/session_2025-08-21_18-48-13_464408_2298/logs/ray-data
[36m(SplitCoordinator pid=125821)[0m Execution plan of Dataset train_88_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> OutputSplitter[split(4, equal=True)]
[36m(SplitCoordinator pid=125821)[0m ⚠️  Ray's object store is configured to use only 28.5% of available memory (195.9GB out of 687.2GB total). For optimal Ray Data performance, we recommend setting the object store to at least 50% of available memory. You can do this by setting the 'object_store_memory' parameter when calling ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable.
[36m(RayTrainWorker pid=3319, ip=10.0.34.27)[0m Moving model to device: cuda:0
[36m(RayTrainWorker pid=3319, ip=10.

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

[36m(SplitCoordinator pid=125821)[0m Registered dataset logger for dataset train_88_1[32m [repeated 2x across cluster][0m
[36m(SplitCoordinator pid=125821)[0m Starting execution of Dataset train_88_1. Full logs are in /tmp/ray/session_2025-08-21_18-48-13_464408_2298/logs/ray-data[32m [repeated 2x across cluster][0m
[36m(SplitCoordinator pid=125821)[0m Execution plan of Dataset train_88_1: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> OutputSplitter[split(4, equal=True)][32m [repeated 2x across cluster][0m
[36m(SplitCoordinator pid=125822)[0m ⚠️  Ray's object store is configured to use only 28.5% of available memory (195.9GB out of 687.2GB total). For optimal Ray Data performance, we recommend setting the object store to at least 50% of available memory. You can do this by setting the 'object_store_memory' parameter when calling ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable.


(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

[36m(SplitCoordinator pid=125822)[0m ✔️  Dataset val_89_2 execution finished in 0.14 seconds[32m [repeated 5x across cluster][0m


(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

[36m(SplitCoordinator pid=125822)[0m Registered dataset logger for dataset val_89_6[32m [repeated 11x across cluster][0m
[36m(SplitCoordinator pid=125822)[0m Starting execution of Dataset val_89_6. Full logs are in /tmp/ray/session_2025-08-21_18-48-13_464408_2298/logs/ray-data[32m [repeated 11x across cluster][0m
[36m(SplitCoordinator pid=125822)[0m Execution plan of Dataset val_89_6: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> OutputSplitter[split(4, equal=True)][32m [repeated 11x across cluster][0m


(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

[36m(SplitCoordinator pid=125822)[0m ✔️  Dataset val_89_9 execution finished in 0.12 seconds[32m [repeated 14x across cluster][0m


(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

[36m(SplitCoordinator pid=125821)[0m Registered dataset logger for dataset train_88_15[32m [repeated 17x across cluster][0m
[36m(SplitCoordinator pid=125821)[0m Starting execution of Dataset train_88_15. Full logs are in /tmp/ray/session_2025-08-21_18-48-13_464408_2298/logs/ray-data[32m [repeated 17x across cluster][0m
[36m(SplitCoordinator pid=125821)[0m Execution plan of Dataset train_88_15: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> OutputSplitter[split(4, equal=True)][32m [repeated 17x across cluster][0m


(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

(pid=125821) Running 0: 0.00 row [00:00, ? row/s]

(pid=125821) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125821) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125821) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

[36m(SplitCoordinator pid=125822)[0m ✔️  Dataset val_89_18 execution finished in 0.12 seconds[32m [repeated 18x across cluster][0m


(pid=125822) Running 0: 0.00 row [00:00, ? row/s]

(pid=125822) - ListFiles 1: 0.00 row [00:00, ? row/s]

(pid=125822) - ReadFiles 2: 0.00 row [00:00, ? row/s]

(pid=125822) - split(4, equal=True) 3: 0.00 row [00:00, ? row/s]

[36m(TrainController pid=125066)[0m [State Transition] RUNNING -> FINISHED.
