ray-project · amogkam · Dec 22, 2021 · Nov 17, 2021 · Nov 20, 2021 · Nov 23, 2021
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -715,7 +715,7 @@
     - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=client_unit_tests,-gpu_only --test_env=RAY_CLIENT_MODE=1 python/ray/util/sgd/...
 
 - label: ":octopus: Tune/SGD/Modin/Dask tests and examples. Python 3.7"
-  conditions: ["RAY_CI_TUNE_AFFECTED", "RAY_CI_SGD_AFFECTED"]
+  conditions: ["RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED"]
   commands:
     - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
     - TUNE_TESTING=1 PYTHON=3.7 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh
@@ -725,9 +725,10 @@
     - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-client python/ray/util/xgboost/...
     - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/horovod/...
     - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/ray_lightning/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/ml_utils/...
 
 - label: ":octopus: Ludwig tests and examples. Python 3.7"
-  conditions: ["RAY_CI_TUNE_AFFECTED", "RAY_CI_SGD_AFFECTED"]
+  conditions: ["RAY_CI_TUNE_AFFECTED"]
   commands:
     - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
     - SGD_TESTING=1 PYTHON=3.7 INSTALL_LUDWIG=1 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh

@@ -72,6 +72,13 @@ TBXLoggerCallback
 
 .. autoclass:: ray.train.callbacks.TBXLoggerCallback
 
+.. _train-api-mlflow-logger-callback:
+
+MLflowLoggerCallback
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: ray.train.callbacks.MLflowLoggerCallback
+
 Checkpointing
 -------------
 

@@ -435,11 +435,11 @@ You can plug all of these into Ray Train with the following interface:
 Logging Callbacks
 +++++++++++++++++
 
-The following ``TrainingCallback``\s are available and will write to a file within the
-:ref:`log directory <train-log-dir>` of each training run.
+The following ``TrainingCallback``\s are available and will log the intermediate results of the training run.
 
 1. :ref:`train-api-json-logger-callback`
 2. :ref:`train-api-tbx-logger-callback`
+3. :ref:`train-api-mlflow-logger-callback`
 
 Custom Callbacks
 ++++++++++++++++

@@ -1,5 +1,5 @@
 import abc
-from typing import List, Dict
+from typing import List, Dict, Optional
 
 
 class TrainingCallback(metaclass=abc.ABCMeta):
@@ -16,21 +16,27 @@ def handle_result(self, results: List[Dict], **info):
         """
         pass
 
-    def start_training(self, logdir: str, **info):
+    def start_training(self, logdir: str, config: Dict, **info):
         """Called once on training start.
 
         Args:
             logdir (str): Path to the file directory where logs
                 should be persisted.
+            config (Dict): The config dict passed into ``trainer.run()``.
             **info: kwargs dict for forward compatibility.
         """
         pass
 
-    def finish_training(self, error: bool = False, **info):
+    def finish_training(self,
+                        error: bool = False,
+                        run_dir: Optional[str] = None,
+                        **info):
         """Called once after training is over.
 
         Args:
             error (bool): If True, there was an exception during training.
+            run_dir (Optional[str]): The path to the directory for this
+                training run.
             **info: kwargs dict for forward compatibility.
         """
         pass
@@ -12,6 +12,7 @@
 from ray.train.callbacks import TrainingCallback
 from ray.train.constants import (RESULT_FILE_JSON, TRAINING_ITERATION,
                                  TIME_TOTAL_S, TIMESTAMP, PID)
+from ray.util.ml_utils.mlflow import MLflowLoggerUtil
 
 logger = logging.getLogger(__name__)
 
@@ -174,6 +175,100 @@ def _validate_worker_to_log(self, worker_to_log) -> int:
         return worker_to_log
 
 
+class MLflowLoggerCallback(TrainingSingleWorkerLoggingCallback):
+    """MLflow Logger to automatically log Train results and config to MLflow.
+
+    MLflow (https://mlflow.org) Tracking is an open source library for
+    recording and querying experiments. This Ray Train callback
+    sends information (config parameters, training results & metrics,
+    and artifacts) to MLflow for automatic experiment tracking.
+
+    Args:
+        tracking_uri (Optional[str]): The tracking URI for where to manage
+            experiments and runs. This can either be a local file path or a
+            remote server. This arg gets passed directly to mlflow
+            initialization.
+        registry_uri (Optional[str]): The registry URI that gets passed
+            directly to mlflow initialization.
+        experiment_id (Optional[str]): The experiment id of an already
+            existing experiment. If not passed in, experiment_name is used.
+        experiment_name (Optional[str]): The experiment name to use for this
+            Train run. If an experiment with this name already exists in
+            MLflow, it will be used. If not, a new experiment will be
+            created with this name.
+        tags (Optional[Dict]): An optional dictionary of string keys and
+            values to set as tags on the run.
+        save_artifact (bool): If set to True, automatically save the entire
+            contents of the Train local_dir as an artifact to the
+            corresponding run in MLflow.
+        logdir (Optional[str]): Path to directory where the results file
+            should be. If None, will be set by the Trainer. If no tracking
+            uri or registry uri are passed in, the logdir will be used for
+            both.
+        worker_to_log (int): Worker index to log. By default, will log the
+            worker with index 0.
+    """
+
+    def __init__(self,
+                 tracking_uri: Optional[str] = None,
+                 registry_uri: Optional[str] = None,
+                 experiment_id: Optional[str] = None,
+                 experiment_name: Optional[str] = None,
+                 tags: Optional[Dict] = None,
+                 save_artifact: bool = False,
+                 logdir: Optional[str] = None,
+                 worker_to_log: int = 0):
+        super().__init__(logdir=logdir, worker_to_log=worker_to_log)
+
+        self.tracking_uri = tracking_uri
+        self.registry_uri = registry_uri
+        self.experiment_id = experiment_id
+        self.experiment_name = experiment_name
+        self.tags = tags
+
+        self.save_artifact = save_artifact
+        self.mlflow_util = MLflowLoggerUtil()
+
+    def start_training(self, logdir: str, config: Dict, **info):
+        """Configure MLflow, start a run and log ``config`` as run params."""
+        super().start_training(logdir=logdir, config=config, **info)
+
+        # Any URI that was not configured explicitly falls back to the log
+        # directory of this run.
+        tracking_uri = (self.tracking_uri if self.tracking_uri is not None
+                        else str(self.logdir))
+        registry_uri = (self.registry_uri if self.registry_uri is not None
+                        else str(self.logdir))
+
+        success = self.mlflow_util.setup_mlflow(
+            tracking_uri=tracking_uri,
+            registry_uri=registry_uri,
+            experiment_id=self.experiment_id,
+            experiment_name=self.experiment_name,
+            create_experiment_if_not_exists=True)
+
+        if not success:
+            raise ValueError("No experiment_name or experiment_id passed in, "
+                             "Please set one of these to use the "
+                             "MLflowLoggerCallback.")
+
+        self.mlflow_util.start_run(tags=self.tags, set_active=True)
+        self.mlflow_util.log_params(params_to_log=config)
+
+    def handle_result(self, results: List[Dict], **info):
+        """Log the selected worker's latest result as MLflow metrics."""
+        result = results[self._workers_to_log]
+
+        self.mlflow_util.log_metrics(
+            metrics_to_log=result, step=result[TRAINING_ITERATION])
+
+    def finish_training(self, error: bool = False, **info):
+        """Optionally upload the logdir as artifacts, then end the run."""
+        if self.save_artifact:
+            self.mlflow_util.save_artifacts(dir=str(self.logdir))
+        self.mlflow_util.end_run(status="FAILED" if error else "FINISHED")
+
+
 class TBXLoggerCallback(TrainingSingleWorkerLoggingCallback):
     """Logs Train results in TensorboardX format.
 

@@ -1,30 +1,26 @@
 import argparse
 
-import mlflow
-
 from ray.train import Trainer
 from ray.train.examples.train_fashion_mnist_example import train_func
+from ray.train.callbacks.logging import MLflowLoggerCallback
 
 
 def main(num_workers=2, use_gpu=False):
-    mlflow.set_experiment("train_torch_fashion_mnist")
-
     trainer = Trainer(
         backend="torch", num_workers=num_workers, use_gpu=use_gpu)
     trainer.start()
-    iterator = trainer.run_iterator(
+    final_results = trainer.run(
         train_func=train_func,
         config={
             "lr": 1e-3,
             "batch_size": 64,
             "epochs": 4
-        })
-
-    for intermediate_result in iterator:
-        first_worker_result = intermediate_result[0]
-        mlflow.log_metric("loss", first_worker_result["loss"])
+        },
+        callbacks=[
+            MLflowLoggerCallback(experiment_name="train_fashion_mnist")
+        ])
 
-    print("Full losses for rank 0 worker: ", iterator.get_final_results())
+    print("Full losses for rank 0 worker: ", final_results)
 
 
 if __name__ == "__main__":

@@ -15,6 +15,7 @@
 from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback
 from ray.train.backend import BackendConfig, Backend
 from ray.train.worker_group import WorkerGroup
+from ray.train.callbacks.logging import MLflowLoggerCallback
 
 try:
     from tensorflow.python.summary.summary_iterator \
@@ -136,8 +137,6 @@ def _validate_tbx_result(events_dir):
     assert len(results["hello/world"]) == 1
 
 
-@pytest.mark.skipif(
-    summary_iterator is None, reason="tensorboard is not installed")
 def test_TBX(ray_start_4_cpus, make_temp_dir):
     config = TestConfig()
 
@@ -159,6 +158,54 @@ def train_func():
     _validate_tbx_result(temp_dir)
 
 
+def test_mlflow(ray_start_4_cpus, make_temp_dir):
+    config = TestConfig()
+
+    params = {"p1": "p1"}
+
+    temp_dir = make_temp_dir
+    num_workers = 4
+
+    def train_func(config):
+        train.report(episode_reward_mean=4)
+        train.report(episode_reward_mean=5)
+        train.report(episode_reward_mean=6)
+        return 1
+
+    callback = MLflowLoggerCallback(
+        experiment_name="test_exp", logdir=temp_dir)
+    trainer = Trainer(config, num_workers=num_workers)
+    trainer.start()
+    trainer.run(train_func, config=params, callbacks=[callback])
+
+    from mlflow.tracking import MlflowClient
+
+    client = MlflowClient(
+        tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri())
+
+    all_runs = callback.mlflow_util._mlflow.search_runs(experiment_ids=["0"])
+    assert len(all_runs) == 1
+    # all_runs is a pandas dataframe.
+    all_runs = all_runs.to_dict(orient="records")
+    run_id = all_runs[0]["run_id"]
+    run = client.get_run(run_id)
+
+    assert run.data.params == params
+    assert "episode_reward_mean" in run.data.metrics and \
+           run.data.metrics["episode_reward_mean"] == 6.0
+    assert TRAINING_ITERATION in run.data.metrics and \
+           run.data.metrics[TRAINING_ITERATION] == 3.0
+
+    metric_history = client.get_metric_history(
+        run_id=run_id, key="episode_reward_mean")
+
+    assert len(metric_history) == 3
+    iterations = [metric.step for metric in metric_history]
+    assert iterations == [1, 2, 3]
+    rewards = [metric.value for metric in metric_history]
+    assert rewards == [4, 5, 6]
+
+
 if __name__ == "__main__":
     import pytest
     import sys

@@ -281,7 +281,9 @@ def run(self,
         finished_with_errors = False
 
         for callback in callbacks:
-            callback.start_training(logdir=self.latest_run_dir)
+            callback.start_training(
+                logdir=str(self.latest_run_dir),
+                config=config if config else {})
 
         train_func = self._get_train_func(train_func, config)
 
@@ -304,7 +306,9 @@ def run(self,
             return iterator.get_final_results()
         finally:
             for callback in callbacks:
-                callback.finish_training(error=finished_with_errors)
+                callback.finish_training(
+                    error=finished_with_errors,
+                    run_dir=str(self.latest_run_dir))
 
     def run_iterator(
             self,