# Ray Tune Example

#### Examples from: 

* https://docs.ray.io/en/latest/tune/examples/tune-sklearn.html
* https://docs.ray.io/en/latest/tune/index.html


In [3]:
import os
from collections import Counter
import platform
import time
import numpy as np

import ray
from ray import tune

from tune_sklearn import TuneGridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV


If we have a ray cluster to connect to, do it. otherwise rely on ray to manage distribution of work locally. 

In [4]:
ray.init('ray://{ray_head}:10001'.format(ray_head=os.environ['RAY_CLUSTER']))

ClientContext(dashboard_url='10.131.2.243:8265', python_version='3.8.12', ray_version='1.12.1', ray_commit='4863e33856b54ccf8add5cbe75e41558850a1b75', protocol_version='2022-03-16', _num_clients=4, _context_to_restore=<ray.util.client._ClientContext object at 0x7fb0368e1280>)

Just to test this out we will define a simple function for which we want to find the set of parameters that produce lowest score. Here we will use the function `a**2 + b` to define our score. Later on we will use a more complex function, that requires multiple iterations. But for now let's just confirm this works as expected. 

In [5]:
# 1. Define an objective function.
def objective(config, checkpoint_dir=None):
    score = config["a"] ** 2 + config["b"]
    return {"score": score}

We are also going to need to define our search space for our grid search. Again, let's do something super simple so we can evaluate the results by our own inspection and confirm everything is working right. 

In [6]:
# 2. Define a search space.
search_space = {
    "a": tune.grid_search([0.001, 0.01, 0.1]),
    "b": tune.grid_search([1, 2, 3]),
}

Now we invoke our `tune.run` command and see what happens. :) 

We also want to get the results for the set of parameters that give us the smallest score for our objective function. 

In [7]:
# 3. Start a Tune run and print the best result.
analysis = tune.run(objective, config=search_space,resources_per_trial={'gpu': 1})
print(analysis.get_best_config(metric="score", mode="min"))

[2m[36m(run pid=2682)[0m 2022-07-22 17:18:44,994	INFO trial_runner.py:803 -- starting objective_562f1_00000
[2m[36m(run pid=2682)[0m 2022-07-22 17:18:45,023	ERROR syncer.py:119 -- Log sync requires rsync to be installed.


[2m[36m(run pid=2682)[0m == Status ==
[2m[36m(run pid=2682)[0m Current time: 2022-07-22 17:18:46 (running for 00:00:03.15)
[2m[36m(run pid=2682)[0m Memory usage on this node: 8.1/30.9 GiB
[2m[36m(run pid=2682)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=2682)[0m Resources requested: 1.0/1 CPUs, 1.0/1 GPUs, 0.0/2.8 GiB heap, 0.0/1.13 GiB objects
[2m[36m(run pid=2682)[0m Result logdir: /home/ray_results/objective_2022-07-22_17-18-43
[2m[36m(run pid=2682)[0m Number of trials: 9/9 (8 PENDING, 1 RUNNING)
[2m[36m(run pid=2682)[0m +-----------------------+----------+-------------------+-------+-----+
[2m[36m(run pid=2682)[0m | Trial name            | status   | loc               |     a |   b |
[2m[36m(run pid=2682)[0m |-----------------------+----------+-------------------+-------+-----|
[2m[36m(run pid=2682)[0m | objective_562f1_00000 | RUNNING  | 10.131.2.243:2788 | 0.001 |   1 |
[2m[36m(run pid=2682)[0m | objective_562f1_00001 | PENDING  |      

[2m[36m(run pid=2682)[0m 2022-07-22 17:18:48,417	INFO trial_runner.py:803 -- starting objective_562f1_00001


[2m[36m(run pid=2682)[0m Result for objective_562f1_00001:
[2m[36m(run pid=2682)[0m   date: 2022-07-22_17-18-50
[2m[36m(run pid=2682)[0m   done: false
[2m[36m(run pid=2682)[0m   experiment_id: d0e73a616fff43dbaea9389c68d02060
[2m[36m(run pid=2682)[0m   hostname: ray-cluster-mcliffor-40redhat-2ecom-ray-cluster-mcliffor-499pj7
[2m[36m(run pid=2682)[0m   iterations_since_restore: 1
[2m[36m(run pid=2682)[0m   node_ip: 10.131.2.243
[2m[36m(run pid=2682)[0m   pid: 2891
[2m[36m(run pid=2682)[0m   score: 1.0001
[2m[36m(run pid=2682)[0m   time_since_restore: 0.00014495849609375
[2m[36m(run pid=2682)[0m   time_this_iter_s: 0.00014495849609375
[2m[36m(run pid=2682)[0m   time_total_s: 0.00014495849609375
[2m[36m(run pid=2682)[0m   timestamp: 1658510330
[2m[36m(run pid=2682)[0m   timesteps_since_restore: 0
[2m[36m(run pid=2682)[0m   training_iteration: 1
[2m[36m(run pid=2682)[0m   trial_id: 562f1_00001
[2m[36m(run pid=2682)[0m   warmup_time: 0.0025

[2m[36m(run pid=2682)[0m 2022-07-22 17:18:52,127	INFO trial_runner.py:803 -- starting objective_562f1_00002


[2m[36m(run pid=2682)[0m Result for objective_562f1_00002:
[2m[36m(run pid=2682)[0m   date: 2022-07-22_17-18-53
[2m[36m(run pid=2682)[0m   done: false
[2m[36m(run pid=2682)[0m   experiment_id: 565819ccd37b4be7b7d42b20eac1355d
[2m[36m(run pid=2682)[0m   hostname: ray-cluster-mcliffor-40redhat-2ecom-ray-cluster-mcliffor-499pj7
[2m[36m(run pid=2682)[0m   iterations_since_restore: 1
[2m[36m(run pid=2682)[0m   node_ip: 10.131.2.243
[2m[36m(run pid=2682)[0m   pid: 2996
[2m[36m(run pid=2682)[0m   score: 1.01
[2m[36m(run pid=2682)[0m   time_since_restore: 0.00016832351684570312
[2m[36m(run pid=2682)[0m   time_this_iter_s: 0.00016832351684570312
[2m[36m(run pid=2682)[0m   time_total_s: 0.00016832351684570312
[2m[36m(run pid=2682)[0m   timestamp: 1658510333
[2m[36m(run pid=2682)[0m   timesteps_since_restore: 0
[2m[36m(run pid=2682)[0m   training_iteration: 1
[2m[36m(run pid=2682)[0m   trial_id: 562f1_00002
[2m[36m(run pid=2682)[0m   warmup_time:

[2m[36m(run pid=2682)[0m 2022-07-22 17:18:55,407	INFO trial_runner.py:803 -- starting objective_562f1_00003


[2m[36m(run pid=2682)[0m == Status ==
[2m[36m(run pid=2682)[0m Current time: 2022-07-22 17:18:57 (running for 00:00:13.53)
[2m[36m(run pid=2682)[0m Memory usage on this node: 8.2/30.9 GiB
[2m[36m(run pid=2682)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=2682)[0m Resources requested: 1.0/1 CPUs, 1.0/1 GPUs, 0.0/2.8 GiB heap, 0.0/1.13 GiB objects
[2m[36m(run pid=2682)[0m Result logdir: /home/ray_results/objective_2022-07-22_17-18-43
[2m[36m(run pid=2682)[0m Number of trials: 9/9 (5 PENDING, 1 RUNNING, 3 TERMINATED)
[2m[36m(run pid=2682)[0m +-----------------------+------------+-------------------+-------+-----+--------+------------------+---------+
[2m[36m(run pid=2682)[0m | Trial name            | status     | loc               |     a |   b |   iter |   total time (s) |   score |
[2m[36m(run pid=2682)[0m |-----------------------+------------+-------------------+-------+-----+--------+------------------+---------|
[2m[36m(run pid=2682)[0m | objec

[2m[36m(run pid=2682)[0m 2022-07-22 17:18:59,166	INFO trial_runner.py:803 -- starting objective_562f1_00004


[2m[36m(run pid=2682)[0m Result for objective_562f1_00004:
[2m[36m(run pid=2682)[0m   date: 2022-07-22_17-19-00
[2m[36m(run pid=2682)[0m   done: false
[2m[36m(run pid=2682)[0m   experiment_id: fb63af60920d4018a57bbe4f2e94c699
[2m[36m(run pid=2682)[0m   hostname: ray-cluster-mcliffor-40redhat-2ecom-ray-cluster-mcliffor-499pj7
[2m[36m(run pid=2682)[0m   iterations_since_restore: 1
[2m[36m(run pid=2682)[0m   node_ip: 10.131.2.243
[2m[36m(run pid=2682)[0m   pid: 3204
[2m[36m(run pid=2682)[0m   score: 2.0001
[2m[36m(run pid=2682)[0m   time_since_restore: 0.00015997886657714844
[2m[36m(run pid=2682)[0m   time_this_iter_s: 0.00015997886657714844
[2m[36m(run pid=2682)[0m   time_total_s: 0.00015997886657714844
[2m[36m(run pid=2682)[0m   timestamp: 1658510340
[2m[36m(run pid=2682)[0m   timesteps_since_restore: 0
[2m[36m(run pid=2682)[0m   training_iteration: 1
[2m[36m(run pid=2682)[0m   trial_id: 562f1_00004
[2m[36m(run pid=2682)[0m   warmup_tim

[2m[36m(run pid=2682)[0m 2022-07-22 17:19:03,391	INFO trial_runner.py:803 -- starting objective_562f1_00005
[2m[36m(run pid=2682)[0m 2022-07-22 17:19:03,530	INFO trial_runner.py:803 -- starting objective_562f1_00006
[2m[36m(run pid=2682)[0m 2022-07-22 17:19:04,185	INFO trial_runner.py:803 -- starting objective_562f1_00007
[2m[36m(run pid=2682)[0m 2022-07-22 17:19:04,219	INFO trial_runner.py:803 -- starting objective_562f1_00008


[2m[36m(run pid=2682)[0m Result for objective_562f1_00005:
[2m[36m(run pid=2682)[0m   date: 2022-07-22_17-19-04
[2m[36m(run pid=2682)[0m   done: false
[2m[36m(run pid=2682)[0m   experiment_id: 07942328e00341f09d01a85c5ad0e866
[2m[36m(run pid=2682)[0m   hostname: ray-cluster-mcliffor-40redhat-2ecom-ray-cluster-mcliffor-4prdz4
[2m[36m(run pid=2682)[0m   iterations_since_restore: 1
[2m[36m(run pid=2682)[0m   node_ip: 10.128.4.73
[2m[36m(run pid=2682)[0m   pid: 202
[2m[36m(run pid=2682)[0m   score: 2.01
[2m[36m(run pid=2682)[0m   time_since_restore: 0.0001308917999267578
[2m[36m(run pid=2682)[0m   time_this_iter_s: 0.0001308917999267578
[2m[36m(run pid=2682)[0m   time_total_s: 0.0001308917999267578
[2m[36m(run pid=2682)[0m   timestamp: 1658510344
[2m[36m(run pid=2682)[0m   timesteps_since_restore: 0
[2m[36m(run pid=2682)[0m   training_iteration: 1
[2m[36m(run pid=2682)[0m   trial_id: 562f1_00005
[2m[36m(run pid=2682)[0m   warmup_time: 0.00

[2m[36m(run pid=2682)[0m 2022-07-22 17:19:06,327	INFO tune.py:701 -- Total run time: 23.28 seconds (22.60 seconds for the tuning loop).


Cool, It works! At least, the ray tune tool doesn't break. Now let's 1) apply it to a more complex function and 2) compare it to the performance of vanilla sklearn. First thing we need to do is create a dataset and partition it into training and testing sets for evaluation.   




In [8]:
# Create dataset
X, y = make_classification(
    n_samples=11000,
    n_features=1000,
    n_informative=50,
    n_redundant=0,
    n_classes=10,
    class_sep=2.5,
)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1000)

This function has different parameters the the one above, so let's define our new search space for the SGD (Stochastic Gradient Descent) algorithm. 

In [9]:
# 2. Define a search space.
search_space = {
    "alpha": tune.grid_search([1e-4, 1e-1, 1]),
    "epsilon": tune.grid_search([0.01, 0.1]),
}

As a baseline to evaluate our increased performance. Let's run SGD once, with one set of parameters and see how long it takes. 

In [10]:
clf = SGDClassifier(alpha=1,epsilon=0.1)
clf.fit(x_train,y_train)
clf.score(x_test,y_test)

0.889

Looks like it takes about 1 second to initialize the SGD, train it and evaluate it. Give our search space contains 6 different combinations, we should expect a serial execution (slowest) to be about 6 seconds, and a fully parralelized implementation to take about 1 second (fastest). Lets see.    

In [11]:
def objective(config, checkpoint_dir=None):
    clf = SGDClassifier(alpha = config["alpha"], epsilon = config["epsilon"])
    clf.fit(x_train,y_train)
    score = clf.score(x_test,y_test)
    return {"score": score}

In [12]:
analysis = tune.run(objective, config=search_space,resources_per_trial={'gpu': 1})
print(analysis.get_best_config(metric="score", mode="max"))


[2m[36m(run pid=2682)[0m 2022-07-22 17:19:59,561	INFO trial_runner.py:803 -- starting objective_7e5c3_00000


[2m[36m(run pid=2682)[0m == Status ==
[2m[36m(run pid=2682)[0m Current time: 2022-07-22 17:20:05 (running for 00:00:14.54)
[2m[36m(run pid=2682)[0m Memory usage on this node: 8.7/30.9 GiB
[2m[36m(run pid=2682)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=2682)[0m Resources requested: 1.0/4 CPUs, 1.0/4 GPUs, 0.0/11.2 GiB heap, 0.0/4.66 GiB objects
[2m[36m(run pid=2682)[0m Result logdir: /home/ray_results/objective_2022-07-22_17-19-50
[2m[36m(run pid=2682)[0m Number of trials: 6/6 (5 PENDING, 1 RUNNING)
[2m[36m(run pid=2682)[0m +-----------------------+----------+------------------+---------+-----------+
[2m[36m(run pid=2682)[0m | Trial name            | status   | loc              |   alpha |   epsilon |
[2m[36m(run pid=2682)[0m |-----------------------+----------+------------------+---------+-----------|
[2m[36m(run pid=2682)[0m | objective_7e5c3_00000 | RUNNING  | 10.131.0.141:263 |  0.0001 |      0.01 |
[2m[36m(run pid=2682)[0m | objective_7

[2m[36m(run pid=2682)[0m 2022-07-22 17:20:05,108	INFO trial_runner.py:803 -- starting objective_7e5c3_00001
[2m[36m(run pid=2682)[0m 2022-07-22 17:20:06,011	INFO trial_runner.py:803 -- starting objective_7e5c3_00002


[2m[36m(run pid=2682)[0m == Status ==
[2m[36m(run pid=2682)[0m Current time: 2022-07-22 17:20:10 (running for 00:00:20.09)
[2m[36m(run pid=2682)[0m Memory usage on this node: 9.0/30.9 GiB
[2m[36m(run pid=2682)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=2682)[0m Resources requested: 3.0/4 CPUs, 3.0/4 GPUs, 0.0/11.2 GiB heap, 0.0/4.66 GiB objects
[2m[36m(run pid=2682)[0m Result logdir: /home/ray_results/objective_2022-07-22_17-19-50
[2m[36m(run pid=2682)[0m Number of trials: 6/6 (3 PENDING, 3 RUNNING)
[2m[36m(run pid=2682)[0m +-----------------------+----------+-------------------+---------+-----------+
[2m[36m(run pid=2682)[0m | Trial name            | status   | loc               |   alpha |   epsilon |
[2m[36m(run pid=2682)[0m |-----------------------+----------+-------------------+---------+-----------|
[2m[36m(run pid=2682)[0m | objective_7e5c3_00000 | RUNNING  | 10.131.0.141:263  |  0.0001 |      0.01 |
[2m[36m(run pid=2682)[0m | objecti

[2m[36m(run pid=2682)[0m 2022-07-22 17:20:10,653	INFO trial_runner.py:803 -- starting objective_7e5c3_00003


[2m[36m(run pid=2682)[0m Result for objective_7e5c3_00002:
[2m[36m(run pid=2682)[0m   date: 2022-07-22_17-20-12
[2m[36m(run pid=2682)[0m   done: false
[2m[36m(run pid=2682)[0m   experiment_id: 6f4e9cef0b3d45988456470c4be6d0b1
[2m[36m(run pid=2682)[0m   hostname: ray-cluster-mcliffor-40redhat-2ecom-ray-cluster-mcliffor-4prdz4
[2m[36m(run pid=2682)[0m   iterations_since_restore: 1
[2m[36m(run pid=2682)[0m   node_ip: 10.128.4.73
[2m[36m(run pid=2682)[0m   pid: 263
[2m[36m(run pid=2682)[0m   score: 0.9
[2m[36m(run pid=2682)[0m   time_since_restore: 1.6609642505645752
[2m[36m(run pid=2682)[0m   time_this_iter_s: 1.6609642505645752
[2m[36m(run pid=2682)[0m   time_total_s: 1.6609642505645752
[2m[36m(run pid=2682)[0m   timestamp: 1658510412
[2m[36m(run pid=2682)[0m   timesteps_since_restore: 0
[2m[36m(run pid=2682)[0m   training_iteration: 1
[2m[36m(run pid=2682)[0m   trial_id: 7e5c3_00002
[2m[36m(run pid=2682)[0m   warmup_time: 0.002967834472



[2m[36m(run pid=2682)[0m Result for objective_7e5c3_00001:
[2m[36m(run pid=2682)[0m   date: 2022-07-22_17-20-13
[2m[36m(run pid=2682)[0m   done: false
[2m[36m(run pid=2682)[0m   experiment_id: 6d6b52e1ada146de8472b8b35ca17322
[2m[36m(run pid=2682)[0m   hostname: ray-cluster-mcliffor-40redhat-2ecom-ray-cluster-mcliffor-499pj7
[2m[36m(run pid=2682)[0m   iterations_since_restore: 1
[2m[36m(run pid=2682)[0m   node_ip: 10.131.2.243
[2m[36m(run pid=2682)[0m   pid: 3660
[2m[36m(run pid=2682)[0m   score: 0.894
[2m[36m(run pid=2682)[0m   time_since_restore: 3.877516269683838
[2m[36m(run pid=2682)[0m   time_this_iter_s: 3.877516269683838
[2m[36m(run pid=2682)[0m   time_total_s: 3.877516269683838
[2m[36m(run pid=2682)[0m   timestamp: 1658510413
[2m[36m(run pid=2682)[0m   timesteps_since_restore: 0
[2m[36m(run pid=2682)[0m   training_iteration: 1
[2m[36m(run pid=2682)[0m   trial_id: 7e5c3_00001
[2m[36m(run pid=2682)[0m   warmup_time: 0.00271534919

[2m[36m(run pid=2682)[0m 2022-07-22 17:20:15,119	INFO trial_runner.py:803 -- starting objective_7e5c3_00004


[2m[36m(run pid=2682)[0m Result for objective_7e5c3_00001:
[2m[36m(run pid=2682)[0m   date: 2022-07-22_17-20-13
[2m[36m(run pid=2682)[0m   done: true
[2m[36m(run pid=2682)[0m   experiment_id: 6d6b52e1ada146de8472b8b35ca17322
[2m[36m(run pid=2682)[0m   experiment_tag: 1_alpha=0.1,epsilon=0.01
[2m[36m(run pid=2682)[0m   hostname: ray-cluster-mcliffor-40redhat-2ecom-ray-cluster-mcliffor-499pj7
[2m[36m(run pid=2682)[0m   iterations_since_restore: 1
[2m[36m(run pid=2682)[0m   node_ip: 10.131.2.243
[2m[36m(run pid=2682)[0m   pid: 3660
[2m[36m(run pid=2682)[0m   score: 0.894
[2m[36m(run pid=2682)[0m   time_since_restore: 3.877516269683838
[2m[36m(run pid=2682)[0m   time_this_iter_s: 3.877516269683838
[2m[36m(run pid=2682)[0m   time_total_s: 3.877516269683838
[2m[36m(run pid=2682)[0m   timestamp: 1658510413
[2m[36m(run pid=2682)[0m   timesteps_since_restore: 0
[2m[36m(run pid=2682)[0m   training_iteration: 1
[2m[36m(run pid=2682)[0m   trial_id

[2m[36m(run pid=2682)[0m 2022-07-22 17:20:18,427	INFO trial_runner.py:803 -- starting objective_7e5c3_00005


[2m[36m(run pid=2682)[0m == Status ==
[2m[36m(run pid=2682)[0m Current time: 2022-07-22 17:20:22 (running for 00:00:31.60)
[2m[36m(run pid=2682)[0m Memory usage on this node: 9.1/30.9 GiB
[2m[36m(run pid=2682)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=2682)[0m Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/11.2 GiB heap, 0.0/4.66 GiB objects
[2m[36m(run pid=2682)[0m Result logdir: /home/ray_results/objective_2022-07-22_17-19-50
[2m[36m(run pid=2682)[0m Number of trials: 6/6 (4 RUNNING, 2 TERMINATED)
[2m[36m(run pid=2682)[0m +-----------------------+------------+-------------------+---------+-----------+--------+------------------+---------+
[2m[36m(run pid=2682)[0m | Trial name            | status     | loc               |   alpha |   epsilon |   iter |   total time (s) |   score |
[2m[36m(run pid=2682)[0m |-----------------------+------------+-------------------+---------+-----------+--------+------------------+---------|
[2m[36m(run pid=268

[2m[36m(run pid=2682)[0m 2022-07-22 17:20:45,116	INFO tune.py:701 -- Total run time: 56.23 seconds (54.45 seconds for the tuning loop).


In [13]:
# Example parameters to tune from SGDClassifier
parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}

# n_jobs=-1 enables use of all cores like Tune does
sklearn_search = GridSearchCV(SGDClassifier(), parameter_grid, n_jobs=-1)

start = time.time()
sklearn_search.fit(x_train, y_train)
end = time.time()
print("Sklearn Fit Time:", end - start)

Sklearn Fit Time: 37.59654688835144


#### Note: This is actually using a different method of using ray tune 

In [14]:
tune_search = TuneGridSearchCV(
    SGDClassifier(), parameter_grid, early_stopping=True, max_iters=10
)

start = time.time()
tune_search.fit(x_train, y_train)
end = time.time()
print("Tune GridSearch Fit Time:", end - start)
# Tune GridSearch Fit Time: 15.436315774917603 (for an 8 core laptop)

[2m[36m(run pid=2682)[0m 2022-07-22 17:24:03,657	INFO trial_runner.py:803 -- starting _Trainable_1465f_00000
[2m[36m(run pid=2682)[0m 2022-07-22 17:24:03,726	INFO trial_runner.py:803 -- starting _Trainable_1465f_00001
[2m[36m(run pid=2682)[0m 2022-07-22 17:24:03,731	INFO trial_runner.py:803 -- starting _Trainable_1465f_00002
[2m[36m(run pid=2682)[0m 2022-07-22 17:24:03,860	INFO trial_runner.py:803 -- starting _Trainable_1465f_00003
[2m[36m(run pid=2682)[0m 2022-07-22 17:24:08,126	INFO trial_runner.py:803 -- starting _Trainable_1465f_00004
[2m[36m(run pid=2682)[0m 2022-07-22 17:24:15,609	INFO trial_runner.py:803 -- starting _Trainable_1465f_00005


Tune GridSearch Fit Time: 19.755069732666016


In [15]:
# n_jobs=-1 enables use of all cores like Tune does
sklearn_search = GridSearchCV(SGDClassifier(), parameter_grid, n_jobs=-1)

start = time.time()
sklearn_search.fit(x_train, y_train)
end = time.time()
print("Sklearn Fit Time:", end - start)


Sklearn Fit Time: 39.39723062515259
