# flake8: noqa

# __reproducible_start__
import numpy as np
from ray import train, tune
from ray.train import ScalingConfig


def train_func(config):
    # Set seed for trainable random result.
    # If you remove this line, you will get different results
    # each time you run the trial, even if the configuration
    # is the same.
    np.random.seed(config["seed"])
    random_result = np.random.uniform(0, 100, size=1).item()
    train.report({"result": random_result})


# Set seed for Ray Tune's random search.
# If you remove this line, you will get different configurations
# each time you run the script.
np.random.seed(1234)

tuner = tune.Tuner(
    train_func,
    tune_config=tune.TuneConfig(
        num_samples=10,
        search_alg=tune.search.BasicVariantGenerator(),
    ),
    param_space={"seed": tune.randint(0, 1000)},
)
tuner.fit()
# __reproducible_end__
# __basic_config_start__
config = {"a": {"x": tune.uniform(0, 10)}, "b": tune.choice([1, 2, 3])}
# __basic_config_end__
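
# For illustration (not part of the original snippets): inside a trainable,
# Tune passes the sampled values as a plain nested dict, so the search space
# above resolves to concrete numbers that can be indexed directly.
def _show_sampled_config(config):
    # e.g. config == {"a": {"x": 3.14}, "b": 2} for one sampled trial
    x = config["a"]["x"]  # a float drawn from uniform(0, 10)
    b = config["b"]  # one of 1, 2, 3
    train.report({"result": x * b})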
# __conditional_spaces_start__
config = {
    "a": tune.randint(5, 10),
    "b": tune.sample_from(lambda spec: np.random.randint(0, spec.config.a)),
}
# __conditional_spaces_end__
# __iter_start__
def _iter():
    for a in range(5, 10):
        for b in range(a):
            yield a, b


config = {
    "ab": tune.grid_search(list(_iter())),
}
# __iter_end__
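
# A minimal sketch (an addition, not from the original file) of how a trainable
# would consume the dependent parameters sampled above: each trial receives one
# (a, b) tuple from the grid, which can be unpacked inside the function.
def _show_iter_config(config):
    a, b = config["ab"]  # b is always in range(a) by construction
    train.report({"result": a + b})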
def train_func(config):
    random_result = np.random.uniform(0, 100, size=1).item()
    train.report({"result": random_result})
train_fn = train_func
MOCK = True

# Note: this check is here to make sure that at least the syntax of the
# snippets below is correct. Some of them simply can't be run as-is.
if not MOCK:
    # __resources_start__
    tuner = tune.Tuner(
        tune.with_resources(
            train_fn, resources={"cpu": 2, "gpu": 0.5, "custom_resources": {"hdd": 80}}
        ),
    )
    tuner.fit()
    # __resources_end__

    # __resources_pgf_start__
    tuner = tune.Tuner(
        tune.with_resources(
            train_fn,
            resources=tune.PlacementGroupFactory(
                [
                    {"CPU": 2, "GPU": 0.5, "hdd": 80},
                    {"CPU": 1},
                    {"CPU": 1},
                ],
                strategy="PACK",
            ),
        )
    )
    tuner.fit()
    # __resources_pgf_end__

    # __resources_scalingconfig_start__
    tuner = tune.Tuner(
        tune.with_resources(
            train_fn,
            resources=ScalingConfig(
                trainer_resources={"CPU": 2, "GPU": 0.5, "hdd": 80},
                num_workers=2,
                resources_per_worker={"CPU": 1},
            ),
        )
    )
    tuner.fit()
    # __resources_scalingconfig_end__

    # __resources_lambda_start__
    tuner = tune.Tuner(
        tune.with_resources(
            train_fn,
            resources=lambda config: {"GPU": 1} if config["use_gpu"] else {"GPU": 0},
        ),
        param_space={
            "use_gpu": True,
        },
    )
    tuner.fit()
    # __resources_lambda_end__
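
if not MOCK:
    # A hedged sketch (an assumption, not from the original snippets): the
    # resources callable can also return a PlacementGroupFactory computed from
    # the trial config, which helps when the bundle layout itself depends on a
    # hyperparameter.
    tuner = tune.Tuner(
        tune.with_resources(
            train_fn,
            resources=lambda config: tune.PlacementGroupFactory(
                [{"CPU": 1}] * config["num_workers"]
            ),
        ),
        param_space={"num_workers": 2},
    )
    tuner.fit()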
metric = None
# __modin_start__
def train_fn(config):
    # some Modin operations here
    # import modin.pandas as pd
    train.report({"metric": metric})


tuner = tune.Tuner(
    tune.with_resources(
        train_fn,
        resources=tune.PlacementGroupFactory(
            [
                {"CPU": 1},  # this bundle will be used by the trainable itself
                {"CPU": 1},  # this bundle will be used by Modin
            ],
            strategy="PACK",
        ),
    )
)
tuner.fit()
# __modin_end__
# __huge_data_start__
from ray import tune
import numpy as np


def train_func(config, num_epochs=5, data=None):
    for i in range(num_epochs):
        for sample in data:
            # ... train on sample
            pass


# Some huge dataset
data = np.random.random(size=100000000)

tuner = tune.Tuner(tune.with_parameters(train_func, num_epochs=5, data=data))
tuner.fit()
# __huge_data_end__
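
if not MOCK:
    # Background sketch (an addition, not from the original snippets):
    # tune.with_parameters behaves roughly like putting the large object into
    # the Ray object store once and fetching it inside each trial, instead of
    # serializing it together with the training function for every trial.
    import ray

    data_ref = ray.put(data)  # stored once, shared by all trials

    def train_func_by_ref(config):
        loaded = ray.get(data_ref)  # fetched from the object store per trial
        train.report({"num_samples": len(loaded)})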
# __seeded_1_start__
import random
random.seed(1234)
output = [random.randint(0, 100) for _ in range(10)]
# The output will always be the same.
assert output == [99, 56, 14, 0, 11, 74, 4, 85, 88, 10]
# __seeded_1_end__
# __seeded_2_start__
# This should suffice to initialize the RNGs for most Python-based libraries
import random
import numpy as np
random.seed(1234)
np.random.seed(5678)
# __seeded_2_end__
# __torch_tf_seeds_start__
import torch
torch.manual_seed(0)
import tensorflow as tf
tf.random.set_seed(0)
# __torch_tf_seeds_end__
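
# A hedged extension (not part of the original snippets): fully deterministic
# PyTorch runs on GPU typically also need the cuDNN flags below, at some
# performance cost.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# torch.use_deterministic_algorithms(True)  # strict mode: errors on nondeterministic ops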
# __torch_seed_example_start__
import random
import numpy as np
from ray import tune


def trainable(config):
    # config["seed"] is set deterministically, but differs between training runs
    random.seed(config["seed"])
    np.random.seed(config["seed"])
    # torch.manual_seed(config["seed"])
    # ... training code


config = {
    "seed": tune.randint(0, 10000),
    # ...
}

if __name__ == "__main__":
    # Set seed for the search algorithms/schedulers
    random.seed(1234)
    np.random.seed(1234)
    # Don't forget to check if the search alg has a `seed` parameter
    tuner = tune.Tuner(trainable, param_space=config)
    tuner.fit()
# __torch_seed_example_end__
# __large_data_start__
from ray import train, tune
import numpy as np


def f(config, data=None):
    pass
    # use data


data = np.random.random(size=100000000)

tuner = tune.Tuner(tune.with_parameters(f, data=data))
tuner.fit()
# __large_data_end__
MyTrainableClass = None

if not MOCK:
    # __log_1_start__
    tuner = tune.Tuner(
        MyTrainableClass,
        run_config=train.RunConfig(storage_path="s3://my-log-dir"),
    )
    tuner.fit()
    # __log_1_end__

if not MOCK:
    # __s3_start__
    from ray import tune

    tuner = tune.Tuner(
        train_fn,
        # ...,
        run_config=train.RunConfig(storage_path="s3://your-s3-bucket/durable-trial/"),
    )
    tuner.fit()
    # __s3_end__

    # __sync_config_start__
    from ray import train, tune

    tuner = tune.Tuner(
        train_fn,
        run_config=train.RunConfig(storage_path="/path/to/shared/storage"),
    )
    tuner.fit()
    # __sync_config_end__

import ray

ray.shutdown()
# __grid_search_start__
parameters = {
    "qux": tune.sample_from(lambda spec: 2 + 2),
    "bar": tune.grid_search([True, False]),
    "foo": tune.grid_search([1, 2, 3]),
    "baz": "asd",  # a constant value
}

tuner = tune.Tuner(train_fn, param_space=parameters)
tuner.fit()
# __grid_search_end__
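
# Added note (not in the original snippet): grid search multiplies out all
# grid values, so the space above yields 2 (bar) * 3 (foo) = 6 trials with the
# default num_samples=1; qux and baz are resolved per trial but add no
# combinations.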
# __grid_search_2_start__
# num_samples=10 repeats the 3x3 grid search 10 times, for a total of 90 trials
tuner = tune.Tuner(
    train_fn,
    run_config=train.RunConfig(
        name="my_trainable",
    ),
    param_space={
        "alpha": tune.uniform(100, 200),
        "beta": tune.sample_from(lambda spec: spec.config.alpha * np.random.normal()),
        "nn_layers": [
            tune.grid_search([16, 64, 256]),
            tune.grid_search([16, 64, 256]),
        ],
    },
    tune_config=tune.TuneConfig(
        num_samples=10,
    ),
)
# __grid_search_2_end__
if not MOCK:
    import os
    from pathlib import Path

    # __no_chdir_start__
    def train_func(config):
        # Read from relative paths
        print(open("./read.txt").read())

        # The working directory shouldn't have changed from the original
        # NOTE: The `TUNE_ORIG_WORKING_DIR` environment variable is deprecated.
        assert os.getcwd() == os.environ["TUNE_ORIG_WORKING_DIR"]

        # Write to the Tune trial directory, not the shared working dir
        tune_trial_dir = Path(train.get_context().get_trial_dir())
        with open(tune_trial_dir / "write.txt", "w") as f:
            f.write("trial saved artifact")

    os.environ["RAY_CHDIR_TO_TRIAL_DIR"] = "0"
    tuner = tune.Tuner(train_func)
    tuner.fit()
    # __no_chdir_end__
# __iter_experimentation_initial_start__
import os
import tempfile
import torch
from ray import train, tune
from ray.train import Checkpoint
import random


def trainable(config):
    for epoch in range(1, config["num_epochs"]):
        # Do some training...

        with tempfile.TemporaryDirectory() as tempdir:
            torch.save(
                {"model_state_dict": {"x": 1}}, os.path.join(tempdir, "model.pt")
            )
            train.report(
                {"score": random.random()},
                checkpoint=Checkpoint.from_directory(tempdir),
            )


tuner = tune.Tuner(
    trainable,
    param_space={"num_epochs": 10, "hyperparam": tune.grid_search([1, 2, 3])},
    tune_config=tune.TuneConfig(metric="score", mode="max"),
)
result_grid = tuner.fit()

best_result = result_grid.get_best_result()
best_checkpoint = best_result.checkpoint
# __iter_experimentation_initial_end__
# __iter_experimentation_resume_start__
import ray


def trainable(config):
    # Add logic to handle the initial checkpoint.
    checkpoint: Checkpoint = config["start_from_checkpoint"]
    with checkpoint.as_directory() as checkpoint_dir:
        model_state_dict = torch.load(os.path.join(checkpoint_dir, "model.pt"))

    # Initialize a model from the checkpoint...
    # model = ...
    # model.load_state_dict(model_state_dict)

    for epoch in range(1, config["num_epochs"]):
        # Do some more training...
        ...

        train.report({"score": random.random()})


new_tuner = tune.Tuner(
    trainable,
    param_space={
        "num_epochs": 10,
        "hyperparam": tune.grid_search([4, 5, 6]),
        "start_from_checkpoint": best_checkpoint,
    },
    tune_config=tune.TuneConfig(metric="score", mode="max"),
)
result_grid = new_tuner.fit()
# __iter_experimentation_resume_end__
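
# A short usage sketch (an addition, not part of the original file): inspecting
# the resumed run's best result via the public Result API.
best_result = result_grid.get_best_result()
print(best_result.config)  # winning hyperparameters, incl. "hyperparam"
print(best_result.metrics["score"])  # last reported score for that trial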