[tune] Sync logs from workers and improve tensorboard reporting #1567
ray/tune/cluster_info.py (new file)

@@ -0,0 +1,23 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import getpass
+import os
+
+
+def get_ssh_user():
+    """Returns ssh username for connecting to cluster workers."""
+
+    return getpass.getuser()
+
+
+# TODO(ekl) this currently only works for clusters launched with
+# ray create_or_update
+def get_ssh_key():
+    """Returns ssh key for connecting to cluster workers."""
+
+    path = os.path.expanduser("~/ray_bootstrap_key.pem")
+    if os.path.exists(path):
+        return path
+    return None
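For orientation, here is a minimal sketch of how a caller might combine these two helpers into an rsync-ready ssh destination. `build_ssh_target` is illustrative only and not part of this PR.

```python
# Illustrative helper, not part of this PR: combine get_ssh_user() and
# get_ssh_key() into an ssh destination usable with rsync.
from ray.tune.cluster_info import get_ssh_key, get_ssh_user


def build_ssh_target(worker_ip, path):
    key = get_ssh_key()
    if key is None:
        # No bootstrap key: cluster was not launched via `ray create_or_update`.
        return None
    ssh_opts = "ssh -i '{}' -o StrictHostKeyChecking=no".format(key)
    return ssh_opts, "{}@{}:{}".format(get_ssh_user(), worker_ip, path)
```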
ray/tune/log_sync.py

@@ -4,9 +4,12 @@

 import distutils.spawn
 import os
+import pipes
 import subprocess
 import time

+import ray
+from ray.tune.cluster_info import get_ssh_key, get_ssh_user
 from ray.tune.error import TuneError
 from ray.tune.result import DEFAULT_RESULTS_DIR

@@ -15,20 +18,21 @@

 _syncers = {}


-def get_syncer(local_dir, remote_dir):
-    if not remote_dir.startswith("s3://"):
-        raise TuneError("Upload uri must start with s3://")
+def get_syncer(local_dir, remote_dir=None):
+    if remote_dir:
+        if not remote_dir.startswith("s3://"):
+            raise TuneError("Upload uri must start with s3://")

-    if not distutils.spawn.find_executable("aws"):
-        raise TuneError("Upload uri requires awscli tool to be installed")
+        if not distutils.spawn.find_executable("aws"):
+            raise TuneError("Upload uri requires awscli tool to be installed")

-    if local_dir.startswith(DEFAULT_RESULTS_DIR + "/"):
-        rel_path = os.path.relpath(local_dir, DEFAULT_RESULTS_DIR)
-        remote_dir = os.path.join(remote_dir, rel_path)
+        if local_dir.startswith(DEFAULT_RESULTS_DIR + "/"):
+            rel_path = os.path.relpath(local_dir, DEFAULT_RESULTS_DIR)
+            remote_dir = os.path.join(remote_dir, rel_path)

     key = (local_dir, remote_dir)
     if key not in _syncers:
-        _syncers[key] = _S3LogSyncer(local_dir, remote_dir)
+        _syncers[key] = _LogSyncer(local_dir, remote_dir)

     return _syncers[key]
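Since `remote_dir` is now optional, one entry point covers both modes. A hedged usage sketch (directories and bucket names are made up):

```python
from ray.tune.log_sync import get_syncer

# Pull worker logs back to the driver only; no S3 upload happens.
local_only = get_syncer("/data/ray_results/exp/trial_0")

# With an upload URI, `aws s3 sync` is also run after the worker rsync.
with_s3 = get_syncer("/data/ray_results/exp/trial_0",
                     remote_dir="s3://my-bucket/exp")

# Syncers are cached by (local_dir, remote_dir), so repeated lookups for
# the same trial return the same object.
assert local_only is get_syncer("/data/ray_results/exp/trial_0")
```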
ray/tune/log_sync.py (continued)

@@ -38,23 +42,64 @@ def wait_for_log_sync():
         syncer.wait()


-class _S3LogSyncer(object):
-    def __init__(self, local_dir, remote_dir):
+class _LogSyncer(object):
+    """Log syncer for tune.
+
+    This syncs files from workers to the local node, and optionally also from
+    the local node to a remote directory (e.g. S3)."""
+
+    def __init__(self, local_dir, remote_dir=None):
         self.local_dir = local_dir
         self.remote_dir = remote_dir
         self.last_sync_time = 0
         self.sync_process = None
-        print("Created S3LogSyncer for {} -> {}".format(local_dir, remote_dir))
+        self.local_ip = ray.services.get_node_ip_address()
+        self.worker_ip = None
+        print("Created LogSyncer for {} -> {}".format(local_dir, remote_dir))
+
+    def set_worker_ip(self, worker_ip):
+        """Set the worker ip to sync logs from."""
+
+        self.worker_ip = worker_ip

     def sync_if_needed(self):
         if time.time() - self.last_sync_time > 300:
             self.sync_now()

     def sync_now(self, force=False):
-        print(
-            "Syncing files from {} -> {}".format(
-                self.local_dir, self.remote_dir))
         self.last_sync_time = time.time()
+        if not self.worker_ip:
+            print(
+                "Worker ip unknown, skipping log sync for {}".format(
+                    self.local_dir))
+            return
+
+        if self.worker_ip == self.local_ip:
+            worker_to_local_sync_cmd = None  # don't need to rsync
+        else:
+            ssh_key = get_ssh_key()
+            ssh_user = get_ssh_user()
+            if ssh_key is None or ssh_user is None:
+                print(
+                    "Error: log sync requires cluster to be setup with "
+                    "`ray create_or_update`.")
+                return

Review comment (on the `ray create_or_update` requirement above):
    why not allow arbitrary clusters to use this?

Reply:
    cluster_info.py will have to support that. I'm not sure how we can infer
    the SSH key unless it's set up by Ray.

+            if not distutils.spawn.find_executable("rsync"):
+                print("Error: log sync requires rsync to be installed.")
+                return
+            worker_to_local_sync_cmd = (
+                ("""rsync -avz -e "ssh -i '{}' -o ConnectTimeout=120s """
+                 """-o StrictHostKeyChecking=no" '{}@{}:{}/' '{}/'""").format(
+                     ssh_key, ssh_user, self.worker_ip,
+                     pipes.quote(self.local_dir), pipes.quote(self.local_dir)))

Review comment (on the rsync command above):
    this also doesn't work with Docker btw

Reply:
    Yeah, it would be nice to have a more generalized way of running remote
    commands. Probably when kubernetes support lands we can expose a common
    run_cmd interface from the autoscaler.

+        if self.remote_dir:
+            local_to_remote_sync_cmd = (
+                "aws s3 sync '{}' '{}'".format(
+                    pipes.quote(self.local_dir), pipes.quote(self.remote_dir)))
+        else:
+            local_to_remote_sync_cmd = None

         if self.sync_process:
             self.sync_process.poll()
             if self.sync_process.returncode is None:
@@ -63,8 +108,17 @@ def sync_now(self, force=False):
             else:
                 print("Warning: last sync is still in progress, skipping")
                 return
-        self.sync_process = subprocess.Popen(
-            ["aws", "s3", "sync", self.local_dir, self.remote_dir])
+
+        if worker_to_local_sync_cmd or local_to_remote_sync_cmd:
+            final_cmd = ""
+            if worker_to_local_sync_cmd:
+                final_cmd += worker_to_local_sync_cmd
+            if local_to_remote_sync_cmd:
+                if final_cmd:
+                    final_cmd += " && "
+                final_cmd += local_to_remote_sync_cmd
+            print("Running log sync: {}".format(final_cmd))
+            self.sync_process = subprocess.Popen(final_cmd, shell=True)

     def wait(self):
         if self.sync_process:
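To make the chaining concrete, here is a small sketch that rebuilds the two command strings for made-up inputs and prints the combined shell command. It mirrors the assembly in the diff above (`pipes.quote` matches the PR-era code; `shlex.quote` is the modern equivalent):

```python
# Reproduce the command assembly above with made-up inputs, to show the
# final string handed to `subprocess.Popen(..., shell=True)`.
import pipes  # what the PR-era code uses; shlex.quote on modern Python

ssh_key = "/home/ubuntu/ray_bootstrap_key.pem"  # made-up example values
ssh_user, worker_ip = "ubuntu", "10.0.1.5"
local_dir, remote_dir = "/data/ray_results/exp/trial_0", "s3://my-bucket/exp"

worker_to_local = (
    """rsync -avz -e "ssh -i '{}' -o ConnectTimeout=120s """
    """-o StrictHostKeyChecking=no" '{}@{}:{}/' '{}/'""").format(
        ssh_key, ssh_user, worker_ip,
        pipes.quote(local_dir), pipes.quote(local_dir))
local_to_remote = "aws s3 sync '{}' '{}'".format(
    pipes.quote(local_dir), pipes.quote(remote_dir))

# rsync runs first so the S3 upload sees the freshly pulled worker logs.
print(" && ".join([worker_to_local, local_to_remote]))
```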
Review comment (on sync_now and the stored worker_ip):
    why preserve state?

    It seems like sync_now(force=True) is called when forced to flush and
    when the trial stops. In both cases, why do you only care about the last
    worker IP? Separately, it might also make sense to split this into
    functions for the local -> remote sync and the worker -> local sync.

Reply:
    The worker IP can change over time if the trial is restarted.
    Unfortunately Ray doesn't have an API to get the worker IP synchronously,
    so we piggyback on the IP reported by the last result. Note that syncing
    is per-trial, so we actually track the IP per trial.
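The reply's per-trial pattern, as a hedged sketch: the runner pushes the IP carried on each incoming result into that trial's syncer before deciding whether to sync. The `trial.logdir`, `trial.upload_dir`, and `result.node_ip` names are assumptions for illustration, not confirmed Tune internals of this era.

```python
# Hedged sketch of per-trial worker IP tracking as described in the reply.
# `trial.logdir`, `trial.upload_dir`, and `result.node_ip` are assumed
# attribute names, used only for illustration.
from ray.tune.log_sync import get_syncer


def on_result(trial, result):
    syncer = get_syncer(trial.logdir, trial.upload_dir)
    # Piggyback on the IP reported by the last result; Ray has no API to
    # query a trial's worker IP synchronously.
    syncer.set_worker_ip(result.node_ip)
    syncer.sync_if_needed()
```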