Skip to content

Commit c5acc00

Browse files
d4l3k authored and facebook-github-bot committed
cli: defer loading schedulers until used (#537)
Summary:
This updates the CLI, schedulers and runners so that the schedulers are only imported when they need to be used. This means only the relevant scheduler is loaded, which vastly improves responsiveness.

* `torchx --help` 900ms -> 300ms
* `torchx status local_cwd://` 1.38s -> 840ms

Breaking changes:

* `schedulers.get_schedulers()` has been removed in favor of `schedulers.get_scheduler_factories()` since it's too dangerous (it loads all schedulers).
* `Runner.run_opts()` has been replaced by `Runner.scheduler_run_opts(scheduler)`, which takes a specific scheduler name instead of returning all scheduler runopts, since that requires loading all schedulers.
* `get_schedulers` is an internal interface since downstream users should be using the runner interface, so changing this shouldn't be an issue.

`Runner.run_opts()` is part of the user Runner interface, but it's only really practical for the CLI, so I doubt there's any OSS usage of it.

Pull Request resolved: #537

Test Plan:
(torchx) tristanr@tristanr-arch2 ~/D/torchx (deferload)> time torchx --help
usage: torchx [-h] [--log_level LOG_LEVEL] [--version] {builtins,cancel,configure,describe,log,run,runopts,status} ...
torchx CLI

optional arguments:
  -h, --help            show this help message and exit
  --log_level LOG_LEVEL
                        Python logging log level
  --version             show program's version number and exit

sub-commands:
  Use the following commands to run operations, e.g.: torchx run ${JOB_NAME}

  {builtins,cancel,configure,describe,log,run,runopts,status}

________________________________________________________
Executed in  300.70 millis    fish           external
   usr time  280.99 millis  916.00 micros  280.07 millis
   sys time   16.46 millis    0.00 micros   16.46 millis

(torchx) tristanr@tristanr-arch2 ~/D/torchx (deferload)> time torchx status local_docker://torchx/sh-lsgcv92jm13ps
torchx 2022-06-24 15:55:58 INFO     AppDef:
  State: SUCCEEDED
  Num Restarts: -1
Roles:
 *sh[0]:SUCCEEDED
  sh[1]:SUCCEEDED
  sh[2]:SUCCEEDED
________________________________________________________
Executed in  507.45 millis    fish           external
   usr time  472.78 millis  926.00 micros  471.86 millis
   sys time   19.77 millis    0.00 micros   19.77 millis

Reviewed By: kurman

Differential Revision: D37432971

Pulled By: d4l3k

fbshipit-source-id: 52fd9bd9d8ec6accadac2128af877d0607b01176
1 parent 75c848a commit c5acc00

23 files changed

+197
-210
lines changed

docs/source/schedulers.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ Scheduler Functions
3232
.. automodule:: torchx.schedulers
3333
.. currentmodule:: torchx.schedulers
3434

35-
.. autofunction:: get_schedulers
3635
.. autofunction:: get_scheduler_factories
3736
.. autofunction:: get_default_scheduler_name
3837

docs/source/schedulers/local.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ Image Providers
2424
Reference
2525
~~~~~~~~~~~~
2626

27-
.. autofunction:: create_cwd_scheduler
27+
.. autofunction:: create_scheduler
2828

2929
.. autoclass:: LogIterator
3030
:members:

torchx/cli/cmd_cancel.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
from torchx.cli.cmd_base import SubCommand
1212
from torchx.runner import get_runner
13-
from torchx.specs.api import parse_app_handle
1413

1514
logger: logging.Logger = logging.getLogger(__name__)
1615

@@ -25,6 +24,5 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
2524

2625
def run(self, args: argparse.Namespace) -> None:
2726
app_handle = args.app_handle
28-
_, session_name, _ = parse_app_handle(app_handle)
29-
runner = get_runner(name=session_name)
27+
runner = get_runner()
3028
runner.cancel(app_handle)

torchx/cli/cmd_configure.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from torchx.cli.cmd_base import SubCommand
1313
from torchx.runner.config import dump
14-
from torchx.schedulers import get_schedulers
14+
from torchx.schedulers import get_scheduler_factories
1515

1616

1717
logger: logging.Logger = logging.getLogger(__name__)
@@ -42,7 +42,7 @@ def run(self, args: argparse.Namespace) -> None:
4242
if args.schedulers:
4343
schedulers = args.schedulers.split(",")
4444
else:
45-
schedulers = get_schedulers(session_name="_").keys()
45+
schedulers = get_scheduler_factories().keys()
4646

4747
required_only = not args.all
4848

torchx/cli/cmd_describe.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,15 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
2828

2929
def run(self, args: argparse.Namespace) -> None:
3030
app_handle = args.app_handle
31-
scheduler, session_name, app_id = parse_app_handle(app_handle)
32-
runner = get_runner(name=session_name)
31+
scheduler, _, app_id = parse_app_handle(app_handle)
32+
runner = get_runner()
3333
app = runner.describe(app_handle)
3434

3535
if app:
3636
pprint.pprint(dataclasses.asdict(app), indent=2, width=80)
3737
else:
3838
logger.error(
39-
f"AppDef: {app_id} on session: {session_name},"
39+
f"AppDef: {app_id},"
4040
f" does not exist or has been removed from {scheduler}'s data plane"
4141
)
4242
sys.exit(1)

torchx/cli/cmd_log.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def get_logs(
9292
role_name = path[2] if len(path) > 2 else None
9393

9494
if not runner:
95-
runner = get_runner(name=session_name)
95+
runner = get_runner()
9696
app_handle = make_app_handle(scheduler_backend, session_name, app_id)
9797

9898
if len(path) == 4:

torchx/cli/cmd_run.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -125,15 +125,14 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
125125
default=get_default_scheduler_name(),
126126
choices=list(scheduler_names),
127127
action=torchxconfig_run,
128-
help=f"Name of the scheduler to use. One of: [{','.join(scheduler_names)}]",
128+
help="Name of the scheduler to use.",
129129
)
130130
subparser.add_argument(
131131
"-cfg",
132132
"--scheduler_args",
133133
type=str,
134134
help="Arguments to pass to the scheduler (Ex:`cluster=foo,user=bar`)."
135-
" For a list of scheduler run options run: `torchx runopts`"
136-
"",
135+
" For a list of scheduler run options run: `torchx runopts`",
137136
)
138137
subparser.add_argument(
139138
"--dryrun",
@@ -175,8 +174,7 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> None:
175174
" (e.g. `local_cwd`)"
176175
)
177176

178-
run_opts = runner.run_opts()
179-
scheduler_opts = run_opts[args.scheduler]
177+
scheduler_opts = runner.scheduler_run_opts(args.scheduler)
180178
cfg = scheduler_opts.cfg_from_str(args.scheduler_args)
181179
config.apply(scheduler=args.scheduler, cfg=cfg, dirs=CONFIG_DIRS)
182180

torchx/cli/cmd_runopts.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,12 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
2626

2727
def run(self, args: argparse.Namespace) -> None:
2828
filter = args.scheduler
29-
run_opts = get_runner().run_opts()
30-
31-
for scheduler, opts in run_opts.items():
32-
if not filter or scheduler == filter:
33-
print(f"{GREEN}{scheduler}{ENDC}:\n{repr(opts)}\n")
29+
with get_runner() as runner:
30+
for scheduler in runner.scheduler_backends():
31+
if filter and scheduler != filter:
32+
continue
33+
try:
34+
opts = runner.scheduler_run_opts(scheduler)
35+
print(f"{GREEN}{scheduler}{ENDC}:\n{repr(opts)}\n")
36+
except ModuleNotFoundError as e:
37+
print(f"{GREEN}{scheduler}{ENDC}: {e}\n")

torchx/cli/cmd_status.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,15 +159,15 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
159159

160160
def run(self, args: argparse.Namespace) -> None:
161161
app_handle = args.app_handle
162-
scheduler, session_name, app_id = parse_app_handle(app_handle)
163-
runner = get_runner(name=session_name)
162+
scheduler, _, app_id = parse_app_handle(app_handle)
163+
runner = get_runner()
164164
app_status = runner.status(app_handle)
165165
filter_roles = parse_list_arg(args.roles)
166166
if app_status:
167167
logger.info(format_app_status(app_status, filter_roles))
168168
else:
169169
logger.error(
170-
f"AppDef: {app_id} on session: {session_name},"
170+
f"AppDef: {app_id},"
171171
f" does not exist or has been removed from {scheduler}'s data plane"
172172
)
173173
sys.exit(1)

torchx/cli/main.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ def get_sub_cmds() -> Dict[str, SubCommand]:
5959
override_sub_cmds = load_group(
6060
"torchx.cli.cmds",
6161
default={},
62-
ignore_missing=True,
6362
)
6463
for cmd_name, cmd_cls in override_sub_cmds.items():
6564
sub_cmds[cmd_name] = cmd_cls()

0 commit comments

Comments
 (0)