From 1ecde79e50c58ae74236d7a4df42ee40f3d6d6b3 Mon Sep 17 00:00:00 2001
From: Xu Zhao
Date: Wed, 4 Oct 2023 07:58:16 -0700
Subject: [PATCH] Cleanup the code in the `dynamo` userbenchmark (#1960)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Skip importing the modules that are only available in the pytorch source code, not pytorch nightly release.

Make dynamo benchmark work on both OSS and internal.

Test Plan:
```
$ python run_benchmark.py dynamo --only alexnet --training --performance --inductor
loading model: 0it [00:05, ?it/s]
cuda train alexnet
running benchmark: 100%|█████████████████| 30/30 [00:00<00:00, 41.46it/s]
1.129x
```

```
$ buck2 run mode/opt //pytorch/benchmark:run_benchmark -- dynamo --only alexnet --training --inductor --performance --output-directory $HOME
loading model: 0it [00:16, ?it/s]
running benchmark: 100%|█████████████████| 30/30 [00:00<00:00, 37.94it/s]
cuda train alexnet
1.120x
```

Differential Revision: D49912006

Pulled By: xuzhao9
---
 userbenchmark/dynamo/_dynamo/testing.py       |  364 --
 userbenchmark/dynamo/_dynamo/utils.py         | 2048 ----------
 userbenchmark/dynamo/common.py                | 3577 -----------------
 .../dynamo/dynamobench/_dynamo/utils.py       |   44 +-
 userbenchmark/dynamo/dynamobench/common.py    |   20 +-
 .../dynamo/dynamobench/requirements.txt       |    2 +
 userbenchmark/dynamo/run.py                   |   16 +-
 userbenchmark/dynamo/torchbench.py            |  479 ---
 .../dynamo/torchbench_models_list.txt         |   28 -
 .../dynamo/torchbench_models_list_cpu.txt     |   48 -
 10 files changed, 48 insertions(+), 6578 deletions(-)
 delete mode 100644 userbenchmark/dynamo/_dynamo/testing.py
 delete mode 100644 userbenchmark/dynamo/_dynamo/utils.py
 delete mode 100644 userbenchmark/dynamo/common.py
 create mode 100644 userbenchmark/dynamo/dynamobench/requirements.txt
 delete mode 100644 userbenchmark/dynamo/torchbench.py
 delete mode 100644 userbenchmark/dynamo/torchbench_models_list.txt
 delete mode 100644 userbenchmark/dynamo/torchbench_models_list_cpu.txt

diff --git a/userbenchmark/dynamo/_dynamo/testing.py b/userbenchmark/dynamo/_dynamo/testing.py
deleted file mode 100644
index f5dc961abd..0000000000
--- a/userbenchmark/dynamo/_dynamo/testing.py
+++ /dev/null
@@ -1,364 +0,0 @@
-import contextlib
-import dis
-import functools
-import logging
-import os.path
-import re
-import sys
-import types
-import unittest
-from typing import Sequence, Union
-from unittest.mock import patch
-
-import torch
-from torch import fx
-from torch._dynamo.output_graph import OutputGraph
-
-from torch._dynamo import config, eval_frame, optimize_assert, reset
-from torch._dynamo.bytecode_transformation import (
-    create_instruction,
-    debug_checks,
-    is_generator,
-    transform_code_object,
-)
-from torch._dynamo.guards import CheckFunctionManager, GuardedCode
-from .utils import same
-
-unsupported = eval_frame.unsupported
-three = 3
-
-log = logging.getLogger(__name__)
-
-
-def clone_me(x):
-    if x is None:
-        return None
-    return x.detach().clone().requires_grad_(x.requires_grad)
-
-
-def skip_if_pytest(fn):
-    @functools.wraps(fn)
-    def wrapped(*args, **kwargs):
-        if "PYTEST_CURRENT_TEST" in os.environ:
-            raise unittest.SkipTest("does not work under pytest")
-        return fn(*args, **kwargs)
-
-    return wrapped
-
-
-def named_parameters_for_optimized_module(mod):
-    assert isinstance(mod, eval_frame.OptimizedModule)
-    return mod._orig_mod.named_parameters
-
-
-def named_buffers_for_optimized_module(mod):
-    assert isinstance(mod, eval_frame.OptimizedModule)
-    return mod._orig_mod.named_buffers
-
-
-def 
remove_optimized_module_prefix(name): - return re.sub(r"^_orig_mod[.]", "", name) - - -def collect_results(model, prediction, loss, example_inputs): - results = [] - results.append(prediction) - results.append(loss) - # if isinstance(loss, torch.Tensor) and loss.item() > 1: - # log.warning( - # f"High loss value alert - {loss:.2f}. Can result in unstable gradients." - # ) - - grads = dict() - params = dict() - for name, param in model.named_parameters(): - if isinstance(model, eval_frame.OptimizedModule): - name = remove_optimized_module_prefix(name) - param_copy = param - grad = param.grad - # Treat None and zero grad as same - if param.grad is None: - grad = torch.zeros_like(param) - grads[name + ".grad"] = grad - params[name] = param_copy - results.append(grads) - results.append(params) - buffers = dict() - for name, buffer in model.named_buffers(): - if isinstance(model, eval_frame.OptimizedModule): - name = remove_optimized_module_prefix(name) - buffers[name] = buffer - results.append(buffers) - for example in example_inputs: - if isinstance(example, (tuple, list)): - for inp in example: - if isinstance(inp, torch.Tensor): - results.append(inp.grad) - else: - if isinstance(example, torch.Tensor): - results.append(example.grad) - return results - - -def requires_bwd_pass(out): - if isinstance(out, torch.Tensor): - return out.requires_grad - elif isinstance(out, (list, tuple)): - return any(requires_bwd_pass(x) for x in out) - elif out is None: - return False - elif isinstance(out, int): - return False - raise NotImplementedError("Don't know how to reduce", type(out)) - - -def reduce_to_scalar_loss(out): - """Reduce the output of a model to get scalar loss""" - if isinstance(out, torch.Tensor): - # Mean does not work on integer tensors - return out.sum() / out.numel() - elif isinstance(out, (list, tuple)): - return sum([reduce_to_scalar_loss(x) for x in out]) / len(out) - elif type(out).__name__ in ( - "MaskedLMOutput", - "Seq2SeqLMOutput", - "CausalLMOutputWithCrossAttentions", - ): - return reduce_to_scalar_loss(out.logits) - elif type(out).__name__ == "SquashedNormal": - return out.mean.sum() - elif isinstance(out, dict): - return sum([reduce_to_scalar_loss(value) for value in out.values()]) / len( - out.keys() - ) - raise NotImplementedError("Don't know how to reduce", type(out)) - - -def debug_dir(): - path = os.path.join(os.path.dirname(__file__), "../debug") - if not os.path.exists(path): - os.mkdir(path) - return path - - -def debug_dump(name, code: types.CodeType, extra=""): - with open(os.path.join(debug_dir(), name), "w") as fd: - fd.write( - f"{dis.Bytecode(code).info()}\n\n{dis.Bytecode(code).dis()}\n\n{extra}\n" - ) - - -def debug_insert_nops(frame, cache_size, hooks, _): - """used to debug jump updates""" - - def insert_nops(instructions, code_options): - instructions.insert(0, create_instruction("NOP")) - instructions.insert(0, create_instruction("NOP")) - - if is_generator(frame.f_code): - return None - - debug_checks(frame.f_code) - code = transform_code_object(frame.f_code, insert_nops) - graph = OutputGraph( - code_options={}, - compiler_fn=None, - root_tx=None, - export=False, - export_constraints=None, - frame_state={"_id": 0}, - # TODO: shouldn't this be f_locals/f_globals from frame? 
- local_scope=locals(), - global_scope=globals(), - f_code=frame.f_code, - ) - - return GuardedCode(code, CheckFunctionManager(graph).check_fn) - - -class CompileCounter: - def __init__(self): - self.frame_count = 0 - self.op_count = 0 - - def __call__(self, gm: torch.fx.GraphModule, example_inputs): - self.frame_count += 1 - for node in gm.graph.nodes: - if "call" in node.op: - self.op_count += 1 - return gm.forward - - def clear(self): - self.frame_count = 0 - self.op_count = 0 - - -class CompileCounterWithBackend: - def __init__(self, backend): - self.frame_count = 0 - self.op_count = 0 - self.backend = backend - self.graphs = [] - - def __call__(self, gm: torch.fx.GraphModule, example_inputs): - from .backends.registry import lookup_backend - - self.frame_count += 1 - for node in gm.graph.nodes: - if "call" in node.op: - self.op_count += 1 - self.graphs.append(gm) - return lookup_backend(self.backend)(gm, example_inputs) - - -# Equivalent to backend="eager", but also records graphs that -# we can assert on -class EagerAndRecordGraphs: - def __init__(self): - self.graphs = [] - - def __call__(self, gm: torch.fx.GraphModule, example_inputs): - self.graphs.append(gm) - return gm - - -def strip_comment(code): - code = str(code) - return re.sub(r"(?m)^ *#.*\n?", "", code) - - -def remove_trailing_space(code): - return "\n".join([line.rstrip() for line in code.split("\n")]) - - -def normalize_gm(gm_str): - # strip comments as comments have path to files which may differ from - # system to system. - return remove_trailing_space(strip_comment(gm_str)) - - -def standard_test(self, fn, nargs, expected_ops=None, expected_ops_dynamic=None): - if not config.assume_static_by_default and expected_ops_dynamic is not None: - expected_ops = expected_ops_dynamic - - actual = CompileCounter() - if expected_ops is None: - expected = CompileCounter() - try: - gm = torch.fx.symbolic_trace(fn) - expected(gm) - print("\nfx.symbolic_trace graph:") - gm.graph.print_tabular() - expected_ops = expected.op_count - except Exception: - pass # Silently ignore FX errors (not our issue) - - args1 = [torch.randn(10, 10) for _ in range(nargs)] - args2 = [torch.randn(10, 10) for _ in range(nargs)] - correct1 = fn(*args1) - correct2 = fn(*args2) - reset() - opt_fn = optimize_assert(actual)(fn) - val1a = opt_fn(*args1) - val2a = opt_fn(*args2) - val1b = opt_fn(*args1) - val2b = opt_fn(*args2) - reset() - self.assertTrue(same(val1a, correct1)) - self.assertTrue(same(val1b, correct1)) - self.assertTrue(same(val2a, correct2)) - self.assertTrue(same(val2b, correct2)) - self.assertEqual(actual.frame_count, 1) - if expected_ops is not None: - self.assertEqual(actual.op_count, expected_ops) - - -def dummy_fx_compile(gm: fx.GraphModule, example_inputs): - return gm.forward - - -def format_speedup(speedup, pvalue, is_correct=True, pvalue_threshold=0.1): - if not is_correct: - return "ERROR" - if pvalue > pvalue_threshold: - return f"{speedup:.3f}x SAME" - return f"{speedup:.3f}x p={pvalue:.2f}" - - -def rand_strided( - size: Sequence[int], - stride: Sequence[int], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - extra_size: int = 0, -): - needed_size = ( - sum((shape - 1) * stride for shape, stride in zip(size, stride)) - + 1 - + extra_size - ) - if dtype.is_floating_point: - buffer = torch.randn(needed_size, dtype=dtype, device=device) - else: - buffer = torch.zeros(size=[needed_size], dtype=dtype, device=device) - return torch.as_strided(buffer, size, stride) - - -def _make_fn_with_patches(fn, 
*patches): - @functools.wraps(fn) - def _fn(*args, **kwargs): - with contextlib.ExitStack() as stack: - for module, attr, val in patches: - stack.enter_context(patch.object(module, attr, val)) - - return fn(*args, **kwargs) - - return _fn - - -def make_test_cls_with_patches(cls, cls_prefix, fn_suffix, *patches, xfail_prop=None): - class DummyTestClass(cls): - pass - - DummyTestClass.__name__ = f"{cls_prefix}{cls.__name__}" - DummyTestClass.__qualname__ = DummyTestClass.__name__ - - for name in dir(cls): - if name.startswith("test_"): - fn = getattr(cls, name) - if not callable(fn): - continue - new_name = f"{name}{fn_suffix}" - new_fn = _make_fn_with_patches(fn, *patches) - new_fn.__name__ = new_name - if xfail_prop is not None and hasattr(fn, xfail_prop): - new_fn = unittest.expectedFailure(new_fn) - setattr(DummyTestClass, new_name, new_fn) - - return DummyTestClass - - -# test Python 3.11+ specific features -def skipIfNotPy311(fn): - if sys.version_info >= (3, 11): - return fn - return unittest.skip(fn) - - -# Controls tests generated in test/inductor/test_torchinductor_dynamic_shapes.py -# and test/dynamo/test_dynamic_shapes.py -def expectedFailureDynamic(fn): - fn._expected_failure_dynamic = True - return fn - - -# Controls tests generated in test/inductor/test_torchinductor_codegen_dynamic_shapes.py -def expectedFailureCodegenDynamic(fn): - fn._expected_failure_codegen_dynamic = True - return fn - - -# Controls test generated in test/inductor/test_cpp_wrapper.py -def expectedFailureDynamicWrapper(fn): - fn._expected_failure_dynamic_wrapper = True - return fn diff --git a/userbenchmark/dynamo/_dynamo/utils.py b/userbenchmark/dynamo/_dynamo/utils.py deleted file mode 100644 index 471dbcc552..0000000000 --- a/userbenchmark/dynamo/_dynamo/utils.py +++ /dev/null @@ -1,2048 +0,0 @@ -import atexit -import collections -import contextlib -import copy -import cProfile -import dataclasses -import datetime -import dis -import enum -import functools -import gc -import inspect -import itertools -import linecache -import logging -import math -import operator -import os -import pstats -import sys -import textwrap -import time -import types -import typing -import weakref -from contextlib import contextmanager -from functools import lru_cache, wraps -from typing import Any, Dict, Optional, Tuple, Union - -import numpy as np - -# import torch._logging -# import torch._numpy as tnp -# from torch._guards import detect_fake_mode # noqa: F401 -from torch._dynamo import config - - -# NOTE: Make sure `NP_SUPPORTED_MODULES` and `NP_TO_TNP_MODULE` are in sync. -NP_SUPPORTED_MODULES = (np, np.fft, np.linalg, np.random) - -# NP_TO_TNP_MODULE = { -# np: tnp, -# np.fft: tnp.fft, -# np.linalg: tnp.linalg, -# np.random: tnp.random, -# } - -import importlib - -import torch -import torch._functorch.config -import torch.fx.experimental.symbolic_shapes -from torch import fx -from torch._dispatch.python import enable_python_dispatcher -from torch._subclasses.fake_tensor import FakeTensor -from torch.nn.modules.lazy import LazyModuleMixin -from torch.utils._pytree import tree_map - - -counters = collections.defaultdict(collections.Counter) -troubleshooting_url = "https://pytorch.org/docs/master/compile/troubleshooting.html" -nnmodule_doc_url = "https://pytorch.org/docs/master/compile/nn-module.html" -nnmodule_doc_url_msg = f"See {nnmodule_doc_url} for more information and limitations." 
-log = logging.getLogger(__name__) - -# profiling compilation time by function -compilation_time_metrics = collections.OrderedDict() - -# profiling compilation time by frame phase -frame_phase_timing = collections.OrderedDict() - -timer_counter = itertools.count() - - -def tabulate(rows, headers): - try: - import tabulate - - return tabulate.tabulate(rows, headers=headers) - except ImportError: - return "\n".join( - ", ".join(map(str, row)) for row in itertools.chain([headers], rows) - ) - - -def dynamo_profiled(func): - @wraps(func) - def profile_wrapper(*args, **kwargs): - global timer_counter - datafn = ( - func.__name__ + f"{next(timer_counter)}.profile" - ) # Name the data file sensibly - prof = cProfile.Profile() - prof.enable() - retval = prof.runcall(func, *args, **kwargs) - prof.disable() - print(f"### Cprofile for {func.__name__} iter {next(timer_counter)} ###") - ps = pstats.Stats(prof) - ps.sort_stats(pstats.SortKey.TIME).print_stats(20) - ps.sort_stats(pstats.SortKey.CUMULATIVE).print_stats(20) - prof.dump_stats(datafn) - return retval - - return profile_wrapper - - -curr_frame = 0 - - -# Note: Called for you by dynamo - you almost never ever want to invoke this yourself. -def increment_frame(): - global curr_frame - curr_frame = curr_frame + 1 - - -# Note: Called for you by dynamo - you almost never ever want to invoke this yourself. -def reset_frame_count(): - global curr_frame - frame_phase_timing.clear() - compilation_time_metrics.clear() - curr_frame = 0 - - -op_count = 0 - - -def increment_op_count(cnt): - global op_count - op_count += cnt - - -# Print a report of time spent so far -# Ex: -# TIMING: -# entire_frame_compile:8.574629999999999 -# backend_compile:5.26806 -def print_time_report(): - total = 0 - total_by_key = {} - for timings in frame_phase_timing.values(): - for key, timing in timings.items(): - total += timing - if key not in total_by_key: - total_by_key[key] = timing - else: - total_by_key[key] += timing - - out = "TIMING:" - for key, value in total_by_key.items(): - out = f"{out} {key}:{round(value, 5)}" - - print(out) - - -# dynamo_timed API works as a function decorator -# By wrapping a function in dynamo_timed, we can store a record in compilation_time_metrics -# where the key is the functions name. -# For example: -# -# @dynamo_timed -# def _foo(...): -# -# Would show up as an entry in our timing dict: -# OrderedDict([('bar.._foo', [0.083690, 0.23949, 3.1425e-05])]) -# This is extremely useful for granular debugging. -# -# For a higher-level mode, pass a phase_name into dynamo_timed -# phase_names record an extra record into a separate compilation timing structure, -# one keyed on frame+name rather than function. -# The frame is incremented outside of this function, in def increment_frame() above. 
-def dynamo_timed(original_function=None, phase_name=None): - def dynamo_timed_inner(func): - @wraps(func) - def time_wrapper(*args, **kwargs): - key = func.__qualname__ - if key not in compilation_time_metrics: - compilation_time_metrics[key] = [] - with torch.profiler.record_function(f"{key} (dynamo_timed)"): - t0 = time.time() - r = func(*args, **kwargs) - time_spent = time.time() - t0 - compilation_time_metrics[key].append(time_spent) - if phase_name: - frame_key = str(curr_frame) - if frame_key not in frame_phase_timing: - frame_phase_timing[frame_key] = {} - assert ( - phase_name not in frame_phase_timing[frame_key] - ), f"Duplicate phase name {phase_name} for frame {frame_key}" - frame_phase_timing[frame_key][phase_name] = time_spent - return r - - return time_wrapper - - if original_function: - return dynamo_timed_inner(original_function) - return dynamo_timed_inner - - -def compile_times(repr="str", aggregate=False): - """ - Get metrics about torchdynamo frontend/backend compilation times. - - Accumulates information from functions tagged with `@dynamo_timed`. - - repr='str' returns a printable string for user interaction, and 'csv' - returns headers, rows which can be logged for output - - aggregate causes values from multiple compilations (e.g. split graphs) - to be accumulated into one value. If false, expect more than one value - per metric. - """ - - def fmt_fn(values, item_fn=lambda x: x): - if aggregate: - return item_fn(sum(values)) - return ", ".join(map(item_fn, values)) - - if repr == "str": - rows = [ - (k, fmt_fn(compilation_time_metrics[k], item_fn=lambda x: f"{x:.4f}")) - for k in compilation_time_metrics - ] - out = "TorchDynamo compilation metrics:\n" - out += tabulate(rows, headers=("Function", "Runtimes (s)")) - return out - elif repr == "csv": - values = [ - fmt_fn(v, item_fn=lambda x: f"{x:.6f}") - for v in compilation_time_metrics.values() - ] - headers = list(compilation_time_metrics.keys()) - return headers, values - - -@atexit.register -def dump_compile_times(): - log.info(compile_times(repr="str", aggregate=True)) - - -tensortype_to_dtype = { - torch.FloatTensor: (torch.float32, torch.float), - torch.DoubleTensor: (torch.float64, torch.double), - torch.HalfTensor: (torch.float16, torch.half), - torch.BFloat16Tensor: (torch.bfloat16,), - torch.ByteTensor: (torch.uint8,), - torch.CharTensor: (torch.int8,), - torch.LongTensor: (torch.int64, torch.long), - torch.IntTensor: (torch.int32, torch.int), - torch.ShortTensor: (torch.int16, torch.short), - torch.BoolTensor: (torch.bool,), -} - - -class DuplicateWarningChecker: - def __init__(self, maxsize=4096): - self.maxsize = maxsize - self.reset() - - def reset(self): - self.set = collections.OrderedDict() - - def add(self, key): - if key in self.set: - self.set.move_to_end(key, last=True) - if not config.verbose: - return False - else: - self.set[key] = None - while len(self.set) > self.maxsize: - self.set.popitem(last=False) - return True - - -graph_break_dup_warning_checker = DuplicateWarningChecker() - - -def setup_compile_debug(): - compile_debug = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1" - - if compile_debug: - torch._logging.set_logs( - dynamo=logging.DEBUG, - aot=logging.DEBUG, - inductor=logging.DEBUG, - output_code=True, # this is off by default - ) - return add_file_handler() - - return contextlib.ExitStack() - - -def reset_graph_break_dup_checker(): - graph_break_dup_warning_checker.reset() - - -def add_file_handler(): - log_path = os.path.join(get_debug_dir(), "torchdynamo") - if not 
os.path.exists(log_path): - os.makedirs(log_path) - - log_file_handler = logging.FileHandler(os.path.join(log_path, "debug.log")) - logger = logging.getLogger("torch._dynamo") - logger.addHandler(log_file_handler) - - exitstack = contextlib.ExitStack() - exitstack.callback(lambda: logger.removeHandler(log_file_handler)) - return exitstack - - -def setup_log_file(): - exitstack = contextlib.ExitStack() - if config.log_file_name is not None: - log_file_handler = logging.FileHandler(config.log_file_name) - for logger in logging.get_loggers(): - logger.addHandler(log_file_handler) - exitstack.callback(lambda: logger.removeHandler(log_file_handler)) - return exitstack - - return exitstack - - -def gen_record_file_name(exc, code): - return f"{get_debug_dir()}/error_recordings/\ -{code.co_name}_{type(exc).__name__}_{code.co_firstlineno}.rec" - - -def write_record_to_file(filename, exec_record): - try: - if os.path.exists(filename): - log.warning( - "Unable to write execution record %s; file already exists.", filename - ) - else: - os.makedirs(os.path.dirname(filename), exist_ok=True) - with open(filename, "wb") as f: - exec_record.dump(f) - except Exception: - log.error("Unable to write execution record %s", filename, exc_info=1) - - -def count_calls(g: fx.Graph): - c = 0 - for n in g.nodes: - if "call" in n.op: - c += 1 - return c - - -def identity(x): - return x - - -def nothing(*args, **kwargs): - pass - - -class ExactWeakKeyDictionary: - """Similar to weakref.WeakKeyDictionary, but use `is`/`id` rather than `==` to compare equality""" - - def __init__(self): - self.values = dict() - self.refs = dict() - - def __getitem__(self, key): - return self.values[id(key)] - - def get(self, key, default=None): - return self.values.get(id(key), default) - - def __contains__(self, key): - return id(key) in self.values - - def __setitem__(self, key, value): - idx = id(key) - if idx not in self.refs: - self.refs[idx] = weakref.ref(key, lambda ref: self._remove_id(idx)) - self.values[idx] = value - - def _remove_id(self, idx): - if idx in self.values: - del self.values[idx] - if idx in self.refs: - del self.refs[idx] - - def clear(self): - self.refs.clear() - self.values.clear() - - -def istype(obj, allowed_types): - """isinstance() without subclasses""" - if isinstance(allowed_types, (tuple, list, set)): - return type(obj) in allowed_types - return type(obj) is allowed_types - - -def is_typing(value): - if sys.version_info < (3, 9): - return isinstance(value, typing._GenericAlias) - else: - return isinstance( - value, (typing._SpecialGenericAlias, typing._UnionGenericAlias) - ) - - -def is_numpy_int_type(value): - return istype( - value, - ( - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - ), - ) - - -def is_numpy_float_type(value): - return istype( - value, - ( - np.float16, - np.float32, - np.float64, - ), - ) - - -def is_numpy_ndarray(value): - return istype(value, np.ndarray) - - -def istensor(obj): - """Check of obj is a tensor""" - tensor_list = ( - torch.Tensor, - torch.nn.Parameter, - *config.traceable_tensor_subclasses, - ) - tensor_list = tensor_list + (torch._subclasses.FakeTensor,) - return istype(obj, tensor_list) - - -def is_lazy_module(mod): - return isinstance(mod, LazyModuleMixin) - - -@functools.lru_cache(4096) -def print_once(*args): - print(*args) - - -def make_cell(val=None): - """Some black magic to create a cell object that usually only exists in a closure""" - x = val - - def f(): - return x - - assert len(f.__closure__) == 1 - 
return f.__closure__[0] - - -def proxy_args_kwargs(args, kwargs): - try: - proxy_args = tuple(arg.as_proxy() for arg in args) - proxy_kwargs = {key: arg.as_proxy() for key, arg in kwargs.items()} - return proxy_args, proxy_kwargs - except NotImplementedError as e: - from .exc import unimplemented - from .variables.base import typestr - - raise unimplemented( - f"call_function args: {typestr(*args)} {typestr(*list(kwargs.values()))}" - ) from e - - -@dataclasses.dataclass -class CompilationMetrics: - frame_key: str - co_name: str - co_filename: str - co_firstlineno: int - cache_size: int - guard_count: Optional[int] - graph_op_count: Optional[int] - graph_node_count: Optional[int] - graph_input_count: Optional[int] - entire_frame_compile_time_s: Optional[float] - backend_compile_time_s: Optional[float] - fail_reason: Optional[str] - - -@dataclasses.dataclass -class CleanupHook: - """Remove a global variable when hook is called""" - - scope: Dict[str, Any] - name: str - - def __call__(self, *args): - CleanupManager.count -= 1 - del self.scope[self.name] - - @staticmethod - def create(scope, name, val): - assert name not in scope - CleanupManager.count += 1 - scope[name] = val - return CleanupHook(scope, name) - - -class CleanupManager(ExactWeakKeyDictionary): - count = 0 - - def _remove_id(self, idx): - for hook in self.values[idx]: - hook() - super()._remove_id(idx) - - -CleanupManager.instance = CleanupManager() - - -def clone_tensor(x): - """Clone the tensor and its gradient""" - y = x.clone().requires_grad_(x.requires_grad) - if x.is_leaf and x.grad is not None: - y.grad = x.grad.clone() - return y - - -def clone_input(x, *, dtype=None): - """copy while preserving strides""" - # TODO: this is questionable - if isinstance(x, torch._subclasses.FakeTensor): - # this func fails on fake tensors in __torch_dispatch__ - return x - - def torch_clone(x): - y = torch.clone(x) - if x.is_leaf: - y.requires_grad_(x.requires_grad) - if x.is_leaf and x.grad is not None: - y.grad = clone_input(x.grad, dtype=dtype) - if hasattr(x, "_dynamo_dynamic_indices"): - y._dynamo_dynamic_indices = x._dynamo_dynamic_indices.copy() - return y - - with torch.no_grad(): - if x.device.type == "xla": - # Access data_ptr() for a xla tensor will cause crash - return torch_clone(x) - - needed_size = sum( - (shape - 1) * stride for shape, stride in zip(x.size(), x.stride()) - ) - if x.is_quantized: - result = torch.empty_quantized((needed_size + 32,), x) - else: - result = torch.empty( - needed_size + 32, dtype=dtype or x.dtype, device=x.device - ) - cache_line_offset = ( - (x.data_ptr() - result.data_ptr()) % 32 - ) // x.element_size() - result.as_strided_(x.size(), x.stride(), cache_line_offset) - try: - result.copy_(x.clone()) - if x.is_leaf: - result.requires_grad_(x.requires_grad) - if x.is_leaf and x.grad is not None: - result.grad = clone_input(x.grad, dtype=dtype) - except RuntimeError: - # RuntimeError: unsupported operation: more than one element of the written-to - # tensor refers to a single memory location. Please clone() the tensor before - # performing the operation. 
- return torch_clone(x) - if hasattr(x, "_dynamo_dynamic_indices"): - result._dynamo_dynamic_indices = x._dynamo_dynamic_indices.copy() - return result - - -def clone_inputs(example_inputs): - if type(example_inputs) is dict: - res = dict(example_inputs) - for key, value in res.items(): - if isinstance(value, tuple): - res[key] = clone_inputs(value) - else: - assert isinstance(value, torch.Tensor), type(value) - res[key] = clone_input(value) - return res - - res = list(example_inputs) - for i in range(len(res)): - if isinstance(res[i], torch.Tensor): - res[i] = clone_input(res[i]) - return res - - -@contextmanager -def preserve_rng_state(): - with torch.utils._python_dispatch._disable_current_modes(): - rng_state = torch.clone(torch.random.get_rng_state()) - if torch.cuda.is_available(): - cuda_rng_state = torch.clone(torch.cuda.get_rng_state()) - try: - yield - finally: - with torch.utils._python_dispatch._disable_current_modes(): - torch.random.set_rng_state(rng_state) - if torch.cuda.is_available(): - torch.cuda.set_rng_state(cuda_rng_state) - - -def is_jit_model(model0): - return isinstance( - model0, - ( - torch.jit._trace.TopLevelTracedModule, - torch.jit._script.RecursiveScriptModule, - torch.jit.ScriptFunction, - torch.jit.ScriptModule, - ), - ) - - -def torchscript(model, example_inputs, verbose=False): - if is_jit_model(model): - # already done? - return model - - try: - return torch.jit.trace(model, example_inputs) - except Exception: - try: - return torch.jit.script(model) - except Exception: - if verbose: - log.exception("jit error") - else: - log.error("Both torch.jit.trace and torch.jit.script failed") - return None - - -def getfile(obj): - try: - return inspect.getfile(obj) - except TypeError: - return None - - -def is_namedtuple(obj): - """Test if an object is a namedtuple or a torch.return_types.* quasi-namedtuple""" - return is_namedtuple_cls(type(obj)) - - -def is_namedtuple_cls(cls): - """Test if an object is a namedtuple or a torch.return_types.* quasi-namedtuple""" - try: - if issubclass(cls, tuple): - bases = getattr(cls, "__bases__", []) or [None] - module = getattr(cls, "__module__", None) - return module == "torch.return_types" or ( - bases[0] is tuple and hasattr(cls, "_make") and hasattr(cls, "_fields") - ) - except TypeError: - pass - return False - - -@functools.lru_cache(1) -def namedtuple_fields(cls): - """Get the fields of a namedtuple or a torch.return_types.* quasi-namedtuple""" - if cls is slice: - return ["start", "stop", "step"] - - assert issubclass(cls, tuple) - if hasattr(cls, "_fields"): - # normal namedtuples - return cls._fields - - @dataclasses.dataclass - class Marker: - index: int - - # frustrating ones e.g. 
torch.return_types.max - assert cls.__module__ == "torch.return_types" - obj = cls(map(Marker, range(cls.n_fields))) - fields = [None] * cls.n_fields - for name in dir(obj): - if name[0] != "_" and isinstance(getattr(obj, name), Marker): - fields[getattr(obj, name).index] = name - return fields - - -def checkpoint_params(gm): - with torch.no_grad(): - rng_state = torch.clone(torch.random.get_rng_state()) - if torch.cuda.is_available(): - cuda_rng_state = torch.clone(torch.cuda.get_rng_state()) - saved_state = [] - for param in itertools.chain(gm.parameters(), gm.buffers()): - saved_state.append((param, param._version, torch.clone(param))) - - def restore(): - with torch.no_grad(): - torch.random.set_rng_state(rng_state) - if torch.cuda.is_available(): - torch.cuda.set_rng_state(cuda_rng_state) - for param, version, original_value in saved_state: - if param._version != version: - param.copy_(original_value) - - return restore - - -def timed(model, example_inputs, times=1): - if torch.cuda.is_available(): - synchronize = torch.cuda.synchronize - else: - synchronize = nothing - - synchronize() - gc.collect() - torch.manual_seed(1337) - t0 = time.perf_counter() - for _ in range(times): - result = model(*example_inputs) - synchronize() - t1 = time.perf_counter() - return result, t1 - t0 - - -def check_is_cuda(gm, example_inputs): - return all(x.is_cuda for x in itertools.chain(example_inputs, gm.parameters(True))) - - -@lru_cache(32) -def rot_n_helper(n): - assert n > 1 - vars = [f"v{i}" for i in range(n)] - rotated = reversed(vars[-1:] + vars[:-1]) - fn = eval(f"lambda {','.join(vars)}: ({','.join(rotated)})") - fn.__name__ = f"rot_{n}_helper" - return fn - - -def is_safe_constant(v): - if istype(v, (tuple, frozenset)): - return all(map(is_safe_constant, v)) - return isinstance(v, (enum.Enum, type)) or istype( - v, - ( - types.CodeType, - int, - float, - bool, - str, - bytes, - type(None), - slice, - type(type), - torch.device, - torch.dtype, - ), - ) - - -def guard_if_dyn(arg): - from .variables import ConstantVariable, SymNodeVariable - - if isinstance(arg, SymNodeVariable): - # This is because SymNodeVariable intentionally doesn't define - # as_python_constant to avoid shunting down some codepaths - # that expect consts. In this case, we know we definitely - # want to specialize though. 
- return arg.evaluate_expr() - elif isinstance(arg, ConstantVariable): - return arg.as_python_constant() - - return arg - - -def check_constant_args(args, kwargs): - return all(x.is_python_constant() for x in itertools.chain(args, kwargs.values())) - - -def check_unspec_python_args(args, kwargs): - from torch._dynamo.variables.constant import ConstantVariable - from torch._dynamo.variables.tensor import UnspecializedPythonVariable - - unspec_count = 0 - for x in itertools.chain(args, kwargs.values()): - if isinstance(x, UnspecializedPythonVariable): - unspec_count += 1 - elif not isinstance(x, (UnspecializedPythonVariable, ConstantVariable)): - return False - else: - pass - - return unspec_count > 0 - - -def check_numpy_ndarray_args(args, kwargs): - from torch._dynamo.variables.tensor import NumpyNdarrayVariable - - return any( - isinstance(x, NumpyNdarrayVariable) - for x in itertools.chain(args, kwargs.values()) - ) - - -def specialize_args_kwargs(tx, args, kwargs): - specialized_args = [] - specialized_kwargs = {} - for x in args: - specialized_args.append(x.as_specialized(tx)) - for k, v in kwargs.items(): - specialized_kwargs.update({k: v.as_specialized(tx)}) - return specialized_args, specialized_kwargs - - -dict_values = type(dict().values()) -odict_values = type(collections.OrderedDict().values()) -tuple_iterator = type(iter(tuple())) -tuple_iterator_len = tuple_iterator.__length_hint__ -object_new = object.__new__ - - -def nn_module_new(cls): - obj = object_new(cls) - torch.nn.Module.__init__(obj) - return obj - - -def product(it): - return functools.reduce(operator.mul, it, 1) - - -def tuple_iterator_getitem(it, index): - _, (obj,), start = it.__reduce__() - return obj[start + index] - - -def enum_repr(value, local): - # enum class can override __str__ method. Use __class__ and name attribute - # to extract the class name and key name. - name = value.__class__.__name__ - val = value.name - scope = "L" if local else "G" - local_name = f'{scope}["{name}"].{val}' - return local_name - - -def dict_param_key_ids(value): - return { - id(k) for k in value.keys() if isinstance(k, (torch.nn.Parameter, torch.Tensor)) - } - - -def dict_const_keys(value): - return { - k for k in value.keys() if not isinstance(k, (torch.nn.Parameter, torch.Tensor)) - } - - -def dict_const_keys_repr(const_keys, *, local): - if any(isinstance(k, enum.Enum) for k in const_keys): - # To workaround repr(Enum) returning invalid global reference before python 3.11 - # by calling enum_repr and removing quotes to render enum in guard code. - const_keys_str = f"{ {enum_repr(k, local=local) if isinstance(k, enum.Enum) else repr(k) for k in const_keys} }".replace( - "'", "" - ) - else: - const_keys_str = f"{const_keys!r}" - return const_keys_str - - -def global_key_name(key): - return f"__dict_key_{id(key)}" - - -from torch._subclasses import ( # noqa: F401 - FakeTensorMode, - UnsupportedFakeTensorException, -) - - -def wrap_fake_exception(fn): - try: - return fn() - except UnsupportedFakeTensorException as e: - from .exc import unimplemented - - msg = f"Unsupported: {e.reason} with fake tensor propagation." 
- log.warning(msg) - raise unimplemented(msg) from e - - -def deepcopy_to_fake_tensor(obj, fake_mode): - with torch._subclasses.fake_tensor.FakeCopyMode(fake_mode): - return wrap_fake_exception(lambda: copy.deepcopy(obj)) - - -def rmse(ref, res): - """ - Calculate root mean squared error - """ - return torch.sqrt(torch.mean(torch.square(ref - res))) - - -def same( - ref, - res, - fp64_ref=None, - cos_similarity=False, - tol=1e-4, - equal_nan=False, - exact_dtype=True, - relax_numpy_equality=False, - ignore_non_fp=False, - log_error=log.error, -): - """Check correctness to see if ref and res match""" - if fp64_ref is None: - fp64_ref = ref - if isinstance(ref, (list, tuple, torch.nn.ParameterList, torch.Size)): - assert isinstance(res, (list, tuple)), f"type mismatch {type(ref)} {type(res)}" - if len(ref) != len(res): - log_error("Length mismatch") - return False - return len(ref) == len(res) and all( - same( - ai, - bi, - fp64_refi, - cos_similarity, - tol, - equal_nan, - exact_dtype, - relax_numpy_equality, - ignore_non_fp, - log_error=log_error, - ) - for ai, bi, fp64_refi in zip(ref, res, fp64_ref) - ) - elif isinstance(ref, dict): - assert isinstance(res, dict) - assert set(ref.keys()) == set( - res.keys() - ), f"keys mismatch {set(ref.keys())} == {set(res.keys())}" - for k in sorted(ref.keys()): - if not ( - same( - ref[k], - res[k], - fp64_ref[k], - cos_similarity=cos_similarity, - tol=tol, - equal_nan=equal_nan, - exact_dtype=exact_dtype, - relax_numpy_equality=relax_numpy_equality, - ignore_non_fp=ignore_non_fp, - log_error=log_error, - ) - ): - log_error("Accuracy failed for key name %s", k) - return False - return True - elif isinstance(ref, torch.Tensor): - assert not isinstance(ref, torch._subclasses.FakeTensor) - assert not isinstance(res, torch._subclasses.FakeTensor) - - if ref.is_sparse: - assert res.is_sparse - ref = ref.to_dense() - res = res.to_dense() - assert isinstance(res, torch.Tensor), f"type mismatch {type(ref)} {type(res)}" - if exact_dtype: - if ref.dtype != res.dtype: - log_error("dtype mismatch %s, %s", ref.dtype, res.dtype) - return False - if ref.dtype == torch.bool: - if ignore_non_fp: - return True - # triton stores bool as int8, so add this for more accurate checking - r = torch.allclose( - ref.to(dtype=torch.uint8), - res.to(dtype=torch.uint8), - atol=tol, - rtol=tol, - equal_nan=equal_nan, - ) - if not r: - log_error("Accuracy failed: uint8 tensor did not match") - return r - - if cos_similarity: - ref = ref.flatten().to(torch.float32) - res = res.flatten().to(torch.float32) - if torch.allclose(ref, res, atol=tol, rtol=tol, equal_nan=True): - # early exit that handles zero/nan better - # cosine_similarity(zeros(10), zeros(10), dim=0) is 0 - return True - score = torch.nn.functional.cosine_similarity(ref, res, dim=0, eps=1e-6) - if score < 0.99: - log.warning("Similarity score=%s", score.cpu().detach().item()) - return score >= 0.99 - else: - if not exact_dtype: - ref = ref.to(res.dtype) - - # First try usual allclose - if torch.allclose(ref, res, atol=tol, rtol=tol, equal_nan=equal_nan): - return True - - # Check error from fp64 version - if fp64_ref.dtype == torch.float64: - ref_error = rmse(fp64_ref, ref).item() - res_error = rmse(fp64_ref, res).item() - multiplier = 2.0 - - if ( - fp64_ref.numel() < 1000 - or (ref.ndim == 4 and ref.shape[-1] == ref.shape[-2] == 1) - # large tol means a benchmark has been specified as REQUIRE_HIGHER_TOLERANCE - or tol >= 2 * 1e-2 - ): - # In the presence of noise, noise might dominate our error - # metric for smaller 
tensors. - # Similary, for 1x1 kernels, there seems to be high noise with amp. - multiplier = 3.0 - - passes_test = res_error <= (multiplier * ref_error + tol / 10.0) - if not passes_test: - log_error( - "RMSE (res-fp64): %.5f, (ref-fp64): %.5f and shape=%s", - res_error, - ref_error, - res.size(), - ) - # import pdb; pdb.set_trace() - return passes_test - - if ignore_non_fp: - return True - - log_error("Accuracy failed: allclose not within tol=%s", tol) - return False - elif isinstance(ref, (str, int, type(None), bool, torch.device)): - if ignore_non_fp: - return True - r = ref == res - if not r: - log_error("Accuracy failed (%s): %s != %s", type(ref), ref, res) - return r - elif isinstance(ref, float): - r = math.isclose(ref, res, rel_tol=tol, abs_tol=tol) - if not r: - log_error( - "Accuracy failed (float): %s != %s (within tol=%s)", ref, res, tol - ) - return r - elif is_numpy_int_type(ref) or is_numpy_float_type(ref): - if relax_numpy_equality and not ( - is_numpy_int_type(res) or is_numpy_float_type(res) - ): - ref = ref.item() - r = (type(ref) is type(res)) and (ref == res) - if not r: - log_error("Accuracy failed (numpy): %s != %s", ref, res) - return r - elif is_numpy_ndarray(ref): - return (type(ref) is type(res)) and same( - torch.as_tensor(ref), - torch.as_tensor(res), - fp64_ref, - cos_similarity=cos_similarity, - tol=tol, - equal_nan=equal_nan, - exact_dtype=exact_dtype, - relax_numpy_equality=relax_numpy_equality, - ignore_non_fp=ignore_non_fp, - log_error=log_error, - ) - elif type(ref).__name__ in ( - "MaskedLMOutput", - "Seq2SeqLMOutput", - "CausalLMOutputWithCrossAttentions", - "LongformerMaskedLMOutput", - "Instances", - "SquashedNormal", - "Boxes", - "Normal", - "TanhTransform", - "Foo", - "Variable", - ): - assert type(ref) is type(res) - return all( - same( - getattr(ref, key), - getattr(res, key), - getattr(fp64_ref, key), - cos_similarity=cos_similarity, - tol=tol, - equal_nan=equal_nan, - exact_dtype=exact_dtype, - relax_numpy_equality=relax_numpy_equality, - ignore_non_fp=ignore_non_fp, - log_error=log_error, - ) - for key in ref.__dict__.keys() - ) - else: - raise RuntimeError(f"unsupported type: {type(ref).__name__}") - - -def format_func_info(code): - short_filename = code.co_filename.split("/")[-1] - return f"'{code.co_name}' ({short_filename}:{code.co_firstlineno})" - - -@contextlib.contextmanager -def disable_cache_limit(): - prior = config.cache_size_limit - config.cache_size_limit = sys.maxsize - - try: - yield - finally: - config.cache_size_limit = prior - - -# map from transformed code back to original user code -orig_code_map = ExactWeakKeyDictionary() - -# keep a record of code_obj -> list of guard failure reasons for logging -guard_failures = collections.defaultdict(list) - -# Keep a record of graph break reasons for logging -graph_break_reasons = list() - -# keep record of compiled code, if we are in "error if recompile" -# to track code that dynamo has compiled previously -seen_code_map = ExactWeakKeyDictionary() - - -class CompileProfiler: - """Utility for profiling how and what dynamo would compile. 
- - Can be used for - * diagnosing recompilation issues - * determining an appropriate compile cache limit - * (TODO)confirming which functions got compiled/skipped - """ - - def __init__(self): - self.frame_count = 0 - self.op_count = 0 - self.backend_ctx_ctor = lambda: disable_cache_limit() - - def __call__(self, gm: torch.fx.GraphModule, example_inputs): - self.frame_count += 1 - for node in gm.graph.nodes: - if "call" in node.op: - self.op_count += 1 - return gm.forward - - def __enter__(self): - self.old_report_guard_failure = config.report_guard_failures - config.report_guard_failures = True - return self - - def __exit__(self, typ, val, traceback): - config.report_guard_failures = self.old_report_guard_failure - - def get_metrics(self): - return {"guard_failures": guard_failures} - - def report(self): - metrics = self.get_metrics() - gf = metrics["guard_failures"] - - def num_recompiles(code): - return len(gf[code]) - - def recompile_reasons(code): - return "\n".join([str(x) for x in gf[code]]) - - summarized_gf = [ - [format_func_info(code), num_recompiles(code), recompile_reasons(code)] - for code in gf - ] - - def graph_break_report(): - if "graph_break" in counters: - graph_breaks = counters["graph_break"] - return tabulate( - [[msg, graph_breaks[msg]] for msg in graph_breaks], - headers=["Graph Break Reason", "Count"], - ) - - def recompilation_report(): - if len(gf): - max_recompiles = max([num_recompiles(code) for code in gf]) - recomp_table = tabulate( - summarized_gf, - headers=["Function", "Recompiles", "Recompile Reasons"], - ) - return recomp_table + textwrap.dedent( - f""" - - Set torch._dynamo.config.cache_size_limit to {max_recompiles} to avoid being cache limited. - """ - ) - - report = textwrap.dedent( - """ - Torchdynamo Profiler Report - =========================== - - Graph Breaks - ------------ - Graph breaks happen when torchdynamo encounters code it can't safely trace. - If you want to find out why breaks are happening, check below for each break reason - You may gain additional insight by passing `fullgraph=True` to torch.compile, - to stop at the first break. - - """ - ) - report += graph_break_report() or "No graph breaks detected." - report += textwrap.dedent( - """ - - Recompilation - ------------- - These subgraphs were recompiled more than once due to guard failures - Guard failures indicate some condition assumed to be static by the tracer changed, - making it unsafe to reuse the compiled program. - - """ - ) - report += recompilation_report() or "No recompilation detected.\n" - return report - - -# return same dir unless user changes config between calls -@functools.lru_cache(None) -def _get_debug_dir(root_dir): - dir_name = ( - "run_" - + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f") - # use pid to avoid conflicts among ranks - + "-pid_" - + str(os.getpid()) - ) - return os.path.join(root_dir, dir_name) - - -def get_debug_dir(): - debug_root = config.debug_dir_root - return _get_debug_dir(debug_root) - - -def get_fake_value(node, tx): - """ - Run the computation represented by `node` using fake tensors and return the result. 
- """ - from .exc import ( - TorchRuntimeError, - unimplemented, - Unsupported, - UserError, - UserErrorType, - ) - - op = node.op - - def fake_wrapper(e): - if isinstance(e, torch.Tensor): - assert is_fake(e) - return e - - def visit(n: torch.fx.Node): - return n.meta["example_value"] - - args, kwargs = torch.fx.node.map_arg((node.args, node.kwargs), visit) - args = tree_map(fake_wrapper, args) - kwargs = tree_map(fake_wrapper, kwargs) - - nnmodule = None - if op == "call_method" and len(args) > 0 and isinstance(args[0], torch.nn.Module): - # If the first argument is nn.Module, should copy to fake mode. - args = (deepcopy_to_fake_tensor(args[0], tx.fake_mode),) + tuple(args[1:]) - - if op == "call_module": - nnmodule = tx.output.nn_modules[node.target] - - if is_lazy_module(nnmodule) and hasattr(nnmodule, "_initialize_hook"): - # In the case of a lazy module, we want to run - # the pre-hooks which initialize it. - # Afterwards, lazy module deletes its pre-hooks - # to avoid treating it as lazy on subsequent recompile. - nnmodule._infer_parameters(nnmodule, args) - - # no matter it's lazy module or not, we should copy to fake mode. - nnmodule = deepcopy_to_fake_tensor(nnmodule, tx.fake_mode) - - try: - with tx.fake_mode, enable_python_dispatcher(): - return wrap_fake_exception( - lambda: run_node(tx.output, node, args, kwargs, nnmodule) - ) - except Unsupported: - raise - except RuntimeError as e: - cause = e - if e.__cause__ is not None: - cause = e.__cause__ - - if isinstance( - cause, torch._subclasses.fake_tensor.DataDependentOutputException - ): - unimplemented(f"data dependent operator: {cause.func}") - elif isinstance( - cause, torch._subclasses.fake_tensor.DynamicOutputShapeException - ): - unimplemented(f"dynamic shape operator: {cause.func}") - elif isinstance( - cause, torch._subclasses.fake_tensor.UnsupportedOperatorException - ): - unimplemented( - f"unsupported operator: {cause.func} (see " - "https://docs.google.com/document/d/1GgvOe7C8_NVOMLOCwDaYV1mXXyHMXY7ExoewHqooxrs/edit#heading=h.64r4npvq0w0" - " for how to fix)" - ) - elif isinstance( - cause, torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode - ): - unimplemented("guard on data-dependent symbolic int/float") - elif isinstance(cause, torch.utils._sympy.value_ranges.ValueRangeError): - raise UserError(UserErrorType.CONSTRAIN_VIOLATION, e.args[0]) from e - raise TorchRuntimeError(str(e)).with_traceback(e.__traceback__) from None - - -def run_node(tracer, node, args, kwargs, nnmodule): - """ - Runs a given node, with the given args and kwargs. - - Behavior is dicatated by a node's op. - - run_node is useful for extracting real values out of nodes. - See get_real_value for more info on common usage. - - Note: The tracer arg is only used for 'get_attr' ops - Note: The nnmodule arg is only used for 'call_module' ops - - Nodes that are not call_function, call_method, call_module, or get_attr will - raise an AssertionError. 
- """ - op = node.op - try: - if op == "call_function": - return node.target(*args, **kwargs) - elif op == "call_method": - return getattr(args[0], node.target)(*args[1:], **kwargs) - elif op == "call_module": - assert nnmodule is not None - return nnmodule(*args, **kwargs) - elif op == "get_attr": - return tracer.get_submodule(node.target) - elif op == "placeholder": - assert "example_value" in node.meta - return node.meta["example_value"] - except Exception as e: - fn_str = f"Failed running {op} {node.target}(*{args}, **{kwargs}):\n" - raise RuntimeError(fn_str + str(e)).with_traceback(e.__traceback__) from e - - raise AssertionError(op) - - -def get_real_value(node, tracer): - """ - Run the actual computation represented by `node` and return the result. - This will execute any dependent nodes in the graph as well. - """ - from .exc import TorchRuntimeError - - cache = tracer.real_value_cache - if node in cache: - return cache[node] - - op = node.op - args, kwargs = torch.fx.node.map_arg( - (node.args, node.kwargs), - lambda n: get_real_value(n, tracer), - ) - - if op == "call_module": - nn_module = tracer.output_graph.nn_modules[node.target] - if not is_lazy_module(nn_module): - nn_module = copy.deepcopy(nn_module) - else: - # In the case of a lazy module, we want to run - # the pre-hooks which initialize it - nn_module(*args, **kwargs) - else: - nn_module = None - - try: - real_value = run_node(tracer, node, args, kwargs, nn_module) - cache[node] = real_value - except RuntimeError as e: - raise TorchRuntimeError(str(e)).with_traceback(e.__traceback__) from None - return real_value - - -def assert_no_fake_params_or_buffers(gm): - from torch._subclasses.fake_tensor import FakeTensorConfig - - def stack_or_hint(t): - if FakeTensorConfig.debug: - import traceback - - return f"FAKE TENSOR CREATION TRACEBACK: \n {traceback.format_list(t._debug_trace)}" - else: - return "Enable TORCH_FAKE_TENSOR_DEBUG=1 to get creation stack traces on fake tensors." - - for name, buffer in gm.named_buffers(): - assert not isinstance( - buffer, torch._subclasses.FakeTensor - ), f"Unexpected fake buffer {name} {stack_or_hint(buffer)}" - for name, param in gm.named_parameters(): - assert not isinstance( - param, torch._subclasses.FakeTensor - ), f"Unexpected fake param {name} {stack_or_hint(param)}" - - -def fqn(obj: Any): - """ - Returns the fully qualified name of the object. 
- """ - return f"{obj.__module__}.{obj.__qualname__}" - - -def ifdynstaticdefault(count1, count2): - if torch._dynamo.config.assume_static_by_default: - return count1 - else: - return count2 - - -def import_submodule(mod: types.ModuleType): - """ - Ensure all the files in a given submodule are imported - """ - for filename in sorted(os.listdir(os.path.dirname(mod.__file__))): - if filename.endswith(".py") and filename[0] != "_": - importlib.import_module(f"{mod.__name__}.{filename[:-3]}") - - -def object_has_getattribute(value: Any): - try: - if isinstance( - inspect.getattr_static(type(value), "__getattribute__"), - types.FunctionType, - ): - return True - except AttributeError: - pass - return False - - -def get_custom_getattr(value: Any): - try: - getattr_fn = inspect.getattr_static(type(value), "__getattr__") - except AttributeError: - getattr_fn = None - if getattr_fn is torch.nn.Module.__getattr__: - # ignore this case of getattr - getattr_fn = None - return getattr_fn - - -class TensorStaticReason(enum.Enum): - PARAMETER = 2 - NOT_TENSOR = 4 - NN_MODULE_PROPERTY = 5 - - -def tensor_static_reason_to_message(reason: TensorStaticReason): - if reason == TensorStaticReason.PARAMETER: - return "mark_dynamic on parameter, parameters are always static today." - if reason == TensorStaticReason.NOT_TENSOR: - return "mark_dynamic on a non tensor, how did this happen?" - if reason == TensorStaticReason.NN_MODULE_PROPERTY: - return "tensor is static because it is nn module associated." - raise AssertionError(f"Illegal reason {reason}") - - -def tensor_always_has_static_shape( - tensor: Union[torch.Tensor, Any], is_tensor: bool, guard_source: "GuardSource" -) -> Tuple[bool, TensorStaticReason]: - """ - Given a tensor, source, and is_tensor flag, determine if a shape should be static. - - Args: - tensor - the real tensor to evaluate, parameters force a static shape. - is_tensor - internal dynamo check, esentially "is_tensor": target_cls is TensorVariable, - tensors not in a TensorVariable for whatever reason are forced static. - - Returns a tuple, where the first element is the bool of whether or not this tensor should have a static shape. - The second element is a TensorStaticReason, useful for passing to tensor_static_reason_to_message if needed. 
- """ - if guard_source.is_nn_module() and config.force_nn_module_property_static_shapes: - return True, TensorStaticReason.NN_MODULE_PROPERTY - if type(tensor) is torch.nn.Parameter and config.force_parameter_static_shapes: - return True, TensorStaticReason.PARAMETER - if not is_tensor: - return True, TensorStaticReason.NOT_TENSOR - return False, None - - -class LazyString: - def __init__(self, func, *args, **kwargs): - self.func = func - self.args = args - self.kwargs = kwargs - - def __str__(self): - return self.func(*self.args, **self.kwargs) - - -def lazy_format_graph_code(name, gm, maybe_id=None): - def format_name(): - if maybe_id is not None: - return f"{name} {maybe_id}" - else: - return name - - return LazyString( - lambda: _format_graph_code( - f"===== {format_name()} =====\n", - gm.forward.__code__.co_filename, - gm.print_readable(print_output=False), - ) - ) - - -def _format_graph_code(name, filename, graph_str): - return f"TRACED GRAPH\n {name} {filename} {graph_str}\n" - - -def lazy_format_graph_tabular(fn_name, gm): - def inner(): - try: - from tabulate import tabulate # TODO: Check that this is installed - except ImportError: - return ( - "Tabulate module missing, please install tabulate to log the graph in tabular format, logging code instead:\n" - + str(lazy_format_graph_code(fn_name, gm)) - ) - - node_specs = [ - [n.op, n.name, n.target, n.args, n.kwargs] for n in gm.graph.nodes - ] - graph_str = tabulate( - node_specs, headers=["opcode", "name", "target", "args", "kwargs"] - ) - return _format_graph_code(fn_name, gm.forward.__code__.co_filename, graph_str) - - return LazyString(inner) - - -def format_bytecode(prefix, name, filename, line_no, code): - return f"{prefix} {name} {filename} line {line_no} \n{dis.Bytecode(code).dis()}\n" - - -forward_hook_names = ["_forward_pre_hooks", "_forward_hooks"] -backward_hook_names = ["_backward_pre_hooks", "_backward_hooks"] -state_dict_hook_names = [ - "_state_dict_pre_hooks", - "_state_dict_hooks", - "_load_state_dict_pre_hooks", - "_load_state_dict_post_hooks", -] -all_hook_names = forward_hook_names + backward_hook_names + state_dict_hook_names - - -def nn_module_get_all_hooks( - mod, - check_forward_hooks=False, - check_backward_hooks=False, - check_state_dict_hooks=False, -): - reset_code = torch._C._dynamo.eval_frame.reset_code - """ - Sometimes its useful to differentiate between types of hooks such as forward/backward/pre - hooks executed during module.__call__, and state_dict hooks which are executed separately. - """ - hook_dicts_to_check = [] - check_all_hooks = ( - not check_forward_hooks - and not check_backward_hooks - and not check_state_dict_hooks - ) - if check_forward_hooks or check_all_hooks: - hook_dicts_to_check.extend(forward_hook_names) - if check_backward_hooks or check_all_hooks: - hook_dicts_to_check.extend(backward_hook_names) - if check_state_dict_hooks: - hook_dicts_to_check.extend(state_dict_hook_names) - - all_hooks = [] - for hook_dict_name in hook_dicts_to_check: - hooks = getattr(mod, hook_dict_name, []) - for hook_name in hooks: - hook = hooks[hook_name] - - all_hooks.append(hook) - return all_hooks - - -def nnmodule_has_hooks( - mod, - check_forward_hooks=False, - check_backward_hooks=False, - check_state_dict_hooks=False, -): - """ - Helper function to check if a module has any hooks attached to it. 
- """ - hooks = nn_module_get_all_hooks( - mod, - check_forward_hooks=check_forward_hooks, - check_backward_hooks=check_backward_hooks, - check_state_dict_hooks=check_state_dict_hooks, - ) - return bool(hooks) - - -def to_numpy_helper(value): - """Convert tensor and tnp.ndarray to numpy.ndarray.""" - if isinstance(value, tnp.ndarray): - return to_numpy_helper(value.tensor) - elif isinstance(value, torch.Tensor): - return value.cpu().numpy() - elif isinstance(value, (tuple, list)): - return type(value)(to_numpy_helper(obj) for obj in value) - else: - return value - - -def numpy_to_tensor(value): - """Convert tnp.ndarray to tensor, leave other types intact. If a list/tuple, loop through it to convert.""" - if isinstance(value, np.ndarray): - return torch.as_tensor(value) - if isinstance(value, tnp.ndarray): - return value.tensor - elif isinstance(value, (tuple, list)): - return type(value)(numpy_to_tensor(obj) for obj in value) - else: - return value - - -class numpy_to_tensor_wrapper: - def __init__(self, f): - self.f = f - self.__name__ = "wrapped_" + self.f.__name__ - - def __repr__(self): - return f">" - - def __call__(self, *args, **kwargs): - out = self.f(*args, **kwargs) - return numpy_to_tensor(out) - - -def numpy_attr_wrapper(obj, name): - if isinstance(obj, tnp.ndarray): - out = getattr(obj, name) - return numpy_to_tensor(out) - elif isinstance(obj, torch.Tensor): - out = getattr(tnp.ndarray(obj), name) - return numpy_to_tensor(out) - - -class numpy_method_wrapper: - """Convert obj from torch.Tensor to tnp.ndarray and call method. Then convert result back to torch.Tensor.""" - - def __init__(self, method: str): - self.method = method - self.__name__ = "wrapped_" + self.method - - def __repr__(self): - return f">" - - def __call__(self, *args, **kwargs): - obj = args[0] - if isinstance(obj, torch.Tensor): - obj = tnp.ndarray(obj) - method_callable = getattr(obj, self.method) - out = method_callable(*args[1:], **kwargs) - return numpy_to_tensor(out) - - -def defake(x): - if not isinstance(x, FakeTensor): - return x - if x._has_symbolic_sizes_strides: - size = [ - s.node.shape_env.size_hint(s.node.expr) - if isinstance(s, torch.SymInt) - else s - for s in x.size() - ] - stride = [ - s.node.shape_env.size_hint(s.node.expr) - if isinstance(s, torch.SymInt) - else s - for s in x.stride() - ] - else: - size = x.size() - stride = x.stride() - y = torch.empty_strided( - size, - stride, - dtype=x.dtype, - device=x.device, - requires_grad=x.requires_grad, - ) - y.zero_() - return y - - -def is_utils_checkpoint(obj): - # Lazy import to avoid circular dependenices - import torch.utils.checkpoint - - return obj is torch.utils.checkpoint.checkpoint - - -def build_checkpoint_variable(**options): - import torch._higher_order_ops.wrap as higher_order_ops - from .variables.higher_order_ops import TorchHigherOrderOperatorVariable - - # TODO - This is a temporary sitaution where we have two versions of - # checkpointing implemetation. We will converge on one and remove the other. 
- activation_checkpoint_op = higher_order_ops.tag_activation_checkpoint - if torch._functorch.config.functionalize_rng_ops: - activation_checkpoint_op = higher_order_ops.wrap_activation_checkpoint - - return TorchHigherOrderOperatorVariable.make( - activation_checkpoint_op, - **options, - ) - - -def is_compile_supported(device_type): - from .eval_frame import is_dynamo_supported - - compile_supported = is_dynamo_supported() - if device_type == "cpu": - pass - elif device_type == "cuda" and compile_supported: - from torch._inductor.utils import has_triton - - compile_supported = has_triton() - else: - compile_supported = False - return compile_supported - - -# The following 3.11 source code functions are adapted from -# https://github.com/python/cpython/blob/v3.11.4/Lib/traceback.py -# in order to output source code corresponding to bytecode in 3.11+. -# We need our own versions since we want to support multiline expressions. -def _fix_offset(str: str, offset: int) -> int: - """ - Convert byte offset `offset` of `str` into character offset. - Byte offset is used for 3.11+ instruction column data. - Takes things like unicode characters into consideration. - - Unchanged from CPython implementation. - """ - as_utf8 = str.encode("utf-8") - return len(as_utf8[:offset].decode("utf-8", errors="replace")) - - -@dataclasses.dataclass -class _Anchors: - # inclusive - left_end_lineno: int - left_end_offset: int - right_start_lineno: int - # exclusive - right_start_offset: int - - -def _extract_anchors_from_expr(segment: str) -> Optional[_Anchors]: - """ - Given source code `segment` corresponding to a bytecode - instruction, determine: - - for binary ops, the location of the binary op - - for indexing, the location of the brackets. - `segment` is expected to be a valid Python expression - """ - assert sys.version_info >= (3, 11) - - import ast - - try: - # Without brackets, `segment` is parsed as a statement. - # We expect an expression, so wrap `segment` in - # brackets to handle multi-line expressions. - tree = ast.parse("(\n" + segment + "\n)") - except SyntaxError: - return None - - if len(tree.body) != 1: - return None - - lines = segment.split("\n") - - # get character index given byte offset - def normalize(lineno, offset): - return _fix_offset(lines[lineno], offset) - - # Gets the next valid character index in `lines`, if - # the current location is not valid. Handles empty lines. - def next_valid_char(lineno, col): - while lineno < len(lines) and col >= len(lines[lineno]): - col = 0 - lineno += 1 - assert lineno < len(lines) and col < len(lines[lineno]) - return lineno, col - - # Get the next valid character index in `lines`. - def increment(lineno, col): - col += 1 - lineno, col = next_valid_char(lineno, col) - assert lineno < len(lines) and col < len(lines[lineno]) - return lineno, col - - # Get the next valid character at least on the next line - def nextline(lineno, col): - col = 0 - lineno += 1 - lineno, col = next_valid_char(lineno, col) - assert lineno < len(lines) and col < len(lines[lineno]) - return lineno, col - - statement = tree.body[0] - if isinstance(statement, ast.Expr): - expr = statement.value - if isinstance(expr, ast.BinOp): - # ast gives locations for BinOp subexpressions, e.g. 
- # ( left_expr ) + ( right_expr ) - # left^^^^^ right^^^^^ - # -2 since end_lineno is 1-indexed and because we added an extra - # bracket to `segment` when calling ast.parse - cur_lineno = expr.left.end_lineno - 2 - cur_col = normalize(cur_lineno, expr.left.end_col_offset) - cur_lineno, cur_col = next_valid_char(cur_lineno, cur_col) - - # Heuristic to find the operator character. - # The original CPython implementation did not look for ), \, or #, - # leading to incorrect anchor location, e.g. - # (x) + (y) - # ~~^~~~~~~ - while (ch := lines[cur_lineno][cur_col]).isspace() or ch in ")\\#": - if ch in "\\#": - cur_lineno, cur_col = nextline(cur_lineno, cur_col) - else: - cur_lineno, cur_col = increment(cur_lineno, cur_col) - - # binary op is 1 or 2 characters long, on the same line - right_col = cur_col + 1 - if ( - right_col < len(lines[cur_lineno]) - and not (ch := lines[cur_lineno][right_col]).isspace() - and ch not in "\\#" - ): - right_col += 1 - # right_col can be invalid since it is exclusive - - return _Anchors(cur_lineno, cur_col, cur_lineno, right_col) - elif isinstance(expr, ast.Subscript): - # ast gives locations for value and slice subexpressions, e.g. - # ( value_expr ) [ slice_expr ] - # value^^^^^ slice^^^^^ - # subscript^^^^^^^^^^^^^^^^^^^^ - # find left bracket (first '[' after value) - left_lineno = expr.value.end_lineno - 2 - left_col = normalize(left_lineno, expr.value.end_col_offset) - left_lineno, left_col = next_valid_char(left_lineno, left_col) - while lines[left_lineno][left_col] != "[": - left_lineno, left_col = increment(left_lineno, left_col) - # find right bracket (final character of expression) - right_lineno = expr.end_lineno - 2 - right_col = normalize(right_lineno, expr.end_col_offset) - return _Anchors(left_lineno, left_col, right_lineno, right_col) - elif isinstance(expr, ast.Call): - # ( func_expr ) (args, kwargs) - # func^^^^^ - # call^^^^^^^^^^^^^^^^^^^^^^^^ - # find left bracket (first '(' after func) - left_lineno = expr.func.end_lineno - 2 - left_col = normalize(left_lineno, expr.func.end_col_offset) - left_lineno, left_col = next_valid_char(left_lineno, left_col) - while lines[left_lineno][left_col] != "(": - left_lineno, left_col = increment(left_lineno, left_col) - # find right bracket (final character of expression) - right_lineno = expr.end_lineno - 2 - right_col = normalize(right_lineno, expr.end_col_offset) - return _Anchors(left_lineno, left_col, right_lineno, right_col) - - return None - - -def get_instruction_source_311(code: types.CodeType, inst: dis.Instruction) -> str: - """ - Python 3.11+ only. Returns lines of source code (from code object `code`) - corresponding to `inst`'s location data, and underlines relevant code to `inst`. - - Example: CALL on `g`: - f(g( - ^^ - h(x))) - ^^^^^ - - We need our own implementation since `format_frame_summary` in - Python's `traceback` module doesn't handle multi-line expressions - (and their anchor extraction code is not completely correct). - """ - if inst.positions.lineno is None: - return "" - # The rstrip + "\n" pattern is used throughout this function to handle - # linecache.getline errors. Error lines are treated as empty strings "", but we want - # to treat them as blank lines "\n". 
- first_line = linecache.getline(code.co_filename, inst.positions.lineno).rstrip() - if inst.positions.end_lineno is None: - return first_line - if inst.positions.col_offset is None or inst.positions.end_col_offset is None: - return first_line - - # character index of the start of the instruction - start_offset = _fix_offset(first_line, inst.positions.col_offset) - # character index of the end of the instruction - # compute later since end may be a different line - end_offset = None - # expression corresponding to the instruction so we can get anchors - segment = "" - # underline markers to be printed - start with `~` marker and replace with `^` later - markers = [] - - # Compute segment and initial markers - if inst.positions.end_lineno == inst.positions.lineno: - end_offset = _fix_offset(first_line, inst.positions.end_col_offset) - segment = first_line[start_offset:end_offset] - markers.append(" " * start_offset + "~" * (end_offset - start_offset)) - else: - segment = first_line[start_offset:] + "\n" - markers.append(" " * start_offset + "~" * (len(first_line) - start_offset)) - last_line = linecache.getline( - code.co_filename, inst.positions.end_lineno - ).rstrip() - end_offset = _fix_offset(last_line, inst.positions.end_col_offset) - for lineno in range(inst.positions.lineno + 1, inst.positions.end_lineno): - line = linecache.getline(code.co_filename, lineno).rstrip() - segment += line + "\n" - # don't underline leading spaces - num_spaces = len(line) - len(line.lstrip()) - markers.append(" " * num_spaces + "~" * (len(line) - num_spaces)) - segment += last_line[:end_offset] - num_spaces = len(last_line) - len(last_line.lstrip()) - markers.append(" " * num_spaces + "~" * (end_offset - num_spaces)) - - anchors: Optional[_Anchors] = None - try: - anchors = _extract_anchors_from_expr(segment) - except AssertionError: - pass - - # replace `~` markers with `^` where necessary - if anchors is None: - markers = [marker.replace("~", "^") for marker in markers] - else: - # make markers mutable - markers = [list(marker) for marker in markers] - - # anchor positions do not take start_offset into account - if anchors.left_end_lineno == 0: - anchors.left_end_offset += start_offset - if anchors.right_start_lineno == 0: - anchors.right_start_offset += start_offset - - # Turn `~`` markers between anchors to `^` - for line in range(len(markers)): - for col in range(len(markers[line])): - if line < anchors.left_end_lineno: - continue - if line == anchors.left_end_lineno and col < anchors.left_end_offset: - continue - if ( - line == anchors.right_start_lineno - and col >= anchors.right_start_offset - ): - continue - if line > anchors.right_start_lineno: - continue - if markers[line][col] == "~": - markers[line][col] = "^" - - # make markers into strings again - markers = ["".join(marker) for marker in markers] - - result = "" - for i in range(len(markers)): - result += ( - linecache.getline(code.co_filename, inst.positions.lineno + i).rstrip() - + "\n" - ) - result += markers[i] + "\n" - return result - - -def is_guard_failure_reporting_enabled(): - return ( - config.report_guard_failures - or torch._logging._internal.log_state.is_artifact_enabled("recompiles") - ) - - -def get_static_address_type(t): - if isinstance(t, torch.Tensor): - return getattr(t, "_dynamo_static_input_type", None) - - return None diff --git a/userbenchmark/dynamo/common.py b/userbenchmark/dynamo/common.py deleted file mode 100644 index 075b88f4a0..0000000000 --- a/userbenchmark/dynamo/common.py +++ /dev/null @@ -1,3577 +0,0 @@ 
-#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import collections -import contextlib -import copy -import csv -import dataclasses -import functools -import importlib -import itertools -import logging -import os -import pathlib -import random -import shutil -import signal -import subprocess -import sys -import time -from contextlib import contextmanager - -from typing import Any, Callable, Mapping, NamedTuple, Optional, Tuple, Type -from unittest.mock import MagicMock - -import numpy as np -import pandas as pd -import psutil -import torch -import torch._dynamo -import torch._dynamo.utils -import torch._export -import torch.distributed -import torch.fx._pytree as fx_pytree -import torch.multiprocessing as mp -from scipy.stats import gmean, ttest_ind -from torch._dynamo.profiler import fx_insert_profiling, Profiler -from torch._dynamo.testing import dummy_fx_compile, format_speedup, same -from torch._dynamo.utils import clone_inputs -from torch._functorch.aot_autograd import set_model_name -from torch._inductor import config as inductor_config -from torch._inductor.utils import fresh_inductor_cache -from torch._subclasses.fake_tensor import FakeTensorMode - -from torch.utils import _pytree as pytree -from torch.utils._pytree import tree_map, tree_map_only - -from tqdm.auto import tqdm, trange - -log = logging.getLogger(__name__) - -# We are primarily interested in TF32 -torch.backends.cuda.matmul.allow_tf32 = True - -# Suppress torch.profiler spam -os.environ["KINETO_LOG_LEVEL"] = "5" - -current_name = "" -current_device = "" -current_onnx_compiler = "" -current_batch_size = None -output_filename = None - -MAX_DOWNLOAD_ATTEMPTS = 5 - - -class CI(NamedTuple): - backend: str # aot_eager or inductor - training: bool - dynamic: bool = False - device: str = "cuda" - - -CI_SKIP = collections.defaultdict(list) - - -# Skips for dynamic=False - -# Here eager really means dynamo+eager -CI_SKIP[CI("eager", training=False)] = [ - # TorchBench - "DALLE2_pytorch", # AttributeError: text_encodings - "hf_BigBird", # fail_accuracy - # TypeError: pad_center() takes 1 positional argument but 2 were given - "tacotron2", - # Huggingface - "DebertaV2ForQuestionAnswering", # OOM -] - -CI_SKIP[CI("eager", training=True)] = [ - *CI_SKIP[CI("eager", training=False)], - # TorchBench - "BERT_pytorch", # accuracy - "Background_Matting", # fp64_OOM - "hf_BigBird", # fp64_OOM - "hf_T5_base", # fp64_OOM - "llama", # Accuracy failed: allclose not within tol=0.001 - "vision_maskrcnn", # The size of tensor a (29) must match the size of tensor b (33) (doesn't repro) - # Huggingface - "XGLMForCausalLM", # OOM - # TIMM - "cait_m36_384", # fp64_OOM - "convit_base", # fp64_OOM - "mobilenetv2_100", # accuracy - "xcit_large_24_p8_224", # fp64_OOM, -] - -CI_SKIP[CI("aot_eager", training=False)] = [ - *CI_SKIP[CI("eager", training=False)], - # all dynamic shapes errors for detectron variants - "demucs", # OOM - "detectron2_fasterrcnn_r_101_c4", - "detectron2_fasterrcnn_r_101_dc5", - "detectron2_fasterrcnn_r_101_fpn", - "detectron2_fasterrcnn_r_50_c4", - "detectron2_fasterrcnn_r_50_dc5", - "detectron2_fasterrcnn_r_50_fpn", - "detectron2_fcos_r_50_fpn", - "detectron2_maskrcnn_r_101_c4", - "detectron2_maskrcnn_r_101_fpn", - "detectron2_maskrcnn_r_50_c4", - "detectron2_maskrcnn_r_50_fpn", - "hf_BigBird", # OOM - "tacotron2", # AssertionError: Deduped args out of bounds - # Huggingface - "BartForConditionalGeneration", # OOM - "DebertaV2ForQuestionAnswering", # OOM - # Torchbench - "speech_transformer", # 
https://github.com/pytorch/pytorch/issues/99893 - "pyhpc_isoneutral_mixing", # https://github.com/pytorch/pytorch/issues/99893 - "pyhpc_turbulent_kinetic_energy", # https://github.com/pytorch/pytorch/issues/99893 -] - -CI_SKIP[CI("aot_eager", training=True)] = [ - *CI_SKIP[CI("aot_eager", training=False)], - # TorchBench - "Background_Matting", # fp64_OOM - "hf_T5_base", # fp64_OOM - "mobilenet_v2_quantized_qat", # fp64_OOM - "resnet50_quantized_qat", # fp64_OOM - "pytorch_struct", - # Huggingface - "MBartForConditionalGeneration", # OOM - "M2M100ForConditionalGeneration", # OOM - "XGLMForCausalLM", # OOM - # TIMM - "cait_m36_384", # fp64_OOM - "convit_base", # fp64_OOM - "fbnetv3_b", # Accuracy (blocks.2.2.bn1.weight.grad) - "levit_128", # Accuracy (patch_embed.0.c.weight.grad) - "lcnet_050", # Accuracy (blocks.1.0.bn2.weight.grad) - "sebotnet33ts_256", # Accuracy (stem.conv1.conv.weight.grad) - "xcit_large_24_p8_224", # fp64_OOM, -] - -CI_SKIP[CI("inductor", training=False)] = [ - # TorchBench - "DALLE2_pytorch", # AttributeError: text_encodings - "demucs", # OOM - "detectron2_fasterrcnn_r_101_c4", - "detectron2_fasterrcnn_r_101_dc5", - "detectron2_fasterrcnn_r_101_fpn", - "detectron2_fasterrcnn_r_50_c4", - "detectron2_fasterrcnn_r_50_dc5", - "detectron2_fasterrcnn_r_50_fpn", - "detectron2_fcos_r_50_fpn", - "detectron2_maskrcnn_r_101_c4", - "detectron2_maskrcnn_r_101_fpn", - "detectron2_maskrcnn_r_50_c4", - "detectron2_maskrcnn_r_50_fpn", - # TorchBench - "detectron2", - "densenet121", # flaky accuracy - "hf_T5", # accuracy - "hf_BigBird", # accuracy - "hf_GPT2_large", # OOM - "maml", # accuracy - "mobilenet_v2_quantized_qat", # The eval test only supports CPU - "pytorch_struct", # Test eval is not implemented - "pyhpc_equation_of_state", # Accuracy - "pyhpc_turbulent_kinetic_energy", # Accuracy - "tacotron2", -] - -CI_SKIP[CI("inductor", training=False, device="cpu")] = [ - # TorchBench - "drq", # Need to update torchbench - "detectron2_fasterrcnn_r_101_c4", - "detectron2_fasterrcnn_r_101_dc5", - "detectron2_fasterrcnn_r_101_fpn", - "detectron2_fasterrcnn_r_50_c4", - "detectron2_fasterrcnn_r_50_dc5", - "detectron2_fasterrcnn_r_50_fpn", - "detectron2_fcos_r_50_fpn", - "detectron2_maskrcnn_r_101_c4", - "detectron2_maskrcnn_r_101_fpn", - "detectron2_maskrcnn_r_50_c4", - "detectron2_maskrcnn_r_50_fpn", - "doctr_det_predictor", # requires newer gcc - "doctr_reco_predictor", # requires newer gcc - "gat", # does not work with fp32 - "gcn", # does not work with fp32 - "hf_Bert_large", # OOM - "hf_GPT2_large", # Intermittent failure on CI - "hf_T5_base", # OOM - "mobilenet_v2_quantized_qat", - "pyhpc_turbulent_kinetic_energy", - "resnet50_quantized_qat", # Eager model failed to run(Quantize only works on Float Tensor, got Double) - "sage", # does not work with fp32 - # Huggingface - "MBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94793 - "PLBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94794 - # TIMM - "cait_m36_384", # Accuracy - "pnasnet5large", # OOM - "xcit_large_24_p8_224", # OOM https://github.com/pytorch/pytorch/issues/95984 - "opacus_cifar10", # Fails to run https://github.com/pytorch/pytorch/issues/99201 -] - -CI_SKIP[CI("inductor", training=True)] = [ - *CI_SKIP[CI("inductor", training=False)], - # TorchBench - "Background_Matting", # fp64_OOM - "hf_T5_base", # accuracy - "mobilenet_v3_large", # accuracy - "resnet50_quantized_qat", # Eager model failed to run - "AlbertForQuestionAnswering", # accuracy - 
"crossvit_9_240", # fails to run on timm 0.8.22 with cudagraphs, mempools - "deit_base_distilled_patch16_224", # fails to run in timm 0.8.22, cudagraphs - "mobilevit_s", - "pit_b_224", - "twins_pcpvt_base", - "visformer_small", - "vit_base_patch16_224", - "xcit_large_24_p8_224", -] - -# Skips for dynamic=True - -CI_SKIP[CI("aot_eager", training=False, dynamic=True)] = [ - *CI_SKIP[CI("aot_eager", training=False)], - "vision_maskrcnn", # accuracy failure on boxes, after https://github.com/pytorch/pytorch/issues/101093 - # https://github.com/pytorch/pytorch/issues/103760 - "hf_T5_generate", - "hf_Bert", # Error: RelaxedUnspecConstraint(L['input_ids'].size()[0]) - inferred constant (4) -] - -CI_SKIP[CI("aot_eager", training=True, dynamic=True)] = [ - *CI_SKIP[CI("aot_eager", training=True)], - *CI_SKIP[CI("aot_eager", training=False, dynamic=True)], - "llama", # AssertionError: cannot compute free_symbols of True - "torchrec_dlrm", # RuntimeError: mat1 and mat2 must have the same dtype, but got Float and BFloat16 -] - -CI_SKIP[CI("inductor", training=False, dynamic=True)] = [ - *CI_SKIP[CI("aot_eager", training=False, dynamic=True)], - *CI_SKIP[CI("inductor", training=False)], - "nanogpt", # Assertion `index out of bounds: 0 <= tmp0 < 64` failed. -] - -CI_SKIP[CI("inductor", training=True, dynamic=True)] = [ - # NB: Intentionally omitting for symmetry with dynamic=False - # *CI_SKIP[CI("aot_eager", training=True, dynamic=True)], - *CI_SKIP[CI("inductor", training=False, dynamic=True)], - *CI_SKIP[CI("inductor", training=True)], - "levit_128", # Accuracy fails on A10G, passes on A100 - "sebotnet33ts_256", # Flaky accuracy failed -] - -CI_SKIP[CI("inductor", training=False, dynamic=True, device="cpu")] = [ - *CI_SKIP[CI("inductor", training=False, device="cpu")], - "pyhpc_isoneutral_mixing", - "dpn107", -] - -CI_SKIP_OPTIMIZER = { - # TIMM - "convmixer_768_32", # accuracy - "hrnet_w18", # Stack issue in fx - # HF - "pnasnet5large", # Stack issue in fx - "MobileBertForMaskedLM", # Stack issue in fx - "MobileBertForQuestionAnswering", # Stack issue in fx - "PegasusForConditionalGeneration", # OOM -} - -CI_SKIP_DYNAMIC_BATCH_ONLY = { - "sam", - # See https://github.com/mindee/doctr/blob/f2114758d529ed8d3d0030581638f0520b6b98d8/doctr/models/detection/core.py#L89 - # It iterates over the batch, which is dynamic, and dynamo chokes - # We should be able to graphbreak there. - "doctr_det_predictor", - "dlrm", -} - - -def model_specified_by_path(path_and_class_str): - return ":" in path_and_class_str - - -def load_model_from_path(path_and_class_str): - configs = {} - for kvstr in path_and_class_str.split(","): - k, v = kvstr.split(":") - configs[k] = v - - for name in ["path", "class"]: - if name not in configs: - raise RuntimeError( - "Invalid --only arguments. 
Check help message for the correct format" - ) - - path = configs["path"] - class_name = configs["class"] - - if path[:1] != "/": - raise RuntimeError( - "Use absolute path since dynamo may change the current working directory which makes using relative path tricky" - ) - - spec = importlib.util.spec_from_file_location("module_name", path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - - model_class = getattr(module, class_name) - assert issubclass(model_class, torch.nn.Module) - model = model_class() - assert hasattr(model, "get_example_inputs") - inputs = model.get_example_inputs() - return model, inputs - - -def output_csv(filename, headers, row): - if os.path.exists(filename): - with open(filename) as fd: - lines = list(csv.reader(fd)) or [[]] - if headers and len(headers) > len(lines[0]): - # if prior results failed the header might not be filled in yet - lines[0] = headers - else: - headers = lines[0] - else: - lines = [headers] - lines.append([(f"{x:.6f}" if isinstance(x, float) else x) for x in row]) - with open(filename, "w") as fd: - writer = csv.writer(fd, lineterminator="\n") - for line in lines: - writer.writerow(list(line) + ["0"] * (len(headers) - len(line))) - - -def nothing(f): - return f - - -@functools.lru_cache(None) -def patch_torch_manual_seed(): - """Make torch manual seed deterministic. Helps with accuracy testing.""" - - def deterministic_torch_manual_seed(*args, **kwargs): - from torch._C import default_generator - - seed = 1337 - import torch.cuda - - if not torch.cuda._is_in_bad_fork(): - torch.cuda.manual_seed_all(seed) - return default_generator.manual_seed(seed) - - torch.manual_seed = deterministic_torch_manual_seed - - -def synchronize(): - pass - - -def summarize_graph_break(filename): - """ - Sorts and de-dupes the graphs breaks on the reason string. Note that this - function is just a best effort to reduce the logging information. We could - miss some graph breaks because of de-duping. We can further refine this - function as need arises. 
- """ - log_file = f"{filename.rstrip('.csv')}_graph_breaks.csv" - if os.path.exists(log_file): - df = pd.read_csv(log_file) - df = df.sort_values("reason").drop_duplicates(subset="reason") - - # Specialize for multi tensor sgd as reason is not identical - multi_tensor_sgd_row = df.loc[df["reason"].str.contains("_multi_tensor_sgd")] - if len(multi_tensor_sgd_row): - df = df[ - ~df["reason"].str.contains("_multi_tensor_sgd") - ] # Drop all sgd rows - df = pd.concat( - [df, pd.DataFrame([multi_tensor_sgd_row.iloc[0]])], axis=0 - ) # Add back a single row - df.to_csv(f"{log_file.rstrip('.csv')}_deduped.csv", index=False) - - -def print_summary(filename, print_dataframe=False): - if not (filename and os.path.exists(filename)): - return - data = pd.read_csv(filename) - if "tag" in data.columns: - for tag in data.tag.unique(): - if tag == "0.0000": - continue # This happens for failed runs - print(f"\nSummary for tag={tag}:") - print_summary_table(data[data.tag == tag], print_dataframe=print_dataframe) - else: - print_summary_table(data, print_dataframe=print_dataframe) - summarize_graph_break(filename) - - -def print_summary_table(data, print_dataframe=False): - if print_dataframe: - pd.options.display.max_rows = 1000 - pd.options.display.max_columns = 1000 - pd.options.display.width = 2000 - print(data) - width = max(map(len, data.columns)) - for col in data.columns: - try: - if col in ("dev", "name", "batch_size", "tag"): - continue - elif col in ("pct_ops", "pct_time"): - print(col.ljust(width), f"{data[col].mean():.3%}") - elif col in ("graphs", "graph_calls", "captured_ops", "total_ops"): - print(col.ljust(width), f"{data[col].mean():.3f}") - elif col in ("compilation_latency"): - print(col.ljust(width), f"mean={data[col].mean():.3f} seconds") - elif col in ("compression_ratio"): - print(col.ljust(width), f"mean={data[col].mean():.3f}x") - elif col in ("accuracy"): - pass_rate = (data[col] == "pass").mean() - print(col.ljust(width), f"pass_rate={100*pass_rate:.2f}%") - else: - cdata = data[col] - print( - col.ljust(width), - f"gmean={gmean(cdata):.2f}x mean={cdata.mean():.3f}x", - ) - except Exception as e: - pass - - -def tensor_is_on_xla(tensors): - def visit(x: torch.Tensor): - nonlocal result - if x.device.type == "xla": - result = True - - result = False - tree_map_only(torch.Tensor, visit, tensors) - return result - - -def timed( - model, - model_iter_fn, - example_inputs, - times=1, - return_result=False, - collect_outputs=False, -): - use_xla = tensor_is_on_xla(example_inputs) - synchronize() - - if use_xla: - xm.mark_step() - xm.wait_device_ops() - - time_total = 0 - # Dont collect outputs to correctly measure timing - for _ in range(times): - # Put this call inside the loop to reset the seed for each iteration. - # Don't include reset_rng_state() to correctly measure timing - reset_rng_state(use_xla) - t_iter_begin = time.perf_counter() - result = model_iter_fn(model, example_inputs, collect_outputs=collect_outputs) - - # instead of calling sync on result_list, we should call mark_step. - # In training case, result_list may be empty, but we want to - # send all the pending graphs for compilation. - if use_xla: - # For the model running on regular torchxla (baseline), we need the - # mark step to send the accumulated graph for compilation. - # - # For the model running with dynamo/torchxla bridge, in training case, - # we need the mark step to send the optimizer graph out for - # compilation. 
- xm.mark_step() - t_iter_end = time.perf_counter() - time_total += t_iter_end - t_iter_begin - - t_0 = time.perf_counter() - if use_xla: - xm.wait_device_ops() - synchronize() - t_1 = time.perf_counter() - time_total += t_1 - t_0 - return (time_total, result) if return_result else time_total - - -def _normalize_bench_inputs(example_inputs) -> Tuple[Tuple[Any], Mapping[str, Any]]: - # NOTE(bowbao): For huggingface benchmark, example_inputs are formatted as dictionary, - # and consumed like `model(**example_inputs)`. - # For other benchmarks, example_inputs are formatted as tuple and consumed - # like `model(*example_inputs)`. - if isinstance(example_inputs, dict): - return (), example_inputs - else: - return tuple(example_inputs), {} - - -def _register_dataclass_output_as_pytree(example_outputs) -> None: - # NOTE(angelayi): For huggingface benchmark, some example outputs are - # formatted as a dataclass which pytree cannot consume. So we want - # to register the pytree implementation here - example_outputs_flat, _ = pytree.tree_flatten(example_outputs) - output_dataclass_types = [ - type(out) for out in example_outputs_flat if dataclasses.is_dataclass(type(out)) - ] - for output_type in output_dataclass_types: - from torch._export.utils import register_dataclass_as_pytree_node - - register_dataclass_as_pytree_node(output_type) - - -class Stats: - totals = collections.defaultdict(collections.Counter) - - @classmethod - def reset_counters(cls): - for k, v in torch._dynamo.utils.counters.items(): - cls.totals[k].update(v) - ok = torch._dynamo.utils.counters["frames"]["ok"] - total = torch._dynamo.utils.counters["frames"]["total"] - torch._dynamo.utils.counters.clear() - return ok, total - - @classmethod - def print_summary(cls): - for k, v in sorted(cls.totals.items()): - lines = "\n ".join(map(str, v.most_common(50))) - print(f"STATS {k}\n {lines}") - - @classmethod - def aot_summary(cls): - return [cls.totals["aot_autograd"]["total"], cls.totals["aot_autograd"]["ok"]] - - -def coverage_experiment(args, model_iter_fn, model, example_inputs): - """ - Test operator/model coverage of TorchDynamo and record statistics - taken from a profiler. This target is mainly intended to check - correctness. - - Writes to ./coverage.csv - """ - profiler = Profiler() - frozen_model_iter_fn = torch._dynamo.run(model_iter_fn) - with profiler.prof: - frozen_model_iter_fn(model, example_inputs) - coverage_result = profiler.results() - output_csv( - output_filename, - ( - "dev", - "name", - "batch_size", - "graphs", - "graph_calls", - "captured_ops", - "total_ops", - "pct_ops", - "pct_time", - ), - [ - current_device, - current_name, - current_batch_size, - ] - + coverage_result.tocsv(), - ) - return coverage_result - - -def speedup_experiment_fx2trt(args, model_iter_fn, model, example_inputs): - """ - Measure speedups over eager using the trt inference backend. TRT backend is based fx graph - generated by torch._dynamo. 
- Writes to ./speedups_fx2trt.csv - """ - return speedup_experiment(args, model_iter_fn, model, example_inputs) - - -def recompile_profiler_experiment(args, model_iter_fn, model, example_inputs): - with torch._dynamo.utils.CompileProfiler() as prof: - opt_model_iter_fn = torch._dynamo.optimize(prof, nopython=args.nopython)( - model_iter_fn - ) - opt_model_iter_fn(model, example_inputs) - output_csv( - output_filename, ["model", "profiler report"], [current_name, prof.report()] - ) - met = prof.get_metrics() - guard_failures = len(met["guard_failures"]) - return [guard_failures] - - -def randomize_input(inputs): - if isinstance(inputs, (list, tuple)): - return type(inputs)([randomize_input(x) for x in inputs]) - elif isinstance(inputs, torch.Tensor): - if inputs.dtype in (torch.float32, torch.float64): - torch._dynamo.utils.counters["randomize_input"]["times"] += 1 - return torch.randn_like(inputs) - elif inputs.dtype == torch.int64: - # Note: we can not simply tune integer tensors as follows - # `return torch.randint_like(inputs, high=inputs.max().item())` - # This may break some invariants between tensors. - # E.g. in embedding lookup case, one tensor is the length - # and another is an indices tensor. - return inputs - else: - raise RuntimeError( - f"randomize_input need support tensor of type {inputs.dtype}" - ) - else: - raise RuntimeError( - f"randomize_input can not handle input of type {type(inputs)}" - ) - - -def maybe_mark_step(args): - if args.trace_on_xla: - xm.mark_step() - - -def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs): - """ - Measure speedups over eager. - - Writes to ./speedups.csv - """ - # if args.dynamic_shapes: - # return speedup_experiment_ds(args, model_iter_fn, model, example_inputs) - - timings = np.zeros((args.repeat, 2), np.float64) - # if we randomize the input, we should also check the result is correct - should_check_result = should_randomize_input = args.randomize_input - - import contextlib - - from torch._inductor.utils import maybe_profile - - @contextlib.contextmanager - def maybe_mark_profile(*args, **kwargs): - prof: torch.profiler.profile = kwargs.pop("p", None) - mark = kwargs.pop("mark", None) - if prof: - with torch.profiler.record_function(mark): - yield - else: - yield - - times = args.iterations_per_run - - # Use higher tolerance for XLA since XLA cause numerical unstability when - # graph size changes - tolerance = args.xla_tolerance if args.trace_on_xla else 1e-4 - torch._dynamo.config.repro_tolerance = tolerance - - with maybe_profile(args.export_profiler_trace) as p: - if args.export_aot_inductor: - frozen_model_iter_fn = export_aot_inductor(model_iter_fn) - else: - frozen_model_iter_fn = torch._dynamo.run(model_iter_fn) - - for rep in trange(args.repeat, desc="running benchmark"): - inputs = ( - randomize_input(copy.deepcopy(example_inputs)) - if should_randomize_input - else example_inputs - ) - # need call mark_step to perform the computation - # on randomize_input. Otherwise the first call using the - # inputs will incur high penalty then the next one. - maybe_mark_step(args) - - # interleave the runs to handle frequency scaling and load changes - with maybe_mark_profile(p=p, mark="expected"): - timings[rep, 0], expected_output = timed( - model, - model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - - # call mark_step between the 2 calls to make the comparison fair. 
- maybe_mark_step(args) - - with maybe_mark_profile(p=p, mark="actual"): - timings[rep, 1], actual_output = timed( - model, - frozen_model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - - if should_check_result: - is_correct = is_correct and same( - expected_output, actual_output, tol=tolerance - ) - - if args.export_profiler_trace: - name = args.profiler_trace_name + "_" + model.name + ".json" - name = os.path.join(torch._dynamo.config.base_dir, name) - p.export_chrome_trace(name) - median = np.median(timings, axis=0) - speedup = median[0] / median[1] - if args.dump_raw_metrics: - np.save( - f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy", - timings, - ) - - first_headers = ["dev", "name", "batch_size"] - first_fields = [current_device, current_name, current_batch_size] - if "tag" in kwargs: - first_headers.append("tag") - first_fields.append(kwargs["tag"]) - headers = first_headers + ["speedup", "abs_latency"] - row = first_fields + [float(speedup), median[1] * 1000] - msg = f"{speedup:.3f}x" - if args.baseline: - headers.extend( - [ - "baseline", - "speedup_vs_baseline", - ] - ) - df = pd.read_csv(args.baseline) - try: - baseline_speedup = df[df["name"] == current_name]["speedup"].item() - row.extend([baseline_speedup, speedup / baseline_speedup]) - msg = f"{baseline_speedup:.3f}x -> {speedup:.3f}x [{speedup / baseline_speedup:.3f}x]" - except (KeyError, ZeroDivisionError): - row.extend( - [ - 0.0, - 0.0, - ] - ) - if "compilation_latency" in kwargs: - headers += [ - "compilation_latency", - "compression_ratio", - "eager_peak_mem", - "dynamo_peak_mem", - ] - row.append(kwargs["compilation_latency"]) - row.append(kwargs["compression_ratio"]) - row.append(kwargs["eager_peak_mem"]) - row.append(kwargs["dynamo_peak_mem"]) - if "dynamo_stats" in kwargs: - for k, v in kwargs["dynamo_stats"].items(): - headers.append(k) - row.append(v) - output_csv( - output_filename, - headers, - row, - ) - headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True) - assert ( - output_filename.find(".csv") > 0 - ), f"expected output_filename to be a .csv, but got {output_filename}" - output_csv( - output_filename[:-4] + "_compilation_metrics.csv", - first_headers + headers, - first_fields + data, - ) - return msg - - -def speedup_experiment_ds(args, model_iter_fn, model, example_inputs): - """ - Run dynamic shapes benchmarks. - - Requires dynamic shape compatible models, which provide a list of example inputs. - - Warms up using the first input example and then iterates the inputs, - measuring (and expecting minimal) variance between the runtime for different examples. - - """ - timings = np.zeros((args.repeat, len(example_inputs), 2), np.float64) - - if args.repeat > 5: - print( - f"\ndynamic shapes experiments are slow, consider setting --repeat less than {args.repeat}\n" - ) - - nwarmup = 4 - for rep in range(args.repeat): - # Start each rep fresh, e.g. 
only warmup on example 0 - torch._dynamo.reset() - optimized_model_iter_fn = optimize_ctx(model_iter_fn) - for _ in range(nwarmup): - optimized_model_iter_fn(model, example_inputs[0]) - - for input_idx, inputs in enumerate(example_inputs): - # interleave the runs to handle frequency scaling and load changes - timings[rep, input_idx, 0] = timed( - model, model_iter_fn, inputs, return_result=False - ) - # different from regular speedup_experiment, we _DO_ want to allow recompilation - timings[rep, input_idx, 1] = timed( - model, optimized_model_iter_fn, inputs, return_result=False - ) - medians = np.median(timings, axis=0) - speedups = list(medians[:, 0] / medians[:, 1]) - speedups_mean = np.mean(speedups) - speedups_median = np.median(speedups) - speedups_var = np.var(speedups) - - # TODO this x[0] is not going to work in general but bert only has 1 input - shapes = [x[0].shape for x in example_inputs] - shape_keys = sorted(set(shapes)) - shape_speedups = { - shape: [ - it[1] for it in filter(lambda it: it[0] == shape, zip(shapes, speedups)) - ] - for shape in shape_keys - } - output_str = ( - f"mean: {speedups_mean:.3f}, median: {speedups_median:.3f}, var: {speedups_var:.3f}" - + "\nSpeedups by shape: " - + "\n".join( - [ - f"{shape}: " - + ", ".join([f"{speedup: .3g}" for speedup in shape_speedups[shape]]) - for shape in shape_keys - ] - ) - ) - output_csv( - output_filename, - ("dev", "name", "batch_size", "speedup mean", "speedup median", "speedup var"), - [ - current_device, - current_name, - current_batch_size, - speedups_mean, - speedups_median, - speedups_var, - ], - ) - return output_str - - -def speedup_experiment_onnx( - onnx_model_cls: Type[OnnxModelFromTorchScript], - args, - model_iter_fn, - model, - example_inputs, - **kwargs, -): - """ - Measure speedups over eager. - - This function is responsible for the following: - 1. Creation of OnnxModel, which handles export, ort initialization. - 2. Creating iobinding with OnnxModel if device is CUDA, which is essential for perf measurement. - 3. Running ORT with OnnxModel. - - Writes to ./{output_filename}, which should be - `pathlib.Path(self.output_dir) / f"{self.compiler}_{suite}_{self.dtype}_{self.mode}_{self.device}_{self.testing}.csv". - - TODO(bowbao): Record export time and export peak memory usage. - """ - timings = np.zeros((args.repeat, 2), np.float64) - is_correct = True - should_randomize_input = args.randomize_input - times = args.iterations_per_run - - onnx_model = onnx_model_cls( - args.output_directory or ".", model, copy.deepcopy(example_inputs) - ) - - def create_onnx_input_binded_fn( - onnx_model: OnnxModelFromTorchScript, pt_inputs, example_outputs - ): - # Goal is to move the iobinding creation outside of the timer function. 
- iobinding, outputs = onnx_model.create_iobinding(pt_inputs, example_outputs) - - def onnxrt_model_iter_fn(model, inputs, collect_outputs=True): - onnx_model.run_with_iobinding(iobinding, outputs) - if collect_outputs: - return outputs - - return onnxrt_model_iter_fn - - def create_onnx_fn(onnx_model: OnnxModelFromTorchScript, pt_inputs): - def onnxrt_model_iter_fn(model, inputs, collect_outputs=True): - return onnx_model.run(pt_inputs) - - return onnxrt_model_iter_fn - - for rep in range(args.repeat): - inputs = ( - randomize_input(copy.deepcopy(example_inputs)) - if should_randomize_input - else example_inputs - ) - timings[rep, 0], expected_output = timed( - model, - model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - - if current_device == "cpu": - onnxrt_model_iter_fn = create_onnx_fn(onnx_model, inputs) - else: - onnxrt_model_iter_fn = create_onnx_input_binded_fn( - onnx_model, inputs, expected_output - ) - - timings[rep, 1], actual_output = timed( - model, - onnxrt_model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - - pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue - median = np.median(timings, axis=0) - speedup = median[0] / median[1] - if args.dump_raw_metrics: - np.save( - f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy", - timings, - ) - - headers = ["dev", "name", "batch_size", "speedup", "abs_latency"] - row = [ - current_device, - current_name, - current_batch_size, - float(speedup), - median[1] * 1000, - ] - if "compilation_latency" in kwargs: - headers = headers + ["compilation_latency", "compression_ratio"] - row.append(kwargs["compilation_latency"]) - row.append(kwargs["compression_ratio"]) - - output_csv( - output_filename, - headers, - row, - ) - headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True) - assert ( - output_filename.find(".csv") > 0 - ), f"expected output_filename to be a .csv, but got {output_filename}" - output_csv( - output_filename[:-4] + "_compilation_metrics.csv", - ["dev", "name", "batch_size"] + headers, - [current_device, current_name, current_batch_size] + data, - ) - return format_speedup(speedup, pvalue, is_correct=is_correct) - - -def overhead_experiment(*args, model_iter_fn): - """ - Measure overheads of TorchDynamo by running with no backend (only - eager+FX), and reporting speedup/slowdown over eager. - - Writes to ./overheads.csv - """ - return speedup_experiment(*args, model_iter_fn) - - -def print_fx(gm, example_inputs): - print(gm.graph) - return gm - - -def print_aten_ops(gm, example_inputs): - from functorch.compile import aot_module - - def trace_printer(gm, _): - print(gm.graph) - return gm - - return aot_module(gm, fw_compiler=trace_printer, bw_compiler=trace_printer) - - -def baselines(models, model_iter_fn, example_inputs, args): - """ - Common measurement code across all baseline experiments. 
- """ - models = list(models) - for idx, (name, model) in enumerate(models): - if idx == 0: - result0 = model_iter_fn(model, example_inputs) - elif model is not None: - try: - result = model_iter_fn(model, example_inputs) - if same(result0, result): - continue - print(name, "is INCORRECT") - except Exception: - log.exception("error checking %s", name) - models[idx] = (name, None) - timings = np.zeros((args.repeat, len(models)), np.float64) - timings.fill(1.0e10) - for rep in range(args.repeat): - for idx, (name, model) in enumerate(models): - if model is not None: - try: - timings[rep, idx] = timed(model, model_iter_fn, example_inputs) - except Exception: - pass - pvalue = [ - ttest_ind(timings[:, 0], timings[:, i]).pvalue - for i in range(1, timings.shape[1]) - ] - median = np.median(timings, axis=0) - speedup = median[0] / median[1:] - for idx, (name, model) in enumerate(models[1:]): - if model is None: - speedup[idx] = 0.0 - result = " ".join( - [ - format_speedup(s, p, m is not None) - for s, p, m in zip(speedup, pvalue, [m for n, m in models[1:]]) - ] - ) - output_csv( - output_filename, - ("dev", "name", "batch_size") + tuple(n for n, m in models[1:]), - [current_device, current_name, current_batch_size] - + [f"{x:.4f}" for x in speedup], - ) - return result - - -def xla(args, model_iter_fn, model, example_inputs): - xla_dev = xm.xla_device(devkind=current_device) - model_xla = copy.deepcopy(model).to("cpu").to(device=xla_dev) - example_inputs_xla = tree_map_only( - torch.Tensor, lambda x: x.to("cpu").to(device=xla_dev), example_inputs - ) - for _ in range(3): # warmup - timed(model, model_iter_fn, example_inputs) - timed(model_xla, model_iter_fn, example_inputs_xla) - timings = np.zeros((args.repeat, 2), np.float64) - timings.fill(1.0e10) - for rep in range(args.repeat): - timings[rep, 0] = timed(model, model_iter_fn, example_inputs) - timings[rep, 1] = timed(model_xla, model_iter_fn, example_inputs_xla) - - pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue - time_baseline, time_xla = np.median(timings, axis=0) - speedup = time_baseline / time_xla - output_csv( - output_filename, - ("dev", "name", "batch_size", "speedup", "time_baseline", "time_xla"), - [ - current_device, - current_name, - current_batch_size, - speedup, - time_baseline, - time_xla, - ], - ) - return format_speedup(speedup, pvalue) - - -def try_script(model, example_inputs): - try: - return torch.jit.script(model) - except Exception: - return None - - -class AOTInductorModelCache: - cache = dict() - - @classmethod - def load(cls, model, example_inputs, eager_forward): - key = id(model) - if key not in cls.cache: - # Register the output dataclass to pytree - example_outputs = eager_forward( - copy.deepcopy(model), clone_inputs(example_inputs) - ) - _register_dataclass_output_as_pytree(example_outputs) - - example_args, example_kwargs = _normalize_bench_inputs(example_inputs) - example_inputs = torch._export.combine_args_kwargs( - example_args, example_kwargs - ) - - so_path, exported = torch._export.aot_compile( - model, example_args, example_kwargs - ) - - output_node = list(exported.graph.nodes)[-1] - output_tensors = [ - torch.empty( - node.meta["val"].size(), - dtype=node.meta["val"].dtype, - layout=node.meta["val"].layout, - device=node.meta["val"].device, - ) - for node in output_node.args[0] - ] - - # Use a utility function for easier benchmarking - source = """ - #include - - torch::aot_inductor::AOTInductorModel model; - - void run( - const std::vector& input_tensors, - std::vector& output_tensors) { - 
model.run(input_tensors, output_tensors, at::cuda::getCurrentCUDAStream()); - } - """ - module = torch.utils.cpp_extension.load_inline( - name="aot_inductor", - cpp_sources=[source], - functions=["run"], - extra_ldflags=[so_path], - with_cuda=True, - ) - - value = { - "module": module, - "exported": exported, - "output_tensors": output_tensors, - "output_spec": exported.call_spec.out_spec, - } - cls.cache[key] = value - - return ( - cls.cache[key]["module"], - cls.cache[key]["exported"], - cls.cache[key]["output_tensors"], - cls.cache[key]["output_spec"], - ) - - -def export_aot_inductor(forward: Callable): - eager_forward = forward - - def opt_aot_inductor(model, example_inputs, collect_outputs=False): - module, exported, output_tensors, output_spec = AOTInductorModelCache.load( - model, example_inputs, eager_forward - ) - param_buffer_values = list(exported.state_dict.values()) - example_args, example_kwargs = _normalize_bench_inputs(example_inputs) - example_inputs = torch._export.combine_args_kwargs(example_args, example_kwargs) - flat_example_inputs = fx_pytree.tree_flatten_spec( - example_inputs, exported.call_spec.in_spec - ) - all_args = (*param_buffer_values, *flat_example_inputs) - module.run(all_args, output_tensors) - return pytree.tree_unflatten(output_tensors, output_spec) - - return opt_aot_inductor - - -def download_retry_decorator(download_fn): - """ - Decorator function for applying retry logic to a download function. - - The wrapped function will be called up to 5 times and raises an exception if the function fails each time. - After each unsuccessful attempt, there is a delay before the next attempt, which is increased linearly with the number of tries. - - Usage: - @download_retry_decorator - def download_function(model_name: str): - # download logic goes here - """ - - @functools.wraps(download_fn) - def wrapper(self, *args, **kwargs) -> Any: - tries = 0 - total_allowed_tries = MAX_DOWNLOAD_ATTEMPTS - while tries <= total_allowed_tries: - try: - model = download_fn(self, *args, **kwargs) - return model - except Exception as e: - tries += 1 - if tries <= total_allowed_tries: - wait = tries * 30 - print( - f"Failed to load model: {e}. Trying again ({tries}/{total_allowed_tries}) after {wait}s" - ) - time.sleep(wait) - else: - raise RuntimeError( - f"Failed to load model '{args}' with following error(s): {str(e)}." - ) - - return wrapper - - -class OnnxModelFromTorchScript: - """TorchScript based onnx export. `torch.onnx.export` - - TODO(bowbao): - * large model export failed. - Onnx Model is larger than 2GB, but exporter makes decision based pt model size, which is - smaller than 2GB. - * OOM on slightly larger model. - Both pt model and ort inference session are on gpu. Attempt has been made to move ORT to - cuda:1, however ORT perf drop significantly. - For now running everything with batch_size 1 set in launch script. 
- """ - - TORCH_TO_NUMPY_DTYPE = { - torch.float16: np.float16, - torch.float32: np.float32, - torch.float64: np.float64, - torch.uint8: np.uint8, - torch.int8: np.int8, - torch.int16: np.int16, - torch.int32: np.int32, - torch.int64: np.longlong, - torch.bool: np.bool_, - } - - def __init__(self, output_directory, model, example_inputs): - self.model_path = self._generate_onnx_model_path(output_directory) - self._export( - model, - example_inputs, - self.model_path, - opset_version=17, - do_constant_folding=False, - verbose=False, - ) - self.onnx_session = self._init_ort_session(self.model_path) - - def _generate_onnx_model_path( - self, output_directory: str, onnx_model_folder_name: str = "bench_onnx_models" - ) -> str: - # Hack to get model name. - from torch._functorch import aot_autograd - - model_name = aot_autograd.model_name - model_path = pathlib.Path(output_directory, onnx_model_folder_name, model_name) - if model_path.exists() and model_path.is_dir(): - shutil.rmtree(model_path) - model_path.mkdir(parents=True, exist_ok=True) - return str(model_path / "model.onnx") - - def _export(self, model, example_inputs, output_path: str, /, **kwargs) -> None: - # Hack for huggingface models (kwargs only). - if isinstance(example_inputs, dict): - - class WrapperModel(torch.nn.Module): - def __init__(self, model, keys): - super().__init__() - self.model = model - self.keys = keys - - def forward(self, *args): - return self.model(**dict(zip(self.keys, args))) - - model = WrapperModel(model, list(example_inputs.keys())) - - torch.onnx.export( - model, - self.format_pt_inputs(example_inputs), - output_path, - **kwargs, - ) - - def _init_ort_session(self, model_path: str): - import onnxruntime - - if current_device == "cpu": - ort_providers = ["CPUExecutionProvider"] - else: - # NOTE(bowbao): Reduce OOM by running ORT on another gpu. - # TODO(bowbao): This works to avoid OOM, but performance is surprisingly very bad. - # cuda_provider_options = { - # "device_id": 1 if torch.cuda.device_count() > 1 else 0, - # } - # ort_providers = [("CUDAExecutionProvider", cuda_provider_options)] - ort_providers = ["CUDAExecutionProvider"] - - ort_session = onnxruntime.InferenceSession( - self.model_path, - providers=ort_providers, - ) - return ort_session - - def format_pt_inputs(self, pt_inputs): - # NOTE(bowbao): For huggingface benchmark, pt_inputs are formatted as dictionary, - # and consumed like `model(**pt_inputs)`. - # For other benchmarks, pt_inputs are formatted as tuple and consumed - # like `model(*pt_inputs)`. 
- if isinstance(pt_inputs, dict): - pt_inputs = list(pt_inputs.values()) - if isinstance(pt_inputs, torch.Tensor): - pt_inputs = (pt_inputs,) - return tuple(arg.contiguous() for arg in pt_inputs) - - def format_pt_outputs(self, pt_outputs): - if isinstance(pt_outputs, torch.Tensor): - pt_outputs = (pt_outputs,) - - pt_outputs, _ = pytree.tree_flatten(pt_outputs) - - # Hack for huggingface model outputs - try: - from transformers import modeling_outputs - except ImportError: - pass - else: - - def _to_tuple(x): - if isinstance(x, modeling_outputs.ModelOutput): - return x.to_tuple() - return x - - pt_outputs = pytree.tree_map(_to_tuple, pt_outputs) - pt_outputs, _ = pytree.tree_flatten(pt_outputs) - - return pt_outputs - - def create_outputs(self, *example_outputs): - return tuple(torch.empty_like(x) for x in example_outputs) - - def create_iobinding(self, pt_inputs, example_outputs): - pt_inputs = self.format_pt_inputs(pt_inputs) - example_outputs = self.format_pt_outputs(example_outputs) - - iobinding = self.onnx_session.io_binding() - args = [arg.contiguous() for arg in pt_inputs] - for ort_input, arg in zip(self.onnx_session.get_inputs(), args): - # NOTE: Small hack to reduce OOM issue by running ORT on another device. - # Disabled due to ORT perf regression. - # if torch.cuda.device_count() > 1: - # arg = arg.detach().to("cuda:1") - device = arg.device - iobinding.bind_input( - ort_input.name, - device.type, - device.index or 0, - self.TORCH_TO_NUMPY_DTYPE[arg.dtype], - arg.size(), - arg.data_ptr(), - ) - - outputs = self.create_outputs(*example_outputs) - for ort_output, output in zip(self.onnx_session.get_outputs(), outputs): - # if torch.cuda.device_count() > 1: - # output = output.detach().to("cuda:1") - device = output.device - iobinding.bind_output( - ort_output.name, - device.type, - device.index or 0, - self.TORCH_TO_NUMPY_DTYPE[output.dtype], - output.size(), - output.data_ptr(), - ) - return iobinding, outputs - - def run_with_iobinding(self, iobinding, outputs): - # 'outputs' are torch empty tensors binded to 'iobinding'. - self.onnx_session.run_with_iobinding(iobinding) - return outputs - - def run(self, pt_inputs): - # NOTE: For CUDA performance testing, use `run_with_iobinding` to exclude memory - # copying overhead for inputs/outputs between cpu and gpu. - # Otherwise perf number is inaccurate. - pt_inputs = self.format_pt_inputs(pt_inputs) - onnx_inputs = { - ort_input.name: pt_input.cpu().numpy() - for ort_input, pt_input in zip(self.onnx_session.get_inputs(), pt_inputs) - } - ort_outputs = self.onnx_session.run(None, onnx_inputs) - pt_outputs = [ - torch.from_numpy(ort_output).to(current_device) - for ort_output in ort_outputs - ] - if len(pt_outputs) == 1: - return pt_outputs[0] - return pt_outputs - - -class OnnxModelFromDynamo(OnnxModelFromTorchScript): - """Dynamo and Fx based export. 
`torch.onnx.dynamo_export`.""" - - def __init__(self, output_directory, model, example_inputs): - self.model_path = self._generate_onnx_model_path( - output_directory, "bench_dynamo_onnx_model" - ) - self._export_output = self._export(model, example_inputs, self.model_path) - self.onnx_session = self._init_ort_session(self.model_path) - - def _export( - self, model, example_inputs, output_path: str - ) -> torch.onnx.ExportOutput: - example_args, example_kwargs = _normalize_bench_inputs(example_inputs) - options = torch.onnx.ExportOptions() - export_output = torch.onnx.dynamo_export( - model, *example_args, **example_kwargs, export_options=options - ) - - export_output.save(output_path) - return export_output - - def format_pt_inputs(self, pt_inputs): - pt_args, pt_kwargs = _normalize_bench_inputs(pt_inputs) - return self._export_output.adapt_torch_inputs_to_onnx(*pt_args, **pt_kwargs) - - def format_pt_outputs(self, pt_outputs): - return self._export_output.adapt_torch_outputs_to_onnx(pt_outputs) - - -def optimize_onnx_ctx( - output_directory: str, - onnx_model_cls: Type[OnnxModelFromTorchScript], - run_n_iterations: Callable, -) -> Callable: - # NOTE(bowbao): This function creates and returns the onnx version of 'run_n_iterations', - # which does the following: - # 1. Export and cache model. - # 2. Create iobinding for ORT. - # 3. Run ORT for n iterations. - onnx_model: Optional[OnnxModelFromTorchScript] = None - - def run_n_iterations_onnx(model, inputs, n=2): - from _onnx import reporter - from torch.onnx._internal import exporter - from torch.onnx._internal.fx import diagnostics - - # NOTE(bowbao): Capture all export & ort errors and diagnostics. - # Serialize to csv, to be parsed and summarized later by '._onnx/reporter.py'. - # TODO: Accuracy mismatch is not reported here in csv. - assert ( - output_filename.find(".csv") > 0 - ), f"expected output_filename to be a .csv, but got {output_filename}" - output_error_filename = output_filename[:-4] + "_export_error.csv" - parser = reporter.ExportErrorParser( - current_device, current_name, current_batch_size - ) - try: - nonlocal onnx_model - if onnx_model is None: - onnx_model = onnx_model_cls( - output_directory, model, copy.deepcopy(inputs) - ) - - for _ in range(n - 1): - onnx_model.run(inputs) - return onnx_model.run(inputs) - except exporter.OnnxExporterError as e: - # `torch.onnx.dynamo_export` raises error that encloses diagnostics. - diagnostic_context = e.diagnostic_context - for parsed_error in parser.parse_diagnostic_context(diagnostic_context): - output_csv( - output_error_filename, parsed_error.headers, parsed_error.row - ) - - # Check also the raw exception that caused export failure. - # Skip if it is already analyzed by diagnostics. - cause_of_exception = e.__cause__ - if not isinstance( - cause_of_exception, diagnostics.RuntimeErrorWithDiagnostic - ): - parsed_error = parser.parse_exception(cause_of_exception) - output_csv( - output_error_filename, parsed_error.headers, parsed_error.row - ) - raise - except Exception as e: - # `torch.onnx.export` errors. - # ORT errors. 
- parsed_error = parser.parse_exception(e) - output_csv(output_error_filename, parsed_error.headers, parsed_error.row) - raise - - return run_n_iterations_onnx - - -def read_batch_size_from_file(args, filename, model_name): - batch_size = None - if os.path.exists("benchmarks"): - filename = os.path.join("benchmarks", filename) - assert os.path.exists(filename), filename - with open(filename) as f: - lines = f.readlines() - lines = [i.split(",") for i in lines if len(i.strip()) > 0] - for val in lines: - cur_name, b = val - if model_name == cur_name: - batch_size = int(b) - if batch_size is None: - log.warning("Could not find batch size for %s", model_name) - elif batch_size == -1: - raise RuntimeError( - f"Batch size is unset for {model_name} in {args.batch_size_file}" - ) - print(f"batch size: {batch_size}") - return batch_size - - -class TimeOutException(Exception): - pass - - -def alarm_handler(signum, frame): - raise TimeOutException() - - -def exit_after(s): - """ - Decorator to raise TimeoutException if the fn is taking more than s seconds - to run. - """ - - def outer(fn): - def inner(*args, **kwargs): - signal.signal(signal.SIGALRM, alarm_handler) - signal.alarm(s) - try: - result = fn(*args, **kwargs) - finally: - signal.alarm(0) - return result - - return inner - - return outer - - -def get_peak_memory(): - return torch.cuda.max_memory_allocated() / 10**9 - - -def null_experiment(args, model_iter_fn, model, example_inputs): - """ - A no-op experiment useful for making sure TorchBenchark alone works properly. - """ - - return [] - - -def cast_to(dtype, model, inputs): - # cast model and inputs to fp16 - if dtype == torch.float16: - model = model.half() - else: - model = model.to(dtype) - - inputs = tree_map( - lambda x: x.to(dtype) - if isinstance(x, torch.Tensor) and x.is_floating_point() - else x, - inputs, - ) - return model, inputs - - -def cast_to_bf16(model, inputs): - return cast_to(torch.bfloat16, model, inputs) - - -def cast_to_fp16(model, inputs): - return cast_to(torch.float16, model, inputs) - - -def cast_to_fp64(model, inputs): - return cast_to(torch.float64, model, inputs) - - -def cast_to_fp32(model, inputs): - return cast_to(torch.float32, model, inputs) - - -def reset_rng_state(use_xla=False): - torch.manual_seed(1337) - random.seed(1337) - np.random.seed(1337) - if use_xla: - xm.set_rng_state(1337, str(xm.xla_device())) - - -class DummyGradScaler: - def scale(self, loss): - return loss - - -def get_dynamo_stats(): - # TODO: consider deepcopy'ing the entire counters struct and - # adding a helper to do subtraction on it - return collections.Counter( - { - "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"], - "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"], - "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()), - # NB: The plus removes zero counts - "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]), - } - ) - - -def maybe_fresh_cache(fn, is_cold_start): - def inner(*args, **kwargs): - cache_minder = contextlib.nullcontext() - if is_cold_start: - cache_entries = {} - cache_minder = fresh_inductor_cache(cache_entries) - - try: - with cache_minder: - return fn(*args, **kwargs) - finally: - dump_cache = False - if dump_cache and is_cold_start: - output_csv( - output_filename[:-4] + "_triton_cache.csv", - ["dev", "name", "batch_size", "triton_cache"], - [ - current_device, - current_name, - current_batch_size, - cache_entries, - ], - ) - - return inner - - -@contextmanager -def 
maybe_init_distributed(should_init_distributed, rank, world_size, port="6789"): - try: - if should_init_distributed: - torch.cuda.set_device(rank) - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = port - torch.distributed.init_process_group( - "nccl", rank=rank, world_size=world_size - ) - yield - finally: - if should_init_distributed: - torch.distributed.destroy_process_group() - - -class BenchmarkRunner: - def __init__(self): - self.model_iter_fn = None - self.grad_scaler = DummyGradScaler() - self.autocast = contextlib.nullcontext - self.optimizer = None - self._args = None - - def setup_amp(self): - if self.args.only in self.fp32_only_models: - return - - if self.args.amp and self.args.devices == ["cuda"]: - # AMP training can lead to small loss values which can undeflow - # gradient values returning in zero gradients. To solve this - # problem, PyTorch introduces GradScaler. GradScaler is a stateful - # structure, that scales the loss values to prevent underflow. Loss - # values are big at the beginning of training (therefore not - # requiring scaling), while loss value tends to be small as network - # starts getting better (requiring scaling). GradScaler manages all - # of this fine tuning, checking the gradients are turning to inf, - # discarding such batches. - - # Since we are not running a long iteration, default value of - # init_scale 65536 is going to turn all gradients to inf. Therefore, - # we just use a init_scale of 2.0 for benchmarking purpose. - - # Disabling Gradscaler because - # 1) Benchmark setup runs 2 iterations of fwd-bwd. So, not useful. - # 2) Current setup shares grad_scaler for eager and dynamo model, - # which is bad as Gradscaler has state and can adjust the scaling - # factor between eager and dynamo run, making accuracy check - # harder. 
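The comment block here describes the standard AMP recipe that the harness then deliberately disables. For reference, a minimal sketch of that recipe with `autocast` plus a `GradScaler` (toy model and data, assumes a CUDA device):

```
import torch

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler()  # init_scale defaults to 65536

inputs = torch.randn(8, 10, device="cuda")
targets = torch.randn(8, 10, device="cuda")

for _ in range(3):
    optimizer.zero_grad(set_to_none=True)
    # Forward pass runs selected ops in reduced precision.
    with torch.cuda.amp.autocast():
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
    # Scale the loss so small fp16 gradients do not underflow, then let the
    # scaler unscale, skip inf/nan steps, and adjust the scale factor.
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```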
- # self.grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0) - self.autocast = torch.cuda.amp.autocast - elif (self.args.bfloat16 or self.args.amp) and self.args.devices == ["cpu"]: - self.autocast = torch.cpu.amp.autocast - - def init_optimizer(self, name, device, params): - if device == "cuda" and self.args.training and name not in CI_SKIP_OPTIMIZER: - self.optimizer = torch.optim.SGD(params, lr=0.01, foreach=True) - else: - self.optimizer = None - - @property - def args(self): - return self._args - - @args.setter - def args(self, args): - self._args = args - - @property - def skip_models(self): - return set() - - @property - def skip_models_for_cuda(self): - return set() - - @property - def skip_models_for_cpu(self): - return set() - - @property - def slow_models(self): - return set() - - @property - def very_slow_models(self): - return set() - - @property - def non_deterministic_models(self): - return set() - - @property - def fp32_only_models(self): - return set() - - @property - def force_amp_for_fp16_bf16_models(self): - return set() - - @property - def skip_not_suitable_for_training_models(self): - return set() - - @property - def failing_torchinductor_models(self): - return set() - - @property - def failing_fx2trt_models(self): - return set() - - @property - def skip_accuracy_checks_large_models_dashboard(self): - return set() - - @property - def skip_accuracy_check_as_eager_non_deterministic(self): - return set() - - @property - def get_tolerance_and_cosine_flag(self, is_training, current_device, name): - raise NotImplementedError() - - @property - def equal_nan(self): - equal_nan = True - if self.args.float32: - equal_nan = False - return equal_nan - - def iter_models(self, args): - for model_name in self.iter_model_names(args): - for device in args.devices: - try: - yield self.load_model( - device, - model_name, - batch_size=args.batch_size, - ) - except NotImplementedError: - continue # bad benchmark implementation - - def deepcopy_model(self, model): - return copy.deepcopy(model) - - def cast_based_on_args(self, model, example_inputs): - if self.args.float32 or self.args.only in self.fp32_only_models: - if not self.args.float32: - log.warning("Model %s supports float32 only", self.args.only) - model, example_inputs = cast_to_fp32(model, example_inputs) - elif self.args.float16: - if self.args.only in self.force_amp_for_fp16_bf16_models: - log.warning( - "Model %s does not support float16, running with amp instead", - self.args.only, - ) - self.args.amp = True - self.setup_amp() - else: - model, example_inputs = cast_to_fp16(model, example_inputs) - elif self.args.bfloat16: - if self.args.only in self.force_amp_for_fp16_bf16_models: - log.warning( - "Model %s does not support bfloat16, running with amp instead", - self.args.only, - ) - self.args.amp = True - self.setup_amp() - else: - model, example_inputs = cast_to_bf16(model, example_inputs) - - return model, example_inputs - - def validate_model(self, model, example_inputs): - """ - Runs the eager model with example inputs to ensure that eager passes. 
- """ - model = self.deepcopy_model(model) - example_inputs = clone_inputs(example_inputs) - model, example_inputs = self.cast_based_on_args(model, example_inputs) - try: - self.model_iter_fn(model, example_inputs) - except Exception as e: - raise NotImplementedError("Eager model failed to run") from e - - def maybe_cast(self, model, example_inputs): - model = self.deepcopy_model(model) - example_inputs = clone_inputs(example_inputs) - model, example_inputs = self.cast_based_on_args(model, example_inputs) - return model, example_inputs - - def decay_batch_exp(self, batch_size, factor=0.5, divisor=2): - out_batch_size = batch_size * factor - if out_batch_size > divisor: - out_batch_size = (out_batch_size + 1) // divisor * divisor - else: - out_batch_size = batch_size - 1 - return max(0, int(out_batch_size)) - - def batch_size_finder(self, device, model_name, initial_batch_size=1024): - batch_size = initial_batch_size - while batch_size >= 1: - torch.cuda.empty_cache() - try: - device, name, model, example_inputs, _ = self.load_model( - device, - model_name, - batch_size, - ) - self.model_iter_fn(model, example_inputs) - return batch_size - except RuntimeError as e: - error_str = str(e) - if "channels_last" in error_str: - break - batch_size = self.decay_batch_exp(batch_size) - return 1 - - def run_n_iterations(self, mod, inputs): - n = self.args.iterations - for _ in range(n - 1): - self.model_iter_fn(mod, inputs, collect_outputs=False) - return self.model_iter_fn(mod, inputs, collect_outputs=True) - - def optimizer_zero_grad(self, mod): - if self.optimizer is not None: - self.optimizer.zero_grad(True) - else: - mod.zero_grad(True) - - def optimizer_step(self): - if self.optimizer is not None: - self.optimizer.step() - - def get_benchmark_indices(self, length): - start = self._args.partition_id * (length // self._args.total_partitions) - end = ( - (self._args.partition_id + 1) * (length // self._args.total_partitions) - if self._args.partition_id < self._args.total_partitions - 1 - else length - ) - return start, end - - def deepcopy_and_maybe_ddp(self, model): - model = self.deepcopy_model(model) - if self.args.ddp: - assert ( - torch.distributed.is_available() - ), "Can't use DDP without a distributed enabled build" - from torch.nn.parallel import DistributedDataParallel as DDP - - model = DDP(model, find_unused_parameters=True) - elif self.args.fsdp: - assert ( - torch.distributed.is_available() - ), "Can't use FSDP without a distributed enabled build" - from torch.distributed.fsdp import ( - FullyShardedDataParallel as FSDP, - MixedPrecision, - ) - - from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy - - if self.args.float16: - dtype = torch.float16 - elif self.args.bfloat16: - dtype = torch.bfloat16 - else: - dtype = torch.float32 - - mp_policy = MixedPrecision( - param_dtype=dtype, - # Gradient communication precision. - reduce_dtype=dtype, - # Buffer precision. 
- buffer_dtype=dtype, - ) - - my_auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, recurse=True, min_num_params=int(1e5) - ) - - model = FSDP( - model, - use_orig_params=True, - device_id=torch.cuda.current_device() - if self.args.devices[-1] == "cuda" - else None, - mixed_precision=mp_policy, - limit_all_gathers=True, - auto_wrap_policy=my_auto_wrap_policy, - ) - if torch._inductor.config.triton.cudagraphs: - log.warning("Disabling cudagraphs for FSDP compatibility") - torch._inductor.config.triton.cudagraphs = False - return model - - def check_accuracy( - self, name, model, example_inputs, optimize_ctx, experiment, tag - ): - """ - Checks accuracy. - 1) Collect the outputs with fp64 datatype. This is useful for error checking. - 2) Checks if eager itself has variations. - """ - start_stats = get_dynamo_stats() - - def record_status(accuracy_status, dynamo_start_stats): - """ - Records the status in the csv file - """ - if current_name in self.non_deterministic_models: - if accuracy_status in ( - "pass", - "eager_two_runs_differ", - "fail_accuracy", - ): - accuracy_status = "pass" - - headers = ["dev", "name", "batch_size", "accuracy"] - fields = [current_device, current_name, current_batch_size, accuracy_status] - - if tag is not None: - headers.insert(3, "tag") - fields.insert(3, tag) - - dynamo_stats = get_dynamo_stats() - dynamo_stats.subtract(dynamo_start_stats) - for k, v in dynamo_stats.items(): - headers.append(k) - fields.append(v) - - output_csv(output_filename, headers, fields) - return accuracy_status - - if name in self.skip_accuracy_checks_large_models_dashboard: - return record_status("pass_due_to_skip", dynamo_start_stats=start_stats) - - # Collect the fp64 reference outputs to be used later for accuracy checking. - fp64_outputs = None - try: - model_fp64, inputs_fp64 = cast_to_fp64( - self.deepcopy_and_maybe_ddp(model), - clone_inputs(example_inputs), - ) - self.init_optimizer(name, current_device, model_fp64.parameters()) - fp64_outputs = self.run_n_iterations(model_fp64, inputs_fp64) - except Exception: - log.warning( - "fp64 golden ref were not generated for %s. 
Setting accuracy check to cosine", - name, - ) - self.args.cosine = True - fp64_outputs = None - - tolerance, cos_similarity = self.get_tolerance_and_cosine_flag( - self.args.training, current_device, name - ) - - # Cast the model to float16/float32 as necessary - model, example_inputs = self.maybe_cast(model, example_inputs) - accuracy_status = "pass" - - with self.pick_grad(name, self.args.training): - # Get results of native pytorch - reset_rng_state() - try: - model_copy = self.deepcopy_and_maybe_ddp(model) - self.init_optimizer(name, current_device, model_copy.parameters()) - correct_result = self.run_n_iterations( - model_copy, clone_inputs(example_inputs) - ) - except Exception as e: - accuracy_status = ( - "eager_1st_run_OOM" - if isinstance(e, torch.cuda.OutOfMemoryError) - else "eager_1st_run_fail" - ) - log.exception(e) - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - # Rerun native pytorch - reset_rng_state() - try: - model_copy = self.deepcopy_and_maybe_ddp(model) - self.init_optimizer(name, current_device, model_copy.parameters()) - correct_rerun_result = self.run_n_iterations( - model_copy, clone_inputs(example_inputs) - ) - except Exception as e: - accuracy_status = ( - "eager_2nd_run_OOM" - if isinstance(e, torch.cuda.OutOfMemoryError) - else "eager_2nd_run_fail" - ) - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - # Two eager runs should have exactly same result - is_same = True - try: - if ( - name not in self.skip_accuracy_check_as_eager_non_deterministic - and not same( - correct_result, - correct_rerun_result, - fp64_ref=None, - cos_similarity=False, - tol=0, - equal_nan=self.equal_nan, - ) - ): - is_same = False - except Exception as e: - # Sometimes torch.allclose may throw RuntimeError - is_same = False - - if not is_same: - accuracy_status = "eager_two_runs_differ" - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - correct_rerun_result = None - - # Run with Dynamo - reset_rng_state() - torch._dynamo.reset() - try: - model_copy = self.deepcopy_and_maybe_ddp(model) - self.init_optimizer(name, current_device, model_copy.parameters()) - if self.args.export: - # TB and TIMM use list example_inputs - # HF use dict example_inputs - example_args, example_kwargs = _normalize_bench_inputs( - example_inputs - ) - - # Register the output dataclass to pytree - example_outputs = model_copy(*example_args, **example_kwargs) - _register_dataclass_output_as_pytree(example_outputs) - - # apply export on module directly - # no need for n iterations - # the logic should be the same to self.model_iter_fn (forward_pass) - with self.autocast(): - optimized_model_iter_fn = optimize_ctx( - model_copy, example_args, example_kwargs - ) - new_result = optimized_model_iter_fn( - *example_args, **example_kwargs - ) - else: - optimized_model_iter_fn = optimize_ctx(self.run_n_iterations) - new_result = optimized_model_iter_fn(model_copy, example_inputs) - except Exception as e: - log.exception(e) - print( - "TorchDynamo optimized model failed to run because of following error" - ) - accuracy_status = ( - "OOM" - if isinstance(e, torch.cuda.OutOfMemoryError) - else "fail_to_run" - ) - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - if name in self.skip_accuracy_check_as_eager_non_deterministic: - return record_status("pass_due_to_skip", dynamo_start_stats=start_stats) - - # Workaround for ONNX for non-tensor outputs - if ( - current_onnx_compiler == "torchscript" - or current_onnx_compiler == 
"dynamo" - ): - from _onnx import patch - - ( - correct_result, - new_result, - fp64_outputs, - ) = patch.patch_non_tensor_outputs( - correct_result, new_result, fp64_outputs - ) - - try: - if not same( - correct_result, - new_result, - fp64_outputs, - equal_nan=self.equal_nan, - cos_similarity=cos_similarity, - tol=tolerance, - ): - is_same = False - except Exception as e: - # Sometimes torch.allclose may throw RuntimeError - is_same = False - - if not is_same: - if self.args.skip_accuracy_check: - accuracy_status = "pass_due_to_skip" - else: - accuracy_status = "fail_accuracy" - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - def check_tolerance( - self, name, model, example_inputs, optimize_ctx, base_device="cpu" - ): - """ - Checks tolerance based on https://pytorch.org/docs/stable/generated/torch.allclose.html. - """ - tolerance_status = "pass" - if name in self.skip_accuracy_checks_large_models_dashboard: - tolerance_status = "pass_due_to_skip" - return tolerance_status - # Cast the model to float16/float32 as necessary - model, example_inputs = self.maybe_cast(model, example_inputs) - - with self.pick_grad(name, self.args.training): - # Get results of native pytorch - reset_rng_state() - model_copy = copy.deepcopy(model) - model_copy = model_copy.to(base_device) - example_inputs_copy = copy.deepcopy(example_inputs) - example_inputs_copy = tree_map( - lambda x: x.to(base_device), example_inputs_copy - ) - self.init_optimizer(name, base_device, model_copy.parameters()) - correct_result = self.run_n_iterations(model_copy, example_inputs_copy) - - # Run with Dynamo - # Sometime CI fails with random triton compilation failure which will be skipped for now - # TODO: revisit this after switching to new Triton runtime - reset_rng_state() - torch._dynamo.reset() - try: - self.init_optimizer(name, current_device, model.parameters()) - optimized_model_iter_fn = optimize_ctx(self.run_n_iterations) - new_result = optimized_model_iter_fn(model, example_inputs) - except Exception as e: - log.exception(e) - if ( - self.args.ci - and isinstance(e, BackendCompilerFailed) - and ( - "Internal Triton PTX codegen error" in str(e) - or "cubin" in str(e) - ) - ): - return "pass_due_to_skip" - else: - print( - "TorchDynamo optimized model failed to run because of following error" - ) - return "fail_to_run" - - def dump_max_mean_values(tol, ref, res): - if isinstance(ref, (list, tuple, torch.nn.ParameterList, torch.Size)): - for refi, resi in zip(ref, res): - dump_max_mean_values(tol, refi, resi) - elif isinstance(ref, dict): - for k in ref.keys(): - dump_max_mean_values(tol, ref[k], res[k]) - elif isinstance(ref, torch.Tensor): - res = res.to(base_device) - t = torch.abs(ref - res) / (1 + torch.abs(ref)) - tol.append(t.flatten().to(torch.float32)) - return tol - - tol = [] - dump_max_mean_values(tol, correct_result, new_result) - tol = torch.cat(tol) - tol = torch.tensor(tol) - max = torch.max(tol) - mean = torch.mean(tol) - div = torch.std(tol) - headers = ["dev", "name", "batch_size", "max", "mean", "std"] - fields = [ - current_device, - current_name, - current_batch_size, - max.item(), - mean.item(), - div.item(), - ] - output_csv(output_filename, headers, fields) - return tolerance_status - - def run_performance_test( - self, name, model, example_inputs, optimize_ctx, experiment, tag=None - ): - if self.args.xla: - with self.pick_grad(name, self.args.training): - return experiment(*self.maybe_cast(model, 
example_inputs)) - - def warmup(fn, model, example_inputs, mode, niters=5): - peak_mem = 0 - start_stats = get_dynamo_stats() - try: - if current_device == "cuda": - torch.cuda.reset_peak_memory_stats() - torch.cuda.empty_cache() - t0 = time.perf_counter() - for _ in range(niters): - fn(model, example_inputs) - t1 = time.perf_counter() - latency = t1 - t0 - if current_device == "cuda": - peak_mem = get_peak_memory() - elif current_device == "cpu": - total = psutil.virtual_memory().total - percentage = psutil.Process(os.getpid()).memory_percent() - peak_mem = percentage * total / 10**9 - except Exception: - log.exception("Backend %s failed in warmup()", mode) - return sys.exit(-1) - dynamo_stats = get_dynamo_stats() - dynamo_stats.subtract(start_stats) - return latency, peak_mem, dynamo_stats - - # Cast the model to float16/float32 as necessary - model, example_inputs = self.maybe_cast(model, example_inputs) - - # Use distributed wrapping as necessary - model = self.deepcopy_and_maybe_ddp(model) - - self.init_optimizer(name, current_device, model.parameters()) - with self.pick_grad(name, self.args.training): - ok, total = Stats.reset_counters() - experiment_kwargs = {} - if tag is not None: - experiment_kwargs["tag"] = tag - results = [] - eager_latency, eager_peak_mem, _ = warmup( - self.model_iter_fn, model, example_inputs, "eager" - ) - optimized_model_iter_fn = optimize_ctx(self.model_iter_fn) - dynamo_latency, dynamo_peak_mem, dynamo_stats = warmup( - optimized_model_iter_fn, model, example_inputs, "dynamo" - ) - - compilation_time = dynamo_latency - eager_latency - compression_ratio = ( - eager_peak_mem / dynamo_peak_mem if dynamo_peak_mem else 0.0 - ) - if self.args.print_memory: - print( - f"memory: eager: {eager_peak_mem:.2f} GB, " - f"dynamo: {dynamo_peak_mem:.2f} GB, " - f"ratio: {compression_ratio:.2f}" - ) - - if experiment.func is speedup_experiment: - experiment_kwargs["compilation_latency"] = compilation_time - experiment_kwargs["compression_ratio"] = compression_ratio - experiment_kwargs["eager_peak_mem"] = eager_peak_mem - experiment_kwargs["dynamo_peak_mem"] = dynamo_peak_mem - experiment_kwargs["dynamo_stats"] = dynamo_stats - - if experiment.func is coverage_experiment: - ok, total = Stats.reset_counters() - results = [] - # run with torch._dynamo few times to populate the cache - for _ in range(3): - optimized_model_iter_fn(model, example_inputs) - _, frames_second_pass = Stats.reset_counters() # should be 0 - if frames_second_pass > 0: - optimized_model_iter_fn(model, example_inputs) - _, frames_third_pass = Stats.reset_counters() # should be 0 - else: - frames_third_pass = 0 - - results.append( - f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s" - ) - - if not hasattr(model, name): - model.name = name - results.append(experiment(model, example_inputs, **experiment_kwargs)) - return " ".join(map(str, results)) - - def minify_model( - self, - name, - model, - example_inputs, - optimize_ctx, - experiment, - tag, - ): - logging.info("Minifying %s...", name) - os.environ["TORCH_COMPILE_DEBUG"] = "1" - os.environ["TORCHDYNAMO_REPRO_AFTER"] = "dynamo" - os.environ["TORCHDYNAMO_REPRO_LEVEL"] = "4" - - self.check_accuracy(name, model, example_inputs, optimize_ctx, experiment, tag) - - if self.args.output_directory: - repro_dir = self.args.output_directory - else: - repro_dir = torch._dynamo.config.base_dir - - try: - shutil.move("repro.py", f"{repro_dir}/{name}_repro.py") - except OSError as e: - logging.error("Could not find repro script for model %s", 
name) - else: - logging.info( - "Repro script for model %s with minified graph saved to %s", - name, - repro_dir, - ) - - def run_one_model( - self, - name, - model, - example_inputs, - optimize_ctx, - experiment, - explain=False, - tag=None, - ): - mode = "train" if self.args.training else "eval" - msg = f"{current_device:4} {mode:5} {current_name:34} " - if tag: - msg += f" {tag:26}" - print(msg, flush=True) - - start_stats = get_dynamo_stats() - - if self.args.accuracy: - status = self.check_accuracy( - name, model, example_inputs, optimize_ctx, experiment, tag - ) - print(status) - if status == "fail_accuracy" and self.args.minify: - self.minify_model( - name, model, example_inputs, optimize_ctx, experiment, tag - ) - elif self.args.tolerance: - status = self.check_tolerance(name, model, example_inputs, optimize_ctx) - print(status) - elif self.args.performance: - status = self.run_performance_test( - name, model, example_inputs, optimize_ctx, experiment, tag - ) - print(status) - if self.args.timing: - from torch._dynamo.utils import op_count, print_time_report - from torch.utils._stats import simple_call_counter - - print_time_report() - stats = "STATS: " - stats = stats + " | ".join( - itertools.chain( - [f"call_* op count: {op_count}"], - (f"{key}:{value}" for key, value in simple_call_counter.items()), - ) - ) - print(stats) - stats = get_dynamo_stats() - stats.subtract(start_stats) - - if explain: - print( - f"Dynamo produced {stats['unique_graphs']} graphs " - f"covering {stats['calls_captured']} ops with " - f"{stats['graph_breaks']} graph breaks ({stats['unique_graph_breaks']} unique)" - ) - - if explain or self.args.log_graph_breaks or self.args.print_graph_breaks: - filename = f"{output_filename.rstrip('.csv')}_graph_breaks.csv" - - def add_double_quotes(x): - # Delimiter because reason could have comma - return f'"{x}"' - - for graph_break in graph_break_reasons: - reason = add_double_quotes(graph_break.reason) - user_stack = add_double_quotes( - ", ".join([str(x) for x in graph_break.user_stack]) - ) - output_csv( - filename, - ["model", "reason", "user_stack"], - [current_name, reason, user_stack], - ) - - if self.args.stats: - Stats.print_summary() - - -def help(fn): - return fn.__doc__ - - -diff_branch_default = "DIFF-BRANCH-DEFAULT" - - -def should_diff_branch(args): - return args.diff_branch != diff_branch_default - - -def parse_args(args=None): - parser = argparse.ArgumentParser() - parser.add_argument( - "--filter", "-k", action="append", help="filter benchmarks with regexp" - ) - parser.add_argument( - "--exclude", "-x", action="append", help="filter benchmarks with regexp" - ) - parser.add_argument( - "--exclude-exact", action="append", help="filter benchmarks with exact match" - ) - parser.add_argument( - "--total-partitions", - type=int, - default=1, - choices=range(1, 10), - help="Total number of partitions we want to divide the benchmark suite into", - ) - parser.add_argument( - "--partition-id", - type=int, - default=0, - help="ID of the benchmark suite partition to be run. Used to divide CI tasks", - ) - parser.add_argument( - "--devices", "--device", "-d", action="append", help="cpu or cuda" - ) - parser.add_argument("--device-index", help="CUDA device index") - parser.add_argument( - "--repeat", "-n", type=int, default=30, help="number of timing runs" - ) - iterations_per_run_help = """ - Run this may iterations for each time measurement. This is mainly used for - XLA training. 
We want to run multiple iterations per measurement so the - tracing and computation for different iteartions can overlap with each - other. This makes sure we have an accurate xla baseline. - """ - parser.add_argument( - "--iterations-per-run", type=int, default=1, help=iterations_per_run_help - ) - parser.add_argument( - "--randomize-input", - action="store_true", - help="Whether to randomize the input values. Dimensions will be kept the same.", - ) - parser.add_argument( - "--threads", - "-t", - type=int, - help="number of threads to use for eager and inductor", - ) - parser.add_argument( - "--nopython", action="store_true", help="Turn graph breaks into errors" - ) - parser.add_argument( - "--no-skip", - action="store_true", - help="run models that are in the global SKIP list", - ) - parser.add_argument( - "--prims-nvfuser", action="store_true", help="user prims + nvfuser backend" - ) - parser.add_argument( - "--dump-raw-metrics", - action="store_true", - help="dump raw timing metrics from speedup experiment", - ) - parser.add_argument( - "--log-operator-inputs", - action="store_true", - default=False, - ) - parser.add_argument( - "--channels-last", - action="store_true", - default=False, - help="use channels last format", - ) - parser.add_argument( - "--batch-size", "--batch_size", type=int, help="batch size for benchmarking" - ) - parser.add_argument( - "--iterations", type=int, default=2, help="how many iterations to run" - ) - parser.add_argument( - "--batch-size-file", type=str, help="String to load batch size from" - ) - parser.add_argument("--cosine", action="store_true", help="use cosine similarity") - parser.add_argument( - "--cpp-wrapper", action="store_true", help="turn on cpp/cuda wrapper codegen" - ) - parser.add_argument( - "--freezing", action="store_true", help="turn on freezing", default=False - ) - parser.add_argument( - "--ci", action="store_true", help="Flag to tell that its a CI run" - ) - parser.add_argument( - "--dynamic-ci-skips-only", - action="store_true", - help=( - "Run only the models that would have been skipped in CI " - "if dynamic-shapes, compared to running without dynamic-shapes. " - "This is useful for checking if more models are now " - "successfully passing with dynamic shapes. " - "Implies --dynamic-shapes and --ci" - ), - ) - parser.add_argument( - "--dashboard", action="store_true", help="Flag to tell that its a Dashboard run" - ) - parser.add_argument( - "--skip-fp64-check", action="store_true", help="skip accuracy check using fp64" - ) - parser.add_argument( - "--fast", "-f", action="store_true", help="skip slow benchmarks" - ) - parser.add_argument( - "--only", - help="""Run just one model from torchbench. Or - specify the path and class name of the model in format like: - --only=path:,class: - - Due to the fact that dynamo changes current working directory, - the path should be an absolute path. - - The class should have a method get_example_inputs to return the inputs - for the model. 
An example looks like - ``` - class LinearModel(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.Linear(10, 10) - - def forward(self, x): - return self.linear(x) - - def get_example_inputs(self): - return (torch.randn(2, 10),) - ``` - """, - ) - parser.add_argument( - "--multiprocess", - action="store_true", - help="Create n processes based on the number of devices (distributed use case).", - ) - parser.add_argument( - "--ddp", - action="store_true", - help="Wraps model in DDP before running it, and uses dynamo DDPOptmizer (graph breaks) by default.", - ) - parser.add_argument( - "--fsdp", - action="store_true", - help="""Wraps model in FSDP before running it. Disables cudagraphs by default. - Doesn't recursively wrap, mainly useful for checking dynamo UnspecNNModule compatibility - """, - ) - parser.add_argument( - "--no-optimize-ddp", - action="store_true", - help="Disables dynamo DDPOptimizer (graph breaks). (Applies only when using --ddp benchmark mode).", - ) - parser.add_argument( - "--distributed-master-port", - default="6789", - help="Port to bind for for torch.distributed. Use the default unless it's conflicting with another user", - ) - parser.add_argument( - "--dynamic-shapes", - action="store_true", - help="Runs a dynamic shapes version of the benchmark, if available.", - ) - parser.add_argument( - "--dynamic-batch-only", - action="store_true", - help="Only assume batch dimension is dynamic. Implies --dynamic-shapes", - ) - parser.add_argument( - "--specialize-int", action="store_true", help="Run with specialize_int=True." - ) - parser.add_argument( - "--use-eval-mode", - action="store_true", - help="sets model.eval() to reduce randomness", - ) - parser.add_argument( - "--skip-accuracy-check", - action="store_true", - help="keeps running even when accuracy fails", - ) - parser.add_argument( - "--generate-aot-autograd-stats", - action="store_true", - help="Generates AOT Autograd stats like how mnay graphs are sent to AOT", - ) - parser.add_argument( - "--inductor-settings", - action="store_true", - help="Use same settings as --inductor for baseline comparisons", - ) - parser.add_argument( - "--suppress-errors", - action="store_true", - help="Suppress errors instead of raising them", - ) - parser.add_argument( - "--output", - help="Overrides the output filename", - ) - parser.add_argument( - "--output-directory", - help="Overrides the directory to place output files.", - ) - parser.add_argument( - "--baseline", - help="Compare with a prior --output", - ) - parser.add_argument( - "--part", - default=None, - help="Specify the part of the model to run.", - ) - parser.add_argument( - "--export-profiler-trace", - action="store_true", - help="exports trace of kineto profiler", - ) - parser.add_argument( - "--profiler-trace-name", - "--profiler_trace_name", - help="Overwrites exported trace name", - ) - parser.add_argument( - "--diff-branch", - default=diff_branch_default, - help="delta current branch against given branch.", - ) - parser.add_argument( - "--tag", default=None, help="Specify a tag to be included in csv files." 
- ) - parser.add_argument( - "--explain", - action="store_true", - help="print some graph/op statistics during the run, similar to .explain()", - ) - parser.add_argument( - "--stats", - action="store_true", - help="print graph counter stats", - ) - parser.add_argument( - "--print-memory", - action="store_true", - help="print extra memory statistics", - ) - parser.add_argument( - "--print-dataframe-summary", - action="store_true", - help="print dataframe result used for calculating accuracy", - ) - parser.add_argument( - "--cold-start-latency", - "--cold_start_latency", - action="store_true", - help="Use a fresh triton cachedir when running each model, to force cold-start compile.", - ) - parser.add_argument( - "--disable-cudagraphs", - action="store_true", - help="Disables cudagraphs for Inductor", - ) - parser.add_argument( - "--disable-split-reductions", - action="store_true", - help="Disables split reductions for Inductor", - ) - parser.add_argument( - "--disable-persistent-reductions", - action="store_true", - help="Disables split reductions for Inductor", - ) - parser.add_argument( - "--disable-divisible-by-16", - action="store_true", - help="Disables divisible by 16 hint to Triton for Inductor", - ) - parser.add_argument( - "--inductor-compile-mode", - default=None, - help="torch.compile mode argument for inductor runs.", - ) - parser.add_argument( - "--print-graph-breaks", - action="store_true", - help="Show a warning whenever graph break", - ) - parser.add_argument( - "--log-graph-breaks", - action="store_true", - help="log graph breaks in a file", - ) - parser.add_argument( - "--trace-on-xla", - action="store_true", - help="Whether to trace the model on XLA or on eager device", - ) - parser.add_argument( - "--xla-tolerance", - type=float, - default=1e-2, - help="XLA needs a loose tolerance to pass the correctness check", - ) - parser.add_argument( - "--collect-outputs", - action="store_true", - help="""Whether to collect outputs for training. Set this to true if we - want to verify the numerical correctness of graidents. But that may - cause time measurement not accurate""", - ) - parser.add_argument( - "--enable-activation-checkpointing", - action="store_true", - help="Enables activation checkpointing for HF models", - ) - parser.add_argument("--timing", action="store_true", help="Emits phase timing") - - parser.add_argument( - "--progress", - action="store_true", - help="Print n/k models message between each model run.", - ) - - parser.add_argument( - "--timeout", - type=int, - default=2000, - help="timeout (second) for benchmarking.", - ) - - parser.add_argument( - "--per_process_memory_fraction", - type=float, - default=1, - help="Set per-process GPU memory fraction (limit) for reducing usable size and reproducing OOMs", - ) - - parser.add_argument( - "--no-translation-validation", - action="store_true", - help="Disable translation validation for accuracy builds.", - ) - - parser.add_argument( - "--minify", - action="store_true", - help="Enable minification when failure is below tolerance. 
Save repro script for each model.", - ) - - group_fuser = parser.add_mutually_exclusive_group() - # --nvfuser is now the default, keep the option to not break scripts - group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS) - group_fuser.add_argument("--nnc", action="store_true", help="enable NNC for GPUs") - - group_prec = parser.add_mutually_exclusive_group() - group_prec.add_argument("--float16", action="store_true", help="cast model to fp16") - group_prec.add_argument( - "--bfloat16", action="store_true", help="cast model to bf16" - ) - group_prec.add_argument("--float32", action="store_true", help="cast model to fp32") - group_prec.add_argument( - "--amp", action="store_true", help="use automatic mixed precision" - ) - - group_printout = parser.add_mutually_exclusive_group() - group_printout.add_argument( - "--verbose", "-v", action="store_true", help="enable verbose debug printouts" - ) - group_printout.add_argument( - "--quiet", "-q", action="store_true", help="suppress debug printouts" - ) - - group = parser.add_mutually_exclusive_group() - group.add_argument( - "--coverage", action="store_true", help="(default) " + help(coverage_experiment) - ) - group.add_argument( - "--overhead", action="store_true", help=help(overhead_experiment) - ) - group.add_argument( - "--speedup-dynamo-ts", - action="store_true", - help="TorchDynamo frontend with torchscript backend", - ) - group.add_argument( - "--speedup-fx2trt", action="store_true", help=help(speedup_experiment_fx2trt) - ) - group.add_argument( - "--speedup-fx2trt-fp16", - action="store_true", - help=help(speedup_experiment_fx2trt), - ) - group.add_argument( - "--print-fx", - action="store_true", - help="Print fx traces captured from model", - ) - group.add_argument( - "--print-aten-ops", - action="store_true", - help="Print traces of aten ops captured by AOT autograd", - ) - group.add_argument( - "--inductor", - action="store_true", - help="Measure speedup with TorchInductor", - ) - group.add_argument( - "--export", - action="store_true", - help="Measure pass rate with export", - ) - group.add_argument( - "--export-aot-inductor", - action="store_true", - help="Measure pass rate with Export+AOTInductor", - ) - group.add_argument( - "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch" - ) - group.add_argument( - "--torchscript-onnx", - "--torchscript_onnx", - action="store_true", - help="Measure speedup with TorchScript ONNX, i.e. `torch.onnx.export`", - ) - group.add_argument( - "--dynamo-onnx", - "--dynamo_onnx", - action="store_true", - help="Measure speedup with Dynamo ONNX, i.e. 
`torch.onnx.dynamo_export`", - ) - group.add_argument( - "--backend", - choices=torch._dynamo.list_backends(exclude_tags=None), - help="measure speedup with a given backend", - ) - group.add_argument("--nothing", action="store_true", help=help(null_experiment)) - group.add_argument( - "--log-conv-args", - action="store_true", - help="Dump convolution input/weight/bias's shape/stride/dtype and other options to json", - ) - group.add_argument( - "--recompile-profiler", - "--recompile_profiler", - action="store_true", - help="Run the dynamo recompilation profiler on each model.", - ) - group.add_argument( - "--find-batch-sizes", - action="store_true", - help="finds the largest batch size that could fit on GPUs", - ) - - mode_group = parser.add_mutually_exclusive_group(required=True) - mode_group.add_argument( - "--accuracy", - action="store_true", - help="Checks accuracy with small batch size and eval mode", - ) - mode_group.add_argument( - "--performance", action="store_true", help="Measures performance speedup" - ) - mode_group.add_argument( - "--tolerance", - action="store_true", - help="extracts the tolerance for each model with small batch size and eval mode", - ) - run_mode_group = parser.add_mutually_exclusive_group(required=True) - run_mode_group.add_argument( - "--training", - action="store_true", - help="Performs training", - ) - run_mode_group.add_argument( - "--inference", action="store_true", help="Performs inference" - ) - return parser.parse_args(args) - - -def process_entry(rank, runner, original_dir, args): - args.rank = rank - with maybe_init_distributed( - args.use_distributed, - rank=rank, - world_size=args.world_size, - port=args.distributed_master_port, - ): - return maybe_fresh_cache( - run, (args.cold_start_latency and args.only) or args.ci - )(runner, args, original_dir) - - -def main(runner, original_dir=None, args=None): - if original_dir: - os.chdir(original_dir) - args = parse_args(args) - if args.baseline: - args.baseline = os.path.abspath(args.baseline) - - if should_diff_branch(args): - import git - - # We do this here so we error out earlier if there's an issue - repo = git.Repo() - if repo.is_dirty(): - raise RuntimeError( - "--diff-branch called on dirty branch. Commit, stash, or reset." - ) - main_branch = repo.active_branch.name - if main_branch == args.diff_branch: - raise RuntimeError( - f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?" - ) - - device_count = torch.cuda.device_count() - args.use_distributed = (args.ddp or args.fsdp) and args.only - if args.multiprocess: - if device_count <= 1: - log.warning( - "The use multiprocess flag is set but there are <= 1 devices available." 
- ) - # multiprocess path - args.world_size = device_count - mp.spawn(process_entry, args=(runner, original_dir, args), nprocs=device_count) - else: - # single process path just uses the main process - args.world_size = 1 - process_entry(0, runner, original_dir, args) - - -def run(runner, args, original_dir=None): - # Pass the parsed args object to benchmark runner object - runner.args = args - - args.filter = args.filter or [r"."] - args.exclude = args.exclude or [r"^$"] - args.exclude_exact = args.exclude_exact or [] - - if args.inductor: - assert args.backend is None - args.backend = "inductor" - if args.dynamic_ci_skips_only: - args.dynamic_shapes = True - args.ci = True - if args.dynamic_batch_only: - args.dynamic_shapes = True - torch._dynamo.config.assume_static_by_default = True - if args.dynamic_shapes: - if not args.dynamic_batch_only: - torch._dynamo.config.assume_static_by_default = False - if args.specialize_int: - torch._dynamo.config.specialize_int = True - if args.ci: - if args.accuracy: - # Run fewer iterations when checking accuracy - args.repeat = 2 - - # Set translation validation on by default on CI accuracy runs. - torch._dynamo.config.translation_validation = True - - if args.dynamic_ci_skips_only: - # Test only the incremental set of jobs whose skipped was - # caused solely by turning on dynamic shapes - assert args.dynamic_shapes - ci = functools.partial(CI, args.backend, training=args.training) - args.filter = list( - set(CI_SKIP[ci(dynamic=True)]) - set(CI_SKIP[ci(dynamic=False)]) - ) - else: - ci = functools.partial( - CI, args.backend, training=args.training, dynamic=args.dynamic_shapes - ) - for device in args.devices: - args.exclude_exact.extend(CI_SKIP[ci(device=device)]) - if args.ddp: - # TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf, - # but just to measure impact on singlenode of performing graph-breaks. - # Left it as a follow up to keep this PR isolated. - assert ( - args.accuracy - ), "DDP benchmark is currently only hooked up to --accuracy bench" - assert args.training, "DDP benchmark requires --training mode" - if args.no_optimize_ddp: - torch._dynamo.config.optimize_ddp = False - else: - # TODO(whc) after enabling DDPOptimizer by default this could be removed or assert - torch._dynamo.config.optimize_ddp = True - if args.only == "dlrm": - log.error( - "DLRM+DDP is unsupported as it requires sharding the embedding layer separately from DDP" - ) - return sys.exit(-1) - if args.accuracy: - # Use small batch size. We use >1 batch size to ensure we test - # batch_norm type of operators that work on batch dims. - # TODO - Go through the failures for batch size = 2 - if args.batch_size is None: - if runner.suite_name == "huggingface": - args.batch_size = 1 - elif runner.suite_name == "torchbench": - args.batch_size = 4 - else: - # Larger batch size of TIMM models to have stable batch_norm - assert runner.suite_name == "timm_models" - args.batch_size = 8 - - # Remove sources of randomness - if runner.suite_name not in ("timm_models", "huggingface"): - # TODO - Using train mode for timm_models and HF models. Move to train mode for Torchbench as well. 
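The accuracy path that continues just below pins RNG state and switches every backend to deterministic kernels so eager and compiled runs can be compared reliably. A standalone sketch of that kind of reproducibility setup (the seed value and helper name are illustrative, and the per-model exclusions in the surrounding code are omitted):

```
import os
import random

import numpy as np
import torch

SEED = 1337


def make_deterministic(seed: int = SEED) -> None:
    """Pin every RNG and disable nondeterministic kernels for accuracy runs."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # cuBLAS needs a fixed workspace size for deterministic GEMMs.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # TF32 trades precision for speed, so turn it off when comparing outputs.
    torch.backends.cudnn.allow_tf32 = False
    torch.backends.cuda.matmul.allow_tf32 = False


make_deterministic()
```

Note that `torch.use_deterministic_algorithms(True)` raises at runtime for ops without a deterministic implementation, which is why the surrounding code gates it behind a model allowlist.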
- args.use_eval_mode = True - inductor_config.fallback_random = True - if args.only is not None and args.only not in { - "alexnet", - "Background_Matting", - "pytorch_CycleGAN_and_pix2pix", - "pytorch_unet", - "Super_SloMo", - "vgg16", - # https://github.com/pytorch/pytorch/issues/96724 - "Wav2Vec2ForCTC", - "Wav2Vec2ForPreTraining", - "sam", - }: - # some of the models do not support use_deterministic_algorithms - torch.use_deterministic_algorithms(True) - os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.allow_tf32 = False - torch.backends.cudnn.benchmark = False - torch.backends.cuda.matmul.allow_tf32 = False - - # Remove randomeness when torch manual seed is called - patch_torch_manual_seed() - - # Some models e.g. yolov3 assert batch size on n_gpus - if "CUDA_VISIBLE_DEVICES" not in os.environ: - args.device_index = "0" - - # Stricter check to disable fallbacks - args.suppress_errors = False - - if args.device_index is not None: - os.environ["CUDA_VISIBLE_DEVICES"] = args.device_index - - elif args.performance: - # Ensure that we test on real scenarios - args.use_eval_mode = False - - if args.partition_id > args.total_partitions or args.partition_id < 0: - print("Invalid partition id") - return sys.exit(-1) - - if not args.devices: - if torch.cuda.is_available(): - args.devices = ["cuda"] - else: - log.warning("torch.cuda.is_available() == False, using CPU") - args.devices = ["cpu"] - - if args.devices != ["cpu"] and torch.cuda.is_available(): - global synchronize - synchronize = torch.cuda.synchronize - - if ( - args.devices == ["cuda"] - and torch.cuda.get_device_properties(0).total_memory < 25 * 2**30 - ): - # OOM errors on an RTX 3090 with 24gb RAM - runner.skip_models.update( - { - # torchbench - "hf_Longformer", - "timm_nfnet", - "timm_efficientdet", - } - ) - if args.training: - runner.skip_models.add("hf_T5") - - if args.nnc: - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - torch._C._jit_set_texpr_fuser_enabled(True) - torch._C._jit_set_nvfuser_enabled(False) - - if args.threads: - torch.set_num_threads(args.threads) - - if args.verbose: - torch._logging.set_logs(dynamo=logging.DEBUG) - - if args.print_graph_breaks: - torch._dynamo.config.print_graph_breaks = True - - if args.quiet: - torch._logging.set_logs(dynamo=logging.ERROR) - - torch._dynamo.config.suppress_errors = args.suppress_errors - - if args.training: - runner.model_iter_fn = runner.forward_and_backward_pass - runner.skip_models.update(runner.skip_not_suitable_for_training_models) - else: - runner.model_iter_fn = runner.forward_pass - - if args.fast: - runner.skip_models.update(runner.slow_models) - - if args.devices == ["cpu"]: - runner.skip_models.update(runner.very_slow_models) - runner.skip_models.update(runner.skip_models_for_cpu) - elif args.devices == ["cuda"]: - runner.skip_models.update(runner.skip_models_for_cuda) - - if args.no_skip: - runner.skip_models.clear() - - experiment = null_experiment - global current_name, current_device, current_batch_size, output_filename, optimize_ctx, current_onnx_compiler - optimize_ctx = contextlib.nullcontext() - - if args.overhead: - optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython) - experiment = speedup_experiment - output_filename = "overheads.csv" - elif args.inductor: - inductor_config.debug = args.verbose - if args.threads: - inductor_config.cpp.threads = args.threads - - optimize_ctx = functools.partial( - 
torch.compile, - backend="inductor", - fullgraph=args.nopython, - mode=args.inductor_compile_mode, - ) - experiment = speedup_experiment - output_filename = "inductor.csv" - elif args.export: - optimize_ctx = torch._export.export - experiment = speedup_experiment - output_filename = "export.csv" - elif args.xla: - (dev,) = args.devices - os.environ["PJRT_DEVICE"] = {"cuda": "GPU", "cpu": "CPU"}[dev] - torch._dynamo.mark_dynamic = MagicMock() - experiment = xla - output_filename = "xla.csv" - elif args.torchscript_onnx: - optimize_ctx = functools.partial( - optimize_onnx_ctx, args.output_directory or ".", OnnxModelFromTorchScript - ) - experiment = functools.partial( - speedup_experiment_onnx, OnnxModelFromTorchScript - ) - output_filename = "torchscript_onnx.csv" - current_onnx_compiler = "torchscript" - elif args.dynamo_onnx: - optimize_ctx = functools.partial( - optimize_onnx_ctx, args.output_directory or ".", OnnxModelFromDynamo - ) - experiment = functools.partial(speedup_experiment_onnx, OnnxModelFromDynamo) - output_filename = "dynamo_onnx.csv" - current_onnx_compiler = "dynamo" - elif args.speedup_dynamo_ts: - optimize_ctx = torch._dynamo.optimize("ts", nopython=args.nopython) - experiment = speedup_experiment - output_filename = "speedup_dynamo_ts.csv" - elif args.prims_nvfuser: - optimize_ctx = torch._dynamo.optimize("prims_nvfuser", nopython=args.nopython) - experiment = speedup_experiment - backend_str = "prims_nvfuser" - output_filename = f"accuracy_aot_{backend_str}.csv" - elif args.print_fx: - optimize_ctx = torch._dynamo.optimize( - print_fx, - nopython=args.nopython, - ) - elif args.print_aten_ops: - optimize_ctx = torch._dynamo.optimize( - print_aten_ops, - nopython=args.nopython, - ) - elif args.nothing: - optimize_ctx = nothing - experiment = speedup_experiment - output_filename = "nothing.csv" - elif args.backend or args.export_aot_inductor: - if args.export_aot_inductor: - assert not args.training, "AOTInductor only supports inference" - assert args.devices == ["cuda"], "AOTInductor only tested for CUDA" - optimize_ctx = export_aot_inductor - else: - optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython) - experiment = speedup_experiment - if args.accuracy: - output_filename = f"accuracy_{args.backend}.csv" - elif args.tolerance: - output_filename = f"tolerance_{args.backend}.csv" - else: - output_filename = f"speedup_{args.backend}.csv" - elif args.recompile_profiler: - output_filename = "recompile_profiler_log.csv" - experiment = recompile_profiler_experiment - else: - optimize_ctx = torch._dynamo.optimize( - fx_insert_profiling, nopython=args.nopython - ) - experiment = coverage_experiment - output_filename = "coverage.csv" - - if args.inductor or args.backend == "inductor" or args.export_aot_inductor: - inductor_config.triton.cudagraphs = not args.disable_cudagraphs - inductor_config.triton.persistent_reductions = ( - not args.disable_persistent_reductions - ) - inductor_config.split_reductions = not args.disable_split_reductions - inductor_config.triton.divisible_by_16 = not args.disable_divisible_by_16 - inductor_config.cpp_wrapper = args.cpp_wrapper - if args.inference: - inductor_config.freezing = args.freezing - - runner.setup_amp() - - if args.output: - output_filename = args.output - - if output_filename: - if args.output_directory: - output_filename = os.path.join(args.output_directory, output_filename) - else: - output_filename = os.path.join( - torch._dynamo.config.base_dir, output_filename - ) - - if args.find_batch_sizes and 
args.only: - for device in args.devices: - batch_size = runner.batch_size_finder(device, args.only) - print(args.only, batch_size) - output_csv(output_filename, [], [args.only, batch_size]) - return - - if args.export_profiler_trace: - if args.profiler_trace_name is None: - if args.backend: - args.profiler_trace_name = args.backend - elif args.inductor: - args.profiler_trace_name = "inductor" - else: - args.profiler_trace_name = "profile" - else: - args.profiler_trace_name = args.profiler_trace_name - - if args.no_translation_validation: - # Overwrite 'translation_validation' config, if specified. - torch._dynamo.config.translation_validation = False - - experiment = functools.partial(experiment, args, runner.model_iter_fn) - - if args.only and should_diff_branch(args): - import git - - repo = git.Repo() - main_branch = repo.active_branch.name - try: - # Adding diff-branch again to the args will override previous value - call_args = ( - [sys.executable] + sys.argv + [f"--diff-branch={diff_branch_default}"] - ) - # Run for main branch - subprocess.check_call(call_args + [f"--tag={main_branch}"]) - # Run for comparison branch - repo.git.checkout(args.diff_branch) - subprocess.check_call(call_args + [f"--tag={args.diff_branch}"]) - finally: - # Go back to main branch - repo.git.checkout(main_branch) - elif args.only: - model_name = args.only - for device in args.devices: - batch_size = args.batch_size - if args.batch_size_file: - batch_size = read_batch_size_from_file( - args, args.batch_size_file, model_name - ) - if model_specified_by_path(args.only): - model, example_inputs = load_model_from_path(args.only) - name = model.__class__.__name__ - model = model.to(device=device) - example_inputs = tree_map_only( - torch.Tensor, lambda x: x.to(device=device), example_inputs - ) - else: - try: - with tqdm(desc="loading model"): - if args.part: - ( - device, - name, - model, - example_inputs, - batch_size, - ) = runner.load_model( - device, - model_name, - batch_size=batch_size, - part=args.part, - ) - else: - if args.fsdp: - # Always load model on cpu for fsdp - # When initializing FSDP, we will use the cuda device if args.cuda is set - ( - _, - name, - model, - example_inputs, - batch_size, - ) = runner.load_model( - "cpu", model_name, batch_size=batch_size - ) - else: - ( - device, - name, - model, - example_inputs, - batch_size, - ) = runner.load_model( - device, model_name, batch_size=batch_size - ) - except NotImplementedError as e: - print(e) - import traceback - - print(traceback.format_exc()) - logging.warning("%s failed to load", args.only) - continue # bad benchmark implementation - - if args.trace_on_xla: - xla_dev = xm.xla_device() - model = model.to(device=xla_dev) - example_inputs = tree_map_only( - torch.Tensor, lambda x: x.to(device=xla_dev), example_inputs - ) - - current_name = name - current_device = device - current_batch_size = batch_size - set_model_name(name) - - # Look for stuff that looks like batch size, and mark it dynamic. 
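The comment starting here (continued below) explains the heuristic of marking the batch-like dimension dynamic before compiling. A small sketch of `torch._dynamo.mark_dynamic` applied that way, with an illustrative module and sizes:

```
import torch
import torch._dynamo

batch_size = 16
model = torch.nn.Linear(10, 10)
example_input = torch.randn(batch_size, 10)

# Mark the first dimension whose size equals the batch size as dynamic so
# torch.compile does not specialize the graph on this particular batch size.
for dim, size in enumerate(example_input.size()):
    if size == batch_size:
        torch._dynamo.mark_dynamic(example_input, dim)
        break

compiled = torch.compile(model)
print(compiled(example_input).shape)       # torch.Size([16, 10])
print(compiled(torch.randn(8, 10)).shape)  # different batch, should reuse the graph
```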
- # Better integration would integrate directly with benchmark suite - # but cannot conveniently do this - # NB: This must be done late enough so that we don't do more - # conversions on the inputs - # NB: Assumes only the first batch-y like dimension is the batch - marked = False - - def detect_and_mark_batch(t): - nonlocal marked - for i, s in enumerate(t.size()): - if s == batch_size: - torch._dynamo.mark_dynamic(t, i) - marked = True - break - - if ( - args.dynamic_batch_only - and batch_size > 1 - and model_name not in CI_SKIP_DYNAMIC_BATCH_ONLY - ): - tree_map_only(torch.Tensor, detect_and_mark_batch, example_inputs) - assert marked, f"nothing in example_inputs had a dim with {batch_size}" - - if args.log_operator_inputs: - log_operator_inputs( - model, example_inputs, runner.model_iter_fn, name, args - ) - continue - - if args.per_process_memory_fraction != 1: - torch.cuda.set_per_process_memory_fraction( - args.per_process_memory_fraction - ) - - model, example_inputs = runner.cast_based_on_args(model, example_inputs) - runner.run_one_model( - name, - model, - example_inputs, - optimize_ctx, - experiment, - explain=args.explain, - tag=args.tag, - ) - if args.generate_aot_autograd_stats: - stats_file = output_filename.split(".csv")[0] + "_stats.csv" - output_csv( - stats_file, - ("dev", "name", "batch_size", "total_aot_graphs", "ok_aot_graphs"), - [ - current_device, - current_name, - current_batch_size, - *Stats.aot_summary(), - ], - ) - else: - if output_filename and os.path.exists(output_filename): - os.unlink(output_filename) - if original_dir: - os.chdir(original_dir) - model_names = list(runner.iter_model_names(args)) - nmodels = len(model_names) - for i, name in enumerate(model_names): - current_name = name - placeholder_batch_size = 0 - if args.progress: - print(f"Running model {i+1}/{nmodels}", flush=True) - - def write_csv(status): - if args.accuracy: - headers = ["dev", "name", "batch_size", "accuracy"] - rows = [ - [device, name, placeholder_batch_size, status] - for device in args.devices - ] - elif args.performance: - headers = ["dev", "name", "batch_size", "speedup", "abs_latency"] - rows = [ - [device, name, placeholder_batch_size, 0.0, 0.0] - for device in args.devices - ] - else: - headers = [] - rows = [ - [device, name, placeholder_batch_size, 0.0] - for device in args.devices - ] - - for row in rows: - output_csv(output_filename, headers, row) - - try: - timeout = args.timeout - if should_diff_branch(args): - timeout *= 2 - subprocess.check_call( - [sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout - ) - except subprocess.TimeoutExpired: - print("TIMEOUT", file=sys.stderr) - write_csv("timeout") - except subprocess.SubprocessError: - print("ERROR", file=sys.stderr) - write_csv("infra_error") - print_summary(output_filename, print_dataframe=args.print_dataframe_summary) - - -def log_operator_inputs(model, example_inputs, model_iter_fn, name, args): - mode = "training" if args.training else "eval" - output = os.path.join(os.path.dirname(args.output), f"{name}_{mode}.txt") - - # TODO - add option for coalescing inputs over multiple runs - if os.path.exists(output): - print(f"Skipping {name}, {output} already exists") - return - - print(f"Running {name}") - - operator_mode = OperatorInputsMode() - fake_tensor_mode = FakeTensorMode() - - with torch._subclasses.fake_tensor.FakeCopyMode(fake_tensor_mode): - model_fake = copy.deepcopy(model) - example_inputs_fake = copy.deepcopy(example_inputs) - try: - with fake_tensor_mode, operator_mode: - 
model_iter_fn(model_fake, example_inputs_fake, collect_outputs=False) - except Exception as e: - print(f"{name} failed to run with fake tensors, trying real. Exception: {e}") - operator_mode = OperatorInputsMode() - try: - with operator_mode: - model_iter_fn(model, example_inputs, collect_outputs=False) - except Exception as e2: - print(f"{name} failed to run with real. Exception: {e2}") - raise - - print(f"Writing output to {output}") - operator_mode.log_to_file(output) - - -if __name__ == "__main__": - raise RuntimeError( - f"You shouldn't run {sys.argv[0]} directly, instead try timm_model.py, torchbench.py or hugginface.py" - ) diff --git a/userbenchmark/dynamo/dynamobench/_dynamo/utils.py b/userbenchmark/dynamo/dynamobench/_dynamo/utils.py index 81527ae647..34bc8ae390 100644 --- a/userbenchmark/dynamo/dynamobench/_dynamo/utils.py +++ b/userbenchmark/dynamo/dynamobench/_dynamo/utils.py @@ -33,27 +33,29 @@ except ModuleNotFoundError: np = None -import torch._logging -import torch._numpy as tnp -from torch._guards import detect_fake_mode # noqa: F401 -from torch._logging import LazyString -from . import config - - -# NOTE: Make sure `NP_SUPPORTED_MODULES` and `NP_TO_TNP_MODULE` are in sync. -if np: - NP_SUPPORTED_MODULES = (np, np.fft, np.linalg, np.random) - - NP_TO_TNP_MODULE = { - np: tnp, - np.fft: tnp.fft, - np.linalg: tnp.linalg, - np.random: tnp.random, - } -else: - NP_SUPPORTED_MODULES = {} +try: + import torch._logging + import torch._numpy as tnp + from torch._guards import detect_fake_mode # noqa: F401n + from torch._logging import LazyString + from . import config + # NOTE: Make sure `NP_SUPPORTED_MODULES` and `NP_TO_TNP_MODULE` are in sync. + if np: + NP_SUPPORTED_MODULES = (np, np.fft, np.linalg, np.random) + + NP_TO_TNP_MODULE = { + np: tnp, + np.fft: tnp.fft, + np.linalg: tnp.linalg, + np.random: tnp.random, + } + else: + NP_SUPPORTED_MODULES = {} - NP_TO_TNP_MODULE = {} + NP_TO_TNP_MODULE = {} + from torch._subclasses.fake_tensor import FakeTensor, is_fake +except: + pass import importlib @@ -62,7 +64,7 @@ import torch.fx.experimental.symbolic_shapes from torch import fx from torch._dispatch.python import enable_python_dispatcher -from torch._subclasses.fake_tensor import FakeTensor, is_fake + from torch.nn.modules.lazy import LazyModuleMixin from torch.utils._pytree import tree_map diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py index cb41ff4af2..5858c45bf8 100644 --- a/userbenchmark/dynamo/dynamobench/common.py +++ b/userbenchmark/dynamo/dynamobench/common.py @@ -54,10 +54,13 @@ from scipy.stats import gmean, ttest_ind from torch._dynamo.profiler import fx_insert_profiling, Profiler from torch._dynamo.testing import dummy_fx_compile, format_speedup, same -from torch._dynamo.utils import clone_inputs, graph_break_reasons +try: + from torch._dynamo.utils import clone_inputs, graph_break_reasons + from torch._inductor.utils import aot_inductor_launcher, fresh_inductor_cache +except ImportError: + from _dynamo.utils import clone_inputs, graph_break_reasons from torch._functorch.aot_autograd import set_model_name from torch._inductor import config as inductor_config -from torch._inductor.utils import aot_inductor_launcher, fresh_inductor_cache from torch._subclasses.fake_tensor import FakeTensorMode from torch.utils import _pytree as pytree @@ -65,11 +68,6 @@ from tqdm.auto import tqdm, trange -try: - from .microbenchmarks.operator_inp_utils import OperatorInputsMode -except ImportError: - from 
microbenchmarks.operator_inp_utils import OperatorInputsMode - try: import torch_xla import torch_xla.core.xla_model as xm @@ -3177,10 +3175,10 @@ def process_entry(rank, runner, original_dir, args): )(runner, args, original_dir) -def main(runner, original_dir=None): +def main(runner, original_dir=None, args=None): if original_dir: os.chdir(original_dir) - args = parse_args() + args = parse_args() if not args else parse_args(args) if args.baseline: args.baseline = os.path.abspath(args.baseline) @@ -3789,6 +3787,10 @@ def log_operator_inputs(model, example_inputs, model_iter_fn, name, args): return print(f"Running {name}") + try: + from .microbenchmarks.operator_inp_utils import OperatorInputsMode + except ImportError: + from microbenchmarks.operator_inp_utils import OperatorInputsMode operator_mode = OperatorInputsMode() fake_tensor_mode = FakeTensorMode() diff --git a/userbenchmark/dynamo/dynamobench/requirements.txt b/userbenchmark/dynamo/dynamobench/requirements.txt new file mode 100644 index 0000000000..a95678ade3 --- /dev/null +++ b/userbenchmark/dynamo/dynamobench/requirements.txt @@ -0,0 +1,2 @@ +pandas +scipy \ No newline at end of file diff --git a/userbenchmark/dynamo/run.py b/userbenchmark/dynamo/run.py index d410f7403c..42e98f2d15 100644 --- a/userbenchmark/dynamo/run.py +++ b/userbenchmark/dynamo/run.py @@ -1,11 +1,19 @@ import logging import warnings -from .torchbench import setup_torchbench_cwd, TorchBenchmarkRunner +from torchbenchmark import add_path, REPO_PATH + +DYNAMOBENCH_PATH = REPO_PATH.joinpath("userbenchmark", "dynamo", "dynamobench") + try: - from .common import main + # OSS Import + with add_path(str(DYNAMOBENCH_PATH)): + from torchbench import setup_torchbench_cwd, TorchBenchmarkRunner + from common import main except ImportError: - from common import main + # Meta Internal Import + from caffe2.benchmarks.dynamo.torchbench import setup_torchbench_cwd, TorchBenchmarkRunner + from caffe2.benchmarks.dynamo.common import main from typing import List @@ -13,4 +21,4 @@ def run(args: List[str]): original_dir = setup_torchbench_cwd() logging.basicConfig(level=logging.WARNING) warnings.filterwarnings("ignore") - main(TorchBenchmarkRunner(), original_dir, args=args) + main(TorchBenchmarkRunner(), original_dir, args) diff --git a/userbenchmark/dynamo/torchbench.py b/userbenchmark/dynamo/torchbench.py deleted file mode 100644 index 1327040aed..0000000000 --- a/userbenchmark/dynamo/torchbench.py +++ /dev/null @@ -1,479 +0,0 @@ -#!/usr/bin/env python3 -import gc -import importlib -import logging -import os -import re -import sys -import warnings -from os.path import abspath, exists - -import torch - -from .common import BenchmarkRunner, main -from ._dynamo.testing import collect_results, reduce_to_scalar_loss -from ._dynamo.utils import clone_inputs - -# We are primarily interested in tf32 datatype -torch.backends.cuda.matmul.allow_tf32 = True - - -def setup_torchbench_cwd(): - original_dir = abspath(os.getcwd()) - - os.environ["KALDI_ROOT"] = "/tmp" # avoids some spam - for torchbench_dir in ( - "./torchbenchmark", - "../torchbenchmark", - "../torchbench", - "../benchmark", - "../../torchbenchmark", - "../../torchbench", - "../../benchmark", - "../../../torchbench", - "../../../benchmark", - ): - if exists(torchbench_dir): - break - - if exists(torchbench_dir): - torchbench_dir = abspath(torchbench_dir) - os.chdir(torchbench_dir) - sys.path.append(torchbench_dir) - - return original_dir - - -# Some models have large dataset that doesn't fit in memory. 
Lower the batch -# size to test the accuracy. -USE_SMALL_BATCH_SIZE = { - "demucs": 4, - "dlrm": 1024, - "densenet121": 4, - "hf_Reformer": 4, - "hf_T5_base": 4, - "timm_efficientdet": 1, - "llama_v2_7b_16h": 1, -} - -DETECTRON2_MODELS = { - "detectron2_fasterrcnn_r_101_c4", - "detectron2_fasterrcnn_r_101_dc5", - "detectron2_fasterrcnn_r_101_fpn", - "detectron2_fasterrcnn_r_50_c4", - "detectron2_fasterrcnn_r_50_dc5", - "detectron2_fasterrcnn_r_50_fpn", - "detectron2_maskrcnn_r_101_c4", - "detectron2_maskrcnn_r_101_fpn", - "detectron2_maskrcnn_r_50_fpn", -} - -SKIP = { - # https://github.com/pytorch/torchdynamo/issues/101 - "detectron2_maskrcnn", - # https://github.com/pytorch/torchdynamo/issues/145 - "fambench_xlmr", - # TIMEOUT, https://github.com/pytorch/pytorch/issues/98467 - "tacotron2", - "hf_Bert", # Error: RelaxedUnspecConstraint(L['input_ids'].size()[0]) - inferred constant (4) - "hf_Bert_large", # Error: RelaxedUnspecConstraint(L['input_ids'].size()[0]) - inferred constant (4) - # takes too long, extreme slowdown (< .001) - "maml", -} - -SKIP_FOR_CPU = { - "hf_T5_generate", # OOMs - "cm3leon_generate", # model is CUDA only - "nanogpt", # timeout - "sam", # timeout - "llama_v2_7b_16h", # model is CUDA only - "stable_diffusion", # flaky - "torchrec_dlrm", # requires FBGEMM, CUDA only -} - -SKIP_FOR_CUDA = { - "gat", # only works on CPU - "gcn", # only works on CPU - "sage", # only works on CPU -} - -# Additional models that are skipped in training -SKIP_TRAIN = { - # not designed for training - "pyhpc_equation_of_state", - "pyhpc_isoneutral_mixing", - "pyhpc_turbulent_kinetic_energy", - "maml", - "llama", - "llama_v2_7b_16h", -} -SKIP_TRAIN.update(DETECTRON2_MODELS) - -# These models support only train mode. So accuracy checking can't be done in -# eval mode. -ONLY_TRAINING_MODE = { - "tts_angular", - "tacotron2", - "demucs", - "hf_Reformer", - "pytorch_struct", - "yolov3", -} -ONLY_TRAINING_MODE.update(DETECTRON2_MODELS) - -# Need lower tolerance on GPU. GPU kernels have non deterministic kernels for these models. -REQUIRE_HIGHER_TOLERANCE = { - "alexnet", - "attention_is_all_you_need_pytorch", - "densenet121", - "hf_Albert", - "vgg16", - "mobilenet_v3_large", - "nvidia_deeprecommender", - "timm_efficientdet", -} - -# These models need >1e-3 tolerance -REQUIRE_EVEN_HIGHER_TOLERANCE = { - "soft_actor_critic", - "tacotron2", -} - -REQUIRE_HIGHER_FP16_TOLERANCE = { - "drq", -} - -REQUIRE_COSINE_TOLERACE = { - # Just keeping it here even though its empty, if we need this in future. 
-} - -# non-deterministic output / cant check correctness -NONDETERMINISTIC = { - # https://github.com/pytorch/pytorch/issues/98355 - "mobilenet_v3_large", -} - -# These benchmarks took >600s on an i9-11900K CPU -VERY_SLOW_BENCHMARKS = { - "hf_BigBird", # 3339s - "hf_Longformer", # 3062s - "hf_T5", # 930s -} - -# These benchmarks took >60s on an i9-11900K CPU -SLOW_BENCHMARKS = { - *VERY_SLOW_BENCHMARKS, - "BERT_pytorch", # 137s - "demucs", # 116s - "fastNLP_Bert", # 242s - "hf_Albert", # 221s - "hf_Bart", # 400s - "hf_Bert", # 334s - "hf_DistilBert", # 187s - "hf_GPT2", # 470s - "hf_Reformer", # 141s - "speech_transformer", # 317s - "vision_maskrcnn", # 99s -} - -TRT_NOT_YET_WORKING = { - "alexnet", - "resnet18", - "resnet50", - "mobilenet_v2", - "mnasnet1_0", - "squeezenet1_1", - "shufflenetv2_x1_0", - "vgg16", - "resnext50_32x4d", -} - -DONT_CHANGE_BATCH_SIZE = { - "demucs", - "pytorch_struct", - "pyhpc_turbulent_kinetic_energy", - "vision_maskrcnn", # https://github.com/pytorch/benchmark/pull/1656 -} - - -SKIP_ACCURACY_CHECK_MODELS = { - # Models too large to have eager, dynamo and fp64_numbers simultaneosuly - # even for 40 GB machine. We have tested accuracy for smaller version of - # these models - "hf_GPT2_large", - "hf_T5_large", - "timm_vision_transformer_large", - "maml", # accuracy https://github.com/pytorch/pytorch/issues/93847 - "llama_v2_7b_16h", - "Background_Matting", -} - -SKIP_ACCURACY_CHECK_AS_EAGER_NON_DETERMINISTIC_MODELS = { - # Models that deterministic algorithms can not be turned on for eager mode. - "Background_Matting", -} - - -MAX_BATCH_SIZE_FOR_ACCURACY_CHECK = { - "hf_GPT2": 2, - "pytorch_unet": 2, -} - -FORCE_AMP_FOR_FP16_BF16_MODELS = { - "DALLE2_pytorch", - "doctr_det_predictor", - "doctr_reco_predictor", - "Super_SloMo", - "tts_angular", -} - -# models in canary_models that we should run anyway -CANARY_MODELS = { - "torchrec_dlrm", -} - - -class TorchBenchmarkRunner(BenchmarkRunner): - def __init__(self): - super().__init__() - self.suite_name = "torchbench" - self.optimizer = None - - @property - def skip_models(self): - return SKIP - - @property - def skip_models_for_cpu(self): - return SKIP_FOR_CPU - - @property - def skip_models_for_cuda(self): - return SKIP_FOR_CUDA - - @property - def slow_models(self): - return SLOW_BENCHMARKS - - @property - def very_slow_models(self): - return VERY_SLOW_BENCHMARKS - - @property - def non_deterministic_models(self): - return NONDETERMINISTIC - - @property - def skip_not_suitable_for_training_models(self): - return SKIP_TRAIN - - @property - def failing_fx2trt_models(self): - return TRT_NOT_YET_WORKING - - @property - def force_amp_for_fp16_bf16_models(self): - return FORCE_AMP_FOR_FP16_BF16_MODELS - - @property - def skip_accuracy_checks_large_models_dashboard(self): - if self.args.dashboard or self.args.accuracy: - return SKIP_ACCURACY_CHECK_MODELS - return set() - - @property - def skip_accuracy_check_as_eager_non_deterministic(self): - if self.args.accuracy and self.args.training: - return SKIP_ACCURACY_CHECK_AS_EAGER_NON_DETERMINISTIC_MODELS - return set() - - def load_model( - self, - device, - model_name, - batch_size=None, - part=None, - ): - if self.args.enable_activation_checkpointing: - raise NotImplementedError( - "Activation checkpointing not implemented for Torchbench models" - ) - is_training = self.args.training - use_eval_mode = self.args.use_eval_mode - dynamic_shapes = self.args.dynamic_shapes - candidates = [ - f"torchbenchmark.models.{model_name}", - 
f"torchbenchmark.canary_models.{model_name}", - f"torchbenchmark.models.fb.{model_name}", - ] - for c in candidates: - try: - module = importlib.import_module(c) - break - except ModuleNotFoundError as e: - if e.name != c: - raise - else: - raise ImportError(f"could not import any of {candidates}") - benchmark_cls = getattr(module, "Model", None) - if not hasattr(benchmark_cls, "name"): - benchmark_cls.name = model_name - - cant_change_batch_size = ( - not getattr(benchmark_cls, "ALLOW_CUSTOMIZE_BSIZE", True) - or model_name in DONT_CHANGE_BATCH_SIZE - ) - if cant_change_batch_size: - batch_size = None - if batch_size is None and is_training and model_name in USE_SMALL_BATCH_SIZE: - batch_size = USE_SMALL_BATCH_SIZE[model_name] - - # Control the memory footprint for few models - if self.args.accuracy and model_name in MAX_BATCH_SIZE_FOR_ACCURACY_CHECK: - batch_size = min(batch_size, MAX_BATCH_SIZE_FOR_ACCURACY_CHECK[model_name]) - - # workaround "RuntimeError: not allowed to set torch.backends.cudnn flags" - torch.backends.__allow_nonbracketed_mutation_flag = True - extra_args = [] - if part: - extra_args = ["--part", part] - - if model_name == "vision_maskrcnn" and is_training: - # Output of vision_maskrcnn model is a list of bounding boxes, - # sorted on the basis of their scores. This makes accuracy - # comparison hard with torch.compile. torch.compile can cause minor - # divergences in the output because of how fusion works for amp in - # TorchInductor compared to eager. Therefore, instead of looking at - # all the bounding boxes, we compare only top 5. - model_kwargs = {"box_detections_per_img": 5} - benchmark = benchmark_cls( - test="train", - device=device, - batch_size=batch_size, - extra_args=extra_args, - model_kwargs=model_kwargs, - ) - elif is_training: - benchmark = benchmark_cls( - test="train", - device=device, - batch_size=batch_size, - extra_args=extra_args, - ) - else: - benchmark = benchmark_cls( - test="eval", - device=device, - batch_size=batch_size, - extra_args=extra_args, - ) - model, example_inputs = benchmark.get_module() - - # Models that must be in train mode while training - if is_training and (not use_eval_mode or model_name in ONLY_TRAINING_MODE): - model.train() - else: - model.eval() - gc.collect() - batch_size = benchmark.batch_size - - # Torchbench has quite different setup for yolov3, so directly passing - # the right example_inputs - if model_name == "yolov3": - example_inputs = (torch.rand(batch_size, 3, 384, 512).to(device),) - # See https://github.com/pytorch/benchmark/issues/1561 - if model_name == "maml_omniglot": - batch_size = 5 - assert example_inputs[0].shape[0] == batch_size - if model_name == "vision_maskrcnn": - batch_size = 1 - # global current_name, current_device - # current_device = device - # current_name = benchmark.name - - if self.args.trace_on_xla: - # work around for: https://github.com/pytorch/xla/issues/4174 - import torch_xla # noqa: F401 - self.validate_model(model, example_inputs) - return device, benchmark.name, model, example_inputs, batch_size - - def iter_model_names(self, args): - from torchbenchmark import _list_canary_model_paths, _list_model_paths - - models = _list_model_paths() - models += [ - f - for f in _list_canary_model_paths() - if os.path.basename(f) in CANARY_MODELS - ] - models.sort() - - start, end = self.get_benchmark_indices(len(models)) - for index, model_path in enumerate(models): - if index < start or index >= end: - continue - - model_name = os.path.basename(model_path) - if ( - not 
re.search("|".join(args.filter), model_name, re.I) - or re.search("|".join(args.exclude), model_name, re.I) - or model_name in args.exclude_exact - or model_name in self.skip_models - ): - continue - - yield model_name - - def pick_grad(self, name, is_training): - if is_training or name in ("maml",): - return torch.enable_grad() - else: - return torch.no_grad() - - def get_tolerance_and_cosine_flag(self, is_training, current_device, name): - tolerance = 1e-4 - cosine = self.args.cosine - # Increase the tolerance for torch allclose - if self.args.float16 or self.args.amp: - if name in REQUIRE_HIGHER_FP16_TOLERANCE: - return 1e-2, cosine - return 1e-3, cosine - if is_training and current_device == "cuda": - tolerance = 1e-3 - if name in REQUIRE_COSINE_TOLERACE: - cosine = True - elif name in REQUIRE_HIGHER_TOLERANCE: - tolerance = 1e-3 - elif name in REQUIRE_EVEN_HIGHER_TOLERANCE: - tolerance = 8 * 1e-2 - return tolerance, cosine - - def compute_loss(self, pred): - return reduce_to_scalar_loss(pred) - - def forward_pass(self, mod, inputs, collect_outputs=True): - with self.autocast(): - return mod(*inputs) - - def forward_and_backward_pass(self, mod, inputs, collect_outputs=True): - cloned_inputs = clone_inputs(inputs) - self.optimizer_zero_grad(mod) - with self.autocast(): - pred = mod(*cloned_inputs) - loss = self.compute_loss(pred) - self.grad_scaler.scale(loss).backward() - self.optimizer_step() - if collect_outputs: - return collect_results(mod, pred, loss, cloned_inputs) - return None - - -def torchbench_main(): - original_dir = setup_torchbench_cwd() - logging.basicConfig(level=logging.WARNING) - warnings.filterwarnings("ignore") - main(TorchBenchmarkRunner(), original_dir) - - -if __name__ == "__main__": - torchbench_main() diff --git a/userbenchmark/dynamo/torchbench_models_list.txt b/userbenchmark/dynamo/torchbench_models_list.txt deleted file mode 100644 index 04947c4a6a..0000000000 --- a/userbenchmark/dynamo/torchbench_models_list.txt +++ /dev/null @@ -1,28 +0,0 @@ -BERT_pytorch,128 -Background_Matting, 16 -LearningToPaint,1024 -alexnet,1024 -dcgan,1024 -densenet121,64 -hf_Albert,32 -hf_Bart,16 -hf_Bert,16 -hf_GPT2,16 -hf_T5,4 -mnasnet1_0,256 -mobilenet_v2,128 -mobilenet_v3_large,256 -nvidia_deeprecommender,1024 -pytorch_unet,8 -resnet18,512 -resnet50,128 -resnext50_32x4d,128 -shufflenet_v2_x1_0,512 -squeezenet1_1,512 -timm_nfnet,256 -timm_efficientnet,128 -timm_regnet,128 -timm_resnest,256 -timm_vision_transformer,256 -timm_vovnet,128 -vgg16,128 diff --git a/userbenchmark/dynamo/torchbench_models_list_cpu.txt b/userbenchmark/dynamo/torchbench_models_list_cpu.txt deleted file mode 100644 index ab485702b8..0000000000 --- a/userbenchmark/dynamo/torchbench_models_list_cpu.txt +++ /dev/null @@ -1,48 +0,0 @@ -alexnet,128 -attention_is_all_you_need_pytorch,64 -BERT_pytorch,32 -dcgan,256 -densenet121,512 -dlrm,2048 -fastNLP_Bert,8 -functorch_dp_cifar10,1024 -hf_Albert,8 -hf_Bart,8 -hf_Bert,8 -hf_Bert_large,8 -hf_DistilBert,8 -hf_GPT2,8 -hf_GPT2_large,1 -hf_Longformer,4 -hf_Reformer,8 -hf_T5,4 -hf_T5_base,1 -hf_T5_large,1 -LearningToPaint,96 -lennard_jones,1024 -mnasnet1_0,32 -mobilenet_v2,16 -mobilenet_v3_large,32 -nvidia_deeprecommender,256 -phlippe_densenet,128 -phlippe_resnet,512 -pytorch_unet,4 -resnet152,32 -resnet18,256 -resnet50,256 -resnext50_32x4d,256 -shufflenet_v2_x1_0,64 -speech_transformer,1024 -squeezenet1_1,16 -Super_SloMo,1024 -timm_efficientnet,64 -timm_nfnet,128 -timm_regnet,32 -timm_resnest,32 -timm_vision_transformer,16 -timm_vision_transformer_large,8 
-timm_vovnet,32 -tts_angular,1024 -vgg16,64 -vision_maskrcnn,1 -yolov3,32
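
---

For context on the OSS/internal split above (the try/except guards added in `dynamobench/_dynamo/utils.py`, the `torch._dynamo.utils` → vendored `_dynamo.utils` fallback in `dynamobench/common.py`, and the "OSS Import" / "Meta Internal Import" branches in `run.py`): they all use the same import-fallback idiom — try the OSS module path first, optionally with a temporary `sys.path` entry in the spirit of `torchbenchmark.add_path`, and fall back to the alternate module path on `ImportError`. Below is a minimal, self-contained sketch of that idiom; the local `add_path` helper and the module names `common` / `internal_pkg.common` are illustrative placeholders, not code taken from this patch.

```python
import importlib
import sys
from contextlib import contextmanager


@contextmanager
def add_path(path):
    """Temporarily prepend `path` to sys.path.

    Illustrative stand-in for torchbenchmark.add_path; it lets OSS-only
    modules resolve without permanently polluting the import path.
    """
    sys.path.insert(0, path)
    try:
        yield
    finally:
        try:
            sys.path.remove(path)
        except ValueError:
            pass


def import_common(dynamobench_path):
    """Prefer the OSS layout; fall back to an internal module path.

    `internal_pkg.common` is a placeholder for whatever an internal build
    would expose, not a real module name from this patch.
    """
    try:
        with add_path(dynamobench_path):
            return importlib.import_module("common")
    except ImportError:
        return importlib.import_module("internal_pkg.common")


# Example usage (path is hypothetical):
# common = import_common("/path/to/userbenchmark/dynamo/dynamobench")
```

The design choice is to keep a single code path for both environments and let the import machinery pick the right backend at runtime, rather than gating on an environment flag.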