From 1ecde79e50c58ae74236d7a4df42ee40f3d6d6b3 Mon Sep 17 00:00:00 2001
From: Xu Zhao
Date: Wed, 4 Oct 2023 07:58:16 -0700
Subject: [PATCH] Cleanup the code in the `dynamo` userbenchmark (#1960)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Skip importing the modules that are only available in the pytorch source code, not pytorch nightly release.

Make dynamo benchmark work on both OSS and internal.

Test Plan:
```
$ python run_benchmark.py dynamo --only alexnet --training --performance --inductor
loading model: 0it [00:05, ?it/s]
cuda train alexnet
running benchmark: 100%|█████████████████| 30/30 [00:00<00:00, 41.46it/s]
1.129x
```

```
$ buck2 run mode/opt //pytorch/benchmark:run_benchmark -- dynamo --only alexnet --training --inductor --performance --output-directory $HOME
loading model: 0it [00:16, ?it/s]
running benchmark: 100%|█████████████████| 30/30 [00:00<00:00, 37.94it/s]
cuda train alexnet
1.120x
```

Differential Revision: D49912006

Pulled By: xuzhao9
---
 userbenchmark/dynamo/_dynamo/testing.py       |  364 --
 userbenchmark/dynamo/_dynamo/utils.py         | 2048 ----------
 userbenchmark/dynamo/common.py                | 3577 -----------------
 .../dynamo/dynamobench/_dynamo/utils.py       |   44 +-
 userbenchmark/dynamo/dynamobench/common.py    |   20 +-
 .../dynamo/dynamobench/requirements.txt       |    2 +
 userbenchmark/dynamo/run.py                   |   16 +-
 userbenchmark/dynamo/torchbench.py            |  479 ---
 .../dynamo/torchbench_models_list.txt         |   28 -
 .../dynamo/torchbench_models_list_cpu.txt     |   48 -
 10 files changed, 48 insertions(+), 6578 deletions(-)
 delete mode 100644 userbenchmark/dynamo/_dynamo/testing.py
 delete mode 100644 userbenchmark/dynamo/_dynamo/utils.py
 delete mode 100644 userbenchmark/dynamo/common.py
 create mode 100644 userbenchmark/dynamo/dynamobench/requirements.txt
 delete mode 100644 userbenchmark/dynamo/torchbench.py
 delete mode 100644 userbenchmark/dynamo/torchbench_models_list.txt
 delete mode 100644 userbenchmark/dynamo/torchbench_models_list_cpu.txt

diff --git a/userbenchmark/dynamo/_dynamo/testing.py b/userbenchmark/dynamo/_dynamo/testing.py
deleted file mode 100644
index f5dc961abd..0000000000
--- a/userbenchmark/dynamo/_dynamo/testing.py
+++ /dev/null
@@ -1,364 +0,0 @@
-import contextlib
-import dis
-import functools
-import logging
-import os.path
-import re
-import sys
-import types
-import unittest
-from typing import Sequence, Union
-from unittest.mock import patch
-
-import torch
-from torch import fx
-from torch._dynamo.output_graph import OutputGraph
-
-from torch._dynamo import config, eval_frame, optimize_assert, reset
-from torch._dynamo.bytecode_transformation import (
-    create_instruction,
-    debug_checks,
-    is_generator,
-    transform_code_object,
-)
-from torch._dynamo.guards import CheckFunctionManager, GuardedCode
-from .utils import same
-
-unsupported = eval_frame.unsupported
-three = 3
-
-log = logging.getLogger(__name__)
-
-
-def clone_me(x):
-    if x is None:
-        return None
-    return x.detach().clone().requires_grad_(x.requires_grad)
-
-
-def skip_if_pytest(fn):
-    @functools.wraps(fn)
-    def wrapped(*args, **kwargs):
-        if "PYTEST_CURRENT_TEST" in os.environ:
-            raise unittest.SkipTest("does not work under pytest")
-        return fn(*args, **kwargs)
-
-    return wrapped
-
-
-def named_parameters_for_optimized_module(mod):
-    assert isinstance(mod, eval_frame.OptimizedModule)
-    return mod._orig_mod.named_parameters
-
-
-def named_buffers_for_optimized_module(mod):
-    assert isinstance(mod, eval_frame.OptimizedModule)
-    return mod._orig_mod.named_buffers
-
-
-def 
remove_optimized_module_prefix(name): - return re.sub(r"^_orig_mod[.]", "", name) - - -def collect_results(model, prediction, loss, example_inputs): - results = [] - results.append(prediction) - results.append(loss) - # if isinstance(loss, torch.Tensor) and loss.item() > 1: - # log.warning( - # f"High loss value alert - {loss:.2f}. Can result in unstable gradients." - # ) - - grads = dict() - params = dict() - for name, param in model.named_parameters(): - if isinstance(model, eval_frame.OptimizedModule): - name = remove_optimized_module_prefix(name) - param_copy = param - grad = param.grad - # Treat None and zero grad as same - if param.grad is None: - grad = torch.zeros_like(param) - grads[name + ".grad"] = grad - params[name] = param_copy - results.append(grads) - results.append(params) - buffers = dict() - for name, buffer in model.named_buffers(): - if isinstance(model, eval_frame.OptimizedModule): - name = remove_optimized_module_prefix(name) - buffers[name] = buffer - results.append(buffers) - for example in example_inputs: - if isinstance(example, (tuple, list)): - for inp in example: - if isinstance(inp, torch.Tensor): - results.append(inp.grad) - else: - if isinstance(example, torch.Tensor): - results.append(example.grad) - return results - - -def requires_bwd_pass(out): - if isinstance(out, torch.Tensor): - return out.requires_grad - elif isinstance(out, (list, tuple)): - return any(requires_bwd_pass(x) for x in out) - elif out is None: - return False - elif isinstance(out, int): - return False - raise NotImplementedError("Don't know how to reduce", type(out)) - - -def reduce_to_scalar_loss(out): - """Reduce the output of a model to get scalar loss""" - if isinstance(out, torch.Tensor): - # Mean does not work on integer tensors - return out.sum() / out.numel() - elif isinstance(out, (list, tuple)): - return sum([reduce_to_scalar_loss(x) for x in out]) / len(out) - elif type(out).__name__ in ( - "MaskedLMOutput", - "Seq2SeqLMOutput", - "CausalLMOutputWithCrossAttentions", - ): - return reduce_to_scalar_loss(out.logits) - elif type(out).__name__ == "SquashedNormal": - return out.mean.sum() - elif isinstance(out, dict): - return sum([reduce_to_scalar_loss(value) for value in out.values()]) / len( - out.keys() - ) - raise NotImplementedError("Don't know how to reduce", type(out)) - - -def debug_dir(): - path = os.path.join(os.path.dirname(__file__), "../debug") - if not os.path.exists(path): - os.mkdir(path) - return path - - -def debug_dump(name, code: types.CodeType, extra=""): - with open(os.path.join(debug_dir(), name), "w") as fd: - fd.write( - f"{dis.Bytecode(code).info()}\n\n{dis.Bytecode(code).dis()}\n\n{extra}\n" - ) - - -def debug_insert_nops(frame, cache_size, hooks, _): - """used to debug jump updates""" - - def insert_nops(instructions, code_options): - instructions.insert(0, create_instruction("NOP")) - instructions.insert(0, create_instruction("NOP")) - - if is_generator(frame.f_code): - return None - - debug_checks(frame.f_code) - code = transform_code_object(frame.f_code, insert_nops) - graph = OutputGraph( - code_options={}, - compiler_fn=None, - root_tx=None, - export=False, - export_constraints=None, - frame_state={"_id": 0}, - # TODO: shouldn't this be f_locals/f_globals from frame? 
- local_scope=locals(), - global_scope=globals(), - f_code=frame.f_code, - ) - - return GuardedCode(code, CheckFunctionManager(graph).check_fn) - - -class CompileCounter: - def __init__(self): - self.frame_count = 0 - self.op_count = 0 - - def __call__(self, gm: torch.fx.GraphModule, example_inputs): - self.frame_count += 1 - for node in gm.graph.nodes: - if "call" in node.op: - self.op_count += 1 - return gm.forward - - def clear(self): - self.frame_count = 0 - self.op_count = 0 - - -class CompileCounterWithBackend: - def __init__(self, backend): - self.frame_count = 0 - self.op_count = 0 - self.backend = backend - self.graphs = [] - - def __call__(self, gm: torch.fx.GraphModule, example_inputs): - from .backends.registry import lookup_backend - - self.frame_count += 1 - for node in gm.graph.nodes: - if "call" in node.op: - self.op_count += 1 - self.graphs.append(gm) - return lookup_backend(self.backend)(gm, example_inputs) - - -# Equivalent to backend="eager", but also records graphs that -# we can assert on -class EagerAndRecordGraphs: - def __init__(self): - self.graphs = [] - - def __call__(self, gm: torch.fx.GraphModule, example_inputs): - self.graphs.append(gm) - return gm - - -def strip_comment(code): - code = str(code) - return re.sub(r"(?m)^ *#.*\n?", "", code) - - -def remove_trailing_space(code): - return "\n".join([line.rstrip() for line in code.split("\n")]) - - -def normalize_gm(gm_str): - # strip comments as comments have path to files which may differ from - # system to system. - return remove_trailing_space(strip_comment(gm_str)) - - -def standard_test(self, fn, nargs, expected_ops=None, expected_ops_dynamic=None): - if not config.assume_static_by_default and expected_ops_dynamic is not None: - expected_ops = expected_ops_dynamic - - actual = CompileCounter() - if expected_ops is None: - expected = CompileCounter() - try: - gm = torch.fx.symbolic_trace(fn) - expected(gm) - print("\nfx.symbolic_trace graph:") - gm.graph.print_tabular() - expected_ops = expected.op_count - except Exception: - pass # Silently ignore FX errors (not our issue) - - args1 = [torch.randn(10, 10) for _ in range(nargs)] - args2 = [torch.randn(10, 10) for _ in range(nargs)] - correct1 = fn(*args1) - correct2 = fn(*args2) - reset() - opt_fn = optimize_assert(actual)(fn) - val1a = opt_fn(*args1) - val2a = opt_fn(*args2) - val1b = opt_fn(*args1) - val2b = opt_fn(*args2) - reset() - self.assertTrue(same(val1a, correct1)) - self.assertTrue(same(val1b, correct1)) - self.assertTrue(same(val2a, correct2)) - self.assertTrue(same(val2b, correct2)) - self.assertEqual(actual.frame_count, 1) - if expected_ops is not None: - self.assertEqual(actual.op_count, expected_ops) - - -def dummy_fx_compile(gm: fx.GraphModule, example_inputs): - return gm.forward - - -def format_speedup(speedup, pvalue, is_correct=True, pvalue_threshold=0.1): - if not is_correct: - return "ERROR" - if pvalue > pvalue_threshold: - return f"{speedup:.3f}x SAME" - return f"{speedup:.3f}x p={pvalue:.2f}" - - -def rand_strided( - size: Sequence[int], - stride: Sequence[int], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - extra_size: int = 0, -): - needed_size = ( - sum((shape - 1) * stride for shape, stride in zip(size, stride)) - + 1 - + extra_size - ) - if dtype.is_floating_point: - buffer = torch.randn(needed_size, dtype=dtype, device=device) - else: - buffer = torch.zeros(size=[needed_size], dtype=dtype, device=device) - return torch.as_strided(buffer, size, stride) - - -def _make_fn_with_patches(fn, 
*patches): - @functools.wraps(fn) - def _fn(*args, **kwargs): - with contextlib.ExitStack() as stack: - for module, attr, val in patches: - stack.enter_context(patch.object(module, attr, val)) - - return fn(*args, **kwargs) - - return _fn - - -def make_test_cls_with_patches(cls, cls_prefix, fn_suffix, *patches, xfail_prop=None): - class DummyTestClass(cls): - pass - - DummyTestClass.__name__ = f"{cls_prefix}{cls.__name__}" - DummyTestClass.__qualname__ = DummyTestClass.__name__ - - for name in dir(cls): - if name.startswith("test_"): - fn = getattr(cls, name) - if not callable(fn): - continue - new_name = f"{name}{fn_suffix}" - new_fn = _make_fn_with_patches(fn, *patches) - new_fn.__name__ = new_name - if xfail_prop is not None and hasattr(fn, xfail_prop): - new_fn = unittest.expectedFailure(new_fn) - setattr(DummyTestClass, new_name, new_fn) - - return DummyTestClass - - -# test Python 3.11+ specific features -def skipIfNotPy311(fn): - if sys.version_info >= (3, 11): - return fn - return unittest.skip(fn) - - -# Controls tests generated in test/inductor/test_torchinductor_dynamic_shapes.py -# and test/dynamo/test_dynamic_shapes.py -def expectedFailureDynamic(fn): - fn._expected_failure_dynamic = True - return fn - - -# Controls tests generated in test/inductor/test_torchinductor_codegen_dynamic_shapes.py -def expectedFailureCodegenDynamic(fn): - fn._expected_failure_codegen_dynamic = True - return fn - - -# Controls test generated in test/inductor/test_cpp_wrapper.py -def expectedFailureDynamicWrapper(fn): - fn._expected_failure_dynamic_wrapper = True - return fn diff --git a/userbenchmark/dynamo/_dynamo/utils.py b/userbenchmark/dynamo/_dynamo/utils.py deleted file mode 100644 index 471dbcc552..0000000000 --- a/userbenchmark/dynamo/_dynamo/utils.py +++ /dev/null @@ -1,2048 +0,0 @@ -import atexit -import collections -import contextlib -import copy -import cProfile -import dataclasses -import datetime -import dis -import enum -import functools -import gc -import inspect -import itertools -import linecache -import logging -import math -import operator -import os -import pstats -import sys -import textwrap -import time -import types -import typing -import weakref -from contextlib import contextmanager -from functools import lru_cache, wraps -from typing import Any, Dict, Optional, Tuple, Union - -import numpy as np - -# import torch._logging -# import torch._numpy as tnp -# from torch._guards import detect_fake_mode # noqa: F401 -from torch._dynamo import config - - -# NOTE: Make sure `NP_SUPPORTED_MODULES` and `NP_TO_TNP_MODULE` are in sync. -NP_SUPPORTED_MODULES = (np, np.fft, np.linalg, np.random) - -# NP_TO_TNP_MODULE = { -# np: tnp, -# np.fft: tnp.fft, -# np.linalg: tnp.linalg, -# np.random: tnp.random, -# } - -import importlib - -import torch -import torch._functorch.config -import torch.fx.experimental.symbolic_shapes -from torch import fx -from torch._dispatch.python import enable_python_dispatcher -from torch._subclasses.fake_tensor import FakeTensor -from torch.nn.modules.lazy import LazyModuleMixin -from torch.utils._pytree import tree_map - - -counters = collections.defaultdict(collections.Counter) -troubleshooting_url = "https://pytorch.org/docs/master/compile/troubleshooting.html" -nnmodule_doc_url = "https://pytorch.org/docs/master/compile/nn-module.html" -nnmodule_doc_url_msg = f"See {nnmodule_doc_url} for more information and limitations." 
-log = logging.getLogger(__name__) - -# profiling compilation time by function -compilation_time_metrics = collections.OrderedDict() - -# profiling compilation time by frame phase -frame_phase_timing = collections.OrderedDict() - -timer_counter = itertools.count() - - -def tabulate(rows, headers): - try: - import tabulate - - return tabulate.tabulate(rows, headers=headers) - except ImportError: - return "\n".join( - ", ".join(map(str, row)) for row in itertools.chain([headers], rows) - ) - - -def dynamo_profiled(func): - @wraps(func) - def profile_wrapper(*args, **kwargs): - global timer_counter - datafn = ( - func.__name__ + f"{next(timer_counter)}.profile" - ) # Name the data file sensibly - prof = cProfile.Profile() - prof.enable() - retval = prof.runcall(func, *args, **kwargs) - prof.disable() - print(f"### Cprofile for {func.__name__} iter {next(timer_counter)} ###") - ps = pstats.Stats(prof) - ps.sort_stats(pstats.SortKey.TIME).print_stats(20) - ps.sort_stats(pstats.SortKey.CUMULATIVE).print_stats(20) - prof.dump_stats(datafn) - return retval - - return profile_wrapper - - -curr_frame = 0 - - -# Note: Called for you by dynamo - you almost never ever want to invoke this yourself. -def increment_frame(): - global curr_frame - curr_frame = curr_frame + 1 - - -# Note: Called for you by dynamo - you almost never ever want to invoke this yourself. -def reset_frame_count(): - global curr_frame - frame_phase_timing.clear() - compilation_time_metrics.clear() - curr_frame = 0 - - -op_count = 0 - - -def increment_op_count(cnt): - global op_count - op_count += cnt - - -# Print a report of time spent so far -# Ex: -# TIMING: -# entire_frame_compile:8.574629999999999 -# backend_compile:5.26806 -def print_time_report(): - total = 0 - total_by_key = {} - for timings in frame_phase_timing.values(): - for key, timing in timings.items(): - total += timing - if key not in total_by_key: - total_by_key[key] = timing - else: - total_by_key[key] += timing - - out = "TIMING:" - for key, value in total_by_key.items(): - out = f"{out} {key}:{round(value, 5)}" - - print(out) - - -# dynamo_timed API works as a function decorator -# By wrapping a function in dynamo_timed, we can store a record in compilation_time_metrics -# where the key is the functions name. -# For example: -# -# @dynamo_timed -# def _foo(...): -# -# Would show up as an entry in our timing dict: -# OrderedDict([('bar.._foo', [0.083690, 0.23949, 3.1425e-05])]) -# This is extremely useful for granular debugging. -# -# For a higher-level mode, pass a phase_name into dynamo_timed -# phase_names record an extra record into a separate compilation timing structure, -# one keyed on frame+name rather than function. -# The frame is incremented outside of this function, in def increment_frame() above. 
-def dynamo_timed(original_function=None, phase_name=None): - def dynamo_timed_inner(func): - @wraps(func) - def time_wrapper(*args, **kwargs): - key = func.__qualname__ - if key not in compilation_time_metrics: - compilation_time_metrics[key] = [] - with torch.profiler.record_function(f"{key} (dynamo_timed)"): - t0 = time.time() - r = func(*args, **kwargs) - time_spent = time.time() - t0 - compilation_time_metrics[key].append(time_spent) - if phase_name: - frame_key = str(curr_frame) - if frame_key not in frame_phase_timing: - frame_phase_timing[frame_key] = {} - assert ( - phase_name not in frame_phase_timing[frame_key] - ), f"Duplicate phase name {phase_name} for frame {frame_key}" - frame_phase_timing[frame_key][phase_name] = time_spent - return r - - return time_wrapper - - if original_function: - return dynamo_timed_inner(original_function) - return dynamo_timed_inner - - -def compile_times(repr="str", aggregate=False): - """ - Get metrics about torchdynamo frontend/backend compilation times. - - Accumulates information from functions tagged with `@dynamo_timed`. - - repr='str' returns a printable string for user interaction, and 'csv' - returns headers, rows which can be logged for output - - aggregate causes values from multiple compilations (e.g. split graphs) - to be accumulated into one value. If false, expect more than one value - per metric. - """ - - def fmt_fn(values, item_fn=lambda x: x): - if aggregate: - return item_fn(sum(values)) - return ", ".join(map(item_fn, values)) - - if repr == "str": - rows = [ - (k, fmt_fn(compilation_time_metrics[k], item_fn=lambda x: f"{x:.4f}")) - for k in compilation_time_metrics - ] - out = "TorchDynamo compilation metrics:\n" - out += tabulate(rows, headers=("Function", "Runtimes (s)")) - return out - elif repr == "csv": - values = [ - fmt_fn(v, item_fn=lambda x: f"{x:.6f}") - for v in compilation_time_metrics.values() - ] - headers = list(compilation_time_metrics.keys()) - return headers, values - - -@atexit.register -def dump_compile_times(): - log.info(compile_times(repr="str", aggregate=True)) - - -tensortype_to_dtype = { - torch.FloatTensor: (torch.float32, torch.float), - torch.DoubleTensor: (torch.float64, torch.double), - torch.HalfTensor: (torch.float16, torch.half), - torch.BFloat16Tensor: (torch.bfloat16,), - torch.ByteTensor: (torch.uint8,), - torch.CharTensor: (torch.int8,), - torch.LongTensor: (torch.int64, torch.long), - torch.IntTensor: (torch.int32, torch.int), - torch.ShortTensor: (torch.int16, torch.short), - torch.BoolTensor: (torch.bool,), -} - - -class DuplicateWarningChecker: - def __init__(self, maxsize=4096): - self.maxsize = maxsize - self.reset() - - def reset(self): - self.set = collections.OrderedDict() - - def add(self, key): - if key in self.set: - self.set.move_to_end(key, last=True) - if not config.verbose: - return False - else: - self.set[key] = None - while len(self.set) > self.maxsize: - self.set.popitem(last=False) - return True - - -graph_break_dup_warning_checker = DuplicateWarningChecker() - - -def setup_compile_debug(): - compile_debug = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1" - - if compile_debug: - torch._logging.set_logs( - dynamo=logging.DEBUG, - aot=logging.DEBUG, - inductor=logging.DEBUG, - output_code=True, # this is off by default - ) - return add_file_handler() - - return contextlib.ExitStack() - - -def reset_graph_break_dup_checker(): - graph_break_dup_warning_checker.reset() - - -def add_file_handler(): - log_path = os.path.join(get_debug_dir(), "torchdynamo") - if not 
os.path.exists(log_path): - os.makedirs(log_path) - - log_file_handler = logging.FileHandler(os.path.join(log_path, "debug.log")) - logger = logging.getLogger("torch._dynamo") - logger.addHandler(log_file_handler) - - exitstack = contextlib.ExitStack() - exitstack.callback(lambda: logger.removeHandler(log_file_handler)) - return exitstack - - -def setup_log_file(): - exitstack = contextlib.ExitStack() - if config.log_file_name is not None: - log_file_handler = logging.FileHandler(config.log_file_name) - for logger in logging.get_loggers(): - logger.addHandler(log_file_handler) - exitstack.callback(lambda: logger.removeHandler(log_file_handler)) - return exitstack - - return exitstack - - -def gen_record_file_name(exc, code): - return f"{get_debug_dir()}/error_recordings/\ -{code.co_name}_{type(exc).__name__}_{code.co_firstlineno}.rec" - - -def write_record_to_file(filename, exec_record): - try: - if os.path.exists(filename): - log.warning( - "Unable to write execution record %s; file already exists.", filename - ) - else: - os.makedirs(os.path.dirname(filename), exist_ok=True) - with open(filename, "wb") as f: - exec_record.dump(f) - except Exception: - log.error("Unable to write execution record %s", filename, exc_info=1) - - -def count_calls(g: fx.Graph): - c = 0 - for n in g.nodes: - if "call" in n.op: - c += 1 - return c - - -def identity(x): - return x - - -def nothing(*args, **kwargs): - pass - - -class ExactWeakKeyDictionary: - """Similar to weakref.WeakKeyDictionary, but use `is`/`id` rather than `==` to compare equality""" - - def __init__(self): - self.values = dict() - self.refs = dict() - - def __getitem__(self, key): - return self.values[id(key)] - - def get(self, key, default=None): - return self.values.get(id(key), default) - - def __contains__(self, key): - return id(key) in self.values - - def __setitem__(self, key, value): - idx = id(key) - if idx not in self.refs: - self.refs[idx] = weakref.ref(key, lambda ref: self._remove_id(idx)) - self.values[idx] = value - - def _remove_id(self, idx): - if idx in self.values: - del self.values[idx] - if idx in self.refs: - del self.refs[idx] - - def clear(self): - self.refs.clear() - self.values.clear() - - -def istype(obj, allowed_types): - """isinstance() without subclasses""" - if isinstance(allowed_types, (tuple, list, set)): - return type(obj) in allowed_types - return type(obj) is allowed_types - - -def is_typing(value): - if sys.version_info < (3, 9): - return isinstance(value, typing._GenericAlias) - else: - return isinstance( - value, (typing._SpecialGenericAlias, typing._UnionGenericAlias) - ) - - -def is_numpy_int_type(value): - return istype( - value, - ( - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - ), - ) - - -def is_numpy_float_type(value): - return istype( - value, - ( - np.float16, - np.float32, - np.float64, - ), - ) - - -def is_numpy_ndarray(value): - return istype(value, np.ndarray) - - -def istensor(obj): - """Check of obj is a tensor""" - tensor_list = ( - torch.Tensor, - torch.nn.Parameter, - *config.traceable_tensor_subclasses, - ) - tensor_list = tensor_list + (torch._subclasses.FakeTensor,) - return istype(obj, tensor_list) - - -def is_lazy_module(mod): - return isinstance(mod, LazyModuleMixin) - - -@functools.lru_cache(4096) -def print_once(*args): - print(*args) - - -def make_cell(val=None): - """Some black magic to create a cell object that usually only exists in a closure""" - x = val - - def f(): - return x - - assert len(f.__closure__) == 1 - 
return f.__closure__[0] - - -def proxy_args_kwargs(args, kwargs): - try: - proxy_args = tuple(arg.as_proxy() for arg in args) - proxy_kwargs = {key: arg.as_proxy() for key, arg in kwargs.items()} - return proxy_args, proxy_kwargs - except NotImplementedError as e: - from .exc import unimplemented - from .variables.base import typestr - - raise unimplemented( - f"call_function args: {typestr(*args)} {typestr(*list(kwargs.values()))}" - ) from e - - -@dataclasses.dataclass -class CompilationMetrics: - frame_key: str - co_name: str - co_filename: str - co_firstlineno: int - cache_size: int - guard_count: Optional[int] - graph_op_count: Optional[int] - graph_node_count: Optional[int] - graph_input_count: Optional[int] - entire_frame_compile_time_s: Optional[float] - backend_compile_time_s: Optional[float] - fail_reason: Optional[str] - - -@dataclasses.dataclass -class CleanupHook: - """Remove a global variable when hook is called""" - - scope: Dict[str, Any] - name: str - - def __call__(self, *args): - CleanupManager.count -= 1 - del self.scope[self.name] - - @staticmethod - def create(scope, name, val): - assert name not in scope - CleanupManager.count += 1 - scope[name] = val - return CleanupHook(scope, name) - - -class CleanupManager(ExactWeakKeyDictionary): - count = 0 - - def _remove_id(self, idx): - for hook in self.values[idx]: - hook() - super()._remove_id(idx) - - -CleanupManager.instance = CleanupManager() - - -def clone_tensor(x): - """Clone the tensor and its gradient""" - y = x.clone().requires_grad_(x.requires_grad) - if x.is_leaf and x.grad is not None: - y.grad = x.grad.clone() - return y - - -def clone_input(x, *, dtype=None): - """copy while preserving strides""" - # TODO: this is questionable - if isinstance(x, torch._subclasses.FakeTensor): - # this func fails on fake tensors in __torch_dispatch__ - return x - - def torch_clone(x): - y = torch.clone(x) - if x.is_leaf: - y.requires_grad_(x.requires_grad) - if x.is_leaf and x.grad is not None: - y.grad = clone_input(x.grad, dtype=dtype) - if hasattr(x, "_dynamo_dynamic_indices"): - y._dynamo_dynamic_indices = x._dynamo_dynamic_indices.copy() - return y - - with torch.no_grad(): - if x.device.type == "xla": - # Access data_ptr() for a xla tensor will cause crash - return torch_clone(x) - - needed_size = sum( - (shape - 1) * stride for shape, stride in zip(x.size(), x.stride()) - ) - if x.is_quantized: - result = torch.empty_quantized((needed_size + 32,), x) - else: - result = torch.empty( - needed_size + 32, dtype=dtype or x.dtype, device=x.device - ) - cache_line_offset = ( - (x.data_ptr() - result.data_ptr()) % 32 - ) // x.element_size() - result.as_strided_(x.size(), x.stride(), cache_line_offset) - try: - result.copy_(x.clone()) - if x.is_leaf: - result.requires_grad_(x.requires_grad) - if x.is_leaf and x.grad is not None: - result.grad = clone_input(x.grad, dtype=dtype) - except RuntimeError: - # RuntimeError: unsupported operation: more than one element of the written-to - # tensor refers to a single memory location. Please clone() the tensor before - # performing the operation. 
- return torch_clone(x) - if hasattr(x, "_dynamo_dynamic_indices"): - result._dynamo_dynamic_indices = x._dynamo_dynamic_indices.copy() - return result - - -def clone_inputs(example_inputs): - if type(example_inputs) is dict: - res = dict(example_inputs) - for key, value in res.items(): - if isinstance(value, tuple): - res[key] = clone_inputs(value) - else: - assert isinstance(value, torch.Tensor), type(value) - res[key] = clone_input(value) - return res - - res = list(example_inputs) - for i in range(len(res)): - if isinstance(res[i], torch.Tensor): - res[i] = clone_input(res[i]) - return res - - -@contextmanager -def preserve_rng_state(): - with torch.utils._python_dispatch._disable_current_modes(): - rng_state = torch.clone(torch.random.get_rng_state()) - if torch.cuda.is_available(): - cuda_rng_state = torch.clone(torch.cuda.get_rng_state()) - try: - yield - finally: - with torch.utils._python_dispatch._disable_current_modes(): - torch.random.set_rng_state(rng_state) - if torch.cuda.is_available(): - torch.cuda.set_rng_state(cuda_rng_state) - - -def is_jit_model(model0): - return isinstance( - model0, - ( - torch.jit._trace.TopLevelTracedModule, - torch.jit._script.RecursiveScriptModule, - torch.jit.ScriptFunction, - torch.jit.ScriptModule, - ), - ) - - -def torchscript(model, example_inputs, verbose=False): - if is_jit_model(model): - # already done? - return model - - try: - return torch.jit.trace(model, example_inputs) - except Exception: - try: - return torch.jit.script(model) - except Exception: - if verbose: - log.exception("jit error") - else: - log.error("Both torch.jit.trace and torch.jit.script failed") - return None - - -def getfile(obj): - try: - return inspect.getfile(obj) - except TypeError: - return None - - -def is_namedtuple(obj): - """Test if an object is a namedtuple or a torch.return_types.* quasi-namedtuple""" - return is_namedtuple_cls(type(obj)) - - -def is_namedtuple_cls(cls): - """Test if an object is a namedtuple or a torch.return_types.* quasi-namedtuple""" - try: - if issubclass(cls, tuple): - bases = getattr(cls, "__bases__", []) or [None] - module = getattr(cls, "__module__", None) - return module == "torch.return_types" or ( - bases[0] is tuple and hasattr(cls, "_make") and hasattr(cls, "_fields") - ) - except TypeError: - pass - return False - - -@functools.lru_cache(1) -def namedtuple_fields(cls): - """Get the fields of a namedtuple or a torch.return_types.* quasi-namedtuple""" - if cls is slice: - return ["start", "stop", "step"] - - assert issubclass(cls, tuple) - if hasattr(cls, "_fields"): - # normal namedtuples - return cls._fields - - @dataclasses.dataclass - class Marker: - index: int - - # frustrating ones e.g. 
torch.return_types.max - assert cls.__module__ == "torch.return_types" - obj = cls(map(Marker, range(cls.n_fields))) - fields = [None] * cls.n_fields - for name in dir(obj): - if name[0] != "_" and isinstance(getattr(obj, name), Marker): - fields[getattr(obj, name).index] = name - return fields - - -def checkpoint_params(gm): - with torch.no_grad(): - rng_state = torch.clone(torch.random.get_rng_state()) - if torch.cuda.is_available(): - cuda_rng_state = torch.clone(torch.cuda.get_rng_state()) - saved_state = [] - for param in itertools.chain(gm.parameters(), gm.buffers()): - saved_state.append((param, param._version, torch.clone(param))) - - def restore(): - with torch.no_grad(): - torch.random.set_rng_state(rng_state) - if torch.cuda.is_available(): - torch.cuda.set_rng_state(cuda_rng_state) - for param, version, original_value in saved_state: - if param._version != version: - param.copy_(original_value) - - return restore - - -def timed(model, example_inputs, times=1): - if torch.cuda.is_available(): - synchronize = torch.cuda.synchronize - else: - synchronize = nothing - - synchronize() - gc.collect() - torch.manual_seed(1337) - t0 = time.perf_counter() - for _ in range(times): - result = model(*example_inputs) - synchronize() - t1 = time.perf_counter() - return result, t1 - t0 - - -def check_is_cuda(gm, example_inputs): - return all(x.is_cuda for x in itertools.chain(example_inputs, gm.parameters(True))) - - -@lru_cache(32) -def rot_n_helper(n): - assert n > 1 - vars = [f"v{i}" for i in range(n)] - rotated = reversed(vars[-1:] + vars[:-1]) - fn = eval(f"lambda {','.join(vars)}: ({','.join(rotated)})") - fn.__name__ = f"rot_{n}_helper" - return fn - - -def is_safe_constant(v): - if istype(v, (tuple, frozenset)): - return all(map(is_safe_constant, v)) - return isinstance(v, (enum.Enum, type)) or istype( - v, - ( - types.CodeType, - int, - float, - bool, - str, - bytes, - type(None), - slice, - type(type), - torch.device, - torch.dtype, - ), - ) - - -def guard_if_dyn(arg): - from .variables import ConstantVariable, SymNodeVariable - - if isinstance(arg, SymNodeVariable): - # This is because SymNodeVariable intentionally doesn't define - # as_python_constant to avoid shunting down some codepaths - # that expect consts. In this case, we know we definitely - # want to specialize though. 
- return arg.evaluate_expr() - elif isinstance(arg, ConstantVariable): - return arg.as_python_constant() - - return arg - - -def check_constant_args(args, kwargs): - return all(x.is_python_constant() for x in itertools.chain(args, kwargs.values())) - - -def check_unspec_python_args(args, kwargs): - from torch._dynamo.variables.constant import ConstantVariable - from torch._dynamo.variables.tensor import UnspecializedPythonVariable - - unspec_count = 0 - for x in itertools.chain(args, kwargs.values()): - if isinstance(x, UnspecializedPythonVariable): - unspec_count += 1 - elif not isinstance(x, (UnspecializedPythonVariable, ConstantVariable)): - return False - else: - pass - - return unspec_count > 0 - - -def check_numpy_ndarray_args(args, kwargs): - from torch._dynamo.variables.tensor import NumpyNdarrayVariable - - return any( - isinstance(x, NumpyNdarrayVariable) - for x in itertools.chain(args, kwargs.values()) - ) - - -def specialize_args_kwargs(tx, args, kwargs): - specialized_args = [] - specialized_kwargs = {} - for x in args: - specialized_args.append(x.as_specialized(tx)) - for k, v in kwargs.items(): - specialized_kwargs.update({k: v.as_specialized(tx)}) - return specialized_args, specialized_kwargs - - -dict_values = type(dict().values()) -odict_values = type(collections.OrderedDict().values()) -tuple_iterator = type(iter(tuple())) -tuple_iterator_len = tuple_iterator.__length_hint__ -object_new = object.__new__ - - -def nn_module_new(cls): - obj = object_new(cls) - torch.nn.Module.__init__(obj) - return obj - - -def product(it): - return functools.reduce(operator.mul, it, 1) - - -def tuple_iterator_getitem(it, index): - _, (obj,), start = it.__reduce__() - return obj[start + index] - - -def enum_repr(value, local): - # enum class can override __str__ method. Use __class__ and name attribute - # to extract the class name and key name. - name = value.__class__.__name__ - val = value.name - scope = "L" if local else "G" - local_name = f'{scope}["{name}"].{val}' - return local_name - - -def dict_param_key_ids(value): - return { - id(k) for k in value.keys() if isinstance(k, (torch.nn.Parameter, torch.Tensor)) - } - - -def dict_const_keys(value): - return { - k for k in value.keys() if not isinstance(k, (torch.nn.Parameter, torch.Tensor)) - } - - -def dict_const_keys_repr(const_keys, *, local): - if any(isinstance(k, enum.Enum) for k in const_keys): - # To workaround repr(Enum) returning invalid global reference before python 3.11 - # by calling enum_repr and removing quotes to render enum in guard code. - const_keys_str = f"{ {enum_repr(k, local=local) if isinstance(k, enum.Enum) else repr(k) for k in const_keys} }".replace( - "'", "" - ) - else: - const_keys_str = f"{const_keys!r}" - return const_keys_str - - -def global_key_name(key): - return f"__dict_key_{id(key)}" - - -from torch._subclasses import ( # noqa: F401 - FakeTensorMode, - UnsupportedFakeTensorException, -) - - -def wrap_fake_exception(fn): - try: - return fn() - except UnsupportedFakeTensorException as e: - from .exc import unimplemented - - msg = f"Unsupported: {e.reason} with fake tensor propagation." 
- log.warning(msg) - raise unimplemented(msg) from e - - -def deepcopy_to_fake_tensor(obj, fake_mode): - with torch._subclasses.fake_tensor.FakeCopyMode(fake_mode): - return wrap_fake_exception(lambda: copy.deepcopy(obj)) - - -def rmse(ref, res): - """ - Calculate root mean squared error - """ - return torch.sqrt(torch.mean(torch.square(ref - res))) - - -def same( - ref, - res, - fp64_ref=None, - cos_similarity=False, - tol=1e-4, - equal_nan=False, - exact_dtype=True, - relax_numpy_equality=False, - ignore_non_fp=False, - log_error=log.error, -): - """Check correctness to see if ref and res match""" - if fp64_ref is None: - fp64_ref = ref - if isinstance(ref, (list, tuple, torch.nn.ParameterList, torch.Size)): - assert isinstance(res, (list, tuple)), f"type mismatch {type(ref)} {type(res)}" - if len(ref) != len(res): - log_error("Length mismatch") - return False - return len(ref) == len(res) and all( - same( - ai, - bi, - fp64_refi, - cos_similarity, - tol, - equal_nan, - exact_dtype, - relax_numpy_equality, - ignore_non_fp, - log_error=log_error, - ) - for ai, bi, fp64_refi in zip(ref, res, fp64_ref) - ) - elif isinstance(ref, dict): - assert isinstance(res, dict) - assert set(ref.keys()) == set( - res.keys() - ), f"keys mismatch {set(ref.keys())} == {set(res.keys())}" - for k in sorted(ref.keys()): - if not ( - same( - ref[k], - res[k], - fp64_ref[k], - cos_similarity=cos_similarity, - tol=tol, - equal_nan=equal_nan, - exact_dtype=exact_dtype, - relax_numpy_equality=relax_numpy_equality, - ignore_non_fp=ignore_non_fp, - log_error=log_error, - ) - ): - log_error("Accuracy failed for key name %s", k) - return False - return True - elif isinstance(ref, torch.Tensor): - assert not isinstance(ref, torch._subclasses.FakeTensor) - assert not isinstance(res, torch._subclasses.FakeTensor) - - if ref.is_sparse: - assert res.is_sparse - ref = ref.to_dense() - res = res.to_dense() - assert isinstance(res, torch.Tensor), f"type mismatch {type(ref)} {type(res)}" - if exact_dtype: - if ref.dtype != res.dtype: - log_error("dtype mismatch %s, %s", ref.dtype, res.dtype) - return False - if ref.dtype == torch.bool: - if ignore_non_fp: - return True - # triton stores bool as int8, so add this for more accurate checking - r = torch.allclose( - ref.to(dtype=torch.uint8), - res.to(dtype=torch.uint8), - atol=tol, - rtol=tol, - equal_nan=equal_nan, - ) - if not r: - log_error("Accuracy failed: uint8 tensor did not match") - return r - - if cos_similarity: - ref = ref.flatten().to(torch.float32) - res = res.flatten().to(torch.float32) - if torch.allclose(ref, res, atol=tol, rtol=tol, equal_nan=True): - # early exit that handles zero/nan better - # cosine_similarity(zeros(10), zeros(10), dim=0) is 0 - return True - score = torch.nn.functional.cosine_similarity(ref, res, dim=0, eps=1e-6) - if score < 0.99: - log.warning("Similarity score=%s", score.cpu().detach().item()) - return score >= 0.99 - else: - if not exact_dtype: - ref = ref.to(res.dtype) - - # First try usual allclose - if torch.allclose(ref, res, atol=tol, rtol=tol, equal_nan=equal_nan): - return True - - # Check error from fp64 version - if fp64_ref.dtype == torch.float64: - ref_error = rmse(fp64_ref, ref).item() - res_error = rmse(fp64_ref, res).item() - multiplier = 2.0 - - if ( - fp64_ref.numel() < 1000 - or (ref.ndim == 4 and ref.shape[-1] == ref.shape[-2] == 1) - # large tol means a benchmark has been specified as REQUIRE_HIGHER_TOLERANCE - or tol >= 2 * 1e-2 - ): - # In the presence of noise, noise might dominate our error - # metric for smaller 
tensors. - # Similary, for 1x1 kernels, there seems to be high noise with amp. - multiplier = 3.0 - - passes_test = res_error <= (multiplier * ref_error + tol / 10.0) - if not passes_test: - log_error( - "RMSE (res-fp64): %.5f, (ref-fp64): %.5f and shape=%s", - res_error, - ref_error, - res.size(), - ) - # import pdb; pdb.set_trace() - return passes_test - - if ignore_non_fp: - return True - - log_error("Accuracy failed: allclose not within tol=%s", tol) - return False - elif isinstance(ref, (str, int, type(None), bool, torch.device)): - if ignore_non_fp: - return True - r = ref == res - if not r: - log_error("Accuracy failed (%s): %s != %s", type(ref), ref, res) - return r - elif isinstance(ref, float): - r = math.isclose(ref, res, rel_tol=tol, abs_tol=tol) - if not r: - log_error( - "Accuracy failed (float): %s != %s (within tol=%s)", ref, res, tol - ) - return r - elif is_numpy_int_type(ref) or is_numpy_float_type(ref): - if relax_numpy_equality and not ( - is_numpy_int_type(res) or is_numpy_float_type(res) - ): - ref = ref.item() - r = (type(ref) is type(res)) and (ref == res) - if not r: - log_error("Accuracy failed (numpy): %s != %s", ref, res) - return r - elif is_numpy_ndarray(ref): - return (type(ref) is type(res)) and same( - torch.as_tensor(ref), - torch.as_tensor(res), - fp64_ref, - cos_similarity=cos_similarity, - tol=tol, - equal_nan=equal_nan, - exact_dtype=exact_dtype, - relax_numpy_equality=relax_numpy_equality, - ignore_non_fp=ignore_non_fp, - log_error=log_error, - ) - elif type(ref).__name__ in ( - "MaskedLMOutput", - "Seq2SeqLMOutput", - "CausalLMOutputWithCrossAttentions", - "LongformerMaskedLMOutput", - "Instances", - "SquashedNormal", - "Boxes", - "Normal", - "TanhTransform", - "Foo", - "Variable", - ): - assert type(ref) is type(res) - return all( - same( - getattr(ref, key), - getattr(res, key), - getattr(fp64_ref, key), - cos_similarity=cos_similarity, - tol=tol, - equal_nan=equal_nan, - exact_dtype=exact_dtype, - relax_numpy_equality=relax_numpy_equality, - ignore_non_fp=ignore_non_fp, - log_error=log_error, - ) - for key in ref.__dict__.keys() - ) - else: - raise RuntimeError(f"unsupported type: {type(ref).__name__}") - - -def format_func_info(code): - short_filename = code.co_filename.split("/")[-1] - return f"'{code.co_name}' ({short_filename}:{code.co_firstlineno})" - - -@contextlib.contextmanager -def disable_cache_limit(): - prior = config.cache_size_limit - config.cache_size_limit = sys.maxsize - - try: - yield - finally: - config.cache_size_limit = prior - - -# map from transformed code back to original user code -orig_code_map = ExactWeakKeyDictionary() - -# keep a record of code_obj -> list of guard failure reasons for logging -guard_failures = collections.defaultdict(list) - -# Keep a record of graph break reasons for logging -graph_break_reasons = list() - -# keep record of compiled code, if we are in "error if recompile" -# to track code that dynamo has compiled previously -seen_code_map = ExactWeakKeyDictionary() - - -class CompileProfiler: - """Utility for profiling how and what dynamo would compile. 
- - Can be used for - * diagnosing recompilation issues - * determining an appropriate compile cache limit - * (TODO)confirming which functions got compiled/skipped - """ - - def __init__(self): - self.frame_count = 0 - self.op_count = 0 - self.backend_ctx_ctor = lambda: disable_cache_limit() - - def __call__(self, gm: torch.fx.GraphModule, example_inputs): - self.frame_count += 1 - for node in gm.graph.nodes: - if "call" in node.op: - self.op_count += 1 - return gm.forward - - def __enter__(self): - self.old_report_guard_failure = config.report_guard_failures - config.report_guard_failures = True - return self - - def __exit__(self, typ, val, traceback): - config.report_guard_failures = self.old_report_guard_failure - - def get_metrics(self): - return {"guard_failures": guard_failures} - - def report(self): - metrics = self.get_metrics() - gf = metrics["guard_failures"] - - def num_recompiles(code): - return len(gf[code]) - - def recompile_reasons(code): - return "\n".join([str(x) for x in gf[code]]) - - summarized_gf = [ - [format_func_info(code), num_recompiles(code), recompile_reasons(code)] - for code in gf - ] - - def graph_break_report(): - if "graph_break" in counters: - graph_breaks = counters["graph_break"] - return tabulate( - [[msg, graph_breaks[msg]] for msg in graph_breaks], - headers=["Graph Break Reason", "Count"], - ) - - def recompilation_report(): - if len(gf): - max_recompiles = max([num_recompiles(code) for code in gf]) - recomp_table = tabulate( - summarized_gf, - headers=["Function", "Recompiles", "Recompile Reasons"], - ) - return recomp_table + textwrap.dedent( - f""" - - Set torch._dynamo.config.cache_size_limit to {max_recompiles} to avoid being cache limited. - """ - ) - - report = textwrap.dedent( - """ - Torchdynamo Profiler Report - =========================== - - Graph Breaks - ------------ - Graph breaks happen when torchdynamo encounters code it can't safely trace. - If you want to find out why breaks are happening, check below for each break reason - You may gain additional insight by passing `fullgraph=True` to torch.compile, - to stop at the first break. - - """ - ) - report += graph_break_report() or "No graph breaks detected." - report += textwrap.dedent( - """ - - Recompilation - ------------- - These subgraphs were recompiled more than once due to guard failures - Guard failures indicate some condition assumed to be static by the tracer changed, - making it unsafe to reuse the compiled program. - - """ - ) - report += recompilation_report() or "No recompilation detected.\n" - return report - - -# return same dir unless user changes config between calls -@functools.lru_cache(None) -def _get_debug_dir(root_dir): - dir_name = ( - "run_" - + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f") - # use pid to avoid conflicts among ranks - + "-pid_" - + str(os.getpid()) - ) - return os.path.join(root_dir, dir_name) - - -def get_debug_dir(): - debug_root = config.debug_dir_root - return _get_debug_dir(debug_root) - - -def get_fake_value(node, tx): - """ - Run the computation represented by `node` using fake tensors and return the result. 
- """ - from .exc import ( - TorchRuntimeError, - unimplemented, - Unsupported, - UserError, - UserErrorType, - ) - - op = node.op - - def fake_wrapper(e): - if isinstance(e, torch.Tensor): - assert is_fake(e) - return e - - def visit(n: torch.fx.Node): - return n.meta["example_value"] - - args, kwargs = torch.fx.node.map_arg((node.args, node.kwargs), visit) - args = tree_map(fake_wrapper, args) - kwargs = tree_map(fake_wrapper, kwargs) - - nnmodule = None - if op == "call_method" and len(args) > 0 and isinstance(args[0], torch.nn.Module): - # If the first argument is nn.Module, should copy to fake mode. - args = (deepcopy_to_fake_tensor(args[0], tx.fake_mode),) + tuple(args[1:]) - - if op == "call_module": - nnmodule = tx.output.nn_modules[node.target] - - if is_lazy_module(nnmodule) and hasattr(nnmodule, "_initialize_hook"): - # In the case of a lazy module, we want to run - # the pre-hooks which initialize it. - # Afterwards, lazy module deletes its pre-hooks - # to avoid treating it as lazy on subsequent recompile. - nnmodule._infer_parameters(nnmodule, args) - - # no matter it's lazy module or not, we should copy to fake mode. - nnmodule = deepcopy_to_fake_tensor(nnmodule, tx.fake_mode) - - try: - with tx.fake_mode, enable_python_dispatcher(): - return wrap_fake_exception( - lambda: run_node(tx.output, node, args, kwargs, nnmodule) - ) - except Unsupported: - raise - except RuntimeError as e: - cause = e - if e.__cause__ is not None: - cause = e.__cause__ - - if isinstance( - cause, torch._subclasses.fake_tensor.DataDependentOutputException - ): - unimplemented(f"data dependent operator: {cause.func}") - elif isinstance( - cause, torch._subclasses.fake_tensor.DynamicOutputShapeException - ): - unimplemented(f"dynamic shape operator: {cause.func}") - elif isinstance( - cause, torch._subclasses.fake_tensor.UnsupportedOperatorException - ): - unimplemented( - f"unsupported operator: {cause.func} (see " - "https://docs.google.com/document/d/1GgvOe7C8_NVOMLOCwDaYV1mXXyHMXY7ExoewHqooxrs/edit#heading=h.64r4npvq0w0" - " for how to fix)" - ) - elif isinstance( - cause, torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode - ): - unimplemented("guard on data-dependent symbolic int/float") - elif isinstance(cause, torch.utils._sympy.value_ranges.ValueRangeError): - raise UserError(UserErrorType.CONSTRAIN_VIOLATION, e.args[0]) from e - raise TorchRuntimeError(str(e)).with_traceback(e.__traceback__) from None - - -def run_node(tracer, node, args, kwargs, nnmodule): - """ - Runs a given node, with the given args and kwargs. - - Behavior is dicatated by a node's op. - - run_node is useful for extracting real values out of nodes. - See get_real_value for more info on common usage. - - Note: The tracer arg is only used for 'get_attr' ops - Note: The nnmodule arg is only used for 'call_module' ops - - Nodes that are not call_function, call_method, call_module, or get_attr will - raise an AssertionError. 
- """ - op = node.op - try: - if op == "call_function": - return node.target(*args, **kwargs) - elif op == "call_method": - return getattr(args[0], node.target)(*args[1:], **kwargs) - elif op == "call_module": - assert nnmodule is not None - return nnmodule(*args, **kwargs) - elif op == "get_attr": - return tracer.get_submodule(node.target) - elif op == "placeholder": - assert "example_value" in node.meta - return node.meta["example_value"] - except Exception as e: - fn_str = f"Failed running {op} {node.target}(*{args}, **{kwargs}):\n" - raise RuntimeError(fn_str + str(e)).with_traceback(e.__traceback__) from e - - raise AssertionError(op) - - -def get_real_value(node, tracer): - """ - Run the actual computation represented by `node` and return the result. - This will execute any dependent nodes in the graph as well. - """ - from .exc import TorchRuntimeError - - cache = tracer.real_value_cache - if node in cache: - return cache[node] - - op = node.op - args, kwargs = torch.fx.node.map_arg( - (node.args, node.kwargs), - lambda n: get_real_value(n, tracer), - ) - - if op == "call_module": - nn_module = tracer.output_graph.nn_modules[node.target] - if not is_lazy_module(nn_module): - nn_module = copy.deepcopy(nn_module) - else: - # In the case of a lazy module, we want to run - # the pre-hooks which initialize it - nn_module(*args, **kwargs) - else: - nn_module = None - - try: - real_value = run_node(tracer, node, args, kwargs, nn_module) - cache[node] = real_value - except RuntimeError as e: - raise TorchRuntimeError(str(e)).with_traceback(e.__traceback__) from None - return real_value - - -def assert_no_fake_params_or_buffers(gm): - from torch._subclasses.fake_tensor import FakeTensorConfig - - def stack_or_hint(t): - if FakeTensorConfig.debug: - import traceback - - return f"FAKE TENSOR CREATION TRACEBACK: \n {traceback.format_list(t._debug_trace)}" - else: - return "Enable TORCH_FAKE_TENSOR_DEBUG=1 to get creation stack traces on fake tensors." - - for name, buffer in gm.named_buffers(): - assert not isinstance( - buffer, torch._subclasses.FakeTensor - ), f"Unexpected fake buffer {name} {stack_or_hint(buffer)}" - for name, param in gm.named_parameters(): - assert not isinstance( - param, torch._subclasses.FakeTensor - ), f"Unexpected fake param {name} {stack_or_hint(param)}" - - -def fqn(obj: Any): - """ - Returns the fully qualified name of the object. 
- """ - return f"{obj.__module__}.{obj.__qualname__}" - - -def ifdynstaticdefault(count1, count2): - if torch._dynamo.config.assume_static_by_default: - return count1 - else: - return count2 - - -def import_submodule(mod: types.ModuleType): - """ - Ensure all the files in a given submodule are imported - """ - for filename in sorted(os.listdir(os.path.dirname(mod.__file__))): - if filename.endswith(".py") and filename[0] != "_": - importlib.import_module(f"{mod.__name__}.{filename[:-3]}") - - -def object_has_getattribute(value: Any): - try: - if isinstance( - inspect.getattr_static(type(value), "__getattribute__"), - types.FunctionType, - ): - return True - except AttributeError: - pass - return False - - -def get_custom_getattr(value: Any): - try: - getattr_fn = inspect.getattr_static(type(value), "__getattr__") - except AttributeError: - getattr_fn = None - if getattr_fn is torch.nn.Module.__getattr__: - # ignore this case of getattr - getattr_fn = None - return getattr_fn - - -class TensorStaticReason(enum.Enum): - PARAMETER = 2 - NOT_TENSOR = 4 - NN_MODULE_PROPERTY = 5 - - -def tensor_static_reason_to_message(reason: TensorStaticReason): - if reason == TensorStaticReason.PARAMETER: - return "mark_dynamic on parameter, parameters are always static today." - if reason == TensorStaticReason.NOT_TENSOR: - return "mark_dynamic on a non tensor, how did this happen?" - if reason == TensorStaticReason.NN_MODULE_PROPERTY: - return "tensor is static because it is nn module associated." - raise AssertionError(f"Illegal reason {reason}") - - -def tensor_always_has_static_shape( - tensor: Union[torch.Tensor, Any], is_tensor: bool, guard_source: "GuardSource" -) -> Tuple[bool, TensorStaticReason]: - """ - Given a tensor, source, and is_tensor flag, determine if a shape should be static. - - Args: - tensor - the real tensor to evaluate, parameters force a static shape. - is_tensor - internal dynamo check, esentially "is_tensor": target_cls is TensorVariable, - tensors not in a TensorVariable for whatever reason are forced static. - - Returns a tuple, where the first element is the bool of whether or not this tensor should have a static shape. - The second element is a TensorStaticReason, useful for passing to tensor_static_reason_to_message if needed. 
- """ - if guard_source.is_nn_module() and config.force_nn_module_property_static_shapes: - return True, TensorStaticReason.NN_MODULE_PROPERTY - if type(tensor) is torch.nn.Parameter and config.force_parameter_static_shapes: - return True, TensorStaticReason.PARAMETER - if not is_tensor: - return True, TensorStaticReason.NOT_TENSOR - return False, None - - -class LazyString: - def __init__(self, func, *args, **kwargs): - self.func = func - self.args = args - self.kwargs = kwargs - - def __str__(self): - return self.func(*self.args, **self.kwargs) - - -def lazy_format_graph_code(name, gm, maybe_id=None): - def format_name(): - if maybe_id is not None: - return f"{name} {maybe_id}" - else: - return name - - return LazyString( - lambda: _format_graph_code( - f"===== {format_name()} =====\n", - gm.forward.__code__.co_filename, - gm.print_readable(print_output=False), - ) - ) - - -def _format_graph_code(name, filename, graph_str): - return f"TRACED GRAPH\n {name} {filename} {graph_str}\n" - - -def lazy_format_graph_tabular(fn_name, gm): - def inner(): - try: - from tabulate import tabulate # TODO: Check that this is installed - except ImportError: - return ( - "Tabulate module missing, please install tabulate to log the graph in tabular format, logging code instead:\n" - + str(lazy_format_graph_code(fn_name, gm)) - ) - - node_specs = [ - [n.op, n.name, n.target, n.args, n.kwargs] for n in gm.graph.nodes - ] - graph_str = tabulate( - node_specs, headers=["opcode", "name", "target", "args", "kwargs"] - ) - return _format_graph_code(fn_name, gm.forward.__code__.co_filename, graph_str) - - return LazyString(inner) - - -def format_bytecode(prefix, name, filename, line_no, code): - return f"{prefix} {name} {filename} line {line_no} \n{dis.Bytecode(code).dis()}\n" - - -forward_hook_names = ["_forward_pre_hooks", "_forward_hooks"] -backward_hook_names = ["_backward_pre_hooks", "_backward_hooks"] -state_dict_hook_names = [ - "_state_dict_pre_hooks", - "_state_dict_hooks", - "_load_state_dict_pre_hooks", - "_load_state_dict_post_hooks", -] -all_hook_names = forward_hook_names + backward_hook_names + state_dict_hook_names - - -def nn_module_get_all_hooks( - mod, - check_forward_hooks=False, - check_backward_hooks=False, - check_state_dict_hooks=False, -): - reset_code = torch._C._dynamo.eval_frame.reset_code - """ - Sometimes its useful to differentiate between types of hooks such as forward/backward/pre - hooks executed during module.__call__, and state_dict hooks which are executed separately. - """ - hook_dicts_to_check = [] - check_all_hooks = ( - not check_forward_hooks - and not check_backward_hooks - and not check_state_dict_hooks - ) - if check_forward_hooks or check_all_hooks: - hook_dicts_to_check.extend(forward_hook_names) - if check_backward_hooks or check_all_hooks: - hook_dicts_to_check.extend(backward_hook_names) - if check_state_dict_hooks: - hook_dicts_to_check.extend(state_dict_hook_names) - - all_hooks = [] - for hook_dict_name in hook_dicts_to_check: - hooks = getattr(mod, hook_dict_name, []) - for hook_name in hooks: - hook = hooks[hook_name] - - all_hooks.append(hook) - return all_hooks - - -def nnmodule_has_hooks( - mod, - check_forward_hooks=False, - check_backward_hooks=False, - check_state_dict_hooks=False, -): - """ - Helper function to check if a module has any hooks attached to it. 
- """ - hooks = nn_module_get_all_hooks( - mod, - check_forward_hooks=check_forward_hooks, - check_backward_hooks=check_backward_hooks, - check_state_dict_hooks=check_state_dict_hooks, - ) - return bool(hooks) - - -def to_numpy_helper(value): - """Convert tensor and tnp.ndarray to numpy.ndarray.""" - if isinstance(value, tnp.ndarray): - return to_numpy_helper(value.tensor) - elif isinstance(value, torch.Tensor): - return value.cpu().numpy() - elif isinstance(value, (tuple, list)): - return type(value)(to_numpy_helper(obj) for obj in value) - else: - return value - - -def numpy_to_tensor(value): - """Convert tnp.ndarray to tensor, leave other types intact. If a list/tuple, loop through it to convert.""" - if isinstance(value, np.ndarray): - return torch.as_tensor(value) - if isinstance(value, tnp.ndarray): - return value.tensor - elif isinstance(value, (tuple, list)): - return type(value)(numpy_to_tensor(obj) for obj in value) - else: - return value - - -class numpy_to_tensor_wrapper: - def __init__(self, f): - self.f = f - self.__name__ = "wrapped_" + self.f.__name__ - - def __repr__(self): - return f">" - - def __call__(self, *args, **kwargs): - out = self.f(*args, **kwargs) - return numpy_to_tensor(out) - - -def numpy_attr_wrapper(obj, name): - if isinstance(obj, tnp.ndarray): - out = getattr(obj, name) - return numpy_to_tensor(out) - elif isinstance(obj, torch.Tensor): - out = getattr(tnp.ndarray(obj), name) - return numpy_to_tensor(out) - - -class numpy_method_wrapper: - """Convert obj from torch.Tensor to tnp.ndarray and call method. Then convert result back to torch.Tensor.""" - - def __init__(self, method: str): - self.method = method - self.__name__ = "wrapped_" + self.method - - def __repr__(self): - return f">" - - def __call__(self, *args, **kwargs): - obj = args[0] - if isinstance(obj, torch.Tensor): - obj = tnp.ndarray(obj) - method_callable = getattr(obj, self.method) - out = method_callable(*args[1:], **kwargs) - return numpy_to_tensor(out) - - -def defake(x): - if not isinstance(x, FakeTensor): - return x - if x._has_symbolic_sizes_strides: - size = [ - s.node.shape_env.size_hint(s.node.expr) - if isinstance(s, torch.SymInt) - else s - for s in x.size() - ] - stride = [ - s.node.shape_env.size_hint(s.node.expr) - if isinstance(s, torch.SymInt) - else s - for s in x.stride() - ] - else: - size = x.size() - stride = x.stride() - y = torch.empty_strided( - size, - stride, - dtype=x.dtype, - device=x.device, - requires_grad=x.requires_grad, - ) - y.zero_() - return y - - -def is_utils_checkpoint(obj): - # Lazy import to avoid circular dependenices - import torch.utils.checkpoint - - return obj is torch.utils.checkpoint.checkpoint - - -def build_checkpoint_variable(**options): - import torch._higher_order_ops.wrap as higher_order_ops - from .variables.higher_order_ops import TorchHigherOrderOperatorVariable - - # TODO - This is a temporary sitaution where we have two versions of - # checkpointing implemetation. We will converge on one and remove the other. 
- activation_checkpoint_op = higher_order_ops.tag_activation_checkpoint - if torch._functorch.config.functionalize_rng_ops: - activation_checkpoint_op = higher_order_ops.wrap_activation_checkpoint - - return TorchHigherOrderOperatorVariable.make( - activation_checkpoint_op, - **options, - ) - - -def is_compile_supported(device_type): - from .eval_frame import is_dynamo_supported - - compile_supported = is_dynamo_supported() - if device_type == "cpu": - pass - elif device_type == "cuda" and compile_supported: - from torch._inductor.utils import has_triton - - compile_supported = has_triton() - else: - compile_supported = False - return compile_supported - - -# The following 3.11 source code functions are adapted from -# https://github.com/python/cpython/blob/v3.11.4/Lib/traceback.py -# in order to output source code corresponding to bytecode in 3.11+. -# We need our own versions since we want to support multiline expressions. -def _fix_offset(str: str, offset: int) -> int: - """ - Convert byte offset `offset` of `str` into character offset. - Byte offset is used for 3.11+ instruction column data. - Takes things like unicode characters into consideration. - - Unchanged from CPython implementation. - """ - as_utf8 = str.encode("utf-8") - return len(as_utf8[:offset].decode("utf-8", errors="replace")) - - -@dataclasses.dataclass -class _Anchors: - # inclusive - left_end_lineno: int - left_end_offset: int - right_start_lineno: int - # exclusive - right_start_offset: int - - -def _extract_anchors_from_expr(segment: str) -> Optional[_Anchors]: - """ - Given source code `segment` corresponding to a bytecode - instruction, determine: - - for binary ops, the location of the binary op - - for indexing, the location of the brackets. - `segment` is expected to be a valid Python expression - """ - assert sys.version_info >= (3, 11) - - import ast - - try: - # Without brackets, `segment` is parsed as a statement. - # We expect an expression, so wrap `segment` in - # brackets to handle multi-line expressions. - tree = ast.parse("(\n" + segment + "\n)") - except SyntaxError: - return None - - if len(tree.body) != 1: - return None - - lines = segment.split("\n") - - # get character index given byte offset - def normalize(lineno, offset): - return _fix_offset(lines[lineno], offset) - - # Gets the next valid character index in `lines`, if - # the current location is not valid. Handles empty lines. - def next_valid_char(lineno, col): - while lineno < len(lines) and col >= len(lines[lineno]): - col = 0 - lineno += 1 - assert lineno < len(lines) and col < len(lines[lineno]) - return lineno, col - - # Get the next valid character index in `lines`. - def increment(lineno, col): - col += 1 - lineno, col = next_valid_char(lineno, col) - assert lineno < len(lines) and col < len(lines[lineno]) - return lineno, col - - # Get the next valid character at least on the next line - def nextline(lineno, col): - col = 0 - lineno += 1 - lineno, col = next_valid_char(lineno, col) - assert lineno < len(lines) and col < len(lines[lineno]) - return lineno, col - - statement = tree.body[0] - if isinstance(statement, ast.Expr): - expr = statement.value - if isinstance(expr, ast.BinOp): - # ast gives locations for BinOp subexpressions, e.g. 
- # ( left_expr ) + ( right_expr ) - # left^^^^^ right^^^^^ - # -2 since end_lineno is 1-indexed and because we added an extra - # bracket to `segment` when calling ast.parse - cur_lineno = expr.left.end_lineno - 2 - cur_col = normalize(cur_lineno, expr.left.end_col_offset) - cur_lineno, cur_col = next_valid_char(cur_lineno, cur_col) - - # Heuristic to find the operator character. - # The original CPython implementation did not look for ), \, or #, - # leading to incorrect anchor location, e.g. - # (x) + (y) - # ~~^~~~~~~ - while (ch := lines[cur_lineno][cur_col]).isspace() or ch in ")\\#": - if ch in "\\#": - cur_lineno, cur_col = nextline(cur_lineno, cur_col) - else: - cur_lineno, cur_col = increment(cur_lineno, cur_col) - - # binary op is 1 or 2 characters long, on the same line - right_col = cur_col + 1 - if ( - right_col < len(lines[cur_lineno]) - and not (ch := lines[cur_lineno][right_col]).isspace() - and ch not in "\\#" - ): - right_col += 1 - # right_col can be invalid since it is exclusive - - return _Anchors(cur_lineno, cur_col, cur_lineno, right_col) - elif isinstance(expr, ast.Subscript): - # ast gives locations for value and slice subexpressions, e.g. - # ( value_expr ) [ slice_expr ] - # value^^^^^ slice^^^^^ - # subscript^^^^^^^^^^^^^^^^^^^^ - # find left bracket (first '[' after value) - left_lineno = expr.value.end_lineno - 2 - left_col = normalize(left_lineno, expr.value.end_col_offset) - left_lineno, left_col = next_valid_char(left_lineno, left_col) - while lines[left_lineno][left_col] != "[": - left_lineno, left_col = increment(left_lineno, left_col) - # find right bracket (final character of expression) - right_lineno = expr.end_lineno - 2 - right_col = normalize(right_lineno, expr.end_col_offset) - return _Anchors(left_lineno, left_col, right_lineno, right_col) - elif isinstance(expr, ast.Call): - # ( func_expr ) (args, kwargs) - # func^^^^^ - # call^^^^^^^^^^^^^^^^^^^^^^^^ - # find left bracket (first '(' after func) - left_lineno = expr.func.end_lineno - 2 - left_col = normalize(left_lineno, expr.func.end_col_offset) - left_lineno, left_col = next_valid_char(left_lineno, left_col) - while lines[left_lineno][left_col] != "(": - left_lineno, left_col = increment(left_lineno, left_col) - # find right bracket (final character of expression) - right_lineno = expr.end_lineno - 2 - right_col = normalize(right_lineno, expr.end_col_offset) - return _Anchors(left_lineno, left_col, right_lineno, right_col) - - return None - - -def get_instruction_source_311(code: types.CodeType, inst: dis.Instruction) -> str: - """ - Python 3.11+ only. Returns lines of source code (from code object `code`) - corresponding to `inst`'s location data, and underlines relevant code to `inst`. - - Example: CALL on `g`: - f(g( - ^^ - h(x))) - ^^^^^ - - We need our own implementation since `format_frame_summary` in - Python's `traceback` module doesn't handle multi-line expressions - (and their anchor extraction code is not completely correct). - """ - if inst.positions.lineno is None: - return "" - # The rstrip + "\n" pattern is used throughout this function to handle - # linecache.getline errors. Error lines are treated as empty strings "", but we want - # to treat them as blank lines "\n". 
- first_line = linecache.getline(code.co_filename, inst.positions.lineno).rstrip() - if inst.positions.end_lineno is None: - return first_line - if inst.positions.col_offset is None or inst.positions.end_col_offset is None: - return first_line - - # character index of the start of the instruction - start_offset = _fix_offset(first_line, inst.positions.col_offset) - # character index of the end of the instruction - # compute later since end may be a different line - end_offset = None - # expression corresponding to the instruction so we can get anchors - segment = "" - # underline markers to be printed - start with `~` marker and replace with `^` later - markers = [] - - # Compute segment and initial markers - if inst.positions.end_lineno == inst.positions.lineno: - end_offset = _fix_offset(first_line, inst.positions.end_col_offset) - segment = first_line[start_offset:end_offset] - markers.append(" " * start_offset + "~" * (end_offset - start_offset)) - else: - segment = first_line[start_offset:] + "\n" - markers.append(" " * start_offset + "~" * (len(first_line) - start_offset)) - last_line = linecache.getline( - code.co_filename, inst.positions.end_lineno - ).rstrip() - end_offset = _fix_offset(last_line, inst.positions.end_col_offset) - for lineno in range(inst.positions.lineno + 1, inst.positions.end_lineno): - line = linecache.getline(code.co_filename, lineno).rstrip() - segment += line + "\n" - # don't underline leading spaces - num_spaces = len(line) - len(line.lstrip()) - markers.append(" " * num_spaces + "~" * (len(line) - num_spaces)) - segment += last_line[:end_offset] - num_spaces = len(last_line) - len(last_line.lstrip()) - markers.append(" " * num_spaces + "~" * (end_offset - num_spaces)) - - anchors: Optional[_Anchors] = None - try: - anchors = _extract_anchors_from_expr(segment) - except AssertionError: - pass - - # replace `~` markers with `^` where necessary - if anchors is None: - markers = [marker.replace("~", "^") for marker in markers] - else: - # make markers mutable - markers = [list(marker) for marker in markers] - - # anchor positions do not take start_offset into account - if anchors.left_end_lineno == 0: - anchors.left_end_offset += start_offset - if anchors.right_start_lineno == 0: - anchors.right_start_offset += start_offset - - # Turn `~`` markers between anchors to `^` - for line in range(len(markers)): - for col in range(len(markers[line])): - if line < anchors.left_end_lineno: - continue - if line == anchors.left_end_lineno and col < anchors.left_end_offset: - continue - if ( - line == anchors.right_start_lineno - and col >= anchors.right_start_offset - ): - continue - if line > anchors.right_start_lineno: - continue - if markers[line][col] == "~": - markers[line][col] = "^" - - # make markers into strings again - markers = ["".join(marker) for marker in markers] - - result = "" - for i in range(len(markers)): - result += ( - linecache.getline(code.co_filename, inst.positions.lineno + i).rstrip() - + "\n" - ) - result += markers[i] + "\n" - return result - - -def is_guard_failure_reporting_enabled(): - return ( - config.report_guard_failures - or torch._logging._internal.log_state.is_artifact_enabled("recompiles") - ) - - -def get_static_address_type(t): - if isinstance(t, torch.Tensor): - return getattr(t, "_dynamo_static_input_type", None) - - return None diff --git a/userbenchmark/dynamo/common.py b/userbenchmark/dynamo/common.py deleted file mode 100644 index 075b88f4a0..0000000000 --- a/userbenchmark/dynamo/common.py +++ /dev/null @@ -1,3577 +0,0 @@ 
-#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import collections -import contextlib -import copy -import csv -import dataclasses -import functools -import importlib -import itertools -import logging -import os -import pathlib -import random -import shutil -import signal -import subprocess -import sys -import time -from contextlib import contextmanager - -from typing import Any, Callable, Mapping, NamedTuple, Optional, Tuple, Type -from unittest.mock import MagicMock - -import numpy as np -import pandas as pd -import psutil -import torch -import torch._dynamo -import torch._dynamo.utils -import torch._export -import torch.distributed -import torch.fx._pytree as fx_pytree -import torch.multiprocessing as mp -from scipy.stats import gmean, ttest_ind -from torch._dynamo.profiler import fx_insert_profiling, Profiler -from torch._dynamo.testing import dummy_fx_compile, format_speedup, same -from torch._dynamo.utils import clone_inputs -from torch._functorch.aot_autograd import set_model_name -from torch._inductor import config as inductor_config -from torch._inductor.utils import fresh_inductor_cache -from torch._subclasses.fake_tensor import FakeTensorMode - -from torch.utils import _pytree as pytree -from torch.utils._pytree import tree_map, tree_map_only - -from tqdm.auto import tqdm, trange - -log = logging.getLogger(__name__) - -# We are primarily interested in TF32 -torch.backends.cuda.matmul.allow_tf32 = True - -# Suppress torch.profiler spam -os.environ["KINETO_LOG_LEVEL"] = "5" - -current_name = "" -current_device = "" -current_onnx_compiler = "" -current_batch_size = None -output_filename = None - -MAX_DOWNLOAD_ATTEMPTS = 5 - - -class CI(NamedTuple): - backend: str # aot_eager or inductor - training: bool - dynamic: bool = False - device: str = "cuda" - - -CI_SKIP = collections.defaultdict(list) - - -# Skips for dynamic=False - -# Here eager really means dynamo+eager -CI_SKIP[CI("eager", training=False)] = [ - # TorchBench - "DALLE2_pytorch", # AttributeError: text_encodings - "hf_BigBird", # fail_accuracy - # TypeError: pad_center() takes 1 positional argument but 2 were given - "tacotron2", - # Huggingface - "DebertaV2ForQuestionAnswering", # OOM -] - -CI_SKIP[CI("eager", training=True)] = [ - *CI_SKIP[CI("eager", training=False)], - # TorchBench - "BERT_pytorch", # accuracy - "Background_Matting", # fp64_OOM - "hf_BigBird", # fp64_OOM - "hf_T5_base", # fp64_OOM - "llama", # Accuracy failed: allclose not within tol=0.001 - "vision_maskrcnn", # The size of tensor a (29) must match the size of tensor b (33) (doesn't repro) - # Huggingface - "XGLMForCausalLM", # OOM - # TIMM - "cait_m36_384", # fp64_OOM - "convit_base", # fp64_OOM - "mobilenetv2_100", # accuracy - "xcit_large_24_p8_224", # fp64_OOM, -] - -CI_SKIP[CI("aot_eager", training=False)] = [ - *CI_SKIP[CI("eager", training=False)], - # all dynamic shapes errors for detectron variants - "demucs", # OOM - "detectron2_fasterrcnn_r_101_c4", - "detectron2_fasterrcnn_r_101_dc5", - "detectron2_fasterrcnn_r_101_fpn", - "detectron2_fasterrcnn_r_50_c4", - "detectron2_fasterrcnn_r_50_dc5", - "detectron2_fasterrcnn_r_50_fpn", - "detectron2_fcos_r_50_fpn", - "detectron2_maskrcnn_r_101_c4", - "detectron2_maskrcnn_r_101_fpn", - "detectron2_maskrcnn_r_50_c4", - "detectron2_maskrcnn_r_50_fpn", - "hf_BigBird", # OOM - "tacotron2", # AssertionError: Deduped args out of bounds - # Huggingface - "BartForConditionalGeneration", # OOM - "DebertaV2ForQuestionAnswering", # OOM - # Torchbench - "speech_transformer", # 
https://github.com/pytorch/pytorch/issues/99893 - "pyhpc_isoneutral_mixing", # https://github.com/pytorch/pytorch/issues/99893 - "pyhpc_turbulent_kinetic_energy", # https://github.com/pytorch/pytorch/issues/99893 -] - -CI_SKIP[CI("aot_eager", training=True)] = [ - *CI_SKIP[CI("aot_eager", training=False)], - # TorchBench - "Background_Matting", # fp64_OOM - "hf_T5_base", # fp64_OOM - "mobilenet_v2_quantized_qat", # fp64_OOM - "resnet50_quantized_qat", # fp64_OOM - "pytorch_struct", - # Huggingface - "MBartForConditionalGeneration", # OOM - "M2M100ForConditionalGeneration", # OOM - "XGLMForCausalLM", # OOM - # TIMM - "cait_m36_384", # fp64_OOM - "convit_base", # fp64_OOM - "fbnetv3_b", # Accuracy (blocks.2.2.bn1.weight.grad) - "levit_128", # Accuracy (patch_embed.0.c.weight.grad) - "lcnet_050", # Accuracy (blocks.1.0.bn2.weight.grad) - "sebotnet33ts_256", # Accuracy (stem.conv1.conv.weight.grad) - "xcit_large_24_p8_224", # fp64_OOM, -] - -CI_SKIP[CI("inductor", training=False)] = [ - # TorchBench - "DALLE2_pytorch", # AttributeError: text_encodings - "demucs", # OOM - "detectron2_fasterrcnn_r_101_c4", - "detectron2_fasterrcnn_r_101_dc5", - "detectron2_fasterrcnn_r_101_fpn", - "detectron2_fasterrcnn_r_50_c4", - "detectron2_fasterrcnn_r_50_dc5", - "detectron2_fasterrcnn_r_50_fpn", - "detectron2_fcos_r_50_fpn", - "detectron2_maskrcnn_r_101_c4", - "detectron2_maskrcnn_r_101_fpn", - "detectron2_maskrcnn_r_50_c4", - "detectron2_maskrcnn_r_50_fpn", - # TorchBench - "detectron2", - "densenet121", # flaky accuracy - "hf_T5", # accuracy - "hf_BigBird", # accuracy - "hf_GPT2_large", # OOM - "maml", # accuracy - "mobilenet_v2_quantized_qat", # The eval test only supports CPU - "pytorch_struct", # Test eval is not implemented - "pyhpc_equation_of_state", # Accuracy - "pyhpc_turbulent_kinetic_energy", # Accuracy - "tacotron2", -] - -CI_SKIP[CI("inductor", training=False, device="cpu")] = [ - # TorchBench - "drq", # Need to update torchbench - "detectron2_fasterrcnn_r_101_c4", - "detectron2_fasterrcnn_r_101_dc5", - "detectron2_fasterrcnn_r_101_fpn", - "detectron2_fasterrcnn_r_50_c4", - "detectron2_fasterrcnn_r_50_dc5", - "detectron2_fasterrcnn_r_50_fpn", - "detectron2_fcos_r_50_fpn", - "detectron2_maskrcnn_r_101_c4", - "detectron2_maskrcnn_r_101_fpn", - "detectron2_maskrcnn_r_50_c4", - "detectron2_maskrcnn_r_50_fpn", - "doctr_det_predictor", # requires newer gcc - "doctr_reco_predictor", # requires newer gcc - "gat", # does not work with fp32 - "gcn", # does not work with fp32 - "hf_Bert_large", # OOM - "hf_GPT2_large", # Intermittent failure on CI - "hf_T5_base", # OOM - "mobilenet_v2_quantized_qat", - "pyhpc_turbulent_kinetic_energy", - "resnet50_quantized_qat", # Eager model failed to run(Quantize only works on Float Tensor, got Double) - "sage", # does not work with fp32 - # Huggingface - "MBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94793 - "PLBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94794 - # TIMM - "cait_m36_384", # Accuracy - "pnasnet5large", # OOM - "xcit_large_24_p8_224", # OOM https://github.com/pytorch/pytorch/issues/95984 - "opacus_cifar10", # Fails to run https://github.com/pytorch/pytorch/issues/99201 -] - -CI_SKIP[CI("inductor", training=True)] = [ - *CI_SKIP[CI("inductor", training=False)], - # TorchBench - "Background_Matting", # fp64_OOM - "hf_T5_base", # accuracy - "mobilenet_v3_large", # accuracy - "resnet50_quantized_qat", # Eager model failed to run - "AlbertForQuestionAnswering", # accuracy - 
"crossvit_9_240", # fails to run on timm 0.8.22 with cudagraphs, mempools - "deit_base_distilled_patch16_224", # fails to run in timm 0.8.22, cudagraphs - "mobilevit_s", - "pit_b_224", - "twins_pcpvt_base", - "visformer_small", - "vit_base_patch16_224", - "xcit_large_24_p8_224", -] - -# Skips for dynamic=True - -CI_SKIP[CI("aot_eager", training=False, dynamic=True)] = [ - *CI_SKIP[CI("aot_eager", training=False)], - "vision_maskrcnn", # accuracy failure on boxes, after https://github.com/pytorch/pytorch/issues/101093 - # https://github.com/pytorch/pytorch/issues/103760 - "hf_T5_generate", - "hf_Bert", # Error: RelaxedUnspecConstraint(L['input_ids'].size()[0]) - inferred constant (4) -] - -CI_SKIP[CI("aot_eager", training=True, dynamic=True)] = [ - *CI_SKIP[CI("aot_eager", training=True)], - *CI_SKIP[CI("aot_eager", training=False, dynamic=True)], - "llama", # AssertionError: cannot compute free_symbols of True - "torchrec_dlrm", # RuntimeError: mat1 and mat2 must have the same dtype, but got Float and BFloat16 -] - -CI_SKIP[CI("inductor", training=False, dynamic=True)] = [ - *CI_SKIP[CI("aot_eager", training=False, dynamic=True)], - *CI_SKIP[CI("inductor", training=False)], - "nanogpt", # Assertion `index out of bounds: 0 <= tmp0 < 64` failed. -] - -CI_SKIP[CI("inductor", training=True, dynamic=True)] = [ - # NB: Intentionally omitting for symmetry with dynamic=False - # *CI_SKIP[CI("aot_eager", training=True, dynamic=True)], - *CI_SKIP[CI("inductor", training=False, dynamic=True)], - *CI_SKIP[CI("inductor", training=True)], - "levit_128", # Accuracy fails on A10G, passes on A100 - "sebotnet33ts_256", # Flaky accuracy failed -] - -CI_SKIP[CI("inductor", training=False, dynamic=True, device="cpu")] = [ - *CI_SKIP[CI("inductor", training=False, device="cpu")], - "pyhpc_isoneutral_mixing", - "dpn107", -] - -CI_SKIP_OPTIMIZER = { - # TIMM - "convmixer_768_32", # accuracy - "hrnet_w18", # Stack issue in fx - # HF - "pnasnet5large", # Stack issue in fx - "MobileBertForMaskedLM", # Stack issue in fx - "MobileBertForQuestionAnswering", # Stack issue in fx - "PegasusForConditionalGeneration", # OOM -} - -CI_SKIP_DYNAMIC_BATCH_ONLY = { - "sam", - # See https://github.com/mindee/doctr/blob/f2114758d529ed8d3d0030581638f0520b6b98d8/doctr/models/detection/core.py#L89 - # It iterates over the batch, which is dynamic, and dynamo chokes - # We should be able to graphbreak there. - "doctr_det_predictor", - "dlrm", -} - - -def model_specified_by_path(path_and_class_str): - return ":" in path_and_class_str - - -def load_model_from_path(path_and_class_str): - configs = {} - for kvstr in path_and_class_str.split(","): - k, v = kvstr.split(":") - configs[k] = v - - for name in ["path", "class"]: - if name not in configs: - raise RuntimeError( - "Invalid --only arguments. 
Check help message for the correct format" - ) - - path = configs["path"] - class_name = configs["class"] - - if path[:1] != "/": - raise RuntimeError( - "Use absolute path since dynamo may change the current working directory which makes using relative path tricky" - ) - - spec = importlib.util.spec_from_file_location("module_name", path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - - model_class = getattr(module, class_name) - assert issubclass(model_class, torch.nn.Module) - model = model_class() - assert hasattr(model, "get_example_inputs") - inputs = model.get_example_inputs() - return model, inputs - - -def output_csv(filename, headers, row): - if os.path.exists(filename): - with open(filename) as fd: - lines = list(csv.reader(fd)) or [[]] - if headers and len(headers) > len(lines[0]): - # if prior results failed the header might not be filled in yet - lines[0] = headers - else: - headers = lines[0] - else: - lines = [headers] - lines.append([(f"{x:.6f}" if isinstance(x, float) else x) for x in row]) - with open(filename, "w") as fd: - writer = csv.writer(fd, lineterminator="\n") - for line in lines: - writer.writerow(list(line) + ["0"] * (len(headers) - len(line))) - - -def nothing(f): - return f - - -@functools.lru_cache(None) -def patch_torch_manual_seed(): - """Make torch manual seed deterministic. Helps with accuracy testing.""" - - def deterministic_torch_manual_seed(*args, **kwargs): - from torch._C import default_generator - - seed = 1337 - import torch.cuda - - if not torch.cuda._is_in_bad_fork(): - torch.cuda.manual_seed_all(seed) - return default_generator.manual_seed(seed) - - torch.manual_seed = deterministic_torch_manual_seed - - -def synchronize(): - pass - - -def summarize_graph_break(filename): - """ - Sorts and de-dupes the graphs breaks on the reason string. Note that this - function is just a best effort to reduce the logging information. We could - miss some graph breaks because of de-duping. We can further refine this - function as need arises. 
- """ - log_file = f"{filename.rstrip('.csv')}_graph_breaks.csv" - if os.path.exists(log_file): - df = pd.read_csv(log_file) - df = df.sort_values("reason").drop_duplicates(subset="reason") - - # Specialize for multi tensor sgd as reason is not identical - multi_tensor_sgd_row = df.loc[df["reason"].str.contains("_multi_tensor_sgd")] - if len(multi_tensor_sgd_row): - df = df[ - ~df["reason"].str.contains("_multi_tensor_sgd") - ] # Drop all sgd rows - df = pd.concat( - [df, pd.DataFrame([multi_tensor_sgd_row.iloc[0]])], axis=0 - ) # Add back a single row - df.to_csv(f"{log_file.rstrip('.csv')}_deduped.csv", index=False) - - -def print_summary(filename, print_dataframe=False): - if not (filename and os.path.exists(filename)): - return - data = pd.read_csv(filename) - if "tag" in data.columns: - for tag in data.tag.unique(): - if tag == "0.0000": - continue # This happens for failed runs - print(f"\nSummary for tag={tag}:") - print_summary_table(data[data.tag == tag], print_dataframe=print_dataframe) - else: - print_summary_table(data, print_dataframe=print_dataframe) - summarize_graph_break(filename) - - -def print_summary_table(data, print_dataframe=False): - if print_dataframe: - pd.options.display.max_rows = 1000 - pd.options.display.max_columns = 1000 - pd.options.display.width = 2000 - print(data) - width = max(map(len, data.columns)) - for col in data.columns: - try: - if col in ("dev", "name", "batch_size", "tag"): - continue - elif col in ("pct_ops", "pct_time"): - print(col.ljust(width), f"{data[col].mean():.3%}") - elif col in ("graphs", "graph_calls", "captured_ops", "total_ops"): - print(col.ljust(width), f"{data[col].mean():.3f}") - elif col in ("compilation_latency"): - print(col.ljust(width), f"mean={data[col].mean():.3f} seconds") - elif col in ("compression_ratio"): - print(col.ljust(width), f"mean={data[col].mean():.3f}x") - elif col in ("accuracy"): - pass_rate = (data[col] == "pass").mean() - print(col.ljust(width), f"pass_rate={100*pass_rate:.2f}%") - else: - cdata = data[col] - print( - col.ljust(width), - f"gmean={gmean(cdata):.2f}x mean={cdata.mean():.3f}x", - ) - except Exception as e: - pass - - -def tensor_is_on_xla(tensors): - def visit(x: torch.Tensor): - nonlocal result - if x.device.type == "xla": - result = True - - result = False - tree_map_only(torch.Tensor, visit, tensors) - return result - - -def timed( - model, - model_iter_fn, - example_inputs, - times=1, - return_result=False, - collect_outputs=False, -): - use_xla = tensor_is_on_xla(example_inputs) - synchronize() - - if use_xla: - xm.mark_step() - xm.wait_device_ops() - - time_total = 0 - # Dont collect outputs to correctly measure timing - for _ in range(times): - # Put this call inside the loop to reset the seed for each iteration. - # Don't include reset_rng_state() to correctly measure timing - reset_rng_state(use_xla) - t_iter_begin = time.perf_counter() - result = model_iter_fn(model, example_inputs, collect_outputs=collect_outputs) - - # instead of calling sync on result_list, we should call mark_step. - # In training case, result_list may be empty, but we want to - # send all the pending graphs for compilation. - if use_xla: - # For the model running on regular torchxla (baseline), we need the - # mark step to send the accumulated graph for compilation. - # - # For the model running with dynamo/torchxla bridge, in training case, - # we need the mark step to send the optimizer graph out for - # compilation. 
- xm.mark_step() - t_iter_end = time.perf_counter() - time_total += t_iter_end - t_iter_begin - - t_0 = time.perf_counter() - if use_xla: - xm.wait_device_ops() - synchronize() - t_1 = time.perf_counter() - time_total += t_1 - t_0 - return (time_total, result) if return_result else time_total - - -def _normalize_bench_inputs(example_inputs) -> Tuple[Tuple[Any], Mapping[str, Any]]: - # NOTE(bowbao): For huggingface benchmark, example_inputs are formatted as dictionary, - # and consumed like `model(**example_inputs)`. - # For other benchmarks, example_inputs are formatted as tuple and consumed - # like `model(*example_inputs)`. - if isinstance(example_inputs, dict): - return (), example_inputs - else: - return tuple(example_inputs), {} - - -def _register_dataclass_output_as_pytree(example_outputs) -> None: - # NOTE(angelayi): For huggingface benchmark, some example outputs are - # formatted as a dataclass which pytree cannot consume. So we want - # to register the pytree implementation here - example_outputs_flat, _ = pytree.tree_flatten(example_outputs) - output_dataclass_types = [ - type(out) for out in example_outputs_flat if dataclasses.is_dataclass(type(out)) - ] - for output_type in output_dataclass_types: - from torch._export.utils import register_dataclass_as_pytree_node - - register_dataclass_as_pytree_node(output_type) - - -class Stats: - totals = collections.defaultdict(collections.Counter) - - @classmethod - def reset_counters(cls): - for k, v in torch._dynamo.utils.counters.items(): - cls.totals[k].update(v) - ok = torch._dynamo.utils.counters["frames"]["ok"] - total = torch._dynamo.utils.counters["frames"]["total"] - torch._dynamo.utils.counters.clear() - return ok, total - - @classmethod - def print_summary(cls): - for k, v in sorted(cls.totals.items()): - lines = "\n ".join(map(str, v.most_common(50))) - print(f"STATS {k}\n {lines}") - - @classmethod - def aot_summary(cls): - return [cls.totals["aot_autograd"]["total"], cls.totals["aot_autograd"]["ok"]] - - -def coverage_experiment(args, model_iter_fn, model, example_inputs): - """ - Test operator/model coverage of TorchDynamo and record statistics - taken from a profiler. This target is mainly intended to check - correctness. - - Writes to ./coverage.csv - """ - profiler = Profiler() - frozen_model_iter_fn = torch._dynamo.run(model_iter_fn) - with profiler.prof: - frozen_model_iter_fn(model, example_inputs) - coverage_result = profiler.results() - output_csv( - output_filename, - ( - "dev", - "name", - "batch_size", - "graphs", - "graph_calls", - "captured_ops", - "total_ops", - "pct_ops", - "pct_time", - ), - [ - current_device, - current_name, - current_batch_size, - ] - + coverage_result.tocsv(), - ) - return coverage_result - - -def speedup_experiment_fx2trt(args, model_iter_fn, model, example_inputs): - """ - Measure speedups over eager using the trt inference backend. TRT backend is based fx graph - generated by torch._dynamo. 
- Writes to ./speedups_fx2trt.csv - """ - return speedup_experiment(args, model_iter_fn, model, example_inputs) - - -def recompile_profiler_experiment(args, model_iter_fn, model, example_inputs): - with torch._dynamo.utils.CompileProfiler() as prof: - opt_model_iter_fn = torch._dynamo.optimize(prof, nopython=args.nopython)( - model_iter_fn - ) - opt_model_iter_fn(model, example_inputs) - output_csv( - output_filename, ["model", "profiler report"], [current_name, prof.report()] - ) - met = prof.get_metrics() - guard_failures = len(met["guard_failures"]) - return [guard_failures] - - -def randomize_input(inputs): - if isinstance(inputs, (list, tuple)): - return type(inputs)([randomize_input(x) for x in inputs]) - elif isinstance(inputs, torch.Tensor): - if inputs.dtype in (torch.float32, torch.float64): - torch._dynamo.utils.counters["randomize_input"]["times"] += 1 - return torch.randn_like(inputs) - elif inputs.dtype == torch.int64: - # Note: we can not simply tune integer tensors as follows - # `return torch.randint_like(inputs, high=inputs.max().item())` - # This may break some invariants between tensors. - # E.g. in embedding lookup case, one tensor is the length - # and another is an indices tensor. - return inputs - else: - raise RuntimeError( - f"randomize_input need support tensor of type {inputs.dtype}" - ) - else: - raise RuntimeError( - f"randomize_input can not handle input of type {type(inputs)}" - ) - - -def maybe_mark_step(args): - if args.trace_on_xla: - xm.mark_step() - - -def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs): - """ - Measure speedups over eager. - - Writes to ./speedups.csv - """ - # if args.dynamic_shapes: - # return speedup_experiment_ds(args, model_iter_fn, model, example_inputs) - - timings = np.zeros((args.repeat, 2), np.float64) - # if we randomize the input, we should also check the result is correct - should_check_result = should_randomize_input = args.randomize_input - - import contextlib - - from torch._inductor.utils import maybe_profile - - @contextlib.contextmanager - def maybe_mark_profile(*args, **kwargs): - prof: torch.profiler.profile = kwargs.pop("p", None) - mark = kwargs.pop("mark", None) - if prof: - with torch.profiler.record_function(mark): - yield - else: - yield - - times = args.iterations_per_run - - # Use higher tolerance for XLA since XLA cause numerical unstability when - # graph size changes - tolerance = args.xla_tolerance if args.trace_on_xla else 1e-4 - torch._dynamo.config.repro_tolerance = tolerance - - with maybe_profile(args.export_profiler_trace) as p: - if args.export_aot_inductor: - frozen_model_iter_fn = export_aot_inductor(model_iter_fn) - else: - frozen_model_iter_fn = torch._dynamo.run(model_iter_fn) - - for rep in trange(args.repeat, desc="running benchmark"): - inputs = ( - randomize_input(copy.deepcopy(example_inputs)) - if should_randomize_input - else example_inputs - ) - # need call mark_step to perform the computation - # on randomize_input. Otherwise the first call using the - # inputs will incur high penalty then the next one. - maybe_mark_step(args) - - # interleave the runs to handle frequency scaling and load changes - with maybe_mark_profile(p=p, mark="expected"): - timings[rep, 0], expected_output = timed( - model, - model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - - # call mark_step between the 2 calls to make the comparison fair. 
- maybe_mark_step(args) - - with maybe_mark_profile(p=p, mark="actual"): - timings[rep, 1], actual_output = timed( - model, - frozen_model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - - if should_check_result: - is_correct = is_correct and same( - expected_output, actual_output, tol=tolerance - ) - - if args.export_profiler_trace: - name = args.profiler_trace_name + "_" + model.name + ".json" - name = os.path.join(torch._dynamo.config.base_dir, name) - p.export_chrome_trace(name) - median = np.median(timings, axis=0) - speedup = median[0] / median[1] - if args.dump_raw_metrics: - np.save( - f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy", - timings, - ) - - first_headers = ["dev", "name", "batch_size"] - first_fields = [current_device, current_name, current_batch_size] - if "tag" in kwargs: - first_headers.append("tag") - first_fields.append(kwargs["tag"]) - headers = first_headers + ["speedup", "abs_latency"] - row = first_fields + [float(speedup), median[1] * 1000] - msg = f"{speedup:.3f}x" - if args.baseline: - headers.extend( - [ - "baseline", - "speedup_vs_baseline", - ] - ) - df = pd.read_csv(args.baseline) - try: - baseline_speedup = df[df["name"] == current_name]["speedup"].item() - row.extend([baseline_speedup, speedup / baseline_speedup]) - msg = f"{baseline_speedup:.3f}x -> {speedup:.3f}x [{speedup / baseline_speedup:.3f}x]" - except (KeyError, ZeroDivisionError): - row.extend( - [ - 0.0, - 0.0, - ] - ) - if "compilation_latency" in kwargs: - headers += [ - "compilation_latency", - "compression_ratio", - "eager_peak_mem", - "dynamo_peak_mem", - ] - row.append(kwargs["compilation_latency"]) - row.append(kwargs["compression_ratio"]) - row.append(kwargs["eager_peak_mem"]) - row.append(kwargs["dynamo_peak_mem"]) - if "dynamo_stats" in kwargs: - for k, v in kwargs["dynamo_stats"].items(): - headers.append(k) - row.append(v) - output_csv( - output_filename, - headers, - row, - ) - headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True) - assert ( - output_filename.find(".csv") > 0 - ), f"expected output_filename to be a .csv, but got {output_filename}" - output_csv( - output_filename[:-4] + "_compilation_metrics.csv", - first_headers + headers, - first_fields + data, - ) - return msg - - -def speedup_experiment_ds(args, model_iter_fn, model, example_inputs): - """ - Run dynamic shapes benchmarks. - - Requires dynamic shape compatible models, which provide a list of example inputs. - - Warms up using the first input example and then iterates the inputs, - measuring (and expecting minimal) variance between the runtime for different examples. - - """ - timings = np.zeros((args.repeat, len(example_inputs), 2), np.float64) - - if args.repeat > 5: - print( - f"\ndynamic shapes experiments are slow, consider setting --repeat less than {args.repeat}\n" - ) - - nwarmup = 4 - for rep in range(args.repeat): - # Start each rep fresh, e.g. 
only warmup on example 0 - torch._dynamo.reset() - optimized_model_iter_fn = optimize_ctx(model_iter_fn) - for _ in range(nwarmup): - optimized_model_iter_fn(model, example_inputs[0]) - - for input_idx, inputs in enumerate(example_inputs): - # interleave the runs to handle frequency scaling and load changes - timings[rep, input_idx, 0] = timed( - model, model_iter_fn, inputs, return_result=False - ) - # different from regular speedup_experiment, we _DO_ want to allow recompilation - timings[rep, input_idx, 1] = timed( - model, optimized_model_iter_fn, inputs, return_result=False - ) - medians = np.median(timings, axis=0) - speedups = list(medians[:, 0] / medians[:, 1]) - speedups_mean = np.mean(speedups) - speedups_median = np.median(speedups) - speedups_var = np.var(speedups) - - # TODO this x[0] is not going to work in general but bert only has 1 input - shapes = [x[0].shape for x in example_inputs] - shape_keys = sorted(set(shapes)) - shape_speedups = { - shape: [ - it[1] for it in filter(lambda it: it[0] == shape, zip(shapes, speedups)) - ] - for shape in shape_keys - } - output_str = ( - f"mean: {speedups_mean:.3f}, median: {speedups_median:.3f}, var: {speedups_var:.3f}" - + "\nSpeedups by shape: " - + "\n".join( - [ - f"{shape}: " - + ", ".join([f"{speedup: .3g}" for speedup in shape_speedups[shape]]) - for shape in shape_keys - ] - ) - ) - output_csv( - output_filename, - ("dev", "name", "batch_size", "speedup mean", "speedup median", "speedup var"), - [ - current_device, - current_name, - current_batch_size, - speedups_mean, - speedups_median, - speedups_var, - ], - ) - return output_str - - -def speedup_experiment_onnx( - onnx_model_cls: Type[OnnxModelFromTorchScript], - args, - model_iter_fn, - model, - example_inputs, - **kwargs, -): - """ - Measure speedups over eager. - - This function is responsible for the following: - 1. Creation of OnnxModel, which handles export, ort initialization. - 2. Creating iobinding with OnnxModel if device is CUDA, which is essential for perf measurement. - 3. Running ORT with OnnxModel. - - Writes to ./{output_filename}, which should be - `pathlib.Path(self.output_dir) / f"{self.compiler}_{suite}_{self.dtype}_{self.mode}_{self.device}_{self.testing}.csv". - - TODO(bowbao): Record export time and export peak memory usage. - """ - timings = np.zeros((args.repeat, 2), np.float64) - is_correct = True - should_randomize_input = args.randomize_input - times = args.iterations_per_run - - onnx_model = onnx_model_cls( - args.output_directory or ".", model, copy.deepcopy(example_inputs) - ) - - def create_onnx_input_binded_fn( - onnx_model: OnnxModelFromTorchScript, pt_inputs, example_outputs - ): - # Goal is to move the iobinding creation outside of the timer function. 
- iobinding, outputs = onnx_model.create_iobinding(pt_inputs, example_outputs) - - def onnxrt_model_iter_fn(model, inputs, collect_outputs=True): - onnx_model.run_with_iobinding(iobinding, outputs) - if collect_outputs: - return outputs - - return onnxrt_model_iter_fn - - def create_onnx_fn(onnx_model: OnnxModelFromTorchScript, pt_inputs): - def onnxrt_model_iter_fn(model, inputs, collect_outputs=True): - return onnx_model.run(pt_inputs) - - return onnxrt_model_iter_fn - - for rep in range(args.repeat): - inputs = ( - randomize_input(copy.deepcopy(example_inputs)) - if should_randomize_input - else example_inputs - ) - timings[rep, 0], expected_output = timed( - model, - model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - - if current_device == "cpu": - onnxrt_model_iter_fn = create_onnx_fn(onnx_model, inputs) - else: - onnxrt_model_iter_fn = create_onnx_input_binded_fn( - onnx_model, inputs, expected_output - ) - - timings[rep, 1], actual_output = timed( - model, - onnxrt_model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - - pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue - median = np.median(timings, axis=0) - speedup = median[0] / median[1] - if args.dump_raw_metrics: - np.save( - f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy", - timings, - ) - - headers = ["dev", "name", "batch_size", "speedup", "abs_latency"] - row = [ - current_device, - current_name, - current_batch_size, - float(speedup), - median[1] * 1000, - ] - if "compilation_latency" in kwargs: - headers = headers + ["compilation_latency", "compression_ratio"] - row.append(kwargs["compilation_latency"]) - row.append(kwargs["compression_ratio"]) - - output_csv( - output_filename, - headers, - row, - ) - headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True) - assert ( - output_filename.find(".csv") > 0 - ), f"expected output_filename to be a .csv, but got {output_filename}" - output_csv( - output_filename[:-4] + "_compilation_metrics.csv", - ["dev", "name", "batch_size"] + headers, - [current_device, current_name, current_batch_size] + data, - ) - return format_speedup(speedup, pvalue, is_correct=is_correct) - - -def overhead_experiment(*args, model_iter_fn): - """ - Measure overheads of TorchDynamo by running with no backend (only - eager+FX), and reporting speedup/slowdown over eager. - - Writes to ./overheads.csv - """ - return speedup_experiment(*args, model_iter_fn) - - -def print_fx(gm, example_inputs): - print(gm.graph) - return gm - - -def print_aten_ops(gm, example_inputs): - from functorch.compile import aot_module - - def trace_printer(gm, _): - print(gm.graph) - return gm - - return aot_module(gm, fw_compiler=trace_printer, bw_compiler=trace_printer) - - -def baselines(models, model_iter_fn, example_inputs, args): - """ - Common measurement code across all baseline experiments. 
- """ - models = list(models) - for idx, (name, model) in enumerate(models): - if idx == 0: - result0 = model_iter_fn(model, example_inputs) - elif model is not None: - try: - result = model_iter_fn(model, example_inputs) - if same(result0, result): - continue - print(name, "is INCORRECT") - except Exception: - log.exception("error checking %s", name) - models[idx] = (name, None) - timings = np.zeros((args.repeat, len(models)), np.float64) - timings.fill(1.0e10) - for rep in range(args.repeat): - for idx, (name, model) in enumerate(models): - if model is not None: - try: - timings[rep, idx] = timed(model, model_iter_fn, example_inputs) - except Exception: - pass - pvalue = [ - ttest_ind(timings[:, 0], timings[:, i]).pvalue - for i in range(1, timings.shape[1]) - ] - median = np.median(timings, axis=0) - speedup = median[0] / median[1:] - for idx, (name, model) in enumerate(models[1:]): - if model is None: - speedup[idx] = 0.0 - result = " ".join( - [ - format_speedup(s, p, m is not None) - for s, p, m in zip(speedup, pvalue, [m for n, m in models[1:]]) - ] - ) - output_csv( - output_filename, - ("dev", "name", "batch_size") + tuple(n for n, m in models[1:]), - [current_device, current_name, current_batch_size] - + [f"{x:.4f}" for x in speedup], - ) - return result - - -def xla(args, model_iter_fn, model, example_inputs): - xla_dev = xm.xla_device(devkind=current_device) - model_xla = copy.deepcopy(model).to("cpu").to(device=xla_dev) - example_inputs_xla = tree_map_only( - torch.Tensor, lambda x: x.to("cpu").to(device=xla_dev), example_inputs - ) - for _ in range(3): # warmup - timed(model, model_iter_fn, example_inputs) - timed(model_xla, model_iter_fn, example_inputs_xla) - timings = np.zeros((args.repeat, 2), np.float64) - timings.fill(1.0e10) - for rep in range(args.repeat): - timings[rep, 0] = timed(model, model_iter_fn, example_inputs) - timings[rep, 1] = timed(model_xla, model_iter_fn, example_inputs_xla) - - pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue - time_baseline, time_xla = np.median(timings, axis=0) - speedup = time_baseline / time_xla - output_csv( - output_filename, - ("dev", "name", "batch_size", "speedup", "time_baseline", "time_xla"), - [ - current_device, - current_name, - current_batch_size, - speedup, - time_baseline, - time_xla, - ], - ) - return format_speedup(speedup, pvalue) - - -def try_script(model, example_inputs): - try: - return torch.jit.script(model) - except Exception: - return None - - -class AOTInductorModelCache: - cache = dict() - - @classmethod - def load(cls, model, example_inputs, eager_forward): - key = id(model) - if key not in cls.cache: - # Register the output dataclass to pytree - example_outputs = eager_forward( - copy.deepcopy(model), clone_inputs(example_inputs) - ) - _register_dataclass_output_as_pytree(example_outputs) - - example_args, example_kwargs = _normalize_bench_inputs(example_inputs) - example_inputs = torch._export.combine_args_kwargs( - example_args, example_kwargs - ) - - so_path, exported = torch._export.aot_compile( - model, example_args, example_kwargs - ) - - output_node = list(exported.graph.nodes)[-1] - output_tensors = [ - torch.empty( - node.meta["val"].size(), - dtype=node.meta["val"].dtype, - layout=node.meta["val"].layout, - device=node.meta["val"].device, - ) - for node in output_node.args[0] - ] - - # Use a utility function for easier benchmarking - source = """ - #include - - torch::aot_inductor::AOTInductorModel model; - - void run( - const std::vector& input_tensors, - std::vector& output_tensors) { - 
model.run(input_tensors, output_tensors, at::cuda::getCurrentCUDAStream()); - } - """ - module = torch.utils.cpp_extension.load_inline( - name="aot_inductor", - cpp_sources=[source], - functions=["run"], - extra_ldflags=[so_path], - with_cuda=True, - ) - - value = { - "module": module, - "exported": exported, - "output_tensors": output_tensors, - "output_spec": exported.call_spec.out_spec, - } - cls.cache[key] = value - - return ( - cls.cache[key]["module"], - cls.cache[key]["exported"], - cls.cache[key]["output_tensors"], - cls.cache[key]["output_spec"], - ) - - -def export_aot_inductor(forward: Callable): - eager_forward = forward - - def opt_aot_inductor(model, example_inputs, collect_outputs=False): - module, exported, output_tensors, output_spec = AOTInductorModelCache.load( - model, example_inputs, eager_forward - ) - param_buffer_values = list(exported.state_dict.values()) - example_args, example_kwargs = _normalize_bench_inputs(example_inputs) - example_inputs = torch._export.combine_args_kwargs(example_args, example_kwargs) - flat_example_inputs = fx_pytree.tree_flatten_spec( - example_inputs, exported.call_spec.in_spec - ) - all_args = (*param_buffer_values, *flat_example_inputs) - module.run(all_args, output_tensors) - return pytree.tree_unflatten(output_tensors, output_spec) - - return opt_aot_inductor - - -def download_retry_decorator(download_fn): - """ - Decorator function for applying retry logic to a download function. - - The wrapped function will be called up to 5 times and raises an exception if the function fails each time. - After each unsuccessful attempt, there is a delay before the next attempt, which is increased linearly with the number of tries. - - Usage: - @download_retry_decorator - def download_function(model_name: str): - # download logic goes here - """ - - @functools.wraps(download_fn) - def wrapper(self, *args, **kwargs) -> Any: - tries = 0 - total_allowed_tries = MAX_DOWNLOAD_ATTEMPTS - while tries <= total_allowed_tries: - try: - model = download_fn(self, *args, **kwargs) - return model - except Exception as e: - tries += 1 - if tries <= total_allowed_tries: - wait = tries * 30 - print( - f"Failed to load model: {e}. Trying again ({tries}/{total_allowed_tries}) after {wait}s" - ) - time.sleep(wait) - else: - raise RuntimeError( - f"Failed to load model '{args}' with following error(s): {str(e)}." - ) - - return wrapper - - -class OnnxModelFromTorchScript: - """TorchScript based onnx export. `torch.onnx.export` - - TODO(bowbao): - * large model export failed. - Onnx Model is larger than 2GB, but exporter makes decision based pt model size, which is - smaller than 2GB. - * OOM on slightly larger model. - Both pt model and ort inference session are on gpu. Attempt has been made to move ORT to - cuda:1, however ORT perf drop significantly. - For now running everything with batch_size 1 set in launch script. 
- """ - - TORCH_TO_NUMPY_DTYPE = { - torch.float16: np.float16, - torch.float32: np.float32, - torch.float64: np.float64, - torch.uint8: np.uint8, - torch.int8: np.int8, - torch.int16: np.int16, - torch.int32: np.int32, - torch.int64: np.longlong, - torch.bool: np.bool_, - } - - def __init__(self, output_directory, model, example_inputs): - self.model_path = self._generate_onnx_model_path(output_directory) - self._export( - model, - example_inputs, - self.model_path, - opset_version=17, - do_constant_folding=False, - verbose=False, - ) - self.onnx_session = self._init_ort_session(self.model_path) - - def _generate_onnx_model_path( - self, output_directory: str, onnx_model_folder_name: str = "bench_onnx_models" - ) -> str: - # Hack to get model name. - from torch._functorch import aot_autograd - - model_name = aot_autograd.model_name - model_path = pathlib.Path(output_directory, onnx_model_folder_name, model_name) - if model_path.exists() and model_path.is_dir(): - shutil.rmtree(model_path) - model_path.mkdir(parents=True, exist_ok=True) - return str(model_path / "model.onnx") - - def _export(self, model, example_inputs, output_path: str, /, **kwargs) -> None: - # Hack for huggingface models (kwargs only). - if isinstance(example_inputs, dict): - - class WrapperModel(torch.nn.Module): - def __init__(self, model, keys): - super().__init__() - self.model = model - self.keys = keys - - def forward(self, *args): - return self.model(**dict(zip(self.keys, args))) - - model = WrapperModel(model, list(example_inputs.keys())) - - torch.onnx.export( - model, - self.format_pt_inputs(example_inputs), - output_path, - **kwargs, - ) - - def _init_ort_session(self, model_path: str): - import onnxruntime - - if current_device == "cpu": - ort_providers = ["CPUExecutionProvider"] - else: - # NOTE(bowbao): Reduce OOM by running ORT on another gpu. - # TODO(bowbao): This works to avoid OOM, but performance is surprisingly very bad. - # cuda_provider_options = { - # "device_id": 1 if torch.cuda.device_count() > 1 else 0, - # } - # ort_providers = [("CUDAExecutionProvider", cuda_provider_options)] - ort_providers = ["CUDAExecutionProvider"] - - ort_session = onnxruntime.InferenceSession( - self.model_path, - providers=ort_providers, - ) - return ort_session - - def format_pt_inputs(self, pt_inputs): - # NOTE(bowbao): For huggingface benchmark, pt_inputs are formatted as dictionary, - # and consumed like `model(**pt_inputs)`. - # For other benchmarks, pt_inputs are formatted as tuple and consumed - # like `model(*pt_inputs)`. 
- if isinstance(pt_inputs, dict): - pt_inputs = list(pt_inputs.values()) - if isinstance(pt_inputs, torch.Tensor): - pt_inputs = (pt_inputs,) - return tuple(arg.contiguous() for arg in pt_inputs) - - def format_pt_outputs(self, pt_outputs): - if isinstance(pt_outputs, torch.Tensor): - pt_outputs = (pt_outputs,) - - pt_outputs, _ = pytree.tree_flatten(pt_outputs) - - # Hack for huggingface model outputs - try: - from transformers import modeling_outputs - except ImportError: - pass - else: - - def _to_tuple(x): - if isinstance(x, modeling_outputs.ModelOutput): - return x.to_tuple() - return x - - pt_outputs = pytree.tree_map(_to_tuple, pt_outputs) - pt_outputs, _ = pytree.tree_flatten(pt_outputs) - - return pt_outputs - - def create_outputs(self, *example_outputs): - return tuple(torch.empty_like(x) for x in example_outputs) - - def create_iobinding(self, pt_inputs, example_outputs): - pt_inputs = self.format_pt_inputs(pt_inputs) - example_outputs = self.format_pt_outputs(example_outputs) - - iobinding = self.onnx_session.io_binding() - args = [arg.contiguous() for arg in pt_inputs] - for ort_input, arg in zip(self.onnx_session.get_inputs(), args): - # NOTE: Small hack to reduce OOM issue by running ORT on another device. - # Disabled due to ORT perf regression. - # if torch.cuda.device_count() > 1: - # arg = arg.detach().to("cuda:1") - device = arg.device - iobinding.bind_input( - ort_input.name, - device.type, - device.index or 0, - self.TORCH_TO_NUMPY_DTYPE[arg.dtype], - arg.size(), - arg.data_ptr(), - ) - - outputs = self.create_outputs(*example_outputs) - for ort_output, output in zip(self.onnx_session.get_outputs(), outputs): - # if torch.cuda.device_count() > 1: - # output = output.detach().to("cuda:1") - device = output.device - iobinding.bind_output( - ort_output.name, - device.type, - device.index or 0, - self.TORCH_TO_NUMPY_DTYPE[output.dtype], - output.size(), - output.data_ptr(), - ) - return iobinding, outputs - - def run_with_iobinding(self, iobinding, outputs): - # 'outputs' are torch empty tensors binded to 'iobinding'. - self.onnx_session.run_with_iobinding(iobinding) - return outputs - - def run(self, pt_inputs): - # NOTE: For CUDA performance testing, use `run_with_iobinding` to exclude memory - # copying overhead for inputs/outputs between cpu and gpu. - # Otherwise perf number is inaccurate. - pt_inputs = self.format_pt_inputs(pt_inputs) - onnx_inputs = { - ort_input.name: pt_input.cpu().numpy() - for ort_input, pt_input in zip(self.onnx_session.get_inputs(), pt_inputs) - } - ort_outputs = self.onnx_session.run(None, onnx_inputs) - pt_outputs = [ - torch.from_numpy(ort_output).to(current_device) - for ort_output in ort_outputs - ] - if len(pt_outputs) == 1: - return pt_outputs[0] - return pt_outputs - - -class OnnxModelFromDynamo(OnnxModelFromTorchScript): - """Dynamo and Fx based export. 
`torch.onnx.dynamo_export`.""" - - def __init__(self, output_directory, model, example_inputs): - self.model_path = self._generate_onnx_model_path( - output_directory, "bench_dynamo_onnx_model" - ) - self._export_output = self._export(model, example_inputs, self.model_path) - self.onnx_session = self._init_ort_session(self.model_path) - - def _export( - self, model, example_inputs, output_path: str - ) -> torch.onnx.ExportOutput: - example_args, example_kwargs = _normalize_bench_inputs(example_inputs) - options = torch.onnx.ExportOptions() - export_output = torch.onnx.dynamo_export( - model, *example_args, **example_kwargs, export_options=options - ) - - export_output.save(output_path) - return export_output - - def format_pt_inputs(self, pt_inputs): - pt_args, pt_kwargs = _normalize_bench_inputs(pt_inputs) - return self._export_output.adapt_torch_inputs_to_onnx(*pt_args, **pt_kwargs) - - def format_pt_outputs(self, pt_outputs): - return self._export_output.adapt_torch_outputs_to_onnx(pt_outputs) - - -def optimize_onnx_ctx( - output_directory: str, - onnx_model_cls: Type[OnnxModelFromTorchScript], - run_n_iterations: Callable, -) -> Callable: - # NOTE(bowbao): This function creates and returns the onnx version of 'run_n_iterations', - # which does the following: - # 1. Export and cache model. - # 2. Create iobinding for ORT. - # 3. Run ORT for n iterations. - onnx_model: Optional[OnnxModelFromTorchScript] = None - - def run_n_iterations_onnx(model, inputs, n=2): - from _onnx import reporter - from torch.onnx._internal import exporter - from torch.onnx._internal.fx import diagnostics - - # NOTE(bowbao): Capture all export & ort errors and diagnostics. - # Serialize to csv, to be parsed and summarized later by '._onnx/reporter.py'. - # TODO: Accuracy mismatch is not reported here in csv. - assert ( - output_filename.find(".csv") > 0 - ), f"expected output_filename to be a .csv, but got {output_filename}" - output_error_filename = output_filename[:-4] + "_export_error.csv" - parser = reporter.ExportErrorParser( - current_device, current_name, current_batch_size - ) - try: - nonlocal onnx_model - if onnx_model is None: - onnx_model = onnx_model_cls( - output_directory, model, copy.deepcopy(inputs) - ) - - for _ in range(n - 1): - onnx_model.run(inputs) - return onnx_model.run(inputs) - except exporter.OnnxExporterError as e: - # `torch.onnx.dynamo_export` raises error that encloses diagnostics. - diagnostic_context = e.diagnostic_context - for parsed_error in parser.parse_diagnostic_context(diagnostic_context): - output_csv( - output_error_filename, parsed_error.headers, parsed_error.row - ) - - # Check also the raw exception that caused export failure. - # Skip if it is already analyzed by diagnostics. - cause_of_exception = e.__cause__ - if not isinstance( - cause_of_exception, diagnostics.RuntimeErrorWithDiagnostic - ): - parsed_error = parser.parse_exception(cause_of_exception) - output_csv( - output_error_filename, parsed_error.headers, parsed_error.row - ) - raise - except Exception as e: - # `torch.onnx.export` errors. - # ORT errors. 
- parsed_error = parser.parse_exception(e) - output_csv(output_error_filename, parsed_error.headers, parsed_error.row) - raise - - return run_n_iterations_onnx - - -def read_batch_size_from_file(args, filename, model_name): - batch_size = None - if os.path.exists("benchmarks"): - filename = os.path.join("benchmarks", filename) - assert os.path.exists(filename), filename - with open(filename) as f: - lines = f.readlines() - lines = [i.split(",") for i in lines if len(i.strip()) > 0] - for val in lines: - cur_name, b = val - if model_name == cur_name: - batch_size = int(b) - if batch_size is None: - log.warning("Could not find batch size for %s", model_name) - elif batch_size == -1: - raise RuntimeError( - f"Batch size is unset for {model_name} in {args.batch_size_file}" - ) - print(f"batch size: {batch_size}") - return batch_size - - -class TimeOutException(Exception): - pass - - -def alarm_handler(signum, frame): - raise TimeOutException() - - -def exit_after(s): - """ - Decorator to raise TimeoutException if the fn is taking more than s seconds - to run. - """ - - def outer(fn): - def inner(*args, **kwargs): - signal.signal(signal.SIGALRM, alarm_handler) - signal.alarm(s) - try: - result = fn(*args, **kwargs) - finally: - signal.alarm(0) - return result - - return inner - - return outer - - -def get_peak_memory(): - return torch.cuda.max_memory_allocated() / 10**9 - - -def null_experiment(args, model_iter_fn, model, example_inputs): - """ - A no-op experiment useful for making sure TorchBenchark alone works properly. - """ - - return [] - - -def cast_to(dtype, model, inputs): - # cast model and inputs to fp16 - if dtype == torch.float16: - model = model.half() - else: - model = model.to(dtype) - - inputs = tree_map( - lambda x: x.to(dtype) - if isinstance(x, torch.Tensor) and x.is_floating_point() - else x, - inputs, - ) - return model, inputs - - -def cast_to_bf16(model, inputs): - return cast_to(torch.bfloat16, model, inputs) - - -def cast_to_fp16(model, inputs): - return cast_to(torch.float16, model, inputs) - - -def cast_to_fp64(model, inputs): - return cast_to(torch.float64, model, inputs) - - -def cast_to_fp32(model, inputs): - return cast_to(torch.float32, model, inputs) - - -def reset_rng_state(use_xla=False): - torch.manual_seed(1337) - random.seed(1337) - np.random.seed(1337) - if use_xla: - xm.set_rng_state(1337, str(xm.xla_device())) - - -class DummyGradScaler: - def scale(self, loss): - return loss - - -def get_dynamo_stats(): - # TODO: consider deepcopy'ing the entire counters struct and - # adding a helper to do subtraction on it - return collections.Counter( - { - "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"], - "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"], - "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()), - # NB: The plus removes zero counts - "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]), - } - ) - - -def maybe_fresh_cache(fn, is_cold_start): - def inner(*args, **kwargs): - cache_minder = contextlib.nullcontext() - if is_cold_start: - cache_entries = {} - cache_minder = fresh_inductor_cache(cache_entries) - - try: - with cache_minder: - return fn(*args, **kwargs) - finally: - dump_cache = False - if dump_cache and is_cold_start: - output_csv( - output_filename[:-4] + "_triton_cache.csv", - ["dev", "name", "batch_size", "triton_cache"], - [ - current_device, - current_name, - current_batch_size, - cache_entries, - ], - ) - - return inner - - -@contextmanager -def 
maybe_init_distributed(should_init_distributed, rank, world_size, port="6789"): - try: - if should_init_distributed: - torch.cuda.set_device(rank) - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = port - torch.distributed.init_process_group( - "nccl", rank=rank, world_size=world_size - ) - yield - finally: - if should_init_distributed: - torch.distributed.destroy_process_group() - - -class BenchmarkRunner: - def __init__(self): - self.model_iter_fn = None - self.grad_scaler = DummyGradScaler() - self.autocast = contextlib.nullcontext - self.optimizer = None - self._args = None - - def setup_amp(self): - if self.args.only in self.fp32_only_models: - return - - if self.args.amp and self.args.devices == ["cuda"]: - # AMP training can lead to small loss values which can undeflow - # gradient values returning in zero gradients. To solve this - # problem, PyTorch introduces GradScaler. GradScaler is a stateful - # structure, that scales the loss values to prevent underflow. Loss - # values are big at the beginning of training (therefore not - # requiring scaling), while loss value tends to be small as network - # starts getting better (requiring scaling). GradScaler manages all - # of this fine tuning, checking the gradients are turning to inf, - # discarding such batches. - - # Since we are not running a long iteration, default value of - # init_scale 65536 is going to turn all gradients to inf. Therefore, - # we just use a init_scale of 2.0 for benchmarking purpose. - - # Disabling Gradscaler because - # 1) Benchmark setup runs 2 iterations of fwd-bwd. So, not useful. - # 2) Current setup shares grad_scaler for eager and dynamo model, - # which is bad as Gradscaler has state and can adjust the scaling - # factor between eager and dynamo run, making accuracy check - # harder. 
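The comment block here describes the standard AMP recipe that the harness then deliberately disables. For reference, a minimal sketch of that recipe with `autocast` plus a `GradScaler` (toy model and data, assumes a CUDA device):

```
import torch

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler()  # init_scale defaults to 65536

inputs = torch.randn(8, 10, device="cuda")
targets = torch.randn(8, 10, device="cuda")

for _ in range(3):
    optimizer.zero_grad(set_to_none=True)
    # Forward pass runs selected ops in reduced precision.
    with torch.cuda.amp.autocast():
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
    # Scale the loss so small fp16 gradients do not underflow, then let the
    # scaler unscale, skip inf/nan steps, and adjust the scale factor.
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```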
- # self.grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0) - self.autocast = torch.cuda.amp.autocast - elif (self.args.bfloat16 or self.args.amp) and self.args.devices == ["cpu"]: - self.autocast = torch.cpu.amp.autocast - - def init_optimizer(self, name, device, params): - if device == "cuda" and self.args.training and name not in CI_SKIP_OPTIMIZER: - self.optimizer = torch.optim.SGD(params, lr=0.01, foreach=True) - else: - self.optimizer = None - - @property - def args(self): - return self._args - - @args.setter - def args(self, args): - self._args = args - - @property - def skip_models(self): - return set() - - @property - def skip_models_for_cuda(self): - return set() - - @property - def skip_models_for_cpu(self): - return set() - - @property - def slow_models(self): - return set() - - @property - def very_slow_models(self): - return set() - - @property - def non_deterministic_models(self): - return set() - - @property - def fp32_only_models(self): - return set() - - @property - def force_amp_for_fp16_bf16_models(self): - return set() - - @property - def skip_not_suitable_for_training_models(self): - return set() - - @property - def failing_torchinductor_models(self): - return set() - - @property - def failing_fx2trt_models(self): - return set() - - @property - def skip_accuracy_checks_large_models_dashboard(self): - return set() - - @property - def skip_accuracy_check_as_eager_non_deterministic(self): - return set() - - @property - def get_tolerance_and_cosine_flag(self, is_training, current_device, name): - raise NotImplementedError() - - @property - def equal_nan(self): - equal_nan = True - if self.args.float32: - equal_nan = False - return equal_nan - - def iter_models(self, args): - for model_name in self.iter_model_names(args): - for device in args.devices: - try: - yield self.load_model( - device, - model_name, - batch_size=args.batch_size, - ) - except NotImplementedError: - continue # bad benchmark implementation - - def deepcopy_model(self, model): - return copy.deepcopy(model) - - def cast_based_on_args(self, model, example_inputs): - if self.args.float32 or self.args.only in self.fp32_only_models: - if not self.args.float32: - log.warning("Model %s supports float32 only", self.args.only) - model, example_inputs = cast_to_fp32(model, example_inputs) - elif self.args.float16: - if self.args.only in self.force_amp_for_fp16_bf16_models: - log.warning( - "Model %s does not support float16, running with amp instead", - self.args.only, - ) - self.args.amp = True - self.setup_amp() - else: - model, example_inputs = cast_to_fp16(model, example_inputs) - elif self.args.bfloat16: - if self.args.only in self.force_amp_for_fp16_bf16_models: - log.warning( - "Model %s does not support bfloat16, running with amp instead", - self.args.only, - ) - self.args.amp = True - self.setup_amp() - else: - model, example_inputs = cast_to_bf16(model, example_inputs) - - return model, example_inputs - - def validate_model(self, model, example_inputs): - """ - Runs the eager model with example inputs to ensure that eager passes. 
- """ - model = self.deepcopy_model(model) - example_inputs = clone_inputs(example_inputs) - model, example_inputs = self.cast_based_on_args(model, example_inputs) - try: - self.model_iter_fn(model, example_inputs) - except Exception as e: - raise NotImplementedError("Eager model failed to run") from e - - def maybe_cast(self, model, example_inputs): - model = self.deepcopy_model(model) - example_inputs = clone_inputs(example_inputs) - model, example_inputs = self.cast_based_on_args(model, example_inputs) - return model, example_inputs - - def decay_batch_exp(self, batch_size, factor=0.5, divisor=2): - out_batch_size = batch_size * factor - if out_batch_size > divisor: - out_batch_size = (out_batch_size + 1) // divisor * divisor - else: - out_batch_size = batch_size - 1 - return max(0, int(out_batch_size)) - - def batch_size_finder(self, device, model_name, initial_batch_size=1024): - batch_size = initial_batch_size - while batch_size >= 1: - torch.cuda.empty_cache() - try: - device, name, model, example_inputs, _ = self.load_model( - device, - model_name, - batch_size, - ) - self.model_iter_fn(model, example_inputs) - return batch_size - except RuntimeError as e: - error_str = str(e) - if "channels_last" in error_str: - break - batch_size = self.decay_batch_exp(batch_size) - return 1 - - def run_n_iterations(self, mod, inputs): - n = self.args.iterations - for _ in range(n - 1): - self.model_iter_fn(mod, inputs, collect_outputs=False) - return self.model_iter_fn(mod, inputs, collect_outputs=True) - - def optimizer_zero_grad(self, mod): - if self.optimizer is not None: - self.optimizer.zero_grad(True) - else: - mod.zero_grad(True) - - def optimizer_step(self): - if self.optimizer is not None: - self.optimizer.step() - - def get_benchmark_indices(self, length): - start = self._args.partition_id * (length // self._args.total_partitions) - end = ( - (self._args.partition_id + 1) * (length // self._args.total_partitions) - if self._args.partition_id < self._args.total_partitions - 1 - else length - ) - return start, end - - def deepcopy_and_maybe_ddp(self, model): - model = self.deepcopy_model(model) - if self.args.ddp: - assert ( - torch.distributed.is_available() - ), "Can't use DDP without a distributed enabled build" - from torch.nn.parallel import DistributedDataParallel as DDP - - model = DDP(model, find_unused_parameters=True) - elif self.args.fsdp: - assert ( - torch.distributed.is_available() - ), "Can't use FSDP without a distributed enabled build" - from torch.distributed.fsdp import ( - FullyShardedDataParallel as FSDP, - MixedPrecision, - ) - - from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy - - if self.args.float16: - dtype = torch.float16 - elif self.args.bfloat16: - dtype = torch.bfloat16 - else: - dtype = torch.float32 - - mp_policy = MixedPrecision( - param_dtype=dtype, - # Gradient communication precision. - reduce_dtype=dtype, - # Buffer precision. 
- buffer_dtype=dtype, - ) - - my_auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, recurse=True, min_num_params=int(1e5) - ) - - model = FSDP( - model, - use_orig_params=True, - device_id=torch.cuda.current_device() - if self.args.devices[-1] == "cuda" - else None, - mixed_precision=mp_policy, - limit_all_gathers=True, - auto_wrap_policy=my_auto_wrap_policy, - ) - if torch._inductor.config.triton.cudagraphs: - log.warning("Disabling cudagraphs for FSDP compatibility") - torch._inductor.config.triton.cudagraphs = False - return model - - def check_accuracy( - self, name, model, example_inputs, optimize_ctx, experiment, tag - ): - """ - Checks accuracy. - 1) Collect the outputs with fp64 datatype. This is useful for error checking. - 2) Checks if eager itself has variations. - """ - start_stats = get_dynamo_stats() - - def record_status(accuracy_status, dynamo_start_stats): - """ - Records the status in the csv file - """ - if current_name in self.non_deterministic_models: - if accuracy_status in ( - "pass", - "eager_two_runs_differ", - "fail_accuracy", - ): - accuracy_status = "pass" - - headers = ["dev", "name", "batch_size", "accuracy"] - fields = [current_device, current_name, current_batch_size, accuracy_status] - - if tag is not None: - headers.insert(3, "tag") - fields.insert(3, tag) - - dynamo_stats = get_dynamo_stats() - dynamo_stats.subtract(dynamo_start_stats) - for k, v in dynamo_stats.items(): - headers.append(k) - fields.append(v) - - output_csv(output_filename, headers, fields) - return accuracy_status - - if name in self.skip_accuracy_checks_large_models_dashboard: - return record_status("pass_due_to_skip", dynamo_start_stats=start_stats) - - # Collect the fp64 reference outputs to be used later for accuracy checking. - fp64_outputs = None - try: - model_fp64, inputs_fp64 = cast_to_fp64( - self.deepcopy_and_maybe_ddp(model), - clone_inputs(example_inputs), - ) - self.init_optimizer(name, current_device, model_fp64.parameters()) - fp64_outputs = self.run_n_iterations(model_fp64, inputs_fp64) - except Exception: - log.warning( - "fp64 golden ref were not generated for %s. 
Setting accuracy check to cosine", - name, - ) - self.args.cosine = True - fp64_outputs = None - - tolerance, cos_similarity = self.get_tolerance_and_cosine_flag( - self.args.training, current_device, name - ) - - # Cast the model to float16/float32 as necessary - model, example_inputs = self.maybe_cast(model, example_inputs) - accuracy_status = "pass" - - with self.pick_grad(name, self.args.training): - # Get results of native pytorch - reset_rng_state() - try: - model_copy = self.deepcopy_and_maybe_ddp(model) - self.init_optimizer(name, current_device, model_copy.parameters()) - correct_result = self.run_n_iterations( - model_copy, clone_inputs(example_inputs) - ) - except Exception as e: - accuracy_status = ( - "eager_1st_run_OOM" - if isinstance(e, torch.cuda.OutOfMemoryError) - else "eager_1st_run_fail" - ) - log.exception(e) - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - # Rerun native pytorch - reset_rng_state() - try: - model_copy = self.deepcopy_and_maybe_ddp(model) - self.init_optimizer(name, current_device, model_copy.parameters()) - correct_rerun_result = self.run_n_iterations( - model_copy, clone_inputs(example_inputs) - ) - except Exception as e: - accuracy_status = ( - "eager_2nd_run_OOM" - if isinstance(e, torch.cuda.OutOfMemoryError) - else "eager_2nd_run_fail" - ) - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - # Two eager runs should have exactly same result - is_same = True - try: - if ( - name not in self.skip_accuracy_check_as_eager_non_deterministic - and not same( - correct_result, - correct_rerun_result, - fp64_ref=None, - cos_similarity=False, - tol=0, - equal_nan=self.equal_nan, - ) - ): - is_same = False - except Exception as e: - # Sometimes torch.allclose may throw RuntimeError - is_same = False - - if not is_same: - accuracy_status = "eager_two_runs_differ" - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - correct_rerun_result = None - - # Run with Dynamo - reset_rng_state() - torch._dynamo.reset() - try: - model_copy = self.deepcopy_and_maybe_ddp(model) - self.init_optimizer(name, current_device, model_copy.parameters()) - if self.args.export: - # TB and TIMM use list example_inputs - # HF use dict example_inputs - example_args, example_kwargs = _normalize_bench_inputs( - example_inputs - ) - - # Register the output dataclass to pytree - example_outputs = model_copy(*example_args, **example_kwargs) - _register_dataclass_output_as_pytree(example_outputs) - - # apply export on module directly - # no need for n iterations - # the logic should be the same to self.model_iter_fn (forward_pass) - with self.autocast(): - optimized_model_iter_fn = optimize_ctx( - model_copy, example_args, example_kwargs - ) - new_result = optimized_model_iter_fn( - *example_args, **example_kwargs - ) - else: - optimized_model_iter_fn = optimize_ctx(self.run_n_iterations) - new_result = optimized_model_iter_fn(model_copy, example_inputs) - except Exception as e: - log.exception(e) - print( - "TorchDynamo optimized model failed to run because of following error" - ) - accuracy_status = ( - "OOM" - if isinstance(e, torch.cuda.OutOfMemoryError) - else "fail_to_run" - ) - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - if name in self.skip_accuracy_check_as_eager_non_deterministic: - return record_status("pass_due_to_skip", dynamo_start_stats=start_stats) - - # Workaround for ONNX for non-tensor outputs - if ( - current_onnx_compiler == "torchscript" - or current_onnx_compiler == 
"dynamo" - ): - from _onnx import patch - - ( - correct_result, - new_result, - fp64_outputs, - ) = patch.patch_non_tensor_outputs( - correct_result, new_result, fp64_outputs - ) - - try: - if not same( - correct_result, - new_result, - fp64_outputs, - equal_nan=self.equal_nan, - cos_similarity=cos_similarity, - tol=tolerance, - ): - is_same = False - except Exception as e: - # Sometimes torch.allclose may throw RuntimeError - is_same = False - - if not is_same: - if self.args.skip_accuracy_check: - accuracy_status = "pass_due_to_skip" - else: - accuracy_status = "fail_accuracy" - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - def check_tolerance( - self, name, model, example_inputs, optimize_ctx, base_device="cpu" - ): - """ - Checks tolerance based on https://pytorch.org/docs/stable/generated/torch.allclose.html. - """ - tolerance_status = "pass" - if name in self.skip_accuracy_checks_large_models_dashboard: - tolerance_status = "pass_due_to_skip" - return tolerance_status - # Cast the model to float16/float32 as necessary - model, example_inputs = self.maybe_cast(model, example_inputs) - - with self.pick_grad(name, self.args.training): - # Get results of native pytorch - reset_rng_state() - model_copy = copy.deepcopy(model) - model_copy = model_copy.to(base_device) - example_inputs_copy = copy.deepcopy(example_inputs) - example_inputs_copy = tree_map( - lambda x: x.to(base_device), example_inputs_copy - ) - self.init_optimizer(name, base_device, model_copy.parameters()) - correct_result = self.run_n_iterations(model_copy, example_inputs_copy) - - # Run with Dynamo - # Sometime CI fails with random triton compilation failure which will be skipped for now - # TODO: revisit this after switching to new Triton runtime - reset_rng_state() - torch._dynamo.reset() - try: - self.init_optimizer(name, current_device, model.parameters()) - optimized_model_iter_fn = optimize_ctx(self.run_n_iterations) - new_result = optimized_model_iter_fn(model, example_inputs) - except Exception as e: - log.exception(e) - if ( - self.args.ci - and isinstance(e, BackendCompilerFailed) - and ( - "Internal Triton PTX codegen error" in str(e) - or "cubin" in str(e) - ) - ): - return "pass_due_to_skip" - else: - print( - "TorchDynamo optimized model failed to run because of following error" - ) - return "fail_to_run" - - def dump_max_mean_values(tol, ref, res): - if isinstance(ref, (list, tuple, torch.nn.ParameterList, torch.Size)): - for refi, resi in zip(ref, res): - dump_max_mean_values(tol, refi, resi) - elif isinstance(ref, dict): - for k in ref.keys(): - dump_max_mean_values(tol, ref[k], res[k]) - elif isinstance(ref, torch.Tensor): - res = res.to(base_device) - t = torch.abs(ref - res) / (1 + torch.abs(ref)) - tol.append(t.flatten().to(torch.float32)) - return tol - - tol = [] - dump_max_mean_values(tol, correct_result, new_result) - tol = torch.cat(tol) - tol = torch.tensor(tol) - max = torch.max(tol) - mean = torch.mean(tol) - div = torch.std(tol) - headers = ["dev", "name", "batch_size", "max", "mean", "std"] - fields = [ - current_device, - current_name, - current_batch_size, - max.item(), - mean.item(), - div.item(), - ] - output_csv(output_filename, headers, fields) - return tolerance_status - - def run_performance_test( - self, name, model, example_inputs, optimize_ctx, experiment, tag=None - ): - if self.args.xla: - with self.pick_grad(name, self.args.training): - return experiment(*self.maybe_cast(model, 
example_inputs)) - - def warmup(fn, model, example_inputs, mode, niters=5): - peak_mem = 0 - start_stats = get_dynamo_stats() - try: - if current_device == "cuda": - torch.cuda.reset_peak_memory_stats() - torch.cuda.empty_cache() - t0 = time.perf_counter() - for _ in range(niters): - fn(model, example_inputs) - t1 = time.perf_counter() - latency = t1 - t0 - if current_device == "cuda": - peak_mem = get_peak_memory() - elif current_device == "cpu": - total = psutil.virtual_memory().total - percentage = psutil.Process(os.getpid()).memory_percent() - peak_mem = percentage * total / 10**9 - except Exception: - log.exception("Backend %s failed in warmup()", mode) - return sys.exit(-1) - dynamo_stats = get_dynamo_stats() - dynamo_stats.subtract(start_stats) - return latency, peak_mem, dynamo_stats - - # Cast the model to float16/float32 as necessary - model, example_inputs = self.maybe_cast(model, example_inputs) - - # Use distributed wrapping as necessary - model = self.deepcopy_and_maybe_ddp(model) - - self.init_optimizer(name, current_device, model.parameters()) - with self.pick_grad(name, self.args.training): - ok, total = Stats.reset_counters() - experiment_kwargs = {} - if tag is not None: - experiment_kwargs["tag"] = tag - results = [] - eager_latency, eager_peak_mem, _ = warmup( - self.model_iter_fn, model, example_inputs, "eager" - ) - optimized_model_iter_fn = optimize_ctx(self.model_iter_fn) - dynamo_latency, dynamo_peak_mem, dynamo_stats = warmup( - optimized_model_iter_fn, model, example_inputs, "dynamo" - ) - - compilation_time = dynamo_latency - eager_latency - compression_ratio = ( - eager_peak_mem / dynamo_peak_mem if dynamo_peak_mem else 0.0 - ) - if self.args.print_memory: - print( - f"memory: eager: {eager_peak_mem:.2f} GB, " - f"dynamo: {dynamo_peak_mem:.2f} GB, " - f"ratio: {compression_ratio:.2f}" - ) - - if experiment.func is speedup_experiment: - experiment_kwargs["compilation_latency"] = compilation_time - experiment_kwargs["compression_ratio"] = compression_ratio - experiment_kwargs["eager_peak_mem"] = eager_peak_mem - experiment_kwargs["dynamo_peak_mem"] = dynamo_peak_mem - experiment_kwargs["dynamo_stats"] = dynamo_stats - - if experiment.func is coverage_experiment: - ok, total = Stats.reset_counters() - results = [] - # run with torch._dynamo few times to populate the cache - for _ in range(3): - optimized_model_iter_fn(model, example_inputs) - _, frames_second_pass = Stats.reset_counters() # should be 0 - if frames_second_pass > 0: - optimized_model_iter_fn(model, example_inputs) - _, frames_third_pass = Stats.reset_counters() # should be 0 - else: - frames_third_pass = 0 - - results.append( - f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s" - ) - - if not hasattr(model, name): - model.name = name - results.append(experiment(model, example_inputs, **experiment_kwargs)) - return " ".join(map(str, results)) - - def minify_model( - self, - name, - model, - example_inputs, - optimize_ctx, - experiment, - tag, - ): - logging.info("Minifying %s...", name) - os.environ["TORCH_COMPILE_DEBUG"] = "1" - os.environ["TORCHDYNAMO_REPRO_AFTER"] = "dynamo" - os.environ["TORCHDYNAMO_REPRO_LEVEL"] = "4" - - self.check_accuracy(name, model, example_inputs, optimize_ctx, experiment, tag) - - if self.args.output_directory: - repro_dir = self.args.output_directory - else: - repro_dir = torch._dynamo.config.base_dir - - try: - shutil.move("repro.py", f"{repro_dir}/{name}_repro.py") - except OSError as e: - logging.error("Could not find repro script for model %s", 
name) - else: - logging.info( - "Repro script for model %s with minified graph saved to %s", - name, - repro_dir, - ) - - def run_one_model( - self, - name, - model, - example_inputs, - optimize_ctx, - experiment, - explain=False, - tag=None, - ): - mode = "train" if self.args.training else "eval" - msg = f"{current_device:4} {mode:5} {current_name:34} " - if tag: - msg += f" {tag:26}" - print(msg, flush=True) - - start_stats = get_dynamo_stats() - - if self.args.accuracy: - status = self.check_accuracy( - name, model, example_inputs, optimize_ctx, experiment, tag - ) - print(status) - if status == "fail_accuracy" and self.args.minify: - self.minify_model( - name, model, example_inputs, optimize_ctx, experiment, tag - ) - elif self.args.tolerance: - status = self.check_tolerance(name, model, example_inputs, optimize_ctx) - print(status) - elif self.args.performance: - status = self.run_performance_test( - name, model, example_inputs, optimize_ctx, experiment, tag - ) - print(status) - if self.args.timing: - from torch._dynamo.utils import op_count, print_time_report - from torch.utils._stats import simple_call_counter - - print_time_report() - stats = "STATS: " - stats = stats + " | ".join( - itertools.chain( - [f"call_* op count: {op_count}"], - (f"{key}:{value}" for key, value in simple_call_counter.items()), - ) - ) - print(stats) - stats = get_dynamo_stats() - stats.subtract(start_stats) - - if explain: - print( - f"Dynamo produced {stats['unique_graphs']} graphs " - f"covering {stats['calls_captured']} ops with " - f"{stats['graph_breaks']} graph breaks ({stats['unique_graph_breaks']} unique)" - ) - - if explain or self.args.log_graph_breaks or self.args.print_graph_breaks: - filename = f"{output_filename.rstrip('.csv')}_graph_breaks.csv" - - def add_double_quotes(x): - # Delimiter because reason could have comma - return f'"{x}"' - - for graph_break in graph_break_reasons: - reason = add_double_quotes(graph_break.reason) - user_stack = add_double_quotes( - ", ".join([str(x) for x in graph_break.user_stack]) - ) - output_csv( - filename, - ["model", "reason", "user_stack"], - [current_name, reason, user_stack], - ) - - if self.args.stats: - Stats.print_summary() - - -def help(fn): - return fn.__doc__ - - -diff_branch_default = "DIFF-BRANCH-DEFAULT" - - -def should_diff_branch(args): - return args.diff_branch != diff_branch_default - - -def parse_args(args=None): - parser = argparse.ArgumentParser() - parser.add_argument( - "--filter", "-k", action="append", help="filter benchmarks with regexp" - ) - parser.add_argument( - "--exclude", "-x", action="append", help="filter benchmarks with regexp" - ) - parser.add_argument( - "--exclude-exact", action="append", help="filter benchmarks with exact match" - ) - parser.add_argument( - "--total-partitions", - type=int, - default=1, - choices=range(1, 10), - help="Total number of partitions we want to divide the benchmark suite into", - ) - parser.add_argument( - "--partition-id", - type=int, - default=0, - help="ID of the benchmark suite partition to be run. Used to divide CI tasks", - ) - parser.add_argument( - "--devices", "--device", "-d", action="append", help="cpu or cuda" - ) - parser.add_argument("--device-index", help="CUDA device index") - parser.add_argument( - "--repeat", "-n", type=int, default=30, help="number of timing runs" - ) - iterations_per_run_help = """ - Run this may iterations for each time measurement. This is mainly used for - XLA training. 
We want to run multiple iterations per measurement so the - tracing and computation for different iteartions can overlap with each - other. This makes sure we have an accurate xla baseline. - """ - parser.add_argument( - "--iterations-per-run", type=int, default=1, help=iterations_per_run_help - ) - parser.add_argument( - "--randomize-input", - action="store_true", - help="Whether to randomize the input values. Dimensions will be kept the same.", - ) - parser.add_argument( - "--threads", - "-t", - type=int, - help="number of threads to use for eager and inductor", - ) - parser.add_argument( - "--nopython", action="store_true", help="Turn graph breaks into errors" - ) - parser.add_argument( - "--no-skip", - action="store_true", - help="run models that are in the global SKIP list", - ) - parser.add_argument( - "--prims-nvfuser", action="store_true", help="user prims + nvfuser backend" - ) - parser.add_argument( - "--dump-raw-metrics", - action="store_true", - help="dump raw timing metrics from speedup experiment", - ) - parser.add_argument( - "--log-operator-inputs", - action="store_true", - default=False, - ) - parser.add_argument( - "--channels-last", - action="store_true", - default=False, - help="use channels last format", - ) - parser.add_argument( - "--batch-size", "--batch_size", type=int, help="batch size for benchmarking" - ) - parser.add_argument( - "--iterations", type=int, default=2, help="how many iterations to run" - ) - parser.add_argument( - "--batch-size-file", type=str, help="String to load batch size from" - ) - parser.add_argument("--cosine", action="store_true", help="use cosine similarity") - parser.add_argument( - "--cpp-wrapper", action="store_true", help="turn on cpp/cuda wrapper codegen" - ) - parser.add_argument( - "--freezing", action="store_true", help="turn on freezing", default=False - ) - parser.add_argument( - "--ci", action="store_true", help="Flag to tell that its a CI run" - ) - parser.add_argument( - "--dynamic-ci-skips-only", - action="store_true", - help=( - "Run only the models that would have been skipped in CI " - "if dynamic-shapes, compared to running without dynamic-shapes. " - "This is useful for checking if more models are now " - "successfully passing with dynamic shapes. " - "Implies --dynamic-shapes and --ci" - ), - ) - parser.add_argument( - "--dashboard", action="store_true", help="Flag to tell that its a Dashboard run" - ) - parser.add_argument( - "--skip-fp64-check", action="store_true", help="skip accuracy check using fp64" - ) - parser.add_argument( - "--fast", "-f", action="store_true", help="skip slow benchmarks" - ) - parser.add_argument( - "--only", - help="""Run just one model from torchbench. Or - specify the path and class name of the model in format like: - --only=path:,class: - - Due to the fact that dynamo changes current working directory, - the path should be an absolute path. - - The class should have a method get_example_inputs to return the inputs - for the model. 
An example looks like - ``` - class LinearModel(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.Linear(10, 10) - - def forward(self, x): - return self.linear(x) - - def get_example_inputs(self): - return (torch.randn(2, 10),) - ``` - """, - ) - parser.add_argument( - "--multiprocess", - action="store_true", - help="Create n processes based on the number of devices (distributed use case).", - ) - parser.add_argument( - "--ddp", - action="store_true", - help="Wraps model in DDP before running it, and uses dynamo DDPOptmizer (graph breaks) by default.", - ) - parser.add_argument( - "--fsdp", - action="store_true", - help="""Wraps model in FSDP before running it. Disables cudagraphs by default. - Doesn't recursively wrap, mainly useful for checking dynamo UnspecNNModule compatibility - """, - ) - parser.add_argument( - "--no-optimize-ddp", - action="store_true", - help="Disables dynamo DDPOptimizer (graph breaks). (Applies only when using --ddp benchmark mode).", - ) - parser.add_argument( - "--distributed-master-port", - default="6789", - help="Port to bind for for torch.distributed. Use the default unless it's conflicting with another user", - ) - parser.add_argument( - "--dynamic-shapes", - action="store_true", - help="Runs a dynamic shapes version of the benchmark, if available.", - ) - parser.add_argument( - "--dynamic-batch-only", - action="store_true", - help="Only assume batch dimension is dynamic. Implies --dynamic-shapes", - ) - parser.add_argument( - "--specialize-int", action="store_true", help="Run with specialize_int=True." - ) - parser.add_argument( - "--use-eval-mode", - action="store_true", - help="sets model.eval() to reduce randomness", - ) - parser.add_argument( - "--skip-accuracy-check", - action="store_true", - help="keeps running even when accuracy fails", - ) - parser.add_argument( - "--generate-aot-autograd-stats", - action="store_true", - help="Generates AOT Autograd stats like how mnay graphs are sent to AOT", - ) - parser.add_argument( - "--inductor-settings", - action="store_true", - help="Use same settings as --inductor for baseline comparisons", - ) - parser.add_argument( - "--suppress-errors", - action="store_true", - help="Suppress errors instead of raising them", - ) - parser.add_argument( - "--output", - help="Overrides the output filename", - ) - parser.add_argument( - "--output-directory", - help="Overrides the directory to place output files.", - ) - parser.add_argument( - "--baseline", - help="Compare with a prior --output", - ) - parser.add_argument( - "--part", - default=None, - help="Specify the part of the model to run.", - ) - parser.add_argument( - "--export-profiler-trace", - action="store_true", - help="exports trace of kineto profiler", - ) - parser.add_argument( - "--profiler-trace-name", - "--profiler_trace_name", - help="Overwrites exported trace name", - ) - parser.add_argument( - "--diff-branch", - default=diff_branch_default, - help="delta current branch against given branch.", - ) - parser.add_argument( - "--tag", default=None, help="Specify a tag to be included in csv files." 
- ) - parser.add_argument( - "--explain", - action="store_true", - help="print some graph/op statistics during the run, similar to .explain()", - ) - parser.add_argument( - "--stats", - action="store_true", - help="print graph counter stats", - ) - parser.add_argument( - "--print-memory", - action="store_true", - help="print extra memory statistics", - ) - parser.add_argument( - "--print-dataframe-summary", - action="store_true", - help="print dataframe result used for calculating accuracy", - ) - parser.add_argument( - "--cold-start-latency", - "--cold_start_latency", - action="store_true", - help="Use a fresh triton cachedir when running each model, to force cold-start compile.", - ) - parser.add_argument( - "--disable-cudagraphs", - action="store_true", - help="Disables cudagraphs for Inductor", - ) - parser.add_argument( - "--disable-split-reductions", - action="store_true", - help="Disables split reductions for Inductor", - ) - parser.add_argument( - "--disable-persistent-reductions", - action="store_true", - help="Disables split reductions for Inductor", - ) - parser.add_argument( - "--disable-divisible-by-16", - action="store_true", - help="Disables divisible by 16 hint to Triton for Inductor", - ) - parser.add_argument( - "--inductor-compile-mode", - default=None, - help="torch.compile mode argument for inductor runs.", - ) - parser.add_argument( - "--print-graph-breaks", - action="store_true", - help="Show a warning whenever graph break", - ) - parser.add_argument( - "--log-graph-breaks", - action="store_true", - help="log graph breaks in a file", - ) - parser.add_argument( - "--trace-on-xla", - action="store_true", - help="Whether to trace the model on XLA or on eager device", - ) - parser.add_argument( - "--xla-tolerance", - type=float, - default=1e-2, - help="XLA needs a loose tolerance to pass the correctness check", - ) - parser.add_argument( - "--collect-outputs", - action="store_true", - help="""Whether to collect outputs for training. Set this to true if we - want to verify the numerical correctness of graidents. But that may - cause time measurement not accurate""", - ) - parser.add_argument( - "--enable-activation-checkpointing", - action="store_true", - help="Enables activation checkpointing for HF models", - ) - parser.add_argument("--timing", action="store_true", help="Emits phase timing") - - parser.add_argument( - "--progress", - action="store_true", - help="Print n/k models message between each model run.", - ) - - parser.add_argument( - "--timeout", - type=int, - default=2000, - help="timeout (second) for benchmarking.", - ) - - parser.add_argument( - "--per_process_memory_fraction", - type=float, - default=1, - help="Set per-process GPU memory fraction (limit) for reducing usable size and reproducing OOMs", - ) - - parser.add_argument( - "--no-translation-validation", - action="store_true", - help="Disable translation validation for accuracy builds.", - ) - - parser.add_argument( - "--minify", - action="store_true", - help="Enable minification when failure is below tolerance. 
Save repro script for each model.", - ) - - group_fuser = parser.add_mutually_exclusive_group() - # --nvfuser is now the default, keep the option to not break scripts - group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS) - group_fuser.add_argument("--nnc", action="store_true", help="enable NNC for GPUs") - - group_prec = parser.add_mutually_exclusive_group() - group_prec.add_argument("--float16", action="store_true", help="cast model to fp16") - group_prec.add_argument( - "--bfloat16", action="store_true", help="cast model to bf16" - ) - group_prec.add_argument("--float32", action="store_true", help="cast model to fp32") - group_prec.add_argument( - "--amp", action="store_true", help="use automatic mixed precision" - ) - - group_printout = parser.add_mutually_exclusive_group() - group_printout.add_argument( - "--verbose", "-v", action="store_true", help="enable verbose debug printouts" - ) - group_printout.add_argument( - "--quiet", "-q", action="store_true", help="suppress debug printouts" - ) - - group = parser.add_mutually_exclusive_group() - group.add_argument( - "--coverage", action="store_true", help="(default) " + help(coverage_experiment) - ) - group.add_argument( - "--overhead", action="store_true", help=help(overhead_experiment) - ) - group.add_argument( - "--speedup-dynamo-ts", - action="store_true", - help="TorchDynamo frontend with torchscript backend", - ) - group.add_argument( - "--speedup-fx2trt", action="store_true", help=help(speedup_experiment_fx2trt) - ) - group.add_argument( - "--speedup-fx2trt-fp16", - action="store_true", - help=help(speedup_experiment_fx2trt), - ) - group.add_argument( - "--print-fx", - action="store_true", - help="Print fx traces captured from model", - ) - group.add_argument( - "--print-aten-ops", - action="store_true", - help="Print traces of aten ops captured by AOT autograd", - ) - group.add_argument( - "--inductor", - action="store_true", - help="Measure speedup with TorchInductor", - ) - group.add_argument( - "--export", - action="store_true", - help="Measure pass rate with export", - ) - group.add_argument( - "--export-aot-inductor", - action="store_true", - help="Measure pass rate with Export+AOTInductor", - ) - group.add_argument( - "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch" - ) - group.add_argument( - "--torchscript-onnx", - "--torchscript_onnx", - action="store_true", - help="Measure speedup with TorchScript ONNX, i.e. `torch.onnx.export`", - ) - group.add_argument( - "--dynamo-onnx", - "--dynamo_onnx", - action="store_true", - help="Measure speedup with Dynamo ONNX, i.e. 
`torch.onnx.dynamo_export`", - ) - group.add_argument( - "--backend", - choices=torch._dynamo.list_backends(exclude_tags=None), - help="measure speedup with a given backend", - ) - group.add_argument("--nothing", action="store_true", help=help(null_experiment)) - group.add_argument( - "--log-conv-args", - action="store_true", - help="Dump convolution input/weight/bias's shape/stride/dtype and other options to json", - ) - group.add_argument( - "--recompile-profiler", - "--recompile_profiler", - action="store_true", - help="Run the dynamo recompilation profiler on each model.", - ) - group.add_argument( - "--find-batch-sizes", - action="store_true", - help="finds the largest batch size that could fit on GPUs", - ) - - mode_group = parser.add_mutually_exclusive_group(required=True) - mode_group.add_argument( - "--accuracy", - action="store_true", - help="Checks accuracy with small batch size and eval mode", - ) - mode_group.add_argument( - "--performance", action="store_true", help="Measures performance speedup" - ) - mode_group.add_argument( - "--tolerance", - action="store_true", - help="extracts the tolerance for each model with small batch size and eval mode", - ) - run_mode_group = parser.add_mutually_exclusive_group(required=True) - run_mode_group.add_argument( - "--training", - action="store_true", - help="Performs training", - ) - run_mode_group.add_argument( - "--inference", action="store_true", help="Performs inference" - ) - return parser.parse_args(args) - - -def process_entry(rank, runner, original_dir, args): - args.rank = rank - with maybe_init_distributed( - args.use_distributed, - rank=rank, - world_size=args.world_size, - port=args.distributed_master_port, - ): - return maybe_fresh_cache( - run, (args.cold_start_latency and args.only) or args.ci - )(runner, args, original_dir) - - -def main(runner, original_dir=None, args=None): - if original_dir: - os.chdir(original_dir) - args = parse_args(args) - if args.baseline: - args.baseline = os.path.abspath(args.baseline) - - if should_diff_branch(args): - import git - - # We do this here so we error out earlier if there's an issue - repo = git.Repo() - if repo.is_dirty(): - raise RuntimeError( - "--diff-branch called on dirty branch. Commit, stash, or reset." - ) - main_branch = repo.active_branch.name - if main_branch == args.diff_branch: - raise RuntimeError( - f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?" - ) - - device_count = torch.cuda.device_count() - args.use_distributed = (args.ddp or args.fsdp) and args.only - if args.multiprocess: - if device_count <= 1: - log.warning( - "The use multiprocess flag is set but there are <= 1 devices available." 
- ) - # multiprocess path - args.world_size = device_count - mp.spawn(process_entry, args=(runner, original_dir, args), nprocs=device_count) - else: - # single process path just uses the main process - args.world_size = 1 - process_entry(0, runner, original_dir, args) - - -def run(runner, args, original_dir=None): - # Pass the parsed args object to benchmark runner object - runner.args = args - - args.filter = args.filter or [r"."] - args.exclude = args.exclude or [r"^$"] - args.exclude_exact = args.exclude_exact or [] - - if args.inductor: - assert args.backend is None - args.backend = "inductor" - if args.dynamic_ci_skips_only: - args.dynamic_shapes = True - args.ci = True - if args.dynamic_batch_only: - args.dynamic_shapes = True - torch._dynamo.config.assume_static_by_default = True - if args.dynamic_shapes: - if not args.dynamic_batch_only: - torch._dynamo.config.assume_static_by_default = False - if args.specialize_int: - torch._dynamo.config.specialize_int = True - if args.ci: - if args.accuracy: - # Run fewer iterations when checking accuracy - args.repeat = 2 - - # Set translation validation on by default on CI accuracy runs. - torch._dynamo.config.translation_validation = True - - if args.dynamic_ci_skips_only: - # Test only the incremental set of jobs whose skipped was - # caused solely by turning on dynamic shapes - assert args.dynamic_shapes - ci = functools.partial(CI, args.backend, training=args.training) - args.filter = list( - set(CI_SKIP[ci(dynamic=True)]) - set(CI_SKIP[ci(dynamic=False)]) - ) - else: - ci = functools.partial( - CI, args.backend, training=args.training, dynamic=args.dynamic_shapes - ) - for device in args.devices: - args.exclude_exact.extend(CI_SKIP[ci(device=device)]) - if args.ddp: - # TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf, - # but just to measure impact on singlenode of performing graph-breaks. - # Left it as a follow up to keep this PR isolated. - assert ( - args.accuracy - ), "DDP benchmark is currently only hooked up to --accuracy bench" - assert args.training, "DDP benchmark requires --training mode" - if args.no_optimize_ddp: - torch._dynamo.config.optimize_ddp = False - else: - # TODO(whc) after enabling DDPOptimizer by default this could be removed or assert - torch._dynamo.config.optimize_ddp = True - if args.only == "dlrm": - log.error( - "DLRM+DDP is unsupported as it requires sharding the embedding layer separately from DDP" - ) - return sys.exit(-1) - if args.accuracy: - # Use small batch size. We use >1 batch size to ensure we test - # batch_norm type of operators that work on batch dims. - # TODO - Go through the failures for batch size = 2 - if args.batch_size is None: - if runner.suite_name == "huggingface": - args.batch_size = 1 - elif runner.suite_name == "torchbench": - args.batch_size = 4 - else: - # Larger batch size of TIMM models to have stable batch_norm - assert runner.suite_name == "timm_models" - args.batch_size = 8 - - # Remove sources of randomness - if runner.suite_name not in ("timm_models", "huggingface"): - # TODO - Using train mode for timm_models and HF models. Move to train mode for Torchbench as well. 
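The accuracy path that continues just below pins RNG state and switches every backend to deterministic kernels so eager and compiled runs can be compared reliably. A standalone sketch of that kind of reproducibility setup (the seed value and helper name are illustrative, and the per-model exclusions in the surrounding code are omitted):

```
import os
import random

import numpy as np
import torch

SEED = 1337


def make_deterministic(seed: int = SEED) -> None:
    """Pin every RNG and disable nondeterministic kernels for accuracy runs."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # cuBLAS needs a fixed workspace size for deterministic GEMMs.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # TF32 trades precision for speed, so turn it off when comparing outputs.
    torch.backends.cudnn.allow_tf32 = False
    torch.backends.cuda.matmul.allow_tf32 = False


make_deterministic()
```

Note that `torch.use_deterministic_algorithms(True)` raises at runtime for ops without a deterministic implementation, which is why the surrounding code gates it behind a model allowlist.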
- args.use_eval_mode = True - inductor_config.fallback_random = True - if args.only is not None and args.only not in { - "alexnet", - "Background_Matting", - "pytorch_CycleGAN_and_pix2pix", - "pytorch_unet", - "Super_SloMo", - "vgg16", - # https://github.com/pytorch/pytorch/issues/96724 - "Wav2Vec2ForCTC", - "Wav2Vec2ForPreTraining", - "sam", - }: - # some of the models do not support use_deterministic_algorithms - torch.use_deterministic_algorithms(True) - os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.allow_tf32 = False - torch.backends.cudnn.benchmark = False - torch.backends.cuda.matmul.allow_tf32 = False - - # Remove randomeness when torch manual seed is called - patch_torch_manual_seed() - - # Some models e.g. yolov3 assert batch size on n_gpus - if "CUDA_VISIBLE_DEVICES" not in os.environ: - args.device_index = "0" - - # Stricter check to disable fallbacks - args.suppress_errors = False - - if args.device_index is not None: - os.environ["CUDA_VISIBLE_DEVICES"] = args.device_index - - elif args.performance: - # Ensure that we test on real scenarios - args.use_eval_mode = False - - if args.partition_id > args.total_partitions or args.partition_id < 0: - print("Invalid partition id") - return sys.exit(-1) - - if not args.devices: - if torch.cuda.is_available(): - args.devices = ["cuda"] - else: - log.warning("torch.cuda.is_available() == False, using CPU") - args.devices = ["cpu"] - - if args.devices != ["cpu"] and torch.cuda.is_available(): - global synchronize - synchronize = torch.cuda.synchronize - - if ( - args.devices == ["cuda"] - and torch.cuda.get_device_properties(0).total_memory < 25 * 2**30 - ): - # OOM errors on an RTX 3090 with 24gb RAM - runner.skip_models.update( - { - # torchbench - "hf_Longformer", - "timm_nfnet", - "timm_efficientdet", - } - ) - if args.training: - runner.skip_models.add("hf_T5") - - if args.nnc: - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - torch._C._jit_set_texpr_fuser_enabled(True) - torch._C._jit_set_nvfuser_enabled(False) - - if args.threads: - torch.set_num_threads(args.threads) - - if args.verbose: - torch._logging.set_logs(dynamo=logging.DEBUG) - - if args.print_graph_breaks: - torch._dynamo.config.print_graph_breaks = True - - if args.quiet: - torch._logging.set_logs(dynamo=logging.ERROR) - - torch._dynamo.config.suppress_errors = args.suppress_errors - - if args.training: - runner.model_iter_fn = runner.forward_and_backward_pass - runner.skip_models.update(runner.skip_not_suitable_for_training_models) - else: - runner.model_iter_fn = runner.forward_pass - - if args.fast: - runner.skip_models.update(runner.slow_models) - - if args.devices == ["cpu"]: - runner.skip_models.update(runner.very_slow_models) - runner.skip_models.update(runner.skip_models_for_cpu) - elif args.devices == ["cuda"]: - runner.skip_models.update(runner.skip_models_for_cuda) - - if args.no_skip: - runner.skip_models.clear() - - experiment = null_experiment - global current_name, current_device, current_batch_size, output_filename, optimize_ctx, current_onnx_compiler - optimize_ctx = contextlib.nullcontext() - - if args.overhead: - optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython) - experiment = speedup_experiment - output_filename = "overheads.csv" - elif args.inductor: - inductor_config.debug = args.verbose - if args.threads: - inductor_config.cpp.threads = args.threads - - optimize_ctx = functools.partial( - 
torch.compile, - backend="inductor", - fullgraph=args.nopython, - mode=args.inductor_compile_mode, - ) - experiment = speedup_experiment - output_filename = "inductor.csv" - elif args.export: - optimize_ctx = torch._export.export - experiment = speedup_experiment - output_filename = "export.csv" - elif args.xla: - (dev,) = args.devices - os.environ["PJRT_DEVICE"] = {"cuda": "GPU", "cpu": "CPU"}[dev] - torch._dynamo.mark_dynamic = MagicMock() - experiment = xla - output_filename = "xla.csv" - elif args.torchscript_onnx: - optimize_ctx = functools.partial( - optimize_onnx_ctx, args.output_directory or ".", OnnxModelFromTorchScript - ) - experiment = functools.partial( - speedup_experiment_onnx, OnnxModelFromTorchScript - ) - output_filename = "torchscript_onnx.csv" - current_onnx_compiler = "torchscript" - elif args.dynamo_onnx: - optimize_ctx = functools.partial( - optimize_onnx_ctx, args.output_directory or ".", OnnxModelFromDynamo - ) - experiment = functools.partial(speedup_experiment_onnx, OnnxModelFromDynamo) - output_filename = "dynamo_onnx.csv" - current_onnx_compiler = "dynamo" - elif args.speedup_dynamo_ts: - optimize_ctx = torch._dynamo.optimize("ts", nopython=args.nopython) - experiment = speedup_experiment - output_filename = "speedup_dynamo_ts.csv" - elif args.prims_nvfuser: - optimize_ctx = torch._dynamo.optimize("prims_nvfuser", nopython=args.nopython) - experiment = speedup_experiment - backend_str = "prims_nvfuser" - output_filename = f"accuracy_aot_{backend_str}.csv" - elif args.print_fx: - optimize_ctx = torch._dynamo.optimize( - print_fx, - nopython=args.nopython, - ) - elif args.print_aten_ops: - optimize_ctx = torch._dynamo.optimize( - print_aten_ops, - nopython=args.nopython, - ) - elif args.nothing: - optimize_ctx = nothing - experiment = speedup_experiment - output_filename = "nothing.csv" - elif args.backend or args.export_aot_inductor: - if args.export_aot_inductor: - assert not args.training, "AOTInductor only supports inference" - assert args.devices == ["cuda"], "AOTInductor only tested for CUDA" - optimize_ctx = export_aot_inductor - else: - optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython) - experiment = speedup_experiment - if args.accuracy: - output_filename = f"accuracy_{args.backend}.csv" - elif args.tolerance: - output_filename = f"tolerance_{args.backend}.csv" - else: - output_filename = f"speedup_{args.backend}.csv" - elif args.recompile_profiler: - output_filename = "recompile_profiler_log.csv" - experiment = recompile_profiler_experiment - else: - optimize_ctx = torch._dynamo.optimize( - fx_insert_profiling, nopython=args.nopython - ) - experiment = coverage_experiment - output_filename = "coverage.csv" - - if args.inductor or args.backend == "inductor" or args.export_aot_inductor: - inductor_config.triton.cudagraphs = not args.disable_cudagraphs - inductor_config.triton.persistent_reductions = ( - not args.disable_persistent_reductions - ) - inductor_config.split_reductions = not args.disable_split_reductions - inductor_config.triton.divisible_by_16 = not args.disable_divisible_by_16 - inductor_config.cpp_wrapper = args.cpp_wrapper - if args.inference: - inductor_config.freezing = args.freezing - - runner.setup_amp() - - if args.output: - output_filename = args.output - - if output_filename: - if args.output_directory: - output_filename = os.path.join(args.output_directory, output_filename) - else: - output_filename = os.path.join( - torch._dynamo.config.base_dir, output_filename - ) - - if args.find_batch_sizes and 
args.only: - for device in args.devices: - batch_size = runner.batch_size_finder(device, args.only) - print(args.only, batch_size) - output_csv(output_filename, [], [args.only, batch_size]) - return - - if args.export_profiler_trace: - if args.profiler_trace_name is None: - if args.backend: - args.profiler_trace_name = args.backend - elif args.inductor: - args.profiler_trace_name = "inductor" - else: - args.profiler_trace_name = "profile" - else: - args.profiler_trace_name = args.profiler_trace_name - - if args.no_translation_validation: - # Overwrite 'translation_validation' config, if specified. - torch._dynamo.config.translation_validation = False - - experiment = functools.partial(experiment, args, runner.model_iter_fn) - - if args.only and should_diff_branch(args): - import git - - repo = git.Repo() - main_branch = repo.active_branch.name - try: - # Adding diff-branch again to the args will override previous value - call_args = ( - [sys.executable] + sys.argv + [f"--diff-branch={diff_branch_default}"] - ) - # Run for main branch - subprocess.check_call(call_args + [f"--tag={main_branch}"]) - # Run for comparison branch - repo.git.checkout(args.diff_branch) - subprocess.check_call(call_args + [f"--tag={args.diff_branch}"]) - finally: - # Go back to main branch - repo.git.checkout(main_branch) - elif args.only: - model_name = args.only - for device in args.devices: - batch_size = args.batch_size - if args.batch_size_file: - batch_size = read_batch_size_from_file( - args, args.batch_size_file, model_name - ) - if model_specified_by_path(args.only): - model, example_inputs = load_model_from_path(args.only) - name = model.__class__.__name__ - model = model.to(device=device) - example_inputs = tree_map_only( - torch.Tensor, lambda x: x.to(device=device), example_inputs - ) - else: - try: - with tqdm(desc="loading model"): - if args.part: - ( - device, - name, - model, - example_inputs, - batch_size, - ) = runner.load_model( - device, - model_name, - batch_size=batch_size, - part=args.part, - ) - else: - if args.fsdp: - # Always load model on cpu for fsdp - # When initializing FSDP, we will use the cuda device if args.cuda is set - ( - _, - name, - model, - example_inputs, - batch_size, - ) = runner.load_model( - "cpu", model_name, batch_size=batch_size - ) - else: - ( - device, - name, - model, - example_inputs, - batch_size, - ) = runner.load_model( - device, model_name, batch_size=batch_size - ) - except NotImplementedError as e: - print(e) - import traceback - - print(traceback.format_exc()) - logging.warning("%s failed to load", args.only) - continue # bad benchmark implementation - - if args.trace_on_xla: - xla_dev = xm.xla_device() - model = model.to(device=xla_dev) - example_inputs = tree_map_only( - torch.Tensor, lambda x: x.to(device=xla_dev), example_inputs - ) - - current_name = name - current_device = device - current_batch_size = batch_size - set_model_name(name) - - # Look for stuff that looks like batch size, and mark it dynamic. 
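The comment starting here (continued below) explains the heuristic of marking the batch-like dimension dynamic before compiling. A small sketch of `torch._dynamo.mark_dynamic` applied that way, with an illustrative module and sizes:

```
import torch
import torch._dynamo

batch_size = 16
model = torch.nn.Linear(10, 10)
example_input = torch.randn(batch_size, 10)

# Mark the first dimension whose size equals the batch size as dynamic so
# torch.compile does not specialize the graph on this particular batch size.
for dim, size in enumerate(example_input.size()):
    if size == batch_size:
        torch._dynamo.mark_dynamic(example_input, dim)
        break

compiled = torch.compile(model)
print(compiled(example_input).shape)       # torch.Size([16, 10])
print(compiled(torch.randn(8, 10)).shape)  # different batch, should reuse the graph
```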
- # Better integration would integrate directly with benchmark suite - # but cannot conveniently do this - # NB: This must be done late enough so that we don't do more - # conversions on the inputs - # NB: Assumes only the first batch-y like dimension is the batch - marked = False - - def detect_and_mark_batch(t): - nonlocal marked - for i, s in enumerate(t.size()): - if s == batch_size: - torch._dynamo.mark_dynamic(t, i) - marked = True - break - - if ( - args.dynamic_batch_only - and batch_size > 1 - and model_name not in CI_SKIP_DYNAMIC_BATCH_ONLY - ): - tree_map_only(torch.Tensor, detect_and_mark_batch, example_inputs) - assert marked, f"nothing in example_inputs had a dim with {batch_size}" - - if args.log_operator_inputs: - log_operator_inputs( - model, example_inputs, runner.model_iter_fn, name, args - ) - continue - - if args.per_process_memory_fraction != 1: - torch.cuda.set_per_process_memory_fraction( - args.per_process_memory_fraction - ) - - model, example_inputs = runner.cast_based_on_args(model, example_inputs) - runner.run_one_model( - name, - model, - example_inputs, - optimize_ctx, - experiment, - explain=args.explain, - tag=args.tag, - ) - if args.generate_aot_autograd_stats: - stats_file = output_filename.split(".csv")[0] + "_stats.csv" - output_csv( - stats_file, - ("dev", "name", "batch_size", "total_aot_graphs", "ok_aot_graphs"), - [ - current_device, - current_name, - current_batch_size, - *Stats.aot_summary(), - ], - ) - else: - if output_filename and os.path.exists(output_filename): - os.unlink(output_filename) - if original_dir: - os.chdir(original_dir) - model_names = list(runner.iter_model_names(args)) - nmodels = len(model_names) - for i, name in enumerate(model_names): - current_name = name - placeholder_batch_size = 0 - if args.progress: - print(f"Running model {i+1}/{nmodels}", flush=True) - - def write_csv(status): - if args.accuracy: - headers = ["dev", "name", "batch_size", "accuracy"] - rows = [ - [device, name, placeholder_batch_size, status] - for device in args.devices - ] - elif args.performance: - headers = ["dev", "name", "batch_size", "speedup", "abs_latency"] - rows = [ - [device, name, placeholder_batch_size, 0.0, 0.0] - for device in args.devices - ] - else: - headers = [] - rows = [ - [device, name, placeholder_batch_size, 0.0] - for device in args.devices - ] - - for row in rows: - output_csv(output_filename, headers, row) - - try: - timeout = args.timeout - if should_diff_branch(args): - timeout *= 2 - subprocess.check_call( - [sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout - ) - except subprocess.TimeoutExpired: - print("TIMEOUT", file=sys.stderr) - write_csv("timeout") - except subprocess.SubprocessError: - print("ERROR", file=sys.stderr) - write_csv("infra_error") - print_summary(output_filename, print_dataframe=args.print_dataframe_summary) - - -def log_operator_inputs(model, example_inputs, model_iter_fn, name, args): - mode = "training" if args.training else "eval" - output = os.path.join(os.path.dirname(args.output), f"{name}_{mode}.txt") - - # TODO - add option for coalescing inputs over multiple runs - if os.path.exists(output): - print(f"Skipping {name}, {output} already exists") - return - - print(f"Running {name}") - - operator_mode = OperatorInputsMode() - fake_tensor_mode = FakeTensorMode() - - with torch._subclasses.fake_tensor.FakeCopyMode(fake_tensor_mode): - model_fake = copy.deepcopy(model) - example_inputs_fake = copy.deepcopy(example_inputs) - try: - with fake_tensor_mode, operator_mode: - 
model_iter_fn(model_fake, example_inputs_fake, collect_outputs=False) - except Exception as e: - print(f"{name} failed to run with fake tensors, trying real. Exception: {e}") - operator_mode = OperatorInputsMode() - try: - with operator_mode: - model_iter_fn(model, example_inputs, collect_outputs=False) - except Exception as e2: - print(f"{name} failed to run with real. Exception: {e2}") - raise - - print(f"Writing output to {output}") - operator_mode.log_to_file(output) - - -if __name__ == "__main__": - raise RuntimeError( - f"You shouldn't run {sys.argv[0]} directly, instead try timm_model.py, torchbench.py or hugginface.py" - ) diff --git a/userbenchmark/dynamo/dynamobench/_dynamo/utils.py b/userbenchmark/dynamo/dynamobench/_dynamo/utils.py index 81527ae647..34bc8ae390 100644 --- a/userbenchmark/dynamo/dynamobench/_dynamo/utils.py +++ b/userbenchmark/dynamo/dynamobench/_dynamo/utils.py @@ -33,27 +33,29 @@ except ModuleNotFoundError: np = None -import torch._logging -import torch._numpy as tnp -from torch._guards import detect_fake_mode # noqa: F401 -from torch._logging import LazyString -from . import config - - -# NOTE: Make sure `NP_SUPPORTED_MODULES` and `NP_TO_TNP_MODULE` are in sync. -if np: - NP_SUPPORTED_MODULES = (np, np.fft, np.linalg, np.random) - - NP_TO_TNP_MODULE = { - np: tnp, - np.fft: tnp.fft, - np.linalg: tnp.linalg, - np.random: tnp.random, - } -else: - NP_SUPPORTED_MODULES = {} +try: + import torch._logging + import torch._numpy as tnp + from torch._guards import detect_fake_mode # noqa: F401n + from torch._logging import LazyString + from . import config + # NOTE: Make sure `NP_SUPPORTED_MODULES` and `NP_TO_TNP_MODULE` are in sync. + if np: + NP_SUPPORTED_MODULES = (np, np.fft, np.linalg, np.random) + + NP_TO_TNP_MODULE = { + np: tnp, + np.fft: tnp.fft, + np.linalg: tnp.linalg, + np.random: tnp.random, + } + else: + NP_SUPPORTED_MODULES = {} - NP_TO_TNP_MODULE = {} + NP_TO_TNP_MODULE = {} + from torch._subclasses.fake_tensor import FakeTensor, is_fake +except: + pass import importlib @@ -62,7 +64,7 @@ import torch.fx.experimental.symbolic_shapes from torch import fx from torch._dispatch.python import enable_python_dispatcher -from torch._subclasses.fake_tensor import FakeTensor, is_fake + from torch.nn.modules.lazy import LazyModuleMixin from torch.utils._pytree import tree_map diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py index cb41ff4af2..5858c45bf8 100644 --- a/userbenchmark/dynamo/dynamobench/common.py +++ b/userbenchmark/dynamo/dynamobench/common.py @@ -54,10 +54,13 @@ from scipy.stats import gmean, ttest_ind from torch._dynamo.profiler import fx_insert_profiling, Profiler from torch._dynamo.testing import dummy_fx_compile, format_speedup, same -from torch._dynamo.utils import clone_inputs, graph_break_reasons +try: + from torch._dynamo.utils import clone_inputs, graph_break_reasons + from torch._inductor.utils import aot_inductor_launcher, fresh_inductor_cache +except ImportError: + from _dynamo.utils import clone_inputs, graph_break_reasons from torch._functorch.aot_autograd import set_model_name from torch._inductor import config as inductor_config -from torch._inductor.utils import aot_inductor_launcher, fresh_inductor_cache from torch._subclasses.fake_tensor import FakeTensorMode from torch.utils import _pytree as pytree @@ -65,11 +68,6 @@ from tqdm.auto import tqdm, trange -try: - from .microbenchmarks.operator_inp_utils import OperatorInputsMode -except ImportError: - from 
microbenchmarks.operator_inp_utils import OperatorInputsMode - try: import torch_xla import torch_xla.core.xla_model as xm @@ -3177,10 +3175,10 @@ def process_entry(rank, runner, original_dir, args): )(runner, args, original_dir) -def main(runner, original_dir=None): +def main(runner, original_dir=None, args=None): if original_dir: os.chdir(original_dir) - args = parse_args() + args = parse_args() if not args else parse_args(args) if args.baseline: args.baseline = os.path.abspath(args.baseline) @@ -3789,6 +3787,10 @@ def log_operator_inputs(model, example_inputs, model_iter_fn, name, args): return print(f"Running {name}") + try: + from .microbenchmarks.operator_inp_utils import OperatorInputsMode + except ImportError: + from microbenchmarks.operator_inp_utils import OperatorInputsMode operator_mode = OperatorInputsMode() fake_tensor_mode = FakeTensorMode() diff --git a/userbenchmark/dynamo/dynamobench/requirements.txt b/userbenchmark/dynamo/dynamobench/requirements.txt new file mode 100644 index 0000000000..a95678ade3 --- /dev/null +++ b/userbenchmark/dynamo/dynamobench/requirements.txt @@ -0,0 +1,2 @@ +pandas +scipy \ No newline at end of file diff --git a/userbenchmark/dynamo/run.py b/userbenchmark/dynamo/run.py index d410f7403c..42e98f2d15 100644 --- a/userbenchmark/dynamo/run.py +++ b/userbenchmark/dynamo/run.py @@ -1,11 +1,19 @@ import logging import warnings -from .torchbench import setup_torchbench_cwd, TorchBenchmarkRunner +from torchbenchmark import add_path, REPO_PATH + +DYNAMOBENCH_PATH = REPO_PATH.joinpath("userbenchmark", "dynamo", "dynamobench") + try: - from .common import main + # OSS Import + with add_path(str(DYNAMOBENCH_PATH)): + from torchbench import setup_torchbench_cwd, TorchBenchmarkRunner + from common import main except ImportError: - from common import main + # Meta Internal Import + from caffe2.benchmarks.dynamo.torchbench import setup_torchbench_cwd, TorchBenchmarkRunner + from caffe2.benchmarks.dynamo.common import main from typing import List @@ -13,4 +21,4 @@ def run(args: List[str]): original_dir = setup_torchbench_cwd() logging.basicConfig(level=logging.WARNING) warnings.filterwarnings("ignore") - main(TorchBenchmarkRunner(), original_dir, args=args) + main(TorchBenchmarkRunner(), original_dir, args) diff --git a/userbenchmark/dynamo/torchbench.py b/userbenchmark/dynamo/torchbench.py deleted file mode 100644 index 1327040aed..0000000000 --- a/userbenchmark/dynamo/torchbench.py +++ /dev/null @@ -1,479 +0,0 @@ -#!/usr/bin/env python3 -import gc -import importlib -import logging -import os -import re -import sys -import warnings -from os.path import abspath, exists - -import torch - -from .common import BenchmarkRunner, main -from ._dynamo.testing import collect_results, reduce_to_scalar_loss -from ._dynamo.utils import clone_inputs - -# We are primarily interested in tf32 datatype -torch.backends.cuda.matmul.allow_tf32 = True - - -def setup_torchbench_cwd(): - original_dir = abspath(os.getcwd()) - - os.environ["KALDI_ROOT"] = "/tmp" # avoids some spam - for torchbench_dir in ( - "./torchbenchmark", - "../torchbenchmark", - "../torchbench", - "../benchmark", - "../../torchbenchmark", - "../../torchbench", - "../../benchmark", - "../../../torchbench", - "../../../benchmark", - ): - if exists(torchbench_dir): - break - - if exists(torchbench_dir): - torchbench_dir = abspath(torchbench_dir) - os.chdir(torchbench_dir) - sys.path.append(torchbench_dir) - - return original_dir - - -# Some models have large dataset that doesn't fit in memory. 
Lower the batch -# size to test the accuracy. -USE_SMALL_BATCH_SIZE = { - "demucs": 4, - "dlrm": 1024, - "densenet121": 4, - "hf_Reformer": 4, - "hf_T5_base": 4, - "timm_efficientdet": 1, - "llama_v2_7b_16h": 1, -} - -DETECTRON2_MODELS = { - "detectron2_fasterrcnn_r_101_c4", - "detectron2_fasterrcnn_r_101_dc5", - "detectron2_fasterrcnn_r_101_fpn", - "detectron2_fasterrcnn_r_50_c4", - "detectron2_fasterrcnn_r_50_dc5", - "detectron2_fasterrcnn_r_50_fpn", - "detectron2_maskrcnn_r_101_c4", - "detectron2_maskrcnn_r_101_fpn", - "detectron2_maskrcnn_r_50_fpn", -} - -SKIP = { - # https://github.com/pytorch/torchdynamo/issues/101 - "detectron2_maskrcnn", - # https://github.com/pytorch/torchdynamo/issues/145 - "fambench_xlmr", - # TIMEOUT, https://github.com/pytorch/pytorch/issues/98467 - "tacotron2", - "hf_Bert", # Error: RelaxedUnspecConstraint(L['input_ids'].size()[0]) - inferred constant (4) - "hf_Bert_large", # Error: RelaxedUnspecConstraint(L['input_ids'].size()[0]) - inferred constant (4) - # takes too long, extreme slowdown (< .001) - "maml", -} - -SKIP_FOR_CPU = { - "hf_T5_generate", # OOMs - "cm3leon_generate", # model is CUDA only - "nanogpt", # timeout - "sam", # timeout - "llama_v2_7b_16h", # model is CUDA only - "stable_diffusion", # flaky - "torchrec_dlrm", # requires FBGEMM, CUDA only -} - -SKIP_FOR_CUDA = { - "gat", # only works on CPU - "gcn", # only works on CPU - "sage", # only works on CPU -} - -# Additional models that are skipped in training -SKIP_TRAIN = { - # not designed for training - "pyhpc_equation_of_state", - "pyhpc_isoneutral_mixing", - "pyhpc_turbulent_kinetic_energy", - "maml", - "llama", - "llama_v2_7b_16h", -} -SKIP_TRAIN.update(DETECTRON2_MODELS) - -# These models support only train mode. So accuracy checking can't be done in -# eval mode. -ONLY_TRAINING_MODE = { - "tts_angular", - "tacotron2", - "demucs", - "hf_Reformer", - "pytorch_struct", - "yolov3", -} -ONLY_TRAINING_MODE.update(DETECTRON2_MODELS) - -# Need lower tolerance on GPU. GPU kernels have non deterministic kernels for these models. -REQUIRE_HIGHER_TOLERANCE = { - "alexnet", - "attention_is_all_you_need_pytorch", - "densenet121", - "hf_Albert", - "vgg16", - "mobilenet_v3_large", - "nvidia_deeprecommender", - "timm_efficientdet", -} - -# These models need >1e-3 tolerance -REQUIRE_EVEN_HIGHER_TOLERANCE = { - "soft_actor_critic", - "tacotron2", -} - -REQUIRE_HIGHER_FP16_TOLERANCE = { - "drq", -} - -REQUIRE_COSINE_TOLERACE = { - # Just keeping it here even though its empty, if we need this in future. 
-} - -# non-deterministic output / cant check correctness -NONDETERMINISTIC = { - # https://github.com/pytorch/pytorch/issues/98355 - "mobilenet_v3_large", -} - -# These benchmarks took >600s on an i9-11900K CPU -VERY_SLOW_BENCHMARKS = { - "hf_BigBird", # 3339s - "hf_Longformer", # 3062s - "hf_T5", # 930s -} - -# These benchmarks took >60s on an i9-11900K CPU -SLOW_BENCHMARKS = { - *VERY_SLOW_BENCHMARKS, - "BERT_pytorch", # 137s - "demucs", # 116s - "fastNLP_Bert", # 242s - "hf_Albert", # 221s - "hf_Bart", # 400s - "hf_Bert", # 334s - "hf_DistilBert", # 187s - "hf_GPT2", # 470s - "hf_Reformer", # 141s - "speech_transformer", # 317s - "vision_maskrcnn", # 99s -} - -TRT_NOT_YET_WORKING = { - "alexnet", - "resnet18", - "resnet50", - "mobilenet_v2", - "mnasnet1_0", - "squeezenet1_1", - "shufflenetv2_x1_0", - "vgg16", - "resnext50_32x4d", -} - -DONT_CHANGE_BATCH_SIZE = { - "demucs", - "pytorch_struct", - "pyhpc_turbulent_kinetic_energy", - "vision_maskrcnn", # https://github.com/pytorch/benchmark/pull/1656 -} - - -SKIP_ACCURACY_CHECK_MODELS = { - # Models too large to have eager, dynamo and fp64_numbers simultaneosuly - # even for 40 GB machine. We have tested accuracy for smaller version of - # these models - "hf_GPT2_large", - "hf_T5_large", - "timm_vision_transformer_large", - "maml", # accuracy https://github.com/pytorch/pytorch/issues/93847 - "llama_v2_7b_16h", - "Background_Matting", -} - -SKIP_ACCURACY_CHECK_AS_EAGER_NON_DETERMINISTIC_MODELS = { - # Models that deterministic algorithms can not be turned on for eager mode. - "Background_Matting", -} - - -MAX_BATCH_SIZE_FOR_ACCURACY_CHECK = { - "hf_GPT2": 2, - "pytorch_unet": 2, -} - -FORCE_AMP_FOR_FP16_BF16_MODELS = { - "DALLE2_pytorch", - "doctr_det_predictor", - "doctr_reco_predictor", - "Super_SloMo", - "tts_angular", -} - -# models in canary_models that we should run anyway -CANARY_MODELS = { - "torchrec_dlrm", -} - - -class TorchBenchmarkRunner(BenchmarkRunner): - def __init__(self): - super().__init__() - self.suite_name = "torchbench" - self.optimizer = None - - @property - def skip_models(self): - return SKIP - - @property - def skip_models_for_cpu(self): - return SKIP_FOR_CPU - - @property - def skip_models_for_cuda(self): - return SKIP_FOR_CUDA - - @property - def slow_models(self): - return SLOW_BENCHMARKS - - @property - def very_slow_models(self): - return VERY_SLOW_BENCHMARKS - - @property - def non_deterministic_models(self): - return NONDETERMINISTIC - - @property - def skip_not_suitable_for_training_models(self): - return SKIP_TRAIN - - @property - def failing_fx2trt_models(self): - return TRT_NOT_YET_WORKING - - @property - def force_amp_for_fp16_bf16_models(self): - return FORCE_AMP_FOR_FP16_BF16_MODELS - - @property - def skip_accuracy_checks_large_models_dashboard(self): - if self.args.dashboard or self.args.accuracy: - return SKIP_ACCURACY_CHECK_MODELS - return set() - - @property - def skip_accuracy_check_as_eager_non_deterministic(self): - if self.args.accuracy and self.args.training: - return SKIP_ACCURACY_CHECK_AS_EAGER_NON_DETERMINISTIC_MODELS - return set() - - def load_model( - self, - device, - model_name, - batch_size=None, - part=None, - ): - if self.args.enable_activation_checkpointing: - raise NotImplementedError( - "Activation checkpointing not implemented for Torchbench models" - ) - is_training = self.args.training - use_eval_mode = self.args.use_eval_mode - dynamic_shapes = self.args.dynamic_shapes - candidates = [ - f"torchbenchmark.models.{model_name}", - 
f"torchbenchmark.canary_models.{model_name}", - f"torchbenchmark.models.fb.{model_name}", - ] - for c in candidates: - try: - module = importlib.import_module(c) - break - except ModuleNotFoundError as e: - if e.name != c: - raise - else: - raise ImportError(f"could not import any of {candidates}") - benchmark_cls = getattr(module, "Model", None) - if not hasattr(benchmark_cls, "name"): - benchmark_cls.name = model_name - - cant_change_batch_size = ( - not getattr(benchmark_cls, "ALLOW_CUSTOMIZE_BSIZE", True) - or model_name in DONT_CHANGE_BATCH_SIZE - ) - if cant_change_batch_size: - batch_size = None - if batch_size is None and is_training and model_name in USE_SMALL_BATCH_SIZE: - batch_size = USE_SMALL_BATCH_SIZE[model_name] - - # Control the memory footprint for few models - if self.args.accuracy and model_name in MAX_BATCH_SIZE_FOR_ACCURACY_CHECK: - batch_size = min(batch_size, MAX_BATCH_SIZE_FOR_ACCURACY_CHECK[model_name]) - - # workaround "RuntimeError: not allowed to set torch.backends.cudnn flags" - torch.backends.__allow_nonbracketed_mutation_flag = True - extra_args = [] - if part: - extra_args = ["--part", part] - - if model_name == "vision_maskrcnn" and is_training: - # Output of vision_maskrcnn model is a list of bounding boxes, - # sorted on the basis of their scores. This makes accuracy - # comparison hard with torch.compile. torch.compile can cause minor - # divergences in the output because of how fusion works for amp in - # TorchInductor compared to eager. Therefore, instead of looking at - # all the bounding boxes, we compare only top 5. - model_kwargs = {"box_detections_per_img": 5} - benchmark = benchmark_cls( - test="train", - device=device, - batch_size=batch_size, - extra_args=extra_args, - model_kwargs=model_kwargs, - ) - elif is_training: - benchmark = benchmark_cls( - test="train", - device=device, - batch_size=batch_size, - extra_args=extra_args, - ) - else: - benchmark = benchmark_cls( - test="eval", - device=device, - batch_size=batch_size, - extra_args=extra_args, - ) - model, example_inputs = benchmark.get_module() - - # Models that must be in train mode while training - if is_training and (not use_eval_mode or model_name in ONLY_TRAINING_MODE): - model.train() - else: - model.eval() - gc.collect() - batch_size = benchmark.batch_size - - # Torchbench has quite different setup for yolov3, so directly passing - # the right example_inputs - if model_name == "yolov3": - example_inputs = (torch.rand(batch_size, 3, 384, 512).to(device),) - # See https://github.com/pytorch/benchmark/issues/1561 - if model_name == "maml_omniglot": - batch_size = 5 - assert example_inputs[0].shape[0] == batch_size - if model_name == "vision_maskrcnn": - batch_size = 1 - # global current_name, current_device - # current_device = device - # current_name = benchmark.name - - if self.args.trace_on_xla: - # work around for: https://github.com/pytorch/xla/issues/4174 - import torch_xla # noqa: F401 - self.validate_model(model, example_inputs) - return device, benchmark.name, model, example_inputs, batch_size - - def iter_model_names(self, args): - from torchbenchmark import _list_canary_model_paths, _list_model_paths - - models = _list_model_paths() - models += [ - f - for f in _list_canary_model_paths() - if os.path.basename(f) in CANARY_MODELS - ] - models.sort() - - start, end = self.get_benchmark_indices(len(models)) - for index, model_path in enumerate(models): - if index < start or index >= end: - continue - - model_name = os.path.basename(model_path) - if ( - not 
re.search("|".join(args.filter), model_name, re.I) - or re.search("|".join(args.exclude), model_name, re.I) - or model_name in args.exclude_exact - or model_name in self.skip_models - ): - continue - - yield model_name - - def pick_grad(self, name, is_training): - if is_training or name in ("maml",): - return torch.enable_grad() - else: - return torch.no_grad() - - def get_tolerance_and_cosine_flag(self, is_training, current_device, name): - tolerance = 1e-4 - cosine = self.args.cosine - # Increase the tolerance for torch allclose - if self.args.float16 or self.args.amp: - if name in REQUIRE_HIGHER_FP16_TOLERANCE: - return 1e-2, cosine - return 1e-3, cosine - if is_training and current_device == "cuda": - tolerance = 1e-3 - if name in REQUIRE_COSINE_TOLERACE: - cosine = True - elif name in REQUIRE_HIGHER_TOLERANCE: - tolerance = 1e-3 - elif name in REQUIRE_EVEN_HIGHER_TOLERANCE: - tolerance = 8 * 1e-2 - return tolerance, cosine - - def compute_loss(self, pred): - return reduce_to_scalar_loss(pred) - - def forward_pass(self, mod, inputs, collect_outputs=True): - with self.autocast(): - return mod(*inputs) - - def forward_and_backward_pass(self, mod, inputs, collect_outputs=True): - cloned_inputs = clone_inputs(inputs) - self.optimizer_zero_grad(mod) - with self.autocast(): - pred = mod(*cloned_inputs) - loss = self.compute_loss(pred) - self.grad_scaler.scale(loss).backward() - self.optimizer_step() - if collect_outputs: - return collect_results(mod, pred, loss, cloned_inputs) - return None - - -def torchbench_main(): - original_dir = setup_torchbench_cwd() - logging.basicConfig(level=logging.WARNING) - warnings.filterwarnings("ignore") - main(TorchBenchmarkRunner(), original_dir) - - -if __name__ == "__main__": - torchbench_main() diff --git a/userbenchmark/dynamo/torchbench_models_list.txt b/userbenchmark/dynamo/torchbench_models_list.txt deleted file mode 100644 index 04947c4a6a..0000000000 --- a/userbenchmark/dynamo/torchbench_models_list.txt +++ /dev/null @@ -1,28 +0,0 @@ -BERT_pytorch,128 -Background_Matting, 16 -LearningToPaint,1024 -alexnet,1024 -dcgan,1024 -densenet121,64 -hf_Albert,32 -hf_Bart,16 -hf_Bert,16 -hf_GPT2,16 -hf_T5,4 -mnasnet1_0,256 -mobilenet_v2,128 -mobilenet_v3_large,256 -nvidia_deeprecommender,1024 -pytorch_unet,8 -resnet18,512 -resnet50,128 -resnext50_32x4d,128 -shufflenet_v2_x1_0,512 -squeezenet1_1,512 -timm_nfnet,256 -timm_efficientnet,128 -timm_regnet,128 -timm_resnest,256 -timm_vision_transformer,256 -timm_vovnet,128 -vgg16,128 diff --git a/userbenchmark/dynamo/torchbench_models_list_cpu.txt b/userbenchmark/dynamo/torchbench_models_list_cpu.txt deleted file mode 100644 index ab485702b8..0000000000 --- a/userbenchmark/dynamo/torchbench_models_list_cpu.txt +++ /dev/null @@ -1,48 +0,0 @@ -alexnet,128 -attention_is_all_you_need_pytorch,64 -BERT_pytorch,32 -dcgan,256 -densenet121,512 -dlrm,2048 -fastNLP_Bert,8 -functorch_dp_cifar10,1024 -hf_Albert,8 -hf_Bart,8 -hf_Bert,8 -hf_Bert_large,8 -hf_DistilBert,8 -hf_GPT2,8 -hf_GPT2_large,1 -hf_Longformer,4 -hf_Reformer,8 -hf_T5,4 -hf_T5_base,1 -hf_T5_large,1 -LearningToPaint,96 -lennard_jones,1024 -mnasnet1_0,32 -mobilenet_v2,16 -mobilenet_v3_large,32 -nvidia_deeprecommender,256 -phlippe_densenet,128 -phlippe_resnet,512 -pytorch_unet,4 -resnet152,32 -resnet18,256 -resnet50,256 -resnext50_32x4d,256 -shufflenet_v2_x1_0,64 -speech_transformer,1024 -squeezenet1_1,16 -Super_SloMo,1024 -timm_efficientnet,64 -timm_nfnet,128 -timm_regnet,32 -timm_resnest,32 -timm_vision_transformer,16 -timm_vision_transformer_large,8 
-timm_vovnet,32 -tts_angular,1024 -vgg16,64 -vision_maskrcnn,1 -yolov3,32
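
---

For context on the OSS/internal split above (the try/except guards added in `dynamobench/_dynamo/utils.py`, the `torch._dynamo.utils` → vendored `_dynamo.utils` fallback in `dynamobench/common.py`, and the "OSS Import" / "Meta Internal Import" branches in `run.py`): they all use the same import-fallback idiom — try the OSS module path first, optionally with a temporary `sys.path` entry in the spirit of `torchbenchmark.add_path`, and fall back to the alternate module path on `ImportError`. Below is a minimal, self-contained sketch of that idiom; the local `add_path` helper and the module names `common` / `internal_pkg.common` are illustrative placeholders, not code taken from this patch.

```python
import importlib
import sys
from contextlib import contextmanager


@contextmanager
def add_path(path):
    """Temporarily prepend `path` to sys.path.

    Illustrative stand-in for torchbenchmark.add_path; it lets OSS-only
    modules resolve without permanently polluting the import path.
    """
    sys.path.insert(0, path)
    try:
        yield
    finally:
        try:
            sys.path.remove(path)
        except ValueError:
            pass


def import_common(dynamobench_path):
    """Prefer the OSS layout; fall back to an internal module path.

    `internal_pkg.common` is a placeholder for whatever an internal build
    would expose, not a real module name from this patch.
    """
    try:
        with add_path(dynamobench_path):
            return importlib.import_module("common")
    except ImportError:
        return importlib.import_module("internal_pkg.common")


# Example usage (path is hypothetical):
# common = import_common("/path/to/userbenchmark/dynamo/dynamobench")
```

The design choice is to keep a single code path for both environments and let the import machinery pick the right backend at runtime, rather than gating on an environment flag.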