From 9df72944d7e38ccd24e5efc962b15c8fc2daa68d Mon Sep 17 00:00:00 2001
From: Jason Ansel
Date: Wed, 4 May 2022 14:42:31 -0700
Subject: [PATCH] Add `./torchbench.py --fast` option

---
 Makefile      | 12 +++++-------
 README.md     |  2 +-
 torchbench.py | 40 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/Makefile b/Makefile
index 09a7a182fe..be6c48c19d 100644
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,7 @@ test: develop
 	pytest tests
 
 torchbench: develop
-	python torchbench.py
+	python torchbench.py --fast
 
 overhead: develop
 	python torchbench.py --overhead
@@ -129,12 +129,10 @@ baseline-gpu: develop
 
 baseline-gpu-inductor: develop
 	rm -f baseline_*.csv
-	python torchbench.py -dcuda --float32 --isolate -n50 --inductor
-	python torchbench.py -dcuda --float32 --isolate -n50 --backend=cudagraphs && mv speedup_cudagraphs.csv baseline_cudagraphs.csv
-	python torchbench.py -dcuda --float32 --isolate -n50 --backend=cudagraphs_ts --nvfuser && mv speedup_cudagraphs_ts.csv baseline_cg_nvfuser.csv
-	python torchbench.py -dcuda --float32 --isolate -n50 --backend=cudagraphs_ts && mv speedup_cudagraphs_ts.csv baseline_cg_nnc.csv
-	# python torchbench.py -dcuda --float32 --isolate -n50 --speedup-ts --nvfuser && mv baseline_ts.csv baseline_ts_nvfuser.csv
-	# python torchbench.py -dcuda --float32 --isolate -n50 --speedup-ts && mv baseline_ts.csv baseline_ts_nnc.csv
+	python torchbench.py --cosine -dcuda --float32 --isolate -n50 --inductor
+	python torchbench.py --cosine -dcuda --float32 --isolate -n50 --backend=cudagraphs && mv speedup_cudagraphs.csv baseline_cudagraphs.csv
+	python torchbench.py --cosine -dcuda --float32 --isolate -n50 --backend=cudagraphs_ts --nvfuser && mv speedup_cudagraphs_ts.csv baseline_cg_nvfuser.csv
+	python torchbench.py --cosine -dcuda --float32 --isolate -n50 --backend=cudagraphs_ts && mv speedup_cudagraphs_ts.csv baseline_cg_nnc.csv
 	paste -d, inductor.csv baseline_cudagraphs.csv baseline_cg_nvfuser.csv baseline_cg_nnc.csv > baseline_all.csv
diff --git a/README.md b/README.md
index 3678ba2da9..58088a0653 100644
--- a/README.md
+++ b/README.md
@@ -376,7 +376,7 @@ cd ../torchdynamo
 make lint-deps
 
 # make sure it works
-./torchbench.py
+./torchbench.py --fast
 ```
 
 ## Tests
diff --git a/torchbench.py b/torchbench.py
index c0722179e9..4a4b071d7a 100755
--- a/torchbench.py
+++ b/torchbench.py
@@ -105,6 +105,32 @@
     "timm_efficientdet": 1,
 }
 
+# These benchmarks took >600s on an i9-11900K CPU
+VERY_SLOW_BENCHMARKS = {
+    "hf_BigBird",  # 3339s
+    "hf_Longformer",  # 3062s
+    "hf_T5",  # 930s
+}
+
+# These benchmarks took >60s on an i9-11900K CPU
+SLOW_BENCHMARKS = {
+    *{
+        "BERT_pytorch",  # 137s
+        "demucs",  # 116s
+        "fastNLP_Bert",  # 242s
+        "hf_Albert",  # 221s
+        "hf_Bart",  # 400s
+        "hf_Bert",  # 334s
+        "hf_DistilBert",  # 187s
+        "hf_GPT2",  # 470s
+        "hf_Reformer",  # 141s
+        "speech_transformer",  # 317s
+        "vision_maskrcnn",  # 99s
+    },
+    *VERY_SLOW_BENCHMARKS,
+}
+
+
 current_name = ""
 current_device = ""
 output_filename = None
@@ -646,6 +672,9 @@ def main():
     parser.add_argument("--float16", action="store_true", help="cast model to fp16")
     parser.add_argument("--float32", action="store_true", help="cast model to fp32")
     parser.add_argument("--cosine", action="store_true", help="use cosine similarity")
+    parser.add_argument(
+        "--fast", "-f", action="store_true", help="skip slow benchmarks"
+    )
     parser.add_argument("--only", help="used by --isolate to run just one model")
     parser.add_argument(
         "--minimum-call-count", type=int, help="filter out graphs with too few ops"
@@ -808,9 +837,6 @@ def main():
         }
     )
 
-    if args.no_skip:
-        SKIP.clear()
-
     if args.nvfuser:
         torch._C._jit_override_can_fuse_on_cpu(False)
         torch._C._jit_override_can_fuse_on_gpu(False)
@@ -835,6 +861,12 @@ def main():
     else:
         model_iter_fn = forward_pass
 
+    if args.fast:
+        SKIP.update(SLOW_BENCHMARKS)
+
+    if args.devices == ["cpu"]:
+        SKIP.update(VERY_SLOW_BENCHMARKS)
+
     if args.no_skip:
         SKIP.clear()
 
@@ -1173,7 +1205,7 @@ def run_one_model(
 
     if output_filename and "coverage" in output_filename:
         results.append(
-            f"{ok:3}/{total:3} +{frames_third_pass} frames {time.perf_counter()-t0:.0f}s"
+            f"{ok:3}/{total:3} +{frames_third_pass} frames {time.perf_counter()-t0:3.0f}s"
         )
 
     results.append(experiment(model, example_inputs))