diff --git a/requirements.txt b/requirements.txt index 7bc348361c..7689900959 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ pytest -pytest-benchmark \ No newline at end of file +pytest-benchmark +requests +tabulate diff --git a/score/README.md b/score/README.md new file mode 100644 index 0000000000..c8defe157c --- /dev/null +++ b/score/README.md @@ -0,0 +1,36 @@ +# Torchbench Score + +Torchbench provides a normalized benchmark score similar to 'spec' or other computing benchmarks. + +This is a prototype. Current status and limitations are described below. + +## Score versioning +The score is versioned, meaning only a certain set of benchmarks are captured in a particular +version of the score (even if additional benchmarks are added to the suite). The relative weight +of each benchmark in the overall score is frozen, along with a normalization factor measured on +a particular 'gold' machine with a particular PyTorch release. The intent is to measure the effect +of new pytorch versions on the same workloads using the same reference machine using a consistent +benchmark configuration. + +## Computing the score +To compute the current score, provide a score config and benchmark data produced by pytest with `--benchmark-json` or related arguments. +`python compute_score.py --configuration <config.yaml> --benchmark_data_file <benchmark_data.json>` +Or, use `--benchmark_data_dir` instead, pointing to a directory containing multiple json files to compute a table of scores. + +## New score versions +Periodically, as more workloads have been added to the torchbenchmark suite, or as changes to +relative weights or categories have been proposed, a new score configuration should be generated +rather than modifying an existing score definition. 
+ +See `python generate_score_config.py -h` + +## Issues and Next Steps +For accurate score comparisons, measurements should be computed on the same machine +(or at least same machine spec) as the data used to produce normalization constants +in the score configuration. + +- compute_score.py should assert the machine type matches by default +- currently, a circleCI 'medium' gpu worker was used for the normalization data +- soon, a particular CPU/GPU config should be deliberately selected along with a + list of models/categories to be frozen for first long-living rev of the score + diff --git a/score/compute_score.py b/score/compute_score.py new file mode 100644 index 0000000000..ea0c33f15e --- /dev/null +++ b/score/compute_score.py @@ -0,0 +1,76 @@ + +""" +Compute the benchmark score given a frozen score configuration and current benchmark data. +""" +import argparse +import json +import math +import os +import yaml + +from tabulate import tabulate + +def compute_score(config, data, fake_data=None): + target = config['target'] + score = 0.0 + weight_sum = 0.0 + for name in config['benchmarks']: + cfg = config['benchmarks'][name] + weight, norm = cfg['weight'], cfg['norm'] + weight_sum += weight + measured_mean = [b['stats']['mean'] for b in data['benchmarks'] if b['name'] == name] + assert len(measured_mean) == 1, f"Missing data for {name}, unable to compute score" + measured_mean = measured_mean[0] + if fake_data is not None and name in fake_data: + # used for sanity checks on the sensitivity of the score metric + measured_mean = fake_data[name] + benchmark_score = weight * math.log(norm / measured_mean) + # print(f"{name}: {benchmark_score}") + score += benchmark_score + + score = target * math.exp(score) + assert abs(weight_sum - 1.0) < 1e-6, f"Bad configuration, weights don't sum to 1, but {weight_sum}" + return score + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--configuration", required=True, + 
help="frozen benchmark configuration generated by generate_score_config.py") + parser.add_argument("--benchmark_data_file", + help="pytest-benchmark json file with current benchmark data") + parser.add_argument("--benchmark_data_dir", + help="directory containing multiple .json files for each of which to compute a score") + parser.add_argument('--hack_data', nargs=2, action='append', help="keyword to match benchmark names, and multiplicative factor to adjust their measurement") + args = parser.parse_args() + with open(args.configuration) as cfg_file: + config = yaml.full_load(cfg_file) + + if args.benchmark_data_file is not None: + with open(args.benchmark_data_file) as data_file: + data = json.load(data_file) + + score = compute_score(config, data) + print(score) + if args.hack_data: + fake_data = {} + for keyword, factor in args.hack_data: + for b in data['benchmarks']: + if keyword.lower() in b['name'].lower(): + fake_data[b['name']] = b['stats']['mean'] * float(factor) + + hacked_score = compute_score(config, data, fake_data) + print(f"Using hacks {args.hack_data}, hacked_score {hacked_score}") + + elif args.benchmark_data_dir is not None: + scores = [('File', 'Score')] + for f in os.listdir(args.benchmark_data_dir): + path = os.path.join(args.benchmark_data_dir, f) + if os.path.isfile(path) and os.path.splitext(path)[1] == '.json': + with open(path) as data_file: + data = json.load(data_file) + score = compute_score(config, data) + scores.append((f, score)) + + print(tabulate(scores, headers='firstrow')) + + # print(f"Benchmark Score: {score} (rounded) {int(round(score))}") \ No newline at end of file diff --git a/score/generate_score_config.py b/score/generate_score_config.py new file mode 100644 index 0000000000..539e816940 --- /dev/null +++ b/score/generate_score_config.py @@ -0,0 +1,119 @@ +""" +Generate a fully specified benchmark configuration file, given a lightweight +specification and a complete source of benchmark data. 
+ +Specification File +------------------ +Score hierarchy input intended to be as easy to construct as possible, +relying on automatic inference of unspecified weights, benchmark configs, +and normalization factors given a particular instance of benchmark data. + +Structure: + Root _ + - category | required: + - domain | 3 layers of organizational structure + - task _| + + - benchmark name - keyword match for root name in benchmark, + omit children unless used + _ + - train/eval | optional: + - device | provide specific weights or + - compiler/runtime _| exclude particular configs by omission + +Rules for describing the weight hierarchy +- everything is a dict, since at any level you could specify a weight +- if a weight is not specified, it is computed automatically with respect +its direct siblings. +- if specific benchmark configurations are omitted under a benchmark name, +all configurations present in the normalization data json are weighted equally + +Normalization Data +------------------ +Used to 'fill in the gaps' in the human written specification. + +- particular configurations (train/eval, device, compiler/runtime) present in +this data are used to compute benchmark weights +- measurements from this data are used as normalization factors in score computation + such that new data is scored relative to this data. 
+ +#### +TODO +#### + - handle multiple normalization files, one for models, one for synthetic, etc + - make explicit configuration choice for throughput vs runtime metrics + - assert same machine used for all normalization files and freeze that in +""" +import argparse +import json +import yaml + +def generate_bench_cfg(spec, norm, target): + cfg = { + 'target': target, + 'benchmarks': {}, + } + benchmark_names = [b['name'] for b in norm['benchmarks']] + benchmark_norms = {b['name']: b['stats']['mean'] for b in norm['benchmarks']} + + assert len(spec['hierarchy']) > 0, "Must specify at least one category" + category_weight = 1.0 / len(spec['hierarchy']) + for category in spec['hierarchy']: + + category_spec = spec['hierarchy'][category] + assert isinstance(category_spec, dict), f"Category {category} in spec must be non-empty" + assert 'weight' not in category_spec, "TODO implement manual category weights" + domain_weight = 1.0 / len(category_spec) + + for domain in category_spec: + + tasks = category_spec[domain] + assert isinstance(tasks, dict), f"Domain {category}:{domain} in spec must be non-empty" + assert 'weight' not in tasks, "TODO implement manual domain weights" + task_weight = 1.0 / len(tasks) + + for task in tasks: + + benchmarks = tasks[task] + assert isinstance(benchmarks, dict), f"Task {category}:{domain}:{task} in spec must be non-empty" + assert 'weight' not in benchmarks, "TODO implement manual task weights" + benchmark_weight = 1.0 / len(benchmarks) + + for benchmark in benchmarks: + + assert benchmarks[benchmark] is None, "TODO handle benchmark as dict of config specs" + # assert 'weight' not in benchmarks[benchmark], "TODO implement manual benchmark weights" + found_benchmarks = [name for name in benchmark_names if benchmark in name] + assert len(found_benchmarks) > 0, f"No normalization data found for {benchmark}" + config_weight = 1.0 / len(found_benchmarks) + for b in found_benchmarks: + weight = domain_weight * task_weight * 
benchmark_weight * config_weight + cfg['benchmarks'][b] = { + 'weight': weight, + 'norm': benchmark_norms[b], + } + return cfg + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--specification", required=True, + help="yaml file describing weight hierarchy") + parser.add_argument("--normalization_data", required=True, + help="pytest-benchmark json file used for generating normalization " + "values and filling in unspecified benchmark configurations") + parser.add_argument("--output_file", required=True, + help="generated complete benchmark configuration") + parser.add_argument("--target_score", default=1000, + help="target score value given these normalizations and specifications") + args = parser.parse_args() + + with open(args.specification) as spec_file: + spec = yaml.full_load(spec_file) + + with open(args.normalization_data) as norm_file: + norm = json.load(norm_file) + + with open(args.output_file, 'w') as out_file: + bench_cfg = generate_bench_cfg(spec, norm, args.target_score) + yaml.dump(bench_cfg, out_file) + diff --git a/score/score.yml b/score/score.yml new file mode 100644 index 0000000000..b82d07c78c --- /dev/null +++ b/score/score.yml @@ -0,0 +1,38 @@ +--- +hierarchy: + model: + computer vision: + # segmentation: + # maskrcnn_benchmark: + classification: + pytorch_mobilenet_v3: + detection: + yolov3: + # generation: + # pytorch_CycleGAN_and_pix...: + # pytorch_stargan: + other computer vision: + Background_Matting: + Super_SloMo: + natural language processing: + translation: + attention_is_all_you_nee...: + language_modeling: + BERT_pytorch: + other nlp: + fastNLP: + speech: + synthesis: + tacotron2: + recommendation: + recommendation: + dlrm: + reinforcement learning: + other rl: + LearningToPaint: + other: + other tasks: + moco: + demucs: + pytorch_struct: +... 
diff --git a/score/torchbench_0.0.yaml b/score/torchbench_0.0.yaml new file mode 100644 index 0000000000..d972734e60 --- /dev/null +++ b/score/torchbench_0.0.yaml @@ -0,0 +1,227 @@ +benchmarks: + test_eval[BERT_pytorch-cpu-eager]: + norm: 0.056674222277769734 + weight: 0.006944444444444444 + test_eval[BERT_pytorch-cpu-jit]: + norm: 0.04845556728573034 + weight: 0.006944444444444444 + test_eval[BERT_pytorch-cuda-eager]: + norm: 0.026858860157876495 + weight: 0.006944444444444444 + test_eval[BERT_pytorch-cuda-jit]: + norm: 0.017104738694909322 + weight: 0.006944444444444444 + test_eval[LearningToPaint-cpu-eager]: + norm: 0.015391482121220757 + weight: 0.020833333333333332 + test_eval[LearningToPaint-cpu-jit]: + norm: 0.01744512900000531 + weight: 0.020833333333333332 + test_eval[LearningToPaint-cuda-eager]: + norm: 0.004052860649187367 + weight: 0.020833333333333332 + test_eval[LearningToPaint-cuda-jit]: + norm: 0.004101277514284299 + weight: 0.020833333333333332 + test_eval[Super_SloMo-cuda-eager]: + norm: 0.5727992373999768 + weight: 0.006944444444444444 + test_eval[Super_SloMo-cuda-jit]: + norm: 0.5758905388000131 + weight: 0.006944444444444444 + test_eval[attention_is_all_you_nee...-cpu-eager]: + norm: 0.8630710880000152 + weight: 0.006944444444444444 + test_eval[attention_is_all_you_nee...-cpu-jit]: + norm: 0.8489351521999652 + weight: 0.006944444444444444 + test_eval[attention_is_all_you_nee...-cuda-eager]: + norm: 0.18483205416672868 + weight: 0.006944444444444444 + test_eval[attention_is_all_you_nee...-cuda-jit]: + norm: 0.18512023083333892 + weight: 0.006944444444444444 + test_eval[demucs-cpu-eager]: + norm: 0.15394044100006404 + weight: 0.006944444444444444 + test_eval[demucs-cpu-jit]: + norm: 0.154964403714335 + weight: 0.006944444444444444 + test_eval[demucs-cuda-eager]: + norm: 0.08700494508337897 + weight: 0.006944444444444444 + test_eval[demucs-cuda-jit]: + norm: 0.08798973516667274 + weight: 0.006944444444444444 + test_eval[dlrm-cpu-eager]: + norm: 
0.0007821738814477574 + weight: 0.041666666666666664 + test_eval[dlrm-cuda-eager]: + norm: 0.0011976210188676443 + weight: 0.041666666666666664 + test_eval[fastNLP-cpu-eager]: + norm: 0.0003339627349426991 + weight: 0.006944444444444444 + test_eval[fastNLP-cpu-jit]: + norm: 0.0002815807873683167 + weight: 0.006944444444444444 + test_eval[fastNLP-cuda-eager]: + norm: 0.00026201308868427635 + weight: 0.006944444444444444 + test_eval[fastNLP-cuda-jit]: + norm: 0.00021672122712418238 + weight: 0.006944444444444444 + test_eval[moco-cuda-eager]: + norm: 0.6445621068000037 + weight: 0.013888888888888888 + test_eval[moco-cuda-jit]: + norm: 0.6464073749999443 + weight: 0.013888888888888888 + test_eval[pytorch_mobilenet_v3-cpu-eager]: + norm: 0.02729150021618684 + weight: 0.006944444444444444 + test_eval[pytorch_mobilenet_v3-cpu-jit]: + norm: 0.027794305918903136 + weight: 0.006944444444444444 + test_eval[pytorch_mobilenet_v3-cuda-eager]: + norm: 0.02729599686483345 + weight: 0.006944444444444444 + test_eval[pytorch_mobilenet_v3-cuda-jit]: + norm: 0.02273801193330453 + weight: 0.006944444444444444 + test_eval[pytorch_struct-cpu-eager]: + norm: 0.002562631041459985 + weight: 0.006944444444444444 + test_eval[pytorch_struct-cpu-jit]: + norm: 0.002147531797012927 + weight: 0.006944444444444444 + test_eval[pytorch_struct-cuda-eager]: + norm: 0.0016883496258385442 + weight: 0.006944444444444444 + test_eval[pytorch_struct-cuda-jit]: + norm: 0.0012041589738027185 + weight: 0.006944444444444444 + test_eval[tacotron2-cuda-eager]: + norm: 1.0758157056000528 + weight: 0.08333333333333333 + test_eval[yolov3-cpu-eager]: + norm: 0.8865785995999431 + weight: 0.018518518518518517 + test_eval[yolov3-cuda-eager]: + norm: 0.5735116551998545 + weight: 0.018518518518518517 + test_train[BERT_pytorch-cpu-eager]: + norm: 0.27472745519994535 + weight: 0.006944444444444444 + test_train[BERT_pytorch-cpu-jit]: + norm: 0.3208459026000128 + weight: 0.006944444444444444 + 
test_train[BERT_pytorch-cuda-eager]: + norm: 0.13274032924994117 + weight: 0.006944444444444444 + test_train[BERT_pytorch-cuda-jit]: + norm: 0.12152077777778282 + weight: 0.006944444444444444 + test_train[Background_Matting-cuda-eager]: + norm: 4.454985670599944 + weight: 0.013888888888888888 + test_train[Background_Matting-cuda-jit]: + norm: 4.3299996296000245 + weight: 0.013888888888888888 + test_train[LearningToPaint-cpu-eager]: + norm: 0.06769545181248304 + weight: 0.020833333333333332 + test_train[LearningToPaint-cpu-jit]: + norm: 0.09986237930002062 + weight: 0.020833333333333332 + test_train[LearningToPaint-cuda-eager]: + norm: 0.021776436187498877 + weight: 0.020833333333333332 + test_train[LearningToPaint-cuda-jit]: + norm: 0.02460534921213807 + weight: 0.020833333333333332 + test_train[Super_SloMo-cuda-eager]: + norm: 1.4887921610000376 + weight: 0.006944444444444444 + test_train[Super_SloMo-cuda-jit]: + norm: 1.4842954989999726 + weight: 0.006944444444444444 + test_train[attention_is_all_you_nee...-cpu-eager]: + norm: 2.8790815572000157 + weight: 0.006944444444444444 + test_train[attention_is_all_you_nee...-cpu-jit]: + norm: 2.8627824179999153 + weight: 0.006944444444444444 + test_train[attention_is_all_you_nee...-cuda-eager]: + norm: 0.5916350972000146 + weight: 0.006944444444444444 + test_train[attention_is_all_you_nee...-cuda-jit]: + norm: 0.5918281878000471 + weight: 0.006944444444444444 + test_train[demucs-cpu-eager]: + norm: 0.7204151263999847 + weight: 0.006944444444444444 + test_train[demucs-cpu-jit]: + norm: 1.567230096000003 + weight: 0.006944444444444444 + test_train[demucs-cuda-eager]: + norm: 0.21315780479999374 + weight: 0.006944444444444444 + test_train[demucs-cuda-jit]: + norm: 0.21378219499997614 + weight: 0.006944444444444444 + test_train[dlrm-cpu-eager]: + norm: 0.0024918104727723656 + weight: 0.041666666666666664 + test_train[dlrm-cuda-eager]: + norm: 0.0038975701444924336 + weight: 0.041666666666666664 + 
test_train[fastNLP-cpu-eager]: + norm: 0.004783386184209382 + weight: 0.006944444444444444 + test_train[fastNLP-cpu-jit]: + norm: 0.004753424066670743 + weight: 0.006944444444444444 + test_train[fastNLP-cuda-eager]: + norm: 0.0025605078503221097 + weight: 0.006944444444444444 + test_train[fastNLP-cuda-jit]: + norm: 0.0025617683341551596 + weight: 0.006944444444444444 + test_train[moco-cuda-eager]: + norm: 1.3086710984000092 + weight: 0.013888888888888888 + test_train[moco-cuda-jit]: + norm: 1.3050591925999924 + weight: 0.013888888888888888 + test_train[pytorch_mobilenet_v3-cpu-eager]: + norm: 0.25344906619993707 + weight: 0.006944444444444444 + test_train[pytorch_mobilenet_v3-cpu-jit]: + norm: 0.23886577479997867 + weight: 0.006944444444444444 + test_train[pytorch_mobilenet_v3-cuda-eager]: + norm: 0.2574923820000549 + weight: 0.006944444444444444 + test_train[pytorch_mobilenet_v3-cuda-jit]: + norm: 0.24167844399989918 + weight: 0.006944444444444444 + test_train[pytorch_struct-cpu-eager]: + norm: 1.2756745926000803 + weight: 0.006944444444444444 + test_train[pytorch_struct-cpu-jit]: + norm: 1.3099148541999057 + weight: 0.006944444444444444 + test_train[pytorch_struct-cuda-eager]: + norm: 0.4209450229998765 + weight: 0.006944444444444444 + test_train[pytorch_struct-cuda-jit]: + norm: 0.4204848049998873 + weight: 0.006944444444444444 + test_train[tacotron2-cuda-eager]: + norm: 4.155933508199996 + weight: 0.08333333333333333 + test_train[yolov3-cuda-eager]: + norm: 6.38542746819985 + weight: 0.018518518518518517 +target: 1000 diff --git a/scripts/install_nightlies.sh b/scripts/install_nightlies.sh index a8f6d22942..cdeff59218 100755 --- a/scripts/install_nightlies.sh +++ b/scripts/install_nightlies.sh @@ -4,5 +4,3 @@ set -e . 
~/miniconda3/etc/profile.d/conda.sh conda activate base conda install -y pytorch torchtext torchvision -c pytorch-nightly -pip install -q pytest pytest-benchmark requests - diff --git a/scripts/run_bench_and_upload.sh b/scripts/run_bench_and_upload.sh index 9611d72b8e..7f224554aa 100755 --- a/scripts/run_bench_and_upload.sh +++ b/scripts/run_bench_and_upload.sh @@ -24,12 +24,14 @@ BENCHMARK_ABS_FILENAME=${BENCHMARK_DATA}/${BENCHMARK_FILENAME} pytest test_bench.py --setup-show --benchmark-sort=Name --benchmark-json=${BENCHMARK_ABS_FILENAME} -k "$PYTEST_FILTER" +# Compute benchmark score +TORCHBENCH_SCORE=$(python score/compute_score.py --configuration score/torchbench_0.0.yaml --benchmark_data_file ${BENCHMARK_ABS_FILENAME}) # Token is only present for certain jobs, only upload if present if [ -z "$SCRIBE_GRAPHQL_ACCESS_TOKEN" ] then echo "Skipping benchmark upload, token is missing." else - python scripts/upload_scribe.py --pytest_bench_json ${BENCHMARK_ABS_FILENAME} + python scripts/upload_scribe.py --pytest_bench_json ${BENCHMARK_ABS_FILENAME} --torchbench_score ${TORCHBENCH_SCORE} fi diff --git a/scripts/upload_scribe.py b/scripts/upload_scribe.py index 8a2d8361fe..ea69c32b22 100644 --- a/scripts/upload_scribe.py +++ b/scripts/upload_scribe.py @@ -85,7 +85,7 @@ def __init__(self): 'circle_build_num', 'circle_project_reponame', ], 'float': [ - 'stddev', 'min', 'median', 'max', 'mean', 'runtime' + 'stddev', 'min', 'median', 'max', 'mean', 'runtime', 'torchbench_score', ] } @@ -135,13 +135,44 @@ def post_pytest_benchmarks(self, pytest_json, max_data_upload=100): self.upload(messages) + def post_torchbench_score(self, pytest_json, score): + machine_info = pytest_json['machine_info'] + commit_info = pytest_json['commit_info'] + upload_time = int(time.time()) + m = self.format_message({ + "time": upload_time, + "benchmark_time": pytest_json['datetime'], + "git_repo": commit_info['project'], + "git_commit_id": commit_info['id'], + "git_branch": commit_info['branch'], + 
"git_commit_time": commit_info['time'], + "git_dirty": commit_info['dirty'], + "pytorch_version": machine_info.get('pytorch_version', None), + "torchtext_version": machine_info.get('torchtext_version', None), + "torchvision_version": machine_info.get('torchvision_version', None), + "python_version": machine_info['python_implementation_version'], + "machine_kernel": machine_info['release'], + "machine_processor": machine_info['processor'], + "machine_hostname": machine_info['node'], + "circle_build_num": machine_info.get('circle_build_num', None), + "circle_project_reponame": machine_info.get('circle_project_name', None), + "torchbench_score": score, + }) + self.upload(m) + if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--pytest_bench_json", type=argparse.FileType('r'), + parser.add_argument("--pytest_bench_json", required=True, + type=argparse.FileType('r'), help='Upload json data formatted by pytest-benchmark module') + parser.add_argument("--torchbench_score", type=float, + help="optional torchbench score to include") args = parser.parse_args() - if args.pytest_bench_json: - benchmark_uploader = PytorchBenchmarkUploader() - json_data = json.load(args.pytest_bench_json) - benchmark_uploader.post_pytest_benchmarks(json_data) + + benchmark_uploader = PytorchBenchmarkUploader() + json_data = json.load(args.pytest_bench_json) + benchmark_uploader.post_pytest_benchmarks(json_data) + + if args.torchbench_score is not None: + benchmark_uploader.post_torchbench_score(json_data, args.torchbench_score)