Merged
4 changes: 3 additions & 1 deletion requirements.txt
@@ -1,2 +1,4 @@
pytest
pytest-benchmark
pytest-benchmark
requests
tabulate
36 changes: 36 additions & 0 deletions score/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Torchbench Score

Torchbench provides a normalized benchmark score, similar to SPEC and other computing benchmarks.

This is a prototype. Current status and limitations are described below.

## Score versioning
The score is versioned, meaning only a fixed set of benchmarks is captured in a particular
version of the score (even if additional benchmarks are added to the suite). The relative weight
of each benchmark in the overall score is frozen, along with a normalization factor measured on
a particular 'gold' machine with a particular PyTorch release. The intent is to measure the effect
of new PyTorch versions on the same workloads, on the same reference machine, using a consistent
benchmark configuration.
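The frozen weights and norms combine into the score as a weighted geometric mean of per-benchmark speedups, scaled to a target value (this is the formula used by `compute_score.py` in this change); a minimal sketch with invented numbers:

```python
import math

# Sketch of the scoring formula: a weighted geometric mean of speedups
# relative to the frozen norms, scaled so the reference machine/release
# lands exactly on the target value. All numbers here are invented.
target = 1000.0
benchmarks = {
    # name: (weight, frozen norm mean, newly measured mean)
    "bench_a": (0.5, 2.0, 1.0),  # ran 2x faster than the frozen norm
    "bench_b": (0.5, 1.0, 2.0),  # ran 2x slower than the frozen norm
}
log_sum = sum(w * math.log(norm / measured)
              for w, norm, measured in benchmarks.values())
score = target * math.exp(log_sum)
print(score)  # 1000.0 -- the speedup and slowdown cancel exactly
```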

## Computing the score
To compute the current score, provide a score configuration and benchmark data produced by pytest with `--benchmark-json` or related arguments:

`python compute_score.py --configuration <cfg> --benchmark_data_file <data-file>`

Alternatively, pass `--benchmark_data_dir` pointing to a directory containing multiple json files to compute a table of scores.

## New score versions
Periodically, as more workloads are added to the torchbenchmark suite or as changes to
relative weights or categories are proposed, a new score configuration should be generated
rather than modifying an existing score definition.

See `python generate_score_config.py -h`

## Issues and Next Steps
For accurate score comparisons, measurements should be computed on the same machine (or at least the same machine spec) as the data used to produce the normalization constants in the score configuration.
Reviewer:

This sounds fine for now. But in our past experience, this requirement is hard to maintain at large scale over a long period of time. As we get more benchmarks, we will likely use more machines of the same type, and we might have to migrate to a different pool from time to time.

So a good approach is to offset the scores so they stay the same during a transition. We don't have to do it now, but it might be worth mentioning in the future directions.

Contributor Author:

Yeah, I agree that we can't really limit this to a single machine. I have been thinking about defining a machine spec (CPU, GPU type, memory cap, potentially kernel version, etc.) and making sure that part matches even though the hostname is different.

Contributor Author:

Oh, and my original intent was that a score configuration is bound to a machine type, but every quarter we might take the latest PyTorch release, more benchmark models, and possibly a different machine type, and use that to compute a new normalization file with the target set to match what the latest target was on the old score.

It would also be possible to support multiple machine types at the same time, just by maintaining a different set of norms for each machine type.
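Hypothetically, those per-machine-type norms might be organized like this (all names and values invented):

```python
# Hypothetical shape: one set of normalization constants per machine type,
# so scores stay comparable within a type without pinning a single hostname.
norms_by_machine = {
    "circleci-medium-gpu": {"bench_a": 2.0, "bench_b": 1.5},
    "devgpu-v100":         {"bench_a": 0.8, "bench_b": 0.6},
}

machine_type = "devgpu-v100"          # detected or configured at score time
norms = norms_by_machine[machine_type]
```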


- compute_score.py should assert by default that the machine type matches
- currently, a CircleCI 'medium' GPU worker was used for the normalization data
- soon, a particular CPU/GPU configuration should be deliberately selected, along with a
list of models/categories, to be frozen for the first long-lived revision of the score
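The machine-type assertion in the first bullet is not implemented yet; a minimal sketch, assuming a spec of hardware fields frozen into the score configuration (all field names and values hypothetical):

```python
import platform

def machine_spec_matches(frozen_spec: dict, current_spec: dict) -> bool:
    # Every field frozen into the configuration must match the current host;
    # fields the configuration does not mention are ignored.
    return all(current_spec.get(key) == value
               for key, value in frozen_spec.items())

frozen = {"cpu_arch": "x86_64", "gpu": "Tesla M60", "mem_gb": 61}
current = {"cpu_arch": platform.machine(), "gpu": "Tesla M60", "mem_gb": 61}
# compute_score.py could assert machine_spec_matches(frozen, current) by default.
```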

76 changes: 76 additions & 0 deletions score/compute_score.py
@@ -0,0 +1,76 @@

"""
Compute the benchmark score given a frozen score configuration and current benchmark data.
"""
import argparse
import json
import math
import os
import yaml

from tabulate import tabulate

def compute_score(config, data, fake_data=None):
target = config['target']
score = 0.0
weight_sum = 0.0
for name in config['benchmarks']:
cfg = config['benchmarks'][name]
weight, norm = cfg['weight'], cfg['norm']
Reviewer:

It might also be good to check that weights are non-negative to avoid numerical instability in the previous calculations.

Contributor Author:

Yes, I was thinking of adding a numerical stability check on all of [weight, norm, measurement]. I can catch weight > 0 at the same time. What do you think is a sensible epsilon value? I think 1e-6 is fine for weight. For measure and norm, the expected range could be anywhere from 1e3 to 1e9, I think, but we probably want to at least warn, if not assert, on norm / weight > 100 or something.
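A sketch of the checks discussed in this thread, using the epsilon values floated above (they are proposals, not settled):

```python
import warnings

def validate_inputs(name, weight, norm, measured,
                    weight_eps=1e-6, value_eps=1e-9):
    # Hard-fail on non-positive (or vanishingly small) weights, since they
    # would silently distort the log-domain sum; only warn on implausibly
    # small norms or measurements, per the discussion above.
    assert weight > weight_eps, f"{name}: weight {weight} must exceed {weight_eps}"
    if norm < value_eps or measured < value_eps:
        warnings.warn(f"{name}: suspiciously small norm={norm} or mean={measured}")
```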

weight_sum += weight
measured_mean = [b['stats']['mean'] for b in data['benchmarks'] if b['name'] == name]
assert len(measured_mean) == 1, f"Expected exactly one measurement for {name} (found {len(measured_mean)}), unable to compute score"
measured_mean = measured_mean[0]
if fake_data is not None and name in fake_data:
# used for sanity checks on the sensitivity of the score metric
measured_mean = fake_data[name]
benchmark_score = weight * math.log(norm / measured_mean)
Reviewer:

It would be good to check that neither "norm" nor "measured_mean" is too small. If either is, clamp it to a minimum valid number.
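The clamp suggested here could look like the following (the minimum value is an assumption, not something settled in this PR):

```python
def clamp_to_min(value, min_valid=1e-9):
    # Keep norm and measured_mean away from zero so log(norm / measured_mean)
    # stays finite even for degenerate measurements.
    return max(value, min_valid)

# benchmark_score = weight * math.log(clamp_to_min(norm) / clamp_to_min(measured_mean))
```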

# print(f"{name}: {benchmark_score}")
score += benchmark_score

score = target * math.exp(score)
assert abs(weight_sum - 1.0) < 1e-6, f"Bad configuration, weights don't sum to 1, but {weight_sum}"
Reviewer:

Instead of asserting, will it be easier just to normalize?

Contributor Author:

Well, the intent here was just error checking: in the current implementation the score generator should produce weights that sum to 1, so I wanted to make sure something wasn't dropped, rather than just recompute a norm.

I think I'll leave it like this for now, as we may make more radical changes to the way things are computed following the rest of the feedback.
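For reference, the renormalization alternative the reviewer suggests (not adopted in this change) would look like:

```python
def renormalize(weights):
    # Scale the weights so they sum to exactly 1, instead of asserting.
    total = sum(weights.values())
    assert total > 0, "weights must have a positive sum"
    return {name: w / total for name, w in weights.items()}
```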

return score

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
Reviewer:

should we be able to specify device?

Contributor Author:

What do you mean? As in, CPU vs GPU? Or as in, 'I ran this on an M40 and some CPU'? What would be the point of it: to do some per-device normalization, or just to assert that you used the same device as the normalization data?

Reviewer:

I am fine either way on whether different devices have their own normalization data or not. But it is important for us to track them on different devices consistently. One change could be neutral on Volta, but cause regressions on Ampere.

parser.add_argument("--configuration", required=True,
help="frozen benchmark configuration generated by generate_score_config.py")
parser.add_argument("--benchmark_data_file",
help="pytest-benchmark json file with current benchmark data")
parser.add_argument("--benchmark_data_dir",
help="directory containing multiple .json files for each of which to compute a score")
parser.add_argument('--hack_data', nargs=2, action='append', help="keyword to match benchmark names, and multiplicative factor to adjust their measurement")
args = parser.parse_args()
with open(args.configuration) as cfg_file:
config = yaml.full_load(cfg_file)

if args.benchmark_data_file is not None:
with open(args.benchmark_data_file) as data_file:
data = json.load(data_file)

score = compute_score(config, data)
print(score)
if args.hack_data:
fake_data = {}
for keyword, factor in args.hack_data:
for b in data['benchmarks']:
if keyword.lower() in b['name'].lower():
fake_data[b['name']] = b['stats']['mean'] * float(factor)

hacked_score = compute_score(config, data, fake_data)
print(f"Using hacks {args.hack_data}, hacked_score {hacked_score}")

elif args.benchmark_data_dir is not None:
scores = [('File', 'Score')]
for f in os.listdir(args.benchmark_data_dir):
path = os.path.join(args.benchmark_data_dir, f)
if os.path.isfile(path) and os.path.splitext(path)[1] == '.json':
with open(path) as data_file:
data = json.load(data_file)
score = compute_score(config, data)
scores.append((f, score))

print(tabulate(scores, headers='firstrow'))

# print(f"Benchmark Score: {score} (rounded) {int(round(score))}")
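A self-contained toy run of the scoring math above; the config and data dicts mirror the shapes of the YAML configuration and the pytest-benchmark json, with invented numbers:

```python
import math

def toy_compute_score(config, data):
    # Trimmed mirror of compute_score above (no fake-data hook, no checks).
    means = {b["name"]: b["stats"]["mean"] for b in data["benchmarks"]}
    log_sum = sum(cfg["weight"] * math.log(cfg["norm"] / means[name])
                  for name, cfg in config["benchmarks"].items())
    return config["target"] * math.exp(log_sum)

config = {"target": 1000.0,
          "benchmarks": {"bench_a": {"weight": 0.5, "norm": 4.0},
                         "bench_b": {"weight": 0.5, "norm": 1.0}}}
data = {"benchmarks": [{"name": "bench_a", "stats": {"mean": 1.0}},
                       {"name": "bench_b", "stats": {"mean": 1.0}}]}
print(toy_compute_score(config, data))  # ~2000.0: sqrt of the 4x speedup, times the target
```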
119 changes: 119 additions & 0 deletions score/generate_score_config.py
@@ -0,0 +1,119 @@
"""
Generate a fully specified benchmark configuration file, given a lightweight
specification and a complete source of benchmark data.

Specification File
------------------
Score hierarchy input intended to be as easy to construct as possible,
relying on automatic inference of unspecified weights, benchmark configs,
and normalization factors given a particular instance of benchmark data.

Structure:
Root _
- category | required:
- domain | 3 layers of organizational structure
- task _|

- benchmark name - keyword match for root name in benchmark,
omit children unless used
_
- train/eval | optional:
- device | provide specific weights or
- compiler/runtime _| exclude particular configs by omission

Rules for describing the weight hierarchy
- everything is a dict, since at any level you could specify a weight
- if a weight is not specified, it is computed automatically with respect
to its direct siblings.
- if specific benchmark configurations are omitted under a benchmark name,
all configurations present in the normalization data json are weighted equally

Normalization Data
------------------
Used to 'fill in the gaps' in the human written specification.

- particular configurations (train/eval, device, compiler/runtime) present in
this data are used to compute benchmark weights
- measurements from this data are used as normalization factors in score computation
such that new data is scored relative to this data.

####
TODO
####
- handle multiple normalization files, one for models, one for synthetic, etc
- make explicit configuration choice for throughput vs runtime metrics
- assert same machine used for all normalization files and freeze that in
"""
import argparse
import json
import yaml

def generate_bench_cfg(spec, norm, target):
cfg = {
'target': target,
'benchmarks': {},
}
benchmark_names = [b['name'] for b in norm['benchmarks']]
benchmark_norms = {b['name']: b['stats']['mean'] for b in norm['benchmarks']}

assert len(spec['hierarchy']) > 0, "Must specify at least one category"
category_weight = 1.0 / len(spec['hierarchy'])
for category in spec['hierarchy']:

category_spec = spec['hierarchy'][category]
assert isinstance(category_spec, dict), f"Category {category} in spec must be non-empty"
assert 'weight' not in category_spec, "TODO implement manual category weights"
domain_weight = 1.0 / len(category_spec)

for domain in category_spec:

tasks = category_spec[domain]
assert isinstance(tasks, dict), f"Domain {category}:{domain} in spec must be non-empty"
assert 'weight' not in tasks, "TODO implement manual domain weights"
task_weight = 1.0 / len(tasks)

for task in tasks:

benchmarks = tasks[task]
assert isinstance(benchmarks, dict), f"Task {category}:{domain}:{task} in spec must be non-empty"
assert 'weight' not in benchmarks, "TODO implement manual task weights"
benchmark_weight = 1.0 / len(benchmarks)

for benchmark in benchmarks:

assert benchmarks[benchmark] is None, "TODO handle benchmark as dict of config specs"
# assert 'weight' not in benchmarks[benchmark], "TODO implement manual benchmark weights"
found_benchmarks = [name for name in benchmark_names if benchmark in name]
assert len(found_benchmarks) > 0, f"No normalization data found for {benchmark}"
config_weight = 1.0 / len(found_benchmarks)
for b in found_benchmarks:
weight = category_weight * domain_weight * task_weight * benchmark_weight * config_weight
cfg['benchmarks'][b] = {
'weight': weight,
'norm': benchmark_norms[b],
}
return cfg

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--specification", required=True,
help="yaml file describing weight hierarchy")
parser.add_argument("--normalization_data", required=True,
help="pytest-benchmark json file used for generating normalization "
"values and filling in unspecified benchmark configurations")
parser.add_argument("--output_file", required=True,
help="generated complete benchmark configuration")
parser.add_argument("--target_score", type=float, default=1000,
help="target score value given these normalizations and specifications")
args = parser.parse_args()

with open(args.specification) as spec_file:
spec = yaml.full_load(spec_file)

with open(args.normalization_data) as norm_file:
norm = json.load(norm_file)

with open(args.output_file, 'w') as out_file:
bench_cfg = generate_bench_cfg(spec, norm, args.target_score)
yaml.dump(bench_cfg, out_file)
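A self-contained toy walk-through of the equal-split weight inference described in the docstring above, using a single category and invented benchmark names so the arithmetic is easy to follow:

```python
# Each level splits its parent's weight equally among its children,
# mirroring the intended inference scheme of generate_bench_cfg.
spec = {"model": {"computer vision": {"classification": {"bench_a": None},
                                      "detection": {"bench_b": None}}}}
weights = {}
category_w = 1.0 / len(spec)
for domains in spec.values():
    domain_w = category_w / len(domains)
    for tasks in domains.values():
        task_w = domain_w / len(tasks)
        for benchmarks in tasks.values():
            bench_w = task_w / len(benchmarks)
            for name in benchmarks:
                weights[name] = bench_w
print(weights)  # {'bench_a': 0.5, 'bench_b': 0.5}
```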

38 changes: 38 additions & 0 deletions score/score.yml
@@ -0,0 +1,38 @@
---
Contributor:

This kind of information should probably just be part of the Model object to allow for filtering when doing experiments. The hierarchy doesn't feel compelling to me because already there are a few shoehorns: 'recommendation, recommendation'. 'other, other task'. Perhaps just add a keywords field to the Model objects?

Contributor Author:

Yeah, I thought about doing it that way. I decided to put the information in a score specification instead, since I envisioned keeping different versions of the score around at the same time and didn't want different versions of scores sprinkled amongst the models. Still, I think we can move parts of the information, such as the domain/task keywords, into the model descriptions.

hierarchy:
model:
computer vision:
# segmentation:
# maskrcnn_benchmark:
classification:
pytorch_mobilenet_v3:
detection:
yolov3:
# generation:
# pytorch_CycleGAN_and_pix...:
# pytorch_stargan:
other computer vision:
Background_Matting:
Super_SloMo:
natural language processing:
translation:
attention_is_all_you_nee...:
language_modeling:
BERT_pytorch:
other nlp:
fastNLP:
speech:
synthesis:
tacotron2:
recommendation:
recommendation:
dlrm:
reinforcement learning:
other rl:
LearningToPaint:
other:
other tasks:
moco:
demucs:
pytorch_struct:
...