From 6574ef81f1c64b4cfa26a6eea07231093f6a49b9 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Thu, 24 Sep 2020 14:48:00 -0400 Subject: [PATCH 01/13] Add generate_score_config.py and compute_score.py - Add baseline score config (score.yml) and check in output torchbench_0.0.yaml --- score/README.md | 24 +++ score/compute_score.py | 41 +++++ score/generate_score_config.py | 121 +++++++++++++++ score/score.yml | 38 +++++ score/torchbench_0.0.yaml | 266 +++++++++++++++++++++++++++++++++ 5 files changed, 490 insertions(+) create mode 100644 score/README.md create mode 100644 score/compute_score.py create mode 100644 score/generate_score_config.py create mode 100644 score/score.yml create mode 100644 score/torchbench_0.0.yaml diff --git a/score/README.md b/score/README.md new file mode 100644 index 0000000000..cff74249b6 --- /dev/null +++ b/score/README.md @@ -0,0 +1,24 @@ +# Torchbench Score + +Torchbench provides a normalized benchmark score similar to 'spec' or other computing benchmarks. + +This is a prototype. Current status and limitations are described below. + +## Score versioning +The score is versioned, meaning only a certain set of benchmarks are captured in a particular +version of the score (even if additional benchmarks are added to the suite). The relative weight +of each benchmark in the overall score is frozen, along with a normalization factor measured on +a particular 'gold' machine with a particular PyTorch release. The intent is to measure the effect +of new pytorch versions on the same workloads using the same reference machine using a consistent +benchmark configuration. + +## Computing the score +To compute the current score, provide a score config and benchmark data produced by pytest with `--benchmark-json` or related arguments. 
+`python compute_score.py --configuration --benchmark_data ` + +## New score versions +Periodically, as more workloads have been added to the torchbenchmark suite, or as changes to +relative weights or categories have been proposed, a new score configuration should be generated +rather than modifying an existing score definition. + +See `python generate_score_config.py -h` \ No newline at end of file diff --git a/score/compute_score.py b/score/compute_score.py new file mode 100644 index 0000000000..e0d8b48520 --- /dev/null +++ b/score/compute_score.py @@ -0,0 +1,41 @@ + +""" +Compute the benchmark score given a frozen score configuration and current benchmark data. +""" +import argparse +import json +import yaml + +def compute_score(config, data): + target = config['target'] + score = 1.0 + weight_sum = 0.0 + for name in config['benchmarks']: + cfg = config['benchmarks'][name] + weight, norm = cfg['weight'], cfg['norm'] + weight_sum += weight + measured_mean = [b['stats']['mean'] for b in data['benchmarks'] if b['name'] == name][0] + benchmark_score = (norm / measured_mean) ** weight + print(f"{name}: {benchmark_score}") + score *= benchmark_score + + score = score ** (1.0 / len(config['benchmarks'])) + assert abs(weight_sum - 1.0) < 1e-6, f"Bad configuration, weights don't sum to 1, but {weight_sum}" + return score * target + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--configuration", required=True, + help="frozen benchmark configuration generated by generate_score_config.py") + parser.add_argument("--benchmark_data", required=True, + help="pytest-benchmark json file with current benchmark data") + args = parser.parse_args() + + with open(args.configuration) as cfg_file: + config = yaml.full_load(cfg_file) + + with open(args.benchmark_data) as data_file: + data = json.load(data_file) + + score = compute_score(config, data) + print(f"Benchmark Score: {score} (rounded) {int(round(score))}") \ No 
newline at end of file diff --git a/score/generate_score_config.py b/score/generate_score_config.py new file mode 100644 index 0000000000..567de9a7d9 --- /dev/null +++ b/score/generate_score_config.py @@ -0,0 +1,121 @@ +""" +Generate a fully specified benchmark configuration file, given a lightweight +specification and a complete source of benchmark data. + +Specification File +------------------ +Score heirarchy input intended to be as easy to construct as possible, +relying on automatic inference of unspecified weights, benchmark configs, +and normalization factors given a particular instance of benchmark data. + +Structure: + Root _ + - category | required: + - domain | 3 layers of organizational structure + - task _| + + - benchmark name - keyword match for root name in benchmark, + omit children unless used + _ + - train/eval | optional: + - device | provide specific weights or + - compiler/runtime _| exclude particular configs by omission + +Rules for describing the weight heirarchy +- everything is a dict, since at any level you could specify a weight +- if a weight is not specified, it is computed automatically with respect to +its direct siblings. +- if specific benchmark configurations are omitted under a benchmark name, +all configurations present in the normalization data json are weighted equally + +Normalization Data +------------------ +Used to 'fill in the gaps' in the human written specification. + +Benchmark configurations (train/eval, device, compiler/runtime) present in +this data are frozen into the configuration and weighted equally if +specific weights aren't provided in the specification. + +Normalization values are the benchmark measurements taken from this data, used +to produce a value of 1.0 for each benchmark before applying weights and combining. 
+ +#### +TODO +#### + - handle multiple normalization files, one for models, one for synthetic, etc + - make explicit configuration choice for throughput vs runtime metrics + - assert same machine used for all normalization files and freeze that in +""" +import argparse +import json +import yaml + +def generate_bench_cfg(spec, norm, target): + cfg = { + 'target': target, + 'benchmarks': {}, + } + benchmark_names = [b['name'] for b in norm['benchmarks']] + benchmark_norms = {b['name']: b['stats']['mean'] for b in norm['benchmarks']} + + assert len(spec['heirarchy']) > 0, "Must specify at least one category" + category_weight = 1.0 / len(spec['heirarchy']) + for category in spec['heirarchy']: + + category_spec = spec['heirarchy'][category] + assert isinstance(category_spec, dict), f"Category {category} in spec must be non-empty" + assert 'weight' not in category_spec, "TODO implement manual category weights" + domain_weight = 1.0 / len(category_spec) + + for domain in category_spec: + + tasks = category_spec[domain] + assert isinstance(tasks, dict), f"Domain {category}:{domain} in spec must be non-empty" + assert 'weight' not in tasks, "TODO implement manual domain weights" + task_weight = 1.0 / len(tasks) + + for task in tasks: + + benchmarks = tasks[task] + assert isinstance(benchmarks, dict), f"Task {category}:{domain}:{task} in spec must be non-empty" + assert 'weight' not in benchmarks, "TODO implement manual task weights" + benchmark_weight = 1.0 / len(benchmarks) + + for benchmark in benchmarks: + + assert benchmarks[benchmark] is None, "TODO handle benchmark as dict of config specs" + # assert 'weight' not in benchmarks[benchmark], "TODO implement manual benchmark weights" + found_benchmarks = [name for name in benchmark_names if benchmark in name] + assert len(found_benchmarks) > 0, f"No normalization data found for {benchmark}" + config_weight = 1.0 / len(found_benchmarks) + for b in found_benchmarks: + weight = domain_weight * task_weight * 
benchmark_weight * config_weight + cfg['benchmarks'][b] = { + 'weight': weight, + 'norm': benchmark_norms[b], + } + return cfg + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--specification", required=True, + help="yaml file describing weight heirarchy") + parser.add_argument("--normalization_data", required=True, + help="pytest-benchmark json file used for generating normalization " + "values and filling in unspecified benchmark configurations") + parser.add_argument("--output_file", required=True, + help="generated complete benchmark configuration") + parser.add_argument("--target_score", default=1000, + help="target score value given these normalizations and specifications") + args = parser.parse_args() + + with open(args.specification) as spec_file: + spec = yaml.full_load(spec_file) + + with open(args.normalization_data) as norm_file: + norm = json.load(norm_file) + + with open(args.output_file, 'w') as out_file: + bench_cfg = generate_bench_cfg(spec, norm, args.target_score) + yaml.dump(bench_cfg, out_file) + \ No newline at end of file diff --git a/score/score.yml b/score/score.yml new file mode 100644 index 0000000000..fb2994958b --- /dev/null +++ b/score/score.yml @@ -0,0 +1,38 @@ +--- +heirarchy: + model: + computer vision: + segmentation: + maskrcnn_benchmark: + classification: + pytorch_mobilenet_v3: + detection: + yolov3: + generation: + pytorch_CycleGAN_and_pix...: + pytorch_stargan: + other computer vision: + Background_Matting: + Super_SloMo: + natural language processing: + translation: + attention_is_all_you_nee...: + language_modeling: + BERT_pytorch: + other nlp: + fastNLP: + speech: + synthesis: + tacotron2: + recommendation: + recommendation: + dlrm: + reinforcement learning: + other rl: + LearningToPaint: + other: + other tasks: + moco: + demucs: + pytorch_struct: +... 
\ No newline at end of file diff --git a/score/torchbench_0.0.yaml b/score/torchbench_0.0.yaml new file mode 100644 index 0000000000..4d8ab8a3df --- /dev/null +++ b/score/torchbench_0.0.yaml @@ -0,0 +1,266 @@ +benchmarks: + test_eval[BERT_pytorch-cpu-eager]: + norm: 0.056674222277769734 + weight: 0.006944444444444444 + test_eval[BERT_pytorch-cpu-jit]: + norm: 0.04845556728573034 + weight: 0.006944444444444444 + test_eval[BERT_pytorch-cuda-eager]: + norm: 0.026858860157876495 + weight: 0.006944444444444444 + test_eval[BERT_pytorch-cuda-jit]: + norm: 0.017104738694909322 + weight: 0.006944444444444444 + test_eval[LearningToPaint-cpu-eager]: + norm: 0.015391482121220757 + weight: 0.020833333333333332 + test_eval[LearningToPaint-cpu-jit]: + norm: 0.01744512900000531 + weight: 0.020833333333333332 + test_eval[LearningToPaint-cuda-eager]: + norm: 0.004052860649187367 + weight: 0.020833333333333332 + test_eval[LearningToPaint-cuda-jit]: + norm: 0.004101277514284299 + weight: 0.020833333333333332 + test_eval[Super_SloMo-cuda-eager]: + norm: 0.5727992373999768 + weight: 0.004166666666666667 + test_eval[Super_SloMo-cuda-jit]: + norm: 0.5758905388000131 + weight: 0.004166666666666667 + test_eval[attention_is_all_you_nee...-cpu-eager]: + norm: 0.8630710880000152 + weight: 0.006944444444444444 + test_eval[attention_is_all_you_nee...-cpu-jit]: + norm: 0.8489351521999652 + weight: 0.006944444444444444 + test_eval[attention_is_all_you_nee...-cuda-eager]: + norm: 0.18483205416672868 + weight: 0.006944444444444444 + test_eval[attention_is_all_you_nee...-cuda-jit]: + norm: 0.18512023083333892 + weight: 0.006944444444444444 + test_eval[demucs-cpu-eager]: + norm: 0.15394044100006404 + weight: 0.006944444444444444 + test_eval[demucs-cpu-jit]: + norm: 0.154964403714335 + weight: 0.006944444444444444 + test_eval[demucs-cuda-eager]: + norm: 0.08700494508337897 + weight: 0.006944444444444444 + test_eval[demucs-cuda-jit]: + norm: 0.08798973516667274 + weight: 0.006944444444444444 + 
test_eval[dlrm-cpu-eager]: + norm: 0.0007821738814477574 + weight: 0.041666666666666664 + test_eval[dlrm-cuda-eager]: + norm: 0.0011976210188676443 + weight: 0.041666666666666664 + test_eval[fastNLP-cpu-eager]: + norm: 0.0003339627349426991 + weight: 0.006944444444444444 + test_eval[fastNLP-cpu-jit]: + norm: 0.0002815807873683167 + weight: 0.006944444444444444 + test_eval[fastNLP-cuda-eager]: + norm: 0.00026201308868427635 + weight: 0.006944444444444444 + test_eval[fastNLP-cuda-jit]: + norm: 0.00021672122712418238 + weight: 0.006944444444444444 + test_eval[maskrcnn_benchmark-cuda-eager]: + norm: 0.15884241357148962 + weight: 0.016666666666666666 + test_eval[moco-cuda-eager]: + norm: 0.6445621068000037 + weight: 0.013888888888888888 + test_eval[moco-cuda-jit]: + norm: 0.6464073749999443 + weight: 0.013888888888888888 + test_eval[pytorch_CycleGAN_and_pix...-cuda-eager]: + norm: 0.037252862678558686 + weight: 0.005555555555555555 + test_eval[pytorch_CycleGAN_and_pix...-cuda-jit]: + norm: 0.03734425745829147 + weight: 0.005555555555555555 + test_eval[pytorch_mobilenet_v3-cpu-eager]: + norm: 0.02729150021618684 + weight: 0.004166666666666667 + test_eval[pytorch_mobilenet_v3-cpu-jit]: + norm: 0.027794305918903136 + weight: 0.004166666666666667 + test_eval[pytorch_mobilenet_v3-cuda-eager]: + norm: 0.02729599686483345 + weight: 0.004166666666666667 + test_eval[pytorch_mobilenet_v3-cuda-jit]: + norm: 0.02273801193330453 + weight: 0.004166666666666667 + test_eval[pytorch_stargan-cpu-eager]: + norm: 1.030070522799997 + weight: 0.0020833333333333333 + test_eval[pytorch_stargan-cpu-jit]: + norm: 1.0323324262000824 + weight: 0.0020833333333333333 + test_eval[pytorch_stargan-cuda-eager]: + norm: 0.10837374479997379 + weight: 0.0020833333333333333 + test_eval[pytorch_stargan-cuda-jit]: + norm: 0.10870929039992916 + weight: 0.0020833333333333333 + test_eval[pytorch_struct-cpu-eager]: + norm: 0.002562631041459985 + weight: 0.006944444444444444 + test_eval[pytorch_struct-cpu-jit]: + 
norm: 0.002147531797012927 + weight: 0.006944444444444444 + test_eval[pytorch_struct-cuda-eager]: + norm: 0.0016883496258385442 + weight: 0.006944444444444444 + test_eval[pytorch_struct-cuda-jit]: + norm: 0.0012041589738027185 + weight: 0.006944444444444444 + test_eval[tacotron2-cuda-eager]: + norm: 1.0758157056000528 + weight: 0.08333333333333333 + test_eval[yolov3-cpu-eager]: + norm: 0.8865785995999431 + weight: 0.01111111111111111 + test_eval[yolov3-cuda-eager]: + norm: 0.5735116551998545 + weight: 0.01111111111111111 + test_train[BERT_pytorch-cpu-eager]: + norm: 0.27472745519994535 + weight: 0.006944444444444444 + test_train[BERT_pytorch-cpu-jit]: + norm: 0.3208459026000128 + weight: 0.006944444444444444 + test_train[BERT_pytorch-cuda-eager]: + norm: 0.13274032924994117 + weight: 0.006944444444444444 + test_train[BERT_pytorch-cuda-jit]: + norm: 0.12152077777778282 + weight: 0.006944444444444444 + test_train[Background_Matting-cuda-eager]: + norm: 4.454985670599944 + weight: 0.008333333333333333 + test_train[Background_Matting-cuda-jit]: + norm: 4.3299996296000245 + weight: 0.008333333333333333 + test_train[LearningToPaint-cpu-eager]: + norm: 0.06769545181248304 + weight: 0.020833333333333332 + test_train[LearningToPaint-cpu-jit]: + norm: 0.09986237930002062 + weight: 0.020833333333333332 + test_train[LearningToPaint-cuda-eager]: + norm: 0.021776436187498877 + weight: 0.020833333333333332 + test_train[LearningToPaint-cuda-jit]: + norm: 0.02460534921213807 + weight: 0.020833333333333332 + test_train[Super_SloMo-cuda-eager]: + norm: 1.4887921610000376 + weight: 0.004166666666666667 + test_train[Super_SloMo-cuda-jit]: + norm: 1.4842954989999726 + weight: 0.004166666666666667 + test_train[attention_is_all_you_nee...-cpu-eager]: + norm: 2.8790815572000157 + weight: 0.006944444444444444 + test_train[attention_is_all_you_nee...-cpu-jit]: + norm: 2.8627824179999153 + weight: 0.006944444444444444 + test_train[attention_is_all_you_nee...-cuda-eager]: + norm: 
0.5916350972000146 + weight: 0.006944444444444444 + test_train[attention_is_all_you_nee...-cuda-jit]: + norm: 0.5918281878000471 + weight: 0.006944444444444444 + test_train[demucs-cpu-eager]: + norm: 0.7204151263999847 + weight: 0.006944444444444444 + test_train[demucs-cpu-jit]: + norm: 1.567230096000003 + weight: 0.006944444444444444 + test_train[demucs-cuda-eager]: + norm: 0.21315780479999374 + weight: 0.006944444444444444 + test_train[demucs-cuda-jit]: + norm: 0.21378219499997614 + weight: 0.006944444444444444 + test_train[dlrm-cpu-eager]: + norm: 0.0024918104727723656 + weight: 0.041666666666666664 + test_train[dlrm-cuda-eager]: + norm: 0.0038975701444924336 + weight: 0.041666666666666664 + test_train[fastNLP-cpu-eager]: + norm: 0.004783386184209382 + weight: 0.006944444444444444 + test_train[fastNLP-cpu-jit]: + norm: 0.004753424066670743 + weight: 0.006944444444444444 + test_train[fastNLP-cuda-eager]: + norm: 0.0025605078503221097 + weight: 0.006944444444444444 + test_train[fastNLP-cuda-jit]: + norm: 0.0025617683341551596 + weight: 0.006944444444444444 + test_train[maskrcnn_benchmark-cuda-eager]: + norm: 0.39820833739995576 + weight: 0.016666666666666666 + test_train[moco-cuda-eager]: + norm: 1.3086710984000092 + weight: 0.013888888888888888 + test_train[moco-cuda-jit]: + norm: 1.3050591925999924 + weight: 0.013888888888888888 + test_train[pytorch_CycleGAN_and_pix...-cuda-eager]: + norm: 53.70666004940008 + weight: 0.005555555555555555 + test_train[pytorch_mobilenet_v3-cpu-eager]: + norm: 0.25344906619993707 + weight: 0.004166666666666667 + test_train[pytorch_mobilenet_v3-cpu-jit]: + norm: 0.23886577479997867 + weight: 0.004166666666666667 + test_train[pytorch_mobilenet_v3-cuda-eager]: + norm: 0.2574923820000549 + weight: 0.004166666666666667 + test_train[pytorch_mobilenet_v3-cuda-jit]: + norm: 0.24167844399989918 + weight: 0.004166666666666667 + test_train[pytorch_stargan-cpu-eager]: + norm: 4.203840443199988 + weight: 0.0020833333333333333 + 
test_train[pytorch_stargan-cpu-jit]: + norm: 4.67151848600015 + weight: 0.0020833333333333333 + test_train[pytorch_stargan-cuda-eager]: + norm: 0.9323661928000547 + weight: 0.0020833333333333333 + test_train[pytorch_stargan-cuda-jit]: + norm: 0.9228581516000304 + weight: 0.0020833333333333333 + test_train[pytorch_struct-cpu-eager]: + norm: 1.2756745926000803 + weight: 0.006944444444444444 + test_train[pytorch_struct-cpu-jit]: + norm: 1.3099148541999057 + weight: 0.006944444444444444 + test_train[pytorch_struct-cuda-eager]: + norm: 0.4209450229998765 + weight: 0.006944444444444444 + test_train[pytorch_struct-cuda-jit]: + norm: 0.4204848049998873 + weight: 0.006944444444444444 + test_train[tacotron2-cuda-eager]: + norm: 4.155933508199996 + weight: 0.08333333333333333 + test_train[yolov3-cuda-eager]: + norm: 6.38542746819985 + weight: 0.01111111111111111 +target: 1000 From 935f0141d92b9b7fdb0fb9a2016378522a63fb72 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Thu, 24 Sep 2020 20:09:38 -0400 Subject: [PATCH 02/13] Have CI upload benchmark score to scribe --- score/compute_score.py | 5 ++-- scripts/run_bench_and_upload.sh | 4 ++- scripts/upload_scribe.py | 43 ++++++++++++++++++++++++++++----- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/score/compute_score.py b/score/compute_score.py index e0d8b48520..9dfa847a28 100644 --- a/score/compute_score.py +++ b/score/compute_score.py @@ -16,7 +16,7 @@ def compute_score(config, data): weight_sum += weight measured_mean = [b['stats']['mean'] for b in data['benchmarks'] if b['name'] == name][0] benchmark_score = (norm / measured_mean) ** weight - print(f"{name}: {benchmark_score}") + # print(f"{name}: {benchmark_score}") score *= benchmark_score score = score ** (1.0 / len(config['benchmarks'])) @@ -38,4 +38,5 @@ def compute_score(config, data): data = json.load(data_file) score = compute_score(config, data) - print(f"Benchmark Score: {score} (rounded) {int(round(score))}") \ No newline at end of file + 
print(score) + # print(f"Benchmark Score: {score} (rounded) {int(round(score))}") \ No newline at end of file diff --git a/scripts/run_bench_and_upload.sh b/scripts/run_bench_and_upload.sh index 9611d72b8e..043be21b2f 100755 --- a/scripts/run_bench_and_upload.sh +++ b/scripts/run_bench_and_upload.sh @@ -24,12 +24,14 @@ BENCHMARK_ABS_FILENAME=${BENCHMARK_DATA}/${BENCHMARK_FILENAME} pytest test_bench.py --setup-show --benchmark-sort=Name --benchmark-json=${BENCHMARK_ABS_FILENAME} -k "$PYTEST_FILTER" +# Compute benchmark score +TORCHBENCH_SCORE=$(python score/compute_score.py --configuration score/torchbench_0.0.yaml --benchmark_data ${BENCHMARK_DATA}/hub.json) # Token is only present for certain jobs, only upload if present if [ -z "$SCRIBE_GRAPHQL_ACCESS_TOKEN" ] then echo "Skipping benchmark upload, token is missing." else - python scripts/upload_scribe.py --pytest_bench_json ${BENCHMARK_ABS_FILENAME} + python scripts/upload_scribe.py --pytest_bench_json ${BENCHMARK_ABS_FILENAME} --torchbench_score ${TORCHBENCH_SCORE} fi diff --git a/scripts/upload_scribe.py b/scripts/upload_scribe.py index 4192642271..c41d46dab5 100644 --- a/scripts/upload_scribe.py +++ b/scripts/upload_scribe.py @@ -85,7 +85,7 @@ def __init__(self): 'circle_build_num', 'circle_project_reponame', ], 'float': [ - 'stddev', 'min', 'median', 'max', 'mean', + 'stddev', 'min', 'median', 'max', 'mean', 'torchbench_score', ] } @@ -125,13 +125,44 @@ def post_pytest_benchmarks(self, pytest_json): messages.append(m) self.upload(messages) + def post_torchbench_score(self, pytest_json, score): + machine_info = pytest_json['machine_info'] + commit_info = pytest_json['commit_info'] + upload_time = int(time.time()) + m = self.format_message({ + "time": upload_time, + "benchmark_time": pytest_json['datetime'], + "git_repo": commit_info['project'], + "git_commit_id": commit_info['id'], + "git_branch": commit_info['branch'], + "git_commit_time": commit_info['time'], + "git_dirty": commit_info['dirty'], + 
"pytorch_version": machine_info.get('pytorch_version', None), + "torchtext_version": machine_info.get('torchtext_version', None), + "torchvision_version": machine_info.get('torchvision_version', None), + "python_version": machine_info['python_implementation_version'], + "machine_kernel": machine_info['release'], + "machine_processor": machine_info['processor'], + "machine_hostname": machine_info['node'], + "circle_build_num": machine_info.get('circle_build_num', None), + "circle_project_reponame": machine_info.get('circle_project_name', None), + "torchbench_score": score, + }) + self.upload(m) + if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--pytest_bench_json", type=argparse.FileType('r'), + parser.add_argument("--pytest_bench_json", required=True, + type=argparse.FileType('r'), help='Upload json data formatted by pytest-benchmark module') + parser.add_argument("--torchbench_score", type=float, + help="optional torchbench score to include") args = parser.parse_args() - if args.pytest_bench_json: - benchmark_uploader = PytorchBenchmarkUploader() - json_data = json.load(args.pytest_bench_json) - benchmark_uploader.post_pytest_benchmarks(json_data) + + benchmark_uploader = PytorchBenchmarkUploader() + json_data = json.load(args.pytest_bench_json) + benchmark_uploader.post_pytest_benchmarks(json_data) + + if args.torchbench_score is not None: + benchmark_uploader.post_torchbench_score(json_data, args.torchbench_score) From 29b34925c993cb9026fcbfafa14bed6a2aec86e2 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Thu, 24 Sep 2020 23:18:54 -0400 Subject: [PATCH 03/13] Remove maskrcnn from score temporarily --- score/score.yml | 6 ++-- score/torchbench_0.0.yaml | 62 ++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/score/score.yml b/score/score.yml index fb2994958b..80f0bbb4f1 100644 --- a/score/score.yml +++ b/score/score.yml @@ -2,8 +2,8 @@ heirarchy: 
model: computer vision: - segmentation: - maskrcnn_benchmark: + # segmentation: + # maskrcnn_benchmark: classification: pytorch_mobilenet_v3: detection: @@ -35,4 +35,4 @@ heirarchy: moco: demucs: pytorch_struct: -... \ No newline at end of file +... diff --git a/score/torchbench_0.0.yaml b/score/torchbench_0.0.yaml index 4d8ab8a3df..75ff829254 100644 --- a/score/torchbench_0.0.yaml +++ b/score/torchbench_0.0.yaml @@ -25,10 +25,10 @@ benchmarks: weight: 0.020833333333333332 test_eval[Super_SloMo-cuda-eager]: norm: 0.5727992373999768 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_eval[Super_SloMo-cuda-jit]: norm: 0.5758905388000131 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_eval[attention_is_all_you_nee...-cpu-eager]: norm: 0.8630710880000152 weight: 0.006944444444444444 @@ -71,9 +71,6 @@ benchmarks: test_eval[fastNLP-cuda-jit]: norm: 0.00021672122712418238 weight: 0.006944444444444444 - test_eval[maskrcnn_benchmark-cuda-eager]: - norm: 0.15884241357148962 - weight: 0.016666666666666666 test_eval[moco-cuda-eager]: norm: 0.6445621068000037 weight: 0.013888888888888888 @@ -82,34 +79,34 @@ benchmarks: weight: 0.013888888888888888 test_eval[pytorch_CycleGAN_and_pix...-cuda-eager]: norm: 0.037252862678558686 - weight: 0.005555555555555555 + weight: 0.006944444444444444 test_eval[pytorch_CycleGAN_and_pix...-cuda-jit]: norm: 0.03734425745829147 - weight: 0.005555555555555555 + weight: 0.006944444444444444 test_eval[pytorch_mobilenet_v3-cpu-eager]: norm: 0.02729150021618684 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_eval[pytorch_mobilenet_v3-cpu-jit]: norm: 0.027794305918903136 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_eval[pytorch_mobilenet_v3-cuda-eager]: norm: 0.02729599686483345 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_eval[pytorch_mobilenet_v3-cuda-jit]: norm: 0.02273801193330453 - weight: 0.004166666666666667 + weight: 0.005208333333333333 
test_eval[pytorch_stargan-cpu-eager]: norm: 1.030070522799997 - weight: 0.0020833333333333333 + weight: 0.0026041666666666665 test_eval[pytorch_stargan-cpu-jit]: norm: 1.0323324262000824 - weight: 0.0020833333333333333 + weight: 0.0026041666666666665 test_eval[pytorch_stargan-cuda-eager]: norm: 0.10837374479997379 - weight: 0.0020833333333333333 + weight: 0.0026041666666666665 test_eval[pytorch_stargan-cuda-jit]: norm: 0.10870929039992916 - weight: 0.0020833333333333333 + weight: 0.0026041666666666665 test_eval[pytorch_struct-cpu-eager]: norm: 0.002562631041459985 weight: 0.006944444444444444 @@ -127,10 +124,10 @@ benchmarks: weight: 0.08333333333333333 test_eval[yolov3-cpu-eager]: norm: 0.8865785995999431 - weight: 0.01111111111111111 + weight: 0.013888888888888888 test_eval[yolov3-cuda-eager]: norm: 0.5735116551998545 - weight: 0.01111111111111111 + weight: 0.013888888888888888 test_train[BERT_pytorch-cpu-eager]: norm: 0.27472745519994535 weight: 0.006944444444444444 @@ -145,10 +142,10 @@ benchmarks: weight: 0.006944444444444444 test_train[Background_Matting-cuda-eager]: norm: 4.454985670599944 - weight: 0.008333333333333333 + weight: 0.010416666666666666 test_train[Background_Matting-cuda-jit]: norm: 4.3299996296000245 - weight: 0.008333333333333333 + weight: 0.010416666666666666 test_train[LearningToPaint-cpu-eager]: norm: 0.06769545181248304 weight: 0.020833333333333332 @@ -163,10 +160,10 @@ benchmarks: weight: 0.020833333333333332 test_train[Super_SloMo-cuda-eager]: norm: 1.4887921610000376 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_train[Super_SloMo-cuda-jit]: norm: 1.4842954989999726 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_train[attention_is_all_you_nee...-cpu-eager]: norm: 2.8790815572000157 weight: 0.006944444444444444 @@ -209,9 +206,6 @@ benchmarks: test_train[fastNLP-cuda-jit]: norm: 0.0025617683341551596 weight: 0.006944444444444444 - test_train[maskrcnn_benchmark-cuda-eager]: - norm: 
0.39820833739995576 - weight: 0.016666666666666666 test_train[moco-cuda-eager]: norm: 1.3086710984000092 weight: 0.013888888888888888 @@ -220,31 +214,31 @@ benchmarks: weight: 0.013888888888888888 test_train[pytorch_CycleGAN_and_pix...-cuda-eager]: norm: 53.70666004940008 - weight: 0.005555555555555555 + weight: 0.006944444444444444 test_train[pytorch_mobilenet_v3-cpu-eager]: norm: 0.25344906619993707 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_train[pytorch_mobilenet_v3-cpu-jit]: norm: 0.23886577479997867 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_train[pytorch_mobilenet_v3-cuda-eager]: norm: 0.2574923820000549 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_train[pytorch_mobilenet_v3-cuda-jit]: norm: 0.24167844399989918 - weight: 0.004166666666666667 + weight: 0.005208333333333333 test_train[pytorch_stargan-cpu-eager]: norm: 4.203840443199988 - weight: 0.0020833333333333333 + weight: 0.0026041666666666665 test_train[pytorch_stargan-cpu-jit]: norm: 4.67151848600015 - weight: 0.0020833333333333333 + weight: 0.0026041666666666665 test_train[pytorch_stargan-cuda-eager]: norm: 0.9323661928000547 - weight: 0.0020833333333333333 + weight: 0.0026041666666666665 test_train[pytorch_stargan-cuda-jit]: norm: 0.9228581516000304 - weight: 0.0020833333333333333 + weight: 0.0026041666666666665 test_train[pytorch_struct-cpu-eager]: norm: 1.2756745926000803 weight: 0.006944444444444444 @@ -262,5 +256,5 @@ benchmarks: weight: 0.08333333333333333 test_train[yolov3-cuda-eager]: norm: 6.38542746819985 - weight: 0.01111111111111111 + weight: 0.013888888888888888 target: 1000 From ecec19ea5ce0bd8a37c563b5bc0533f0e6548b22 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 25 Sep 2020 17:52:57 -0400 Subject: [PATCH 04/13] Add --benchmark_data_dir option to compute_score.py --- score/compute_score.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/score/compute_score.py 
b/score/compute_score.py index 9dfa847a28..9e76bd30c0 100644 --- a/score/compute_score.py +++ b/score/compute_score.py @@ -4,8 +4,11 @@ """ import argparse import json +import os import yaml +from tabulate import tabulate + def compute_score(config, data): target = config['target'] score = 1.0 @@ -27,16 +30,31 @@ def compute_score(config, data): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--configuration", required=True, help="frozen benchmark configuration generated by generate_score_config.py") - parser.add_argument("--benchmark_data", required=True, + parser.add_argument("--benchmark_data_file", help="pytest-benchmark json file with current benchmark data") + parser.add_argument("--benchmark_data_dir", + help="directory containing multiple .json files for each of which to compute a score") args = parser.parse_args() with open(args.configuration) as cfg_file: config = yaml.full_load(cfg_file) - with open(args.benchmark_data) as data_file: - data = json.load(data_file) + if args.benchmark_data_file is not None: + with open(args.benchmark_data_file) as data_file: + data = json.load(data_file) + + score = compute_score(config, data) + print(score) + elif args.benchmark_data_dir is not None: + scores = [('File', 'Score')] + for f in os.listdir(args.benchmark_data_dir): + path = os.path.join(args.benchmark_data_dir, f) + if os.path.isfile(path) and os.path.splitext(path)[1] == '.json': + with open(path) as data_file: + data = json.load(data_file) + score = compute_score(config, data) + scores.append((f, score)) + + print(tabulate(scores, headers='firstrow')) - score = compute_score(config, data) - print(score) # print(f"Benchmark Score: {score} (rounded) {int(round(score))}") \ No newline at end of file From be9394804bbbcc624c81c61dd285a0e5b5c1a7d5 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 25 Sep 2020 17:53:55 -0400 Subject: [PATCH 05/13] Update run_bench_upload to use new compute_score flag --- 
scripts/run_bench_and_upload.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_bench_and_upload.sh b/scripts/run_bench_and_upload.sh index 043be21b2f..784590f40e 100755 --- a/scripts/run_bench_and_upload.sh +++ b/scripts/run_bench_and_upload.sh @@ -25,7 +25,7 @@ pytest test_bench.py --setup-show --benchmark-sort=Name --benchmark-json=${BENCH # Compute benchmark score -TORCHBENCH_SCORE=$(python score/compute_score.py --configuration score/torchbench_0.0.yaml --benchmark_data ${BENCHMARK_DATA}/hub.json) +TORCHBENCH_SCORE=$(python score/compute_score.py --configuration score/torchbench_0.0.yaml --benchmark_data_file ${BENCHMARK_DATA}/hub.json) # Token is only present for certain jobs, only upload if present if [ -z "$SCRIBE_GRAPHQL_ACCESS_TOKEN" ] then From a1500977009318a36e44e1ea833b9f79d9ddb948 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 25 Sep 2020 17:54:40 -0400 Subject: [PATCH 06/13] Fix benchmark filename used in score computation --- scripts/run_bench_and_upload.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_bench_and_upload.sh b/scripts/run_bench_and_upload.sh index 784590f40e..7f224554aa 100755 --- a/scripts/run_bench_and_upload.sh +++ b/scripts/run_bench_and_upload.sh @@ -25,7 +25,7 @@ pytest test_bench.py --setup-show --benchmark-sort=Name --benchmark-json=${BENCH # Compute benchmark score -TORCHBENCH_SCORE=$(python score/compute_score.py --configuration score/torchbench_0.0.yaml --benchmark_data_file ${BENCHMARK_DATA}/hub.json) +TORCHBENCH_SCORE=$(python score/compute_score.py --configuration score/torchbench_0.0.yaml --benchmark_data_file ${BENCHMARK_ABS_FILENAME}) # Token is only present for certain jobs, only upload if present if [ -z "$SCRIBE_GRAPHQL_ACCESS_TOKEN" ] then From 0709e171d8dce7ce18db5f33f22f77f7ede4a57c Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 25 Sep 2020 18:51:50 -0400 Subject: [PATCH 07/13] Move dependencies from CI script to requirements.txt 
and add tabulate --- requirements.txt | 4 +++- scripts/install_nightlies.sh | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7bc348361c..7689900959 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ pytest -pytest-benchmark \ No newline at end of file +pytest-benchmark +requests +tabulate diff --git a/scripts/install_nightlies.sh b/scripts/install_nightlies.sh index a8f6d22942..cdeff59218 100755 --- a/scripts/install_nightlies.sh +++ b/scripts/install_nightlies.sh @@ -4,5 +4,3 @@ set -e . ~/miniconda3/etc/profile.d/conda.sh conda activate base conda install -y pytorch torchtext torchvision -c pytorch-nightly -pip install -q pytest pytest-benchmark requests - From aa8eb99c31d994565c841582ea434ad174cb8340 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Mon, 28 Sep 2020 21:05:55 -0400 Subject: [PATCH 08/13] Change score computation to exp sum weighted log This computation has a more intuitive appeal, as a 2x across the board improvement would yield a 2x score improvement. Also add 'hack_data' option to enable quick experiments starting with real data and 'editing' only a keyword match set of measurements by some factor. 
--- score/compute_score.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/score/compute_score.py b/score/compute_score.py index 9e76bd30c0..e7c0c25818 100644 --- a/score/compute_score.py +++ b/score/compute_score.py @@ -4,27 +4,31 @@ """ import argparse import json +import math import os import yaml from tabulate import tabulate -def compute_score(config, data): +def compute_score(config, data, fake_data=None): target = config['target'] - score = 1.0 + score = 0.0 weight_sum = 0.0 for name in config['benchmarks']: cfg = config['benchmarks'][name] weight, norm = cfg['weight'], cfg['norm'] weight_sum += weight measured_mean = [b['stats']['mean'] for b in data['benchmarks'] if b['name'] == name][0] - benchmark_score = (norm / measured_mean) ** weight + if fake_data is not None and name in fake_data: + # used for sanity checks on the sensitivity of the score metric + measured_mean = fake_data[name] + benchmark_score = weight * math.log(norm / measured_mean) # print(f"{name}: {benchmark_score}") - score *= benchmark_score + score += benchmark_score - score = score ** (1.0 / len(config['benchmarks'])) + score = target * math.exp(score) assert abs(weight_sum - 1.0) < 1e-6, f"Bad configuration, weights don't sum to 1, but {weight_sum}" - return score * target + return score if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) @@ -34,8 +38,8 @@ def compute_score(config, data): help="pytest-benchmark json file with current benchmark data") parser.add_argument("--benchmark_data_dir", help="directory containing multiple .json files for each of which to compute a score") + parser.add_argument('--hack_data', nargs=2, action='append', help="keyword to match benchmark names, and multiplicative factor to adjust their measurement") args = parser.parse_args() - with open(args.configuration) as cfg_file: config = yaml.full_load(cfg_file) @@ -45,6 +49,16 @@ def compute_score(config, data): score = 
compute_score(config, data) print(score) + if args.hack_data: + fake_data = {} + for keyword, factor in args.hack_data: + for b in data['benchmarks']: + if keyword.lower() in b['name'].lower(): + fake_data[b['name']] = b['stats']['mean'] * float(factor) + + hacked_score = compute_score(config, data, fake_data) + print(f"Using hacks {args.hack_data}, hacked_score {hacked_score}") + elif args.benchmark_data_dir is not None: scores = [('File', 'Score')] for f in os.listdir(args.benchmark_data_dir): From 3acd61bc329580da4bbc4a69c7bed634bfe2f387 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Mon, 28 Sep 2020 21:09:46 -0400 Subject: [PATCH 09/13] Add assert for missing data during score computation - doesn't change the outcome but prints helpful error --- score/compute_score.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/score/compute_score.py b/score/compute_score.py index e7c0c25818..ea0c33f15e 100644 --- a/score/compute_score.py +++ b/score/compute_score.py @@ -18,7 +18,9 @@ def compute_score(config, data, fake_data=None): cfg = config['benchmarks'][name] weight, norm = cfg['weight'], cfg['norm'] weight_sum += weight - measured_mean = [b['stats']['mean'] for b in data['benchmarks'] if b['name'] == name][0] + measured_mean = [b['stats']['mean'] for b in data['benchmarks'] if b['name'] == name] + assert len(measured_mean) == 1, f"Missing data for {name}, unable to compute score" + measured_mean = measured_mean[0] if fake_data is not None and name in fake_data: # used for sanity checks on the sensitivity of the score metric measured_mean = fake_data[name] From 67c4730cf9b591b3d1f5e3c3fcf542a2ee89ee92 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Mon, 28 Sep 2020 23:43:38 -0400 Subject: [PATCH 10/13] Temporarily remove cyclegan/stargan from score as they have been disabled on PR jobs preventing score calculation on PRs --- score/score.yml | 6 ++-- score/torchbench_0.0.yaml | 67 ++++++++++----------------------------- 2 files changed, 20 
insertions(+), 53 deletions(-) diff --git a/score/score.yml b/score/score.yml index 80f0bbb4f1..982695f3ea 100644 --- a/score/score.yml +++ b/score/score.yml @@ -8,9 +8,9 @@ heirarchy: pytorch_mobilenet_v3: detection: yolov3: - generation: - pytorch_CycleGAN_and_pix...: - pytorch_stargan: + # generation: + # pytorch_CycleGAN_and_pix...: + # pytorch_stargan: other computer vision: Background_Matting: Super_SloMo: diff --git a/score/torchbench_0.0.yaml b/score/torchbench_0.0.yaml index 75ff829254..d972734e60 100644 --- a/score/torchbench_0.0.yaml +++ b/score/torchbench_0.0.yaml @@ -25,10 +25,10 @@ benchmarks: weight: 0.020833333333333332 test_eval[Super_SloMo-cuda-eager]: norm: 0.5727992373999768 - weight: 0.005208333333333333 + weight: 0.006944444444444444 test_eval[Super_SloMo-cuda-jit]: norm: 0.5758905388000131 - weight: 0.005208333333333333 + weight: 0.006944444444444444 test_eval[attention_is_all_you_nee...-cpu-eager]: norm: 0.8630710880000152 weight: 0.006944444444444444 @@ -77,36 +77,18 @@ benchmarks: test_eval[moco-cuda-jit]: norm: 0.6464073749999443 weight: 0.013888888888888888 - test_eval[pytorch_CycleGAN_and_pix...-cuda-eager]: - norm: 0.037252862678558686 - weight: 0.006944444444444444 - test_eval[pytorch_CycleGAN_and_pix...-cuda-jit]: - norm: 0.03734425745829147 - weight: 0.006944444444444444 test_eval[pytorch_mobilenet_v3-cpu-eager]: norm: 0.02729150021618684 - weight: 0.005208333333333333 + weight: 0.006944444444444444 test_eval[pytorch_mobilenet_v3-cpu-jit]: norm: 0.027794305918903136 - weight: 0.005208333333333333 + weight: 0.006944444444444444 test_eval[pytorch_mobilenet_v3-cuda-eager]: norm: 0.02729599686483345 - weight: 0.005208333333333333 + weight: 0.006944444444444444 test_eval[pytorch_mobilenet_v3-cuda-jit]: norm: 0.02273801193330453 - weight: 0.005208333333333333 - test_eval[pytorch_stargan-cpu-eager]: - norm: 1.030070522799997 - weight: 0.0026041666666666665 - test_eval[pytorch_stargan-cpu-jit]: - norm: 1.0323324262000824 - weight: 
0.0026041666666666665 - test_eval[pytorch_stargan-cuda-eager]: - norm: 0.10837374479997379 - weight: 0.0026041666666666665 - test_eval[pytorch_stargan-cuda-jit]: - norm: 0.10870929039992916 - weight: 0.0026041666666666665 + weight: 0.006944444444444444 test_eval[pytorch_struct-cpu-eager]: norm: 0.002562631041459985 weight: 0.006944444444444444 @@ -124,10 +106,10 @@ benchmarks: weight: 0.08333333333333333 test_eval[yolov3-cpu-eager]: norm: 0.8865785995999431 - weight: 0.013888888888888888 + weight: 0.018518518518518517 test_eval[yolov3-cuda-eager]: norm: 0.5735116551998545 - weight: 0.013888888888888888 + weight: 0.018518518518518517 test_train[BERT_pytorch-cpu-eager]: norm: 0.27472745519994535 weight: 0.006944444444444444 @@ -142,10 +124,10 @@ benchmarks: weight: 0.006944444444444444 test_train[Background_Matting-cuda-eager]: norm: 4.454985670599944 - weight: 0.010416666666666666 + weight: 0.013888888888888888 test_train[Background_Matting-cuda-jit]: norm: 4.3299996296000245 - weight: 0.010416666666666666 + weight: 0.013888888888888888 test_train[LearningToPaint-cpu-eager]: norm: 0.06769545181248304 weight: 0.020833333333333332 @@ -160,10 +142,10 @@ benchmarks: weight: 0.020833333333333332 test_train[Super_SloMo-cuda-eager]: norm: 1.4887921610000376 - weight: 0.005208333333333333 + weight: 0.006944444444444444 test_train[Super_SloMo-cuda-jit]: norm: 1.4842954989999726 - weight: 0.005208333333333333 + weight: 0.006944444444444444 test_train[attention_is_all_you_nee...-cpu-eager]: norm: 2.8790815572000157 weight: 0.006944444444444444 @@ -212,33 +194,18 @@ benchmarks: test_train[moco-cuda-jit]: norm: 1.3050591925999924 weight: 0.013888888888888888 - test_train[pytorch_CycleGAN_and_pix...-cuda-eager]: - norm: 53.70666004940008 - weight: 0.006944444444444444 test_train[pytorch_mobilenet_v3-cpu-eager]: norm: 0.25344906619993707 - weight: 0.005208333333333333 + weight: 0.006944444444444444 test_train[pytorch_mobilenet_v3-cpu-jit]: norm: 0.23886577479997867 - weight: 
0.005208333333333333 + weight: 0.006944444444444444 test_train[pytorch_mobilenet_v3-cuda-eager]: norm: 0.2574923820000549 - weight: 0.005208333333333333 + weight: 0.006944444444444444 test_train[pytorch_mobilenet_v3-cuda-jit]: norm: 0.24167844399989918 - weight: 0.005208333333333333 - test_train[pytorch_stargan-cpu-eager]: - norm: 4.203840443199988 - weight: 0.0026041666666666665 - test_train[pytorch_stargan-cpu-jit]: - norm: 4.67151848600015 - weight: 0.0026041666666666665 - test_train[pytorch_stargan-cuda-eager]: - norm: 0.9323661928000547 - weight: 0.0026041666666666665 - test_train[pytorch_stargan-cuda-jit]: - norm: 0.9228581516000304 - weight: 0.0026041666666666665 + weight: 0.006944444444444444 test_train[pytorch_struct-cpu-eager]: norm: 1.2756745926000803 weight: 0.006944444444444444 @@ -256,5 +223,5 @@ benchmarks: weight: 0.08333333333333333 test_train[yolov3-cuda-eager]: norm: 6.38542746819985 - weight: 0.013888888888888888 + weight: 0.018518518518518517 target: 1000 From fbf346865b8388ce1199cb76371efc772d68718a Mon Sep 17 00:00:00 2001 From: wconstab Date: Tue, 29 Sep 2020 12:51:59 -0700 Subject: [PATCH 11/13] Update README.md --- score/README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/score/README.md b/score/README.md index cff74249b6..c8defe157c 100644 --- a/score/README.md +++ b/score/README.md @@ -14,11 +14,23 @@ benchmark configuration. ## Computing the score To compute the current score, provide a score config and benchmark data produced by pytest with `--benchmark-json` or related arguments. -`python compute_score.py --configuration --benchmark_data ` +`python compute_score.py --configuration --benchmark_data_file ` +Or, use `--benchmark_data_dir` instead, pointing to a directory containing multiple json files to compute a table of scores. 
## New score versions Periodically, as more workloads have been added to the torchbenchmark suite, or as changes to relative weights or categories have been proposed, a new score configuration should be generated rather than modifying an existing score definition. -See `python generate_score_config.py -h` \ No newline at end of file +See `python generate_score_config.py -h` + +## Issues and Next Steps +For accurate score comparisons, measurements should be computed on the same machine +(or at least same machine spec) as the data used to produce normalization constants +in the score configuration. + +- compute_score.py should assert the machine type matches by default +- currently, a circleCI 'medium' gpu worker was used for the normalization data +- soon, a particular CPU/GPU config should be deliberately selected along with a + list of models/categories to be frozen for first long-living rev of the score + From 41da4ee0bf7618e857d45bb680be12a03bcf95be Mon Sep 17 00:00:00 2001 From: wconstab Date: Tue, 29 Sep 2020 14:19:58 -0700 Subject: [PATCH 12/13] Update generate_score_config.py --- score/generate_score_config.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/score/generate_score_config.py b/score/generate_score_config.py index 567de9a7d9..bc65120fc2 100644 --- a/score/generate_score_config.py +++ b/score/generate_score_config.py @@ -32,12 +32,10 @@ ------------------ Used to 'fill in the gaps' in the human written specification. -Benchmark configurations (train/eval, device, compiler/runtime) present in -this data are frozen into the configuration and weighted equally if weights -specific weights aren't provided in the specification. - -Normalization values are the benchmark measurements taken from this data, used -to produce a value of 1.0 for each benchmark before applying weights and combining. 
+- particular configurations (train/eval, device, compiler/runtime) present in +this data are used to compute benchmark weights +- measurements from this data are used as normalization factors in score computation + such that new data is scored relative to this data. #### TODO @@ -118,4 +116,4 @@ def generate_bench_cfg(spec, norm, target): with open(args.output_file, 'w') as out_file: bench_cfg = generate_bench_cfg(spec, norm, args.target_score) yaml.dump(bench_cfg, out_file) - \ No newline at end of file + From 78687a77286dd9a046864aa618f41ba2b9671765 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Thu, 1 Oct 2020 14:12:50 -0400 Subject: [PATCH 13/13] Fix spelling of hierarchy --- score/generate_score_config.py | 14 +++++++------- score/score.yml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/score/generate_score_config.py b/score/generate_score_config.py index bc65120fc2..539e816940 100644 --- a/score/generate_score_config.py +++ b/score/generate_score_config.py @@ -4,7 +4,7 @@ Specification File ------------------ -Score heirarchy input intended to be as easy to construct as possible, +Score hierarchy input intended to be as easy to construct as possible, relying on automatic inference of unspecified weights, benchmark configs, and normalization factors given a particular instance of benchmark data. @@ -21,7 +21,7 @@ - device | provide specific weights or - compiler/runtime _| exclude particular configs by omission -Rules for describing the weight heirarchy +Rules for describing the weight hierarchy - everything is a dict, since at any level you could specify a weight - if a weight is not specified, it is computed automatically with respect its direct siblings. 
@@ -56,11 +56,11 @@ def generate_bench_cfg(spec, norm, target): benchmark_names = [b['name'] for b in norm['benchmarks']] benchmark_norms = {b['name']: b['stats']['mean'] for b in norm['benchmarks']} - assert len(spec['heirarchy']) > 0, "Must specify at least one category" - category_weight = 1.0 / len(spec['heirarchy']) - for category in spec['heirarchy']: + assert len(spec['hierarchy']) > 0, "Must specify at least one category" + category_weight = 1.0 / len(spec['hierarchy']) + for category in spec['hierarchy']: - category_spec = spec['heirarchy'][category] + category_spec = spec['hierarchy'][category] assert isinstance(category_spec, dict), f"Category {category} in spec must be non-empty" assert 'weight' not in category_spec, "TODO implement manual category weights" domain_weight = 1.0 / len(category_spec) @@ -97,7 +97,7 @@ def generate_bench_cfg(spec, norm, target): if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--specification", required=True, - help="yaml file describing weight heirarchy") + help="yaml file describing weight hierarchy") parser.add_argument("--normalization_data", required=True, help="pytest-benchmark json file used for generating normalization " "values and filling in unspecified benchmark configurations") diff --git a/score/score.yml b/score/score.yml index 982695f3ea..b82d07c78c 100644 --- a/score/score.yml +++ b/score/score.yml @@ -1,5 +1,5 @@ --- -heirarchy: +hierarchy: model: computer vision: # segmentation: