-
Notifications
You must be signed in to change notification settings - Fork 298
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Import revamped nix benchmark framework
The nix code for running benchmarks is now imported in-tree. This makes it easy to automatically run benchmarks and also generate the fancy R visualizations. See README.md for example usage. The visualizations are now generated as PNG files instead of an Rmarkdown document. This is to make it easier to introduce new visualizations, or variations of existing ones, and easier to load large images for zooming. Under the hood the nix code is also updated so that each test run is a separate derivation. This means that nix is able to parallelize the test runs (one execution of each benchmark with one raptorjit version) and distribute them between machines. This should make the tests run faster on the Hydra CI cluster and also avoid tying up whole servers with hours-long derivations that run hundreds of test runs at the same time. (It also allows you to parallelize test runs on the local machine to use multiple cores.)
- Loading branch information
Showing 4 changed files with 210 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# R subroutines for reading and visualizing benchmark results. | ||
|
||
suppressPackageStartupMessages({ | ||
library(dplyr) | ||
library(ggplot2) | ||
}) | ||
|
||
## Read benchmark results from a CSV file and annotate them with
## per-benchmark relative performance.
##
## The CSV is expected to have at least the columns `letter`, `benchmark`,
## `version`, and `cycles` (as produced by the nix benchmark derivations).
## Returns the input data with two added columns:
##   baseline - mean cycles of the "A" (baseline) version for that benchmark
##   relative - baseline / cycles, so > 1 means faster than the baseline
bench.read <- function(filename) {
  data <- read.csv(filename)
  ## baseline is the mean performance of the "A" version
  baseline <- data %>%
    filter(letter == "A") %>%
    group_by(benchmark) %>%
    summarize(baseline = mean(cycles))
  ## Add 'relative' performance column: compared to mean from baseline branch.
  ## The joined `baseline` column is constant within each benchmark, so
  ## first() just picks that per-group constant.
  data %>%
    left_join(baseline, by = "benchmark") %>%
    group_by(benchmark, version) %>%
    mutate(relative = first(baseline) / cycles)
}
|
||
## Jitter plot of relative performance, one facet per benchmark.
## Each test run contributes one (jittered) point, so the spread of
## results for every version is visible at a glance.
bench.jitterplot <- function(data) {
  ggplot(data, aes(y = relative, x = version, color = version)) +
    geom_jitter(shape = 1, alpha = 0.5) +
    scale_y_continuous(breaks = seq(0, 3, 0.1), labels = scales::percent) +
    facet_wrap(~ benchmark, scales = "free_x") +
    theme(aspect.ratio = 1,
          axis.text.x = element_text(angle = 90)) +
    labs(y = "Performance relative to baseline average",
         title = "Comparative performance between RaptorJIT versions")
}
|
||
## ECDF plot of relative performance, one facet per benchmark.
##
## x is the relative-performance value and y is the empirical cumulative
## distribution (the proportion of results at or below x). The original
## code had the two axis labels swapped relative to the mapped aesthetics,
## and described the ECDF as "at or above" when stat_ecdf computes the
## proportion at or *below* each value; both are fixed here.
bench.ecdfplot <- function(data) {
  ggplot(aes(x = relative, color = version), data = data) +
    stat_ecdf() +
    scale_x_continuous(labels = scales::percent) +
    ## Log scale spreads out the tail of the distribution.
    scale_y_log10(labels = scales::percent) +
    theme(aspect.ratio = 1) +
    theme(axis.text.x = element_text(angle = 90)) +
    xlab("Performance relative to baseline average") +
    ylab("Percentage of results at or below this performance level") +
    ggtitle("Comparative performance between RaptorJIT variants") +
    facet_wrap(~ benchmark)
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
# Run a large parallel benchmark campaign and generate R/ggplot2 reports.
#
# Compares up to five RaptorJIT versions (A..E). Version A is mandatory
# and serves as the baseline; B..E are optional (enabled when their *src
# argument is non-null). Each (version, run) pair is built as a separate
# derivation so that nix can parallelize the runs and distribute them
# across build machines.

{ pkgs ? (import ../../pkgs.nix) {},
  # Per-version source tree, display name, and extra raptorjit CLI args.
  Asrc, Aname ? "A", Aargs ? "",
  Bsrc ? null, Bname ? "B", Bargs ? "",
  Csrc ? null, Cname ? "C", Cargs ? "",
  Dsrc ? null, Dname ? "D", Dargs ? "",
  Esrc ? null, Ename ? "E", Eargs ? "",
  # Optional requiredSystemFeatures tag to pin all runs to one machine class.
  hardware ? null,
  # Number of repetitions of the full benchmark suite per version.
  runs ? 30 }:

with pkgs;
with stdenv;

# Derivation to run benchmarks and produce a CSV result.
# One derivation per (version, run); emits $out/bench.csv rows of the form
#   letter,version,benchmark,run,instructions,cycles
let benchmark = letter: name: src: args: run:
  let raptorjit = (import src {inherit pkgs; version = name;}).raptorjit; in
  mkDerivation {
    name = "benchmark-${name}-${toString run}";
    src = pkgs.lib.cleanSource ./.;
    # Force consistent hardware
    requiredSystemFeatures = if hardware != null then [hardware] else [];
    buildInputs = [ raptorjit linuxPackages.perf utillinux ];
    buildPhase = ''
      # Run multiple iterations of the benchmarks
      echo "Run $run"
      mkdir -p result/$run
      # Run each individual benchmark
      cat PARAM_x86_CI.txt |
      (while read benchmark params; do
         echo "running $benchmark"
         # Execute with performance monitoring & time supervision
         # Note: discard stdout due to overwhelming output
         # The run number seeds the PRNG so repeated runs are not identical;
         # a hung benchmark is killed after 60s and its perf log discarded.
         timeout -sKILL 60 \
           perf stat -x, -o result/$run/$benchmark.perf \
           raptorjit ${args} -e "math.randomseed(${toString run})" $benchmark.lua $params \
           > /dev/null || \
           rm result/$run/$benchmark.perf
       done)
    '';
    installPhase = ''
      # Copy the raw perf output for reference
      cp -r result $out
      # Log the exact CPU
      lscpu > $out/cpu.txt
      # Create a CSV file
      # Create the rows based on the perf logs
      # NOTE(review): buildPhase writes logs under result/$run/*.perf but this
      # glob scans result/*.perf (one directory up) — confirm the loop actually
      # finds the logs, or whether it should be result/$run/*.perf.
      for result in result/*.perf; do
        version=${name}
        benchmark=$(basename -s.perf -a $result)
        # perf stat -x, emits CSV: field 1 = value, field 3 = event name.
        instructions=$(awk -F, -e '$3 == "instructions" { print $1; }' $result)
        cycles=$(      awk -F, -e '$3 == "cycles"       { print $1; }' $result)
        echo ${letter},$version,$benchmark,${toString run},$instructions,$cycles >> $out/bench.csv
      done
    '';
  };

  # Run a set of benchmarks and aggregate the results into a CSV file.
  # Each benchmark run is a separate derivation. This allows nix to
  # parallelize and distribute the benchmarking.
  benchmarkSet = letter: name: src: args:
    let benchmarks = map (benchmark letter name src args) (pkgs.lib.range 1 runs);
    in
      runCommand "benchmarks-${name}" { buildInputs = benchmarks; } ''
        source $stdenv/setup
        mkdir -p $out
        # Concatenate the per-run CSV fragments in run order.
        for dir in ${pkgs.lib.fold (acc: x: "${acc} ${x}") "" benchmarks}; do
          cat $dir/bench.csv >> $out/bench.csv
        done
      '';

  # Baseline set is always built; optional sets collapse to "" when their
  # source is not given, which interpolates to an empty string below.
  benchA = (benchmarkSet "A" Aname Asrc Aargs);
  benchB = if Bsrc != null then (benchmarkSet "B" Bname Bsrc Bargs) else "";
  benchC = if Csrc != null then (benchmarkSet "C" Cname Csrc Cargs) else "";
  benchD = if Dsrc != null then (benchmarkSet "D" Dname Dsrc Dargs) else "";
  benchE = if Esrc != null then (benchmarkSet "E" Ename Esrc Eargs) else "";
in

rec {
  # Merge all per-version CSVs, render the PNG visualizations with
  # generate.R, and register Hydra build products for CSV and PNGs.
  benchmarkResults = mkDerivation {
    name = "benchmark-results";
    buildInputs = with pkgs.rPackages; [ pkgs.R ggplot2 dplyr ];
    # NOTE(review): the builder is created with writeText, which is not
    # executable, and is named "builder.csv" though it is a shell script —
    # confirm this executes (writeScript / "builder.sh" would be expected).
    builder = pkgs.writeText "builder.csv" ''
      source $stdenv/setup
      # Get the CSV file
      mkdir -p $out/nix-support
      echo "letter,version,benchmark,run,instructions,cycles" > bench.csv
      cat ${benchA}/bench.csv >> bench.csv
      # NOTE(review): with `set -e` in effect, a failed `[ -n "" ]` test in an
      # `&& ` list aborts the build when a version is disabled — verify these
      # lines behave as intended (e.g. vs. `if ...; then ...; fi`).
      [ -n "${benchB}" ] && cat ${benchB}/bench.csv >> bench.csv
      [ -n "${benchC}" ] && cat ${benchC}/bench.csv >> bench.csv
      [ -n "${benchD}" ] && cat ${benchD}/bench.csv >> bench.csv
      [ -n "${benchE}" ] && cat ${benchE}/bench.csv >> bench.csv
      cp bench.csv $out
      echo "file CSV $out/bench.csv" >> $out/nix-support/hydra-build-products
      # Generate the report
      (cd ${./.}; Rscript ./generate.R $out/bench.csv $out)
      for png in $out/*.png; do
        echo "file PNG $png" >> $out/nix-support/hydra-build-products
      done
    '';
  };
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#!/usr/bin/env nix-shell
#!nix-shell -i Rscript -p R rpkgs.dplyr rpkgs.ggplot2

# R command-line program for making visualizations from benchmark results.
#
# Usage: generate.R <csv> <outdir>
#   <csv>    benchmark results CSV (letter,version,benchmark,run,...)
#   <outdir> directory where the PNG plots are written (created if missing)
#
# Fix: use TRUE instead of the reassignable shorthand T.

# bench.R provides bench.read / bench.jitterplot / bench.ecdfplot.
suppressWarnings(source("bench.R"))

args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 2) {
  message("Usage: generate.R <csv> <outdir>"); quit(status = 1)
}

filename <- args[[1]]
outdir <- args[[2]]

data <- bench.read(filename)
if (!dir.exists(outdir)) { dir.create(outdir, recursive = TRUE) }

# Large square images so details remain legible when zoomed.
ggsave(filename = file.path(outdir, "bench-jitter.png"),
       plot = bench.jitterplot(data),
       width = 12, height = 12)

ggsave(filename = file.path(outdir, "bench-ecdf.png"),
       plot = bench.ecdfplot(data),
       width = 12, height = 12)