{{ message }}

# Optimize simple for loops#1338

Closed
opened this issue Sep 15, 2008 · 29 comments
Closed

# Optimize simple for loops#1338

opened this issue Sep 15, 2008 · 29 comments
Labels
Milestone

### scabug commented Sep 15, 2008

 I would like to suggest the compiler optimize the common case of for loops, that is, ```for (var <- Range [by step]) for (var <- int to int [by step]) for (var <- int until int [by step])``` to use while loops under the covers instead of. Currently, nested for loops using ranges/iterators are sometimes an order of magnitude slower than while loops. However, while loop constructs for iterating over arrays are very cumbersome, and the functional style (foreach) is also cumbersome and introduces a lot of function call overhead as well. The following two matrix multipication implementations ``` def matMulUsingIterators ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val b_j = new Array[Double](b.length) for (j <- 0 until b(0).length) { for (k <- 0 until b.length) { b_j(k) = b(k)(j) } for (i <- 0 until a.length) { val c_i = c(i) val a_i = a(i) var s = 0.0d; for (k <- 0 until b.length) { s += a_i(k) * b_j(k) } c_i(j) = s } } } def matMulUsingRanges ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val jRange = 0 until b(0).length; val kRange = 0 until b.length; val iRange = 0 until a.length; val b_j = new Array[Double](b.length) for (j <- jRange) { for (k <- kRange) { b_j(k) = b(k)(j) } for (i <- iRange) { val c_i = c(i); val a_i = a(i); var s = 0.0d; for (k <- kRange) { s += a_i(k) * b_j(k) } c_i(j) = s } } }``` are much slower than the same algorithm coded with while loops: ``` def matMulUsingWhileLoop ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val m = a.length; val p = b(0).length; val n = b.length; val b_j = new Array[Double](b.length); var i = 0; var j = 0; var k = 0; while (j < p) { k = 0 while (k < n) { b_j(k) = b(k)(j); k += 1 } i = 0 while (i < m) { val c_i = c(i); val a_i = a(i); var s = 0.0d; k = 0; while (k < n) { s += a_i(k) * b_j(k); k += 1 } c_i(j) = s; i += 1 } j += 1; } }``` but the while loop code is more complex and error prone. (Sorry, Trac appears to remove some line breaks; I added some explicit semis but might have missed some; I'll try attaching actual working source code) Running this while measuring time in nanoseconds: ```Iterators 2,807,815,301ns Ranges 2,789,958,191ns While Loop 190,778,574ns``` MatMul by Iterators is 14 times as slow as with while loops. It does not appear that the Hotspot runtime profiling and optimization dramatically helps this performance problem This performance problem can hurt adoption of Scala for many types of uses/applications. The text was updated successfully, but these errors were encountered:

### scabug commented Sep 15, 2008

 Imported From: https://issues.scala-lang.org/browse/SI-1338?orig=1 Reporter: @DavidBiesack

### scabug commented Sep 15, 2008

 @DavidBiesack said: ```// Scala code to compare performance of nested int loops object MatMul { def matMulUsingIterators ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val b_j = new Array[Double](b.length) for (j <- 0 until b(0).length) { for (k <- 0 until b.length) { b_j(k) = b(k)(j) } for (i <- 0 until a.length) { val c_i = c(i) val a_i = a(i) var s = 0.0d; for (k <- 0 until b.length) { s += a_i(k) * b_j(k) } c_i(j) = s } } } def matMulUsingRanges ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val jRange = 0 until b(0).length // p val kRange = 0 until b.length // n val iRange = 0 until a.length // m val b_j = new Array[Double](b.length) for (j <- jRange) { for (k <- kRange) { b_j(k) = b(k)(j) } for (i <- iRange) { val c_i = c(i) val a_i = a(i) var s = 0.0d; for (k <- kRange) { s += a_i(k) * b_j(k) } c_i(j) = s } } } def matMulUsingLimits ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val b_j = new Array[Double](b.length) val m = a.length val p = b(0).length val n = b.length for (j <- 0 until p) { for (k <- 0 until n) { b_j(k) = b(k)(j) } for (i <- 0 until m) { val c_i = c(i) val a_i = a(i) var s = 0.0d; for (k <- 0 until n) { s += a_i(k) * b_j(k) } c_i(j) = s } } } def matMulUsingWhileLoop ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val m = a.length val p = b(0).length val n = b.length val b_j = new Array[Double](b.length) var i = 0; var j = 0; var k = 0 while (j < p) { k = 0 while (k < n) { b_j(k) = b(k)(j) k += 1 } i = 0 while (i < m) { val c_i = c(i) val a_i = a(i) var s = 0.0d; k = 0 while (k < n) { s += a_i(k) * b_j(k) k += 1 } c_i(j) = s i += 1 } j += 1 } } def time[R](block: => R) : (Long, R) = { val start = System.nanoTime() val result : R = block val time = System.nanoTime() - start (time, result) } val format = new java.text.DecimalFormat("0,000'ns'"); def report[R](label: String, result: (Long, R)) = { println(label + " " + format.format(result._1)) } private val FACTOR = 5 private val M = 80 private val N = 70 private val P = 60 def main(args : Array[String]) = { for (trial <- 3 until 0 by -1) { val factor = (if (System.getProperty("factor") != null) Integer.parseInt(System.getProperty("factor")) else FACTOR) val multiplier = if (trial == 1) factor else 1; val m = M * multiplier val n = N * multiplier val p = P * multiplier val a = new Array[Array[Double]](m,n) val b = new Array[Array[Double]](n,p) val c = new Array[Array[Double]](m,p) println("\nMultiply c[" + m + "," + p + "] = a[" + m + "," + n + "] times b[" + n + "," + p + "]\n"); val whileTime = time(matMulUsingWhileLoop(a,b,c)) val iterTime = time(matMulUsingIterators(a,b,c)) report("Iterators ", iterTime) report("Limits ", time(matMulUsingLimits(a,b,c))) report("Ranges ", time(matMulUsingRanges(a,b,c))) report("While Loop ", whileTime) println("MatMul by Iterators is " + iterTime._1 / whileTime._1 + " times as slow as with while loops.") } } } ```

### scabug commented Apr 10, 2009

 Jonathan Shore (jshore) said: This is a very important performance enhancement for numerical work. Seems that this could be solved with some pattern matching in the compiler, recognizing a range with no filtering (in the simplest case). Could then remap to a while form for the next stage.

### scabug commented Apr 10, 2009

 @Sciss said: +1

### scabug commented Apr 10, 2009

 Kipton Barros (kbarros) said: +1

### scabug commented May 15, 2009

 Erkki Lindpere (villane) said: +1 It would be nice if for-comprehensions with simple filters could be optimized as well, turning `for (i <- 1 to 10 if shouldProcess(i)) {}` into ```var i = 1 while (i < 10) { if (shouldProcess(i)) { } i += 1 } And extra nice if this would work with random access sequences.```

### scabug commented Oct 22, 2009

 Philippe (phdp) said: +1 This is actually the only thing keeping me from using Scala.

### scabug commented Nov 30, 2009

 @dragos said: Replying to [comment:12 PhDP]: +1 This is actually the only thing keeping me from using Scala. Have you tried '-optimize'? It can help a lot. It's very unlikely this will move from library to compiler-generated loops.

### scabug commented Dec 26, 2009

 Miguel Garcia (mgarcia) said: I haven't benchmarked (with and without -optimize) to see whether the current compilation scheme for "simple loops" is good enough. But in case it isn't, looks like the single place to change in the compiler is method TreeBuilder.makeFor(). According to its comment, http://lampsvn.epfl.ch/trac/scala/browser/scala/trunk/src/compiler/scala/tools/nsc/ast/parser/TreeBuilder.scala it performs five transformations. Prepending as special case a transformation for "simple loops" would not change semantics. (Well, assuming that a local definition does not shadow the usual ones: "to" in "1 to 10", "Range", and so on)

### scabug commented Jul 5, 2010

 Anton Mellit (mellit) said: I tried to create a script with the following: ```def timeit(f : () => Unit) { val t1 = System.currentTimeMillis() f() val t2 = System.currentTimeMillis() println(t2-t1) } def repeat(n : Int, f : Int => Unit) : Unit = { var i = 0 while (i { sum += i }) println(sum) } def test2() { var sum = 0 for(i <- 0 until 1000000000) { sum += i } println(sum) } timeit(test0) timeit(test1) timeit(test2)``` Result is: ```-1243309312 467 -1243309312 504 -1243309312 11899``` May be this 'repeat' is a workaround? Warning: works only with 'scala -optimise'. This is not very stable, sometimes some seemingly minor modifications, i.e. moving the code outside of the function, break it and I get 12000 for 'repeat'.

### scabug commented Jul 6, 2010

 @DavidBiesack said: Replying to [comment:28 mellit]: I tried to create a script with the following: {code} ... def repeat(n : Int, f : Int => Unit) : Unit = { var i = 0 while (i

### scabug commented Sep 18, 2010

 @paulp said: Please see update on this ticket sent to scala-user, also available here: http://scala-programming-language.1934581.n4.nabble.com/optimizing-simple-fors-td2545502.html#a2545502 I very much agree with mgarcia's comment of nine months ago that TreeBuilder.makeFor already does a whole pile of tree transformations and there is no convincing reason we shouldn't add one which has this kind of impact. Failing agreement on that point, I believe we have a pressing responsibility to clean up the parsing phase and plugin architecture sufficiently that it would be possible to do this transformation with a compiler plugin.

### scabug commented Oct 3, 2010

Olivier Chafik (olivier.chafik) said:
Hello,

I've just written such a compiler plugin, which you can install straight away on 2.8.0 with sbaz for testing :
http://article.gmane.org/gmane.comp.lang.scala.user/31814

It's probably full of bugs, but it rewrites int-range for loops with no filters and Array[T].foreach, Array[T].map into equivalent while loops.

You can have a look at the auto tests to see the supported cases :

## Looking forward to seeing something like this mainstream :-) Cheers

zOlive

### scabug commented Oct 4, 2010

 Olivier Chafik (olivier.chafik) said: I've adapted the fannkuch and nbody benchmarks that were in the scala-user thread mentioned previously and I had to adapt it a bit (inlining the ranges that were stored as val range = x until y). Here's the modified code : http://nativelibs4java.sourceforge.net/sbaz/scalacl/examples/ And to run it (with ScalaCL plugin installed via sbaz: sbaz install scalacl-compiler-plugin) : DISABLE_SCALACL_PLUGIN=1 scalac fannkuch.scala && scala fannkuch scalac fannkuch.scala && scala fannkuch DISABLE_SCALACL_PLUGIN=1 scalac nbody.scala && scala nbody scalac nbody.scala && scala nbody With the plugin turned on, the performance of the three variants (While, Limit, Range) is the same (the first while is actually slower, I haven't investigated why).

### scabug commented Oct 6, 2010

 Olivier Chafik (olivier.chafik) said: (Sorry for spamming you again, this should be the last time) I've just enhanced the plugin with more conversions to while loops : Array.foldLeft / foldRight Array.reduceLeft / reduceRight Array.scanLeft / scanRight (in addition to Array.foreach, Array.map and the int range foreach with no filter) Also, the conversions should now work on method references and inline lambdas the same way. Further progress and plans can be tracked at the bottom of this page : http://code.google.com/p/scalacl/wiki/ScalaCLPlugin

### scabug commented Nov 15, 2012

 Herrmann Khosse (herrkhosse) said: +1

### scabug commented Jul 21, 2013

 @SethTisue said: "Spire also provides a loop macro called cfor whose syntax bears a slight resemblance to a traditional for-loop from C or Java. This macro expands to a tail-recursive function, which will inline literal function arguments." https://github.com/non/spire

### scabug commented Jul 21, 2013

 Olivier Chafik (ochafik) said: Here's another loop macro with an arguably better syntax: Scalaxy/Loops (which reuses code from ScalaCL): https://github.com/ochafik/Scalaxy/tree/master/Loops

### scabug commented Mar 28, 2014

 @DarkDimius said: There's a different approach that I've tried in scalaBlitz: dont require users switch from using standard library while they code, but instead give a macro that changes implementation methods, replacing standard library implementation with macro-based one. Here's small description of it: http://scala-blitz.github.io/home/documentation//optimize.html The Range example in this ticket will be compiled to while loops and get same performance.

added this to the Backlog milestone Apr 6, 2017
mentioned this issue Feb 13, 2019
mentioned this issue May 17, 2019
mentioned this issue Jun 11, 2019

 @lrytz do you agree we should now close this, as per scala/scala#8069 ? and if we do close it, let's make some noise about it, too!

 It would seem the noise making probably could have started 6 years ago back when 2.10 was released. I copied the benchmark from scala/scala#8069 out of the Scala project to a fresh sbt 0.13 project and ran it against 2.10.7, 2.11.12, 2.12.8 and 2.13.0. My laptop (4 year old Macbook Air) is a little quieter than when I initially did this, so the variance is a little less with these numbers. 2.10.7 ``````Benchmark (size) Mode Cnt Score Error Units matMulUsingIterators 10 avgt 20 2.32 ± 0.06 us/op matMulUsingRanges 10 avgt 20 2.46 ± 0.06 us/op matMulUsingWhileLoop 10 avgt 20 1.17 ± 0.33 us/op matMulUsingIterators 100 avgt 20 1294.35 ± 26.09 us/op matMulUsingRanges 100 avgt 20 1547.63 ± 36.67 us/op matMulUsingWhileLoop 100 avgt 20 1101.28 ± 56.62 us/op matMulUsingIterators 1000 avgt 20 1450647.85 ± 50986.23 us/op matMulUsingRanges 1000 avgt 20 1609264.80 ± 37524.73 us/op matMulUsingWhileLoop 1000 avgt 20 1442805.40 ± 24099.38 us/op `````` 2.11.12 ``````Benchmark (size) Mode Cnt Score Error Units matMulUsingIterators 10 avgt 20 2.35 ± 0.08 us/op matMulUsingRanges 10 avgt 20 2.51 ± 0.06 us/op matMulUsingWhileLoop 10 avgt 20 1.04 ± 0.03 us/op matMulUsingIterators 100 avgt 20 1297.23 ± 30.70 us/op matMulUsingRanges 100 avgt 20 1632.09 ± 49.61 us/op matMulUsingWhileLoop 100 avgt 20 1064.69 ± 25.42 us/op matMulUsingRanges 1000 avgt 20 1635161.15 ± 39404.86 us/op matMulUsingIterators 1000 avgt 20 1482990.65 ± 42431.02 us/op matMulUsingWhileLoop 1000 avgt 20 1438909.85 ± 20257.01 us/op `````` 2.12.8 ``````Benchmark (size) Mode Cnt Score Error Units matMulUsingIterators 10 avgt 20 2.02 ± 0.06 us/op matMulUsingRanges 10 avgt 20 2.45 ± 0.06 us/op matMulUsingWhileLoop 10 avgt 20 0.90 ± 0.02 us/op matMulUsingIterators 100 avgt 20 1275.96 ± 29.09 us/op matMulUsingRanges 100 avgt 20 1581.64 ± 50.82 us/op matMulUsingWhileLoop 100 avgt 20 1081.20 ± 21.98 us/op matMulUsingIterators 1000 avgt 20 1465832.44 ± 29526.00 us/op matMulUsingRanges 1000 avgt 20 1631038.32 ± 45846.02 us/op matMulUsingWhileLoop 1000 avgt 20 1419277.53 ± 15006.24 us/op `````` 2.13.0 ``````Benchmark (size) Mode Cnt Score Error Units matMulUsingIterators 10 avgt 20 6.64 ± 0.23 us/op matMulUsingRanges 10 avgt 20 2.98 ± 0.08 us/op matMulUsingWhileLoop 10 avgt 20 0.92 ± 0.02 us/op matMulUsingIterators 100 avgt 20 2679.84 ± 50.97 us/op matMulUsingRanges 100 avgt 20 2045.60 ± 46.93 us/op matMulUsingWhileLoop 100 avgt 20 1088.56 ± 29.83 us/op matMulUsingIterators 1000 avgt 20 2391493.44 ± 44324.00 us/op matMulUsingRanges 1000 avgt 20 2114802.81 ± 48825.73 us/op matMulUsingWhileLoop 1000 avgt 20 1435843.18 ± 20688.23 us/op ``````

### Ichoran commented Jun 11, 2019

 @SethTisue - It's not as crazily bad as in the bug report, but since it seems to have actually gotten worse in 2.13 in the benchmark that @ashawley presented, and in any case it's still slow, I wouldn't make any noise. In fact, it seems likely that there's been a ~50% performance regression in 2.13 :(

### lrytz commented Jun 27, 2019

 I copied the benchmark from scala/scala#8069 out of the Scala project to a fresh sbt 0.13 project and ran it against 2.10.7, 2.11.12, 2.12.8 and 2.13.0. @ashawley do you have that repository at hand? I'd like to reproduce your test. I took your PR (scala/scala#8069), rebased it on current 2.13.x, and ran `bench/jmh:run scala.ArrayBenchmark` in sbt. What I got looks pretty good (on my MacBook Pro, and in summer heat, jdk 1.8.0_181): ``````[info] ArrayBenchmark.matMulUsingWhileLoopBench 10 avgt 20 0.603 ± 0.022 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 10 avgt 20 0.851 ± 0.041 us/op [info] ArrayBenchmark.matMulUsingRangesBench 10 avgt 20 0.946 ± 0.019 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 100 avgt 20 750.031 ± 7.678 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 100 avgt 20 861.454 ± 26.186 us/op [info] ArrayBenchmark.matMulUsingRangesBench 100 avgt 20 1172.940 ± 43.039 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 1000 avgt 20 1025865.092 ± 9769.750 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 1000 avgt 20 1141386.009 ± 44863.893 us/op [info] ArrayBenchmark.matMulUsingRangesBench 1000 avgt 20 1202351.806 ± 112234.083 us/op `````` Note that the `bench` project in the scala/scala repo enables the inliner/optimizer (https://github.com/scala/scala/blob/v2.13.0/build.sbt#L634).

### ashawley commented Jun 27, 2019

 Here's a repo that can run the benchmark across Scala versions: https://github.com/ashawley/bug-1338

### lrytz commented Jun 28, 2019

 Thanks, @ashawley! I ran the benchmark on 2.12.8 and 2.13.0, on AdoptOpenJDK 1.8.212-04, on my MacBook Pro. For me the performance is very similar, maybe with a slight advantage for 2.13 ``````2.12.8 [info] ArrayBenchmark.matMulUsingWhileLoopBench 10 avgt 20 0.608 ± 0.059 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 10 avgt 20 4.642 ± 0.090 us/op [info] ArrayBenchmark.matMulUsingRangesBench 10 avgt 20 1.815 ± 0.203 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 100 avgt 20 789.694 ± 35.496 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 100 avgt 20 1665.028 ± 100.906 us/op [info] ArrayBenchmark.matMulUsingRangesBench 100 avgt 20 1218.045 ± 104.188 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 1000 avgt 20 1178862.700 ± 153543.215 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 1000 avgt 20 1482162.761 ± 198875.330 us/op [info] ArrayBenchmark.matMulUsingRangesBench 1000 avgt 20 1464256.812 ± 236343.203 us/op `````` ``````2.13.0 [info] ArrayBenchmark.matMulUsingWhileLoopBench 10 avgt 20 0.555 ± 0.003 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 10 avgt 20 4.095 ± 0.460 us/op [info] ArrayBenchmark.matMulUsingRangesBench 10 avgt 20 1.656 ± 0.027 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 100 avgt 20 774.364 ± 41.958 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 100 avgt 20 1563.868 ± 36.727 us/op [info] ArrayBenchmark.matMulUsingRangesBench 100 avgt 20 1297.571 ± 253.275 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 1000 avgt 20 1090820.285 ± 27666.503 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 1000 avgt 20 1552164.276 ± 220824.486 us/op [info] ArrayBenchmark.matMulUsingRangesBench 1000 avgt 20 1168316.835 ± 21196.048 us/op `````` I also ran it on 2.13 with the optimizer enabled (`> set scalacOptions ++= Seq("-feature", "-opt:l:inline", "-opt-inline-from:**")`, which then the Iterator and RangeForeach versions perform the same as the while loop ``````2.13.0, optimized [info] ArrayBenchmark.matMulUsingWhileLoopBench 10 avgt 20 0.579 ± 0.047 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 10 avgt 20 0.770 ± 0.053 us/op [info] ArrayBenchmark.matMulUsingRangesBench 10 avgt 20 0.743 ± 0.016 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 100 avgt 20 777.171 ± 33.312 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 100 avgt 20 900.683 ± 122.898 us/op [info] ArrayBenchmark.matMulUsingRangesBench 100 avgt 20 1018.408 ± 39.549 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 1000 avgt 20 1108987.279 ± 53725.845 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 1000 avgt 20 1053506.326 ± 36964.233 us/op [info] ArrayBenchmark.matMulUsingRangesBench 1000 avgt 20 1075920.539 ± 25893.733 us/op `````` Maybe you can give it another try? Or someone else? Out of curiosity, I ran the 2.13 version, without the scala optimzier, on Graal EE ``````[info] ArrayBenchmark.matMulUsingWhileLoopBench 10 avgt 20 0.580 ± 0.019 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 10 avgt 20 0.864 ± 0.014 us/op [info] ArrayBenchmark.matMulUsingRangesBench 10 avgt 20 0.864 ± 0.022 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 100 avgt 20 882.833 ± 53.340 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 100 avgt 20 1004.923 ± 63.560 us/op [info] ArrayBenchmark.matMulUsingRangesBench 100 avgt 20 1061.473 ± 136.659 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 1000 avgt 20 1191383.633 ± 141781.733 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 1000 avgt 20 1135731.997 ± 101049.751 us/op [info] ArrayBenchmark.matMulUsingRangesBench 1000 avgt 20 1228080.904 ± 167866.786 us/op `````` So Graal achieves almost the same as the Scala optimizer. This brings the discussion to the VM. The original bug report is > 10 years old, I guess it measured Scala 2.7 on Java 1.5 or so. Things have improved in the meantime, the same 2.7 code might run a lot faster on a current JVM. It's also unclear if the measurements in the orignal bug report were done after proper warm-up. One problem with micro-benchmarks like these is that they don't test for what's called "profile pollution". In a real application, `Range.foreach` is potentially invoked a large number of times, which makes the lambda invocation in its body megamorphic. It looks like the benchmark here has enough `Range.foreach` calls. Adding a few more `Range.foreach` calls with different lambdas to the benchmark doesn't seem to affect the relative performance between the while and the foreach version, so that's a good sign. So I'd say we can close this bug.

### ashawley commented Jun 28, 2019

 Thanks for confirming the benchmark and sharing your insights, @lrytz. I think this can be closed. I also predict 2.13 is fine. The inconsistent results it gave can be chalked up to benchmark variance, IMO.

### lrytz commented Jul 1, 2019

 👍 Thanks for pulling on that thread, @ashawley!

closed this Jul 1, 2019

### som-snytt commented Jul 1, 2019

 I followed my bad instincts without knowing what I was doing and updated sbt and plugin: ``````sbt:for-benchmark> ++ 2.12.8 -v [info] Setting Scala version to 2.12.8 on 0 projects. [info] Switching Scala version on: [info] Excluding projects: [info] * for-benchmark (2.11.12) [info] Reapplying settings... [info] Set current project to for-benchmark (in build file:/home/amarki/projects/for-benchmark/) `````` ``````# 2.12.8 [info] # Run complete. Total time: 00:09:41 [info] [info] Benchmark (loopSize) Mode Cnt Score Error Units [info] ArrayBenchmark.matMulUsingIteratorsBench 10 avgt 20 9.421 ± 0.621 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 100 avgt 20 4144.742 ± 35.525 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 1000 avgt 20 3690096.967 ± 49486.245 us/op [info] ArrayBenchmark.matMulUsingRangesBench 10 avgt 20 3.682 ± 0.029 us/op [info] ArrayBenchmark.matMulUsingRangesBench 100 avgt 20 3617.982 ± 53.211 us/op [info] ArrayBenchmark.matMulUsingRangesBench 1000 avgt 20 3632637.830 ± 38451.647 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 10 avgt 20 1.110 ± 0.050 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 100 avgt 20 1204.156 ± 29.714 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 1000 avgt 20 1445848.245 ± 25610.794 us/op [success] Total time: 601 s, completed Jul 1, 2019, 11:02:00 AM # 2.13.0 [info] # Run complete. Total time: 00:10:10 [info] [info] Benchmark (loopSize) Mode Cnt Score Error Units [info] ArrayBenchmark.matMulUsingIteratorsBench 10 avgt 20 7.154 ± 0.204 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 100 avgt 20 4301.989 ± 358.245 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 1000 avgt 20 4097367.463 ± 270264.914 us/op [info] ArrayBenchmark.matMulUsingRangesBench 10 avgt 20 4.455 ± 0.035 us/op [info] ArrayBenchmark.matMulUsingRangesBench 100 avgt 20 4225.039 ± 248.789 us/op [info] ArrayBenchmark.matMulUsingRangesBench 1000 avgt 20 4081206.540 ± 306168.149 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 10 avgt 20 1.180 ± 0.105 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 100 avgt 20 1184.783 ± 22.531 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 1000 avgt 20 1562064.978 ± 114682.989 us/op [success] Total time: 623 s, completed Jul 1, 2019, 11:36:28 AM # 2.13.0 optimized [info] # Run complete. Total time: 00:07:08 [info] [info] Benchmark (loopSize) Mode Cnt Score Error Units [info] ArrayBenchmark.matMulUsingIteratorsBench 10 avgt 20 1.712 ± 0.113 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 100 avgt 20 1499.381 ± 82.163 us/op [info] ArrayBenchmark.matMulUsingIteratorsBench 1000 avgt 20 1461650.884 ± 96471.162 us/op [info] ArrayBenchmark.matMulUsingRangesBench 10 avgt 20 1.562 ± 0.401 us/op [info] ArrayBenchmark.matMulUsingRangesBench 100 avgt 20 1713.173 ± 15.729 us/op [info] ArrayBenchmark.matMulUsingRangesBench 1000 avgt 20 1382271.184 ± 112008.808 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 10 avgt 20 1.290 ± 0.082 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 100 avgt 20 1406.527 ± 42.015 us/op [info] ArrayBenchmark.matMulUsingWhileLoopBench 1000 avgt 20 1581735.904 ± 124449.687 us/op [success] Total time: 438 s, completed Jul 1, 2019, 11:47:10 AM ``````