From a0af563d111a457042ba51819f4d28104693f40c Mon Sep 17 00:00:00 2001
From: Kevin Moore
Date: Tue, 12 Feb 2019 13:32:15 -0800
Subject: [PATCH] Make decision tree numeric bucketizer tests less flaky (#225)

---
 .../feature/DecisionTreeNumericBucketizerTest.scala    | 13 +++++++++----
 .../DecisionTreeNumericMapBucketizerTest.scala         |  3 ++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala
index 1f29198606..b267e3c8ae 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala
@@ -67,8 +67,9 @@ class DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector,
   ).map(_.toOPVector)
 
   trait NormalData {
-    val numericData: Seq[Real] = RandomReal.normal[Real]().withProbabilityOfEmpty(0.2).limit(1000)
-    val labelData: Seq[RealNN] = RandomBinary(probabilityOfSuccess = 0.4).limit(1000).map(_.toDouble.toRealNN(0.0))
+    val total = 1000
+    val numericData: Seq[Real] = RandomReal.normal[Real]().withProbabilityOfEmpty(0.2).limit(total)
+    val labelData: Seq[RealNN] = RandomBinary(probabilityOfSuccess = 0.4).limit(total).map(_.toDouble.toRealNN(0.0))
     val (ds, numeric, label) = TestFeatureBuilder[Real, RealNN](numericData zip labelData)
     val expectedSplits = Array.empty[Double]
     lazy val modelLocation = tempDir + "/dt-buck-test-model-" + org.joda.time.DateTime.now().getMillis
@@ -83,10 +84,14 @@ class DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector,
     val expectedSplits = Array.empty[Double]
   }
 
+  // Generate uniformly spaced data so that the splits found by the decision tree will be deterministic. We still
+  // won't get splits exactly at the midpoints between data points (eg. 14.95, 35.95, 90.95) due to the way Spark
+  // calculates splits by binning. The default bins are 32, which limits the resolution of the splits.
   trait UniformData {
+    val total = 1000
     val (min, max) = (0.0, 100.0)
-    val currencyData: Seq[Currency] =
-      RandomReal.uniform[Currency](minValue = min, maxValue = max).withProbabilityOfEmpty(0.1).limit(1000)
+    val currencyData: Seq[Currency] = (0 until total).map(x => (x * max/total).toCurrency)
+
     val labelData = currencyData.map(c => {
       c.value.map {
         case v if v < 15 => 0.0
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala
index 77722eae45..90708084c6 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala
@@ -90,7 +90,8 @@ class DecisionTreeNumericMapBucketizerTest extends OpEstimatorSpec[OPVector,
     val (min, max) = (0.0, 100.0)
     val currencies: RandomReal[Currency] =
       RandomReal.uniform[Currency](minValue = min, maxValue = max).withProbabilityOfEmpty(0.1)
-    val correlated = currencies.limit(total)
+    val correlated: Seq[Currency] = (0 until total).map(x => (x * max/total).toCurrency)
+
     val labelData = correlated.map(c => {
       c.value.map {
         case v if v < 15 => 0.0