From a0af563d111a457042ba51819f4d28104693f40c Mon Sep 17 00:00:00 2001
From: Kevin Moore
Date: Tue, 12 Feb 2019 13:32:15 -0800
Subject: [PATCH] Make decision tree numeric bucketizer tests less flaky (#225)

---
 .../feature/DecisionTreeNumericBucketizerTest.scala    | 13 +++++++++----
 .../DecisionTreeNumericMapBucketizerTest.scala         |  3 ++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala
index 1f29198606..b267e3c8ae 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala
@@ -67,8 +67,9 @@ class DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector,
   ).map(_.toOPVector)
 
   trait NormalData {
-    val numericData: Seq[Real] = RandomReal.normal[Real]().withProbabilityOfEmpty(0.2).limit(1000)
-    val labelData: Seq[RealNN] = RandomBinary(probabilityOfSuccess = 0.4).limit(1000).map(_.toDouble.toRealNN(0.0))
+    val total = 1000
+    val numericData: Seq[Real] = RandomReal.normal[Real]().withProbabilityOfEmpty(0.2).limit(total)
+    val labelData: Seq[RealNN] = RandomBinary(probabilityOfSuccess = 0.4).limit(total).map(_.toDouble.toRealNN(0.0))
     val (ds, numeric, label) = TestFeatureBuilder[Real, RealNN](numericData zip labelData)
     val expectedSplits = Array.empty[Double]
     lazy val modelLocation = tempDir + "/dt-buck-test-model-" + org.joda.time.DateTime.now().getMillis
@@ -83,10 +84,14 @@ class DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector,
     val expectedSplits = Array.empty[Double]
   }
 
+  // Generate uniformly spaced data so that the splits found by the decision tree will be deterministic. We still
+  // won't get splits exactly at the midpoints between data points (eg. 14.95, 35.95, 90.95) due to the way Spark
+  // calculates splits by binning. The default bins are 32, which limits the resolution of the splits.
   trait UniformData {
+    val total = 1000
     val (min, max) = (0.0, 100.0)
-    val currencyData: Seq[Currency] =
-      RandomReal.uniform[Currency](minValue = min, maxValue = max).withProbabilityOfEmpty(0.1).limit(1000)
+    val currencyData: Seq[Currency] = (0 until total).map(x => (x * max/total).toCurrency)
+
     val labelData = currencyData.map(c => {
       c.value.map {
         case v if v < 15 => 0.0
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala
index 77722eae45..90708084c6 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala
@@ -90,7 +90,8 @@ class DecisionTreeNumericMapBucketizerTest extends OpEstimatorSpec[OPVector,
     val (min, max) = (0.0, 100.0)
     val currencies: RandomReal[Currency] =
       RandomReal.uniform[Currency](minValue = min, maxValue = max).withProbabilityOfEmpty(0.1)
-    val correlated = currencies.limit(total)
+    val correlated: Seq[Currency] = (0 until total).map(x => (x * max/total).toCurrency)
+
     val labelData = correlated.map(c => {
       c.value.map {
         case v if v < 15 => 0.0