From fe8e6df69868ed4d42a5aa9f70187e9c523bd60d Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Tue, 25 Jun 2019 10:10:01 -0700 Subject: [PATCH 01/27] move PR to a branch on Tmog --- .../com/salesforce/op/ModelInsights.scala | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index b5e85c1e1e..7ca1b09a68 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -487,7 +487,7 @@ case object ModelInsights { ModelInsights( label = getLabelSummary(label, checkerSummary), features = getFeatureInsights(vectorInput, checkerSummary, model, rawFeatures, - blacklistedFeatures, blacklistedMapKeys, rawFeatureFilterResults), + blacklistedFeatures, blacklistedMapKeys, rawFeatureFilterResults, label), selectedModelInfo = getModelInfo(model), trainingParams = trainingParams, stageInfo = RawFeatureFilterConfig.toStageInfo(rawFeatureFilterResults.rawFeatureFilterConfig) @@ -537,7 +537,8 @@ case object ModelInsights { rawFeatures: Array[features.OPFeature], blacklistedFeatures: Array[features.OPFeature], blacklistedMapKeys: Map[String, Set[String]], - rawFeatureFilterResults: RawFeatureFilterResults = RawFeatureFilterResults() + rawFeatureFilterResults: RawFeatureFilterResults = RawFeatureFilterResults(), + label: LabelSummary ): Seq[FeatureInsights] = { val featureInsights = (vectorInfo, summary) match { case (Some(v), Some(s)) => @@ -557,6 +558,25 @@ case object ModelInsights { case _ => None } val keptIndex = indexInToIndexKept.get(h.index) + val featureStd = getIfExists(h.index, s.featuresStatistics.variance).getOrElse(1.0) + val sparkFtrContrib = keptIndex + .map(i => contributions.map(_.applyOrElse(i, (_: Int) => 0.0))).getOrElse(Seq.empty) + val labelStd = label.distribution.getOrElse(1.0) match { + case Continuous(_, _, _, variance) => math.sqrt(variance) + // for (binary) logistic regression we only need to multiply by feature standard deviation + case Discrete(domain, prob) => + def computeVariance(domain: Seq[String], prob: Seq[Double]): Double = { + val floatDomain = domain.map(_.toDouble) + val sqfloatDomain = floatDomain.map(math.pow(_, 2)) + val weighted = (floatDomain, prob).zipped map (_ * _) + val sqweighted = (sqfloatDomain, prob).zipped map (_ * _) + val mean = weighted.sum + return sqweighted.sum - mean + } + computeVariance(domain, prob) + } + // TODO: throw exception if (labelStd == 0) + val descaledFtrContrib = sparkFtrContrib.updated(0, sparkFtrContrib.head * featureStd / labelStd) h.parentFeatureOrigins -> Insights( @@ -578,8 +598,7 @@ case object ModelInsights { getIfExists(idx, s.categoricalStats(groupIdx).contingencyMatrix) case _ => Map.empty[String, Double] }, - contribution = - keptIndex.map(i => contributions.map(_.applyOrElse(i, (_: Int) => 0.0))).getOrElse(Seq.empty), + contribution = sparkFtrContrib, min = getIfExists(h.index, s.featuresStatistics.min), max = getIfExists(h.index, s.featuresStatistics.max), mean = getIfExists(h.index, s.featuresStatistics.mean), From 4ad6204138b0e712bb73f4c14b45c32bde33e886 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Tue, 25 Jun 2019 11:07:32 -0700 Subject: [PATCH 02/27] add condition to descale only when it is Linear / Logistic regression & the features are standardized during training --- .../scala/com/salesforce/op/ModelInsights.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 7ca1b09a68..da90c26721 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -561,6 +561,7 @@ case object ModelInsights { val featureStd = getIfExists(h.index, s.featuresStatistics.variance).getOrElse(1.0) val sparkFtrContrib = keptIndex .map(i => contributions.map(_.applyOrElse(i, (_: Int) => 0.0))).getOrElse(Seq.empty) + val LRStandardization = checkLRStandardization(model).getOrElse(false) val labelStd = label.distribution.getOrElse(1.0) match { case Continuous(_, _, _, variance) => math.sqrt(variance) // for (binary) logistic regression we only need to multiply by feature standard deviation @@ -598,7 +599,7 @@ case object ModelInsights { getIfExists(idx, s.categoricalStats(groupIdx).contingencyMatrix) case _ => Map.empty[String, Double] }, - contribution = sparkFtrContrib, + contribution = if (LRStandardization) descaledFtrContrib else sparkFtrContrib, min = getIfExists(h.index, s.featuresStatistics.min), max = getIfExists(h.index, s.featuresStatistics.max), mean = getIfExists(h.index, s.featuresStatistics.mean), @@ -666,6 +667,17 @@ case object ModelInsights { } } + private[op] def checkLRStandardization(model: Option[Model[_]]): Option[Boolean] = { + val stage = model.flatMap { + case m: SparkWrapperParams[_] => m.getSparkMlStage() + case _ => None + } + stage.collect { + case m: LogisticRegressionModel | LinearRegressionModel => true && m.getStandardization + case _ => false + } + } + private[op] def getModelContributions (model: Option[Model[_]], featureVectorSize: Option[Int] = None): Seq[Seq[Double]] = { val stage = model.flatMap { From 1e2bc8104403535de22b88674f218d9922e5260c Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Tue, 25 Jun 2019 11:35:53 -0700 Subject: [PATCH 03/27] fix modelinsighttest --- core/src/main/scala/com/salesforce/op/ModelInsights.scala | 5 +++-- .../src/test/scala/com/salesforce/op/ModelInsightsTest.scala | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index da90c26721..184b3a1bbe 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -487,7 +487,7 @@ case object ModelInsights { ModelInsights( label = getLabelSummary(label, checkerSummary), features = getFeatureInsights(vectorInput, checkerSummary, model, rawFeatures, - blacklistedFeatures, blacklistedMapKeys, rawFeatureFilterResults, label), + blacklistedFeatures, blacklistedMapKeys, rawFeatureFilterResults, getLabelSummary(label, checkerSummary)), selectedModelInfo = getModelInfo(model), trainingParams = trainingParams, stageInfo = RawFeatureFilterConfig.toStageInfo(rawFeatureFilterResults.rawFeatureFilterConfig) @@ -673,7 +673,8 @@ case object ModelInsights { case _ => None } stage.collect { - case m: LogisticRegressionModel | LinearRegressionModel => true && m.getStandardization + case m: LogisticRegressionModel => true && m.getStandardization + case m: LinearRegressionModel => true && m.getStandardization case _ => false } } diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 6a5dd2a140..f6925dfb2a 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -509,9 +509,11 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou } it should "correctly extract the FeatureInsights from the sanity checker summary and vector metadata" in { + val labelSum = ModelInsights.getLabelSummary(Option(lbl), Option(summary)) + val featureInsights = ModelInsights.getFeatureInsights( Option(meta), Option(summary), None, Array(f1, f0), Array.empty, Map.empty[String, Set[String]], - RawFeatureFilterResults() + RawFeatureFilterResults(), labelSum ) featureInsights.size shouldBe 2 From 5cbe132db5d87cc6f7b2b63c20d171cf9b414ede Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Tue, 25 Jun 2019 13:14:52 -0700 Subject: [PATCH 04/27] only compute descaled contrib if a model is present & fits our criteria --- core/src/main/scala/com/salesforce/op/ModelInsights.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 184b3a1bbe..109412486b 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -577,7 +577,6 @@ case object ModelInsights { computeVariance(domain, prob) } // TODO: throw exception if (labelStd == 0) - val descaledFtrContrib = sparkFtrContrib.updated(0, sparkFtrContrib.head * featureStd / labelStd) h.parentFeatureOrigins -> Insights( @@ -599,7 +598,11 @@ case object ModelInsights { getIfExists(idx, s.categoricalStats(groupIdx).contingencyMatrix) case _ => Map.empty[String, Double] }, - contribution = if (LRStandardization) descaledFtrContrib else sparkFtrContrib, + contribution = + if (LRStandardization) { + sparkFtrContrib.updated(0, sparkFtrContrib.head * featureStd / labelStd) + } + else sparkFtrContrib, min = getIfExists(h.index, s.featuresStatistics.min), max = getIfExists(h.index, s.featuresStatistics.max), mean = getIfExists(h.index, s.featuresStatistics.mean), From 673dad8d9721bc9c07ea622f20baf5c8cebe4ae9 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Tue, 25 Jun 2019 14:57:07 -0700 Subject: [PATCH 05/27] fix test failure with a check for empty list of feature contribution --- core/src/main/scala/com/salesforce/op/ModelInsights.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 109412486b..bffd5d8682 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -599,7 +599,7 @@ case object ModelInsights { case _ => Map.empty[String, Double] }, contribution = - if (LRStandardization) { + if (LRStandardization && sparkFtrContrib.nonEmpty) { sparkFtrContrib.updated(0, sparkFtrContrib.head * featureStd / labelStd) } else sparkFtrContrib, From 4c4426324f061d0b4518677aec68dfdc5565307b Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 27 Jun 2019 11:17:43 -0700 Subject: [PATCH 06/27] addressing comments --- .../com/salesforce/op/ModelInsights.scala | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index bffd5d8682..2b59c680e0 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -562,22 +562,23 @@ case object ModelInsights { val sparkFtrContrib = keptIndex .map(i => contributions.map(_.applyOrElse(i, (_: Int) => 0.0))).getOrElse(Seq.empty) val LRStandardization = checkLRStandardization(model).getOrElse(false) - val labelStd = label.distribution.getOrElse(1.0) match { - case Continuous(_, _, _, variance) => math.sqrt(variance) - // for (binary) logistic regression we only need to multiply by feature standard deviation - case Discrete(domain, prob) => - def computeVariance(domain: Seq[String], prob: Seq[Double]): Double = { - val floatDomain = domain.map(_.toDouble) - val sqfloatDomain = floatDomain.map(math.pow(_, 2)) - val weighted = (floatDomain, prob).zipped map (_ * _) - val sqweighted = (sqfloatDomain, prob).zipped map (_ * _) - val mean = weighted.sum - return sqweighted.sum - mean - } - computeVariance(domain, prob) - } - // TODO: throw exception if (labelStd == 0) + val labelStd = label.distribution match { + case Some(Continuous(_, _, _, variance)) => math.sqrt(variance) + case Some(Discrete(domain, prob)) => + val (weighted, sqweighted) = (domain zip prob).foldLeft((0.0, 0.0)) { + case ((weightSum, sqweightSum), (d, p)) => + val floatD = d.toDouble + val weight = floatD * p + val sqweight = floatD * weight + (weightSum + weight, sqweightSum + sqweight) + } + sqweighted - weighted + case Some(d) => throw new Exception("Unsupported distribution type for the label") + case None => throw new Exception("Label does not exist, please check your data") + } + if (labelStd == 0) throw new Exception("The standard deviation of the label is zero, " + + "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") h.parentFeatureOrigins -> Insights( derivedFeatureName = h.columnName, @@ -671,13 +672,11 @@ case object ModelInsights { } private[op] def checkLRStandardization(model: Option[Model[_]]): Option[Boolean] = { - val stage = model.flatMap { - case m: SparkWrapperParams[_] => m.getSparkMlStage() - case _ => None - } - stage.collect { - case m: LogisticRegressionModel => true && m.getStandardization - case m: LinearRegressionModel => true && m.getStandardization + for { + m : SparkWrapperParams[_] <- model + stage <- m.getSparkMlStage() + } yield stage match { + case s: LogisticRegressionModel | LinearRegressionModel => s.getStandardization case _ => false } } From d3fa01b74ce41b1027bd92376a9a8f0eb5151321 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 27 Jun 2019 13:11:25 -0700 Subject: [PATCH 07/27] more comment addressing --- .../com/salesforce/op/ModelInsights.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 2b59c680e0..5b36bea84c 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -567,17 +567,17 @@ case object ModelInsights { case Some(Continuous(_, _, _, variance)) => math.sqrt(variance) case Some(Discrete(domain, prob)) => val (weighted, sqweighted) = (domain zip prob).foldLeft((0.0, 0.0)) { - case ((weightSum, sqweightSum), (d, p)) => - val floatD = d.toDouble - val weight = floatD * p - val sqweight = floatD * weight - (weightSum + weight, sqweightSum + sqweight) - } + case ((weightSum, sqweightSum), (d, p)) => + val floatD = d.toDouble + val weight = floatD * p + val sqweight = floatD * weight + (weightSum + weight, sqweightSum + sqweight) + } sqweighted - weighted - case Some(d) => throw new Exception("Unsupported distribution type for the label") - case None => throw new Exception("Label does not exist, please check your data") + case Some(d) => throw new RuntimeException("Unsupported distribution type for the label") + case None => throw new RuntimeException("Label does not exist, please check your data") } - if (labelStd == 0) throw new Exception("The standard deviation of the label is zero, " + + if (labelStd == 0) throw new RuntimeException("The standard deviation of the label is zero, " + "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") h.parentFeatureOrigins -> Insights( From 7a882f2872d86024025fea42edf655a5b55d3540 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 27 Jun 2019 13:39:44 -0700 Subject: [PATCH 08/27] test in progress, still broken --- .../com/salesforce/op/ModelInsights.scala | 12 +++-- .../com/salesforce/op/ModelInsightsTest.scala | 46 ++++++++++++++++++- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 5b36bea84c..93f677faf7 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -672,11 +672,13 @@ case object ModelInsights { } private[op] def checkLRStandardization(model: Option[Model[_]]): Option[Boolean] = { - for { - m : SparkWrapperParams[_] <- model - stage <- m.getSparkMlStage() - } yield stage match { - case s: LogisticRegressionModel | LinearRegressionModel => s.getStandardization + val stage = model.flatMap { + case m: SparkWrapperParams[_] => m.getSparkMlStage() + case _ => None + } + stage.collect { + case m: LogisticRegressionModel => m.getStandardization + case m: LinearRegressionModel => m.getStandardization case _ => false } } diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index f6925dfb2a..fa0de767bf 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -40,12 +40,13 @@ import com.salesforce.op.stages.impl.selector.ModelSelectorNames.EstimatorType import com.salesforce.op.stages.impl.selector.SelectedModel import com.salesforce.op.stages.impl.selector.ValidationType._ import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter} -import com.salesforce.op.test.PassengerSparkFixtureTest +import com.salesforce.op.test.{PassengerSparkFixtureTest, TestFeatureBuilder} +import com.salesforce.op.testkit.{RandomReal} import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tuning.ParamGridBuilder import org.junit.runner.RunWith -import org.scalactic.Equality +import com.salesforce.op.features.types.Real import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @@ -70,6 +71,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val models = Seq(lr -> lrParams).asInstanceOf[Seq[(EstimatorType, Array[ParamMap])]] val xgbClassifier = new OpXGBoostClassifier().setSilent(1).setSeed(42L) + val logRegression = new OpLogisticRegression() val xgbRegressor = new OpXGBoostRegressor().setSilent(1).setSeed(42L) val xgbClassifierPred = xgbClassifier.setInput(label, features).getOutput() val xgbRegressorPred = xgbRegressor.setInput(label, features).getOutput() @@ -95,6 +97,46 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou .setInput(label, features) .getOutput() + def std(x: Seq[Real]): Double = { + require(x.size > 1) + val mean = x.map(_.toDouble.get).sum / x.size + Math.sqrt(x.map(i => Math.pow(i.toDouble.get - mean, 2)).sum / (x.size - 1)) + } + val uniformGen = RandomReal.normal[Real](0,10) + uniformGen.reset(42) + val uniformPredictor: Seq[Real] = uniformGen.limit(10) + val uniformPredictorStd = std(uniformPredictor) + + val normGen = RandomReal.uniform[Real](minValue = 10000.0, maxValue = 20000.0) + normGen.reset(42) + val normalPredictor: Seq[Real] = normGen.limit(10) + val normalPredictorStd = std(normalPredictor) + + val linearRegLabel = (normalPredictor, uniformPredictor).zipped.map(_.toDouble.get * 5000 + _.toDouble.get).map(RealNN(_)) + val linearRegLabelStd = std(linearRegLabel) + + val generatedData = uniformPredictor.zip(normalPredictor).zip(linearRegLabel) .map { + case ((f1, f2), label) => (f1, f2, label) + } + val (rawDF, rawUnif, rawNorm, rawLabel) = TestFeatureBuilder("feature1", "feature2", "label", generatedData) + val labelData = rawLabel.copy(isResponse = true) + val genFeatureVector = rawUnif.vectorize(fillValue = 0, fillWithMean = true, trackNulls = false, others = Array(rawNorm)) + + val unstandardizedLinReg = new OpLinearRegression().setStandardization(false) + .setInput(labelData, genFeatureVector).getOutput() + + val standardizedLinReg = new OpLinearRegression().setStandardization(true) + .setInput(labelData, genFeatureVector).getOutput() + + lazy val linRegWorkFlow = new OpWorkflow().setResultFeatures(standardizedLinReg, unstandardizedLinReg) + lazy val linRegModel = linRegWorkFlow.train() + val standardizedFtImp = linRegModel.modelInsights(standardizedLinReg).features.map(_.derivedFeatures.map(_.contribution)) + val unstandardizedFtImp = linRegModel.modelInsights(unstandardizedLinReg).features.map(_.derivedFeatures.map(_.contribution)) + it should "correctly return the descaled coefficient for linear regression, " + + "regardless whether features were standardized or not" in { + standardizedFtImp shouldEqual unstandardizedFtImp + } + val params = new OpParams() lazy val workflow = new OpWorkflow().setResultFeatures(predLin, pred).setParameters(params).setReader(dataReader) From e4da5abfd11bdef65b9b2eeaf991318731a1743c Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 27 Jun 2019 13:55:23 -0700 Subject: [PATCH 09/27] seems to be working --- core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index fa0de767bf..b2f44c1837 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -128,7 +128,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val standardizedLinReg = new OpLinearRegression().setStandardization(true) .setInput(labelData, genFeatureVector).getOutput() - lazy val linRegWorkFlow = new OpWorkflow().setResultFeatures(standardizedLinReg, unstandardizedLinReg) + lazy val linRegWorkFlow = new OpWorkflow().setResultFeatures(standardizedLinReg, unstandardizedLinReg).setInputDataset(rawDF) lazy val linRegModel = linRegWorkFlow.train() val standardizedFtImp = linRegModel.modelInsights(standardizedLinReg).features.map(_.derivedFeatures.map(_.contribution)) val unstandardizedFtImp = linRegModel.modelInsights(unstandardizedLinReg).features.map(_.derivedFeatures.map(_.contribution)) From 0767e84a64ec22b1dc0c7a2d1972e8fce3b187b3 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Fri, 28 Jun 2019 16:27:23 -0700 Subject: [PATCH 10/27] first version of test --- .../com/salesforce/op/ModelInsightsTest.scala | 76 +++++++++++-------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index b2f44c1837..6ed515f64a 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -35,13 +35,13 @@ import com.salesforce.op.features.{Feature, FeatureDistributionType} import com.salesforce.op.filters._ import com.salesforce.op.stages.impl.classification._ import com.salesforce.op.stages.impl.preparators._ -import com.salesforce.op.stages.impl.regression.{OpLinearRegression, OpXGBoostRegressor, RegressionModelSelector} +import com.salesforce.op.stages.impl.regression.{OpLinearRegression, OpLinearRegressionModel, OpXGBoostRegressor, RegressionModelSelector} import com.salesforce.op.stages.impl.selector.ModelSelectorNames.EstimatorType import com.salesforce.op.stages.impl.selector.SelectedModel import com.salesforce.op.stages.impl.selector.ValidationType._ import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter} import com.salesforce.op.test.{PassengerSparkFixtureTest, TestFeatureBuilder} -import com.salesforce.op.testkit.{RandomReal} +import com.salesforce.op.testkit.RandomReal import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tuning.ParamGridBuilder @@ -49,7 +49,6 @@ import org.junit.runner.RunWith import com.salesforce.op.features.types.Real import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner - import scala.util.{Failure, Success} @RunWith(classOf[JUnitRunner]) @@ -97,45 +96,42 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou .setInput(label, features) .getOutput() - def std(x: Seq[Real]): Double = { - require(x.size > 1) - val mean = x.map(_.toDouble.get).sum / x.size - Math.sqrt(x.map(i => Math.pow(i.toDouble.get - mean, 2)).sum / (x.size - 1)) - } - val uniformGen = RandomReal.normal[Real](0,10) - uniformGen.reset(42) - val uniformPredictor: Seq[Real] = uniformGen.limit(10) - val uniformPredictorStd = std(uniformPredictor) - - val normGen = RandomReal.uniform[Real](minValue = 10000.0, maxValue = 20000.0) - normGen.reset(42) - val normalPredictor: Seq[Real] = normGen.limit(10) - val normalPredictorStd = std(normalPredictor) + val smallFeatureVariance = 10.0 + val bigFeatureVariance= 100.0 + val smallNorm = RandomReal.normal[Real](0, smallFeatureVariance).limit(1000) + val bigNorm = RandomReal.normal[Real](10000.0, bigFeatureVariance).limit(1000) + val linearRegLabel = (smallNorm, bigNorm) + .zipped.map(_.toDouble.get * 5000 + _.toDouble.get).map(RealNN(_)) - val linearRegLabel = (normalPredictor, uniformPredictor).zipped.map(_.toDouble.get * 5000 + _.toDouble.get).map(RealNN(_)) - val linearRegLabelStd = std(linearRegLabel) + // label is a sum of two scaled Normals, hence we also know its standard deviation + val labelStd = math.sqrt(5000 * 5000 * smallFeatureVariance + bigFeatureVariance) - val generatedData = uniformPredictor.zip(normalPredictor).zip(linearRegLabel) .map { + val generatedData = smallNorm.zip(bigNorm).zip(linearRegLabel).map { case ((f1, f2), label) => (f1, f2, label) } - val (rawDF, rawUnif, rawNorm, rawLabel) = TestFeatureBuilder("feature1", "feature2", "label", generatedData) + val (rawDF, rawSmall, rawBig, rawLabel) = TestFeatureBuilder("feature1", "feature2", "label", generatedData) val labelData = rawLabel.copy(isResponse = true) - val genFeatureVector = rawUnif.vectorize(fillValue = 0, fillWithMean = true, trackNulls = false, others = Array(rawNorm)) + val genFeatureVector = rawSmall + .vectorize(fillValue = 0, fillWithMean = true, trackNulls = false, others = Array(rawBig)) - val unstandardizedLinReg = new OpLinearRegression().setStandardization(false) - .setInput(labelData, genFeatureVector).getOutput() + val checkedFeatures = labelData.sanityCheck(genFeatureVector, removeBadFeatures = false) - val standardizedLinReg = new OpLinearRegression().setStandardization(true) - .setInput(labelData, genFeatureVector).getOutput() + val unstandardizedpred = new OpLinearRegression().setStandardization(false) + .setInput(labelData, checkedFeatures).getOutput() - lazy val linRegWorkFlow = new OpWorkflow().setResultFeatures(standardizedLinReg, unstandardizedLinReg).setInputDataset(rawDF) + val standardizedpred = new OpLinearRegression().setStandardization(true) + .setInput(labelData, checkedFeatures).getOutput() + + lazy val linRegWorkFlow = new OpWorkflow() + .setResultFeatures(unstandardizedpred, standardizedpred).setInputDataset(rawDF) lazy val linRegModel = linRegWorkFlow.train() - val standardizedFtImp = linRegModel.modelInsights(standardizedLinReg).features.map(_.derivedFeatures.map(_.contribution)) - val unstandardizedFtImp = linRegModel.modelInsights(unstandardizedLinReg).features.map(_.derivedFeatures.map(_.contribution)) - it should "correctly return the descaled coefficient for linear regression, " + - "regardless whether features were standardized or not" in { - standardizedFtImp shouldEqual unstandardizedFtImp - } + lazy val standardizedLinModel = Option(linRegModel.getOriginStageOf(standardizedpred).asInstanceOf[OpLinearRegressionModel]) + val unstandardizedFtImp = linRegModel.modelInsights(unstandardizedpred).features.map(_.derivedFeatures.map(_.contribution)) + val standardizedFtImp = linRegModel.modelInsights(standardizedpred).features.map(_.derivedFeatures.map(_.contribution)) + val descaledsmallCoeff = standardizedFtImp.flatten.flatten.head + val originalsmallCoeff = unstandardizedFtImp.flatten.flatten.head + val descaledbigCoeff = standardizedFtImp.flatten.flatten.last + val orginalbigCoeff = unstandardizedFtImp.flatten.flatten.last val params = new OpParams() @@ -696,4 +692,18 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou f.cramersV.isEmpty shouldBe true } } + + it should "correctly return the descaled coefficient for linear regression, " + + "when standardization is on" in { + + // Since 5000 & 1 are always returned as the coefficients of the model trained on unstandardized data + // and we can analytically calculate the scaled version of them by the linear regression formula, + // the coefficients of the model trained on standardized data should be within a small distance of the analytical formula. + + // difference between the real coefficient and the analytical formula + val absError = math.abs(orginalbigCoeff * math.sqrt(smallFeatureVariance) / labelStd - descaledbigCoeff) + val absError2 = math.abs(originalsmallCoeff * math.sqrt(bigFeatureVariance) / labelStd - descaledsmallCoeff) + absError < 2.5 shouldBe true + absError2 < 0.01 shouldBe true + } } From 7eaa2093a8c68305bde7710604d523458a4e09f3 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Fri, 28 Jun 2019 16:42:31 -0700 Subject: [PATCH 11/27] fix scala style --- .../com/salesforce/op/ModelInsightsTest.scala | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 6ed515f64a..38a6959a96 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -97,7 +97,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou .getOutput() val smallFeatureVariance = 10.0 - val bigFeatureVariance= 100.0 + val bigFeatureVariance = 100.0 val smallNorm = RandomReal.normal[Real](0, smallFeatureVariance).limit(1000) val bigNorm = RandomReal.normal[Real](10000.0, bigFeatureVariance).limit(1000) val linearRegLabel = (smallNorm, bigNorm) @@ -125,9 +125,12 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou lazy val linRegWorkFlow = new OpWorkflow() .setResultFeatures(unstandardizedpred, standardizedpred).setInputDataset(rawDF) lazy val linRegModel = linRegWorkFlow.train() - lazy val standardizedLinModel = Option(linRegModel.getOriginStageOf(standardizedpred).asInstanceOf[OpLinearRegressionModel]) - val unstandardizedFtImp = linRegModel.modelInsights(unstandardizedpred).features.map(_.derivedFeatures.map(_.contribution)) - val standardizedFtImp = linRegModel.modelInsights(standardizedpred).features.map(_.derivedFeatures.map(_.contribution)) + lazy val standardizedLinModel = Option(linRegModel.getOriginStageOf(standardizedpred) + .asInstanceOf[OpLinearRegressionModel]) + val unstandardizedFtImp = linRegModel.modelInsights(unstandardizedpred) + .features.map(_.derivedFeatures.map(_.contribution)) + val standardizedFtImp = linRegModel.modelInsights(standardizedpred) + .features.map(_.derivedFeatures.map(_.contribution)) val descaledsmallCoeff = standardizedFtImp.flatten.flatten.head val originalsmallCoeff = unstandardizedFtImp.flatten.flatten.head val descaledbigCoeff = standardizedFtImp.flatten.flatten.last @@ -696,9 +699,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou it should "correctly return the descaled coefficient for linear regression, " + "when standardization is on" in { - // Since 5000 & 1 are always returned as the coefficients of the model trained on unstandardized data - // and we can analytically calculate the scaled version of them by the linear regression formula, - // the coefficients of the model trained on standardized data should be within a small distance of the analytical formula. + // Since 5000 & 1 are always returned as the coefficients of the model + // trained on unstandardized data and we can analytically calculate + // the scaled version of them by the linear regression formula, the coefficients + // of the model trained on standardized data should be within a small distance of the analytical formula. // difference between the real coefficient and the analytical formula val absError = math.abs(orginalbigCoeff * math.sqrt(smallFeatureVariance) / labelStd - descaledbigCoeff) From e1bb482e40c7b4bb06c9303394fa9936db285a99 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Mon, 8 Jul 2019 09:57:31 -0700 Subject: [PATCH 12/27] addressing comments --- .../com/salesforce/op/ModelInsights.scala | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 93f677faf7..9a37a581ab 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -484,10 +484,12 @@ case object ModelInsights { s" to fill in model insights" ) + val labelSummary = getLabelSummary(label, checkerSummary) + ModelInsights( - label = getLabelSummary(label, checkerSummary), + label = labelSummary, features = getFeatureInsights(vectorInput, checkerSummary, model, rawFeatures, - blacklistedFeatures, blacklistedMapKeys, rawFeatureFilterResults, getLabelSummary(label, checkerSummary)), + blacklistedFeatures, blacklistedMapKeys, rawFeatureFilterResults, labelSummary), selectedModelInfo = getModelInfo(model), trainingParams = trainingParams, stageInfo = RawFeatureFilterConfig.toStageInfo(rawFeatureFilterResults.rawFeatureFilterConfig) @@ -562,9 +564,15 @@ case object ModelInsights { val sparkFtrContrib = keptIndex .map(i => contributions.map(_.applyOrElse(i, (_: Int) => 0.0))).getOrElse(Seq.empty) val LRStandardization = checkLRStandardization(model).getOrElse(false) - + val defaultLabelStd = 1.0 val labelStd = label.distribution match { - case Some(Continuous(_, _, _, variance)) => math.sqrt(variance) + case Some(Continuous(_, _, _, variance)) => + if (variance == 0) { + log.info("The standard deviation of the label is zero, " + + "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") + defaultLabelStd + } + else math.sqrt(variance) case Some(Discrete(domain, prob)) => val (weighted, sqweighted) = (domain zip prob).foldLeft((0.0, 0.0)) { case ((weightSum, sqweightSum), (d, p)) => @@ -573,12 +581,22 @@ case object ModelInsights { val sqweight = floatD * weight (weightSum + weight, sqweightSum + sqweight) } - sqweighted - weighted - case Some(d) => throw new RuntimeException("Unsupported distribution type for the label") - case None => throw new RuntimeException("Label does not exist, please check your data") + if (sqweighted == weighted) { + log.info("The standard deviation of the label is zero, " + + "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") + defaultLabelStd + } + else sqweighted - weighted + case Some(_) => { + log.info("Performing weight descaling on an unsupported distribution") + defaultLabelStd + } + case None => { + log.info("Label does not exist, please check your data") + defaultLabelStd + } } - if (labelStd == 0) throw new RuntimeException("The standard deviation of the label is zero, " + - "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") + h.parentFeatureOrigins -> Insights( derivedFeatureName = h.columnName, From cb798db5cd4ae0a94cb13c5cd105e556aa0f6637 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Tue, 9 Jul 2019 13:17:37 -0700 Subject: [PATCH 13/27] change log to warn --- core/src/main/scala/com/salesforce/op/ModelInsights.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 9a37a581ab..abbefec395 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -568,7 +568,7 @@ case object ModelInsights { val labelStd = label.distribution match { case Some(Continuous(_, _, _, variance)) => if (variance == 0) { - log.info("The standard deviation of the label is zero, " + + log.warn("The standard deviation of the label is zero, " + "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") defaultLabelStd } @@ -582,17 +582,17 @@ case object ModelInsights { (weightSum + weight, sqweightSum + sqweight) } if (sqweighted == weighted) { - log.info("The standard deviation of the label is zero, " + + log.warn("The standard deviation of the label is zero, " + "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") defaultLabelStd } else sqweighted - weighted case Some(_) => { - log.info("Performing weight descaling on an unsupported distribution") + log.warn("Performing weight descaling on an unsupported distribution") defaultLabelStd } case None => { - log.info("Label does not exist, please check your data") + log.warn("Label does not exist, please check your data") defaultLabelStd } } From 606b6e1f4d8c4cfb475386f749ad7fc8610ec88e Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 11 Jul 2019 11:30:25 -0700 Subject: [PATCH 14/27] fix an error in calculating standard deviation for discrete distribution --- .../com/salesforce/op/ModelInsights.scala | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index abbefec395..a2ca86920e 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -574,19 +574,26 @@ case object ModelInsights { } else math.sqrt(variance) case Some(Discrete(domain, prob)) => - val (weighted, sqweighted) = (domain zip prob).foldLeft((0.0, 0.0)) { - case ((weightSum, sqweightSum), (d, p)) => + // mean = sum (x_i * p_i) + val mean = (domain zip prob).foldLeft(0.0) { + case (weightSum, (d, p)) => + val floatD = d.toDouble + val weight = floatD * p + (weightSum + weight) + } + // variance = sum (x_i - mu)^2 * p_i + val discreteVariance = (domain zip prob).foldLeft(0.0) { + case (sqweightSum, (d, p)) => val floatD = d.toDouble - val weight = floatD * p - val sqweight = floatD * weight - (weightSum + weight, sqweightSum + sqweight) + val sqweight = (floatD - mean) * (floatD - mean) * p + (sqweightSum + sqweight) } - if (sqweighted == weighted) { + if (discreteVariance == 0) { log.warn("The standard deviation of the label is zero, " + "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") defaultLabelStd } - else sqweighted - weighted + else math.sqrt(discreteVariance) case Some(_) => { log.warn("Performing weight descaling on an unsupported distribution") defaultLabelStd From 4c432e6e8b1240be5c070dd15651cddce5ed1ccc Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 11 Jul 2019 13:02:53 -0700 Subject: [PATCH 15/27] correctly pull out standard deviation for each feature --- core/src/main/scala/com/salesforce/op/ModelInsights.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index a2ca86920e..1188f25cf1 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -560,7 +560,7 @@ case object ModelInsights { case _ => None } val keptIndex = indexInToIndexKept.get(h.index) - val featureStd = getIfExists(h.index, s.featuresStatistics.variance).getOrElse(1.0) + val featureStd = math.sqrt(getIfExists(h.index, s.featuresStatistics.variance).getOrElse(1.0)) val sparkFtrContrib = keptIndex .map(i => contributions.map(_.applyOrElse(i, (_: Int) => 0.0))).getOrElse(Seq.empty) val LRStandardization = checkLRStandardization(model).getOrElse(false) From 0e6349121809f7bdd341948eec4ffa325d89ed58 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Fri, 12 Jul 2019 16:28:54 -0700 Subject: [PATCH 16/27] descale entire contribution vector & clearly separate out between linear regression and logistic regression cases --- .../com/salesforce/op/ModelInsights.scala | 29 +++++++++++++------ .../com/salesforce/op/ModelInsightsTest.scala | 2 +- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 1188f25cf1..831e0fd850 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -563,7 +563,6 @@ case object ModelInsights { val featureStd = math.sqrt(getIfExists(h.index, s.featuresStatistics.variance).getOrElse(1.0)) val sparkFtrContrib = keptIndex .map(i => contributions.map(_.applyOrElse(i, (_: Int) => 0.0))).getOrElse(Seq.empty) - val LRStandardization = checkLRStandardization(model).getOrElse(false) val defaultLabelStd = 1.0 val labelStd = label.distribution match { case Some(Continuous(_, _, _, variance)) => @@ -625,10 +624,8 @@ case object ModelInsights { case _ => Map.empty[String, Double] }, contribution = - if (LRStandardization && sparkFtrContrib.nonEmpty) { - sparkFtrContrib.updated(0, sparkFtrContrib.head * featureStd / labelStd) - } - else sparkFtrContrib, + descaleLRContrib(model, sparkFtrContrib, featureStd, labelStd).getOrElse(sparkFtrContrib), + min = getIfExists(h.index, s.featuresStatistics.min), max = getIfExists(h.index, s.featuresStatistics.max), mean = getIfExists(h.index, s.featuresStatistics.mean), @@ -696,15 +693,29 @@ case object ModelInsights { } } - private[op] def checkLRStandardization(model: Option[Model[_]]): Option[Boolean] = { + private[op] def descaleLRContrib( + model: Option[Model[_]], + sparkFtrContrib: Seq[Double], + featureStd: Double, + labelStd: Double): Option[Seq[Double]] = { val stage = model.flatMap { case m: SparkWrapperParams[_] => m.getSparkMlStage() case _ => None } stage.collect { - case m: LogisticRegressionModel => m.getStandardization - case m: LinearRegressionModel => m.getStandardization - case _ => false + case m: LogisticRegressionModel => + if (m.getStandardization && sparkFtrContrib.nonEmpty) { + // scale entire feature contribution vector + sparkFtrContrib.map(_ * featureStd) + } + else sparkFtrContrib + case m: LinearRegressionModel => + if (m.getStandardization && sparkFtrContrib.nonEmpty) { + // need to also divide by labelStd for linear regression + sparkFtrContrib.map(_ * featureStd / labelStd) + } + else sparkFtrContrib + case _ => sparkFtrContrib } } diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 38a6959a96..8c98b61894 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -707,7 +707,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou // difference between the real coefficient and the analytical formula val absError = math.abs(orginalbigCoeff * math.sqrt(smallFeatureVariance) / labelStd - descaledbigCoeff) val absError2 = math.abs(originalsmallCoeff * math.sqrt(bigFeatureVariance) / labelStd - descaledsmallCoeff) - absError < 2.5 shouldBe true + absError < 0.01 shouldBe true absError2 < 0.01 shouldBe true } } From 4677c96f6938bbe7807bb484024877f312903f20 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Mon, 15 Jul 2019 14:03:59 -0700 Subject: [PATCH 17/27] fix scala idiom --- .../main/scala/com/salesforce/op/ModelInsights.scala | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 831e0fd850..28464f0fb9 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -575,17 +575,11 @@ case object ModelInsights { case Some(Discrete(domain, prob)) => // mean = sum (x_i * p_i) val mean = (domain zip prob).foldLeft(0.0) { - case (weightSum, (d, p)) => - val floatD = d.toDouble - val weight = floatD * p - (weightSum + weight) + case (weightSum, (d, p)) => weightSum + d.toDouble * p } // variance = sum (x_i - mu)^2 * p_i val discreteVariance = (domain zip prob).foldLeft(0.0) { - case (sqweightSum, (d, p)) => - val floatD = d.toDouble - val sqweight = (floatD - mean) * (floatD - mean) * p - (sqweightSum + sqweight) + case (sqweightSum, (d, p)) => sqweightSum + (d.toDouble - mean) * (d.toDouble - mean) * p } if (discreteVariance == 0) { log.warn("The standard deviation of the label is zero, " + From a5901d8f7fcf35859809d5134bbc4cf6a8a0315c Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Mon, 15 Jul 2019 14:48:44 -0700 Subject: [PATCH 18/27] remove logistic regression pattern matching --- .../main/scala/com/salesforce/op/ModelInsights.scala | 12 ++++++------ .../scala/com/salesforce/op/ModelInsightsTest.scala | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 28464f0fb9..8a9bbf6383 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -697,12 +697,12 @@ case object ModelInsights { case _ => None } stage.collect { - case m: LogisticRegressionModel => - if (m.getStandardization && sparkFtrContrib.nonEmpty) { - // scale entire feature contribution vector - sparkFtrContrib.map(_ * featureStd) - } - else sparkFtrContrib +// case m: LogisticRegressionModel => +// if (m.getStandardization && sparkFtrContrib.nonEmpty) { +// // scale entire feature contribution vector +// sparkFtrContrib.map(_ * featureStd) +// } +// else sparkFtrContrib case m: LinearRegressionModel => if (m.getStandardization && sparkFtrContrib.nonEmpty) { // need to also divide by labelStd for linear regression diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 8c98b61894..bc54414208 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -70,7 +70,6 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val models = Seq(lr -> lrParams).asInstanceOf[Seq[(EstimatorType, Array[ParamMap])]] val xgbClassifier = new OpXGBoostClassifier().setSilent(1).setSeed(42L) - val logRegression = new OpLogisticRegression() val xgbRegressor = new OpXGBoostRegressor().setSilent(1).setSeed(42L) val xgbClassifierPred = xgbClassifier.setInput(label, features).getOutput() val xgbRegressorPred = xgbRegressor.setInput(label, features).getOutput() From 90ff504c4d0d283aff615777690ee03b2ca0ce53 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Mon, 15 Jul 2019 16:37:11 -0700 Subject: [PATCH 19/27] add citations for future readability --- core/src/main/scala/com/salesforce/op/ModelInsights.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 8a9bbf6383..528aa0a18e 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -706,6 +706,8 @@ case object ModelInsights { case m: LinearRegressionModel => if (m.getStandardization && sparkFtrContrib.nonEmpty) { // need to also divide by labelStd for linear regression + // See https://github.com/salesforce/TransmogrifAI/pull/345#discussion_r303634114 + // Also see https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala#L551-L558 sparkFtrContrib.map(_ * featureStd / labelStd) } else sparkFtrContrib From a7dea4ea9449c046dff889b845931737e950da6f Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 18 Jul 2019 16:26:31 -0700 Subject: [PATCH 20/27] refactor & add test for binary logistic regression case --- .../com/salesforce/op/ModelInsights.scala | 18 +-- .../com/salesforce/op/ModelInsightsTest.scala | 113 +++++++++++++----- 2 files changed, 91 insertions(+), 40 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 528aa0a18e..f003814a20 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -697,17 +697,19 @@ case object ModelInsights { case _ => None } stage.collect { -// case m: LogisticRegressionModel => -// if (m.getStandardization && sparkFtrContrib.nonEmpty) { -// // scale entire feature contribution vector -// sparkFtrContrib.map(_ * featureStd) -// } -// else sparkFtrContrib + case m: LogisticRegressionModel => + if (m.getStandardization && sparkFtrContrib.nonEmpty) { + // scale entire feature contribution vector + // See https://think-lab.github.io/d/205/ + // ยง 4.5.2 Standardized Interpretations, An Introduction to Categorical Data Analysis, Alan Agresti + sparkFtrContrib.map(_ * featureStd) + } + else sparkFtrContrib case m: LinearRegressionModel => if (m.getStandardization && sparkFtrContrib.nonEmpty) { // need to also divide by labelStd for linear regression - // See https://github.com/salesforce/TransmogrifAI/pull/345#discussion_r303634114 - // Also see https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala#L551-L558 + // See https://u.demog.berkeley.edu/~andrew/teaching/standard_coeff.pdf + // See https://en.wikipedia.org/wiki/Standardized_coefficient sparkFtrContrib.map(_ * featureStd / labelStd) } else sparkFtrContrib diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index bc54414208..49532daa73 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -31,7 +31,7 @@ package com.salesforce.op import com.salesforce.op.features.types._ -import com.salesforce.op.features.{Feature, FeatureDistributionType} +import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike} import com.salesforce.op.filters._ import com.salesforce.op.stages.impl.classification._ import com.salesforce.op.stages.impl.preparators._ @@ -47,8 +47,10 @@ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tuning.ParamGridBuilder import org.junit.runner.RunWith import com.salesforce.op.features.types.Real +import org.apache.spark.sql.DataFrame import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner + import scala.util.{Failure, Success} @RunWith(classOf[JUnitRunner]) @@ -96,44 +98,70 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou .getOutput() val smallFeatureVariance = 10.0 + val mediumFeatureVariance = 1.0 val bigFeatureVariance = 100.0 - val smallNorm = RandomReal.normal[Real](0, smallFeatureVariance).limit(1000) + val smallNorm = RandomReal.normal[Real](0.0, smallFeatureVariance).limit(1000) + val mediumNorm = RandomReal.normal[Real](10, mediumFeatureVariance).limit(1000) val bigNorm = RandomReal.normal[Real](10000.0, bigFeatureVariance).limit(1000) + val noise = RandomReal.normal[Real](0.0, 100.0).limit(1000) + // make a simple linear combination of the features (with noise), pass through sigmoid function and binarize + // to make labels for logistic reg toy data + def binarize(x: Double) = { + val sigmoid = 1.0 / (1.0 + math.exp(-x)) + if (sigmoid > 0.5) 1 else 0 + } + val logisticRegLabel = (smallNorm, mediumNorm, noise) + .zipped.map(_.toDouble.get * 10 + _.toDouble.get + _.toDouble.get).map(binarize(_)).map(RealNN(_)) + // toy label for linear reg is a sum of two scaled Normals, hence we also know its standard deviation val linearRegLabel = (smallNorm, bigNorm) .zipped.map(_.toDouble.get * 5000 + _.toDouble.get).map(RealNN(_)) - - // label is a sum of two scaled Normals, hence we also know its standard deviation val labelStd = math.sqrt(5000 * 5000 * smallFeatureVariance + bigFeatureVariance) - val generatedData = smallNorm.zip(bigNorm).zip(linearRegLabel).map { - case ((f1, f2), label) => (f1, f2, label) + def twoFeatureDF(feature1: List[Real], feature2: List[Real], label: List[RealNN]) + :(Feature[RealNN], FeatureLike[OPVector], DataFrame) = { + val generatedData = feature1.zip(feature2).zip(label).map { + case ((f1, f2), label) => (f1, f2, label) + } + val (rawDF, raw1, raw2, rawLabel) = TestFeatureBuilder("feature1", "feature2", "label", generatedData) + val labelData = rawLabel.copy(isResponse = true) + val featureVector = raw1 + .vectorize(fillValue = 0, fillWithMean = true, trackNulls = false, others = Array(raw2)) + val checkedFeatures = labelData.sanityCheck(featureVector, removeBadFeatures = false) + return (labelData, checkedFeatures, rawDF) + } + + val linRegDF = twoFeatureDF(smallNorm, bigNorm, linearRegLabel) + val logRegDF = twoFeatureDF(smallNorm, mediumNorm, logisticRegLabel) + + val unstandardizedLinpred = new OpLinearRegression().setStandardization(false) + .setInput(linRegDF._1, linRegDF._2).getOutput() + + val standardizedLinpred = new OpLinearRegression().setStandardization(true) + .setInput(linRegDF._1, linRegDF._2).getOutput() + + val unstandardizedLogpred = new OpLogisticRegression().setStandardization(false) + .setInput(logRegDF._1, logRegDF._2).getOutput() + + val standardizedLogpred = new OpLogisticRegression().setStandardization(true) + .setInput(logRegDF._1, logRegDF._2).getOutput() + + def getFeatureImp(standardizedModel: FeatureLike[Prediction], + unstandardizedModel: FeatureLike[Prediction], + DF: DataFrame): Array[Double] = { + lazy val workFlow = new OpWorkflow() + .setResultFeatures(standardizedModel, unstandardizedModel).setInputDataset(DF) + lazy val model = workFlow.train() + val unstandardizedFtImp = model.modelInsights(unstandardizedModel) + .features.map(_.derivedFeatures.map(_.contribution)) + val standardizedFtImp = model.modelInsights(standardizedModel) + .features.map(_.derivedFeatures.map(_.contribution)) + val descaledsmallCoeff = standardizedFtImp.flatten.flatten.head + val originalsmallCoeff = unstandardizedFtImp.flatten.flatten.head + val descaledbigCoeff = standardizedFtImp.flatten.flatten.last + val orginalbigCoeff = unstandardizedFtImp.flatten.flatten.last + return Array(descaledsmallCoeff, originalsmallCoeff, descaledbigCoeff, orginalbigCoeff) } - val (rawDF, rawSmall, rawBig, rawLabel) = TestFeatureBuilder("feature1", "feature2", "label", generatedData) - val labelData = rawLabel.copy(isResponse = true) - val genFeatureVector = rawSmall - .vectorize(fillValue = 0, fillWithMean = true, trackNulls = false, others = Array(rawBig)) - - val checkedFeatures = labelData.sanityCheck(genFeatureVector, removeBadFeatures = false) - - val unstandardizedpred = new OpLinearRegression().setStandardization(false) - .setInput(labelData, checkedFeatures).getOutput() - - val standardizedpred = new OpLinearRegression().setStandardization(true) - .setInput(labelData, checkedFeatures).getOutput() - - lazy val linRegWorkFlow = new OpWorkflow() - .setResultFeatures(unstandardizedpred, standardizedpred).setInputDataset(rawDF) - lazy val linRegModel = linRegWorkFlow.train() - lazy val standardizedLinModel = Option(linRegModel.getOriginStageOf(standardizedpred) - .asInstanceOf[OpLinearRegressionModel]) - val unstandardizedFtImp = linRegModel.modelInsights(unstandardizedpred) - .features.map(_.derivedFeatures.map(_.contribution)) - val standardizedFtImp = linRegModel.modelInsights(standardizedpred) - .features.map(_.derivedFeatures.map(_.contribution)) - val descaledsmallCoeff = standardizedFtImp.flatten.flatten.head - val originalsmallCoeff = unstandardizedFtImp.flatten.flatten.head - val descaledbigCoeff = standardizedFtImp.flatten.flatten.last - val orginalbigCoeff = unstandardizedFtImp.flatten.flatten.last + val params = new OpParams() @@ -704,9 +732,30 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou // of the model trained on standardized data should be within a small distance of the analytical formula. // difference between the real coefficient and the analytical formula + // return Array(descaledsmallCoeff, originalsmallCoeff, descaledbigCoeff, orginalbigCoeff) + val coeffs = getFeatureImp(standardizedLinpred, unstandardizedLinpred, linRegDF._3) + val descaledsmallCoeff = coeffs(0) + val originalsmallCoeff = coeffs(1) + val descaledbigCoeff = coeffs(2) + val orginalbigCoeff = coeffs(3) val absError = math.abs(orginalbigCoeff * math.sqrt(smallFeatureVariance) / labelStd - descaledbigCoeff) val absError2 = math.abs(originalsmallCoeff * math.sqrt(bigFeatureVariance) / labelStd - descaledsmallCoeff) absError < 0.01 shouldBe true absError2 < 0.01 shouldBe true } + + it should "correctly return the descaled coefficient for logistic regression, " + + "when standardization is on" in { + val coeffs = getFeatureImp(standardizedLogpred, unstandardizedLogpred, logRegDF._3) + val descaledsmallCoeff = coeffs(0) + val originalsmallCoeff = coeffs(1) + val descaledbigCoeff = coeffs(2) + val orginalbigCoeff = coeffs(3) + // difference between the real coefficient and the analytical formula + val absError = math.abs(orginalbigCoeff * math.sqrt(smallFeatureVariance) - descaledbigCoeff) + val absError2 = math.abs(originalsmallCoeff * math.sqrt(mediumFeatureVariance) - descaledsmallCoeff) + println(descaledsmallCoeff, originalsmallCoeff, descaledbigCoeff, orginalbigCoeff, absError, absError2) + absError < 0.2 shouldBe true + absError2 < 0.01 shouldBe true + } } From c6bae485b1b20dfd71a046af6df75e8c80eaa69b Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 18 Jul 2019 16:38:56 -0700 Subject: [PATCH 21/27] remove redundant import --- core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index bf2c439ce3..d6a2560c58 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -35,7 +35,7 @@ import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike import com.salesforce.op.filters._ import com.salesforce.op.stages.impl.classification._ import com.salesforce.op.stages.impl.preparators._ -import com.salesforce.op.stages.impl.regression.{OpLinearRegression, OpLinearRegressionModel, OpXGBoostRegressor, RegressionModelSelector} +import com.salesforce.op.stages.impl.regression.{OpLinearRegression, OpXGBoostRegressor, RegressionModelSelector} import com.salesforce.op.stages.impl.selector.ModelSelectorNames.EstimatorType import com.salesforce.op.stages.impl.selector.SelectedModel import com.salesforce.op.stages.impl.selector.ValidationType._ From bc601873b3c2907e8f0faa8527912475714cbdff Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Fri, 19 Jul 2019 09:59:29 -0700 Subject: [PATCH 22/27] fix scala style --- .../test/scala/com/salesforce/op/ModelInsightsTest.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index d6a2560c58..160173d639 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -107,7 +107,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val noise = RandomReal.normal[Real](0.0, 100.0).limit(1000) // make a simple linear combination of the features (with noise), pass through sigmoid function and binarize // to make labels for logistic reg toy data - def binarize(x: Double) = { + def binarize(x: Double):Int = { val sigmoid = 1.0 / (1.0 + math.exp(-x)) if (sigmoid > 0.5) 1 else 0 } @@ -118,8 +118,8 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou .zipped.map(_.toDouble.get * 5000 + _.toDouble.get).map(RealNN(_)) val labelStd = math.sqrt(5000 * 5000 * smallFeatureVariance + bigFeatureVariance) - def twoFeatureDF(feature1: List[Real], feature2: List[Real], label: List[RealNN]) - :(Feature[RealNN], FeatureLike[OPVector], DataFrame) = { + def twoFeatureDF(feature1: List[Real], feature2: List[Real], label: List[RealNN]): + (Feature[RealNN], FeatureLike[OPVector], DataFrame) = { val generatedData = feature1.zip(feature2).zip(label).map { case ((f1, f2), label) => (f1, f2, label) } From a6839b2f375008bb679ec353a79564515ff4a916 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Fri, 19 Jul 2019 10:37:29 -0700 Subject: [PATCH 23/27] fix scala style again --- .../test/scala/com/salesforce/op/ModelInsightsTest.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 160173d639..dffec2cbbe 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -107,7 +107,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val noise = RandomReal.normal[Real](0.0, 100.0).limit(1000) // make a simple linear combination of the features (with noise), pass through sigmoid function and binarize // to make labels for logistic reg toy data - def binarize(x: Double):Int = { + def binarize(x: Double): Int = { val sigmoid = 1.0 / (1.0 + math.exp(-x)) if (sigmoid > 0.5) 1 else 0 } @@ -147,8 +147,8 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou .setInput(logRegDF._1, logRegDF._2).getOutput() def getFeatureImp(standardizedModel: FeatureLike[Prediction], - unstandardizedModel: FeatureLike[Prediction], - DF: DataFrame): Array[Double] = { + unstandardizedModel: FeatureLike[Prediction], + DF: DataFrame): Array[Double] = { lazy val workFlow = new OpWorkflow() .setResultFeatures(standardizedModel, unstandardizedModel).setInputDataset(DF) lazy val model = workFlow.train() From 23b2443e1c526022033612e950deab550e64181d Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Mon, 22 Jul 2019 12:49:01 -0700 Subject: [PATCH 24/27] Update warning message --- core/src/main/scala/com/salesforce/op/ModelInsights.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index f003814a20..4df91b8e12 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -568,7 +568,7 @@ case object ModelInsights { case Some(Continuous(_, _, _, variance)) => if (variance == 0) { log.warn("The standard deviation of the label is zero, " + - "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") + "so the coefficients and intercepts of the model will be zeros, training is not needed.") defaultLabelStd } else math.sqrt(variance) @@ -583,12 +583,12 @@ case object ModelInsights { } if (discreteVariance == 0) { log.warn("The standard deviation of the label is zero, " + - "so the coefficients and intercepts of the model will be zeros, training is not needed.\"") + "so the coefficients and intercepts of the model will be zeros, training is not needed.") defaultLabelStd } else math.sqrt(discreteVariance) case Some(_) => { - log.warn("Performing weight descaling on an unsupported distribution") + log.warn("Failing to perform weight descaling because distribution is unsupported.") defaultLabelStd } case None => { From 36c842069e454784c11c2db99b604c0a884d68ac Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Mon, 22 Jul 2019 13:48:19 -0700 Subject: [PATCH 25/27] update failure threshold so test will pass --- core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index dffec2cbbe..8abd776a33 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -754,8 +754,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou // difference between the real coefficient and the analytical formula val absError = math.abs(orginalbigCoeff * math.sqrt(smallFeatureVariance) - descaledbigCoeff) val absError2 = math.abs(originalsmallCoeff * math.sqrt(mediumFeatureVariance) - descaledsmallCoeff) - println(descaledsmallCoeff, originalsmallCoeff, descaledbigCoeff, orginalbigCoeff, absError, absError2) - absError < 0.2 shouldBe true + absError < 0.5 shouldBe true absError2 < 0.01 shouldBe true } } From c80cc1a3e859e3ce89da16e71b4d7a608e77be9a Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Wed, 24 Jul 2019 10:05:32 -0700 Subject: [PATCH 26/27] update test to be ratio instead of absolute difference --- .../scala/com/salesforce/op/ModelInsightsTest.scala | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 8abd776a33..42c0365fc6 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -732,16 +732,17 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou // of the model trained on standardized data should be within a small distance of the analytical formula. // difference between the real coefficient and the analytical formula - // return Array(descaledsmallCoeff, originalsmallCoeff, descaledbigCoeff, orginalbigCoeff) val coeffs = getFeatureImp(standardizedLinpred, unstandardizedLinpred, linRegDF._3) val descaledsmallCoeff = coeffs(0) val originalsmallCoeff = coeffs(1) val descaledbigCoeff = coeffs(2) val orginalbigCoeff = coeffs(3) val absError = math.abs(orginalbigCoeff * math.sqrt(smallFeatureVariance) / labelStd - descaledbigCoeff) + val bigCoeffSum = orginalbigCoeff * math.sqrt(smallFeatureVariance) / labelStd + descaledbigCoeff val absError2 = math.abs(originalsmallCoeff * math.sqrt(bigFeatureVariance) / labelStd - descaledsmallCoeff) - absError < 0.01 shouldBe true - absError2 < 0.01 shouldBe true + val smallCoeffSum = originalsmallCoeff * math.sqrt(bigFeatureVariance) / labelStd + descaledsmallCoeff + absError / bigCoeffSum < 0.05 shouldBe true + absError2 / smallCoeffSum < 0.05 shouldBe true } it should "correctly return the descaled coefficient for logistic regression, " + @@ -753,8 +754,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val orginalbigCoeff = coeffs(3) // difference between the real coefficient and the analytical formula val absError = math.abs(orginalbigCoeff * math.sqrt(smallFeatureVariance) - descaledbigCoeff) + val bigCoeffSum = orginalbigCoeff * math.sqrt(smallFeatureVariance) + descaledbigCoeff val absError2 = math.abs(originalsmallCoeff * math.sqrt(mediumFeatureVariance) - descaledsmallCoeff) - absError < 0.5 shouldBe true - absError2 < 0.01 shouldBe true + val smallCoeffSum = originalsmallCoeff * math.sqrt(mediumFeatureVariance) + descaledsmallCoeff + absError / bigCoeffSum < 0.05 shouldBe true + absError2 / smallCoeffSum < 0.05 shouldBe true } } From 51627e9c70b51a563b9c68ffc5b6c49ea5f24c94 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 25 Jul 2019 10:07:12 -0700 Subject: [PATCH 27/27] small update to set tolerance threshold --- .../test/scala/com/salesforce/op/ModelInsightsTest.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 42c0365fc6..327a606955 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -723,6 +723,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou } } + val tol = 0.03 it should "correctly return the descaled coefficient for linear regression, " + "when standardization is on" in { @@ -741,8 +742,8 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val bigCoeffSum = orginalbigCoeff * math.sqrt(smallFeatureVariance) / labelStd + descaledbigCoeff val absError2 = math.abs(originalsmallCoeff * math.sqrt(bigFeatureVariance) / labelStd - descaledsmallCoeff) val smallCoeffSum = originalsmallCoeff * math.sqrt(bigFeatureVariance) / labelStd + descaledsmallCoeff - absError / bigCoeffSum < 0.05 shouldBe true - absError2 / smallCoeffSum < 0.05 shouldBe true + absError / bigCoeffSum < tol shouldBe true + absError2 / smallCoeffSum < tol shouldBe true } it should "correctly return the descaled coefficient for logistic regression, " + @@ -757,7 +758,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val bigCoeffSum = orginalbigCoeff * math.sqrt(smallFeatureVariance) + descaledbigCoeff val absError2 = math.abs(originalsmallCoeff * math.sqrt(mediumFeatureVariance) - descaledsmallCoeff) val smallCoeffSum = originalsmallCoeff * math.sqrt(mediumFeatureVariance) + descaledsmallCoeff - absError / bigCoeffSum < 0.05 shouldBe true - absError2 / smallCoeffSum < 0.05 shouldBe true + absError / bigCoeffSum < tol shouldBe true + absError2 / smallCoeffSum < tol shouldBe true } }