diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala
index 9a2340ed74..59f75f9e03 100644
--- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala
+++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala
@@ -638,7 +638,7 @@ case object ModelInsights {
         derivedFeatureGroup = h.grouping,
         derivedFeatureValue = h.indicatorValue,
         excluded = Option(s.dropped.contains(h.columnName)),
-        corr = getCorr(s.correlationsWLabel, h.columnName),
+        corr = getCorr(s.correlations, h.columnName),
         cramersV = catGroupIndex.map(i => s.categoricalStats(i).cramersV),
         mutualInformation = catGroupIndex.map(i => s.categoricalStats(i).mutualInfo),
         pointwiseMutualInformation = (catGroupIndex, catIndexWithinGroup) match {
@@ -747,11 +747,7 @@ case object ModelInsights {
     if (index >= 0) values.mapValues(_ (index)) else Map.empty

   private def getCorr(corr: Correlations, name: String): Option[Double] = {
-    getIfExists(corr.featuresIn.indexOf(name), corr.values).orElse {
-      val j = corr.nanCorrs.indexOf(name)
-      if (j >= 0) Option(Double.NaN)
-      else None
-    }
+    getIfExists(corr.featuresIn.indexOf(name), corr.valuesWithLabel)
   }

   private[op] def descaleLRContrib(
diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
index c4cfa6f224..3f5448a1df 100644
--- a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
+++ b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
@@ -474,6 +474,7 @@ trait RichNumericFeature {
       sampleUpperLimit: Int = SanityChecker.SampleUpperLimit,
       maxCorrelation: Double = SanityChecker.MaxCorrelation,
       minCorrelation: Double = SanityChecker.MinCorrelation,
+      maxFeatureCorr: Double = SanityChecker.MaxFeatureCorr,
       maxCramersV: Double = SanityChecker.MaxCramersV,
       correlationType: CorrelationType = SanityChecker.CorrelationTypeDefault,
       minVariance: Double = SanityChecker.MinVariance,
@@ -494,6 +495,7 @@ trait RichNumericFeature {
         .setSampleUpperLimit(sampleUpperLimit)
         .setMaxCorrelation(maxCorrelation)
         .setMinCorrelation(minCorrelation)
+        .setMaxFeatureCorrelation(maxFeatureCorr)
         .setMaxCramersV(maxCramersV)
         .setCorrelationType(correlationType)
         .setMinVariance(minVariance)
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/DerivedFeatureFilterUtils.scala b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/DerivedFeatureFilterUtils.scala
index 8474aa0385..cf6aabfc02 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/DerivedFeatureFilterUtils.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/DerivedFeatureFilterUtils.scala
@@ -36,6 +36,7 @@
 import com.salesforce.op.utils.spark.RichMetadata._
 import org.apache.log4j.Level
 import org.apache.spark.ml.linalg.{Vectors => NewVectors}
 import org.apache.spark.ml.param.{BooleanParam, DoubleParam, Param, Params}
+import org.apache.spark.mllib.linalg.Matrix
 import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
 import org.apache.spark.sql.types.Metadata
@@ -98,7 +99,8 @@ object DerivedFeatureFilterUtils {
     labelNameAndIndex: Option[(String, Int)] = None,
     corrsWithLabel: Array[Double] = Array.empty,
     corrIndices: Array[Int] = Array.empty,
-    categoricalStats: Array[CategoricalGroupStats] = Array.empty
+    categoricalStats: Array[CategoricalGroupStats] = Array.empty,
+    corrMatrix: Option[Matrix] = None
   ): Array[ColumnStatistics] = {
     // precompute all statistics to avoid rebuilding the vectors every time
     val means = statsSummary.mean
@@ -108,7 +110,7 @@ object DerivedFeatureFilterUtils {
     val variances = statsSummary.variance
     val cramersVMap = categoricalStats.flatMap(f => f.categoricalFeatures.map(c => c -> f.cramersV))
       .toMap[String, Double]
-    val numCorrIndices = corrIndices.length
+    val featureCorrs = corrMatrix.map(_.rowIter.map(_.toArray).toArray)

     def maxByParent(seq: Seq[(String, Double)]) = seq.groupBy(_._1).map { case (k, v) =>
       // Filter out the NaNs because max(3.4, NaN) = NaN, and we still want the keep the largest correlation
@@ -174,6 +176,10 @@ object DerivedFeatureFilterUtils {
           case ind => Option(corrsWithLabel(ind))
         },
         cramersV = cramersVMap.get(name),
+        featureCorrs = corrIndices.indexOf(i) match {
+          case -1 => Seq.empty
+          case ind => featureCorrs.getOrElse(Array.empty).map(_.apply(ind)).dropRight(1) // drop label corr
+        },
         parentCorr = getParentValue(col, corrParent, corrParentNoKeys),
         parentCramersV = getParentValue(col, cramersVParent, cramersVParentNoKeys),
         maxRuleConfidences = maxRuleConfMap.getOrElse(name, Seq.empty),
@@ -195,6 +201,7 @@ object DerivedFeatureFilterUtils {
         variance = variances(labelColumnIndex),
         corrLabel = None,
         cramersV = None,
+        featureCorrs = Seq.empty,
         parentCorr = None,
         parentCramersV = None,
         maxRuleConfidences = Seq.empty,
@@ -214,6 +221,7 @@ object DerivedFeatureFilterUtils {
   * @param minVariance            Min variance for dropping features
   * @param minCorrelation         Min correlation with label for dropping features
   * @param maxCorrelation         Max correlation with label for dropping features
+  * @param maxFeatureCorr         Max correlation allowed between two features; when exceeded, the later feature is dropped
   * @param maxCramersV            Max Cramer's V for dropping categorical features
   * @param maxRuleConfidence      Max allowed confidence of association rules for dropping features
   * @param minRequiredRuleSupport Threshold for association rule
@@ -229,6 +237,7 @@ object DerivedFeatureFilterUtils {
     minVariance: Double,
     minCorrelation: Double = 0.0,
     maxCorrelation: Double = 1.0,
+    maxFeatureCorr: Double = 1.0,
     maxCramersV: Double = 1.0,
     maxRuleConfidence: Double = 1.0,
     minRequiredRuleSupport: Double = 1.0,
@@ -256,6 +265,7 @@ object DerivedFeatureFilterUtils {
       minVariance = minVariance,
       minCorrelation = minCorrelation,
       maxCorrelation = maxCorrelation,
+      maxFeatureCorr = maxFeatureCorr,
       maxCramersV = maxCramersV,
       maxRuleConfidence = maxRuleConfidence,
       minRequiredRuleSupport = minRequiredRuleSupport,
@@ -314,6 +324,7 @@ private[op] case class ColumnStatistics
   cramersV: Option[Double],
   parentCorr: Option[Double],
   parentCramersV: Option[Double],
+  featureCorrs: Seq[Double],
   // Need to be able to hold up to two maxRuleConfidences or supports for the case of nullIndicator columns coming
   // from non-categorical features (since they will correspond to a 2x2 contingency matrix)
   maxRuleConfidences: Seq[Double],
@@ -341,6 +352,7 @@ private[op] case class ColumnStatistics
     minVariance: Double,
     maxCorrelation: Double,
     minCorrelation: Double,
+    maxFeatureCorr: Double,
     maxCramersV: Double,
     maxRuleConfidence: Double,
     minRequiredRuleSupport: Double,
@@ -361,6 +373,10 @@ private[op] case class ColumnStatistics
       corrLabel.filter(Math.abs(_) > maxCorrelation).map(corr =>
         s"correlation $corr higher than max correlation $maxCorrelation"
       ),
+      column.flatMap { cl => featureCorrs.take(cl.index).find(Math.abs(_) > maxFeatureCorr).map(corr =>
+        s"this feature has correlation $corr with another feature, higher than the max feature-feature" +
+          s" correlation $maxFeatureCorr")
+      },
      cramersV.filter(_ > maxCramersV).map(cv =>
         s"Cramer's V $cv higher than max Cramer's V $maxCramersV"
       ),
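
The check added to reasonsToRemove above only inspects correlations with earlier-indexed columns (featureCorrs.take(cl.index)), which is what guarantees that for any pair exceeding the threshold it is always the later column that gets flagged. A minimal self-contained sketch of that rule, using a hypothetical 3x3 feature-correlation matrix (illustration only, not library code):

object FeatureCorrPruningSketch extends App {
  val maxFeatureCorr = 0.9
  // Hypothetical symmetric feature-feature correlation rows (label column already dropped)
  val corrs: Seq[Seq[Double]] = Seq(
    Seq(1.0, 0.95, 0.2),
    Seq(0.95, 1.0, 0.1),
    Seq(0.2, 0.1, 1.0)
  )
  val flagged = corrs.zipWithIndex.collect {
    // take(i) restricts the check to earlier columns, mirroring featureCorrs.take(cl.index),
    // so column 1 is flagged for its 0.95 correlation with column 0 while column 0 survives
    case (row, i) if row.take(i).exists(c => math.abs(c) > maxFeatureCorr) => i
  }
  println(s"columns flagged for removal: $flagged") // List(1)
}
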
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala
index 9b30ff1cc8..ab436a05a0 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala
@@ -94,15 +94,24 @@ trait SanityCheckerParams extends DerivedFeatureFilterParams {
   final val maxCorrelation = new DoubleParam(
     parent = this, name = "maxCorrelation",
     doc = "Maximum correlation (absolute value) allowed between a feature in the feature vector and the label",
-    isValid = ParamValidators.inRange(lowerBound = 0.0, upperBound = 1.0, lowerInclusive = true, upperInclusive = true)
+    isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
   )
   def setMaxCorrelation(value: Double): this.type = set(maxCorrelation, value)
   def getMaxCorrelation: Double = $(maxCorrelation)

+  final val maxFeatureCorrelation = new DoubleParam(
+    parent = this, name = "maxFeatureCorr",
+    doc = "Maximum correlation (absolute value) allowed between two features. When this value is exceeded, the" +
+      " second feature in the correlated pair will be dropped.",
+    isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
+  )
+  def setMaxFeatureCorrelation(value: Double): this.type = set(maxFeatureCorrelation, value)
+  def getMaxFeatureCorrelation: Double = $(maxFeatureCorrelation)
+
   final val minCorrelation = new DoubleParam(
     parent = this, name = "minCorrelation",
     doc = "Minimum correlation (absolute value) allowed between a feature in the feature vector and the label",
-    isValid = ParamValidators.inRange(lowerBound = 0.0, upperBound = 1.0, lowerInclusive = true, upperInclusive = true)
+    isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
   )
   def setMinCorrelation(value: Double): this.type = set(minCorrelation, value)
   def getMinCorrelation: Double = $(minCorrelation)
@@ -190,6 +199,7 @@ trait SanityCheckerParams extends DerivedFeatureFilterParams {
     sampleUpperLimit -> SanityChecker.SampleUpperLimit,
     maxCorrelation -> SanityChecker.MaxCorrelation,
     minCorrelation -> SanityChecker.MinCorrelation,
+    maxFeatureCorrelation -> SanityChecker.MaxFeatureCorr,
     minVariance -> SanityChecker.MinVariance,
     maxCramersV -> SanityChecker.MaxCramersV,
     removeBadFeatures -> SanityChecker.RemoveBadFeatures,
@@ -280,7 +290,7 @@ class SanityChecker(uid: String = UID[SanityChecker])
       val colsCleaned = cols.map(_._2).filterNot(c => repeats.contains(c.index))
       (group, colsCleaned.map(_.makeColName()), colsCleaned.map(_.index),
         colsCleaned.exists(_.hasParentOfSubType[MultiPickList]))
-      }
+    }

     colIndicesByGrouping.map {
       case (group, colNames, valueIndices, isMpl) =>
@@ -443,13 +453,13 @@ class SanityChecker(uid: String = UID[SanityChecker])
     else ((0 until featureSize + 1).toArray, vectorRows)
     val numCorrIndices = corrIndices.length

-    // TODO: We are still calculating the full correlation matrix when featureLabelCorrOnly is false, but are not
-    // TODO: storing it anywhere. If we want access to the inter-feature correlations then need to refactor this a bit.
-    val corrsWithLabel = if ($(featureLabelCorrOnly)) {
-      OpStatistics.computeCorrelationsWithLabel(vectorRowsForCorr, colStats, count)
+    val (corrMatrix, corrsWithLabel) = if ($(featureLabelCorrOnly)) {
+      None -> OpStatistics.computeCorrelationsWithLabel(vectorRowsForCorr, colStats, count)
+    }
+    else {
+      val matrix = Statistics.corr(vectorRowsForCorr, getCorrelationType.sparkName)
+      Option(matrix) -> matrix.rowIter.map(_.apply(numCorrIndices - 1)).toArray
     }
-    else Statistics.corr(vectorRowsForCorr, getCorrelationType.sparkName).rowIter
-      .map(_.apply(numCorrIndices - 1)).toArray

     // Only calculate this if the label is categorical, so ignore if user has flagged label as not categorical
     val categoricalStats =
@@ -467,7 +477,8 @@ class SanityChecker(uid: String = UID[SanityChecker])
       Option((in1.name, featureSize)), // label column goes at end of vector
       corrsWithLabel,
       corrIndices,
-      categoricalStats
+      categoricalStats,
+      corrMatrix
     )

     stats.foreach { stat => logInfo(stat.toString) }
@@ -478,6 +489,7 @@ class SanityChecker(uid: String = UID[SanityChecker])
       $(minVariance),
       $(minCorrelation),
       $(maxCorrelation),
+      $(maxFeatureCorrelation),
       $(maxCramersV),
       $(maxRuleConfidence),
       $(minRequiredRuleSupport),
@@ -542,6 +554,7 @@ object SanityChecker {
   val SampleLowerLimit = 1E3.toInt
   val SampleUpperLimit = 1E6.toInt
   val MaxCorrelation = 0.95
+  val MaxFeatureCorr = 0.99
   val MinCorrelation = 0.0
   val MinVariance = 1E-5
   val MaxCramersV = 0.95
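
With the parameter in place, enabling inter-feature correlation pruning is a one-setter change. A sketch assuming label and featureVector are existing features of the right types (RealNN label, OPVector features; the names are hypothetical, mirroring the tests later in this patch):

val checked = new SanityChecker()
  .setCheckSample(1.0)
  .setMaxFeatureCorrelation(0.9) // default is SanityChecker.MaxFeatureCorr = 0.99; 1.1 effectively disables the check
  .setRemoveBadFeatures(true)
  .setInput(label, featureVector)
  .getOutput()

The widened validator range of [-0.1, 1.1] on the correlation params is what allows sentinel values like 1.1 (or -0.1 for minimums) to turn a check off entirely.
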
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerMetadata.scala b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerMetadata.scala
index 1ec85660f4..c5713d0e8b 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerMetadata.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerMetadata.scala
@@ -43,8 +43,7 @@ import scala.util.{Failure, Success, Try}
  * Contains all names for sanity checker metadata
  */
 case object SanityCheckerNames extends DerivedFeatureFilterNames {
-  val CorrelationsWLabel: String = "correlationsWithLabel"
-  val CorrelationsWLabelIsNaN: String = "correlationsWithLabelIsNaN"
+  val Correlations: String = "correlations"
   val CorrelationType: String = "correlationType"
   val CategoricalStats: String = "categoricalStats"
   val CategoricalFeatures: String = "categoricalFeatures"
@@ -58,7 +57,8 @@ case object SanityCheckerNames extends DerivedFeatureFilterNames {
   val Support: String = "support"
   val CountMatrix: String = "countMatrix"
   val FeaturesIn: String = "features"
-  val Values: String = "values"
+  val ValuesLabel: String = "valuesLabel"
+  val ValuesFeatures: String = "valuesFeatures"
   val NumNonZeros = "numNonZeros"
   val NumNull = "number of nulls"
 }
@@ -66,14 +66,14 @@ case object SanityCheckerNames extends DerivedFeatureFilterNames {
 /**
  * Case class to convert to and from [[SanityChecker]] summary metadata
  *
- * @param correlationsWLabel feature correlations with label
+ * @param correlations       feature correlations with the label and with other features
  * @param dropped            features dropped for label leakage
  * @param featuresStatistics stats on features
  * @param names              names of features passed in
  */
 case class SanityCheckerSummary
 (
-  correlationsWLabel: Correlations,
+  correlations: Correlations,
   dropped: Seq[String],
   featuresStatistics: SummaryStatistics,
   names: Seq[String],
@@ -92,9 +92,8 @@ case class SanityCheckerSummary
     sample: Double
   ) {
     this(
-      correlationsWLabel = new Correlations(
-        stats.filter(s => s.corrLabel.isDefined && !s.corrLabel.get.isNaN).map(s => s.name -> s.corrLabel.get),
-        stats.filter(s => s.corrLabel.isDefined && s.corrLabel.get.isNaN).map(_.name),
+      correlations = new Correlations(
+        stats.filter(s => s.corrLabel.isDefined).map(s => (s.name, s.corrLabel.get, s.featureCorrs)),
         correlationType
       ),
       dropped = dropped,
@@ -113,7 +112,7 @@ case class SanityCheckerSummary
    */
   def toMetadata(skipUnsupported: Boolean): Metadata = {
     val summaryMeta = new MetadataBuilder()
-    summaryMeta.putMetadata(SanityCheckerNames.CorrelationsWLabel, correlationsWLabel.toMetadata(skipUnsupported))
+    summaryMeta.putMetadata(SanityCheckerNames.Correlations, correlations.toMetadata(skipUnsupported))
     summaryMeta.putStringArray(SanityCheckerNames.Dropped, dropped.toArray)
     summaryMeta.putMetadata(SanityCheckerNames.FeaturesStatistics, featuresStatistics.toMetadata(skipUnsupported))
     summaryMeta.putStringArray(SanityCheckerNames.Names, names.toArray)
@@ -254,11 +253,9 @@ case class CategoricalStats
    * @throws RuntimeException in case of unsupported value type
    * @return [[Metadata]] metadata
    */
-  // TODO: Build the metadata here instead of by treating Cramer's V and mutual info as correlations
   def toMetadata(skipUnsupported: Boolean): Metadata = {
     val meta = new MetadataBuilder()
     meta.putStringArray(SanityCheckerNames.CategoricalFeatures, categoricalFeatures)
-    // TODO: use custom serializer here instead of replacing NaNs with 0s
     meta.putDoubleArray(SanityCheckerNames.CramersV, cramersVs.map(f => if (f.isNaN) 0 else f))
     meta.putDoubleArray(SanityCheckerNames.MutualInfo, mutualInfos)
     meta.putMetadata(SanityCheckerNames.PointwiseMutualInfoAgainstLabel,
@@ -274,23 +271,24 @@ case class CategoricalStats
 /**
  * Correlations between features and the label from [[SanityChecker]]
  *
  * @param featuresIn names of features
- * @param values     correlation of feature with label
- * @param nanCorrs   nan correlation features
+ * @param valuesWithLabel    correlation of each feature with the label
+ * @param valuesWithFeatures correlations between features
  * @param corrType   type of correlation done on
  */
 case class Correlations
 (
   featuresIn: Seq[String],
-  values: Seq[Double],
-  nanCorrs: Seq[String],
+  valuesWithLabel: Seq[Double],
+  valuesWithFeatures: Seq[Seq[Double]],
   corrType: CorrelationType
 ) extends MetadataLike {
-  require(featuresIn.length == values.length, "Feature names and correlation values arrays must have the same length")
+  require(featuresIn.length == valuesWithLabel.length,
+    "Feature names and correlation values arrays must have the same length")

-  def this(corrs: Seq[(String, Double)], nans: Seq[String], corrType: CorrelationType) = this(
+  def this(corrs: Seq[(String, Double, Seq[Double])], corrType: CorrelationType) = this(
     featuresIn = corrs.map(_._1),
-    values = corrs.map(_._2),
-    nanCorrs = nans,
+    valuesWithLabel = corrs.map(_._2),
+    valuesWithFeatures = corrs.map(_._3),
     corrType = corrType
   )
@@ -303,8 +301,12 @@ case class Correlations
   def toMetadata(skipUnsupported: Boolean): Metadata = {
     val corrMeta = new MetadataBuilder()
     corrMeta.putStringArray(SanityCheckerNames.FeaturesIn, featuresIn.toArray)
-    corrMeta.putDoubleArray(SanityCheckerNames.Values, values.toArray)
-    corrMeta.putStringArray(SanityCheckerNames.CorrelationsWLabelIsNaN, nanCorrs.toArray)
+    corrMeta.putStringArray(SanityCheckerNames.ValuesLabel, valuesWithLabel.map(_.toString).toArray)
+    val fcMeta = new MetadataBuilder
+    if (valuesWithFeatures.nonEmpty) {
+      valuesWithFeatures.zip(featuresIn).foreach(c => fcMeta.putStringArray(c._2, c._1.map(_.toString).toArray))
+    }
+    corrMeta.putMetadata(SanityCheckerNames.ValuesFeatures, fcMeta.build())
     corrMeta.putString(SanityCheckerNames.CorrelationType, corrType.sparkName)
     corrMeta.build()
   }
@@ -319,29 +321,45 @@ case class Correlations
     } else {
       corrType
     }
-    new Correlations(featuresIn ++ corr.featuresIn, values ++ corr.values, nanCorrs ++ corr.nanCorrs, corrName)
+    Correlations(featuresIn ++ corr.featuresIn, valuesWithLabel ++ corr.valuesWithLabel,
+      valuesWithFeatures ++ corr.valuesWithFeatures, corrName)
   }
 }

 case object SanityCheckerSummary {

   def flatten(checkers: Seq[SanityCheckerSummary]): SanityCheckerSummary = {
-    val correlationsWLabel: Correlations = checkers.map(_.correlationsWLabel).reduce(_ + _)
+    val correlations: Correlations = checkers.map(_.correlations).reduce(_ + _)
     val dropped: Seq[String] = checkers.flatMap(_.dropped)
     val featuresStatistics: SummaryStatistics = checkers.map(_.featuresStatistics).reduce(_ + _)
     val names: Seq[String] = checkers.flatMap(_.names)
     val categoricalStats: Array[CategoricalGroupStats] = checkers.flatMap(_.categoricalStats).toArray
-    new SanityCheckerSummary(correlationsWLabel, dropped, featuresStatistics, names, categoricalStats)
+    new SanityCheckerSummary(correlations, dropped, featuresStatistics, names, categoricalStats)
   }

   private def correlationsFromMetadata(meta: Metadata): Correlations = {
     val wrapped = meta.wrapped
-    Correlations(
-      featuresIn = wrapped.getArray[String](SanityCheckerNames.FeaturesIn).toSeq,
-      values = wrapped.getArray[Double](SanityCheckerNames.Values).toSeq,
-      nanCorrs = wrapped.getArray[String](SanityCheckerNames.CorrelationsWLabelIsNaN).toSeq,
-      corrType = CorrelationType.withNameInsensitive(wrapped.get[String](SanityCheckerNames.CorrelationType))
-    )
+    val features = wrapped.getArray[String](SanityCheckerNames.FeaturesIn).toSeq
+    if (wrapped.underlyingMap.keySet.contains("correlationsWithLabelIsNaN")) { // old sanity checker meta
+      val nans = wrapped.getArray[String]("correlationsWithLabelIsNaN")
+      val labelCorr = wrapped.getArray[Double]("values").toSeq
+      Correlations(
+        featuresIn = features ++ nans,
+        valuesWithLabel = labelCorr ++ Seq.fill(nans.length)(Double.NaN),
+        valuesWithFeatures = Seq.empty,
+        corrType = CorrelationType.withNameInsensitive(wrapped.get[String](SanityCheckerNames.CorrelationType))
+      )
+    } else {
+      val fc = wrapped.get[Metadata](SanityCheckerNames.ValuesFeatures).wrapped
+      Correlations(
+        featuresIn = features,
+        valuesWithLabel = wrapped.getArray[String](SanityCheckerNames.ValuesLabel).toSeq.map(_.toDouble),
+        valuesWithFeatures =
+          if (fc.underlyingMap.isEmpty) Seq.empty
+          else features.map(f => fc.getArray[String](f).toSeq.map(_.toDouble)),
+        corrType = CorrelationType.withNameInsensitive(wrapped.get[String](SanityCheckerNames.CorrelationType))
+      )
+    }
   }

   private def statisticsFromMetadata(meta: Metadata): SummaryStatistics = {
@@ -382,18 +400,23 @@ case object SanityCheckerSummary {
     val wrapped = meta.wrapped
     // Try parsing as an older version of metadata (pre-3.3.0) if this doesn't work
     Try {
+      val corr =
+        if (wrapped.underlyingMap.contains("correlationsWithLabel")) {
+          wrapped.get[Metadata]("correlationsWithLabel")
+        } else wrapped.get[Metadata](SanityCheckerNames.Correlations)
       SanityCheckerSummary(
-        correlationsWLabel = correlationsFromMetadata(wrapped.get[Metadata](SanityCheckerNames.CorrelationsWLabel)),
+        correlations = correlationsFromMetadata(corr),
         dropped = wrapped.getArray[String](SanityCheckerNames.Dropped).toSeq,
         featuresStatistics = statisticsFromMetadata(wrapped.get[Metadata](SanityCheckerNames.FeaturesStatistics)),
         names = wrapped.getArray[String](SanityCheckerNames.Names).toSeq,
         categoricalStats = wrapped.getArray[Metadata](SanityCheckerNames.CategoricalStats)
           .map(categoricalGroupStatsFromMetadata)
       )
     } match {
       case Success(summary) => summary
       // Parse it under the old format
       case Failure(_) => throw new IllegalArgumentException(s"failed to parse SanityCheckerSummary from $meta")
     }
   }
 }
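
The serialized shape changes with this patch: label correlations are written as strings under valuesLabel (so NaN survives the round trip, removing the need for the separate correlationsWithLabelIsNaN list), and feature-feature correlations are nested under valuesFeatures, one string array per feature keyed by feature name. A sketch of building and serializing the new Correlations shape, with hypothetical values (toMetadata and CorrelationType.Pearson are used exactly as in the tests below):

// Two features with their label correlations and full feature-feature rows;
// NaN is representable because values are serialized as strings.
val corrs = Correlations(
  featuresIn = Seq("f1", "f2"),
  valuesWithLabel = Seq(0.3, Double.NaN),
  valuesWithFeatures = Seq(Seq(1.0, 0.8), Seq(0.8, 1.0)),
  corrType = CorrelationType.Pearson
)
val meta = corrs.toMetadata(skipUnsupported = false)
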
diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
index edb1f673dc..b2a7d06ff6 100644
--- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
+++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
@@ -528,7 +528,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     val labelName = "l"

     val summary = SanityCheckerSummary(
-      correlationsWLabel = Correlations(Seq("f0_f0_f2_1", "f0_f0_f3_2"), Seq(5.2, 5.3), Seq("f1_0"),
+      correlations = Correlations(Seq("f1_0", "f0_f0_f2_1", "f0_f0_f3_2"), Seq(Double.NaN, 5.2, 5.3), Seq.empty,
         CorrelationType.Pearson),
       dropped = Seq("f1_0"),
       featuresStatistics = SummaryStatistics(count = 3, sampleFraction = 0.01, max = Seq(0.1, 0.2, 0.3, 0.0),
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala
index 01256afc49..1cec33d518 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala
@@ -88,6 +88,7 @@ class BadFeatureZooTest extends FlatSpec with TestSparkContext with Logging {
     val genFeatureVector = Seq(rawCity, rawCountry, rawPickList, rawCurrency).transmogrify()
     val checkedFeatures = new SanityChecker()
       .setCheckSample(1.0)
+      .setMaxFeatureCorrelation(1.1)
       .setInput(labelData, genFeatureVector)
       .setRemoveBadFeatures(true)
       .getOutput()
@@ -245,6 +246,7 @@ class BadFeatureZooTest extends FlatSpec with TestSparkContext with Logging {
     val genFeatureVector = Seq(rawCity, rawCountry, rawPickList, rawCurrency).transmogrify()
     val checkedFeatures = new SanityChecker()
       .setCheckSample(1.0)
+      .setMaxFeatureCorrelation(1.1)
       .setInput(labelData, genFeatureVector)
       .setRemoveBadFeatures(true)
       .getOutput()
@@ -289,6 +291,7 @@ class BadFeatureZooTest extends FlatSpec with TestSparkContext with Logging {
     val genFeatureVector = Seq(rawCity, rawCountry, rawPickList, rawCurrency).transmogrify()
     val checkedFeatures = new SanityChecker()
       .setCheckSample(1.0)
+      .setMaxFeatureCorrelation(1.1)
       .setInput(labelData, genFeatureVector)
       .setRemoveBadFeatures(true)
       .setCategoricalLabel(false)
       .getOutput()
@@ -333,6 +336,7 @@ class BadFeatureZooTest extends FlatSpec with TestSparkContext with Logging {
     val genFeatureVector = Seq(rawCity, rawCountry, rawPickList, rawIntegralMap).transmogrify()
     val checkedFeatures = new SanityChecker()
       .setCheckSample(1.0)
+      .setMaxFeatureCorrelation(1.1)
       .setInput(labelData, genFeatureVector)
       .setRemoveBadFeatures(true)
       .getOutput()
@@ -642,6 +646,7 @@ class BadFeatureZooTest extends FlatSpec with TestSparkContext with Logging {
     val genFeatureVector = Seq(rawCity, rawCountry, rawPickList).transmogrify()
     val checkedFeatures = new SanityChecker()
       .setCheckSample(1.0)
+      .setMaxFeatureCorrelation(1.1)
       .setInput(labels, genFeatureVector)
       .setRemoveBadFeatures(true)
       .getOutput()
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerMetadataTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerMetadataTest.scala
index bf3d025626..7e2fc275d8 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerMetadataTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerMetadataTest.scala
@@ -43,7 +43,9 @@ import org.scalatest.junit.JUnitRunner
 class SanityCheckerMetadataTest extends FlatSpec with TestSparkContext {

   val summary = SanityCheckerSummary(
-    correlationsWLabel = Correlations(Seq("f2", "f3"), Seq(0.2, 0.3), Seq(), CorrelationType.Pearson),
+    correlations = Correlations(Seq("f2", "f3", "f4"), Seq(0.2, 0.3, Double.NaN), Seq(Seq(0.2, 0.3, 0.1),
+      Seq(0.3, 0.2, Double.NaN), Seq(0.1, 0.1, 0.1)),
+      CorrelationType.Pearson),
     dropped = Seq("f1"),
     featuresStatistics = SummaryStatistics(3, 0.01, Seq(0.1, 0.2, 0.3), Seq(0.1, 0.2, 0.3), Seq(0.1, 0.2, 0.3),
       Seq(0.1, 0.2, 0.3)),
@@ -79,10 +81,16 @@ class SanityCheckerMetadataTest extends FlatSpec with TestSparkContext {
     val retrieved = SanityCheckerSummary.fromMetadata(meta)
     retrieved.isInstanceOf[SanityCheckerSummary]

-    retrieved.correlationsWLabel.nanCorrs should contain theSameElementsAs summary.correlationsWLabel.nanCorrs
-    retrieved.correlationsWLabel.featuresIn should contain theSameElementsAs summary.correlationsWLabel.featuresIn
-    retrieved.correlationsWLabel.values should contain theSameElementsAs summary.correlationsWLabel.values
+    retrieved.correlations.featuresIn should contain theSameElementsAs summary.correlations.featuresIn
+    retrieved.correlations.valuesWithLabel.zip(summary.correlations.valuesWithLabel).foreach {
+      case (rd, id) => if (rd.isNaN) id.isNaN shouldBe true else rd shouldEqual id
+    }
+    retrieved.correlations.valuesWithFeatures.zip(summary.correlations.valuesWithFeatures).foreach {
+      case (rs, is) => rs.zip(is).foreach {
+        case (rd, id) => if (rd.isNaN) id.isNaN shouldBe true else rd shouldEqual id
+      }
+    }

     retrieved.categoricalStats.map(_.cramersV) should contain theSameElementsAs
       summary.categoricalStats.map(_.cramersV)
@@ -95,7 +103,7 @@ class SanityCheckerMetadataTest extends FlatSpec with TestSparkContext {
     retrieved.featuresStatistics.variance should contain theSameElementsAs summary.featuresStatistics.variance
     retrieved.names should contain theSameElementsAs summary.names

-    retrieved.correlationsWLabel.corrType shouldBe summary.correlationsWLabel.corrType
+    retrieved.correlations.corrType shouldBe summary.correlations.corrType

     retrieved.categoricalStats.flatMap(_.categoricalFeatures) should contain theSameElementsAs
       summary.categoricalStats.flatMap(_.categoricalFeatures)
@@ -112,4 +120,64 @@ class SanityCheckerMetadataTest extends FlatSpec with TestSparkContext {
     recovered.hashCode() shouldEqual summary.toMetadata().hashCode()
   }

+  it should "be able to read metadata from the old format" in {
+    val json = """{
+      |  "statistics" : {
+      |    "sampleFraction" : 0.01,
+      |    "count" : 3.0,
+      |    "variance" : [ 0.1, 0.2, 0.3 ],
+      |    "mean" : [ 0.1, 0.2, 0.3 ],
+      |    "min" : [ 0.1, 0.2, 0.3 ],
+      |    "max" : [ 0.1, 0.2, 0.3 ]
+      |  },
+      |  "names" : [ "f1", "f2", "f3" ],
+      |  "correlationsWithLabel" : {
+      |    "correlationType" : "pearson",
+      |    "values" : [ 0.2, 0.3 ],
+      |    "features" : [ "f2", "f3" ],
+      |    "correlationsWithLabelIsNaN" : [ "f1" ]
+      |  },
+      |  "categoricalStats" : [ {
+      |    "support" : [ 1.0 ],
+      |    "contingencyMatrix" : {
+      |      "2" : [ 12.0 ],
+      |      "1" : [ 12.0 ],
+      |      "0" : [ 12.0 ]
+      |    },
+      |    "maxRuleConfidence" : [ 1.0 ],
+      |    "categoricalFeatures" : [ "f4" ],
+      |    "pointwiseMutualInfoAgainstLabel" : {
+      |      "2" : [ -0.32 ],
+      |      "1" : [ 1.11 ],
+      |      "0" : [ 1.23 ]
+      |    },
+      |    "mutualInfo" : -1.22,
+      |    "cramersV" : 0.45,
+      |    "group" : "f4"
+      |  }, {
+      |    "support" : [ 1.0 ],
+      |    "contingencyMatrix" : {
+      |      "2" : [ 12.0 ],
+      |      "1" : [ 12.0 ],
+      |      "0" : [ 12.0 ]
+      |    },
+      |    "maxRuleConfidence" : [ 1.0 ],
+      |    "categoricalFeatures" : [ "f5" ],
+      |    "pointwiseMutualInfoAgainstLabel" : {
+      |      "2" : [ 0.99 ],
+      |      "1" : [ 0.34 ],
+      |      "0" : [ -2.11 ]
+      |    },
+      |    "mutualInfo" : -0.51,
+      |    "cramersV" : 0.11,
+      |    "group" : "f5"
+      |  } ],
+      |  "featuresDropped" : [ "f1" ]
+      |}""".stripMargin
+    val recovered = Metadata.fromJson(json)
+    val summaryRecovered = SanityCheckerSummary.fromMetadata(recovered)
+    summaryRecovered.correlations.valuesWithLabel.size shouldBe 3
+    summaryRecovered.correlations.valuesWithFeatures shouldBe Seq.empty
+  }
+
 }
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala
index cfc0c9168c..f75278b9bf 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala
@@ -31,6 +31,7 @@ package com.salesforce.op.stages.impl.preparators

 import com.salesforce.op._
+
 import com.salesforce.op.features.FeatureLike
 import com.salesforce.op.features.types._
 import com.salesforce.op.stages.MetadataParam
@@ -158,7 +159,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     val transformedData = model.transform(testData)
     val expectedFeatNames = expectedCorrFeatNames ++ expectedCorrFeatNamesIsNan

-    validateTransformerOutput(outputColName, transformedData, expectedFeatNames, expectedCorrFeatNames,
+    validateTransformerOutput(outputColName, transformedData, expectedFeatNames,
       featuresToDrop, expectedCorrFeatNamesIsNan)
   }
@@ -183,6 +184,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     checker.getOrDefault(checker.maxCorrelation) shouldBe SanityChecker.MaxCorrelation
     checker.getOrDefault(checker.minVariance) shouldBe SanityChecker.MinVariance
     checker.getOrDefault(checker.minCorrelation) shouldBe SanityChecker.MinCorrelation
+    checker.getOrDefault(checker.maxFeatureCorrelation) shouldBe SanityChecker.MaxFeatureCorr
     checker.getOrDefault(checker.correlationType) shouldBe CorrelationType.Pearson.entryName
     checker.getOrDefault(checker.removeBadFeatures) shouldBe SanityChecker.RemoveBadFeatures
   }
@@ -209,7 +211,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     outputColumns.columns.length + summary.dropped.length shouldEqual testMetadata.columns.length
     val expectedFeatNames = expectedCorrFeatNames ++ expectedCorrFeatNamesIsNan

-    validateTransformerOutput(outputColName, transformedData, expectedFeatNames, expectedCorrFeatNames,
+    validateTransformerOutput(outputColName, transformedData, expectedFeatNames,
       featuresToDrop, expectedCorrFeatNamesIsNan)
   }
@@ -228,7 +230,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     val transformedData = model.transform(testData)

     val expectedFeatNames = expectedCorrFeatNames ++ expectedCorrFeatNamesIsNan
-    validateTransformerOutput(outputColName, transformedData, expectedFeatNames, expectedCorrFeatNames,
+    validateTransformerOutput(outputColName, transformedData, expectedFeatNames,
       Seq(), expectedCorrFeatNamesIsNan)
   }
@@ -318,9 +320,9 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
       val summaryMeta = model.getMetadata().getSummaryMetadata()

       val correlations = summaryMeta
-        .getMetadata(SanityCheckerNames.CorrelationsWLabel)
-        .getDoubleArray(SanityCheckerNames.Values)
-      correlations(0)
+        .getMetadata(SanityCheckerNames.Correlations)
+        .getStringArray(SanityCheckerNames.ValuesLabel)
+      correlations(0).toDouble
     }

     val sanityCheckerSpearman = new SanityChecker()
@@ -384,7 +386,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     val featuresWithNaNCorr = Seq("textMap_7")

     val expectedFeatNames = featuresWithCorr ++ featuresWithNaNCorr
-    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames, featuresWithCorr,
+    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames,
       featuresToDrop, featuresWithNaNCorr)
   }
@@ -426,10 +428,11 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     )

     val expectedFeatNames = featuresWithCorr ++ featuresWithNaNCorr
-    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames, featuresWithCorr,
+    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames,
       featuresToDrop, featuresWithNaNCorr)
   }

+
   it should "not calculate correlations on hashed text features if asked not to (using SmartTextVectorizer)" in {
     val smartMapVectorized = new SmartTextMapVectorizer[TextMap]()
       .setMaxCardinality(2).setNumFeatures(8).setMinSupport(1).setTopK(2).setPrependFeatureName(true)
@@ -445,6 +448,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
       .setCorrelationExclusion(CorrelationExclusion.HashedText)
       .setMinCorrelation(0.0)
       .setMaxCorrelation(0.8)
+      .setMaxFeatureCorrelation(1.0)
       .setMaxCramersV(0.8)
       .setInput(targetResponse, smartMapVectorized)
       .getOutput()
@@ -458,13 +462,41 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
       "textMap_6", "textMap_7", "textMap_color_NullIndicatorValue_8", "textMap_fruit_NullIndicatorValue_9",
       "textMap_beverage_NullIndicatorValue_10"
     )
-    val featuresWithCorr = Seq("textMap_color_NullIndicatorValue_8", "textMap_fruit_NullIndicatorValue_9",
-      "textMap_beverage_NullIndicatorValue_10"
-    )
+    val featuresIgnore = Seq("textMap_0", "textMap_1", "textMap_2", "textMap_3", "textMap_4", "textMap_5",
+      "textMap_6", "textMap_7")
     val featuresWithNaNCorr = Seq.empty[String]

-    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames, featuresWithCorr,
-      featuresToDrop, featuresWithNaNCorr)
+    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames,
+      featuresToDrop, featuresWithNaNCorr, featuresIgnore)
+  }
+
+  it should "throw out duplicate features if their correlation is above the max feature corr" in {
+
+    val checkedFeatures = new SanityChecker()
+      .setCheckSample(1.0)
+      .setRemoveBadFeatures(true)
+      .setRemoveFeatureGroup(false)
+      .setProtectTextSharedHash(false)
+      .setCorrelationExclusion(CorrelationExclusion.HashedText)
+      .setMinVariance(-0.1)
+      .setMinCorrelation(-0.1)
+      .setMaxCorrelation(1.1)
+      .setMaxFeatureCorrelation(0.9)
+      .setMaxCramersV(1.0)
+      .setInput(targetLabel, featureVector)
+      .getOutput()
+
+    checkedFeatures.originStage shouldBe a[SanityChecker]
+
+    val transformed = new OpWorkflow().setResultFeatures(featureVector, checkedFeatures).transform(testData)
+
+    val featuresToDrop = Seq("testFeatNegCor_4")
+    val expectedFeatNames = Seq("age_0", "height_1", "height_null_2", "gender_3", "testFeatNegCor_4")
+    val featuresIgnore = Seq.empty
+    val featuresWithNaNCorr = Seq("age_0")
+
+    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames,
+      featuresToDrop, featuresWithNaNCorr, featuresIgnore)
   }

   it should "not calculate correlations on hashed text features if asked not to (using vectorizer)" in {
@@ -480,6 +512,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
       .setMinVariance(-0.1)
       .setMinCorrelation(0.0)
       .setMaxCorrelation(0.8)
+      .setMaxFeatureCorrelation(1.0)
       .setMaxCramersV(0.8)
       .setInput(targetResponse, vectorized)
       .getOutput()
@@ -492,13 +525,11 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     val expectedFeatNames = (0 until 512).map(i => "textMap_" + i.toString) ++
       Seq("textMap_beverage_NullIndicatorValue_512", "textMap_color_NullIndicatorValue_513",
         "textMap_fruit_NullIndicatorValue_514")
-    val featuresWithCorr = Seq("textMap_beverage_NullIndicatorValue_512", "textMap_color_NullIndicatorValue_513",
-      "textMap_fruit_NullIndicatorValue_514"
-    )
+    val featuresIgnore = (0 until 512).map(i => "textMap_" + i.toString)
     val featuresWithNaNCorr = Seq.empty[String]

-    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames, featuresWithCorr,
-      featuresToDrop, featuresWithNaNCorr)
+    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames,
+      featuresToDrop, featuresWithNaNCorr, featuresIgnore)
   }

   it should "only calculate correlations between feature and the label if requested" in {
@@ -532,7 +563,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     val featuresWithNaNCorr = Seq("textMap_7")

     val expectedFeatNames = featuresWithCorr ++ featuresWithNaNCorr
-    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames, featuresWithCorr,
+    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames,
       featuresToDrop, featuresWithNaNCorr)
   }
@@ -579,7 +610,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP

     val expectedFeatNames = featuresWithCorr ++ featuresWithNaNCorr

-    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames, featuresWithCorr,
+    validateTransformerOutput(checkedFeatures.name, transformed, expectedFeatNames,
       featuresToDrop, featuresWithNaNCorr)
   }
@@ -633,9 +664,9 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     outputColName: String,
     transformedData: DataFrame,
     expectedFeatNames: Seq[String],
-    expectedCorrFeatNames: Seq[String],
     expectedFeaturesToDrop: Seq[String],
-    expectedCorrFeatNamesIsNan: Seq[String]
+    expectedCorrFeatNamesIsNan: Seq[String],
+    ignoredNames: Seq[String] = Seq.empty
   ): Unit = {
     transformedData.select(outputColName).collect().foreach {
       case Row(features: Vector) => features.toArray.length equals
@@ -644,11 +675,12 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
     val metadata: Metadata = getMetadata(outputColName, transformedData)
     val summary = SanityCheckerSummary.fromMetadata(metadata.getSummaryMetadata())

     summary.names.slice(0, summary.names.size - 1) should contain theSameElementsAs expectedFeatNames
-    summary.correlationsWLabel.nanCorrs should contain theSameElementsAs expectedCorrFeatNamesIsNan
-    summary.correlationsWLabel.featuresIn should contain theSameElementsAs expectedCorrFeatNames
+    summary.correlations.valuesWithLabel.zip(summary.names).collect {
+      case (corr, name) if corr.isNaN => name
+    } should contain theSameElementsAs expectedCorrFeatNamesIsNan
+    summary.correlations.featuresIn should contain theSameElementsAs expectedFeatNames.diff(ignoredNames)
     summary.dropped should contain theSameElementsAs expectedFeaturesToDrop
   }
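
End to end, the summary (including the new correlation layout) can be read back from the checker's output column metadata the way validateTransformerOutput does above. A sketch assuming transformed is the scored DataFrame and checkedFeatures is the checker output, as in the tests (names hypothetical):

import com.salesforce.op.stages.impl.preparators.SanityCheckerSummary
import com.salesforce.op.utils.spark.RichMetadata._

val meta = transformed.schema(checkedFeatures.name).metadata
val summary = SanityCheckerSummary.fromMetadata(meta.getSummaryMetadata())
// NaN label correlations are now regular entries rather than a separate name list
val nanFeatures = summary.correlations.featuresIn
  .zip(summary.correlations.valuesWithLabel)
  .collect { case (name, corr) if corr.isNaN => name }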