Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove duplicate features using sanity checker feature to feature correlations #476

Merged
merged 12 commits into from
May 19, 2020
8 changes: 2 additions & 6 deletions core/src/main/scala/com/salesforce/op/ModelInsights.scala
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ case object ModelInsights {
derivedFeatureGroup = h.grouping,
derivedFeatureValue = h.indicatorValue,
excluded = Option(s.dropped.contains(h.columnName)),
corr = getCorr(s.correlationsWLabel, h.columnName),
corr = getCorr(s.correlations, h.columnName),
cramersV = catGroupIndex.map(i => s.categoricalStats(i).cramersV),
mutualInformation = catGroupIndex.map(i => s.categoricalStats(i).mutualInfo),
pointwiseMutualInformation = (catGroupIndex, catIndexWithinGroup) match {
Expand Down Expand Up @@ -747,11 +747,7 @@ case object ModelInsights {
if (index >= 0) values.mapValues(_ (index)) else Map.empty

private def getCorr(corr: Correlations, name: String): Option[Double] = {
getIfExists(corr.featuresIn.indexOf(name), corr.values).orElse {
val j = corr.nanCorrs.indexOf(name)
if (j >= 0) Option(Double.NaN)
else None
}
getIfExists(corr.featuresIn.indexOf(name), corr.valuesWithLabel)
}

private[op] def descaleLRContrib(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ trait RichNumericFeature {
sampleUpperLimit: Int = SanityChecker.SampleUpperLimit,
maxCorrelation: Double = SanityChecker.MaxCorrelation,
minCorrelation: Double = SanityChecker.MinCorrelation,
maxFeatureCorr: Double = SanityChecker.MaxFeatureCorr,
maxCramersV: Double = SanityChecker.MaxCramersV,
correlationType: CorrelationType = SanityChecker.CorrelationTypeDefault,
minVariance: Double = SanityChecker.MinVariance,
Expand All @@ -494,6 +495,7 @@ trait RichNumericFeature {
.setSampleUpperLimit(sampleUpperLimit)
.setMaxCorrelation(maxCorrelation)
.setMinCorrelation(minCorrelation)
.setMaxFeatureCorr(maxFeatureCorr)
.setMaxCramersV(maxCramersV)
.setCorrelationType(correlationType)
.setMinVariance(minVariance)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.log4j.Level
import org.apache.spark.ml.linalg.{Vectors => NewVectors}
import org.apache.spark.ml.param.{BooleanParam, DoubleParam, Param, Params}
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.sql.types.Metadata

Expand Down Expand Up @@ -98,7 +99,8 @@ object DerivedFeatureFilterUtils {
labelNameAndIndex: Option[(String, Int)] = None,
corrsWithLabel: Array[Double] = Array.empty,
corrIndices: Array[Int] = Array.empty,
categoricalStats: Array[CategoricalGroupStats] = Array.empty
categoricalStats: Array[CategoricalGroupStats] = Array.empty,
corrMatrix: Option[Matrix] = None
): Array[ColumnStatistics] = {
// precompute all statistics to avoid rebuilding the vectors every time
val means = statsSummary.mean
Expand All @@ -108,7 +110,7 @@ object DerivedFeatureFilterUtils {
val variances = statsSummary.variance
val cramersVMap = categoricalStats.flatMap(f => f.categoricalFeatures.map(c => c -> f.cramersV))
.toMap[String, Double]
val numCorrIndices = corrIndices.length
val featureCorrs = corrMatrix.map(_.rowIter.map(_.toArray).toArray)

def maxByParent(seq: Seq[(String, Double)]) = seq.groupBy(_._1).map { case (k, v) =>
// Filter out the NaNs because max(3.4, NaN) = NaN, and we still want the keep the largest correlation
Expand Down Expand Up @@ -174,6 +176,10 @@ object DerivedFeatureFilterUtils {
case ind => Option(corrsWithLabel(ind))
},
cramersV = cramersVMap.get(name),
featureCorrs = corrIndices.indexOf(i) match {
nicodv marked this conversation as resolved.
Show resolved Hide resolved
case -1 => Seq.empty
case ind => featureCorrs.getOrElse(Array.empty).map(_.apply(ind)).dropRight(1) // drop label corr
},
parentCorr = getParentValue(col, corrParent, corrParentNoKeys),
parentCramersV = getParentValue(col, cramersVParent, cramersVParentNoKeys),
maxRuleConfidences = maxRuleConfMap.getOrElse(name, Seq.empty),
Expand All @@ -195,6 +201,7 @@ object DerivedFeatureFilterUtils {
variance = variances(labelColumnIndex),
corrLabel = None,
cramersV = None,
featureCorrs = Seq.empty,
parentCorr = None,
parentCramersV = None,
maxRuleConfidences = Seq.empty,
Expand All @@ -214,6 +221,7 @@ object DerivedFeatureFilterUtils {
* @param minVariance Min variance for dropping features
* @param minCorrelation Min correlation with label for dropping features
* @param maxCorrelation Max correlation with label for dropping features
* @param maxFeatureCorr Max correlation between features for dropping the later features
* @param maxCramersV Max Cramer's V for dropping categorical features
* @param maxRuleConfidence Max allowed confidence of association rules for dropping features
* @param minRequiredRuleSupport Threshold for association rule
Expand All @@ -229,6 +237,7 @@ object DerivedFeatureFilterUtils {
minVariance: Double,
minCorrelation: Double = 0.0,
maxCorrelation: Double = 1.0,
maxFeatureCorr: Double = 1.0,
maxCramersV: Double = 1.0,
maxRuleConfidence: Double = 1.0,
minRequiredRuleSupport: Double = 1.0,
Expand Down Expand Up @@ -256,6 +265,7 @@ object DerivedFeatureFilterUtils {
minVariance = minVariance,
minCorrelation = minCorrelation,
maxCorrelation = maxCorrelation,
maxFeatureCorr = maxFeatureCorr,
maxCramersV = maxCramersV,
maxRuleConfidence = maxRuleConfidence,
minRequiredRuleSupport = minRequiredRuleSupport,
Expand Down Expand Up @@ -314,6 +324,7 @@ private[op] case class ColumnStatistics
cramersV: Option[Double],
parentCorr: Option[Double],
parentCramersV: Option[Double],
featureCorrs: Seq[Double],
// Need to be able to hold up to two maxRuleConfidences or supports for the case of nullIndicator columns coming
// from non-categorical features (since they will correspond to a 2x2 contingency matrix)
maxRuleConfidences: Seq[Double],
Expand Down Expand Up @@ -341,6 +352,7 @@ private[op] case class ColumnStatistics
minVariance: Double,
maxCorrelation: Double,
minCorrelation: Double,
maxFeatureCorr: Double,
maxCramersV: Double,
maxRuleConfidence: Double,
minRequiredRuleSupport: Double,
Expand All @@ -361,6 +373,10 @@ private[op] case class ColumnStatistics
corrLabel.filter(Math.abs(_) > maxCorrelation).map(corr =>
s"correlation $corr higher than max correlation $maxCorrelation"
),
column.flatMap{ case cl => featureCorrs.take(cl.index).find(Math.abs(_) > maxFeatureCorr).map(corr =>
s"this feature has correlations $corr with another feature higher than max feature-feature" +
s" correlation $maxFeatureCorr")
},
cramersV.filter(_ > maxCramersV).map(cv =>
s"Cramer's V $cv higher than max Cramer's V $maxCramersV"
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,24 @@ trait SanityCheckerParams extends DerivedFeatureFilterParams {
final val maxCorrelation = new DoubleParam(
parent = this, name = "maxCorrelation",
doc = "Maximum correlation (absolute value) allowed between a feature in the feature vector and the label",
isValid = ParamValidators.inRange(lowerBound = 0.0, upperBound = 1.0, lowerInclusive = true, upperInclusive = true)
isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
nicodv marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

@Jauntbox Jauntbox May 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did these ranges need to be changed? We shouldn't get correlations outside of [0, 1], right?

)
def setMaxCorrelation(value: Double): this.type = set(maxCorrelation, value)
def getMaxCorrelation: Double = $(maxCorrelation)

final val maxFeatureCorr = new DoubleParam(
Copy link
Contributor

@gerashegalov gerashegalov May 19, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

better spell out Correlation in a public parameter

parent = this, name = "maxFeatureCorr",
doc = "Maximum correlation (absolute value) allowed between two features. When this value is exceeded the second" +
" feature in the correlated pair will be dropped.",
isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
)
def setMaxFeatureCorr(value: Double): this.type = set(maxFeatureCorr, value)
def getMaxFeatureCorr: Double = $(maxFeatureCorr)

final val minCorrelation = new DoubleParam(
parent = this, name = "minCorrelation",
doc = "Minimum correlation (absolute value) allowed between a feature in the feature vector and the label",
isValid = ParamValidators.inRange(lowerBound = 0.0, upperBound = 1.0, lowerInclusive = true, upperInclusive = true)
isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here - why are these changed?

)
def setMinCorrelation(value: Double): this.type = set(minCorrelation, value)
def getMinCorrelation: Double = $(minCorrelation)
Expand Down Expand Up @@ -190,6 +199,7 @@ trait SanityCheckerParams extends DerivedFeatureFilterParams {
sampleUpperLimit -> SanityChecker.SampleUpperLimit,
maxCorrelation -> SanityChecker.MaxCorrelation,
minCorrelation -> SanityChecker.MinCorrelation,
maxFeatureCorr -> SanityChecker.MaxFeatureCorr,
minVariance -> SanityChecker.MinVariance,
maxCramersV -> SanityChecker.MaxCramersV,
removeBadFeatures -> SanityChecker.RemoveBadFeatures,
Expand Down Expand Up @@ -280,7 +290,7 @@ class SanityChecker(uid: String = UID[SanityChecker])
val colsCleaned = cols.map(_._2).filterNot(c => repeats.contains(c.index))
(group, colsCleaned.map(_.makeColName()), colsCleaned.map(_.index),
colsCleaned.exists(_.hasParentOfSubType[MultiPickList]))
}
}

colIndicesByGrouping.map {
case (group, colNames, valueIndices, isMpl) =>
Expand Down Expand Up @@ -443,13 +453,13 @@ class SanityChecker(uid: String = UID[SanityChecker])
else ((0 until featureSize + 1).toArray, vectorRows)
val numCorrIndices = corrIndices.length

// TODO: We are still calculating the full correlation matrix when featureLabelCorrOnly is false, but are not
// TODO: storing it anywhere. If we want access to the inter-feature correlations then need to refactor this a bit.
val corrsWithLabel = if ($(featureLabelCorrOnly)) {
OpStatistics.computeCorrelationsWithLabel(vectorRowsForCorr, colStats, count)
val (corrMatrix, corrsWithLabel) = if ($(featureLabelCorrOnly)) {
None -> OpStatistics.computeCorrelationsWithLabel(vectorRowsForCorr, colStats, count)
}
else {
val matrix = Statistics.corr(vectorRowsForCorr, getCorrelationType.sparkName)
Option(matrix) -> matrix.rowIter.map(_.apply(numCorrIndices - 1)).toArray
}
else Statistics.corr(vectorRowsForCorr, getCorrelationType.sparkName).rowIter
.map(_.apply(numCorrIndices - 1)).toArray

// Only calculate this if the label is categorical, so ignore if user has flagged label as not categorical
val categoricalStats =
Expand All @@ -467,7 +477,8 @@ class SanityChecker(uid: String = UID[SanityChecker])
Option((in1.name, featureSize)), // label column goes at end of vector
corrsWithLabel,
corrIndices,
categoricalStats
categoricalStats,
corrMatrix
)
stats.foreach { stat => logInfo(stat.toString) }

Expand All @@ -478,6 +489,7 @@ class SanityChecker(uid: String = UID[SanityChecker])
$(minVariance),
$(minCorrelation),
$(maxCorrelation),
$(maxFeatureCorr),
$(maxCramersV),
$(maxRuleConfidence),
$(minRequiredRuleSupport),
Expand Down Expand Up @@ -542,6 +554,7 @@ object SanityChecker {
val SampleLowerLimit = 1E3.toInt
val SampleUpperLimit = 1E6.toInt
val MaxCorrelation = 0.95
val MaxFeatureCorr = 0.99
val MinCorrelation = 0.0
val MinVariance = 1E-5
val MaxCramersV = 0.95
Expand Down
Loading