Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Standalone minimum variance estimator #463

Merged
merged 18 commits into from
Mar 6, 2020
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.classification.{Impurity, OpRandomForestClassifier}
import com.salesforce.op.stages.impl.feature.{DropIndicesByTransformer, OpLDA}
import com.salesforce.op.stages.impl.preparators.MinVarianceFilter
import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
import org.apache.spark.ml.feature.{IDF, IDFModel}
Expand Down Expand Up @@ -74,7 +75,8 @@ trait RichVectorFeature {
* @param thresholds
* @return
*/
def randomForest(
def randomForest
(
label: FeatureLike[RealNN],
maxDepth: Int = 5,
maxBins: Int = 32,
Expand Down Expand Up @@ -132,13 +134,34 @@ trait RichVectorFeature {
* Allows columns to be dropped from a feature vector based on properties of the
* metadata about what is contained in each column (will work only on vectors)
* created with [[OpVectorMetadata]]
*
* @param matchFn function that goes from [[OpVectorColumnMetadata]] to boolean for dropping
* columns (cases that evaluate to true will be dropped)
* @return new Vector with columns removed by function
*/
def dropIndicesBy(matchFn: OpVectorColumnMetadata => Boolean): FeatureLike[OPVector] = {
new DropIndicesByTransformer(matchFn = matchFn).setInput(f).getOutput()
}

/**
* Apply filter that removes computed features that have variance <= `minVariance``
*
* @param minVariance
* @param removeBadFeatures
* @return
*/
def minVariance
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps name this filterMinVariance

(
minVariance: Double = MinVarianceFilter.MinVariance,
removeBadFeatures: Boolean = MinVarianceFilter.RemoveBadFeatures
): FeatureLike[OPVector] = {
val filter = new MinVarianceFilter()
filter.setInput(f)
.setMinVariance(minVariance)
.setRemoveBadFeatures(removeBadFeatures)
.getOutput()
}

}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
/*
* Copyright (c) 2017, Salesforce.com, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package com.salesforce.op.stages.impl.preparators

import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, VectorConversions}
import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
import com.salesforce.op.utils.spark.RichMetadata._
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
import org.apache.log4j.{Level, LogManager}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors => NewVectors}
import org.apache.spark.ml.param.{BooleanParam, DoubleParam, Param, Params}
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.slf4j.impl.Log4jLoggerAdapter

import scala.util.Try


trait MinVarianceFilterParams extends Params {

final val logLevel = new Param[String](
parent = this, name = "logLevel",
doc = "sets log level (INFO, WARN, ERROR, DEBUG etc.)",
isValid = (s: String) => Try(Level.toLevel(s)).isSuccess
)

private[op] def setLogLevel(level: Level): this.type = set(logLevel, level.toString)

final val removeBadFeatures = new BooleanParam(
parent = this, name = "removeBadFeatures",
doc = "If set to true, this will automatically remove all the bad features from the feature vector"
)

def setRemoveBadFeatures(value: Boolean): this.type = set(removeBadFeatures, value)

def getRemoveBadFeatures: Boolean = $(removeBadFeatures)

final val minVariance = new DoubleParam(
parent = this, name = "minVariance",
doc = "Minimum amount of variance allowed for each feature"
)

def setMinVariance(value: Double): this.type = set(minVariance, value)

def getMinVariance: Double = $(minVariance)

setDefault(
removeBadFeatures -> MinVarianceFilter.RemoveBadFeatures,
minVariance -> MinVarianceFilter.MinVariance
)
}

/**
* The MinVarianceFilter checks that computed features have a minimum variance
*
* Like SanityChecker, the Estimator step outputs statistics on incoming data, as well as the
* names of features which should be dropped from the feature vector. And the transformer step
* applies the action of actually removing the low variance features from the feature vector
*
* Two distinctions from SanityChecker:
* (1) no label column as input; and
* (2) only filters features by variance
*/
class MinVarianceFilter
(
operationName: String = classOf[MinVarianceFilter].getSimpleName,
uid: String = UID[MinVarianceFilter]
) extends UnaryEstimator[OPVector, OPVector](operationName = operationName, uid = uid)
with MinVarianceFilterParams {

private def makeColumnStatistics
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rather than copy pasting these functions is it possible to move them to a shared companion object and just have default empty values for the parts you dont use?

(
metaCols: Seq[OpVectorColumnMetadata],
statsSummary: MultivariateStatisticalSummary
): Array[ColumnStatistics] = {
val means = statsSummary.mean
val maxs = statsSummary.max
val mins = statsSummary.min
val count = statsSummary.count
val variances = statsSummary.variance
val featuresStats = metaCols.map {
col =>
val i = col.index
val name = col.makeColName()
ColumnStatistics(
name = name,
column = Some(col),
isLabel = false,
count = count,
mean = means(i),
min = mins(i),
max = maxs(i),
variance = variances(i),
corrLabel = None,
cramersV = None,
parentCorr = None,
parentCramersV = None,
maxRuleConfidences = Seq.empty,
supports = Seq.empty
)
}
featuresStats.toArray
}

private def getFeaturesToDrop(stats: Array[ColumnStatistics]): Array[ColumnStatistics] = {
val minVar = $(minVariance)
// Including dummy params in reasonsToRemove to allow re-use of `ColumnStatistics`
for {
col <- stats
reasons = col.reasonsToRemove(
minVariance = minVar,
minCorrelation = 0.0,
maxCorrelation = 1.0,
maxCramersV = 1.0,
maxRuleConfidence = 1.0,
minRequiredRuleSupport = 1.0,
removeFeatureGroup = false,
protectTextSharedHash = true,
removedGroups = Seq.empty
)
if reasons.nonEmpty
} yield {
logWarning(s"Removing ${col.name} due to: ${reasons.mkString(",")}")
col
}
}

override def fitFn(data: Dataset[OPVector#Value]): UnaryModel[OPVector, OPVector] = {
// Set the desired log level
if (isSet(logLevel)) {
Option(log).collect { case l: Log4jLoggerAdapter =>
LogManager.getLogger(l.getName).setLevel(Level.toLevel($(logLevel)))
}
}
val removeBad = $(removeBadFeatures)

logInfo("Getting vector rows")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do these have to be log info statements? debug seem more appropriate.

val vectorRows: RDD[OldVector] = data.rdd.map {
case sparse: SparseVector => OldVectors.sparse(sparse.size, sparse.indices, sparse.values)
case dense: DenseVector => OldVectors.dense(dense.toArray)
}.persist()

logInfo("Calculating columns stats")
val colStats = Statistics.colStats(vectorRows)
val count = colStats.count
require(count > 0, "Sample size cannot be zero")

val featureSize = vectorRows.first().size
require(featureSize > 0, "Feature vector passed in is empty, check your vectorizers")

// handle any possible serialization errors if users give us wrong metadata
val vectorMeta = try {
OpVectorMetadata(getInputSchema()(in1.name))
} catch {
case e: NoSuchElementException =>
throw new IllegalArgumentException("Vector input metadata is malformed: ", e)
}

require(featureSize == vectorMeta.size,
s"Number of columns in vector metadata (${vectorMeta.size}) did not match number of columns in data" +
s"($featureSize), check your vectorizers \n metadata=$vectorMeta")
val vectorMetaColumns = vectorMeta.columns
val featureNames = vectorMetaColumns.map(_.makeColName())

logInfo("Logging all statistics")
val stats = makeColumnStatistics(vectorMetaColumns, colStats)
stats.foreach { stat => logInfo(stat.toString) }

logInfo("Calculating features to remove")
val toDropFeatures = if (removeBad) getFeaturesToDrop(stats) else Array.empty[ColumnStatistics]
val toDropSet = toDropFeatures.flatMap(_.column).toSet
val outputFeatures = vectorMetaColumns.filterNot { col => toDropSet.contains(col) }
val indicesToKeep = outputFeatures.map(_.index)

val outputMeta = OpVectorMetadata(getOutputFeatureName, outputFeatures, vectorMeta.history)

val summary = new MinVarianceSummary(
dropped = toDropFeatures.map(_.name),
featuresStatistics = new SummaryStatistics(colStats, sample = 1.0),
names = featureNames
)

setMetadata(outputMeta.toMetadata.withSummaryMetadata(summary.toMetadata()))

vectorRows.unpersist(blocking = false)

require(indicesToKeep.length > 0,
"The minimum variance filter has dropped all of your features, check your input data or your threshold")

new MinVarianceFilterModel(
indicesToKeep = indicesToKeep,
removeBadFeatures = removeBad,
operationName = operationName,
uid = uid
)
}
}

final class MinVarianceFilterModel private[op]
(
val indicesToKeep: Array[Int],
val removeBadFeatures: Boolean,
operationName: String,
uid: String
) extends UnaryModel[OPVector, OPVector](operationName = operationName, uid = uid) {

def transformFn: OPVector => OPVector = feature => {
if (!removeBadFeatures) feature
else {
val vals = new Array[Double](indicesToKeep.length)
feature.value.foreachActive((i, v) => {
val k = indicesToKeep.indexOf(i)
if (k >= 0) vals(k) = v
})
NewVectors.dense(vals).compressed.toOPVector
}
}
}

object MinVarianceFilter {
val RemoveBadFeatures = false
val MinVariance = 1E-5
}
Loading