-
Notifications
You must be signed in to change notification settings - Fork 393
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Possibility to Return top K positives and top K negatives for LOCO #264
Changes from 6 commits
84e2387
9fa76e5
9bce44f
128f62a
0a284bd
aea94de
97bb413
2ef84ee
0cec1c9
756cf0c
f3de950
19a9199
ea0069d
fba47ef
21a653f
dac78ba
a137d61
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,22 +33,29 @@ package com.salesforce.op.stages.impl.insights | |
import com.salesforce.op.UID | ||
import com.salesforce.op.features.types._ | ||
import com.salesforce.op.stages.base.unary.UnaryTransformer | ||
import com.salesforce.op.stages.impl.selector.SelectedModel | ||
import com.salesforce.op.stages.impl.selector.{ProblemType, SelectedModel} | ||
import com.salesforce.op.stages.sparkwrappers.specific.OpPredictorWrapperModel | ||
import com.salesforce.op.stages.sparkwrappers.specific.SparkModelConverter._ | ||
import com.salesforce.op.utils.spark.OpVectorMetadata | ||
import enumeratum.{Enum, EnumEntry} | ||
import org.apache.spark.annotation.Experimental | ||
import org.apache.spark.ml.Model | ||
import org.apache.spark.ml.linalg.Vectors | ||
import org.apache.spark.ml.param.IntParam | ||
import org.apache.spark.ml.param.{IntParam, Param} | ||
|
||
import scala.collection.mutable.PriorityQueue | ||
|
||
/** | ||
* Creates record level insights for model predictions. Takes the model to explain as a constructor argument. | ||
* The input feature is the feature vector fed into the model. | ||
* @param model model instance that you wish to explain | ||
* @param uid uid for instance | ||
* | ||
* The map's contents are different regarding the value of the topKStrategy param (only for Binary Classification | ||
* and Regression) : | ||
* - If PositiveNegative, returns at most 2 * topK elements : the topK most positive and the topK most negative | ||
* derived features based on the LOCO insight. | ||
* - If Abs, returns at most topK elements : the topK derived features having highest absolute value of LOCO score. | ||
* @param model model instance that you wish to explain | ||
* @param uid uid for instance | ||
*/ | ||
@Experimental | ||
class RecordInsightsLOCO[T <: Model[T]] | ||
|
@@ -61,10 +68,22 @@ class RecordInsightsLOCO[T <: Model[T]] | |
parent = this, name = "topK", | ||
doc = "Number of insights to keep for each record" | ||
) | ||
|
||
def setTopK(value: Int): this.type = set(topK, value) | ||
|
||
def getTopK: Int = $(topK) | ||
|
||
setDefault(topK -> 20) | ||
|
||
final val topKStrategy = new Param[String](parent = this, name = "topKStrategy", | ||
doc = "Whether returning topK based on absolute value or topK positives and negatives. Only for Binary " + | ||
"Classification and Regression." | ||
) | ||
|
||
def setTopKStrategy(strat: TopKStrategy): this.type = set(topKStrategy, strat.entryName) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please name fully - |
||
|
||
setDefault(topKStrategy, TopKStrategy.Abs.entryName) | ||
|
||
private val modelApply = model match { | ||
case m: SelectedModel => m.transformFn | ||
case m: OpPredictorWrapperModel[_] => m.transformFn | ||
|
@@ -74,9 +93,28 @@ class RecordInsightsLOCO[T <: Model[T]] | |
|
||
private lazy val featureInfo = OpVectorMetadata(getInputSchema()(in1.name)).getColumnHistory().map(_.toJson(false)) | ||
|
||
private lazy val vectorDummy = Array.fill(featureInfo.length)(0.0).toOPVector | ||
|
||
private[insights] lazy val problemType = modelApply(labelDummy, vectorDummy).score.length match { | ||
case 0 => ProblemType.Unknown | ||
case 1 => ProblemType.Regression | ||
case 2 => ProblemType.BinaryClassification | ||
case n if (n > 2) => { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was thinking about the multiclass issue - I think it makes sense to return the difference in score for the winning category so maybe instead of having this find the problem type it should return the index to examine (also move it down to the base score function line 117: val baseResult = modelApply(labelDummy, features) and then always use this index for pulling out the diff for the heaps |
||
log.info("MultiClassification Problem : Top K LOCOs by absolute value") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not a very user friendly message. Let's log some meaningful message for any problem type with some clear and comprehensible explanation. |
||
ProblemType.MultiClassification | ||
} | ||
} | ||
|
||
private def computeDiffs(i: Int, oldInd: Int, featureArray: Array[(Int, Double)], featureSize: Int, | ||
baseScore: Array[Double]): Array[Double] = { | ||
featureArray.update(i, (oldInd, 0)) | ||
val score = modelApply(labelDummy, OPVector(Vectors.sparse(featureSize, featureArray))).score | ||
val diffs = baseScore.zip(score).map { case (b, s) => b - s } | ||
diffs | ||
} | ||
|
||
override def transformFn: OPVector => TextMap = (features) => { | ||
val baseScore = modelApply(labelDummy, features).score | ||
val maxHeap = PriorityQueue.empty(MinScore) | ||
|
||
// TODO sparse implementation only works if changing values to zero - use dense vector to test effect of zeros | ||
val featuresSparse = features.value.toSparse | ||
|
@@ -86,26 +124,102 @@ class RecordInsightsLOCO[T <: Model[T]] | |
|
||
val k = $(topK) | ||
var i = 0 | ||
while (i < filledSize) { | ||
val (oldInd, oldVal) = featureArray(i) | ||
featureArray.update(i, (oldInd, 0)) | ||
val score = modelApply(labelDummy, OPVector(Vectors.sparse(featureSize, featureArray))).score | ||
val diffs = baseScore.zip(score).map{ case (b, s) => b - s } | ||
val max = diffs.maxBy(math.abs) | ||
maxHeap.enqueue((i, max, diffs)) | ||
if (i >= k) maxHeap.dequeue() | ||
featureArray.update(i, (oldInd, oldVal)) | ||
i += 1 | ||
val top = $(topKStrategy) match { | ||
case s if s == TopKStrategy.Abs.entryName || problemType == ProblemType.MultiClassification => { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a getter method for strategy There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please also create a function for each of the cases to avoid this method growing large |
||
val absoluteMaxHeap = PriorityQueue.empty(AbsScore) | ||
var count = 0 | ||
while (i < filledSize) { | ||
val (oldInd, oldVal) = featureArray(i) | ||
featureArray.update(i, (oldInd, 0)) | ||
val score = modelApply(labelDummy, OPVector(Vectors.sparse(featureSize, featureArray))).score | ||
val diffs = baseScore.zip(score).map { case (b, s) => b - s } | ||
val max = diffs.maxBy(math.abs) | ||
if (max != 0) { // Not keeping LOCOs with value 0.0 | ||
absoluteMaxHeap.enqueue((i, max, diffs)) | ||
count += 1 | ||
if (count > k) absoluteMaxHeap.dequeue() | ||
} | ||
featureArray.update(i, (oldInd, oldVal)) | ||
i += 1 | ||
} | ||
|
||
val topAbs = absoluteMaxHeap.dequeueAll | ||
|
||
|
||
topAbs.sortBy { case (_, v, _) => -math.abs(v) } | ||
|
||
} | ||
case s if s == TopKStrategy.PositiveNegative.entryName => { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what if we always kept the positive and negative heap and then just check the strategy when we return the insights? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like that! This would keep the code simpler. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be slower when we want to return the top by abs value. Is there an easy way to go from 2 Priority Queues to 1? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be slightly slower but not by much if we assume that (or enforce that) k is small. To go to one q - just get all the answers and sort them. Again you are right that it is slower but if we limit k to ~100 it is a pretty small difference. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or you can get the first of each queue and then basically keep popping off the last q that you took from for comparison - then it is not much slower you just have a slightly larger memory footprint There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another issue @leahmcguire. For Multiclassification, we return the highest absolute LOCO value, and for the other strategy we look at the winning category (baseScore). Getting the top Positive and negative for winning class doesn't guarantee the highest absolute value : what if this absolute value was from another class? 
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah - I think that the fact that we were just taking the highest absolute value was a mistake. It allows for things like baseScore = (0.1, 0.3, 0.6) new score = (0.3, 0.1, 0.6) to be the difference we report, which seems wrong. Always choosing the change in the winning value will mean that the change we report always has to do with the score that won, even if it is not the largest change in score. So yes we would change strategy for multiclass in absolute value as well - but I think it is slightly more principled...WDYT? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now the question is should we apply the same logic for Binary Classification. However, recent spikes witness a strong relationship between LOCO scores and prediction scores (cc @crupley) : Rows with the "highest scores" had on average their top absolute LOCOs being mostly positive. For "lowest scores", those locos are mostly negative. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, that is interesting. But wouldn't it be confusing to interpret for Binary if we choose the winning value because they are symmetric? The scores will always sum to 1 so a positive contribution to the prediction of 1 will be the same negative contribution to the prediction of 0. If we switch based on the prediction they will have to keep in mind the winning prediction to interpret the effect - this is something you need to do in multiclass but seems like an extra complication in binary. |
||
// Heap that will contain the top K positive LOCO values | ||
val positiveMaxHeap = PriorityQueue.empty(MinScore) | ||
// Heap that will contain the top K negative LOCO values | ||
val negativeMaxHeap = PriorityQueue.empty(MaxScore) | ||
// for each element of the feature vector != 0.0 | ||
// Size of positive heap | ||
var positiveCount = 0 | ||
// Size of negative heap | ||
var negativeCount = 0 | ||
for {i <- 0 until filledSize} { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use |
||
val (oldInd, oldVal) = featureArray(i) | ||
val diffs = computeDiffs(i, oldInd, featureArray, featureSize, baseScore) | ||
val max = if (problemType == ProblemType.Regression) diffs(0) else diffs(1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this seems very hacky and error prone - There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is where you would use the indexToExamine variable I suggested above |
||
|
||
if (max > 0.0) { // if positive LOCO then add it to positive heap | ||
positiveMaxHeap.enqueue((i, max, diffs)) | ||
positiveCount += 1 | ||
if (positiveCount > k) { // remove the lowest element if the heap size goes from 5 to 6 | ||
positiveMaxHeap.dequeue() | ||
} | ||
} else if (max < 0.0) { // if negative LOCO then add it to negative heap | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what about There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No. LOCOs of 0.0 are not interesting |
||
negativeMaxHeap.enqueue((i, max, diffs)) | ||
negativeCount += 1 | ||
if (negativeCount > k) { // remove the highest element if the heap size goes from 5 to 6 | ||
negativeMaxHeap.dequeue() | ||
} // Not keeping LOCOs with value 0 | ||
} | ||
featureArray.update(i, (oldInd, oldVal)) | ||
} | ||
val topPositive = positiveMaxHeap.dequeueAll | ||
val topNegative = negativeMaxHeap.dequeueAll | ||
(topPositive ++ topNegative).sortBy { case (_, v, _) => -v } | ||
} | ||
} | ||
|
||
val top = maxHeap.dequeueAll | ||
top.map{ case (k, _, v) => RecordInsightsParser.insightToText(featureInfo(featureArray(k)._1), v) } | ||
|
||
top.map { case (k, _, v) => RecordInsightsParser.insightToText(featureInfo(featureArray(k)._1), v) } | ||
.toMap.toTextMap | ||
} | ||
} | ||
|
||
|
||
private[insights] object MinScore extends Ordering[(Int, Double, Array[Double])] { | ||
private[insights] object AbsScore extends Ordering[(Int, Double, Array[Double])] { | ||
def compare(x: (Int, Double, Array[Double]), y: (Int, Double, Array[Double])): Int = | ||
math.abs(y._2) compare math.abs(x._2) | ||
} | ||
|
||
/** | ||
* Ordering of the heap that removes lowest score | ||
*/ | ||
private object MinScore extends Ordering[(Int, Double, Array[Double])] { | ||
def compare(x: (Int, Double, Array[Double]), y: (Int, Double, Array[Double])): Int = | ||
y._2 compare x._2 | ||
} | ||
|
||
/** | ||
* Ordering of the heap that removes highest score | ||
*/ | ||
private object MaxScore extends Ordering[(Int, Double, Array[Double])] { | ||
def compare(x: (Int, Double, Array[Double]), y: (Int, Double, Array[Double])): Int = | ||
x._2 compare y._2 | ||
} | ||
|
||
sealed abstract class TopKStrategy(val name: String) extends EnumEntry with Serializable | ||
|
||
object TopKStrategy extends Enum[TopKStrategy] { | ||
val values = findValues | ||
|
||
case object Abs extends TopKStrategy("abs") | ||
|
||
case object PositiveNegative extends TopKStrategy("positive and negative") | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,14 +33,15 @@ package com.salesforce.op.stages.impl.insights | |
import com.salesforce.op.{FeatureHistory, OpWorkflow} | ||
import com.salesforce.op.features.types._ | ||
import com.salesforce.op.stages.impl.classification.{OpLogisticRegression, OpRandomForestClassifier} | ||
import com.salesforce.op.stages.impl.insights.TopKStrategy.PositiveNegative | ||
import com.salesforce.op.stages.impl.preparators.SanityCheckDataTest | ||
import com.salesforce.op.stages.impl.regression.OpLinearRegression | ||
import com.salesforce.op.stages.impl.selector.ProblemType | ||
import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams | ||
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} | ||
import com.salesforce.op.testkit.{RandomIntegral, RandomReal, RandomText, RandomVector} | ||
import com.salesforce.op.utils.spark.RichDataset._ | ||
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} | ||
import org.apache.spark.ml.classification.LogisticRegressionModel | ||
import org.apache.spark.ml.regression.LinearRegressionModel | ||
import org.apache.spark.sql.DataFrame | ||
import org.apache.spark.sql.types.StructType | ||
|
@@ -52,6 +53,42 @@ import org.scalatest.junit.JUnitRunner | |
@RunWith(classOf[JUnitRunner]) | ||
class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { | ||
|
||
// scalastyle:off | ||
val data = Seq( // name, age, height, height_null, isBlueEyed, gender, testFeatNegCor | ||
SanityCheckDataTest("a", 32, 5.0, 0, 0.9, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 4.0, 1, 0.1, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 6.0, 0, 0.8, 0.5, 0), | ||
SanityCheckDataTest("a", 32, 5.5, 0, 0.85, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.05, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.2, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 5.0, 0, 0.99, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 4.0, 0, 0.0, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 6.0, 1, 0.7, 0.5, 0), | ||
SanityCheckDataTest("a", 32, 5.5, 0, 0.8, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.1, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.05, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 5.0, 0, 1, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 4.0, 1, 0.1, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 6.0, 1, 0.9, 0.5, 0), | ||
SanityCheckDataTest("a", 32, 5.5, 0, 1, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.2, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.3, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 5.0, 0, 0.6, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 4.0, 1, 0.1, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 6.0, 0, 0.9, 0.5, 0), | ||
SanityCheckDataTest("a", 32, 5.5, 0, 1, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.05, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.3, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.05, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.4, 0, 0.1) | ||
).map(data => | ||
( | ||
data.name.toText, | ||
data.height_null.toRealNN, | ||
Seq(data.age, data.height, data.isBlueEyed, data.gender, data.testFeatNegCor).toOPVector | ||
) | ||
) | ||
// scalastyle:on | ||
Spec[RecordInsightsLOCO[_]] should "work with randomly generated features and binary logistic regression" in { | ||
val features = RandomVector.sparse(RandomReal.normal(), 40).limit(1000) | ||
val labels = RandomIntegral.integrals(0, 2).limit(1000).map(_.value.get.toRealNN) | ||
|
@@ -63,6 +100,8 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { | |
// val model = sparkModel.getSparkMlStage().get | ||
val insightsTransformer = new RecordInsightsLOCO(sparkModel).setInput(f1) | ||
val insights = insightsTransformer.transform(dfWithMeta).collect(insightsTransformer.getOutput()) | ||
insightsTransformer.problemType shouldBe ProblemType.BinaryClassification | ||
|
||
insights.foreach(_.value.size shouldBe 20) | ||
val parsed = insights.map(RecordInsightsParser.parseInsights) | ||
parsed.map( _.count{ case (_, v) => v.exists(_._1 == 1) } shouldBe 20 ) // number insights per pred column | ||
|
@@ -78,7 +117,10 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { | |
val sparkModel = new OpRandomForestClassifier().setInput(l1r, f1).fit(df) | ||
|
||
val insightsTransformer = new RecordInsightsLOCO(sparkModel).setInput(f1).setTopK(2) | ||
.setTopKStrategy(PositiveNegative) | ||
|
||
val insights = insightsTransformer.transform(dfWithMeta).collect(insightsTransformer.getOutput()) | ||
insightsTransformer.problemType shouldBe ProblemType.MultiClassification | ||
insights.foreach(_.value.size shouldBe 2) | ||
val parsed = insights.map(RecordInsightsParser.parseInsights) | ||
parsed.map( _.count{ case (_, v) => v.exists(_._1 == 5) } shouldBe 0 ) // no 6th column of insights | ||
|
@@ -102,6 +144,7 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { | |
|
||
val insightsTransformer = new RecordInsightsLOCO(model).setInput(f1) | ||
val insights = insightsTransformer.transform(dfWithMeta).collect(insightsTransformer.getOutput()) | ||
insightsTransformer.problemType shouldBe ProblemType.Regression | ||
insights.foreach(_.value.size shouldBe 20) | ||
val parsed = insights.map(RecordInsightsParser.parseInsights) | ||
parsed.foreach(_.values.foreach(i => i.foreach(v => v._1 shouldBe 0))) // has only one pred column | ||
|
@@ -121,59 +164,40 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { | |
|
||
it should "return the most predictive features" in { | ||
|
||
// scalastyle:off | ||
val data = Seq( // name, age, height, height_null, isBlueEyed, gender, testFeatNegCor | ||
SanityCheckDataTest("a", 32, 5.0, 0, 0.9, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 4.0, 1, 0.1, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 6.0, 0, 0.8, 0.5, 0), | ||
SanityCheckDataTest("a", 32, 5.5, 0, 0.85, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.05, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.2, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 5.0, 0, 0.99, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 4.0, 0, 0.0, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 6.0, 1, 0.7, 0.5, 0), | ||
SanityCheckDataTest("a", 32, 5.5, 0, 0.8, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.1, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.05, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 5.0, 0, 1, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 4.0, 1, 0.1, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 6.0, 1, 0.9, 0.5, 0), | ||
SanityCheckDataTest("a", 32, 5.5, 0, 1, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.2, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.3, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 5.0, 0, 0.6, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 4.0, 1, 0.1, 0, 0.1), | ||
SanityCheckDataTest("a", 32, 6.0, 0, 0.9, 0.5, 0), | ||
SanityCheckDataTest("a", 32, 5.5, 0, 1, 0.5, 0), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.05, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.3, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.05, 0, 0.1), | ||
SanityCheckDataTest("b", 32, 5.4, 1, 0.4, 0, 0.1) | ||
).map(data => | ||
( | ||
data.name.toText, | ||
data.height_null.toRealNN, | ||
Seq(data.age, data.height, data.isBlueEyed, data.gender, data.testFeatNegCor).toOPVector | ||
) | ||
) | ||
// scalastyle:on | ||
|
||
|
||
val (testData, name, labelNoRes, featureVector) = TestFeatureBuilder("name", "label", "features", data) | ||
val label = labelNoRes.copy(isResponse = true) | ||
val testDataMeta = addMetaData(testData, "features", 5) | ||
val sparkModel = new OpLogisticRegression().setInput(label, featureVector).fit(testData) | ||
val model = sparkModel.asInstanceOf[SparkWrapperParams[_]].getSparkMlStage().get | ||
.asInstanceOf[LogisticRegressionModel] | ||
|
||
val transformer = new RecordInsightsLOCO(model).setInput(featureVector) | ||
|
||
val insights = transformer.setTopK(1).transform(testDataMeta) | ||
val parsed = insights.collect(name, transformer.getOutput()) | ||
.map { case (n, i) => n -> RecordInsightsParser.parseInsights(i) } | ||
val transformer = new RecordInsightsLOCO(sparkModel).setInput(featureVector) | ||
|
||
val insights = transformer.setTopK(1).transform(testDataMeta).collect(transformer.getOutput()) | ||
val parsed = insights.map(RecordInsightsParser.parseInsights) | ||
// the highest corr that value that is not zero should be the top feature | ||
parsed.foreach { case (_, in) => Set("3_3_3_3", "1_1_1_1").contains(in.head._1.columnName) shouldBe true } | ||
parsed.foreach { case in => Set("3_3_3_3", "1_1_1_1").contains(in.head._1.columnName) shouldBe true } | ||
// the scores should be the same but opposite in sign | ||
parsed.foreach { case (_, in) => math.abs(in.head._2(0)._2 + in.head._2(1)._2) < 0.00001 shouldBe true } | ||
parsed.foreach { case in => math.abs(in.head._2(0)._2 + in.head._2(1)._2) < 0.00001 shouldBe true } | ||
} | ||
|
||
it should "return the most predictive features when using top K Positives + top K negatives strat" in { | ||
val (testData, name, labelNoRes, featureVector) = TestFeatureBuilder("name", "label", "features", data) | ||
val label = labelNoRes.copy(isResponse = true) | ||
val testDataMeta = addMetaData(testData, "features", 5) | ||
val sparkModel = new OpLogisticRegression().setInput(label, featureVector).fit(testData) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove redundant lines |
||
|
||
val transformer = new RecordInsightsLOCO(sparkModel).setTopKStrategy(TopKStrategy.PositiveNegative) | ||
.setInput(featureVector) | ||
|
||
val insights = transformer.transform(testDataMeta) | ||
val parsed = insights.collect(name, transformer.getOutput()) | ||
.map { case (n, i) => n -> RecordInsightsParser.parseInsights(i) } | ||
parsed.foreach { case (_, in) => | ||
in.head._1.columnName == "1_1_1_1" || in.last._1.columnName == "3_3_3_3" shouldBe true | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does this even mean |
||
} | ||
} | ||
|
||
it should "return the most predictive features for data generated with a strong relation to the label" in { | ||
|
@@ -218,9 +242,7 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { | |
// while currency can have either two (if it's null since the currency column will be filled with the mean) or just | ||
// one if it's not null. | ||
parsed.length shouldBe numRows | ||
parsed.foreach(m => if (m.keySet.count(_.columnName.contains("currency_NullIndicatorValue")) > 0) { | ||
m.size shouldBe 4 | ||
} else m.size shouldBe 3) | ||
parsed.foreach(m => m.size <= 4 shouldBe true) | ||
|
||
// Want to check the average contribution strengths for each picklist response and compare them to the | ||
// average contribution strengths of the other features. We should have a very high contribution when choices | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should we remove the
@Experimental
flag already?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no - we want to keep it so we can change things :-) For instance the way of creating this class is still terrible