Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improved PercentileCalibratorTest #318

Merged
merged 3 commits into from
May 13, 2019
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.features.Feature
import com.salesforce.op.stages.base.unary.UnaryModel
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.{Estimator, Transformer}
Expand All @@ -49,10 +49,25 @@ import org.scalatest.FlatSpec
import scala.util.Random

@RunWith(classOf[JUnitRunner])
class PercentileCalibratorTest extends FlatSpec with TestSparkContext {
class PercentileCalibratorTest extends OpEstimatorSpec[RealNN, UnaryModel[RealNN, RealNN], PercentileCalibrator] {

import spark.implicits._

Spec[PercentileCalibrator] should "return a minimum calibrated score of 0 and max of 99 when buckets is 100" in {
val testData = Seq(10, 100, 1000).map(_.toRealNN)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not use a bit of a larger sample?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is already being tested by a 1000-element sample later in that test; the major purpose of those values, and of extending the OpEstimatorSpec trait, is to check de/serialization into JSON, etc.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Base spec is supposed to test everything including transformations, serialization etc. Further tests usually test for additional behavior which is not covered in the base test. It makes sense to test as much as you can in the base test and avoid additional ones.


val (inputData, testF) = TestFeatureBuilder(testData)

/**
* Estimator instance to be tested
*/
override val estimator: PercentileCalibrator = new PercentileCalibrator().setInput(testF)

/**
* Expected result of the transformer applied on the Input Dataset
*/
override val expectedResult: Seq[RealNN] = Seq(33.toRealNN, 66.toRealNN, 99.toRealNN)

it should "return a minimum calibrated score of 0 and max of 99 when buckets is 100" in {
val data = (0 until 1000).map(i => i.toLong.toIntegral -> Random.nextDouble.toRealNN)
val (scoresDF, f1, f2): (DataFrame, Feature[Integral], Feature[RealNN]) = TestFeatureBuilder(data)
val percentile = f2.toPercentile()
Expand All @@ -61,8 +76,8 @@ class PercentileCalibratorTest extends FlatSpec with TestSparkContext {
val scoresTransformed = model.asInstanceOf[Transformer].transform(scoresDF)

percentile.name shouldBe percentile.originStage.getOutputFeatureName
scoresTransformed.select(min(percentile.name)).first.getDouble(0) should equal (0.0)
scoresTransformed.select(max(percentile.name)).first.getDouble(0) should equal (99.0)
scoresTransformed.select(min(percentile.name)).first.getDouble(0) should equal(0.0)
scoresTransformed.select(max(percentile.name)).first.getDouble(0) should equal(99.0)
}

it should "produce the calibration map metadata" in {
Expand All @@ -89,7 +104,7 @@ class PercentileCalibratorTest extends FlatSpec with TestSparkContext {
val model = percentile.originStage.asInstanceOf[Estimator[_]].fit(scoresDF)
val scoresTransformed = model.asInstanceOf[Transformer].transform(scoresDF)

scoresTransformed.select(max(percentile.name)).first.getDouble(0) should equal (99.0)
scoresTransformed.select(max(percentile.name)).first.getDouble(0) should equal(99.0)
}

it should "return a maximum calibrated score of 99 when calibrating with less than 100" in {
Expand All @@ -100,7 +115,7 @@ class PercentileCalibratorTest extends FlatSpec with TestSparkContext {
val model = percentile.originStage.asInstanceOf[Estimator[_]].fit(scoresDF)
val scoresTransformed = model.asInstanceOf[Transformer].transform(scoresDF)

scoresTransformed.select(max(percentile.name)).first.getDouble(0) should equal (99.0)
scoresTransformed.select(max(percentile.name)).first.getDouble(0) should equal(99.0)
}

it should "return all scores from 0 to 99 in increments of 1" in {
Expand All @@ -117,7 +132,7 @@ class PercentileCalibratorTest extends FlatSpec with TestSparkContext {

val checkSet = (0 to 99).map(_.toReal).toSet

scoreCounts.collect(percentile).toSet should equal (checkSet)
scoreCounts.collect(percentile).toSet should equal(checkSet)
}

it should "return a uniform distribution of scores" in {
Expand Down Expand Up @@ -149,6 +164,7 @@ class PercentileCalibratorTest extends FlatSpec with TestSparkContext {

val indicesByProb = scoresTransformed.orderBy(f2.name).collect(f1).deep
val indicesByPerc = scoresTransformed.orderBy(percentile.name, f2.name).collect(f1).deep
indicesByProb should equal (indicesByPerc)
indicesByProb should equal(indicesByPerc)
}

}