From 9ba57d6536728fb9e1fb85537575e6051aa824c8 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Fri, 15 Mar 2019 12:24:53 -0700 Subject: [PATCH 01/13] Adds min scoring set size to RFF --- .../com/salesforce/op/filters/RawFeatureFilter.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala index 7cccfd01ae..8d4cb21ef6 100644 --- a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala +++ b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala @@ -300,9 +300,11 @@ class RawFeatureFilter[T] val scoreData = scoringReader.flatMap { s => val sd = s.generateDataFrame(rawFeatures, parameters.switchReaderParams()).persist() log.info("Loaded scoring data") - if (sd.count() > 0) Some(sd) + val scoringDataCount = sd.count() + if (scoringDataCount >= RawFeatureFilter.minRowsForScoringSet) Some(sd) else { - log.warn("Scoring dataset was empty. Only training data checks will be used.") + log.warn(s"Scoring dataset has $scoringDataCount rows, which is less than the minimum required of " + + s"${RawFeatureFilter.minRowsForScoringSet}. Only training data checks will be used.") None } } @@ -371,6 +373,10 @@ object RawFeatureFilter { bins } + // If there are not enough rows in the scoring set, we should not perform comparisons between the training and + // scoring sets since they will not be reliable. Currently, this is set to the same as the minimum training size. + val minRowsForScoringSet = 500 + } /** From 3e6bc34eee56080f5102eeab919543b5e16dca49 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Thu, 21 Mar 2019 15:05:25 -0700 Subject: [PATCH 02/13] Updated tests with random feature generation, attempting to get the same thing working for maps --- .../op/filters/RawFeatureFilterTest.scala | 221 +++++++++++++++++- 1 file changed, 218 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index be2aff57cc..6ba1f76c1f 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -31,15 +31,23 @@ package com.salesforce.op.filters import com.salesforce.op.OpParams -import com.salesforce.op.features.{FeatureDistributionType, OPFeature} -import com.salesforce.op.readers.DataFrameFieldNames -import com.salesforce.op.test.{Passenger, PassengerSparkFixtureTest} +import com.salesforce.op.features.{Feature, FeatureDistributionType, OPFeature} +import com.salesforce.op.features.types._ +import com.salesforce.op.readers.{CustomReader, DataFrameFieldNames, ReaderKey} +import com.salesforce.op.test._ +import com.salesforce.op.testkit._ +import com.salesforce.op.testkit.RandomData +import com.salesforce.op.stages.impl.feature.OPMapVectorizerTestHelper.makeTernaryOPMapTransformer import com.salesforce.op.utils.spark.RichDataset._ import com.twitter.algebird.Operators._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.junit.runner.RunWith import org.scalatest.{Assertion, FlatSpec} import org.scalatest.junit.JUnitRunner +import scala.reflect.runtime.universe.TypeTag + @RunWith(classOf[JUnitRunner]) class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { @@ -136,6 +144,213 @@ class RawFeatureFilterTest 
extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } + /** + * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the + * columns of the dataframe are fixed to be myF1, myF2, myF3, and myF4 so that the same OPFeatures can be used to + * refer to columns in either dataframe. + * + * @param f1 Random data generator for feature 1 (type F1) + * @param f2 Random data generator for feature 2 (type F2) + * @param f3 Random data generator for feature 3 (type F3) + * @param f4 Random data generator for feature 4 (type F4) + * @param numRows Number of rows to generate + * @tparam F1 Type of feature 1 + * @tparam F2 Type of feature 2 + * @tparam F3 Type of feature 3 + * @tparam F4 Type of feature 4 + * @return Tuple containing the generated dataframe and each individual OPFeature + */ + def generateRandomDfAndFeatures[ + F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag, + F4 <: FeatureType : TypeTag + ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], f4: RandomData[F4], numRows: Int): + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { + + val f1Data = f1.limit(numRows) + val f2Data = f2.limit(numRows) + val f3Data = f3.limit(numRows) + val f4Data = f4.limit(numRows) + + // Combine the data into a single tuple for each row + val generatedTrainData: Seq[(F1, F2, F3, F4)] = f1Data.zip(f2Data).zip(f3Data).zip(f4Data).map { + case (((a, b), c), d) => (a, b, c, d) + } + + TestFeatureBuilder[F1, F2, F3, F4]("myF1", "myF2", "myF3", "myF4", generatedTrainData) + } + + it should "use a simple function to generate random data" in { + val (testDf, f1, f2, f3, f4) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( + RandomText.cities.withProbabilityOfEmpty(0.2), + RandomText.countries.withProbabilityOfEmpty(0.2), + RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")).withProbabilityOfEmpty(0.2), + RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2), + 1000 + ) + + testDf.show(10) + } + + it should "clean a dataframe filled with randomly generated data" in { + // Define random generators that will be the same for training and scoring dataframes + val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) + val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) + val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) + .withProbabilityOfEmpty(0.2) + val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = + generateRandomDfAndFeatures[City, Country, PickList, Currency]( + cityGenerator, + countryGenerator, + pickListGenerator, + currencyGenerator.withProbabilityOfEmpty(0.2), + 1000 + ) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _, _) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( + cityGenerator, + countryGenerator, + pickListGenerator, + currencyGenerator.withProbabilityOfEmpty(1.0), + 1000 + ) + + // Define the readers + val trainReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: 
SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) } val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(scoreDf) } val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(trainCity, trainCountry, trainPickList, trainCurrency) val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) + + // Check that the only feature that was dropped was the currency feature + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + filteredRawData.featuresToDrop.length shouldBe 1 + filteredRawData.featuresToDrop.head.name.startsWith("myF4") shouldBe true + } + + it should "not remove any features when the training and scoring sets are identical" in { + // Define random generators that will be the same for training and scoring dataframes + val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) + val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) + val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) + .withProbabilityOfEmpty(0.2) + val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = + generateRandomDfAndFeatures[City, Country, PickList, Currency]( + cityGenerator, + countryGenerator, + pickListGenerator, + currencyGenerator.withProbabilityOfEmpty(0.2), + 1000 + ) + + // Define the readers + val trainReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) + } + val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) + } + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(trainCity, trainCountry, trainPickList, trainCurrency) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) + + filteredRawData.featuresToDrop shouldBe empty + filteredRawData.mapKeysToDrop shouldBe empty + filteredRawData.cleanedData.schema.fields should contain theSameElementsAs + trainReader.generateDataFrame(features).schema.fields + + assertFeatureDistributions(filteredRawData, total = features.length * 2) + + // Also check that all the feature distributions are the same between the training and scoring sets + filteredRawData.trainingFeatureDistributions.zip(filteredRawData.scoringFeatureDistributions).foreach{ + case (train, score) => + train.name shouldBe score.name + train.key shouldBe score.key + train.count shouldBe score.count + train.nulls shouldBe score.nulls + train.distribution shouldBe score.distribution + train.summaryInfo shouldBe score.summaryInfo + } + } + + it should "correctly 
clean the dataframe due to min fill rates" in { + // Define random generators that will be the same for training and scoring dataframes + val stateGenerator = RandomText.states.withProbabilityOfEmpty(0.2) + val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) + .withProbabilityOfEmpty(0.2) + val realMapGenerator = RandomMap.ofReals(RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0), 0, 4) + .withKeys("customReal" + _) + val binaryMapGenerator = RandomMap.ofBinaries(probabilityOfSuccess = 0.25, 0, 4) + .withKeys("customBinary" + _) + + val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) + val c1 = currencyGenerator.limit(1000) + val c2 = currencyGenerator.limit(1000) + val c3 = currencyGenerator.limit(1000) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3).asRaw(isResponse = false) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, trainState, trainPickList, trainRealMap, trainBinaryMap) = + generateRandomDfAndFeatures[State, PickList, RealMap, BinaryMap]( + stateGenerator, + pickListGenerator, + realMapGenerator, + binaryMapGenerator, + 1000 + ) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _, _) = generateRandomDfAndFeatures[State, PickList, RealMap, BinaryMap]( + stateGenerator, + pickListGenerator, + realMapGenerator, + binaryMapGenerator, + 1000 + ) + + // Define the readers + val trainReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) + } + val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(scoreDf) + } + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(trainState, trainPickList, trainRealMap, trainBinaryMap) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // Check that the only feature that was dropped was the binary map feature (myF4) + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + filteredRawData.featuresToDrop.length shouldBe 1 + filteredRawData.featuresToDrop.head.name.startsWith("myF4") shouldBe true + } + it should "correctly clean the dataframe returned and give the features to blacklist" in { val params = new OpParams() val survPred = survived.copy(isResponse = false) From c05f61ee3423a7934c1452cb7ff49fba2aa6b147 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 01:28:54 -0700 Subject: [PATCH 03/13] Several new tests with randomly generated features --- .../op/filters/RawFeatureFilterTest.scala | 515 ++++++++++++++---- 1 file changed, 404 insertions(+), 111 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 6ba1f76c1f..31b02152e3 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ 
b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -30,8 +30,8 @@ package com.salesforce.op.filters -import com.salesforce.op.OpParams -import com.salesforce.op.features.{Feature, FeatureDistributionType, OPFeature} +import com.salesforce.op.{OpParams, OpWorkflow} +import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike, OPFeature} import com.salesforce.op.features.types._ import com.salesforce.op.readers.{CustomReader, DataFrameFieldNames, ReaderKey} import com.salesforce.op.test._ @@ -40,6 +40,7 @@ import com.salesforce.op.testkit.RandomData import com.salesforce.op.stages.impl.feature.OPMapVectorizerTestHelper.makeTernaryOPMapTransformer import com.salesforce.op.utils.spark.RichDataset._ import com.twitter.algebird.Operators._ +import org.apache.log4j.Level import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.junit.runner.RunWith @@ -51,6 +52,8 @@ import scala.reflect.runtime.universe.TypeTag @RunWith(classOf[JUnitRunner]) class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { + // loggingLevel(Level.INFO) + Spec[RawFeatureFilter[_]] should "compute feature stats correctly" in { val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) @@ -144,6 +147,39 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } + /** + * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the + * columns of the dataframe are fixed to be myF1, myF2, and myF3 so that the same OPFeatures can be used to + * refer to columns in either dataframe. + * + * @param f1 Random data generator for feature 1 (type F1) + * @param f2 Random data generator for feature 2 (type F2) + * @param f3 Random data generator for feature 3 (type F3) + * @param numRows Number of rows to generate + * @tparam F1 Type of feature 1 + * @tparam F2 Type of feature 2 + * @tparam F3 Type of feature 3 + * @return Tuple containing the generated dataframe and each individual OPFeature + */ + def generateRandomDfAndFeatures[ + F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag + ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], numRows: Int): + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { + + val f1Data = f1.limit(numRows) + val f2Data = f2.limit(numRows) + val f3Data = f3.limit(numRows) + + // Combine the data into a single tuple for each row + val generatedTrainData: Seq[(F1, F2, F3)] = f1Data.zip(f2Data).zip(f3Data).map { + case ((a, b), c) => (a, b, c) + } + + TestFeatureBuilder[F1, F2, F3]("myF1", "myF2", "myF3", generatedTrainData) + } + /** * Generates a random dataframe and OPFeatures from supplied data generators and their types. 
The names of the * columns of the dataframe are fixed to be myF1, myF2, myF3, and myF4 so that the same OPFeatures can be used to @@ -181,93 +217,23 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with TestFeatureBuilder[F1, F2, F3, F4]("myF1", "myF2", "myF3", "myF4", generatedTrainData) } - it should "use a simple function to generate random data" in { - val (testDf, f1, f2, f3, f4) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( - RandomText.cities.withProbabilityOfEmpty(0.2), - RandomText.countries.withProbabilityOfEmpty(0.2), - RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")).withProbabilityOfEmpty(0.2), - RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2), - 1000 - ) - - testDf.show(10) - } - - it should "clean a dataframe filled with randomly generated data" in { - // Define random generators that will be the same for training and scoring dataframes - val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) - val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) - val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) - .withProbabilityOfEmpty(0.2) - val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) - - // Define the training dataframe and the features (these should be the same between the training and scoring - // dataframes since they point to columns with the same names) - val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = - generateRandomDfAndFeatures[City, Country, PickList, Currency]( - cityGenerator, - countryGenerator, - pickListGenerator, - currencyGenerator.withProbabilityOfEmpty(0.2), - 1000 - ) - - // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) - val (scoreDf, _, _, _, _) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( - cityGenerator, - countryGenerator, - pickListGenerator, - currencyGenerator.withProbabilityOfEmpty(1.0), - 1000 - ) - - // Define the readers - val trainReader = new CustomReader[Row](ReaderKey.randomKey) { - def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) - } - val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { - def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(scoreDf) - } - - val params = new OpParams() - // We should be able to set the features to either be the train features or the score ones here - val features: Array[OPFeature] = Array(trainCity, trainCountry, trainPickList, trainCurrency) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) - val filteredRawData = filter.generateFilteredRaw(features, params) - - - // Check that the only feature that was dropped was the currency feature - // TODO: Add a check for the reason dropped once that information is passed on to the workflow - filteredRawData.featuresToDrop.length shouldBe 1 - filteredRawData.featuresToDrop.head.name.startsWith("myF4") shouldBe true - } - it should "not remove any features when the training and scoring sets are identical" in { // Define random generators that will be the same for training and scoring dataframes val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) val pickListGenerator = 
RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) .withProbabilityOfEmpty(0.2) - val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) + val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( - cityGenerator, - countryGenerator, - pickListGenerator, - currencyGenerator.withProbabilityOfEmpty(0.2), - 1000 + cityGenerator, countryGenerator, pickListGenerator, currencyGenerator, 1000 ) // Define the readers - val trainReader = new CustomReader[Row](ReaderKey.randomKey) { - def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) - } - val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { - def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) - } + val (trainReader, scoreReader) = makeReaders(trainDf, trainDf) val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here @@ -294,62 +260,358 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with } } - it should "correctly clean the dataframe due to min fill rates" in { + it should "correctly clean the dataframe containing map and non-map features due to min fill rate" in { // Define random generators that will be the same for training and scoring dataframes - val stateGenerator = RandomText.states.withProbabilityOfEmpty(0.2) - val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) - .withProbabilityOfEmpty(0.2) - val realMapGenerator = RandomMap.ofReals(RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0), 0, 4) - .withKeys("customReal" + _) - val binaryMapGenerator = RandomMap.ofBinaries(probabilityOfSuccess = 0.25, 0, 4) - .withKeys("customBinary" + _) + val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) + val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) + val currencyGenerator50 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.5) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator25, currencyGenerator95, currencyGenerator50, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator95, currencyGenerator50, currencyGenerator25, 1000 + )
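 + // Note that the generator order is permuted between the training set (25/95/50 percent empty) and the scoring + // set (95/50/25 percent empty), so each column keeps the same value distribution but gets a different fill rate + // in the two sets.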
 + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) + + // Check that using the training reader only will result in the rarely filled features being removed + val filter = new RawFeatureFilter(trainReader, None, 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop one feature, as well as its corresponding map key + filteredRawData.featuresToDrop.length shouldBe 1 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 1 + + // The feature that is 95% empty should be thrown out + filteredRawData.featuresToDrop.head.name.startsWith("myF2") shouldBe true + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2") + + // Check the actual filtered dataframe schemas + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet should not contain "f2") + // There should be 6 FeatureDistributions - training for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 6) + + + // Check that also using the scoring reader will result in the features rarely filled in either the training or + // scoring sets being removed + val filterWithScoring = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filteredRawDataWithScoring = filterWithScoring.generateFilteredRaw(features, params) + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop two features, as well as their corresponding map keys + filteredRawDataWithScoring.featuresToDrop.length shouldBe 2 + filteredRawDataWithScoring.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawDataWithScoring.mapKeysToDrop.values.head.size shouldBe 2 + + // The features that are rarely filled in either set should be thrown out + filteredRawDataWithScoring.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF1", "myF2") + filteredRawDataWithScoring.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f2") + // filteredDataWithScoring.mapKeysToDrop + // .foldLeft(Set.empty[String])((acc, x) => acc ++ x._2) should contain theSameElementsAs Seq("f1", "f2") + + filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe false + filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true + filteredRawDataWithScoring.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f3")) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawDataWithScoring, total = 12) + }
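 + // The positional RawFeatureFilter arguments used in these tests appear to line up with the named arguments used + // in the protected-features test below, i.e. (trainReader, Some(scoreReader), 10, 0.1, 1.0, + // Double.PositiveInfinity, 1.0, 1.0) would read bins = 10, minFill = 0.1, maxFillDifference = 1.0, + // maxFillRatioDiff = Double.PositiveInfinity, maxJSDivergence = 1.0 and maxCorrelation = 1.0.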
 + + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features f2 & f3 are switched between the training and scoring sets, so that they should have an absolute + * fill rate difference of 0.6. The RawFeatureFilter is set up with a maximum absolute fill rate difference of 0.4 + * so both f2 and f3 (as well as their corresponding map keys) should be removed.
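 + * (As a check: f2 is filled in 1 - 0.2 = 0.8 of training rows but only 1 - 0.8 = 0.2 of scoring rows, giving an + * absolute fill rate difference of |0.8 - 0.2| = 0.6, which exceeds the 0.4 threshold; f3 is the mirror image.)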
 + */ + it should "correctly clean the dataframe containing map and non-map features due to max absolute fill rate " + "difference" in { + // Define random generators that will be the same for training and scoring dataframes + val realGenerator1 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) + val realGenerator2 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) + val realGenerator3 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.8) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) - val (trainDf, trainState, trainPickList, trainRealMap, trainBinaryMap) = - generateRandomDfAndFeatures[State, PickList, RealMap, BinaryMap]( - stateGenerator, - pickListGenerator, - realMapGenerator, - binaryMapGenerator, - 1000 + val (trainDf, r1, r2, r3) = + generateRandomDfAndFeatures[Real, Real, Real]( + realGenerator1, realGenerator2, realGenerator3, 1000 ) + val mapFeature = makeTernaryOPMapTransformer[Real, RealMap, Double](r1, r2, r3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) - val (scoreDf, _, _, _, _) = generateRandomDfAndFeatures[State, PickList, RealMap, BinaryMap]( - stateGenerator, - pickListGenerator, - realMapGenerator, - binaryMapGenerator, - 1000 + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Real, Real, Real]( + realGenerator1, realGenerator3, realGenerator2, 1000 ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here - val features: Array[OPFeature] = Array(trainState, trainPickList, trainRealMap, trainBinaryMap) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val features: Array[OPFeature] = Array(r1, r2, r3, mapFeatureRaw) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 0.4, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) + /* + val exclusions = filter.getFeaturesToExclude( + trainingDistribs = filteredRawData.trainingFeatureDistributions, + scoringDistribs = filteredRawData.scoringFeatureDistributions, + correlationInfo = Map.empty + ) + println(exclusions) + */ + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop two features, as well as their corresponding map keys + filteredRawData.featuresToDrop.length shouldBe 2 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + + // Since we switched the distributions in features 2 & 3 between the training and scoring sets, both of them + // should be removed (the two raw features and the two corresponding map keys) + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2", "myF3") + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2", "f3") + + // Check the actual filtered dataframe schemas + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f1")) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 12) + } + + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features f2 & f3 are switched between the training and scoring sets, so that they should have an absolute + * fill rate difference of 0.25, and a relative fill ratio difference of 6. The RawFeatureFilter is set up with a + * maximum fill ratio difference of 4 so both f2 and f3 (as well as their corresponding map keys) should be removed. 
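 + * (As a check: f2 is filled in 1 - 0.95 = 0.05 of training rows and 1 - 0.7 = 0.3 of scoring rows, so the absolute + * difference of 0.25 passes, but the fill ratio 0.3 / 0.05 = 6 exceeds the maximum of 4; f3 is the mirror image.)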
 + */ + it should "correctly clean the dataframe containing map and non-map features due to max fill ratio " + "difference" in { + // Define random generators that will be the same for training and scoring dataframes + val realGenerator1 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) + val realGenerator2 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) + val realGenerator3 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.7) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, r1, r2, r3) = + generateRandomDfAndFeatures[Real, Real, Real]( + realGenerator1, realGenerator2, realGenerator3, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Real, RealMap, Double](r1, r2, r3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Real, Real, Real]( + realGenerator1, realGenerator3, realGenerator2, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(r1, r2, r3, mapFeatureRaw) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 1.0, 4.0, 1.0, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop two features, as well as their corresponding map keys + filteredRawData.featuresToDrop.length shouldBe 2 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + + // Since we switched the distributions in features 2 & 3 between the training and scoring sets, both of them + // should be removed (the two raw features and the two corresponding map keys) + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2", "myF3") + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2", "f3") + + // Check the actual filtered dataframe schemas + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f1")) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 12) + } + + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. 
Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features f1 & f3 are switched between the training and scoring sets, so they should have a very large JS + * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both + * f1 and f3 (as well as their corresponding map keys) should be removed. + */ + it should "correctly clean the dataframe containing map and non-map features due to JS divergence" in { + // Define random generators that will be the same for training and scoring dataframes + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 1000.0, sigma = 5.0).withProbabilityOfEmpty(0.1) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator3, currencyGenerator2, currencyGenerator1, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) + + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 0.8, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) - // Check that the only feature that was dropped was the binary map feature (myF4) // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop two features, as well as their corresponding map keys + filteredRawData.featuresToDrop.length shouldBe 2 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + + // The features whose distributions were swapped should be thrown out + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF1", "myF3") + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f3") + + // Check the actual filtered dataframe schemas + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe false + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f2"))
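 + // myF2 (and map key f2) is drawn from the same generator in both sets, so its JS divergence stays near zero and + // it should survive.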
 + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 12) + } + + it should "not drop protected raw features" in { + // Define random generators that will be the same for training and scoring dataframes + val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) + val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) + val currencyGenerator50 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.5) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator25, currencyGenerator95, currencyGenerator50, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator95, currencyGenerator50, currencyGenerator25, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) + + // Check that also using the scoring reader will result in the features rarely filled in either the training or + // scoring sets being removed, except for the protected feature that would otherwise be removed + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), + bins = 10, + minFill = 0.1, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 1.0, + maxCorrelation = 1.0, + protectedFeatures = Set("myF1") + ) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop one feature, as well as two map keys (map keys are not protected) + filteredRawData.featuresToDrop.length shouldBe 1 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + + // The feature that is 95% empty should be thrown out, while the protected myF1 is kept + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2") + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f2") + // filteredData.mapKeysToDrop + // .foldLeft(Set.empty[String])((acc, x) => acc ++ x._2) should contain theSameElementsAs Seq("f1", "f2") + + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + 
filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f3")) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 12) } + + // TODO: check null leakage removals (just do one threshold, or two in the same test) + it should "correctly clean the dataframe returned and give the features to blacklist" in { val params = new OpParams() val survPred = survived.copy(isResponse = false) @@ -469,6 +731,37 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with fd.trainingFeatureDistributions ++ fd.scoringFeatureDistributions shouldBe fd.featureDistributions } + private def assertFeatureDistributionEquality(fd: FilteredRawData, total: Int): Unit = { + fd.featureDistributions.length shouldBe total + fd.trainingFeatureDistributions.zip(fd.scoringFeatureDistributions).foreach { + case (train, score) => + train.name shouldBe score.name + train.key shouldBe score.key + train.count shouldBe score.count + train.nulls shouldBe score.nulls + train.distribution shouldBe score.distribution + train.summaryInfo shouldBe score.summaryInfo + } + } + + /** + * Defines readers in terms of datasets (in these tests, already created by feature generators) + * + * @param trainDf Training dataframe + * @param scoreDf Scoring dataframe + * @return Tuple of (trainingReader, scoringReader) + */ + private def makeReaders(trainDf: Dataset[Row], scoreDf: Dataset[Row]): (CustomReader[Row], CustomReader[Row]) = { + val trainReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) + } + val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(scoreDf) + } + + (trainReader, scoreReader) + } + private def nullLabelCorrelationTest( maxCorrelation: Double, expectedDropped: Seq[OPFeature],
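Patch 04 below folds the repeated drop assertions into a checkDroppedFeatures helper. Its definition falls outside the excerpted hunks; judging from its call sites and the assertions it replaces, a sketch of it (parameter names taken from the call sites, with the map feature type simplified to the CurrencyMap case) would look roughly like:

    private def checkDroppedFeatures(
      filteredRawData: FilteredRawData,
      mapFeature: FeatureLike[CurrencyMap],
      featureUniverse: Set[String],
      expectedDroppedFeatures: Set[String],
      mapKeyUniverse: Set[String],
      expectedDroppedKeys: Set[String]
    ): Unit = {
      // Exactly the expected raw features and map keys should be dropped
      filteredRawData.featuresToDrop.map(_.name).toSet shouldBe expectedDroppedFeatures
      filteredRawData.mapKeysToDrop.values.flatten.toSet shouldBe expectedDroppedKeys
      // Dropped columns should be gone from the cleaned dataframe, while the rest survive
      featureUniverse.foreach { name =>
        filteredRawData.cleanedData.schema.fields.exists(_.name == name) shouldBe !expectedDroppedFeatures.contains(name)
      }
      // Surviving map values should only contain keys that were not dropped
      filteredRawData.cleanedData.collect(mapFeature).foreach(m =>
        m.value.keySet.subsetOf(mapKeyUniverse -- expectedDroppedKeys) shouldBe true
      )
    }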
From 8469d1f90532dce9a66df1e55a12f3b6b7919bdd Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 12:02:16 -0700 Subject: [PATCH 04/13] Cleaned up new RFF tests, and added helper function for repetitive checks --- .../op/filters/RawFeatureFilterTest.scala | 665 ++++++++---------- 1 file changed, 310 insertions(+), 355 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 31b02152e3..6b69eb2f71 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -33,13 +33,13 @@ package com.salesforce.op.filters import com.salesforce.op.{OpParams, OpWorkflow} import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike, OPFeature} import com.salesforce.op.features.types._ -import com.salesforce.op.readers.{CustomReader, DataFrameFieldNames, ReaderKey} +import com.salesforce.op.readers.{CustomReader, ReaderKey} +import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer import com.salesforce.op.test._ import com.salesforce.op.testkit._ import com.salesforce.op.testkit.RandomData import com.salesforce.op.stages.impl.feature.OPMapVectorizerTestHelper.makeTernaryOPMapTransformer import com.salesforce.op.utils.spark.RichDataset._ -import com.twitter.algebird.Operators._ import org.apache.log4j.Level import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.junit.runner.RunWith @@ -51,6 +52,8 @@ import scala.reflect.runtime.universe.TypeTag @RunWith(classOf[JUnitRunner]) class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { // loggingLevel(Level.INFO) + // Our randomly generated data will generate feature names and corresponding map keys in this universe + val featureUniverse = Set("myF1", "myF2", "myF3") + val mapKeyUniverse = Set("f1", "f2", "f3") + Spec[RawFeatureFilter[_]] should "compute feature stats correctly" in { val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) @@ -147,76 +151,6 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } - /** - * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the - * columns of the dataframe are fixed to be myF1, myF2, and myF3 so that the same OPFeatures can be used to - * refer to columns in either dataframe. 
 - * - * @param f1 Random data generator for feature 1 (type F1) - * @param f2 Random data generator for feature 2 (type F2) - * @param f3 Random data generator for feature 3 (type F3) - * @param numRows Number of rows to generate - * @tparam F1 Type of feature 1 - * @tparam F2 Type of feature 2 - * @tparam F3 Type of feature 3 - * @return Tuple containing the generated dataframe and each individual OPFeature - */ - def generateRandomDfAndFeatures[ - F1 <: FeatureType : TypeTag, - F2 <: FeatureType : TypeTag, - F3 <: FeatureType : TypeTag - ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], numRows: Int): - (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { - - val f1Data = f1.limit(numRows) - val f2Data = f2.limit(numRows) - val f3Data = f3.limit(numRows) - - // Combine the data into a single tuple for each row - val generatedTrainData: Seq[(F1, F2, F3)] = f1Data.zip(f2Data).zip(f3Data).map { - case ((a, b), c) => (a, b, c) - } - - TestFeatureBuilder[F1, F2, F3]("myF1", "myF2", "myF3", generatedTrainData) - } - - /** - * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the - * columns of the dataframe are fixed to be myF1, myF2, myF3, and myF4 so that the same OPFeatures can be used to - * refer to columns in either dataframe. - * - * @param f1 Random data generator for feature 1 (type F1) - * @param f2 Random data generator for feature 2 (type F2) - * @param f3 Random data generator for feature 3 (type F3) - * @param f4 Random data generator for feature 4 (type F4) - * @param numRows Number of rows to generate - * @tparam F1 Type of feature 1 - * @tparam F2 Type of feature 2 - * @tparam F3 Type of feature 3 - * @tparam F4 Type of feature 4 - * @return Tuple containing the generated dataframe and each individual OPFeature - */ - def generateRandomDfAndFeatures[ - F1 <: FeatureType : TypeTag, - F2 <: FeatureType : TypeTag, - F3 <: FeatureType : TypeTag, - F4 <: FeatureType : TypeTag - ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], f4: RandomData[F4], numRows: Int): - (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { - - val f1Data = f1.limit(numRows) - val f2Data = f2.limit(numRows) - val f3Data = f3.limit(numRows) - val f4Data = f4.limit(numRows) - - // Combine the data into a single tuple for each row - val generatedTrainData: Seq[(F1, F2, F3, F4)] = f1Data.zip(f2Data).zip(f3Data).zip(f4Data).map { - case (((a, b), c), d) => (a, b, c, d) - } - - TestFeatureBuilder[F1, F2, F3, F4]("myF1", "myF2", "myF3", "myF4", generatedTrainData) - } - it should "not remove any features when the training and scoring sets are identical" in { // Define random generators that will be the same for training and scoring dataframes val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) .withProbabilityOfEmpty(0.2) val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( cityGenerator, countryGenerator, pickListGenerator, currencyGenerator, 1000 ) // Define the readers val (trainReader, scoreReader) = makeReaders(trainDf, trainDf) val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(trainCity, trainCountry, trainPickList, trainCurrency) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.4, 0.1, 1.0, 0.1, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) filteredRawData.featuresToDrop shouldBe empty filteredRawData.mapKeysToDrop shouldBe empty filteredRawData.cleanedData.schema.fields should contain theSameElementsAs trainReader.generateDataFrame(features).schema.fields - assertFeatureDistributions(filteredRawData, total = features.length * 2) - - // Also check that all the feature distributions are the same between the training and scoring sets - filteredRawData.trainingFeatureDistributions.zip(filteredRawData.scoringFeatureDistributions).foreach{ - case (train, score) => - train.name shouldBe score.name - train.key shouldBe score.key - train.count shouldBe score.count - train.nulls shouldBe score.nulls - train.distribution shouldBe score.distribution - train.summaryInfo shouldBe score.summaryInfo - } + assertFeatureDistributionEquality(filteredRawData, total = features.length * 2) } it should "correctly clean the dataframe containing map and non-map features due to min fill rate" in { @@ -288,32 +211,19 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - // Check that using the training reader only will result in the rarely filled features being removed val filter = new 
RawFeatureFilter(trainReader, None, 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 1 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 1 - - // The feature that is 95% empty should be thrown out - filteredRawData.featuresToDrop.head.name.startsWith("myF2") shouldBe true - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2") - - // Check the actual filtered dataframe schemas - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet should not contain "f2") - - // There should be 6 FeatureDistributions - training for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 6) - + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f2") + ) // Check that also using the scoring reader will result in the features rarely filled in either the training or // scoring sets being removed val filterWithScoring = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) val filteredRawDataWithScoring = filterWithScoring.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop two features, as well as their corresponding map keys - filteredRawDataWithScoring.featuresToDrop.length shouldBe 2 - filteredRawDataWithScoring.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawDataWithScoring.mapKeysToDrop.values.head.size shouldBe 2 - - // The features that are rarely filled in either set should be thrown out - filteredRawDataWithScoring.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF1", "myF2") - filteredRawDataWithScoring.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f2") - // filteredDataWithScoring.mapKeysToDrop - // .foldLeft(Set.empty[String])((acc, x) => acc ++ x._2) should contain theSameElementsAs Seq("f1", "f2") - - filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe false - filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true - filteredRawDataWithScoring.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f3")) + checkDroppedFeatures( + filteredRawDataWithScoring, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF1", "myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f2") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawDataWithScoring, 
total = 12) + assertFeatureDistributions(filteredRawDataWithScoring, total = 12) } /** @@ -355,24 +256,24 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with it should "correctly clean the dataframe containing map and non-map features due to max absolute fill rate " + "difference" in { // Define random generators that will be the same for training and scoring dataframes - val realGenerator1 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) - val realGenerator2 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) - val realGenerator3 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.8) + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.8) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) - val (trainDf, r1, r2, r3) = - generateRandomDfAndFeatures[Real, Real, Real]( - realGenerator1, realGenerator2, realGenerator3, 1000 + val (trainDf, c1, c2, c3) = + generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 ) - val mapFeature = makeTernaryOPMapTransformer[Real, RealMap, Double](r1, r2, r3) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) - val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Real, Real, Real]( - realGenerator1, realGenerator3, realGenerator2, 1000 + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator3, currencyGenerator2, 1000 ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -381,40 +282,22 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here - val features: Array[OPFeature] = Array(r1, r2, r3, mapFeatureRaw) + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 0.4, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) - /* - val exclusions = filter.getFeaturesToExclude( - trainingDistribs = filteredRawData.trainingFeatureDistributions, - scoringDistribs = filteredRawData.scoringFeatureDistributions, - correlationInfo = Map.empty - ) - println(exclusions) - */ - // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 2 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 - - // Since we switched the
distributions in features 2 & 3 between the training and scoring sets, then both of them - // should be removed (the two raw features and the two corresponding map keys) - filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2", "myF3") - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2", "f3") - - // Check the actual filtered dataframe schemas - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f1")) + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2", "myF3"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f2", "f3") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 12) + assertFeatureDistributions(filteredRawData, total = 12) } /** @@ -429,24 +312,24 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with it should "correctly clean the dataframe containing map and non-map features due to max fill ratio " + "difference" in { // Define random generators that will be the same for training and scoring dataframes - val realGenerator1 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) - val realGenerator2 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) - val realGenerator3 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.7) + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.7) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) - val (trainDf, r1, r2, r3) = - generateRandomDfAndFeatures[Real, Real, Real]( - realGenerator1, realGenerator2, realGenerator3, 1000 + val (trainDf, c1, c2, c3) = + generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 ) - val mapFeature = makeTernaryOPMapTransformer[Real, RealMap, Double](r1, r2, r3) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) - val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Real, Real, Real]( - realGenerator1, realGenerator3, realGenerator2, 1000 + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator3, currencyGenerator2, 1000 ) val transformedScoreDf = new 
OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -455,31 +338,22 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here - val features: Array[OPFeature] = Array(r1, r2, r3, mapFeatureRaw) + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 1.0, 4.0, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 2 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 - - // Since we switched the distributions in features 2 & 3 between the training and scoring sets, then both of them - // should be removed (the two raw features and the two corresponding map keys) - filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2", "myF3") - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2", "f3") - - // Check the actual filtered dataframe schemas - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f1")) + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2", "myF3"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f2", "f3") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 12) + assertFeatureDistributions(filteredRawData, total = 12) } /** @@ -519,33 +393,24 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 0.8, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 2 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 - - // The feature that is 99% empty should be thrown out - filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF1", "myF3") - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f3") - - // Check the actual filtered dataframe schemas - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe true -
filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f2")) + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF1", "myF3"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f3") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 12) + assertFeatureDistributions(filteredRawData, total = 12) } - it should "not drop protected raw features" in { + it should "not drop protected raw features or response features" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) @@ -573,10 +438,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - // Check that using the scoring reader only will result in the rarely filled in both training and scoring sets // being removed, except for the protected feature that would normally be removed - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), + val filterWithProtected = new RawFeatureFilter(trainReader, Some(scoreReader), bins = 10, minFill = 0.1, maxFillDifference = 1.0, @@ -585,141 +449,234 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with maxCorrelation = 1.0, protectedFeatures = Set("myF1") ) - val filteredRawData = filter.generateFilteredRaw(features, params) + val filteredRawData = filterWithProtected.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 1 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f2") + ) - // The feature that is 99% empty should be thrown out - filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2") - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f2") - // filteredData.mapKeysToDrop - // .foldLeft(Set.empty[String])((acc, x) => acc ++ x._2) should contain theSameElementsAs Seq("f1", "f2") + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + assertFeatureDistributions(filteredRawData, total = 12) - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true - 
filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f3")) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), + bins = 10, + minFill = 0.1, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 1.0, + maxCorrelation = 1.0 + ) + val featuresWithResponse: Array[OPFeature] = Array(c1.copy(isResponse = true), c2, c3, mapFeatureRaw) + val filteredRawDataWithResponse = filter.generateFilteredRaw(featuresWithResponse, params) + + checkDroppedFeatures( + filteredRawDataWithResponse, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f2") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 12) + assertFeatureDistributions(filteredRawDataWithResponse, total = 12) } - - // TODO: check null leakage removals (just do one threshold, or two in the same test) - - it should "correctly clean the dataframe returned and give the features to blacklist" in { - val params = new OpParams() - val survPred = survived.copy(isResponse = false) - val features: Array[OPFeature] = - Array(survPred, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.0, 1.0, Double.PositiveInfinity, 1.0, 1.0) - val filteredRawData = filter.generateFilteredRaw(features, params) - filteredRawData.featuresToDrop shouldBe empty - filteredRawData.mapKeysToDrop shouldBe empty - filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields - - assertFeatureDistributions(filteredRawData, total = 26) - - val filter1 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0) - val filteredRawData1 = filter1.generateFilteredRaw(features, params) - filteredRawData1.featuresToDrop should contain theSameElementsAs Array(survPred) - filteredRawData1.mapKeysToDrop should contain theSameElementsAs Map( - "numericMap" -> Set("Male"), "booleanMap" -> Set("Male"), "stringMap" -> Set("Male")) - filteredRawData1.cleanedData.schema.fields.exists(_.name == survPred.name) shouldBe false - filteredRawData1.cleanedData.collect(stringMap).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("Female")) - assertFeatureDistributions(filteredRawData, total = 26) - } + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features f1 & f3 are switched between the training and scoring sets, so they should have a very large JS + * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both + * f1 and f3 (as well as their corresponding map keys) should be removed, but they are added to a list of features + * protected from JS divergence removal. 
+ */ + it should "not drop JS divergence-protected features based on JS divergence check" in { + // Define random generators that will be the same for training and scoring dataframes + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 1000.0, sigma = 5.0).withProbabilityOfEmpty(0.1) - it should "not drop response features" in { - val params = new OpParams() - val features: Array[OPFeature] = - Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0) - val filteredRawData = filter.generateFilteredRaw(features, params) - filteredRawData.featuresToDrop shouldBe empty - filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields - filteredRawData.cleanedData.collect(stringMap) - .foreach(m => if (m.nonEmpty) m.value.keySet shouldEqual Set("Female")) - assertFeatureDistributions(filteredRawData, total = 26) - } + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) - it should "not drop protected features" in { - val params = new OpParams() - val features: Array[OPFeature] = - Array(survived, age, gender, height, weight, description, boarded) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9) - val filteredRawData = filter.generateFilteredRaw(features, params) - filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) - filteredRawData.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs - Array(DataFrameFieldNames.KeyFieldName, survived.name) - assertFeatureDistributions(filteredRawData, total = 14) + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator3, currencyGenerator2, currencyGenerator1, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) - val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9, - protectedFeatures = Set(age.name, gender.name)) - val filteredRawData2 = filter2.generateFilteredRaw(features, params) - filteredRawData2.featuresToDrop.toSet shouldEqual Set(height, weight, description, boarded) - filteredRawData2.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs - Array(DataFrameFieldNames.KeyFieldName, survived.name, age.name, gender.name) - assertFeatureDistributions(filteredRawData, total = 14) - } + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, 
transformedScoreDf) - it should "not drop JS divergence-protected features based on JS divergence check" in { val params = new OpParams() - val features: Array[OPFeature] = - Array(survived, age, gender, height, weight, description, boarded, boardedTime, boardedTimeAsDateTime) - val filter = new RawFeatureFilter( - trainingReader = dataReader, - scoringReader = Some(simpleReader), + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), bins = 10, - minFill = 0.0, + minFill = 0.1, maxFillDifference = 1.0, maxFillRatioDiff = Double.PositiveInfinity, - maxJSDivergence = 0.0, + maxJSDivergence = 0.8, maxCorrelation = 1.0, - jsDivergenceProtectedFeatures = Set(boardedTime.name, boardedTimeAsDateTime.name) + jsDivergenceProtectedFeatures = Set("myF3") ) - val filteredRawData = filter.generateFilteredRaw(features, params) - filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) - filteredRawData.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs - Seq(DataFrameFieldNames.KeyFieldName, survived.name, boardedTime.name, boardedTimeAsDateTime.name) - assertFeatureDistributions(filteredRawData, total = 18) - } - it should "correctly drop features based on null-label leakage correlation greater than 0.9" in { - val expectedDropped = Seq(boarded, weight, gender) - val expectedMapKeys = Seq("Female", "Male") - val expectedDroppedMapKeys = Map[String, Set[String]]() - nullLabelCorrelationTest(0.9, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF1"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f3") + ) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + assertFeatureDistributions(filteredRawData, total = 12) } - it should "correctly drop features based on null-label leakage correlation greater than 0.6" in { - val expectedDropped = Seq(boarded, weight, gender, age) - val expectedMapKeys = Seq("Female", "Male") - val expectedDroppedMapKeys = Map[String, Set[String]]() - nullLabelCorrelationTest(0.6, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + + it should "correctly drop features based on null-label correlations" in { + // Define random generators that will be the same for training and scoring dataframes + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter 
will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + + // Construct a label that we know is highly biased from the pickList data to check if SanityChecker detects it + val labelTransformer = new UnaryLambdaTransformer[Currency, RealNN](operationName = "labelFunc", + transformFn = r => r.value match { + case Some(v) => RealNN(1.0) + case _ => RealNN(0.0) + } + ) + val labelData = labelTransformer.setInput(c2).getOutput().asInstanceOf[Feature[RealNN]] + .copy(isResponse = true) + val labelDataRaw = labelData.asRaw(isResponse = true) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw, labelDataRaw) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 0.8) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // TODO: check that filter.getFeaturesToExclude contains the correlation exclusions too + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse ++ Set(labelData.name), + expectedDroppedFeatures = Set("myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f2") + ) + + // There should be 14 FeatureDistributions - training and scoring for 4 raw features, one map with three keys + assertFeatureDistributions(filteredRawData, total = 14) } - it should "correctly drop features based on null-label leakage correlation greater than 0.4" in { - val expectedDropped = Seq(boarded, weight, gender, age, description) - val expectedMapKeys = Seq("Male") - val expectedDroppedMapKeys = Map("booleanMap" -> Set("Female"), "stringMap" -> Set("Female"), - "numericMap" -> Set("Female")) - nullLabelCorrelationTest(0.4, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + /** + * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the + * columns of the dataframe are fixed to be myF1, myF2, and myF3 so that the same OPFeatures can be used to + * refer to columns in either dataframe.
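As a usage sketch of this helper (illustrative generator settings only; it assumes the RandomReal generators and the three-feature variant of generateRandomDfAndFeatures introduced in this patch):

// Sketch only: runs inside this test class, where RandomReal, Currency, and
// the three-feature generateRandomDfAndFeatures helper are in scope.
val g1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25)
val g2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95)
val g3 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.50)

// Columns are always named myF1/myF2/myF3, so the returned OPFeatures can be
// reused to refer to any dataframe produced by the same helper.
val (df, f1, f2, f3) = generateRandomDfAndFeatures[Currency, Currency, Currency](g1, g2, g3, 1000)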
+ * + * @param f1 Random data generator for feature 1 (type F1) + * @param f2 Random data generator for feature 2 (type F2) + * @param f3 Random data generator for feature 3 (type F3) + * @param numRows Number of rows to generate + * @tparam F1 Type of feature 1 + * @tparam F2 Type of feature 2 + * @tparam F3 Type of feature 3 + * @return Tuple containing the generated dataframe and each individual OPFeature + */ + def generateRandomDfAndFeatures[ + F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag + ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], numRows: Int): + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { + + val f1Data = f1.limit(numRows) + val f2Data = f2.limit(numRows) + val f3Data = f3.limit(numRows) + + // Combine the data into a single tuple for each row + val generatedTrainData: Seq[(F1, F2, F3)] = f1Data.zip(f2Data).zip(f3Data).map { + case ((a, b), c) => (a, b, c) + } + + TestFeatureBuilder[F1, F2, F3]("myF1", "myF2", "myF3", generatedTrainData) } - it should "correctly drop features based on null-label leakage correlation greater than 0.3" in { - val expectedDropped = Seq(boarded, weight, gender, age, description, booleanMap, numericMap, stringMap) - // all the maps dropped - val expectedDroppedMapKeys = Map[String, Set[String]]() - nullLabelCorrelationTest(0.3, expectedDropped, Seq(), expectedDroppedMapKeys) + /** + * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the + * columns of the dataframe are fixed to be myF1, myF2, myF3, and myF4 so that the same OPFeatures can be used to + * refer to columns in either dataframe. + * + * @param f1 Random data generator for feature 1 (type F1) + * @param f2 Random data generator for feature 2 (type F2) + * @param f3 Random data generator for feature 3 (type F3) + * @param f4 Random data generator for feature 4 (type F4) + * @param numRows Number of rows to generate + * @tparam F1 Type of feature 1 + * @tparam F2 Type of feature 2 + * @tparam F3 Type of feature 3 + * @tparam F4 Type of feature 4 + * @return Tuple containing the generated dataframe and each individual OPFeature + */ + def generateRandomDfAndFeatures[ + F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag, + F4 <: FeatureType : TypeTag + ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], f4: RandomData[F4], numRows: Int): + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { + + val f1Data = f1.limit(numRows) + val f2Data = f2.limit(numRows) + val f3Data = f3.limit(numRows) + val f4Data = f4.limit(numRows) + + // Combine the data into a single tuple for each row + val generatedTrainData: Seq[(F1, F2, F3, F4)] = f1Data.zip(f2Data).zip(f3Data).zip(f4Data).map { + case (((a, b), c), d) => (a, b, c, d) + } + + TestFeatureBuilder[F1, F2, F3, F4]("myF1", "myF2", "myF3", "myF4", generatedTrainData) } private def assertFeatureDistributions(fd: FilteredRawData, total: Int): Assertion = { @@ -733,7 +690,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with private def assertFeatureDistributionEquality(fd: FilteredRawData, total: Int): Unit = { fd.featureDistributions.length shouldBe total - fd.trainingFeatureDistributions.zip(fd.trainingFeatureDistributions).foreach { + fd.trainingFeatureDistributions.zip(fd.scoringFeatureDistributions).foreach { case (train, score) => train.name shouldBe score.name train.key shouldBe score.key @@ -762,42 +719,40 @@ class 
RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with (trainReader, scoreReader) } - private def nullLabelCorrelationTest( - maxCorrelation: Double, - expectedDropped: Seq[OPFeature], - expectedMapKeys: Seq[String], - expectedDroppedMapKeys: Map[String, Set[String]] + // TODO: Expand scope to take multiple map types, and/or type parameters for maps of different types + /** + * Automates various checks on whether features are removed from the cleaned dataframe produced by RawFeatureFilter. + * Right now, it is specialized to accept just one map type feature, which is hardcoded to be a CurrencyMap based + * on current tests. + * + * @param filteredRawData FilteredRawData object produced by RawFeatureFilter + * @param mapFeatureRaw Raw map feature to check keys on + * @param featureUniverse Set of raw feature names you start with + * @param expectedDroppedFeatures Expected set of raw feature names to be dropped + * @param mapKeyUniverse Set of map keys in mapFeatureRaw you start with + * @param expectedDroppedKeys Expected set of map keys to be dropped + */ + private def checkDroppedFeatures( + filteredRawData: FilteredRawData, + mapFeatureRaw: FeatureLike[CurrencyMap], + featureUniverse: Set[String], + expectedDroppedFeatures: Set[String], + mapKeyUniverse: Set[String], + expectedDroppedKeys: Set[String] ): Unit = { - def getFilter(maxCorrelation: Double): RawFeatureFilter[Passenger] = new RawFeatureFilter( - trainingReader = dataReader, - scoringReader = Some(simpleReader), - bins = 10, - minFill = 0.0, - maxFillDifference = 1.0, - maxFillRatioDiff = Double.PositiveInfinity, - maxJSDivergence = 1.0, - maxCorrelation = maxCorrelation) + // Check that we drop the expected features, as well as their corresponding map keys + filteredRawData.featuresToDrop.length shouldBe expectedDroppedFeatures.size + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe expectedDroppedKeys.size - val params = new OpParams() - val features: Array[OPFeature] = - Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filteredRawData@FilteredRawData(df, dropped, droppedKeyValue, _) = - getFilter(maxCorrelation).generateFilteredRaw(features, params) - - assertFeatureDistributions(filteredRawData, total = 26) - dropped should contain theSameElementsAs expectedDropped - droppedKeyValue should contain theSameElementsAs expectedDroppedMapKeys - - df.schema.fields.map(_.name) should contain theSameElementsAs - DataFrameFieldNames.KeyFieldName +: features.diff(dropped).map(_.name) - if (expectedMapKeys.nonEmpty) { - df.collect(booleanMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys - df.collect(numericMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys - df.collect(stringMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys - } else { - intercept[IllegalArgumentException] { df.collect(booleanMap) } - intercept[IllegalArgumentException] { df.collect(numericMap) } - intercept[IllegalArgumentException] { df.collect(stringMap) } - } + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs expectedDroppedFeatures + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs expectedDroppedKeys + + // Check the actual filtered dataframe schemas + featureUniverse.foreach(f => { + filteredRawData.cleanedData.schema.fields.exists(_.name == f) shouldBe
!expectedDroppedFeatures.contains(f) + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet.intersect(expectedDroppedKeys) shouldBe Set.empty) + }) } } From 0999ac9657c67eeae80de563d49708a56eb17c07 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 15:20:06 -0700 Subject: [PATCH 05/13] More documentation and readability changes --- .../op/filters/RawFeatureFilterTest.scala | 111 +++++++++++------- 1 file changed, 69 insertions(+), 42 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 6b69eb2f71..eec086b7c3 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -57,6 +57,8 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Our randomly generated data will generate feature names and corresponding map keys in this universe val featureUniverse = Set("myF1", "myF2", "myF3") val mapKeyUniverse = Set("f1", "f2", "f3") + // Number of rows to use in randomly generated data sets + val numRows = 1000 Spec[RawFeatureFilter[_]] should "compute feature stats correctly" in { val features: Array[OPFeature] = @@ -151,6 +153,12 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } + /** + * This test uses several data generators to generate data according to different distributions, makes a reader + * corresponding to the generated dataframe, and then uses that as both the training and scoring reader in + * RawFeatureFilter. Not only should no features be removed, but the training and scoring distributions should be + * identical. + */ it should "not remove any features when the training and scoring sets are identical" in { // Define random generators that will be the same for training and scoring dataframes val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) @@ -163,7 +171,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // dataframes since they point to columns with the same names) val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( - cityGenerator, countryGenerator, pickListGenerator, currencyGenerator,1000 + cityGenerator, countryGenerator, pickListGenerator, currencyGenerator, numRows ) // Define the readers @@ -183,6 +191,17 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with assertFeatureDistributionEquality(filteredRawData, total = features.length * 2) } + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features c1, c2, and c3 are permuted between the training and scoring sets. In the training set, feature c2 + * has a 5% fill rate and should be removed. In the scoring set, map key f1 has a 5% fill rate so should be removed. + * This test checks removal when only the training reader is used, and when both the training and scoring readers + * are used. 
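For intuition, the minimum-fill-rate rule that this test exercises can be sketched in isolation as follows. This is a minimal standalone sketch, not RawFeatureFilter's actual implementation; the fill rates are those implied by the generators used in the test (fill rate = 1 - probabilityOfEmpty, with the generators permuted between the training and scoring sets):

// A feature fails the check when its fill rate (fraction of non-empty rows)
// falls below minFill in either the training or the scoring set.
case class FillStats(name: String, trainFill: Double, scoreFill: Double)

def failsMinFill(s: FillStats, minFill: Double): Boolean =
  s.trainFill < minFill || s.scoreFill < minFill

val stats = Seq(
  FillStats("myF1", trainFill = 0.75, scoreFill = 0.05), // dropped only once the scoring set is checked
  FillStats("myF2", trainFill = 0.05, scoreFill = 0.50), // dropped by training checks alone
  FillStats("myF3", trainFill = 0.50, scoreFill = 0.75)  // kept
)
stats.filter(s => failsMinFill(s, minFill = 0.1)).map(_.name) // List(myF1, myF2)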
+ * + */ it should "correctly clean the dataframe containing map and non-map features due to min fill rate" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) @@ -192,7 +211,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator25, currencyGenerator95, currencyGenerator50, 1000 + currencyGenerator25, currencyGenerator95, currencyGenerator50, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -201,7 +220,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator95, currencyGenerator50, currencyGenerator25,1000 + currencyGenerator95, currencyGenerator50, currencyGenerator25, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -220,7 +239,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF2"), + expectedDroppedFeatures = Set(c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f2") ) @@ -235,7 +254,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawDataWithScoring, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF1", "myF2"), + expectedDroppedFeatures = Set(c1.name, c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f2") ) @@ -249,9 +268,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three * raw features - each key contains the same data as the corresponding raw feature. * - * Features f2 & f3 are switched between the training and scoring sets, so that they should have an absolute + * Features c2 & c3 are switched between the training and scoring sets, so that they should have an absolute * fill rate difference of 0.6. The RawFeatureFilter is set up with a maximum absolute fill rate of 0.4 so both - * f2 and f3 (as well as their corresponding map keys) should be removed. + * c2 and c3 (as well as their corresponding map keys f2 & f3) should be removed. 
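The absolute fill-rate-difference rule described above can be sketched as follows (standalone sketch with the fill rates implied by the test's generators, not RawFeatureFilter's actual implementation):

// Drop a feature when |trainFill - scoreFill| exceeds maxFillDifference.
def failsFillDifference(trainFill: Double, scoreFill: Double, maxFillDifference: Double): Boolean =
  math.abs(trainFill - scoreFill) > maxFillDifference

// Swapping the 0.8-fill and 0.2-fill generators gives c2 and c3 a difference of
// 0.6, above the 0.4 threshold used in this test; c1 stays at 1.0 vs 1.0.
failsFillDifference(0.8, 0.2, maxFillDifference = 0.4) // true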
*/ it should "correctly clean the dataframe containing map and non-map features due to max absolute fill rate " + "difference" in { @@ -264,7 +283,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -273,7 +292,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator3, currencyGenerator2, 1000 + currencyGenerator1, currencyGenerator3, currencyGenerator2, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -291,7 +310,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF2", "myF3"), + expectedDroppedFeatures = Set(c2.name, c3.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f2", "f3") ) @@ -305,9 +324,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three * raw features - each key contains the same data as the corresponding raw feature. * - * Features f2 & f3 are switched between the training and scoring sets, so that they should have an absolute + * Features c2 & c3 are switched between the training and scoring sets, so that they should have an absolute * fill rate difference of 0.25, and a relative fill ratio difference of 6. The RawFeatureFilter is set up with a - * maximum fill ratio difference of 4 so both f2 and f3 (as well as their corresponding map keys) should be removed. + * maximum fill ratio difference of 4 so both c2 and c3 (as well as their corresponding map keys) should be removed. 
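Similarly, the fill-ratio rule can be sketched as follows (standalone sketch with the test's implied fill rates; note that a real check must also guard against a zero fill rate before dividing):

// Drop a feature when the larger fill rate is more than maxFillRatioDiff times
// the smaller one.
def failsFillRatio(trainFill: Double, scoreFill: Double, maxFillRatioDiff: Double): Boolean =
  math.max(trainFill, scoreFill) / math.min(trainFill, scoreFill) > maxFillRatioDiff

// c2 and c3 end up at 0.30 vs 0.05 fill: a ratio of 6, above the threshold of 4
// used in this test, while c1 stays at 1.0 vs 1.0 (ratio 1).
failsFillRatio(0.30, 0.05, maxFillRatioDiff = 4.0) // true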
*/ it should "correctly clean the dataframe containing map and non-map features due to max fill ratio " + "difference" in { @@ -320,16 +339,16 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator3, currencyGenerator2, 1000 + currencyGenerator1, currencyGenerator3, currencyGenerator2, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -347,7 +366,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF2", "myF3"), + expectedDroppedFeatures = Set(c2.name, c3.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f2", "f3") ) @@ -361,7 +380,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three * raw features - each key contains the same data as the corresponding raw feature. * - * Features f1 & f3 are switched between the training and scoring sets, so they should have a very large JS + * Features c1 & c3 are switched between the training and scoring sets, so they should have a very large JS * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both * f1 and f3 (as well as their corresponding map keys) should be removed. 
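The divergence being thresholded here can be sketched as follows: a standalone implementation of Jensen-Shannon divergence over binned distributions, using base-2 logs so the result lies in [0, 1]. It illustrates the quantity compared against maxJSDivergence, not RawFeatureFilter's actual implementation:

// Jensen-Shannon divergence between two histograms with the same bin count.
def jsDivergence(p: Array[Double], q: Array[Double]): Double = {
  def normalize(h: Array[Double]): Array[Double] = { val s = h.sum; h.map(_ / s) }
  def kl(a: Array[Double], b: Array[Double]): Double =
    a.zip(b).collect { case (ai, bi) if ai > 0.0 => ai * math.log(ai / bi) / math.log(2.0) }.sum
  val (pn, qn) = (normalize(p), normalize(q))
  val m = pn.zip(qn).map { case (pi, qi) => (pi + qi) / 2.0 } // midpoint distribution
  (kl(pn, m) + kl(qn, m)) / 2.0
}

// Histograms with no overlapping mass diverge maximally, which is effectively
// what swapping the mean-1.0 and mean-1000.0 log-normal generators produces.
jsDivergence(Array(1.0, 0.0), Array(0.0, 1.0)) // 1.0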
*/ @@ -374,7 +393,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -383,7 +402,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator3, currencyGenerator2, currencyGenerator1, 1000 + currencyGenerator3, currencyGenerator2, currencyGenerator1, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -401,7 +420,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF1", "myF3"), + expectedDroppedFeatures = Set(c1.name, c3.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f3") ) @@ -419,7 +438,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator25, currencyGenerator95, currencyGenerator50, 1000 + currencyGenerator25, currencyGenerator95, currencyGenerator50, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -428,7 +447,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator95, currencyGenerator50, currencyGenerator25, 1000 + currencyGenerator95, currencyGenerator50, currencyGenerator25, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -447,7 +466,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with maxFillRatioDiff = Double.PositiveInfinity, maxJSDivergence = 1.0, maxCorrelation = 1.0, - protectedFeatures = Set("myF1") + protectedFeatures = Set(c1.name) ) val filteredRawData = filterWithProtected.generateFilteredRaw(features, params) @@ -456,7 +475,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF2"), + expectedDroppedFeatures = Set(c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f2") ) @@ -479,7 +498,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawDataWithResponse, mapFeatureRaw, featureUniverse = 
featureUniverse, - expectedDroppedFeatures = Set("myF2"), + expectedDroppedFeatures = Set(c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f2") ) @@ -488,26 +507,27 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with assertFeatureDistributions(filteredRawDataWithResponse, total = 12) } + // TODO: Add a way to protect map keys from removal? /** - * This test generates three numeric generators with the same underlying distribution, but different fill rates. - * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three - * raw features - each key contains the same data as the corresponding raw feature. + * This test generates three numeric generators with very different underlying distributions. Each generator + * corresponds to a different raw feature. Additionally, a single map feature is made from the three raw features - + * each key contains the same data as the corresponding raw feature. * - * Features f1 & f3 are switched between the training and scoring sets, so they should have a very large JS + * Features c1 & c3 are switched between the training and scoring sets, so they should have a very large JS * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both - * f1 and f3 (as well as their corresponding map keys) should be removed, but they are added to a list of features + * c1 and c3 (as well as their corresponding map keys) should be removed, but c3 is added to a list of features * protected from JS divergence removal. */ it should "not drop JS divergence-protected features based on JS divergence check" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) - val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0).withProbabilityOfEmpty(0.1) val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 1000.0, sigma = 5.0).withProbabilityOfEmpty(0.1) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -516,7 +536,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator3, currencyGenerator2, currencyGenerator1, 1000 + currencyGenerator3, currencyGenerator2, currencyGenerator1, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -533,7 +553,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with maxFillRatioDiff = Double.PositiveInfinity, maxJSDivergence = 0.8, maxCorrelation = 1.0, - jsDivergenceProtectedFeatures = Set("myF3") + 
jsDivergenceProtectedFeatures = Set(c3.name) ) val filteredRawData = filter.generateFilteredRaw(features, params) @@ -542,7 +562,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF1"), + expectedDroppedFeatures = Set(c1.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f3") ) @@ -551,7 +571,15 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with assertFeatureDistributions(filteredRawData, total = 12) } - + /** + * This test generates three numeric generators where each generator corresponds to a different raw feature. + * Additionally, a single map feature is made from the three raw features - each key contains the same data + * as the corresponding raw feature. + * + * A binary label is generated with a perfect relationship to feature c2 - if it is empty then the label is 0, + * otherwise it is 1. Therefore feature c2 (and its corresponding map key) should be removed by the correlation + * check between a raw feature's null indicator and the label. + */ it should "correctly drop features based on null-label correlations" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) @@ -561,27 +589,26 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - // Construct a label that we know is highly biased from the pickList data to check if SanityChecker detects it + // Construct a label that we know is directly correlated to the currency data val labelTransformer = new UnaryLambdaTransformer[Currency, RealNN](operationName = "labelFunc", transformFn = r => r.value match { case Some(v) => RealNN(1.0) case _ => RealNN(0.0) } ) - val labelData = labelTransformer.setInput(c2).getOutput().asInstanceOf[Feature[RealNN]] - .copy(isResponse = true) + val labelData = labelTransformer.setInput(c2).getOutput().asInstanceOf[Feature[RealNN]].copy(isResponse = true) val labelDataRaw = labelData.asRaw(isResponse = true) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(scoreDf) @@ -600,7 +627,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse ++ Set(labelData.name), -
expectedDroppedFeatures = Set("myF2"), + expectedDroppedFeatures = Set(c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f2") ) From 8b483bfa722f0fae21316f65278c7b20472cc7fe Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 15:25:13 -0700 Subject: [PATCH 06/13] Small cleanup --- .../com/salesforce/op/filters/RawFeatureFilterTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index eec086b7c3..ffb38277ba 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -778,8 +778,8 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Check the actual filtered dataframe schemas featureUniverse.foreach(f => { filteredRawData.cleanedData.schema.fields.exists(_.name == f) shouldBe !expectedDroppedFeatures.contains(f) - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet.intersect(expectedDroppedKeys) shouldBe Set.empty) }) + filteredRawData.cleanedData.collect(mapFeatureRaw) + .foreach(_.value.keySet.intersect(expectedDroppedKeys) shouldBe Set.empty) } } From aa1ebb6586808caf53646665ef8b6eea69070a59 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 16:50:34 -0700 Subject: [PATCH 07/13] Fix scalastyle errors --- .../op/filters/RawFeatureFilterTest.scala | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index ffb38277ba..843a1e7789 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -216,13 +216,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator95, currencyGenerator50, currencyGenerator25, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -246,7 +246,8 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Check that using the scoring reader only will result in the rarely filled in both training and scoring sets // being removed - val filterWithScoring = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filterWithScoring = new RawFeatureFilter(trainReader, Some(scoreReader), + 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 
1.0) val filteredRawDataWithScoring = filterWithScoring.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow @@ -288,13 +289,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator3, currencyGenerator2, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -302,7 +303,8 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 0.4, Double.PositiveInfinity, 1.0, 1.0) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), + 10, 0.0, 0.4, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow @@ -350,7 +352,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator3, currencyGenerator2, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -398,13 +400,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator3, currencyGenerator2, currencyGenerator1, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ 
-443,13 +445,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator95, currencyGenerator50, currencyGenerator25, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -532,13 +534,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator3, currencyGenerator2, currencyGenerator1, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -604,13 +606,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with ) val labelData = labelTransformer.setInput(c2).getOutput().asInstanceOf[Feature[RealNN]].copy(isResponse = true) val labelDataRaw = labelData.asRaw(isResponse = true) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -650,12 +652,11 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * @tparam F3 Type of feature 3 * @return Tuple containing the generated dataframe and each individual OPFeature */ - def generateRandomDfAndFeatures[ - F1 <: FeatureType : TypeTag, - F2 <: FeatureType : TypeTag, - F3 <: FeatureType : TypeTag + def generateRandomDfAndFeatures[F1 <: 
FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], numRows: Int): - (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { val f1Data = f1.limit(numRows) val f2Data = f2.limit(numRows) @@ -685,13 +686,12 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * @tparam F4 Type of feature 4 * @return Tuple containing the generated dataframe and each individual OPFeature */ - def generateRandomDfAndFeatures[ - F1 <: FeatureType : TypeTag, - F2 <: FeatureType : TypeTag, - F3 <: FeatureType : TypeTag, - F4 <: FeatureType : TypeTag + def generateRandomDfAndFeatures[F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag, + F4 <: FeatureType : TypeTag ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], f4: RandomData[F4], numRows: Int): - (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { val f1Data = f1.limit(numRows) val f2Data = f2.limit(numRows) From 8e911b93dcbac9b327fa3a10ab470cd17fba2f15 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 20:45:52 -0700 Subject: [PATCH 08/13] Debugging --- .../scala/com/salesforce/op/OpWorkflowTest.scala | 8 ++++++++ .../impl/insights/RecordInsightsLOCOTest.scala | 14 ++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala index 42c671831c..466ea9a33b 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala @@ -44,6 +44,7 @@ import com.salesforce.op.stages.impl.tuning._ import com.salesforce.op.test.{Passenger, PassengerSparkFixtureTest, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import org.apache.log4j.Level import org.apache.spark.ml.param.{BooleanParam, ParamMap} import org.apache.spark.ml.tuning.ParamGridBuilder import org.apache.spark.rdd.RDD @@ -238,14 +239,21 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { } it should "use the raw feature filter to generate data instead of the reader when the filter is specified" in { + loggingLevel(Level.INFO) + val fv = Seq(age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap).transmogrify() val survivedNum = survived.occurs() val pred = BinaryClassificationModelSelector().setInput(survivedNum, fv).getOutput() + + val tempdata = new OpWorkflow().setReader(dataReader).setResultFeatures(pred).computeDataUpTo(weight) + tempdata.show(numRows = 10, truncate = false) + val wf = new OpWorkflow() .setResultFeatures(pred) .withRawFeatureFilter(Option(dataReader), Option(simpleReader), maxFillRatioDiff = 1.0) // only height and the female key of maps should meet this criteria val data = wf.computeDataUpTo(weight) + data.show(10, truncate = false) data.schema.fields.map(_.name).toSet shouldEqual Set("key", "height", "survived", "stringMap", "numericMap", "booleanMap") diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala index aad90b1a27..31fdcb4d88 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala @@ -176,7 +176,7 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { parsed.foreach { case (_, in) => math.abs(in.head._2(0)._2 + in.head._2(1)._2) < 0.00001 shouldBe true } } - it should "return the most predictive features for dat generated with a strong relation to the label" in { + it should "return the most predictive features for data generated with a strong relation to the label" in { val numRows = 1000 val countryData: Seq[Country] = RandomText.countries.withProbabilityOfEmpty(0.3).take(numRows).toList val pickListData: Seq[PickList] = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G")) @@ -252,11 +252,12 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { val otherVar = math.abs(otherIndices.map(varImportances.apply).sum) / otherIndices.size // Strengths of features "A", "B", and "C" should be much larger the other feature strengths - assert(abcAvg > 5 * otherAvg, "Average feature strengths for features involved in label formula should be" + + assert(abcAvg > 5 * otherAvg, + "Average feature strengths for features involved in label formula should be " + "much larger than the average feature strengths of other features") // There should be a really large t-value when comparing the two avg feature strengths - assert(math.abs(abcAvg - otherAvg) / math.sqrt((abcVar + otherVar)/numRows) > 10, "The t-value comparing the" + - "average feature strengths between important and other features should be large") + assert(math.abs(abcAvg - otherAvg) / math.sqrt((abcVar + otherVar)/numRows) > 10, + "The t-value comparing the average feature strengths between important and other features should be large") // Record insights averaged across all records should be similar to the feature importances from Spark's RF val rfImportances = sparkModel.getSparkMlStage().get.featureImportances @@ -267,8 +268,9 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { // Compare the ratio of importances between "important" and "other" features in both paradigms assert(math.abs(avgRecordInsightRatio - featureImportanceRatio)*2 / - (avgRecordInsightRatio + featureImportanceRatio) < 0.8, "The ratio of feature strengths between important and" + - "other features should be similar to the ratio of feature importances from Spark's RandomForest") + (avgRecordInsightRatio + featureImportanceRatio) < 0.8, + "The ratio of feature strengths between important and other features should be similar to the ratio of " + + "feature importances from Spark's RandomForest") } } From c08ac2dae20d89f9632b4628adea1c613c4ddc34 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 21:47:14 -0700 Subject: [PATCH 09/13] Fixed OpWorkflowTest and reduced flakiness of LOCO test --- .../scala/com/salesforce/op/OpWorkflowTest.scala | 14 ++++---------- .../impl/insights/RecordInsightsLOCOTest.scala | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala index 466ea9a33b..9c2065abc5 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala @@ -44,7 +44,6 @@ import com.salesforce.op.stages.impl.tuning._ import com.salesforce.op.test.{Passenger, 
PassengerSparkFixtureTest, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} -import org.apache.log4j.Level import org.apache.spark.ml.param.{BooleanParam, ParamMap} import org.apache.spark.ml.tuning.ParamGridBuilder import org.apache.spark.rdd.RDD @@ -239,24 +238,19 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { } it should "use the raw feature filter to generate data instead of the reader when the filter is specified" in { - loggingLevel(Level.INFO) - val fv = Seq(age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap).transmogrify() val survivedNum = survived.occurs() val pred = BinaryClassificationModelSelector().setInput(survivedNum, fv).getOutput() - val tempdata = new OpWorkflow().setReader(dataReader).setResultFeatures(pred).computeDataUpTo(weight) - tempdata.show(numRows = 10, truncate = false) - val wf = new OpWorkflow() .setResultFeatures(pred) - .withRawFeatureFilter(Option(dataReader), Option(simpleReader), - maxFillRatioDiff = 1.0) // only height and the female key of maps should meet this criteria + .withRawFeatureFilter(Option(dataReader), None, maxFillRatioDiff = 1.0) val data = wf.computeDataUpTo(weight) - data.show(10, truncate = false) + // Since there are < 500 rows in the scoring set, only the training set checks are applied here, and the only + // removal reasons should be null indicator - label correlations data.schema.fields.map(_.name).toSet shouldEqual - Set("key", "height", "survived", "stringMap", "numericMap", "booleanMap") + Set("booleanMap", "description", "height", "stringMap", "age", "key", "survived", "numericMap") } it should "return a model that transforms the data correctly" in { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala index 31fdcb4d88..d73dae904e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala @@ -252,7 +252,7 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { val otherVar = math.abs(otherIndices.map(varImportances.apply).sum) / otherIndices.size // Strengths of features "A", "B", and "C" should be much larger the other feature strengths - assert(abcAvg > 5 * otherAvg, + assert(abcAvg > 4 * otherAvg, "Average feature strengths for features involved in label formula should be " + "much larger than the average feature strengths of other features") // There should be a really large t-value when comparing the two avg feature strengths From bef03dd0229197ab22a90d1d7a68ec103214125e Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Tue, 26 Mar 2019 09:46:56 -0700 Subject: [PATCH 10/13] Made minScoringRows settable, with a default --- .../main/scala/com/salesforce/op/OpWorkflow.scala | 8 ++++++-- .../salesforce/op/filters/RawFeatureFilter.scala | 13 +++++++++---- .../scala/com/salesforce/op/OpWorkflowTest.scala | 6 ++---- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala index 4f1c458bad..a92e8c3f2c 100644 --- a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala +++ b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala @@ -512,6 +512,8 @@ class OpWorkflow(val 
uid: String = UID[OpWorkflow]) extends OpWorkflowCore { * Output is the bins for the text features. * @param timePeriod Time period used to apply circulate date transformation for date features, if not * specified will use numeric feature transformation + * @param minScoringRows Minimum row threshold for scoring set comparisons to be used in checks. If the scoring + * set size is below this threshold, then only training data checks will be used * @tparam T Type of the data read in */ @Experimental @@ -529,7 +531,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { protectedFeatures: Array[OPFeature] = Array.empty, protectedJSFeatures: Array[OPFeature] = Array.empty, textBinsFormula: (Summary, Int) => Int = RawFeatureFilter.textBinsFormula, - timePeriod: Option[TimePeriod] = None + timePeriod: Option[TimePeriod] = None, + minScoringRows: Int = RawFeatureFilter.minScoringRowsDefault ): this.type = { val training = trainingReader.orElse(reader).map(_.asInstanceOf[Reader[T]]) require(training.nonEmpty, "Reader for training data must be provided either in withRawFeatureFilter or directly" + @@ -550,7 +553,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { protectedFeatures = protectedRawFeatures, jsDivergenceProtectedFeatures = protectedRawJSFeatures, textBinsFormula = textBinsFormula, - timePeriod = timePeriod) + timePeriod = timePeriod, + minScoringRows = minScoringRows) } this } diff --git a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala index 8d4cb21ef6..3acd770c00 100644 --- a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala +++ b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala @@ -82,6 +82,9 @@ import scala.util.Failure * Output is the bins for the text features. * @param timePeriod Time period used to apply circulate date transformation for date features, if * not specified will use regular numeric feature transformation + * @param minScoringRows Minimum row threshold for scoring set comparisons to be used in checks. 
If + * the scoring set size is below this threshold, then only training data checks + * will be used * @tparam T datatype of the reader */ class RawFeatureFilter[T] @@ -98,7 +101,8 @@ class RawFeatureFilter[T] val jsDivergenceProtectedFeatures: Set[String] = Set.empty, val protectedFeatures: Set[String] = Set.empty, val textBinsFormula: (Summary, Int) => Int = RawFeatureFilter.textBinsFormula, - val timePeriod: Option[TimePeriod] = None + val timePeriod: Option[TimePeriod] = None, + val minScoringRows: Int = RawFeatureFilter.minScoringRowsDefault ) extends Serializable { require(bins > 1 && bins <= FeatureDistribution.MaxBins, s"Invalid bin size $bins," + @@ -110,6 +114,7 @@ class RawFeatureFilter[T] s" maxFillRatioDiff must be greater than 0.0") require(maxJSDivergence >= 0.0 && maxJSDivergence <= 1.0, s"Invalid maxJSDivergence size $maxJSDivergence," + s" maxJSDivergence must be between 0 and 1") + require(minScoringRows >= 0, s"minRowsForScoringSet must be >= 0, but was set to $minScoringRows") ClosureUtils.checkSerializable(textBinsFormula) match { case Failure(e) => throw new IllegalArgumentException("The argument textBinsFormula must be serializable", e) @@ -301,10 +306,10 @@ class RawFeatureFilter[T] val sd = s.generateDataFrame(rawFeatures, parameters.switchReaderParams()).persist() log.info("Loaded scoring data") val scoringDataCount = sd.count() - if (scoringDataCount >= RawFeatureFilter.minRowsForScoringSet) Some(sd) + if (scoringDataCount >= minScoringRows) Some(sd) else { log.warn(s"Scoring dataset has $scoringDataCount rows, which is less than the minimum required of " + - s"${RawFeatureFilter.minRowsForScoringSet}. Only training data checks will be used.") + s"$minScoringRows. Only training data checks will be used.") None } } @@ -375,7 +380,7 @@ object RawFeatureFilter { // If there are not enough rows in the scoring set, we should not perform comparisons between the training and // scoring sets since they will not be reliable. Currently, this is set to the same as the minimum training size. 
- val minRowsForScoringSet = 500 + val minScoringRowsDefault = 500 } diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala index 9c2065abc5..994776e8a8 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala @@ -244,13 +244,11 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { val wf = new OpWorkflow() .setResultFeatures(pred) - .withRawFeatureFilter(Option(dataReader), None, maxFillRatioDiff = 1.0) + .withRawFeatureFilter(Option(dataReader), Option(simpleReader), maxFillRatioDiff = 1.0, minScoringRows = 0) val data = wf.computeDataUpTo(weight) - // Since there are < 500 rows in the scoring set, only the training set checks are applied here, and the only - // removal reasons should be null indicator - label correlations data.schema.fields.map(_.name).toSet shouldEqual - Set("booleanMap", "description", "height", "stringMap", "age", "key", "survived", "numericMap") + Set("key", "height", "survived", "stringMap", "numericMap", "booleanMap") } it should "return a model that transforms the data correctly" in { From 73d9d61c0cf9d340eda09a32603b6297d20862f1 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Tue, 26 Mar 2019 10:38:14 -0700 Subject: [PATCH 11/13] Put all the old RFF tests back in and overrode the minScoringRows when necessary --- .../op/filters/RawFeatureFilterTest.scala | 174 +++++++++++++++++- 1 file changed, 165 insertions(+), 9 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 843a1e7789..4f111b5775 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -33,7 +33,7 @@ package com.salesforce.op.filters import com.salesforce.op.{OpParams, OpWorkflow} import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike, OPFeature} import com.salesforce.op.features.types._ -import com.salesforce.op.readers.{CustomReader, ReaderKey} +import com.salesforce.op.readers.{CustomReader, DataFrameFieldNames, ReaderKey} import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer import com.salesforce.op.test._ import com.salesforce.op.testkit._ @@ -43,6 +43,7 @@ import com.salesforce.op.utils.spark.RichDataset._ import org.apache.log4j.Level import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Row, SparkSession} +import com.twitter.algebird.Operators._ import org.junit.runner.RunWith import org.scalatest.{Assertion, FlatSpec} import org.scalatest.junit.JUnitRunner @@ -153,13 +154,128 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } + it should "correctly clean the dataframe returned and give the features to blacklist" in { + val params = new OpParams() + val survPred = survived.copy(isResponse = false) + val features: Array[OPFeature] = + Array(survPred, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.0, 1.0, Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) + val filteredRawData = filter.generateFilteredRaw(features, params) + filteredRawData.featuresToDrop shouldBe empty + filteredRawData.mapKeysToDrop shouldBe empty + 
filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields + + assertFeatureDistributions(filteredRawData, total = 26) + + val filter1 = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) + val filteredRawData1 = filter1.generateFilteredRaw(features, params) + filteredRawData1.featuresToDrop should contain theSameElementsAs Array(survPred) + filteredRawData1.mapKeysToDrop should contain theSameElementsAs Map( + "numericMap" -> Set("Male"), "booleanMap" -> Set("Male"), "stringMap" -> Set("Male")) + filteredRawData1.cleanedData.schema.fields.exists(_.name == survPred.name) shouldBe false + filteredRawData1.cleanedData.collect(stringMap).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("Female")) + assertFeatureDistributions(filteredRawData, total = 26) + } + + it should "not drop response features" in { + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) + val filteredRawData = filter.generateFilteredRaw(features, params) + filteredRawData.featuresToDrop shouldBe empty + filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields + filteredRawData.cleanedData.collect(stringMap) + .foreach(m => if (m.nonEmpty) m.value.keySet shouldEqual Set("Female")) + assertFeatureDistributions(filteredRawData, total = 26) + } + + it should "not drop protected features" in { + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.1, 0.1, 2, 0.2, 0.9, minScoringRows = 0) + val filteredRawData = filter.generateFilteredRaw(features, params) + filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) + filteredRawData.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs + Array(DataFrameFieldNames.KeyFieldName, survived.name) + assertFeatureDistributions(filteredRawData, total = 14) + + val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.1, 0.1, 2, 0.2, 0.9, minScoringRows = 0, + protectedFeatures = Set(age.name, gender.name)) + val filteredRawData2 = filter2.generateFilteredRaw(features, params) + filteredRawData2.featuresToDrop.toSet shouldEqual Set(height, weight, description, boarded) + filteredRawData2.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs + Array(DataFrameFieldNames.KeyFieldName, survived.name, age.name, gender.name) + assertFeatureDistributions(filteredRawData, total = 14) + } + + it should "not drop JS divergence-protected features based on JS divergence check" in { + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, boardedTime, boardedTimeAsDateTime) + val filter = new RawFeatureFilter( + trainingReader = dataReader, + scoringReader = Some(simpleReader), + bins = 10, + minFill = 0.0, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 0.0, + maxCorrelation = 1.0, + jsDivergenceProtectedFeatures = Set(boardedTime.name, boardedTimeAsDateTime.name), + minScoringRows = 0 + ) + + val 
filteredRawData = filter.generateFilteredRaw(features, params) + filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) + filteredRawData.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs + Seq(DataFrameFieldNames.KeyFieldName, survived.name, boardedTime.name, boardedTimeAsDateTime.name) + assertFeatureDistributions(filteredRawData, total = 18) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.9" in { + val expectedDropped = Seq(boarded, weight, gender) + val expectedMapKeys = Seq("Female", "Male") + val expectedDroppedMapKeys = Map[String, Set[String]]() + nullLabelCorrelationTest(0.9, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.6" in { + val expectedDropped = Seq(boarded, weight, gender, age) + val expectedMapKeys = Seq("Female", "Male") + val expectedDroppedMapKeys = Map[String, Set[String]]() + nullLabelCorrelationTest(0.6, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.4" in { + val expectedDropped = Seq(boarded, weight, gender, age, description) + val expectedMapKeys = Seq("Male") + val expectedDroppedMapKeys = Map("booleanMap" -> Set("Female"), "stringMap" -> Set("Female"), + "numericMap" -> Set("Female")) + nullLabelCorrelationTest(0.4, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.3" in { + val expectedDropped = Seq(boarded, weight, gender, age, description, booleanMap, numericMap, stringMap) + // all the maps dropped + val expectedDroppedMapKeys = Map[String, Set[String]]() + nullLabelCorrelationTest(0.3, expectedDropped, Seq(), expectedDroppedMapKeys) + } + /** * This test uses several data generators to generate data according to different distributions, makes a reader * corresponding to the generated dataframe, and then uses that as both the training and scoring reader in * RawFeatureFilter. Not only should no features be removed, but the training and scoring distributions should be * identical. */ - it should "not remove any features when the training and scoring sets are identical" in { + it should "not remove any features when the training and scoring sets are identical generated data" in { // Define random generators that will be the same for training and scoring dataframes val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) @@ -202,7 +318,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * are used. * */ - it should "correctly clean the dataframe containing map and non-map features due to min fill rate" in { + it should "correctly clean randomly generated map and non-map features due to min fill rate" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) @@ -273,7 +389,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * fill rate difference of 0.6. 
The RawFeatureFilter is set up with a maximum absolute fill rate of 0.4 so both * c2 and c3 (as well as their corresponding map keys f2 & f3) should be removed. */ - it should "correctly clean the dataframe containing map and non-map features due to max absolute fill rate " + + it should "correctly clean the randomly generated map and non-map features due to max absolute fill rate " + "difference" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) @@ -330,7 +446,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * fill rate difference of 0.25, and a relative fill ratio difference of 6. The RawFeatureFilter is set up with a * maximum fill ratio difference of 4 so both c2 and c3 (as well as their corresponding map keys) should be removed. */ - it should "correctly clean the dataframe containing map and non-map features due to max fill ratio " + + it should "correctly clean the randomly generated map and non-map features due to max fill ratio " + "difference" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) @@ -386,7 +502,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both * f1 and f3 (as well as their corresponding map keys) should be removed. */ - it should "correctly clean the dataframe containing map and non-map features due to JS divergence" in { + it should "correctly clean the randomly generated map and non-map features due to JS divergence" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0) @@ -431,7 +547,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with assertFeatureDistributions(filteredRawData, total = 12) } - it should "not drop protected raw features or response features" in { + it should "not drop protected raw features or response features from generated data" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) @@ -520,7 +636,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * c1 and c3 (as well as their corresponding map keys) should be removed, but c3 is added to a list of features * protected from JS divergence removal. 
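As background for the JS divergence checks documented in these test comments: the statistic compared against the maxJSDivergence threshold is the Jensen-Shannon divergence between the binned training and scoring distributions. A minimal illustrative Scala sketch, assuming base-2 logarithms and equal-length bin arrays; this is a standalone approximation, not the library's implementation:

object JSDivergenceSketch {
  // KL contribution of one bin; 0 * log(0/q) is taken as 0 by convention.
  private def klTerm(p: Double, q: Double): Double =
    if (p == 0.0) 0.0 else p * math.log(p / q) / math.log(2.0)

  def jsDivergence(p: Array[Double], q: Array[Double]): Double = {
    val m = p.zip(q).map { case (a, b) => (a + b) / 2.0 } // mixture distribution
    val klPM = p.zip(m).map { case (a, b) => klTerm(a, b) }.sum
    val klQM = q.zip(m).map { case (a, b) => klTerm(a, b) }.sum
    (klPM + klQM) / 2.0
  }

  def main(args: Array[String]): Unit = {
    println(jsDivergence(Array(0.5, 0.5), Array(0.5, 0.5))) // 0.0 for identical histograms
    println(jsDivergence(Array(1.0, 0.0), Array(0.0, 1.0))) // 1.0 for disjoint ones, tripping a 0.8 cutoff
  }
}

This is why generators with widely separated log-normal means drive the divergence to its maximum of 1.0 in the tests above.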
*/ - it should "not drop JS divergence-protected features based on JS divergence check" in { + it should "not drop JS divergence-protected features based on JS divergence check with generated data" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0).withProbabilityOfEmpty(0.1) @@ -582,7 +698,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * otherwise it is 1. Therefore feature c2 (and its corresponding map key) should be removed by the correlation * check between a raw feature's null indicator and the label. */ - it should "correctly drop features based on null-label correlations" in { + it should "correctly drop features based on null-label correlations with generated data" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) @@ -728,6 +844,46 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with } } + private def nullLabelCorrelationTest( + maxCorrelation: Double, + expectedDropped: Seq[OPFeature], + expectedMapKeys: Seq[String], + expectedDroppedMapKeys: Map[String, Set[String]] + ): Unit = { + def getFilter(maxCorrelation: Double): RawFeatureFilter[Passenger] = new RawFeatureFilter( + trainingReader = dataReader, + scoringReader = Some(simpleReader), + bins = 10, + minFill = 0.0, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 1.0, + maxCorrelation = maxCorrelation, + minScoringRows = 0) + + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) + val filteredRawData@FilteredRawData(df, dropped, droppedKeyValue, _) = + getFilter(maxCorrelation).generateFilteredRaw(features, params) + + assertFeatureDistributions(filteredRawData, total = 26) + dropped should contain theSameElementsAs expectedDropped + droppedKeyValue should contain theSameElementsAs expectedDroppedMapKeys + + df.schema.fields.map(_.name) should contain theSameElementsAs + DataFrameFieldNames.KeyFieldName +: features.diff(dropped).map(_.name) + if (expectedMapKeys.nonEmpty) { + df.collect(booleanMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + df.collect(numericMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + df.collect(stringMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + } else { + intercept[IllegalArgumentException] { df.collect(booleanMap) } + intercept[IllegalArgumentException] { df.collect(numericMap) } + intercept[IllegalArgumentException] { df.collect(stringMap) } + } + } + /** * Defines readers in terms of datasets (in these tests, already created by feature generators) * From 94f856a1ced10d8ef4f09160eecdc2818f9d5d5f Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Thu, 28 Mar 2019 14:21:24 -0700 Subject: [PATCH 12/13] Reduced number of rows generated down to 500 (from 1000) to speed things up --- .../com/salesforce/op/filters/RawFeatureFilterTest.scala | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 51b230a5c1..40f1af2995 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -52,14 +52,14 @@ import scala.reflect.runtime.universe.TypeTag @RunWith(classOf[JUnitRunner]) class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { - + // loggingLevel(Level.INFO) // Our randomly generated data will generate feature names and corresponding map keys in this universe val featureUniverse = Set("myF1", "myF2", "myF3") val mapKeyUniverse = Set("f1", "f2", "f3") // Number of rows to use in randomly generated data sets - val numRows = 1000 + val numRows = 500 Spec[RawFeatureFilter[_]] should "correctly compute feature stats" in { val features: Array[OPFeature] = From bee777be46cc7bc214336673ffce88e862848f94 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Thu, 28 Mar 2019 14:58:04 -0700 Subject: [PATCH 13/13] Fixed scalastyle issues and made a test less likely to be flaky --- .../salesforce/op/filters/RawFeatureFilterTest.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 40f1af2995..57d26d5332 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -242,7 +242,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, - Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) + Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) val filteredRawData = filter.generateFilteredRaw(features, params) filteredRawData.featuresToDrop shouldBe empty filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields @@ -255,7 +255,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader),10, 0.1, 0.1, + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9, minScoringRows = 0) val filteredRawData = filter.generateFilteredRaw(features, params) filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) @@ -263,7 +263,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with Array(DataFrameFieldNames.KeyFieldName, survived.name) assertFeatureDistributions(filteredRawData, total = 14) - val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader),10, 0.1, 0.1, + val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9, minScoringRows = 0, protectedFeatures = Set(age.name, gender.name)) val filteredRawData2 = filter2.generateFilteredRaw(features, params) @@ -501,7 +501,7 @@ class RawFeatureFilterTest extends FlatSpec with 
PassengerSparkFixtureTest with * * Features c2 & c3 are switched between the training and scoring sets, so that they should have an absolute * fill rate difference of 0.25, and a relative fill ratio difference of 6. The RawFeatureFilter is set up with a - * maximum fill ratio difference of 4 so both c2 and c3 (as well as their corresponding map keys) should be removed. + * maximum fill ratio difference of 3 so both c2 and c3 (as well as their corresponding map keys) should be removed. */ it should "correctly clean the randomly generated map and non-map features due to max fill ratio " + "difference" in { @@ -533,7 +533,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 1.0, 4.0, 1.0, 1.0) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 1.0, 3.0, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow
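Net effect of the series for callers: one new, optional knob on both RawFeatureFilter and OpWorkflow.withRawFeatureFilter. A hedged usage sketch, assuming dataReader, simpleReader, pred, and weight as defined in OpWorkflowTest above (the call shape mirrors the updated test, not any additional API surface):

val wf = new OpWorkflow()
  .setResultFeatures(pred)
  .withRawFeatureFilter(Option(dataReader), Option(simpleReader),
    maxFillRatioDiff = 1.0,
    minScoringRows = 0) // lift the 500-row default so the small fixture's scoring checks still run
val data = wf.computeDataUpTo(weight)

Leaving minScoringRows at its default (RawFeatureFilter.minScoringRowsDefault, i.e. 500) keeps the production behavior: undersized scoring sets are ignored and only training-data checks apply.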