From 9ba57d6536728fb9e1fb85537575e6051aa824c8 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Fri, 15 Mar 2019 12:24:53 -0700 Subject: [PATCH 01/13] Adds min scoring set size to RFF --- .../com/salesforce/op/filters/RawFeatureFilter.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala index 7cccfd01ae..8d4cb21ef6 100644 --- a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala +++ b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala @@ -300,9 +300,11 @@ class RawFeatureFilter[T] val scoreData = scoringReader.flatMap { s => val sd = s.generateDataFrame(rawFeatures, parameters.switchReaderParams()).persist() log.info("Loaded scoring data") - if (sd.count() > 0) Some(sd) + val scoringDataCount = sd.count() + if (scoringDataCount >= RawFeatureFilter.minRowsForScoringSet) Some(sd) else { - log.warn("Scoring dataset was empty. Only training data checks will be used.") + log.warn(s"Scoring dataset has $scoringDataCount rows, which is less than the minimum required of " + + s"${RawFeatureFilter.minRowsForScoringSet}. Only training data checks will be used.") None } } @@ -371,6 +373,10 @@ object RawFeatureFilter { bins } + // If there are not enough rows in the scoring set, we should not perform comparisons between the training and + // scoring sets since they will not be reliable. Currently, this is set to the same as the minimum training size. + val minRowsForScoringSet = 500 + } /** From 3e6bc34eee56080f5102eeab919543b5e16dca49 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Thu, 21 Mar 2019 15:05:25 -0700 Subject: [PATCH 02/13] Updated tests with random feature generation, attempting to get the same thing working for maps --- .../op/filters/RawFeatureFilterTest.scala | 221 +++++++++++++++++- 1 file changed, 218 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index be2aff57cc..6ba1f76c1f 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -31,15 +31,23 @@ package com.salesforce.op.filters import com.salesforce.op.OpParams -import com.salesforce.op.features.{FeatureDistributionType, OPFeature} -import com.salesforce.op.readers.DataFrameFieldNames -import com.salesforce.op.test.{Passenger, PassengerSparkFixtureTest} +import com.salesforce.op.features.{Feature, FeatureDistributionType, OPFeature} +import com.salesforce.op.features.types._ +import com.salesforce.op.readers.{CustomReader, DataFrameFieldNames, ReaderKey} +import com.salesforce.op.test._ +import com.salesforce.op.testkit._ +import com.salesforce.op.testkit.RandomData +import com.salesforce.op.stages.impl.feature.OPMapVectorizerTestHelper.makeTernaryOPMapTransformer import com.salesforce.op.utils.spark.RichDataset._ import com.twitter.algebird.Operators._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.junit.runner.RunWith import org.scalatest.{Assertion, FlatSpec} import org.scalatest.junit.JUnitRunner +import scala.reflect.runtime.universe.TypeTag + @RunWith(classOf[JUnitRunner]) class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { @@ -136,6 +144,213 @@ class RawFeatureFilterTest 
extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } + /** + * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the + * columns of the dataframe are fixed to be myF1, myF2, myF3, and myF4 so that the same OPFeatures can be used to + * refer to columns in either dataframe. + * + * @param f1 Random data generator for feature 1 (type F1) + * @param f2 Random data generator for feature 2 (type F2) + * @param f3 Random data generator for feature 3 (type F3) + * @param f4 Random data generator for feature 4 (type F4) + * @param numRows Number of rows to generate + * @tparam F1 Type of feature 1 + * @tparam F2 Type of feature 2 + * @tparam F3 Type of feature 3 + * @tparam F4 Type of feature 4 + * @return Tuple containing the generated dataframe and each individual OPFeature + */ + def generateRandomDfAndFeatures[ + F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag, + F4 <: FeatureType : TypeTag + ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], f4: RandomData[F4], numRows: Int): + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { + + val f1Data = f1.limit(numRows) + val f2Data = f2.limit(numRows) + val f3Data = f3.limit(numRows) + val f4Data = f4.limit(numRows) + + // Combine the data into a single tuple for each row + val generatedTrainData: Seq[(F1, F2, F3, F4)] = f1Data.zip(f2Data).zip(f3Data).zip(f4Data).map { + case (((a, b), c), d) => (a, b, c, d) + } + + TestFeatureBuilder[F1, F2, F3, F4]("myF1", "myF2", "myF3", "myF4", generatedTrainData) + } + + it should "use a simple function to generate random data" in { + val (testDf, f1, f2, f3, f4) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( + RandomText.cities.withProbabilityOfEmpty(0.2), + RandomText.countries.withProbabilityOfEmpty(0.2), + RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")).withProbabilityOfEmpty(0.2), + RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2), + 1000 + ) + + testDf.show(10) + } + + it should "clean a dataframe filled with randomly generated data" in { + // Define random generators that will be the same for training and scoring dataframes + val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) + val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) + val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) + .withProbabilityOfEmpty(0.2) + val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = + generateRandomDfAndFeatures[City, Country, PickList, Currency]( + cityGenerator, + countryGenerator, + pickListGenerator, + currencyGenerator.withProbabilityOfEmpty(0.2), + 1000 + ) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _, _) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( + cityGenerator, + countryGenerator, + pickListGenerator, + currencyGenerator.withProbabilityOfEmpty(1.0), + 1000 + ) + + // Define the readers + val trainReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: 
SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) } val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(scoreDf) } val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(trainCity, trainCountry, trainPickList, trainCurrency) val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) + + // Check that the only feature that was dropped was the currency feature + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + filteredRawData.featuresToDrop.length shouldBe 1 + filteredRawData.featuresToDrop.head.name.startsWith("myF4") shouldBe true + } + + it should "not remove any features when the training and scoring sets are identical" in { + // Define random generators that will be the same for training and scoring dataframes + val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) + val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) + val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) + .withProbabilityOfEmpty(0.2) + val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = + generateRandomDfAndFeatures[City, Country, PickList, Currency]( + cityGenerator, + countryGenerator, + pickListGenerator, + currencyGenerator.withProbabilityOfEmpty(0.2), + 1000 + ) + + // Define the readers + val trainReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) + } + val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) + } + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(trainCity, trainCountry, trainPickList, trainCurrency) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) + + filteredRawData.featuresToDrop shouldBe empty + filteredRawData.mapKeysToDrop shouldBe empty + filteredRawData.cleanedData.schema.fields should contain theSameElementsAs + trainReader.generateDataFrame(features).schema.fields + + assertFeatureDistributions(filteredRawData, total = features.length * 2) + + // Also check that all the feature distributions are the same between the training and scoring sets + filteredRawData.trainingFeatureDistributions.zip(filteredRawData.scoringFeatureDistributions).foreach{ + case (train, score) => + train.name shouldBe score.name + train.key shouldBe score.key + train.count shouldBe score.count + train.nulls shouldBe score.nulls + train.distribution shouldBe score.distribution + train.summaryInfo shouldBe score.summaryInfo + } + } + + it should "correctly 
clean the dataframe due to min fill rates" in { + // Define random generators that will be the same for training and scoring dataframes + val stateGenerator = RandomText.states.withProbabilityOfEmpty(0.2) + val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) + .withProbabilityOfEmpty(0.2) + val realMapGenerator = RandomMap.ofReals(RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0), 0, 4) + .withKeys("customReal" + _) + val binaryMapGenerator = RandomMap.ofBinaries(probabilityOfSuccess = 0.25, 0, 4) + .withKeys("customBinary" + _) + + val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) + val c1 = currencyGenerator.limit(1000) + val c2 = currencyGenerator.limit(1000) + val c3 = currencyGenerator.limit(1000) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3).asRaw(isResponse = false) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, trainState, trainPickList, trainRealMap, trainBinaryMap) = + generateRandomDfAndFeatures[State, PickList, RealMap, BinaryMap]( + stateGenerator, + pickListGenerator, + realMapGenerator, + binaryMapGenerator, + 1000 + ) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _, _) = generateRandomDfAndFeatures[State, PickList, RealMap, BinaryMap]( + stateGenerator, + pickListGenerator, + realMapGenerator, + binaryMapGenerator, + 1000 + ) + + // Define the readers + val trainReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) + } + val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(scoreDf) + } + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(trainState, trainPickList, trainRealMap, trainBinaryMap) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // Check that the only feature that was dropped was the binary map feature (myF4) + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + filteredRawData.featuresToDrop.length shouldBe 1 + filteredRawData.featuresToDrop.head.name.startsWith("myF4") shouldBe true + } + it should "correctly clean the dataframe returned and give the features to blacklist" in { val params = new OpParams() val survPred = survived.copy(isResponse = false) From c05f61ee3423a7934c1452cb7ff49fba2aa6b147 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 01:28:54 -0700 Subject: [PATCH 03/13] Several new tests with randomly generated features --- .../op/filters/RawFeatureFilterTest.scala | 515 ++++++++++++++---- 1 file changed, 404 insertions(+), 111 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 6ba1f76c1f..31b02152e3 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ 
b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -30,8 +30,8 @@ package com.salesforce.op.filters -import com.salesforce.op.OpParams -import com.salesforce.op.features.{Feature, FeatureDistributionType, OPFeature} +import com.salesforce.op.{OpParams, OpWorkflow} +import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike, OPFeature} import com.salesforce.op.features.types._ import com.salesforce.op.readers.{CustomReader, DataFrameFieldNames, ReaderKey} import com.salesforce.op.test._ @@ -40,6 +40,7 @@ import com.salesforce.op.testkit.RandomData import com.salesforce.op.stages.impl.feature.OPMapVectorizerTestHelper.makeTernaryOPMapTransformer import com.salesforce.op.utils.spark.RichDataset._ import com.twitter.algebird.Operators._ +import org.apache.log4j.Level import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.junit.runner.RunWith @@ -51,6 +52,8 @@ import scala.reflect.runtime.universe.TypeTag @RunWith(classOf[JUnitRunner]) class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { + // loggingLevel(Level.INFO) + Spec[RawFeatureFilter[_]] should "compute feature stats correctly" in { val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) @@ -144,6 +147,39 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } + /** + * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the + * columns of the dataframe are fixed to be myF1, myF2, and myF3 so that the same OPFeatures can be used to + * refer to columns in either dataframe. + * + * @param f1 Random data generator for feature 1 (type F1) + * @param f2 Random data generator for feature 2 (type F2) + * @param f3 Random data generator for feature 3 (type F3) + * @param numRows Number of rows to generate + * @tparam F1 Type of feature 1 + * @tparam F2 Type of feature 2 + * @tparam F3 Type of feature 3 + * @return Tuple containing the generated dataframe and each individual OPFeature + */ + def generateRandomDfAndFeatures[ + F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag + ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], numRows: Int): + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { + + val f1Data = f1.limit(numRows) + val f2Data = f2.limit(numRows) + val f3Data = f3.limit(numRows) + + // Combine the data into a single tuple for each row + val generatedTrainData: Seq[(F1, F2, F3)] = f1Data.zip(f2Data).zip(f3Data).map { + case ((a, b), c) => (a, b, c) + } + + TestFeatureBuilder[F1, F2, F3]("myF1", "myF2", "myF3", generatedTrainData) + } + /** * Generates a random dataframe and OPFeatures from supplied data generators and their types. 
The names of the * columns of the dataframe are fixed to be myF1, myF2, myF3, and myF4 so that the same OPFeatures can be used to @@ -181,93 +217,23 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with TestFeatureBuilder[F1, F2, F3, F4]("myF1", "myF2", "myF3", "myF4", generatedTrainData) } - it should "use a simple function to generate random data" in { - val (testDf, f1, f2, f3, f4) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( - RandomText.cities.withProbabilityOfEmpty(0.2), - RandomText.countries.withProbabilityOfEmpty(0.2), - RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")).withProbabilityOfEmpty(0.2), - RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2), - 1000 - ) - - testDf.show(10) - } - - it should "clean a dataframe filled with randomly generated data" in { - // Define random generators that will be the same for training and scoring dataframes - val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) - val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) - val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) - .withProbabilityOfEmpty(0.2) - val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) - - // Define the training dataframe and the features (these should be the same between the training and scoring - // dataframes since they point to columns with the same names) - val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = - generateRandomDfAndFeatures[City, Country, PickList, Currency]( - cityGenerator, - countryGenerator, - pickListGenerator, - currencyGenerator.withProbabilityOfEmpty(0.2), - 1000 - ) - - // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) - val (scoreDf, _, _, _, _) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( - cityGenerator, - countryGenerator, - pickListGenerator, - currencyGenerator.withProbabilityOfEmpty(1.0), - 1000 - ) - - // Define the readers - val trainReader = new CustomReader[Row](ReaderKey.randomKey) { - def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) - } - val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { - def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(scoreDf) - } - - val params = new OpParams() - // We should be able to set the features to either be the train features or the score ones here - val features: Array[OPFeature] = Array(trainCity, trainCountry, trainPickList, trainCurrency) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) - val filteredRawData = filter.generateFilteredRaw(features, params) - - - // Check that the only feature that was dropped was the currency feature - // TODO: Add a check for the reason dropped once that information is passed on to the workflow - filteredRawData.featuresToDrop.length shouldBe 1 - filteredRawData.featuresToDrop.head.name.startsWith("myF4") shouldBe true - } - it should "not remove any features when the training and scoring sets are identical" in { // Define random generators that will be the same for training and scoring dataframes val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) val pickListGenerator = 
RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) .withProbabilityOfEmpty(0.2) - val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) + val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( - cityGenerator, - countryGenerator, - pickListGenerator, - currencyGenerator.withProbabilityOfEmpty(0.2), - 1000 + cityGenerator, countryGenerator, pickListGenerator, currencyGenerator, 1000 ) // Define the readers - val trainReader = new CustomReader[Row](ReaderKey.randomKey) { - def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) - } - val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { - def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) - } + val (trainReader, scoreReader) = makeReaders(trainDf, trainDf) val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here @@ -294,62 +260,358 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with } } - it should "correctly clean the dataframe due to min fill rates" in { + it should "correctly clean the dataframe containing map and non-map features due to min fill rate" in { // Define random generators that will be the same for training and scoring dataframes - val stateGenerator = RandomText.states.withProbabilityOfEmpty(0.2) - val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) - .withProbabilityOfEmpty(0.2) - val realMapGenerator = RandomMap.ofReals(RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0), 0, 4) - .withKeys("customReal" + _) - val binaryMapGenerator = RandomMap.ofBinaries(probabilityOfSuccess = 0.25, 0, 4) - .withKeys("customBinary" + _) + val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) + val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) + val currencyGenerator50 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.5) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator25, currencyGenerator95, currencyGenerator50, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator95, currencyGenerator50, currencyGenerator25, 1000 + )
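 + // Note that the generator order is permuted between the training set (25/95/50 percent empty) and the scoring + // set (95/50/25 percent empty), so each column keeps the same value distribution but gets a different fill rate + // in the two sets.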
 + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) + + // Check that using the training reader only will result in the rarely filled features being removed + val filter = new RawFeatureFilter(trainReader, None, 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop one feature, as well as its corresponding map key + filteredRawData.featuresToDrop.length shouldBe 1 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 1 + + // The feature that is 95% empty should be thrown out + filteredRawData.featuresToDrop.head.name.startsWith("myF2") shouldBe true + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2") + + // Check the actual filtered dataframe schemas + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet should not contain "f2") + // There should be 6 FeatureDistributions - training for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 6) + + + // Check that also using the scoring reader will result in the features rarely filled in either the training or + // scoring sets being removed + val filterWithScoring = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filteredRawDataWithScoring = filterWithScoring.generateFilteredRaw(features, params) + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop two features, as well as their corresponding map keys + filteredRawDataWithScoring.featuresToDrop.length shouldBe 2 + filteredRawDataWithScoring.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawDataWithScoring.mapKeysToDrop.values.head.size shouldBe 2 + + // The features that are rarely filled in either set should be thrown out + filteredRawDataWithScoring.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF1", "myF2") + filteredRawDataWithScoring.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f2") + // filteredDataWithScoring.mapKeysToDrop + // .foldLeft(Set.empty[String])((acc, x) => acc ++ x._2) should contain theSameElementsAs Seq("f1", "f2") + + filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe false + filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true + filteredRawDataWithScoring.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f3")) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawDataWithScoring, total = 12) + }
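 + // The positional RawFeatureFilter arguments used in these tests appear to line up with the named arguments used + // in the protected-features test below, i.e. (trainReader, Some(scoreReader), 10, 0.1, 1.0, + // Double.PositiveInfinity, 1.0, 1.0) would read bins = 10, minFill = 0.1, maxFillDifference = 1.0, + // maxFillRatioDiff = Double.PositiveInfinity, maxJSDivergence = 1.0 and maxCorrelation = 1.0.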
 + + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features f2 & f3 are switched between the training and scoring sets, so that they should have an absolute + * fill rate difference of 0.6. The RawFeatureFilter is set up with a maximum absolute fill rate difference of 0.4 + * so both f2 and f3 (as well as their corresponding map keys) should be removed.
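 + * (As a check: f2 is filled in 1 - 0.2 = 0.8 of training rows but only 1 - 0.8 = 0.2 of scoring rows, giving an + * absolute fill rate difference of |0.8 - 0.2| = 0.6, which exceeds the 0.4 threshold; f3 is the mirror image.)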
 + */ + it should "correctly clean the dataframe containing map and non-map features due to max absolute fill rate " + "difference" in { + // Define random generators that will be the same for training and scoring dataframes + val realGenerator1 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) + val realGenerator2 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) + val realGenerator3 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.8) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) - val (trainDf, trainState, trainPickList, trainRealMap, trainBinaryMap) = - generateRandomDfAndFeatures[State, PickList, RealMap, BinaryMap]( - stateGenerator, - pickListGenerator, - realMapGenerator, - binaryMapGenerator, - 1000 + val (trainDf, r1, r2, r3) = + generateRandomDfAndFeatures[Real, Real, Real]( + realGenerator1, realGenerator2, realGenerator3, 1000 ) + val mapFeature = makeTernaryOPMapTransformer[Real, RealMap, Double](r1, r2, r3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) - val (scoreDf, _, _, _, _) = generateRandomDfAndFeatures[State, PickList, RealMap, BinaryMap]( - stateGenerator, - pickListGenerator, - realMapGenerator, - binaryMapGenerator, - 1000 + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Real, Real, Real]( + realGenerator1, realGenerator3, realGenerator2, 1000 ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here - val features: Array[OPFeature] = Array(trainState, trainPickList, trainRealMap, trainBinaryMap) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val features: Array[OPFeature] = Array(r1, r2, r3, mapFeatureRaw) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 0.4, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) + /* + val exclusions = filter.getFeaturesToExclude( + trainingDistribs = filteredRawData.trainingFeatureDistributions, + scoringDistribs = filteredRawData.scoringFeatureDistributions, + correlationInfo = Map.empty + ) + println(exclusions) + */ + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop two features, as well as their corresponding map keys + filteredRawData.featuresToDrop.length shouldBe 2 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + + // Since we switched the distributions in features 2 & 3 between the training and scoring sets, both of them + // should be removed (the two raw features and the two corresponding map keys) + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2", "myF3") + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2", "f3") + + // Check the actual filtered dataframe schemas + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f1")) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 12) + } + + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features f2 & f3 are switched between the training and scoring sets, so that they should have an absolute + * fill rate difference of 0.25, and a relative fill ratio difference of 6. The RawFeatureFilter is set up with a + * maximum fill ratio difference of 4 so both f2 and f3 (as well as their corresponding map keys) should be removed. 
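 + * (As a check: f2 is filled in 1 - 0.95 = 0.05 of training rows and 1 - 0.7 = 0.3 of scoring rows, so the absolute + * difference of 0.25 passes, but the fill ratio 0.3 / 0.05 = 6 exceeds the maximum of 4; f3 is the mirror image.)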
 + */ + it should "correctly clean the dataframe containing map and non-map features due to max fill ratio " + "difference" in { + // Define random generators that will be the same for training and scoring dataframes + val realGenerator1 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) + val realGenerator2 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) + val realGenerator3 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.7) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, r1, r2, r3) = + generateRandomDfAndFeatures[Real, Real, Real]( + realGenerator1, realGenerator2, realGenerator3, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Real, RealMap, Double](r1, r2, r3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Real, Real, Real]( + realGenerator1, realGenerator3, realGenerator2, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(r1, r2, r3, mapFeatureRaw) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 1.0, 4.0, 1.0, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop two features, as well as their corresponding map keys + filteredRawData.featuresToDrop.length shouldBe 2 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + + // Since we switched the distributions in features 2 & 3 between the training and scoring sets, both of them + // should be removed (the two raw features and the two corresponding map keys) + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2", "myF3") + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2", "f3") + + // Check the actual filtered dataframe schemas + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f1")) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 12) + } + + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. 
Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features f1 & f3 are switched between the training and scoring sets, so they should have a very large JS + * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both + * f1 and f3 (as well as their corresponding map keys) should be removed. + */ + it should "correctly clean the dataframe containing map and non-map features due to JS divergence" in { + // Define random generators that will be the same for training and scoring dataframes + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 1000.0, sigma = 5.0).withProbabilityOfEmpty(0.1) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator3, currencyGenerator2, currencyGenerator1, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) + + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 0.8, 1.0) + val filteredRawData = filter.generateFilteredRaw(features, params) - // Check that the only feature that was dropped was the binary map feature (myF4) // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop two features, as well as their corresponding map keys + filteredRawData.featuresToDrop.length shouldBe 2 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + + // The features whose distributions were swapped should be thrown out + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF1", "myF3") + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f3") + + // Check the actual filtered dataframe schemas + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe false + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f2"))
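 + // myF2 (and map key f2) is drawn from the same generator in both sets, so its JS divergence stays near zero and + // it should survive.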
 + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 12) + } + + it should "not drop protected raw features" in { + // Define random generators that will be the same for training and scoring dataframes + val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) + val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) + val currencyGenerator50 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.5) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator25, currencyGenerator95, currencyGenerator50, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator95, currencyGenerator50, currencyGenerator25, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) + + // Check that also using the scoring reader will result in the features rarely filled in either the training or + // scoring sets being removed, except for the protected feature that would otherwise be removed + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), + bins = 10, + minFill = 0.1, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 1.0, + maxCorrelation = 1.0, + protectedFeatures = Set("myF1") + ) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + // Check that we drop one feature, as well as two map keys (map keys are not protected) + filteredRawData.featuresToDrop.length shouldBe 1 + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + + // The feature that is 95% empty should be thrown out, while the protected myF1 is kept + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2") + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f2") + // filteredData.mapKeysToDrop + // .foldLeft(Set.empty[String])((acc, x) => acc ++ x._2) should contain theSameElementsAs Seq("f1", "f2") + + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true + filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false + 
filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("f3")) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + // The map and non-map features should also be the same + assertFeatureDistributionEquality(filteredRawData, total = 12) } + + // TODO: check null leakage removals (just do one threshold, or two in the same test) + it should "correctly clean the dataframe returned and give the features to blacklist" in { val params = new OpParams() val survPred = survived.copy(isResponse = false) @@ -469,6 +731,37 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with fd.trainingFeatureDistributions ++ fd.scoringFeatureDistributions shouldBe fd.featureDistributions } + private def assertFeatureDistributionEquality(fd: FilteredRawData, total: Int): Unit = { + fd.featureDistributions.length shouldBe total + fd.trainingFeatureDistributions.zip(fd.scoringFeatureDistributions).foreach { + case (train, score) => + train.name shouldBe score.name + train.key shouldBe score.key + train.count shouldBe score.count + train.nulls shouldBe score.nulls + train.distribution shouldBe score.distribution + train.summaryInfo shouldBe score.summaryInfo + } + } + + /** + * Defines readers in terms of datasets (in these tests, already created by feature generators) + * + * @param trainDf Training dataframe + * @param scoreDf Scoring dataframe + * @return Tuple of (trainingReader, scoringReader) + */ + private def makeReaders(trainDf: Dataset[Row], scoreDf: Dataset[Row]): (CustomReader[Row], CustomReader[Row]) = { + val trainReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(trainDf) + } + val scoreReader = new CustomReader[Row](ReaderKey.randomKey) { + def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Row], Dataset[Row]] = Right(scoreDf) + } + + (trainReader, scoreReader) + } + private def nullLabelCorrelationTest( maxCorrelation: Double, expectedDropped: Seq[OPFeature],
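Patch 04 below folds the repeated drop assertions into a checkDroppedFeatures helper. Its definition falls outside the excerpted hunks; judging from its call sites and the assertions it replaces, a sketch of it (parameter names taken from the call sites, with the map feature type simplified to the CurrencyMap case) would look roughly like:

    private def checkDroppedFeatures(
      filteredRawData: FilteredRawData,
      mapFeature: FeatureLike[CurrencyMap],
      featureUniverse: Set[String],
      expectedDroppedFeatures: Set[String],
      mapKeyUniverse: Set[String],
      expectedDroppedKeys: Set[String]
    ): Unit = {
      // Exactly the expected raw features and map keys should be dropped
      filteredRawData.featuresToDrop.map(_.name).toSet shouldBe expectedDroppedFeatures
      filteredRawData.mapKeysToDrop.values.flatten.toSet shouldBe expectedDroppedKeys
      // Dropped columns should be gone from the cleaned dataframe, while the rest survive
      featureUniverse.foreach { name =>
        filteredRawData.cleanedData.schema.fields.exists(_.name == name) shouldBe !expectedDroppedFeatures.contains(name)
      }
      // Surviving map values should only contain keys that were not dropped
      filteredRawData.cleanedData.collect(mapFeature).foreach(m =>
        m.value.keySet.subsetOf(mapKeyUniverse -- expectedDroppedKeys) shouldBe true
      )
    }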
From 8469d1f90532dce9a66df1e55a12f3b6b7919bdd Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 12:02:16 -0700 Subject: [PATCH 04/13] Cleaned up new RFF tests, and added helper function for repetitive checks --- .../op/filters/RawFeatureFilterTest.scala | 665 ++++++++---------- 1 file changed, 310 insertions(+), 355 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 31b02152e3..6b69eb2f71 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -33,13 +33,13 @@ package com.salesforce.op.filters import com.salesforce.op.{OpParams, OpWorkflow} import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike, OPFeature} import com.salesforce.op.features.types._ -import com.salesforce.op.readers.{CustomReader, DataFrameFieldNames, ReaderKey} +import com.salesforce.op.readers.{CustomReader, ReaderKey} +import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer import com.salesforce.op.test._ import com.salesforce.op.testkit._ import com.salesforce.op.testkit.RandomData import com.salesforce.op.stages.impl.feature.OPMapVectorizerTestHelper.makeTernaryOPMapTransformer import com.salesforce.op.utils.spark.RichDataset._ -import com.twitter.algebird.Operators._ import org.apache.log4j.Level import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.junit.runner.RunWith @@ -51,6 +52,8 @@ import scala.reflect.runtime.universe.TypeTag @RunWith(classOf[JUnitRunner]) class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { // loggingLevel(Level.INFO) + // Our randomly generated data will generate feature names and corresponding map keys in this universe + val featureUniverse = Set("myF1", "myF2", "myF3") + val mapKeyUniverse = Set("f1", "f2", "f3") + Spec[RawFeatureFilter[_]] should "compute feature stats correctly" in { val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) @@ -147,76 +151,6 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } - /** - * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the - * columns of the dataframe are fixed to be myF1, myF2, and myF3 so that the same OPFeatures can be used to - * refer to columns in either dataframe. 
 - * - * @param f1 Random data generator for feature 1 (type F1) - * @param f2 Random data generator for feature 2 (type F2) - * @param f3 Random data generator for feature 3 (type F3) - * @param numRows Number of rows to generate - * @tparam F1 Type of feature 1 - * @tparam F2 Type of feature 2 - * @tparam F3 Type of feature 3 - * @return Tuple containing the generated dataframe and each individual OPFeature - */ - def generateRandomDfAndFeatures[ - F1 <: FeatureType : TypeTag, - F2 <: FeatureType : TypeTag, - F3 <: FeatureType : TypeTag - ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], numRows: Int): - (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { - - val f1Data = f1.limit(numRows) - val f2Data = f2.limit(numRows) - val f3Data = f3.limit(numRows) - - // Combine the data into a single tuple for each row - val generatedTrainData: Seq[(F1, F2, F3)] = f1Data.zip(f2Data).zip(f3Data).map { - case ((a, b), c) => (a, b, c) - } - - TestFeatureBuilder[F1, F2, F3]("myF1", "myF2", "myF3", generatedTrainData) - } - - /** - * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the - * columns of the dataframe are fixed to be myF1, myF2, myF3, and myF4 so that the same OPFeatures can be used to - * refer to columns in either dataframe. - * - * @param f1 Random data generator for feature 1 (type F1) - * @param f2 Random data generator for feature 2 (type F2) - * @param f3 Random data generator for feature 3 (type F3) - * @param f4 Random data generator for feature 4 (type F4) - * @param numRows Number of rows to generate - * @tparam F1 Type of feature 1 - * @tparam F2 Type of feature 2 - * @tparam F3 Type of feature 3 - * @tparam F4 Type of feature 4 - * @return Tuple containing the generated dataframe and each individual OPFeature - */ - def generateRandomDfAndFeatures[ - F1 <: FeatureType : TypeTag, - F2 <: FeatureType : TypeTag, - F3 <: FeatureType : TypeTag, - F4 <: FeatureType : TypeTag - ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], f4: RandomData[F4], numRows: Int): - (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { - - val f1Data = f1.limit(numRows) - val f2Data = f2.limit(numRows) - val f3Data = f3.limit(numRows) - val f4Data = f4.limit(numRows) - - // Combine the data into a single tuple for each row - val generatedTrainData: Seq[(F1, F2, F3, F4)] = f1Data.zip(f2Data).zip(f3Data).zip(f4Data).map { - case (((a, b), c), d) => (a, b, c, d) - } - - TestFeatureBuilder[F1, F2, F3, F4]("myF1", "myF2", "myF3", "myF4", generatedTrainData) - } - it should "not remove any features when the training and scoring sets are identical" in { // Define random generators that will be the same for training and scoring dataframes val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) val pickListGenerator = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G", "H", "I")) .withProbabilityOfEmpty(0.2) val currencyGenerator = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( cityGenerator, countryGenerator, pickListGenerator, currencyGenerator, 1000 ) // Define the readers val (trainReader, scoreReader) = makeReaders(trainDf, trainDf) val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(trainCity, trainCountry, trainPickList, trainCurrency) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.4, 0.1, 1.0, 0.1, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) filteredRawData.featuresToDrop shouldBe empty filteredRawData.mapKeysToDrop shouldBe empty filteredRawData.cleanedData.schema.fields should contain theSameElementsAs trainReader.generateDataFrame(features).schema.fields - assertFeatureDistributions(filteredRawData, total = features.length * 2) - - // Also check that all the feature distributions are the same between the training and scoring sets - filteredRawData.trainingFeatureDistributions.zip(filteredRawData.scoringFeatureDistributions).foreach{ - case (train, score) => - train.name shouldBe score.name - train.key shouldBe score.key - train.count shouldBe score.count - train.nulls shouldBe score.nulls - train.distribution shouldBe score.distribution - train.summaryInfo shouldBe score.summaryInfo - } + assertFeatureDistributionEquality(filteredRawData, total = features.length * 2) } it should "correctly clean the dataframe containing map and non-map features due to min fill rate" in { @@ -288,32 +211,19 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - // Check that using the training reader only will result in the rarely filled features being removed val filter = new 
RawFeatureFilter(trainReader, None, 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 1 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 1 - - // The feature that is 95% empty should be thrown out - filteredRawData.featuresToDrop.head.name.startsWith("myF2") shouldBe true - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2") - - // Check the actual filtered dataframe schemas - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet should not contain "f2") - - // There should be 6 FeatureDistributions - training for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 6) - + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f2") + ) // Check that also using the scoring reader will result in the features rarely filled in either the training or // scoring sets being removed val filterWithScoring = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) val filteredRawDataWithScoring = filterWithScoring.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop two features, as well as their corresponding map keys - filteredRawDataWithScoring.featuresToDrop.length shouldBe 2 - filteredRawDataWithScoring.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawDataWithScoring.mapKeysToDrop.values.head.size shouldBe 2 - - // The features that are rarely filled in either set should be thrown out - filteredRawDataWithScoring.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF1", "myF2") - filteredRawDataWithScoring.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f2") - // filteredDataWithScoring.mapKeysToDrop - // .foldLeft(Set.empty[String])((acc, x) => acc ++ x._2) should contain theSameElementsAs Seq("f1", "f2") - - filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe false - filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawDataWithScoring.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true - filteredRawDataWithScoring.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f3")) + checkDroppedFeatures( + filteredRawDataWithScoring, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF1", "myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f2") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawDataWithScoring, 
total = 12) + assertFeatureDistributions(filteredRawDataWithScoring, total = 12) } /** @@ -355,24 +256,24 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with it should "correctly clean the dataframe containing map and non-map features due to max absolute fill rate " + "difference" in { // Define random generators that will be the same for training and scoring dataframes - val realGenerator1 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) - val realGenerator2 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) - val realGenerator3 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.8) + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.2) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.8) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) - val (trainDf, r1, r2, r3) = - generateRandomDfAndFeatures[Real, Real, Real]( - realGenerator1, realGenerator2, realGenerator3, 1000 + val (trainDf, c1, c2, c3) = + generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 ) - val mapFeature = makeTernaryOPMapTransformer[Real, RealMap, Double](r1, r2, r3) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) - val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Real, Real, Real]( - realGenerator1, realGenerator3, realGenerator2, 1000 + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator3, currencyGenerator2, 1000 ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -381,40 +282,22 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here - val features: Array[OPFeature] = Array(r1, r2, r3, mapFeatureRaw) + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 0.4, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) - /* - val exclusions = filter.getFeaturesToExclude( - trainingDistribs = filteredRawData.trainingFeatureDistributions, - scoringDistribs = filteredRawData.scoringFeatureDistributions, - correlationInfo = Map.empty - ) - println(exclusions) - */ - // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 2 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 - - // Since we switched the
distributions in features 2 & 3 between the training and scoring sets, then both of them - // should be removed (the two raw features and the two corresponding map keys) - filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2", "myF3") - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2", "f3") - - // Check the actual filtered dataframe schemas - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f1")) + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2", "myF3"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f2", "f3") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 12) + assertFeatureDistributions(filteredRawData, total = 12) } /** @@ -429,24 +312,24 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with it should "correctly clean the dataframe containing map and non-map features due to max fill ratio " + "difference" in { // Define random generators that will be the same for training and scoring dataframes - val realGenerator1 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) - val realGenerator2 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) - val realGenerator3 = RandomReal.logNormal[Real](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.7) + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.7) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) - val (trainDf, r1, r2, r3) = - generateRandomDfAndFeatures[Real, Real, Real]( - realGenerator1, realGenerator2, realGenerator3, 1000 + val (trainDf, c1, c2, c3) = + generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 ) - val mapFeature = makeTernaryOPMapTransformer[Real, RealMap, Double](r1, r2, r3) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) - val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Real, Real, Real]( - realGenerator1, realGenerator3, realGenerator2, 1000 + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator3, currencyGenerator2, 1000 ) val transformedScoreDf = new 
OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -455,31 +338,22 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here - val features: Array[OPFeature] = Array(r1, r2, r3, mapFeatureRaw) + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 1.0, 4.0, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 2 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 - - // Since we switched the distributions in features 2 & 3 between the training and scoring sets, then both of them - // should be removed (the two raw features and the two corresponding map keys) - filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2", "myF3") - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f2", "f3") - - // Check the actual filtered dataframe schemas - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f1")) + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2", "myF3"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f2", "f3") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 12) + assertFeatureDistributions(filteredRawData, total = 12) } /** @@ -519,33 +393,24 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 0.8, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 2 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 - - // The feature that is 99% empty should be thrown out - filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF1", "myF3") - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f3") - - // Check the actual filtered dataframe schemas - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe true -
filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe false - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f2")) + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF1", "myF3"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f3") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 12) + assertFeatureDistributions(filteredRawData, total = 12) } - it should "not drop protected raw features" in { + it should "not drop protected raw features or response features" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) @@ -573,10 +438,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - // Check that using the scoring reader only will result in the rarely filled in both training and scoring sets // being removed, except for the protected feature that would normally be removed - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), + val filterWithProtected = new RawFeatureFilter(trainReader, Some(scoreReader), bins = 10, minFill = 0.1, maxFillDifference = 1.0, @@ -585,141 +449,234 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with maxCorrelation = 1.0, protectedFeatures = Set("myF1") ) - val filteredRawData = filter.generateFilteredRaw(features, params) + val filteredRawData = filterWithProtected.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow - // Check that we drop one feature, as well as its corresponding map key - filteredRawData.featuresToDrop.length shouldBe 1 - filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 - filteredRawData.mapKeysToDrop.values.head.size shouldBe 2 + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f2") + ) - // The feature that is 99% empty should be thrown out - filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs Seq("myF2") - filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs Set("f1", "f2") - // filteredData.mapKeysToDrop - // .foldLeft(Set.empty[String])((acc, x) => acc ++ x._2) should contain theSameElementsAs Seq("f1", "f2") + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + assertFeatureDistributions(filteredRawData, total = 12) - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF1") shouldBe true - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF2") shouldBe false - filteredRawData.cleanedData.schema.fields.exists(_.name == "myF3") shouldBe true - 
filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("f3")) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), + bins = 10, + minFill = 0.1, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 1.0, + maxCorrelation = 1.0 + ) + val featuresWithResponse: Array[OPFeature] = Array(c1.copy(isResponse = true), c2, c3, mapFeatureRaw) + val filteredRawDataWithResponse = filter.generateFilteredRaw(featuresWithResponse, params) + + checkDroppedFeatures( + filteredRawDataWithResponse, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f2") + ) // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys - // The map and non-map features should also be the same - assertFeatureDistributionEquality(filteredRawData, total = 12) + assertFeatureDistributions(filteredRawDataWithResponse, total = 12) } - - // TODO: check null leakage removals (just do one threshold, or two in the same test) - - it should "correctly clean the dataframe returned and give the features to blacklist" in { - val params = new OpParams() - val survPred = survived.copy(isResponse = false) - val features: Array[OPFeature] = - Array(survPred, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.0, 1.0, Double.PositiveInfinity, 1.0, 1.0) - val filteredRawData = filter.generateFilteredRaw(features, params) - filteredRawData.featuresToDrop shouldBe empty - filteredRawData.mapKeysToDrop shouldBe empty - filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields - - assertFeatureDistributions(filteredRawData, total = 26) - - val filter1 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0) - val filteredRawData1 = filter1.generateFilteredRaw(features, params) - filteredRawData1.featuresToDrop should contain theSameElementsAs Array(survPred) - filteredRawData1.mapKeysToDrop should contain theSameElementsAs Map( - "numericMap" -> Set("Male"), "booleanMap" -> Set("Male"), "stringMap" -> Set("Male")) - filteredRawData1.cleanedData.schema.fields.exists(_.name == survPred.name) shouldBe false - filteredRawData1.cleanedData.collect(stringMap).foreach(m => - if (m.nonEmpty) m.value.keySet shouldEqual Set("Female")) - assertFeatureDistributions(filteredRawData, total = 26) - } + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features f1 & f3 are switched between the training and scoring sets, so they should have a very large JS + * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both + * f1 and f3 (as well as their corresponding map keys) should be removed, but they are added to a list of features + * protected from JS divergence removal. 
+ */ + it should "not drop JS divergence-protected features based on JS divergence check" in { + // Define random generators that will be the same for training and scoring dataframes + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 1000.0, sigma = 5.0).withProbabilityOfEmpty(0.1) - it should "not drop response features" in { - val params = new OpParams() - val features: Array[OPFeature] = - Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0) - val filteredRawData = filter.generateFilteredRaw(features, params) - filteredRawData.featuresToDrop shouldBe empty - filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields - filteredRawData.cleanedData.collect(stringMap) - .foreach(m => if (m.nonEmpty) m.value.keySet shouldEqual Set("Female")) - assertFeatureDistributions(filteredRawData, total = 26) - } + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) - it should "not drop protected features" in { - val params = new OpParams() - val features: Array[OPFeature] = - Array(survived, age, gender, height, weight, description, boarded) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9) - val filteredRawData = filter.generateFilteredRaw(features, params) - filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) - filteredRawData.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs - Array(DataFrameFieldNames.KeyFieldName, survived.name) - assertFeatureDistributions(filteredRawData, total = 14) + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator3, currencyGenerator2, currencyGenerator1, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) - val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9, - protectedFeatures = Set(age.name, gender.name)) - val filteredRawData2 = filter2.generateFilteredRaw(features, params) - filteredRawData2.featuresToDrop.toSet shouldEqual Set(height, weight, description, boarded) - filteredRawData2.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs - Array(DataFrameFieldNames.KeyFieldName, survived.name, age.name, gender.name) - assertFeatureDistributions(filteredRawData, total = 14) - } + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, 
transformedScoreDf) - it should "not drop JS divergence-protected features based on JS divergence check" in { val params = new OpParams() - val features: Array[OPFeature] = - Array(survived, age, gender, height, weight, description, boarded, boardedTime, boardedTimeAsDateTime) - val filter = new RawFeatureFilter( - trainingReader = dataReader, - scoringReader = Some(simpleReader), + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), bins = 10, - minFill = 0.0, + minFill = 0.1, maxFillDifference = 1.0, maxFillRatioDiff = Double.PositiveInfinity, - maxJSDivergence = 0.0, + maxJSDivergence = 0.8, maxCorrelation = 1.0, - jsDivergenceProtectedFeatures = Set(boardedTime.name, boardedTimeAsDateTime.name) + jsDivergenceProtectedFeatures = Set("myF3") ) - val filteredRawData = filter.generateFilteredRaw(features, params) - filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) - filteredRawData.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs - Seq(DataFrameFieldNames.KeyFieldName, survived.name, boardedTime.name, boardedTimeAsDateTime.name) - assertFeatureDistributions(filteredRawData, total = 18) - } - it should "correctly drop features based on null-label leakage correlation greater than 0.9" in { - val expectedDropped = Seq(boarded, weight, gender) - val expectedMapKeys = Seq("Female", "Male") - val expectedDroppedMapKeys = Map[String, Set[String]]() - nullLabelCorrelationTest(0.9, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse, + expectedDroppedFeatures = Set("myF1"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f1", "f3") + ) + + // There should be 12 FeatureDistributions - training and scoring for 3 raw features, one map with three keys + assertFeatureDistributions(filteredRawData, total = 12) } - it should "correctly drop features based on null-label leakage correlation greater than 0.6" in { - val expectedDropped = Seq(boarded, weight, gender, age) - val expectedMapKeys = Seq("Female", "Male") - val expectedDroppedMapKeys = Map[String, Set[String]]() - nullLabelCorrelationTest(0.6, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + + it should "correctly drop features based on null-label correlations" in { + // Define random generators that will be the same for training and scoring dataframes + val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) + val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) + + // Define the training dataframe and the features (these should be the same between the training and scoring + // dataframes since they point to columns with the same names) + val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + ) + val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + // Need to make a raw version of this feature so that RawFeatureFilter 
will pick it up + val mapFeatureRaw = mapFeature.asRaw(isResponse = false) + + // Construct a label that we know is highly biased from the pickList data to check if SanityChecker detects it + val labelTransformer = new UnaryLambdaTransformer[Currency, RealNN](operationName = "labelFunc", + transformFn = r => r.value match { + case Some(v) => RealNN(1.0) + case _ => RealNN(0.0) + } + ) + val labelData = labelTransformer.setInput(c2).getOutput().asInstanceOf[Feature[RealNN]] + .copy(isResponse = true) + val labelDataRaw = labelData.asRaw(isResponse = true) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(trainDf) + + // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) + val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( + currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + ) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(scoreDf) + + // Define the readers + val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) + + val params = new OpParams() + // We should be able to set the features to either be the train features or the score ones here + val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw, labelDataRaw) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 0.8) + val filteredRawData = filter.generateFilteredRaw(features, params) + + // TODO: check that filter.getFeaturesToExclude contains the correlation exclusions too + // TODO: Add a check for the reason dropped once that information is passed on to the workflow + checkDroppedFeatures( + filteredRawData, + mapFeatureRaw, + featureUniverse = featureUniverse ++ Set(labelData.name), + expectedDroppedFeatures = Set("myF2"), + mapKeyUniverse = mapKeyUniverse, + expectedDroppedKeys = Set("f2") + ) + + // There should be 14 FeatureDistributions - training and scoring for 4 raw features, one map with three keys + assertFeatureDistributions(filteredRawData, total = 14) } - it should "correctly drop features based on null-label leakage correlation greater than 0.4" in { - val expectedDropped = Seq(boarded, weight, gender, age, description) - val expectedMapKeys = Seq("Male") - val expectedDroppedMapKeys = Map("booleanMap" -> Set("Female"), "stringMap" -> Set("Female"), - "numericMap" -> Set("Female")) - nullLabelCorrelationTest(0.4, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + /** + * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the + * columns of the dataframe are fixed to be myF1, myF2, and myF3 so that the same OPFeatures can be used to + * refer to columns in either dataframe.
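As a usage sketch of this helper (illustrative generator settings only; it assumes the RandomReal generators and the three-feature variant of generateRandomDfAndFeatures introduced in this patch):

// Sketch only: runs inside this test class, where RandomReal, Currency, and
// the three-feature generateRandomDfAndFeatures helper are in scope.
val g1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25)
val g2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95)
val g3 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.50)

// Columns are always named myF1/myF2/myF3, so the returned OPFeatures can be
// reused to refer to any dataframe produced by the same helper.
val (df, f1, f2, f3) = generateRandomDfAndFeatures[Currency, Currency, Currency](g1, g2, g3, 1000)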
+ * + * @param f1 Random data generator for feature 1 (type F1) + * @param f2 Random data generator for feature 2 (type F2) + * @param f3 Random data generator for feature 3 (type F3) + * @param numRows Number of rows to generate + * @tparam F1 Type of feature 1 + * @tparam F2 Type of feature 2 + * @tparam F3 Type of feature 3 + * @return Tuple containing the generated dataframe and each individual OPFeature + */ + def generateRandomDfAndFeatures[ + F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag + ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], numRows: Int): + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { + + val f1Data = f1.limit(numRows) + val f2Data = f2.limit(numRows) + val f3Data = f3.limit(numRows) + + // Combine the data into a single tuple for each row + val generatedTrainData: Seq[(F1, F2, F3)] = f1Data.zip(f2Data).zip(f3Data).map { + case ((a, b), c) => (a, b, c) + } + + TestFeatureBuilder[F1, F2, F3]("myF1", "myF2", "myF3", generatedTrainData) } - it should "correctly drop features based on null-label leakage correlation greater than 0.3" in { - val expectedDropped = Seq(boarded, weight, gender, age, description, booleanMap, numericMap, stringMap) - // all the maps dropped - val expectedDroppedMapKeys = Map[String, Set[String]]() - nullLabelCorrelationTest(0.3, expectedDropped, Seq(), expectedDroppedMapKeys) + /** + * Generates a random dataframe and OPFeatures from supplied data generators and their types. The names of the + * columns of the dataframe are fixed to be myF1, myF2, myF3, and myF4 so that the same OPFeatures can be used to + * refer to columns in either dataframe. + * + * @param f1 Random data generator for feature 1 (type F1) + * @param f2 Random data generator for feature 2 (type F2) + * @param f3 Random data generator for feature 3 (type F3) + * @param f4 Random data generator for feature 4 (type F4) + * @param numRows Number of rows to generate + * @tparam F1 Type of feature 1 + * @tparam F2 Type of feature 2 + * @tparam F3 Type of feature 3 + * @tparam F4 Type of feature 4 + * @return Tuple containing the generated dataframe and each individual OPFeature + */ + def generateRandomDfAndFeatures[ + F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag, + F4 <: FeatureType : TypeTag + ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], f4: RandomData[F4], numRows: Int): + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { + + val f1Data = f1.limit(numRows) + val f2Data = f2.limit(numRows) + val f3Data = f3.limit(numRows) + val f4Data = f4.limit(numRows) + + // Combine the data into a single tuple for each row + val generatedTrainData: Seq[(F1, F2, F3, F4)] = f1Data.zip(f2Data).zip(f3Data).zip(f4Data).map { + case (((a, b), c), d) => (a, b, c, d) + } + + TestFeatureBuilder[F1, F2, F3, F4]("myF1", "myF2", "myF3", "myF4", generatedTrainData) } private def assertFeatureDistributions(fd: FilteredRawData, total: Int): Assertion = { @@ -733,7 +690,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with private def assertFeatureDistributionEquality(fd: FilteredRawData, total: Int): Unit = { fd.featureDistributions.length shouldBe total - fd.trainingFeatureDistributions.zip(fd.trainingFeatureDistributions).foreach { + fd.trainingFeatureDistributions.zip(fd.scoringFeatureDistributions).foreach { case (train, score) => train.name shouldBe score.name train.key shouldBe score.key @@ -762,42 +719,40 @@ class 
RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with (trainReader, scoreReader) } - private def nullLabelCorrelationTest( - maxCorrelation: Double, - expectedDropped: Seq[OPFeature], - expectedMapKeys: Seq[String], - expectedDroppedMapKeys: Map[String, Set[String]] + // TODO: Expand scope to take multiple map types, and/or type parameters for maps of different types + /** + * Automates various checks on whether features are removed from the cleaned dataframe produced by RawFeatureFilter. + * Right now, it is specialized to accept just one map type feature, which is hardcoded to be a CurrencyMap based + * on current tests. + * + * @param filteredRawData FilteredRawData object produced by RawFeatureFilter + * @param mapFeatureRaw Raw map feature to check keys on + * @param featureUniverse Set of raw feature names you start with + * @param expectedDroppedFeatures Expected set of raw feature names to be dropped + * @param mapKeyUniverse Set of map keys in mapFeatureRaw you start with + * @param expectedDroppedKeys Expected set of map keys to be dropped + */ + private def checkDroppedFeatures( + filteredRawData: FilteredRawData, + mapFeatureRaw: FeatureLike[CurrencyMap], + featureUniverse: Set[String], + expectedDroppedFeatures: Set[String], + mapKeyUniverse: Set[String], + expectedDroppedKeys: Set[String] ): Unit = { - def getFilter(maxCorrelation: Double): RawFeatureFilter[Passenger] = new RawFeatureFilter( - trainingReader = dataReader, - scoringReader = Some(simpleReader), - bins = 10, - minFill = 0.0, - maxFillDifference = 1.0, - maxFillRatioDiff = Double.PositiveInfinity, - maxJSDivergence = 1.0, - maxCorrelation = maxCorrelation) + // Check that we drop the expected features, as well as their corresponding map keys + filteredRawData.featuresToDrop.length shouldBe expectedDroppedFeatures.size + filteredRawData.mapKeysToDrop.keySet.size shouldBe 1 + filteredRawData.mapKeysToDrop.values.head.size shouldBe expectedDroppedKeys.size - val params = new OpParams() - val features: Array[OPFeature] = - Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filteredRawData@FilteredRawData(df, dropped, droppedKeyValue, _) = - getFilter(maxCorrelation).generateFilteredRaw(features, params) - - assertFeatureDistributions(filteredRawData, total = 26) - dropped should contain theSameElementsAs expectedDropped - droppedKeyValue should contain theSameElementsAs expectedDroppedMapKeys - - df.schema.fields.map(_.name) should contain theSameElementsAs - DataFrameFieldNames.KeyFieldName +: features.diff(dropped).map(_.name) - if (expectedMapKeys.nonEmpty) { - df.collect(booleanMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys - df.collect(numericMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys - df.collect(stringMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys - } else { - intercept[IllegalArgumentException] { df.collect(booleanMap) } - intercept[IllegalArgumentException] { df.collect(numericMap) } - intercept[IllegalArgumentException] { df.collect(stringMap) } - } + filteredRawData.featuresToDrop.map(_.name) should contain theSameElementsAs expectedDroppedFeatures + filteredRawData.mapKeysToDrop.head._2 should contain theSameElementsAs expectedDroppedKeys + + // Check the actual filtered dataframe schemas + featureUniverse.foreach(f => { + filteredRawData.cleanedData.schema.fields.exists(_.name == f) shouldBe
!expectedDroppedFeatures.contains(f) + filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => + if (m.nonEmpty) m.value.keySet.intersect(expectedDroppedKeys) shouldBe Set.empty) + }) } } From 0999ac9657c67eeae80de563d49708a56eb17c07 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 15:20:06 -0700 Subject: [PATCH 05/13] More documentation and readability changes --- .../op/filters/RawFeatureFilterTest.scala | 111 +++++++++++------- 1 file changed, 69 insertions(+), 42 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 6b69eb2f71..eec086b7c3 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -57,6 +57,8 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Our randomly generated data will generate feature names and corresponding map keys in this universe val featureUniverse = Set("myF1", "myF2", "myF3") val mapKeyUniverse = Set("f1", "f2", "f3") + // Number of rows to use in randomly generated data sets + val numRows = 1000 Spec[RawFeatureFilter[_]] should "compute feature stats correctly" in { val features: Array[OPFeature] = @@ -151,6 +153,12 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } + /** + * This test uses several data generators to generate data according to different distributions, makes a reader + * corresponding to the generated dataframe, and then uses that as both the training and scoring reader in + * RawFeatureFilter. Not only should no features be removed, but the training and scoring distributions should be + * identical. + */ it should "not remove any features when the training and scoring sets are identical" in { // Define random generators that will be the same for training and scoring dataframes val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) @@ -163,7 +171,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // dataframes since they point to columns with the same names) val (trainDf, trainCity, trainCountry, trainPickList, trainCurrency) = generateRandomDfAndFeatures[City, Country, PickList, Currency]( - cityGenerator, countryGenerator, pickListGenerator, currencyGenerator,1000 + cityGenerator, countryGenerator, pickListGenerator, currencyGenerator, numRows ) // Define the readers @@ -183,6 +191,17 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with assertFeatureDistributionEquality(filteredRawData, total = features.length * 2) } + /** + * This test generates three numeric generators with the same underlying distribution, but different fill rates. + * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three + * raw features - each key contains the same data as the corresponding raw feature. + * + * Features c1, c2, and c3 are permuted between the training and scoring sets. In the training set, feature c2 + * has a 5% fill rate and should be removed. In the scoring set, map key f1 has a 5% fill rate so should be removed. + * This test checks removal when only the training reader is used, and when both the training and scoring readers + * are used. 
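For intuition, the minimum-fill-rate rule that this test exercises can be sketched in isolation as follows. This is a minimal standalone sketch, not RawFeatureFilter's actual implementation; the fill rates are those implied by the generators used in the test (fill rate = 1 - probabilityOfEmpty, with the generators permuted between the training and scoring sets):

// A feature fails the check when its fill rate (fraction of non-empty rows)
// falls below minFill in either the training or the scoring set.
case class FillStats(name: String, trainFill: Double, scoreFill: Double)

def failsMinFill(s: FillStats, minFill: Double): Boolean =
  s.trainFill < minFill || s.scoreFill < minFill

val stats = Seq(
  FillStats("myF1", trainFill = 0.75, scoreFill = 0.05), // dropped only once the scoring set is checked
  FillStats("myF2", trainFill = 0.05, scoreFill = 0.50), // dropped by training checks alone
  FillStats("myF3", trainFill = 0.50, scoreFill = 0.75)  // kept
)
stats.filter(s => failsMinFill(s, minFill = 0.1)).map(_.name) // List(myF1, myF2)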
+ * + */ it should "correctly clean the dataframe containing map and non-map features due to min fill rate" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) @@ -192,7 +211,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator25, currencyGenerator95, currencyGenerator50, 1000 + currencyGenerator25, currencyGenerator95, currencyGenerator50, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -201,7 +220,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator95, currencyGenerator50, currencyGenerator25,1000 + currencyGenerator95, currencyGenerator50, currencyGenerator25, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -220,7 +239,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF2"), + expectedDroppedFeatures = Set(c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f2") ) @@ -235,7 +254,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawDataWithScoring, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF1", "myF2"), + expectedDroppedFeatures = Set(c1.name, c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f2") ) @@ -249,9 +268,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three * raw features - each key contains the same data as the corresponding raw feature. * - * Features f2 & f3 are switched between the training and scoring sets, so that they should have an absolute + * Features c2 & c3 are switched between the training and scoring sets, so that they should have an absolute * fill rate difference of 0.6. The RawFeatureFilter is set up with a maximum absolute fill rate of 0.4 so both - * f2 and f3 (as well as their corresponding map keys) should be removed. + * c2 and c3 (as well as their corresponding map keys f2 & f3) should be removed. 
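The absolute fill-rate-difference rule described above can be sketched as follows (standalone sketch with the fill rates implied by the test's generators, not RawFeatureFilter's actual implementation):

// Drop a feature when |trainFill - scoreFill| exceeds maxFillDifference.
def failsFillDifference(trainFill: Double, scoreFill: Double, maxFillDifference: Double): Boolean =
  math.abs(trainFill - scoreFill) > maxFillDifference

// Swapping the 0.8-fill and 0.2-fill generators gives c2 and c3 a difference of
// 0.6, above the 0.4 threshold used in this test; c1 stays at 1.0 vs 1.0.
failsFillDifference(0.8, 0.2, maxFillDifference = 0.4) // true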
*/ it should "correctly clean the dataframe containing map and non-map features due to max absolute fill rate " + "difference" in { @@ -264,7 +283,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -273,7 +292,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator3, currencyGenerator2, 1000 + currencyGenerator1, currencyGenerator3, currencyGenerator2, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -291,7 +310,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF2", "myF3"), + expectedDroppedFeatures = Set(c2.name, c3.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f2", "f3") ) @@ -305,9 +324,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three * raw features - each key contains the same data as the corresponding raw feature. * - * Features f2 & f3 are switched between the training and scoring sets, so that they should have an absolute + * Features c2 & c3 are switched between the training and scoring sets, so that they should have an absolute * fill rate difference of 0.25, and a relative fill ratio difference of 6. The RawFeatureFilter is set up with a - * maximum fill ratio difference of 4 so both f2 and f3 (as well as their corresponding map keys) should be removed. + * maximum fill ratio difference of 4 so both c2 and c3 (as well as their corresponding map keys) should be removed. 
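Similarly, the fill-ratio rule can be sketched as follows (standalone sketch with the test's implied fill rates; note that a real check must also guard against a zero fill rate before dividing):

// Drop a feature when the larger fill rate is more than maxFillRatioDiff times
// the smaller one.
def failsFillRatio(trainFill: Double, scoreFill: Double, maxFillRatioDiff: Double): Boolean =
  math.max(trainFill, scoreFill) / math.min(trainFill, scoreFill) > maxFillRatioDiff

// c2 and c3 end up at 0.30 vs 0.05 fill: a ratio of 6, above the threshold of 4
// used in this test, while c1 stays at 1.0 vs 1.0 (ratio 1).
failsFillRatio(0.30, 0.05, maxFillRatioDiff = 4.0) // true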
*/ it should "correctly clean the dataframe containing map and non-map features due to max fill ratio " + "difference" in { @@ -320,16 +339,16 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator3, currencyGenerator2, 1000 + currencyGenerator1, currencyGenerator3, currencyGenerator2, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -347,7 +366,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF2", "myF3"), + expectedDroppedFeatures = Set(c2.name, c3.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f2", "f3") ) @@ -361,7 +380,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three * raw features - each key contains the same data as the corresponding raw feature. * - * Features f1 & f3 are switched between the training and scoring sets, so they should have a very large JS + * Features c1 & c3 are switched between the training and scoring sets, so they should have a very large JS * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both * f1 and f3 (as well as their corresponding map keys) should be removed. 
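The divergence being thresholded here can be sketched as follows: a standalone implementation of Jensen-Shannon divergence over binned distributions, using base-2 logs so the result lies in [0, 1]. It illustrates the quantity compared against maxJSDivergence, not RawFeatureFilter's actual implementation:

// Jensen-Shannon divergence between two histograms with the same bin count.
def jsDivergence(p: Array[Double], q: Array[Double]): Double = {
  def normalize(h: Array[Double]): Array[Double] = { val s = h.sum; h.map(_ / s) }
  def kl(a: Array[Double], b: Array[Double]): Double =
    a.zip(b).collect { case (ai, bi) if ai > 0.0 => ai * math.log(ai / bi) / math.log(2.0) }.sum
  val (pn, qn) = (normalize(p), normalize(q))
  val m = pn.zip(qn).map { case (pi, qi) => (pi + qi) / 2.0 } // midpoint distribution
  (kl(pn, m) + kl(qn, m)) / 2.0
}

// Histograms with no overlapping mass diverge maximally, which is effectively
// what swapping the mean-1.0 and mean-1000.0 log-normal generators produces.
jsDivergence(Array(1.0, 0.0), Array(0.0, 1.0)) // 1.0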
*/ @@ -374,7 +393,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -383,7 +402,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator3, currencyGenerator2, currencyGenerator1, 1000 + currencyGenerator3, currencyGenerator2, currencyGenerator1, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -401,7 +420,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF1", "myF3"), + expectedDroppedFeatures = Set(c1.name, c3.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f3") ) @@ -419,7 +438,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator25, currencyGenerator95, currencyGenerator50, 1000 + currencyGenerator25, currencyGenerator95, currencyGenerator50, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -428,7 +447,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator95, currencyGenerator50, currencyGenerator25, 1000 + currencyGenerator95, currencyGenerator50, currencyGenerator25, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -447,7 +466,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with maxFillRatioDiff = Double.PositiveInfinity, maxJSDivergence = 1.0, maxCorrelation = 1.0, - protectedFeatures = Set("myF1") + protectedFeatures = Set(c1.name) ) val filteredRawData = filterWithProtected.generateFilteredRaw(features, params) @@ -456,7 +475,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF2"), + expectedDroppedFeatures = Set(c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f2") ) @@ -479,7 +498,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawDataWithResponse, mapFeatureRaw, featureUniverse = 
featureUniverse, - expectedDroppedFeatures = Set("myF2"), + expectedDroppedFeatures = Set(c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f2") ) @@ -488,26 +507,27 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with assertFeatureDistributions(filteredRawDataWithResponse, total = 12) } + // TODO: Add a way to protect map keys from removal? /** - * This test generates three numeric generators with the same underlying distribution, but different fill rates. - * Each generator corresponds to a different raw feature. Additionally, a single map feature is made from the three - * raw features - each key contains the same data as the corresponding raw feature. + * This test generates three numeric generators with very different underlying distributions. Each generator + * corresponds to a different raw feature. Additionally, a single map feature is made from the three raw features - + * each key contains the same data as the corresponding raw feature. * - * Features f1 & f3 are switched between the training and scoring sets, so they should have a very large JS + * Features c1 & c3 are switched between the training and scoring sets, so they should have a very large JS * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both - * f1 and f3 (as well as their corresponding map keys) should be removed, but they are added to a list of features + * c1 and c3 (as well as their corresponding map keys) should be removed, but c3 is added to a list of features * protected from JS divergence removal. */ it should "not drop JS divergence-protected features based on JS divergence check" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) - val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0) + val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0).withProbabilityOfEmpty(0.1) val currencyGenerator3 = RandomReal.logNormal[Currency](mean = 1000.0, sigma = 5.0).withProbabilityOfEmpty(0.1) // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up @@ -516,7 +536,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator3, currencyGenerator2, currencyGenerator1, 1000 + currencyGenerator3, currencyGenerator2, currencyGenerator1, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) @@ -533,7 +553,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with maxFillRatioDiff = Double.PositiveInfinity, maxJSDivergence = 0.8, maxCorrelation = 1.0, - jsDivergenceProtectedFeatures = Set("myF3") + 
jsDivergenceProtectedFeatures = Set(c3.name) ) val filteredRawData = filter.generateFilteredRaw(features, params) @@ -542,7 +562,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse, - expectedDroppedFeatures = Set("myF1"), + expectedDroppedFeatures = Set(c1.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f1", "f3") ) @@ -551,7 +571,15 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with assertFeatureDistributions(filteredRawData, total = 12) } - + /** + * This test generates three numeric generators where each generator corresponds to a different raw feature. + * Additionally, a single map feature is made from the three raw features - each key contains the same data + * as the corresponding raw feature. + * + * A binary label is generated with a perfect relationship to feature c2 - if it is empty then the label is 0, + * otherwise it is 1. Therefore feature c2 (and its corresponding map key) should be removed by the correlation + * check between a raw feature's null indicator and the label. + */ it should "correctly drop features based on null-label correlations" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) @@ -561,27 +589,26 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Define the training dataframe and the features (these should be the same between the training and scoring // dataframes since they point to columns with the same names) val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - // Construct a label that we know is highly biased from the pickList data to check if SanityChecker detects it + // Construct a label that we know is directly correlated to the currency data val labelTransformer = new UnaryLambdaTransformer[Currency, RealNN](operationName = "labelFunc", transformFn = r => r.value match { case Some(v) => RealNN(1.0) case _ => RealNN(0.0) } ) - val labelData = labelTransformer.setInput(c2).getOutput().asInstanceOf[Feature[RealNN]] - .copy(isResponse = true) + val labelData = labelTransformer.setInput(c2).getOutput().asInstanceOf[Feature[RealNN]].copy(isResponse = true) val labelDataRaw = labelData.asRaw(isResponse = true) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( - currencyGenerator1, currencyGenerator2, currencyGenerator3, 1000 + currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(scoreDf) @@ -600,7 +627,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with filteredRawData, mapFeatureRaw, featureUniverse = featureUniverse ++ Set(labelData.name), -
expectedDroppedFeatures = Set("myF2"), + expectedDroppedFeatures = Set(c2.name), mapKeyUniverse = mapKeyUniverse, expectedDroppedKeys = Set("f2") ) From 8b483bfa722f0fae21316f65278c7b20472cc7fe Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 15:25:13 -0700 Subject: [PATCH 06/13] Small cleanup --- .../com/salesforce/op/filters/RawFeatureFilterTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index eec086b7c3..ffb38277ba 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -778,8 +778,8 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Check the actual filtered dataframe schemas featureUniverse.foreach(f => { filteredRawData.cleanedData.schema.fields.exists(_.name == f) shouldBe !expectedDroppedFeatures.contains(f) - filteredRawData.cleanedData.collect(mapFeatureRaw).foreach(m => - if (m.nonEmpty) m.value.keySet.intersect(expectedDroppedKeys) shouldBe Set.empty) }) + filteredRawData.cleanedData.collect(mapFeatureRaw) + .foreach(_.value.keySet.intersect(expectedDroppedKeys) shouldBe Set.empty) } } From aa1ebb6586808caf53646665ef8b6eea69070a59 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 16:50:34 -0700 Subject: [PATCH 07/13] Fix scalastyle errors --- .../op/filters/RawFeatureFilterTest.scala | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index ffb38277ba..843a1e7789 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -216,13 +216,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator95, currencyGenerator50, currencyGenerator25, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -246,7 +246,8 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with // Check that using the scoring reader only will result in the rarely filled in both training and scoring sets // being removed - val filterWithScoring = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val filterWithScoring = new RawFeatureFilter(trainReader, Some(scoreReader), + 10, 0.1, 1.0, Double.PositiveInfinity, 1.0, 
1.0) val filteredRawDataWithScoring = filterWithScoring.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow @@ -288,13 +289,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator3, currencyGenerator2, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -302,7 +303,8 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 0.4, Double.PositiveInfinity, 1.0, 1.0) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), + 10, 0.0, 0.4, Double.PositiveInfinity, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow @@ -350,7 +352,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator3, currencyGenerator2, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -398,13 +400,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator3, currencyGenerator2, currencyGenerator1, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ 
-443,13 +445,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator95, currencyGenerator50, currencyGenerator25, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -532,13 +534,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator3, currencyGenerator2, currencyGenerator1, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -604,13 +606,13 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with ) val labelData = labelTransformer.setInput(c2).getOutput().asInstanceOf[Feature[RealNN]].copy(isResponse = true) val labelDataRaw = labelData.asRaw(isResponse = true) - val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(trainDf) + val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(trainDf) // Define the scoring dataframe (we can reuse the existing features so don't need to keep them) val (scoreDf, _, _, _) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) - val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(scoreDf) + val transformedScoreDf = new OpWorkflow().setResultFeatures(mapFeature, labelData).transform(scoreDf) // Define the readers val (trainReader, scoreReader) = makeReaders(transformedTrainDf, transformedScoreDf) @@ -650,12 +652,11 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * @tparam F3 Type of feature 3 * @return Tuple containing the generated dataframe and each individual OPFeature */ - def generateRandomDfAndFeatures[ - F1 <: FeatureType : TypeTag, - F2 <: FeatureType : TypeTag, - F3 <: FeatureType : TypeTag + def generateRandomDfAndFeatures[F1 <: 
FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], numRows: Int): - (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3]) = { val f1Data = f1.limit(numRows) val f2Data = f2.limit(numRows) @@ -685,13 +686,12 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * @tparam F4 Type of feature 4 * @return Tuple containing the generated dataframe and each individual OPFeature */ - def generateRandomDfAndFeatures[ - F1 <: FeatureType : TypeTag, - F2 <: FeatureType : TypeTag, - F3 <: FeatureType : TypeTag, - F4 <: FeatureType : TypeTag + def generateRandomDfAndFeatures[F1 <: FeatureType : TypeTag, + F2 <: FeatureType : TypeTag, + F3 <: FeatureType : TypeTag, + F4 <: FeatureType : TypeTag ](f1: RandomData[F1], f2: RandomData[F2], f3: RandomData[F3], f4: RandomData[F4], numRows: Int): - (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { + (Dataset[Row], Feature[F1], Feature[F2], Feature[F3], Feature[F4]) = { val f1Data = f1.limit(numRows) val f2Data = f2.limit(numRows) From 8e911b93dcbac9b327fa3a10ab470cd17fba2f15 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 20:45:52 -0700 Subject: [PATCH 08/13] Debugging --- .../scala/com/salesforce/op/OpWorkflowTest.scala | 8 ++++++++ .../impl/insights/RecordInsightsLOCOTest.scala | 14 ++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala index 42c671831c..466ea9a33b 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala @@ -44,6 +44,7 @@ import com.salesforce.op.stages.impl.tuning._ import com.salesforce.op.test.{Passenger, PassengerSparkFixtureTest, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import org.apache.log4j.Level import org.apache.spark.ml.param.{BooleanParam, ParamMap} import org.apache.spark.ml.tuning.ParamGridBuilder import org.apache.spark.rdd.RDD @@ -238,14 +239,21 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { } it should "use the raw feature filter to generate data instead of the reader when the filter is specified" in { + loggingLevel(Level.INFO) + val fv = Seq(age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap).transmogrify() val survivedNum = survived.occurs() val pred = BinaryClassificationModelSelector().setInput(survivedNum, fv).getOutput() + + val tempdata = new OpWorkflow().setReader(dataReader).setResultFeatures(pred).computeDataUpTo(weight) + tempdata.show(numRows = 10, truncate = false) + val wf = new OpWorkflow() .setResultFeatures(pred) .withRawFeatureFilter(Option(dataReader), Option(simpleReader), maxFillRatioDiff = 1.0) // only height and the female key of maps should meet this criteria val data = wf.computeDataUpTo(weight) + data.show(10, truncate = false) data.schema.fields.map(_.name).toSet shouldEqual Set("key", "height", "survived", "stringMap", "numericMap", "booleanMap") diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala index aad90b1a27..31fdcb4d88 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala @@ -176,7 +176,7 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { parsed.foreach { case (_, in) => math.abs(in.head._2(0)._2 + in.head._2(1)._2) < 0.00001 shouldBe true } } - it should "return the most predictive features for dat generated with a strong relation to the label" in { + it should "return the most predictive features for data generated with a strong relation to the label" in { val numRows = 1000 val countryData: Seq[Country] = RandomText.countries.withProbabilityOfEmpty(0.3).take(numRows).toList val pickListData: Seq[PickList] = RandomText.pickLists(domain = List("A", "B", "C", "D", "E", "F", "G")) @@ -252,11 +252,12 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { val otherVar = math.abs(otherIndices.map(varImportances.apply).sum) / otherIndices.size // Strengths of features "A", "B", and "C" should be much larger the other feature strengths - assert(abcAvg > 5 * otherAvg, "Average feature strengths for features involved in label formula should be" + + assert(abcAvg > 5 * otherAvg, + "Average feature strengths for features involved in label formula should be " + "much larger than the average feature strengths of other features") // There should be a really large t-value when comparing the two avg feature strengths - assert(math.abs(abcAvg - otherAvg) / math.sqrt((abcVar + otherVar)/numRows) > 10, "The t-value comparing the" + - "average feature strengths between important and other features should be large") + assert(math.abs(abcAvg - otherAvg) / math.sqrt((abcVar + otherVar)/numRows) > 10, + "The t-value comparing the average feature strengths between important and other features should be large") // Record insights averaged across all records should be similar to the feature importances from Spark's RF val rfImportances = sparkModel.getSparkMlStage().get.featureImportances @@ -267,8 +268,9 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { // Compare the ratio of importances between "important" and "other" features in both paradigms assert(math.abs(avgRecordInsightRatio - featureImportanceRatio)*2 / - (avgRecordInsightRatio + featureImportanceRatio) < 0.8, "The ratio of feature strengths between important and" + - "other features should be similar to the ratio of feature importances from Spark's RandomForest") + (avgRecordInsightRatio + featureImportanceRatio) < 0.8, + "The ratio of feature strengths between important and other features should be similar to the ratio of " + + "feature importances from Spark's RandomForest") } } From c08ac2dae20d89f9632b4628adea1c613c4ddc34 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 25 Mar 2019 21:47:14 -0700 Subject: [PATCH 09/13] Fixed OpWorkflowTest and reduced flakiness of LOCO test --- .../scala/com/salesforce/op/OpWorkflowTest.scala | 14 ++++---------- .../impl/insights/RecordInsightsLOCOTest.scala | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala index 466ea9a33b..9c2065abc5 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala @@ -44,7 +44,6 @@ import com.salesforce.op.stages.impl.tuning._ import com.salesforce.op.test.{Passenger, 
PassengerSparkFixtureTest, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} -import org.apache.log4j.Level import org.apache.spark.ml.param.{BooleanParam, ParamMap} import org.apache.spark.ml.tuning.ParamGridBuilder import org.apache.spark.rdd.RDD @@ -239,24 +238,19 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { } it should "use the raw feature filter to generate data instead of the reader when the filter is specified" in { - loggingLevel(Level.INFO) - val fv = Seq(age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap).transmogrify() val survivedNum = survived.occurs() val pred = BinaryClassificationModelSelector().setInput(survivedNum, fv).getOutput() - val tempdata = new OpWorkflow().setReader(dataReader).setResultFeatures(pred).computeDataUpTo(weight) - tempdata.show(numRows = 10, truncate = false) - val wf = new OpWorkflow() .setResultFeatures(pred) - .withRawFeatureFilter(Option(dataReader), Option(simpleReader), - maxFillRatioDiff = 1.0) // only height and the female key of maps should meet this criteria + .withRawFeatureFilter(Option(dataReader), None, maxFillRatioDiff = 1.0) val data = wf.computeDataUpTo(weight) - data.show(10, truncate = false) + // Since there are < 500 rows in the scoring set, only the training set checks are applied here, and the only + // removal reasons should be null indicator - label correlations data.schema.fields.map(_.name).toSet shouldEqual - Set("key", "height", "survived", "stringMap", "numericMap", "booleanMap") + Set("booleanMap", "description", "height", "stringMap", "age", "key", "survived", "numericMap") } it should "return a model that transforms the data correctly" in { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala index 31fdcb4d88..d73dae904e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala @@ -252,7 +252,7 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { val otherVar = math.abs(otherIndices.map(varImportances.apply).sum) / otherIndices.size // Strengths of features "A", "B", and "C" should be much larger the other feature strengths - assert(abcAvg > 5 * otherAvg, + assert(abcAvg > 4 * otherAvg, "Average feature strengths for features involved in label formula should be " + "much larger than the average feature strengths of other features") // There should be a really large t-value when comparing the two avg feature strengths From bef03dd0229197ab22a90d1d7a68ec103214125e Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Tue, 26 Mar 2019 09:46:56 -0700 Subject: [PATCH 10/13] Made minScoringRows settable, with a default --- .../main/scala/com/salesforce/op/OpWorkflow.scala | 8 ++++++-- .../salesforce/op/filters/RawFeatureFilter.scala | 13 +++++++++---- .../scala/com/salesforce/op/OpWorkflowTest.scala | 6 ++---- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala index 4f1c458bad..a92e8c3f2c 100644 --- a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala +++ b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala @@ -512,6 +512,8 @@ class OpWorkflow(val 
uid: String = UID[OpWorkflow]) extends OpWorkflowCore { * Output is the bins for the text features. * @param timePeriod Time period used to apply circulate date transformation for date features, if not * specified will use numeric feature transformation + * @param minScoringRows Minimum row threshold for scoring set comparisons to be used in checks. If the scoring + * set size is below this threshold, then only training data checks will be used * @tparam T Type of the data read in */ @Experimental @@ -529,7 +531,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { protectedFeatures: Array[OPFeature] = Array.empty, protectedJSFeatures: Array[OPFeature] = Array.empty, textBinsFormula: (Summary, Int) => Int = RawFeatureFilter.textBinsFormula, - timePeriod: Option[TimePeriod] = None + timePeriod: Option[TimePeriod] = None, + minScoringRows: Int = RawFeatureFilter.minScoringRowsDefault ): this.type = { val training = trainingReader.orElse(reader).map(_.asInstanceOf[Reader[T]]) require(training.nonEmpty, "Reader for training data must be provided either in withRawFeatureFilter or directly" + @@ -550,7 +553,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { protectedFeatures = protectedRawFeatures, jsDivergenceProtectedFeatures = protectedRawJSFeatures, textBinsFormula = textBinsFormula, - timePeriod = timePeriod) + timePeriod = timePeriod, + minScoringRows = minScoringRows) } this } diff --git a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala index 8d4cb21ef6..3acd770c00 100644 --- a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala +++ b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala @@ -82,6 +82,9 @@ import scala.util.Failure * Output is the bins for the text features. * @param timePeriod Time period used to apply circulate date transformation for date features, if * not specified will use regular numeric feature transformation + * @param minScoringRows Minimum row threshold for scoring set comparisons to be used in checks. 
If + * the scoring set size is below this threshold, then only training data checks + * will be used * @tparam T datatype of the reader */ class RawFeatureFilter[T] @@ -98,7 +101,8 @@ class RawFeatureFilter[T] val jsDivergenceProtectedFeatures: Set[String] = Set.empty, val protectedFeatures: Set[String] = Set.empty, val textBinsFormula: (Summary, Int) => Int = RawFeatureFilter.textBinsFormula, - val timePeriod: Option[TimePeriod] = None + val timePeriod: Option[TimePeriod] = None, + val minScoringRows: Int = RawFeatureFilter.minScoringRowsDefault ) extends Serializable { require(bins > 1 && bins <= FeatureDistribution.MaxBins, s"Invalid bin size $bins," + @@ -110,6 +114,7 @@ class RawFeatureFilter[T] s" maxFillRatioDiff must be greater than 0.0") require(maxJSDivergence >= 0.0 && maxJSDivergence <= 1.0, s"Invalid maxJSDivergence size $maxJSDivergence," + s" maxJSDivergence must be between 0 and 1") + require(minScoringRows >= 0, s"minRowsForScoringSet must be >= 0, but was set to $minScoringRows") ClosureUtils.checkSerializable(textBinsFormula) match { case Failure(e) => throw new IllegalArgumentException("The argument textBinsFormula must be serializable", e) @@ -301,10 +306,10 @@ class RawFeatureFilter[T] val sd = s.generateDataFrame(rawFeatures, parameters.switchReaderParams()).persist() log.info("Loaded scoring data") val scoringDataCount = sd.count() - if (scoringDataCount >= RawFeatureFilter.minRowsForScoringSet) Some(sd) + if (scoringDataCount >= minScoringRows) Some(sd) else { log.warn(s"Scoring dataset has $scoringDataCount rows, which is less than the minimum required of " + - s"${RawFeatureFilter.minRowsForScoringSet}. Only training data checks will be used.") + s"$minScoringRows. Only training data checks will be used.") None } } @@ -375,7 +380,7 @@ object RawFeatureFilter { // If there are not enough rows in the scoring set, we should not perform comparisons between the training and // scoring sets since they will not be reliable. Currently, this is set to the same as the minimum training size. 
- val minRowsForScoringSet = 500 + val minScoringRowsDefault = 500 } diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala index 9c2065abc5..994776e8a8 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala @@ -244,13 +244,11 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { val wf = new OpWorkflow() .setResultFeatures(pred) - .withRawFeatureFilter(Option(dataReader), None, maxFillRatioDiff = 1.0) + .withRawFeatureFilter(Option(dataReader), Option(simpleReader), maxFillRatioDiff = 1.0, minScoringRows = 0) val data = wf.computeDataUpTo(weight) - // Since there are < 500 rows in the scoring set, only the training set checks are applied here, and the only - // removal reasons should be null indicator - label correlations data.schema.fields.map(_.name).toSet shouldEqual - Set("booleanMap", "description", "height", "stringMap", "age", "key", "survived", "numericMap") + Set("key", "height", "survived", "stringMap", "numericMap", "booleanMap") } it should "return a model that transforms the data correctly" in { From 73d9d61c0cf9d340eda09a32603b6297d20862f1 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Tue, 26 Mar 2019 10:38:14 -0700 Subject: [PATCH 11/13] Put all the old RFF tests back in and overrode the minScoringRows when necessary --- .../op/filters/RawFeatureFilterTest.scala | 174 +++++++++++++++++- 1 file changed, 165 insertions(+), 9 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 843a1e7789..4f111b5775 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -33,7 +33,7 @@ package com.salesforce.op.filters import com.salesforce.op.{OpParams, OpWorkflow} import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike, OPFeature} import com.salesforce.op.features.types._ -import com.salesforce.op.readers.{CustomReader, ReaderKey} +import com.salesforce.op.readers.{CustomReader, DataFrameFieldNames, ReaderKey} import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer import com.salesforce.op.test._ import com.salesforce.op.testkit._ @@ -43,6 +43,7 @@ import com.salesforce.op.utils.spark.RichDataset._ import org.apache.log4j.Level import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Row, SparkSession} +import com.twitter.algebird.Operators._ import org.junit.runner.RunWith import org.scalatest.{Assertion, FlatSpec} import org.scalatest.junit.JUnitRunner @@ -153,13 +154,128 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with excludedBothAllMK shouldBe empty } + it should "correctly clean the dataframe returned and give the features to blacklist" in { + val params = new OpParams() + val survPred = survived.copy(isResponse = false) + val features: Array[OPFeature] = + Array(survPred, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.0, 1.0, Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) + val filteredRawData = filter.generateFilteredRaw(features, params) + filteredRawData.featuresToDrop shouldBe empty + filteredRawData.mapKeysToDrop shouldBe empty + 
filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields + + assertFeatureDistributions(filteredRawData, total = 26) + + val filter1 = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) + val filteredRawData1 = filter1.generateFilteredRaw(features, params) + filteredRawData1.featuresToDrop should contain theSameElementsAs Array(survPred) + filteredRawData1.mapKeysToDrop should contain theSameElementsAs Map( + "numericMap" -> Set("Male"), "booleanMap" -> Set("Male"), "stringMap" -> Set("Male")) + filteredRawData1.cleanedData.schema.fields.exists(_.name == survPred.name) shouldBe false + filteredRawData1.cleanedData.collect(stringMap).foreach(m => + if (m.nonEmpty) m.value.keySet shouldEqual Set("Female")) + assertFeatureDistributions(filteredRawData, total = 26) + } + + it should "not drop response features" in { + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) + val filteredRawData = filter.generateFilteredRaw(features, params) + filteredRawData.featuresToDrop shouldBe empty + filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields + filteredRawData.cleanedData.collect(stringMap) + .foreach(m => if (m.nonEmpty) m.value.keySet shouldEqual Set("Female")) + assertFeatureDistributions(filteredRawData, total = 26) + } + + it should "not drop protected features" in { + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.1, 0.1, 2, 0.2, 0.9, minScoringRows = 0) + val filteredRawData = filter.generateFilteredRaw(features, params) + filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) + filteredRawData.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs + Array(DataFrameFieldNames.KeyFieldName, survived.name) + assertFeatureDistributions(filteredRawData, total = 14) + + val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), + 10, 0.1, 0.1, 2, 0.2, 0.9, minScoringRows = 0, + protectedFeatures = Set(age.name, gender.name)) + val filteredRawData2 = filter2.generateFilteredRaw(features, params) + filteredRawData2.featuresToDrop.toSet shouldEqual Set(height, weight, description, boarded) + filteredRawData2.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs + Array(DataFrameFieldNames.KeyFieldName, survived.name, age.name, gender.name) + assertFeatureDistributions(filteredRawData, total = 14) + } + + it should "not drop JS divergence-protected features based on JS divergence check" in { + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, boardedTime, boardedTimeAsDateTime) + val filter = new RawFeatureFilter( + trainingReader = dataReader, + scoringReader = Some(simpleReader), + bins = 10, + minFill = 0.0, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 0.0, + maxCorrelation = 1.0, + jsDivergenceProtectedFeatures = Set(boardedTime.name, boardedTimeAsDateTime.name), + minScoringRows = 0 + ) + + val 
filteredRawData = filter.generateFilteredRaw(features, params) + filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) + filteredRawData.cleanedData.schema.fields.map(_.name) should contain theSameElementsAs + Seq(DataFrameFieldNames.KeyFieldName, survived.name, boardedTime.name, boardedTimeAsDateTime.name) + assertFeatureDistributions(filteredRawData, total = 18) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.9" in { + val expectedDropped = Seq(boarded, weight, gender) + val expectedMapKeys = Seq("Female", "Male") + val expectedDroppedMapKeys = Map[String, Set[String]]() + nullLabelCorrelationTest(0.9, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.6" in { + val expectedDropped = Seq(boarded, weight, gender, age) + val expectedMapKeys = Seq("Female", "Male") + val expectedDroppedMapKeys = Map[String, Set[String]]() + nullLabelCorrelationTest(0.6, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.4" in { + val expectedDropped = Seq(boarded, weight, gender, age, description) + val expectedMapKeys = Seq("Male") + val expectedDroppedMapKeys = Map("booleanMap" -> Set("Female"), "stringMap" -> Set("Female"), + "numericMap" -> Set("Female")) + nullLabelCorrelationTest(0.4, expectedDropped, expectedMapKeys, expectedDroppedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.3" in { + val expectedDropped = Seq(boarded, weight, gender, age, description, booleanMap, numericMap, stringMap) + // all the maps dropped + val expectedDroppedMapKeys = Map[String, Set[String]]() + nullLabelCorrelationTest(0.3, expectedDropped, Seq(), expectedDroppedMapKeys) + } + /** * This test uses several data generators to generate data according to different distributions, makes a reader * corresponding to the generated dataframe, and then uses that as both the training and scoring reader in * RawFeatureFilter. Not only should no features be removed, but the training and scoring distributions should be * identical. */ - it should "not remove any features when the training and scoring sets are identical" in { + it should "not remove any features when the training and scoring sets are identical generated data" in { // Define random generators that will be the same for training and scoring dataframes val cityGenerator = RandomText.cities.withProbabilityOfEmpty(0.2) val countryGenerator = RandomText.countries.withProbabilityOfEmpty(0.2) @@ -202,7 +318,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * are used. * */ - it should "correctly clean the dataframe containing map and non-map features due to min fill rate" in { + it should "correctly clean randomly generated map and non-map features due to min fill rate" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) @@ -273,7 +389,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * fill rate difference of 0.6. 
The RawFeatureFilter is set up with a maximum absolute fill rate of 0.4 so both * c2 and c3 (as well as their corresponding map keys f2 & f3) should be removed. */ - it should "correctly clean the dataframe containing map and non-map features due to max absolute fill rate " + + it should "correctly clean the randomly generated map and non-map features due to max absolute fill rate " + "difference" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) @@ -330,7 +446,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * fill rate difference of 0.25, and a relative fill ratio difference of 6. The RawFeatureFilter is set up with a * maximum fill ratio difference of 4 so both c2 and c3 (as well as their corresponding map keys) should be removed. */ - it should "correctly clean the dataframe containing map and non-map features due to max fill ratio " + + it should "correctly clean the randomly generated map and non-map features due to max fill ratio " + "difference" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.0) @@ -386,7 +502,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * divergence (practically, 1.0). The RawFeatureFilter is set up with a maximum JS divergence of 0.8, so both * f1 and f3 (as well as their corresponding map keys) should be removed. */ - it should "correctly clean the dataframe containing map and non-map features due to JS divergence" in { + it should "correctly clean the randomly generated map and non-map features due to JS divergence" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0) @@ -431,7 +547,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with assertFeatureDistributions(filteredRawData, total = 12) } - it should "not drop protected raw features or response features" in { + it should "not drop protected raw features or response features from generated data" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator25 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.25) val currencyGenerator95 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.95) @@ -520,7 +636,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * c1 and c3 (as well as their corresponding map keys) should be removed, but c3 is added to a list of features * protected from JS divergence removal. 
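As background for the JS divergence checks documented in these test comments: the statistic compared against the maxJSDivergence threshold is the Jensen-Shannon divergence between the binned training and scoring distributions. A minimal illustrative Scala sketch, assuming base-2 logarithms and equal-length bin arrays; this is a standalone approximation, not the library's implementation:

object JSDivergenceSketch {
  // KL contribution of one bin; 0 * log(0/q) is taken as 0 by convention.
  private def klTerm(p: Double, q: Double): Double =
    if (p == 0.0) 0.0 else p * math.log(p / q) / math.log(2.0)

  def jsDivergence(p: Array[Double], q: Array[Double]): Double = {
    val m = p.zip(q).map { case (a, b) => (a + b) / 2.0 } // mixture distribution
    val klPM = p.zip(m).map { case (a, b) => klTerm(a, b) }.sum
    val klQM = q.zip(m).map { case (a, b) => klTerm(a, b) }.sum
    (klPM + klQM) / 2.0
  }

  def main(args: Array[String]): Unit = {
    println(jsDivergence(Array(0.5, 0.5), Array(0.5, 0.5))) // 0.0 for identical histograms
    println(jsDivergence(Array(1.0, 0.0), Array(0.0, 1.0))) // 1.0 for disjoint ones, tripping a 0.8 cutoff
  }
}

This is why generators with widely separated log-normal means drive the divergence to its maximum of 1.0 in the tests above.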
*/ - it should "not drop JS divergence-protected features based on JS divergence check" in { + it should "not drop JS divergence-protected features based on JS divergence check with generated data" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 1.0, sigma = 5.0).withProbabilityOfEmpty(0.1) val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 5.0).withProbabilityOfEmpty(0.1) @@ -582,7 +698,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with * otherwise it is 1. Therefore feature c2 (and its corresponding map key) should be removed by the correlation * check between a raw feature's null indicator and the label. */ - it should "correctly drop features based on null-label correlations" in { + it should "correctly drop features based on null-label correlations with generated data" in { // Define random generators that will be the same for training and scoring dataframes val currencyGenerator1 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) val currencyGenerator2 = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).withProbabilityOfEmpty(0.3) @@ -728,6 +844,46 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with } } + private def nullLabelCorrelationTest( + maxCorrelation: Double, + expectedDropped: Seq[OPFeature], + expectedMapKeys: Seq[String], + expectedDroppedMapKeys: Map[String, Set[String]] + ): Unit = { + def getFilter(maxCorrelation: Double): RawFeatureFilter[Passenger] = new RawFeatureFilter( + trainingReader = dataReader, + scoringReader = Some(simpleReader), + bins = 10, + minFill = 0.0, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 1.0, + maxCorrelation = maxCorrelation, + minScoringRows = 0) + + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) + val filteredRawData@FilteredRawData(df, dropped, droppedKeyValue, _) = + getFilter(maxCorrelation).generateFilteredRaw(features, params) + + assertFeatureDistributions(filteredRawData, total = 26) + dropped should contain theSameElementsAs expectedDropped + droppedKeyValue should contain theSameElementsAs expectedDroppedMapKeys + + df.schema.fields.map(_.name) should contain theSameElementsAs + DataFrameFieldNames.KeyFieldName +: features.diff(dropped).map(_.name) + if (expectedMapKeys.nonEmpty) { + df.collect(booleanMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + df.collect(numericMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + df.collect(stringMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + } else { + intercept[IllegalArgumentException] { df.collect(booleanMap) } + intercept[IllegalArgumentException] { df.collect(numericMap) } + intercept[IllegalArgumentException] { df.collect(stringMap) } + } + } + /** * Defines readers in terms of datasets (in these tests, already created by feature generators) * From 94f856a1ced10d8ef4f09160eecdc2818f9d5d5f Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Thu, 28 Mar 2019 14:21:24 -0700 Subject: [PATCH 12/13] Reduced number of rows generated down to 500 (from 1000) to speed things up --- .../com/salesforce/op/filters/RawFeatureFilterTest.scala | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 51b230a5c1..40f1af2995 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -52,14 +52,14 @@ import scala.reflect.runtime.universe.TypeTag @RunWith(classOf[JUnitRunner]) class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { - + // loggingLevel(Level.INFO) // Our randomly generated data will generate feature names and corresponding map keys in this universe val featureUniverse = Set("myF1", "myF2", "myF3") val mapKeyUniverse = Set("f1", "f2", "f3") // Number of rows to use in randomly generated data sets - val numRows = 1000 + val numRows = 500 Spec[RawFeatureFilter[_]] should "correctly compute feature stats" in { val features: Array[OPFeature] = From bee777be46cc7bc214336673ffce88e862848f94 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Thu, 28 Mar 2019 14:58:04 -0700 Subject: [PATCH 13/13] Fixed scalastyle issues and made a test less likely to be flaky --- .../salesforce/op/filters/RawFeatureFilterTest.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index 40f1af2995..57d26d5332 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -242,7 +242,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, - Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) + Double.PositiveInfinity, 1.0, 1.0, minScoringRows = 0) val filteredRawData = filter.generateFilteredRaw(features, params) filteredRawData.featuresToDrop shouldBe empty filteredRawData.cleanedData.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields @@ -255,7 +255,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader),10, 0.1, 0.1, + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9, minScoringRows = 0) val filteredRawData = filter.generateFilteredRaw(features, params) filteredRawData.featuresToDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) @@ -263,7 +263,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with Array(DataFrameFieldNames.KeyFieldName, survived.name) assertFeatureDistributions(filteredRawData, total = 14) - val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader),10, 0.1, 0.1, + val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9, minScoringRows = 0, protectedFeatures = Set(age.name, gender.name)) val filteredRawData2 = filter2.generateFilteredRaw(features, params) @@ -501,7 +501,7 @@ class RawFeatureFilterTest extends FlatSpec with 
PassengerSparkFixtureTest with * * Features c2 & c3 are switched between the training and scoring sets, so that they should have an absolute * fill rate difference of 0.25, and a relative fill ratio difference of 6. The RawFeatureFilter is set up with a - * maximum fill ratio difference of 4 so both c2 and c3 (as well as their corresponding map keys) should be removed. + * maximum fill ratio difference of 3 so both c2 and c3 (as well as their corresponding map keys) should be removed. */ it should "correctly clean the randomly generated map and non-map features due to max fill ratio " + "difference" in { @@ -533,7 +533,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val params = new OpParams() // We should be able to set the features to either be the train features or the score ones here val features: Array[OPFeature] = Array(c1, c2, c3, mapFeatureRaw) - val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 1.0, 4.0, 1.0, 1.0) + val filter = new RawFeatureFilter(trainReader, Some(scoreReader), 10, 0.0, 1.0, 3.0, 1.0, 1.0) val filteredRawData = filter.generateFilteredRaw(features, params) // TODO: Add a check for the reason dropped once that information is passed on to the workflow
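Net effect of the series for callers: one new, optional knob on both RawFeatureFilter and OpWorkflow.withRawFeatureFilter. A hedged usage sketch, assuming dataReader, simpleReader, pred, and weight as defined in OpWorkflowTest above (the call shape mirrors the updated test, not any additional API surface):

val wf = new OpWorkflow()
  .setResultFeatures(pred)
  .withRawFeatureFilter(Option(dataReader), Option(simpleReader),
    maxFillRatioDiff = 1.0,
    minScoringRows = 0) // lift the 500-row default so the small fixture's scoring checks still run
val data = wf.computeDataUpTo(weight)

Leaving minScoringRows at its default (RawFeatureFilter.minScoringRowsDefault, i.e. 500) keeps the production behavior: undersized scoring sets are ignored and only training-data checks apply.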