
Add minimum rows for scoring set in RawFeatureFilter + rewrite tests to use data generators #250

Merged Mar 28, 2019 (15 commits).
Changes from 10 commits
```diff
@@ -300,9 +300,11 @@ class RawFeatureFilter[T]
     val scoreData = scoringReader.flatMap { s =>
       val sd = s.generateDataFrame(rawFeatures, parameters.switchReaderParams()).persist()
       log.info("Loaded scoring data")
-      if (sd.count() > 0) Some(sd)
+      val scoringDataCount = sd.count()
+      if (scoringDataCount >= RawFeatureFilter.minRowsForScoringSet) Some(sd)
       else {
-        log.warn("Scoring dataset was empty. Only training data checks will be used.")
+        log.warn(s"Scoring dataset has $scoringDataCount rows, which is less than the minimum required of " +
+          s"${RawFeatureFilter.minRowsForScoringSet}. Only training data checks will be used.")
         None
       }
     }
```
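The count-then-guard pattern in the hunk above can be sketched in plain Scala, with an in-memory sequence standing in for the Spark DataFrame. The 500-row minimum and the names mirror the diff, but this is an illustrative sketch, not the actual TransmogrifAI code:

```scala
// Sketch of the new guard: keep the scoring set only when it has at least
// minRowsForScoringSet rows; otherwise fall back to training-data-only checks.
val minRowsForScoringSet = 500

def guardScoringSet[A](scoringData: Option[Seq[A]]): Option[Seq[A]] =
  scoringData.flatMap { sd =>
    val scoringDataCount = sd.size
    if (scoringDataCount >= minRowsForScoringSet) Some(sd)
    else {
      // Too few rows for reliable training-vs-scoring comparisons.
      println(s"Scoring dataset has $scoringDataCount rows, below the minimum of " +
        s"$minRowsForScoringSet. Only training data checks will be used.")
      None
    }
  }
```

Note that in the real code `sd.count()` is called once and the result reused, so the (potentially expensive) Spark action on the persisted DataFrame is not triggered twice.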
```diff
@@ -371,6 +373,10 @@ object RawFeatureFilter {
     bins
   }

+  // If there are not enough rows in the scoring set, we should not perform comparisons between the training and
+  // scoring sets since they will not be reliable. Currently, this is set to the same as the minimum training size.
+  val minRowsForScoringSet = 500
+
 }
```

Collaborator: Let's not make this hardcoded; it should be a parameter that the user can override.

Contributor (author): Good point, fixed.
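The reviewer's suggestion could be realized by giving the constant a user-overridable home, with the current hardcoded value as the default. A hypothetical sketch, with illustrative names that are not TransmogrifAI's actual API:

```scala
// Hypothetical: expose the minimum scoring-set size as a parameter instead of
// a hardcoded constant, defaulting to the value added in this commit.
object RawFeatureFilterDefaults {
  val MinRowsForScoringSet: Int = 500 // same as the minimum training size
}

class ScoringSetGuard(
  val minRowsForScoringSet: Int = RawFeatureFilterDefaults.MinRowsForScoringSet
) {
  // True when the scoring set is large enough for training-vs-scoring comparisons.
  def scoringChecksEnabled(scoringDataCount: Long): Boolean =
    scoringDataCount >= minRowsForScoringSet
}
```

A test could then pin the parameter low (e.g. `new ScoringSetGuard(minRowsForScoringSet = 1)`) so that a small fixture still exercises the scoring-set checks, which is what the review thread on the test below asks for.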

core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala (8 changes: 5 additions & 3 deletions)
```diff
@@ -241,14 +241,16 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest {
     val fv = Seq(age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap).transmogrify()
     val survivedNum = survived.occurs()
     val pred = BinaryClassificationModelSelector().setInput(survivedNum, fv).getOutput()

     val wf = new OpWorkflow()
       .setResultFeatures(pred)
-      .withRawFeatureFilter(Option(dataReader), Option(simpleReader),
-        maxFillRatioDiff = 1.0) // only height and the female key of maps should meet this criteria
+      .withRawFeatureFilter(Option(dataReader), None, maxFillRatioDiff = 1.0)
     val data = wf.computeDataUpTo(weight)
+
+    // Since there are < 500 rows in the scoring set, only the training set checks are applied here, and the only
+    // removal reasons should be null indicator - label correlations
     data.schema.fields.map(_.name).toSet shouldEqual
-      Set("key", "height", "survived", "stringMap", "numericMap", "booleanMap")
+      Set("booleanMap", "description", "height", "stringMap", "age", "key", "survived", "numericMap")
   }

   it should "return a model that transforms the data correctly" in {
```

Collaborator: Once things are parameterized, let's set them so that the tests remain the same.

Contributor (author): Fixed.

Collaborator: I was just about to suggest the same ;)