salesforce · tovbinm · Mar 28, 2019 · Mar 15, 2019 · Mar 21, 2019 · Mar 25, 2019
@@ -514,6 +514,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
    *                           Output is the bins for the text features.
    * @param timePeriod         Time period used to apply circulate date transformation for date features, if not
    *                           specified will use numeric feature transformation
+   * @param minScoringRows     Minimum row threshold for scoring set comparisons to be used in checks. If the scoring
+   *                           set size is below this threshold, then only training data checks will be used
    * @tparam T Type of the data read in
    */
   @Experimental
@@ -531,7 +533,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
     protectedFeatures: Array[OPFeature] = Array.empty,
     protectedJSFeatures: Array[OPFeature] = Array.empty,
     textBinsFormula: (Summary, Int) => Int = RawFeatureFilter.textBinsFormula,
-    timePeriod: Option[TimePeriod] = None
+    timePeriod: Option[TimePeriod] = None,
+    minScoringRows: Int = RawFeatureFilter.minScoringRowsDefault
   ): this.type = {
     val training = trainingReader.orElse(reader).map(_.asInstanceOf[Reader[T]])
     require(training.nonEmpty, "Reader for training data must be provided either in withRawFeatureFilter or directly" +
@@ -552,7 +555,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
         protectedFeatures = protectedRawFeatures,
         jsDivergenceProtectedFeatures = protectedRawJSFeatures,
         textBinsFormula = textBinsFormula,
-        timePeriod = timePeriod)
+        timePeriod = timePeriod,
+        minScoringRows = minScoringRows)
     }
     this
   }

@@ -82,6 +82,9 @@ import scala.util.Failure
  *                                      Output is the bins for the text features.
  * @param timePeriod                    Time period used to apply circulate date transformation for date features, if
  *                                      not specified will use regular numeric feature transformation
+ * @param minScoringRows                Minimum row threshold for scoring set comparisons to be used in checks. If
+ *                                      the scoring set size is below this threshold, then only training data checks
+ *                                      will be used
  * @tparam T datatype of the reader
  */
 class RawFeatureFilter[T]
@@ -98,7 +101,8 @@ class RawFeatureFilter[T]
   val jsDivergenceProtectedFeatures: Set[String] = Set.empty,
   val protectedFeatures: Set[String] = Set.empty,
   val textBinsFormula: (Summary, Int) => Int = RawFeatureFilter.textBinsFormula,
-  val timePeriod: Option[TimePeriod] = None
+  val timePeriod: Option[TimePeriod] = None,
+  val minScoringRows: Int = RawFeatureFilter.minScoringRowsDefault
 ) extends Serializable {
 
   require(bins > 1 && bins <= FeatureDistribution.MaxBins, s"Invalid bin size $bins," +
@@ -110,6 +114,7 @@ class RawFeatureFilter[T]
     s" maxFillRatioDiff must be greater than 0.0")
   require(maxJSDivergence >= 0.0 && maxJSDivergence <= 1.0, s"Invalid maxJSDivergence size $maxJSDivergence," +
     s" maxJSDivergence must be between 0 and 1")
+  require(minScoringRows >= 0, s"minScoringRows must be >= 0, but was set to $minScoringRows")
 
   ClosureUtils.checkSerializable(textBinsFormula) match {
     case Failure(e) => throw new IllegalArgumentException("The argument textBinsFormula must be serializable", e)
@@ -490,9 +495,11 @@ class RawFeatureFilter[T]
     val scoreData = scoringReader.flatMap { s =>
       val sd = s.generateDataFrame(rawFeatures, parameters.switchReaderParams()).persist()
       log.info("Loaded scoring data")
-      if (sd.count() > 0) Some(sd)
+      val scoringDataCount = sd.count()
+      if (scoringDataCount > 0 && scoringDataCount >= minScoringRows) Some(sd) // never compare against empty data
       else {
-        log.warn("Scoring dataset was empty. Only training data checks will be used.")
+        log.warn(s"Scoring dataset has $scoringDataCount rows, which is less than the minimum required of " +
+          s"${math.max(minScoringRows, 1)}. Only training data checks will be used.")
         None
       }
     }
@@ -581,6 +588,10 @@ object RawFeatureFilter {
     bins
   }
 
+  // Default for minScoringRows: with too few scoring rows, comparisons between the training and scoring sets
+  // are not reliable and are skipped. Currently, this is set to the same as the minimum training size.
+  val minScoringRowsDefault: Int = 500
+
 }
 
 /**

@@ -241,10 +241,10 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest {
     val fv = Seq(age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap).transmogrify()
     val survivedNum = survived.occurs()
     val pred = BinaryClassificationModelSelector().setInput(survivedNum, fv).getOutput()
+
     val wf = new OpWorkflow()
       .setResultFeatures(pred)
-      .withRawFeatureFilter(Option(dataReader), Option(simpleReader),
-        maxFillRatioDiff = 1.0) // only height and the female key of maps should meet this criteria
+      .withRawFeatureFilter(Option(dataReader), Option(simpleReader), maxFillRatioDiff = 1.0, minScoringRows = 0)
     val data = wf.computeDataUpTo(weight)
 
     data.schema.fields.map(_.name).toSet shouldEqual