Allow TextStats length distribution to be token-based and refactor for testability #464

Merged
merged 28 commits into from Mar 26, 2020

Commits (28)
1c2cbc2
Refactored and added more incremental tests
Jauntbox Mar 3, 2020
83881c0
Updated test
Jauntbox Mar 3, 2020
ca2e122
Added tests and fixed a small bug
Jauntbox Mar 4, 2020
c190d97
More refactoring and updating tests
Jauntbox Mar 4, 2020
75b0770
More test refactoring
Jauntbox Mar 5, 2020
305c57e
More refactoring
Jauntbox Mar 5, 2020
ff53dc4
Small cleanups
Jauntbox Mar 5, 2020
13b6076
Merge branch 'master' of github.com:salesforce/TransmogrifAI into km/…
Jauntbox Mar 6, 2020
fc6cd07
Addressing comments
Jauntbox Mar 6, 2020
1434128
Added text length distribution to the TextStats calculated in RFF
Jauntbox Mar 9, 2020
e117fbd
Made offending methods private
Jauntbox Mar 16, 2020
87878e0
Merge branch 'master' of github.com:salesforce/TransmogrifAI into km/…
Jauntbox Mar 16, 2020
fe36ec5
Comments
Jauntbox Mar 17, 2020
cd31e32
Spelling
Jauntbox Mar 17, 2020
a322363
Added toggle for tokenization in length distribution
Jauntbox Mar 18, 2020
32ad893
Added toggle to turn tokenization on/off for length distribution coun…
Jauntbox Mar 19, 2020
ab9d2a7
Reverted changes to RFF for now and added logging to help with visibi…
Jauntbox Mar 19, 2020
4cb5c88
Updated tests to check both tokenized and non-tokenized text feature …
Jauntbox Mar 19, 2020
e463685
Better logging
Jauntbox Mar 19, 2020
e44ca68
Revert unintentional RFF changes
Jauntbox Mar 19, 2020
ce5663e
Removed unused method
Jauntbox Mar 19, 2020
b866bb3
Removed SVC models from the default models to try in BinaryClassifica…
Jauntbox Mar 19, 2020
e72af36
Added new params to vectorizer shortcuts
Jauntbox Mar 20, 2020
307a014
scalastyle issue
Jauntbox Mar 20, 2020
49127d9
Replaced boolean param with enum
Jauntbox Mar 26, 2020
95bc3e7
Added enum to json4s serialization list
Jauntbox Mar 26, 2020
aad13ba
Actually add the enum file
Jauntbox Mar 26, 2020
d78868d
Merge branch 'master' of github.com:salesforce/TransmogrifAI into km/…
Jauntbox Mar 26, 2020
Changes from all commits
16 changes: 16 additions & 0 deletions core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala
@@ -273,6 +273,10 @@ trait RichMapFeature {
* @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or
* failed to make a good enough prediction.
* @param hashAlgorithm hash algorithm to use
* @param textLengthType Method to use for constructing text length distribution in TextStats. Current
* options are from the full entry or from the tokens
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
* be hashed instead of ignored
* @param others additional text features
* @return result feature of type Vector
*/
@@ -298,6 +302,8 @@
hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy,
defaultLanguage: Language = TextTokenizer.DefaultLanguage,
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
others: Array[FeatureLike[TextMap]] = Array.empty
): FeatureLike[OPVector] = {
// scalastyle:on parameter.number
@@ -322,6 +328,8 @@
.setHashSpaceStrategy(hashSpaceStrategy)
.setHashAlgorithm(hashAlgorithm)
.setBinaryFreq(binaryFreq)
.setTextLengthType(textLengthType)
.setMinLengthStdDev(minLengthStdDev)
.getOutput()
}
}
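// A usage sketch, not part of this diff: the new settings wired directly on the underlying
// vectorizer. `textMap` and the 0.075 threshold are hypothetical placeholders;
// TextLengthType.Tokens is the token-based option introduced in this PR.
val sketchVectorized: FeatureLike[OPVector] = new SmartTextMapVectorizer[TextMap]()
  .setInput(textMap)                        // hypothetical FeatureLike[TextMap]
  .setTextLengthType(TextLengthType.Tokens) // build the length distribution from token lengths
  .setMinLengthStdDev(0.075)                // fields whose token-length std dev falls below this are ignored
  .getOutput()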
@@ -418,6 +426,10 @@ trait RichMapFeature {
* @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or
* failed to make a good enough prediction.
* @param hashAlgorithm hash algorithm to use
* @param textLengthType Method to use for constructing text length distribution in TextStats. Current
* options are from the full entry or from the tokens
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
* be hashed instead of ignored
* @param others additional text features
* @return result feature of type Vector
*/
@@ -443,6 +455,8 @@
hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy,
defaultLanguage: Language = TextTokenizer.DefaultLanguage,
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
others: Array[FeatureLike[TextAreaMap]] = Array.empty
): FeatureLike[OPVector] = {
// scalastyle:on parameter.number
@@ -467,6 +481,8 @@
.setHashSpaceStrategy(hashSpaceStrategy)
.setHashAlgorithm(hashAlgorithm)
.setBinaryFreq(binaryFreq)
.setTextLengthType(textLengthType)
.setMinLengthStdDev(minLengthStdDev)
.getOutput()
}
}
@@ -211,6 +211,10 @@ trait RichTextFeature {
* @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or
* failed to make a good enough prediction.
* @param hashAlgorithm hash algorithm to use
* @param textLengthType Method to use for constructing text length distribution in TextStats. Current
* options are from the full entry or from the tokens
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
* be hashed instead of ignored
* @param others additional text features
* @return result feature of type Vector
*/
@@ -235,6 +239,8 @@
hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy,
defaultLanguage: Language = TextTokenizer.DefaultLanguage,
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
others: Array[FeatureLike[T]] = Array.empty
): FeatureLike[OPVector] = {
// scalastyle:on parameter.number
@@ -258,6 +264,8 @@
.setHashSpaceStrategy(hashSpaceStrategy)
.setHashAlgorithm(hashAlgorithm)
.setBinaryFreq(binaryFreq)
.setTextLengthType(textLengthType)
.setMinLengthStdDev(minLengthStdDev)
.getOutput()
}

@@ -221,14 +221,15 @@ object FeatureDistribution {
/**
* Facilitates feature distribution retrieval from computed feature summaries
*
* @param featureKey feature key
* @param summary feature summary
* @param value optional processed sequence
* @param bins number of histogram bins
* @param textBinsFormula formula to compute the text features bin size.
* Input arguments are [[Summary]] and number of bins to use in computing feature
* distributions (histograms for numerics, hashes for strings). Output is the bins for
* the text features.
* @param `type` feature distribution type: training or scoring
* @return feature distribution given the provided information
*/
private[op] def fromSummary(
@@ -245,7 +246,8 @@
.getOrElse(1L -> (Array(summary.min, summary.max, summary.sum, summary.count) -> new Array[Double](bins)))

val moments = value.map(momentsValues)
// Tokenization is already done during prep phase, so we can skip text cleaning and tokenization here (for now)
val cardEstimate = value.map(cardinalityValues(_, shouldCleanText = false, tokenizeForLengths = false))

FeatureDistribution(
name = name,
@@ -268,25 +270,36 @@
*/
private def momentsValues(values: ProcessedSeq): Moments = {
val population = values match {
case Left(seq) => seq.map(_.length.toDouble)
case Right(seq) => seq
}
MomentsGroup.sum(population.map(x => Moments(x)))
}
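// A worked example of the aggregation above, not part of this diff: combining per-value
// Moments recovers the distribution statistics, e.g. for three token lengths 3, 5 and 7:
private def momentsExample: Moments = MomentsGroup.sum(Seq(3.0, 5.0, 7.0).map(Moments(_)))
// momentsExample.mean == 5.0 and momentsExample.variance == 8.0 / 3.0 (population variance),
// so the standard deviation of the token lengths is about 1.63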

/**
* Function to track frequency of the first $(MaxCardinality) unique values
* (number for numeric features, token for text features). Note that shouldCleanText and tokenizeForLengths are
* exposed, but not relevant yet because tokenization always happens before this, during data preparation.
*
* @param values values to track distribution / frequency
* @param shouldCleanText whether or not to clean text for TextStats calculation
* @param tokenizeForLengths whether or not to tokenize strings before computing length distribution
* @return TextStats object containing a Map from a value to its frequency (histogram)
*/
private def cardinalityValues(values: ProcessedSeq, shouldCleanText: Boolean,
tokenizeForLengths: Boolean): TextStats = {
implicit val testStatsMonoid: Monoid[TextStats] = TextStats.monoid(RawFeatureFilter.MaxCardinality)

val stringVals = values match {
case Left(stringSeq) => stringSeq
case Right(doubleSeq) => doubleSeq.map(_.toString)
}
stringVals.foldLeft(TextStats.empty)((acc, el) => acc + TextStats.textStatsFromString(
textString = el,
shouldCleanText = shouldCleanText,
shouldTokenize = tokenizeForLengths,
maxCardinality = RawFeatureFilter.MaxCardinality)
)
}

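// A sketch of the length-counting semantics toggled above, not part of this diff; whitespace
// splitting stands in for the real TextTokenizer:
private def lengthCountsSketch(text: String, tokenize: Boolean): Map[Int, Long] = {
  val lengths =
    if (tokenize) text.split("\\s+").toSeq.filter(_.nonEmpty).map(_.length) // token lengths
    else Seq(text.length)                                                   // full-entry length
  lengths.groupBy(identity).map { case (len, group) => len -> group.size.toLong }
}
// lengthCountsSketch("red car", tokenize = true)  == Map(3 -> 2)
// lengthCountsSketch("red car", tokenize = false) == Map(7 -> 1)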
@@ -600,7 +600,6 @@ object RawFeatureFilter {
val minScoringRowsDefault = 500
val MaxCardinality = 500


val stageName = classOf[RawFeatureFilter[_]].getSimpleName

val uid = s"${stageName}_100000000000"
@@ -56,10 +56,11 @@ case object BinaryClassificationModelSelector extends ModelSelectorFactory {
/**
* Subset of models to use in model selector
*
* Note: [[OpNaiveBayes]], [[OpDecisionTreeClassifier]], [[OpLinearSVC]], and [[OpXGBoostClassifier]] are
* off by default
*/
val modelTypesToUse: Seq[BinaryClassificationModelsToTry] = Seq(
MTT.OpLogisticRegression, MTT.OpRandomForestClassifier, MTT.OpGBTClassifier
)

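// A sketch, not part of this diff: with OpLinearSVC off by default, callers can opt back in
// explicitly. Assumes the factory exposes a modelTypesToUse argument, as the naming above
// suggests; check the ModelSelectorFactory signature before relying on it.
//   val selector = BinaryClassificationModelSelector.withCrossValidation(
//     modelTypesToUse = Seq(MTT.OpLogisticRegression, MTT.OpLinearSVC)
//   )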
@@ -67,17 +67,6 @@ class SmartTextMapVectorizer[T <: OPMap[String]]

private implicit val textMapStatsSeqEnc: Encoder[Array[TextMapStats]] = ExpressionEncoder[Array[TextMapStats]]()

private def makeHashingParams() = HashingFunctionParams(
hashWithIndex = $(hashWithIndex),
prependFeatureName = $(prependFeatureName),
@@ -166,6 +155,18 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
}
}

logInfo("TextStats for features used in SmartTextMapVectorizer:")
inN.map(_.name).zip(aggregatedStats).zip(allFeatureInfo).foreach { case ((mapName, mapStats), featInfo) =>
logInfo(s"FeatureMap: $mapName")
mapStats.keyValueCounts.zip(featInfo).foreach { case ((name, stats), info) =>
logInfo(s"Key: $name")
logInfo(s"LengthCounts: ${stats.lengthCounts}")
logInfo(s"LengthMean: ${stats.lengthMean}")
logInfo(s"LengthStdDev: ${stats.lengthStdDev}")
logInfo(s"Vectorization method: ${info.vectorizationMethod}")
}
}

SmartTextMapVectorizerModelArgs(
allFeatureInfo = allFeatureInfo,
shouldCleanKeys = shouldCleanKeys,
@@ -181,10 +182,12 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
val maxCard = $(maxCardinality)
val shouldCleanKeys = $(cleanKeys)
val shouldCleanValues = $(cleanText)
val shouldTokenize = $(textLengthType) == TextLengthType.Tokens.entryName

implicit val testStatsMonoid: Monoid[TextMapStats] = TextMapStats.monoid(maxCard)
val valueStats: Dataset[Array[TextMapStats]] = dataset.map(
_.map(TextMapStats.computeTextMapStats(_, shouldCleanKeys, shouldCleanValues, shouldTokenize,
maxCard)).toArray
)
val aggregatedStats: Array[TextMapStats] = valueStats.reduce(_ + _)

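// For reference, a sketch of the TextLengthType enum relied on above, not part of this diff.
// Tokens appears in this PR; the FullEntry member name is inferred from the scaladoc ("from
// the full entry or from the tokens") and the enum file added in commit aad13ba.
//   import enumeratum.{Enum, EnumEntry}
//   sealed abstract class TextLengthType extends EnumEntry with Serializable
//   object TextLengthType extends Enum[TextLengthType] {
//     val values: IndexedSeq[TextLengthType] = findValues
//     case object FullEntry extends TextLengthType // length of the entire entry (inferred name)
//     case object Tokens extends TextLengthType    // lengths of individual tokens
//   }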
@@ -210,13 +213,37 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
*/
private[op] case class TextMapStats(keyValueCounts: Map[String, TextStats]) extends JsonLike

private[op] object TextMapStats extends CleanTextFun {

def monoid(maxCardinality: Int): Monoid[TextMapStats] = {
implicit val testStatsMonoid: Monoid[TextStats] = TextStats.monoid(maxCardinality)
caseclass.monoid[TextMapStats]
}

/**
* Computes a TextMapStats instance from a text map entry
*
* @param textMap Text value (e.g. an entry in a dataframe)
* @param shouldCleanKeys Whether or not the keys (feature names) should be cleaned
* @param shouldCleanValues Whether or not the values (the actual text) should be cleaned
* @param shouldTokenize Whether or not to tokenize the values for the length distribution
* @param maxCardinality Max cardinality to keep track of in maps (relevant for the text length distribution here)
* @tparam T Feature type that the text map value is coming from
* @return TextMapStats instance with value and length counts filled out appropriately for each key
*/
private[op] def computeTextMapStats[T <: OPMap[String]](
textMap: T#Value,
shouldCleanKeys: Boolean,
shouldCleanValues: Boolean,
shouldTokenize: Boolean,
maxCardinality: Int
)(implicit tti: TypeTag[T], ttiv: TypeTag[T#Value]): TextMapStats = {
val keyValueCounts = textMap.map { case (k, v) => cleanTextFn(k, shouldCleanKeys) ->
TextStats.textStatsFromString(v, shouldCleanValues, shouldTokenize, maxCardinality)
}
TextMapStats(keyValueCounts)
}

}
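// An illustrative trace for a single row, not part of this diff (hypothetical values;
// whitespace tokenization assumed, cleaning disabled):
//   computeTextMapStats[TextMap](Map("description" -> "red car"), shouldCleanKeys = false,
//     shouldCleanValues = false, shouldTokenize = true, maxCardinality = 500)
// yields TextMapStats(Map("description" -> TextStats(Map("red car" -> 1L), Map(3 -> 2L)))):
// one value count for the raw entry plus token-length counts {3 -> 2} from "red" and "car".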
