Allow TextStats length distribution to be token-based and refactor for testability #464

Merged
merged 28 commits into from Mar 26, 2020

Commits (28)
1c2cbc2
Refactored and added more incremental tests
Jauntbox Mar 3, 2020
83881c0
Updated test
Jauntbox Mar 3, 2020
ca2e122
Added tests and fixed a small bug
Jauntbox Mar 4, 2020
c190d97
More refactoring and updating tests
Jauntbox Mar 4, 2020
75b0770
More test refactoring
Jauntbox Mar 5, 2020
305c57e
More refactoring
Jauntbox Mar 5, 2020
ff53dc4
Small cleanups
Jauntbox Mar 5, 2020
13b6076
Merge branch 'master' of github.com:salesforce/TransmogrifAI into km/…
Jauntbox Mar 6, 2020
fc6cd07
Addressing comments
Jauntbox Mar 6, 2020
1434128
Added text length distribution to the TextStats calculated in RFF
Jauntbox Mar 9, 2020
e117fbd
Made offending methods private
Jauntbox Mar 16, 2020
87878e0
Merge branch 'master' of github.com:salesforce/TransmogrifAI into km/…
Jauntbox Mar 16, 2020
fe36ec5
Comments
Jauntbox Mar 17, 2020
cd31e32
Spelling
Jauntbox Mar 17, 2020
a322363
Added toggle for tokenization in length distribution
Jauntbox Mar 18, 2020
32ad893
Added toggle to turn tokenization on/off for length distribution coun…
Jauntbox Mar 19, 2020
ab9d2a7
Reverted changes to RFF for now and added logging to help with visibi…
Jauntbox Mar 19, 2020
4cb5c88
Updated tests to check both tokenized and non-tokenized text feature …
Jauntbox Mar 19, 2020
e463685
Better logging
Jauntbox Mar 19, 2020
e44ca68
Revert unintentional RFF changes
Jauntbox Mar 19, 2020
ce5663e
Removed unused method
Jauntbox Mar 19, 2020
b866bb3
Removed SVC models from the default models to try in BinaryClassifica…
Jauntbox Mar 19, 2020
e72af36
Added new params to vectorizer shortcuts
Jauntbox Mar 20, 2020
307a014
scalastyle issue
Jauntbox Mar 20, 2020
49127d9
Replaced boolean param with enum
Jauntbox Mar 26, 2020
95bc3e7
Added enum to json4s serialization list
Jauntbox Mar 26, 2020
aad13ba
Actually add the enum file
Jauntbox Mar 26, 2020
d78868d
Merge branch 'master' of github.com:salesforce/TransmogrifAI into km/…
Jauntbox Mar 26, 2020
Changes from all commits
16 changes: 16 additions & 0 deletions core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala
@@ -273,6 +273,10 @@ trait RichMapFeature {
* @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or
* failed to make a good enough prediction.
* @param hashAlgorithm hash algorithm to use
* @param textLengthType Method to use for constructing text length distribution in TextStats. Current
* options are from the full entry or from the tokens
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
* be hashed instead of ignored
* @param others additional text features
* @return result feature of type Vector
*/
@@ -298,6 +302,8 @@
hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy,
defaultLanguage: Language = TextTokenizer.DefaultLanguage,
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
others: Array[FeatureLike[TextMap]] = Array.empty
): FeatureLike[OPVector] = {
// scalastyle:on parameter.number
@@ -322,6 +328,8 @@
.setHashSpaceStrategy(hashSpaceStrategy)
.setHashAlgorithm(hashAlgorithm)
.setBinaryFreq(binaryFreq)
.setTextLengthType(textLengthType)
.setMinLengthStdDev(minLengthStdDev)
.getOutput()
}
}
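// A usage sketch, not part of this diff: the new settings wired directly on the underlying
// vectorizer. `textMap` and the 0.075 threshold are hypothetical placeholders;
// TextLengthType.Tokens is the token-based option introduced in this PR.
val sketchVectorized: FeatureLike[OPVector] = new SmartTextMapVectorizer[TextMap]()
  .setInput(textMap)                        // hypothetical FeatureLike[TextMap]
  .setTextLengthType(TextLengthType.Tokens) // build the length distribution from token lengths
  .setMinLengthStdDev(0.075)                // fields whose token-length std dev falls below this are ignored
  .getOutput()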
@@ -418,6 +426,10 @@ trait RichMapFeature {
* @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or
* failed to make a good enough prediction.
* @param hashAlgorithm hash algorithm to use
* @param textLengthType Method to use for constructing text length distribution in TextStats. Current
* options are from the full entry or from the tokens
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
* be hashed instead of ignored
* @param others additional text features
* @return result feature of type Vector
*/
@@ -443,6 +455,8 @@
hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy,
defaultLanguage: Language = TextTokenizer.DefaultLanguage,
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
others: Array[FeatureLike[TextAreaMap]] = Array.empty
): FeatureLike[OPVector] = {
// scalastyle:on parameter.number
@@ -467,6 +481,8 @@
.setHashSpaceStrategy(hashSpaceStrategy)
.setHashAlgorithm(hashAlgorithm)
.setBinaryFreq(binaryFreq)
.setTextLengthType(textLengthType)
.setMinLengthStdDev(minLengthStdDev)
.getOutput()
}
}
@@ -211,6 +211,10 @@ trait RichTextFeature {
* @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or
* failed to make a good enough prediction.
* @param hashAlgorithm hash algorithm to use
* @param textLengthType Method to use for constructing text length distribution in TextStats. Current
* options are from the full entry or from the tokens
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
* be hashed instead of ignored
* @param others additional text features
* @return result feature of type Vector
*/
@@ -235,6 +239,8 @@
hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy,
defaultLanguage: Language = TextTokenizer.DefaultLanguage,
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
others: Array[FeatureLike[T]] = Array.empty
): FeatureLike[OPVector] = {
// scalastyle:on parameter.number
@@ -258,6 +264,8 @@
.setHashSpaceStrategy(hashSpaceStrategy)
.setHashAlgorithm(hashAlgorithm)
.setBinaryFreq(binaryFreq)
.setTextLengthType(textLengthType)
.setMinLengthStdDev(minLengthStdDev)
.getOutput()
}

@@ -221,14 +221,15 @@ object FeatureDistribution {
/**
* Facilitates feature distribution retrieval from computed feature summaries
*
* @param featureKey feature key
* @param summary feature summary
* @param value optional processed sequence
* @param bins number of histogram bins
* @param textBinsFormula formula to compute the text features bin size.
* Input arguments are [[Summary]] and number of bins to use in computing feature
* distributions (histograms for numerics, hashes for strings). Output is the bins for
* the text features.
* @param `type` feature distribution type: training or scoring
* @return feature distribution given the provided information
*/
private[op] def fromSummary(
@@ -245,7 +246,8 @@
.getOrElse(1L -> (Array(summary.min, summary.max, summary.sum, summary.count) -> new Array[Double](bins)))

val moments = value.map(momentsValues)
// Tokenization is already done during prep phase, so we can skip text cleaning and tokenization here (for now)
val cardEstimate = value.map(cardinalityValues(_, shouldCleanText = false, tokenizeForLengths = false))

FeatureDistribution(
name = name,
@@ -268,25 +270,36 @@
*/
private def momentsValues(values: ProcessedSeq): Moments = {
val population = values match {
case Left(seq) => seq.map(_.length.toDouble)
case Right(seq) => seq
}
MomentsGroup.sum(population.map(x => Moments(x)))
}
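// A worked example of the aggregation above, not part of this diff: combining per-value
// Moments recovers the distribution statistics, e.g. for three token lengths 3, 5 and 7:
private def momentsExample: Moments = MomentsGroup.sum(Seq(3.0, 5.0, 7.0).map(Moments(_)))
// momentsExample.mean == 5.0 and momentsExample.variance == 8.0 / 3.0 (population variance),
// so the standard deviation of the token lengths is about 1.63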

/**
* Function to track frequency of the first $(MaxCardinality) unique values
* (number for numeric features, token for text features). Note that shouldCleanText and tokenizeForLengths are
* exposed, but not relevant yet because tokenization always happens before this, during data preparation.
*
* @param values values to track distribution / frequency
* @param shouldCleanText whether or not to clean text for TextStats calculation
* @param tokenizeForLengths whether or not to tokenize strings before computing length distribution
* @return TextStats object containing a Map from a value to its frequency (histogram)
*/
private def cardinalityValues(values: ProcessedSeq, shouldCleanText: Boolean,
tokenizeForLengths: Boolean): TextStats = {
implicit val testStatsMonoid: Monoid[TextStats] = TextStats.monoid(RawFeatureFilter.MaxCardinality)

val stringVals = values match {
case Left(stringSeq) => stringSeq
case Right(doubleSeq) => doubleSeq.map(_.toString)
}
stringVals.foldLeft(TextStats.empty)((acc, el) => acc + TextStats.textStatsFromString(
textString = el,
shouldCleanText = shouldCleanText,
shouldTokenize = tokenizeForLengths,
maxCardinality = RawFeatureFilter.MaxCardinality)
)
}

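// A sketch of the length-counting semantics toggled above, not part of this diff; whitespace
// splitting stands in for the real TextTokenizer:
private def lengthCountsSketch(text: String, tokenize: Boolean): Map[Int, Long] = {
  val lengths =
    if (tokenize) text.split("\\s+").toSeq.filter(_.nonEmpty).map(_.length) // token lengths
    else Seq(text.length)                                                   // full-entry length
  lengths.groupBy(identity).map { case (len, group) => len -> group.size.toLong }
}
// lengthCountsSketch("red car", tokenize = true)  == Map(3 -> 2)
// lengthCountsSketch("red car", tokenize = false) == Map(7 -> 1)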
@@ -600,7 +600,6 @@ object RawFeatureFilter {
val minScoringRowsDefault = 500
val MaxCardinality = 500


val stageName = classOf[RawFeatureFilter[_]].getSimpleName

val uid = s"${stageName}_100000000000"
@@ -56,10 +56,11 @@ case object BinaryClassificationModelSelector extends ModelSelectorFactory {
/**
* Subset of models to use in model selector
*
* Note: [[OpNaiveBayes]], [[OpDecisionTreeClassifier]], [[OpLinearSVC]], and [[OpXGBoostClassifier]] are
* off by default
*/
val modelTypesToUse: Seq[BinaryClassificationModelsToTry] = Seq(
MTT.OpLogisticRegression, MTT.OpRandomForestClassifier, MTT.OpGBTClassifier
)

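// A sketch, not part of this diff: with OpLinearSVC off by default, callers can opt back in
// explicitly. Assumes the factory exposes a modelTypesToUse argument, as the naming above
// suggests; check the ModelSelectorFactory signature before relying on it.
//   val selector = BinaryClassificationModelSelector.withCrossValidation(
//     modelTypesToUse = Seq(MTT.OpLogisticRegression, MTT.OpLinearSVC)
//   )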
@@ -67,17 +67,6 @@ class SmartTextMapVectorizer[T <: OPMap[String]]

private implicit val textMapStatsSeqEnc: Encoder[Array[TextMapStats]] = ExpressionEncoder[Array[TextMapStats]]()

private def makeHashingParams() = HashingFunctionParams(
hashWithIndex = $(hashWithIndex),
prependFeatureName = $(prependFeatureName),
@@ -166,6 +155,18 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
}
}

logInfo("TextStats for features used in SmartTextMapVectorizer:")
inN.map(_.name).zip(aggregatedStats).zip(allFeatureInfo).foreach { case ((mapName, mapStats), featInfo) =>
logInfo(s"FeatureMap: $mapName")
mapStats.keyValueCounts.zip(featInfo).foreach { case ((name, stats), info) =>
logInfo(s"Key: $name")
logInfo(s"LengthCounts: ${stats.lengthCounts}")
logInfo(s"LengthMean: ${stats.lengthMean}")
logInfo(s"LengthStdDev: ${stats.lengthStdDev}")
logInfo(s"Vectorization method: ${info.vectorizationMethod}")
}
}

SmartTextMapVectorizerModelArgs(
allFeatureInfo = allFeatureInfo,
shouldCleanKeys = shouldCleanKeys,
@@ -181,10 +182,12 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
val maxCard = $(maxCardinality)
val shouldCleanKeys = $(cleanKeys)
val shouldCleanValues = $(cleanText)
val shouldTokenize = $(textLengthType) == TextLengthType.Tokens.entryName

implicit val testStatsMonoid: Monoid[TextMapStats] = TextMapStats.monoid(maxCard)
val valueStats: Dataset[Array[TextMapStats]] = dataset.map(
_.map(TextMapStats.computeTextMapStats(_, shouldCleanKeys, shouldCleanValues, shouldTokenize,
maxCard)).toArray
)
val aggregatedStats: Array[TextMapStats] = valueStats.reduce(_ + _)

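// For reference, a sketch of the TextLengthType enum relied on above, not part of this diff.
// Tokens appears in this PR; the FullEntry member name is inferred from the scaladoc ("from
// the full entry or from the tokens") and the enum file added in commit aad13ba.
//   import enumeratum.{Enum, EnumEntry}
//   sealed abstract class TextLengthType extends EnumEntry with Serializable
//   object TextLengthType extends Enum[TextLengthType] {
//     val values: IndexedSeq[TextLengthType] = findValues
//     case object FullEntry extends TextLengthType // length of the entire entry (inferred name)
//     case object Tokens extends TextLengthType    // lengths of individual tokens
//   }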
@@ -210,13 +213,37 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
*/
private[op] case class TextMapStats(keyValueCounts: Map[String, TextStats]) extends JsonLike

private[op] object TextMapStats extends CleanTextFun {

def monoid(maxCardinality: Int): Monoid[TextMapStats] = {
implicit val testStatsMonoid: Monoid[TextStats] = TextStats.monoid(maxCardinality)
caseclass.monoid[TextMapStats]
}

/**
* Computes a TextMapStats instance from a text map entry
*
* @param textMap Text value (e.g. an entry in a dataframe)
* @param shouldCleanKeys Whether or not the keys (feature names) should be cleaned
* @param shouldCleanValues Whether or not the values (the actual text) should be cleaned
* @param shouldTokenize Whether or not to tokenize the values for the length distribution
* @param maxCardinality Max cardinality to keep track of in maps (relevant for the text length distribution here)
* @tparam T Feature type that the text map value is coming from
* @return TextMapStats instance with value and length counts filled out appropriately for each key
*/
private[op] def computeTextMapStats[T <: OPMap[String]](
textMap: T#Value,
shouldCleanKeys: Boolean,
shouldCleanValues: Boolean,
shouldTokenize: Boolean,
maxCardinality: Int
)(implicit tti: TypeTag[T], ttiv: TypeTag[T#Value]): TextMapStats = {
val keyValueCounts = textMap.map { case (k, v) => cleanTextFn(k, shouldCleanKeys) ->
TextStats.textStatsFromString(v, shouldCleanValues, shouldTokenize, maxCardinality)
}
TextMapStats(keyValueCounts)
}

}
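// An illustrative trace for a single row, not part of this diff (hypothetical values;
// whitespace tokenization assumed, cleaning disabled):
//   computeTextMapStats[TextMap](Map("description" -> "red car"), shouldCleanKeys = false,
//     shouldCleanValues = false, shouldTokenize = true, maxCardinality = 500)
// yields TextMapStats(Map("description" -> TextStats(Map("red car" -> 1L), Map(3 -> 2L)))):
// one value count for the raw entry plus token-length counts {3 -> 2} from "red" and "car".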
