Allow TextStats length distribution to be token-based and refactor for testability #464
Changes from 14 commits
SmartTextVectorizer.scala
@@ -85,7 +85,9 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])(
     val shouldCleanText = $(cleanText)

+    implicit val testStatsMonoid: Semigroup[TextStats] = TextStats.monoid(maxCard)
-    val valueStats: Dataset[Array[TextStats]] = dataset.map(_.map(computeTextStats(_, shouldCleanText)).toArray)
+    val valueStats: Dataset[Array[TextStats]] = dataset.map(
+      _.map(SmartTextVectorizer.computeTextStats(_, shouldCleanText, maxCard)).toArray
+    )
     val aggregatedStats: Array[TextStats] = valueStats.reduce(_ + _)

     val (vectorizationMethods, topValues) = aggregatedStats.map { stats =>
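A note on the aggregation above: valueStats.reduce(_ + _) needs a way to combine the per-record TextStats arrays, and the implicit Semigroup built with maxCard is what keeps the count maps bounded during that reduce. The sketch below only illustrates that idea; MiniTextStats, merge, and combine are hypothetical stand-ins, not TransmogrifAI's TextStats or its monoid.

// Hypothetical stand-ins illustrating a cardinality-capped combine of per-record
// text statistics, in the spirit of TextStats.monoid(maxCard). Not repository code.
case class MiniTextStats(valueCounts: Map[String, Long], lengthCounts: Map[Int, Long])

object MiniTextStats {
  // Merge count maps key by key, but stop admitting new keys once maxCard distinct keys exist.
  def merge[K](left: Map[K, Long], right: Map[K, Long], maxCard: Int): Map[K, Long] =
    right.foldLeft(left) { case (acc, (k, v)) =>
      if (acc.contains(k) || acc.size < maxCard) acc + (k -> (acc.getOrElse(k, 0L) + v))
      else acc
    }

  def combine(a: MiniTextStats, b: MiniTextStats, maxCard: Int): MiniTextStats =
    MiniTextStats(
      merge(a.valueCounts, b.valueCounts, maxCard),
      merge(a.lengthCounts, b.lengthCounts, maxCard)
    )
}

object MiniTextStatsDemo extends App {
  // Two records' worth of stats, reduced the same way valueStats.reduce(_ + _) would
  // combine them when a Semigroup for TextStats is in scope.
  val perRecord = Seq(
    MiniTextStats(Map("cat" -> 1L), Map(3 -> 1L)),
    MiniTextStats(Map("dog" -> 1L), Map(3 -> 1L))
  )
  val aggregated = perRecord.reduce((a, b) => MiniTextStats.combine(a, b, maxCard = 100))
  println(aggregated) // value counts: cat -> 1, dog -> 1; length counts: 3 -> 2
}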
@@ -121,14 +123,6 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])(
       .setTrackTextLen($(trackTextLen))
   }

-  private def computeTextStats(text: T#Value, shouldCleanText: Boolean): TextStats = {
-    val (valueCounts, lengthCounts) = text match {
-      case Some(v) => (Map(cleanTextFn(v, shouldCleanText) -> 1L), Map(cleanTextFn(v, shouldCleanText).length -> 1L))
-      case None => (Map.empty[String, Long], Map.empty[Int, Long])
-    }
-    TextStats(valueCounts, lengthCounts)
-  }
-
   private def makeVectorMetadata(smartTextParams: SmartTextVectorizerModelArgs): OpVectorMetadata = {
     require(inN.length == smartTextParams.vectorizationMethods.length)
@@ -164,13 +158,40 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])(
   }
 }

-object SmartTextVectorizer {
+object SmartTextVectorizer extends CleanTextFun {
   val MaxCardinality: Int = 100
   val MinTextLengthStdDev: Double = 0

   private[op] def partition[T: ClassTag](input: Array[T], condition: Array[Boolean]): (Array[T], Array[T]) = {
     val all = input.zip(condition)
     (all.collect { case (item, true) => item }, all.collect { case (item, false) => item })
   }

+  /**
+   * Computes a TextStats instance from a Text entry
+   *
+   * @param text            Text value (eg. entry in a dataframe)
+   * @param shouldCleanText Whether or not the text should be cleaned
+   * @param maxCardinality  Max cardinality to keep track of in maps (relevant for the text length distribution here)
+   * @tparam T Feature type that the text value is coming from
+   * @return TextStats instance with value and length counts filled out appropriately
+   */
+  private[op] def computeTextStats[T <: Text : TypeTag](
+    text: T#Value,
+    shouldCleanText: Boolean,
+    maxCardinality: Int
+  ): TextStats = {
+    // Go through each token in text and start appending it to a TextStats instance
+    val lengthsMap: Map[Int, Long] = TextTokenizer.tokenizeStringOpt(text).tokens.value
+      .foldLeft(Map.empty[Int, Long])(
+        (acc, el) => TextStats.additionHelper(acc, Map(el.length -> 1L), maxCardinality)
+      )
+    val (valueCounts, lengthCounts) = text match {
+      case Some(v) => (Map(cleanTextFn(v, shouldCleanText) -> 1L), lengthsMap)
+      case None => (Map.empty[String, Long], Map.empty[Int, Long])
+    }
+    TextStats(valueCounts, lengthCounts)
+  }
 }

 /**

Review thread on "val (valueCounts, lengthCounts) = text match {":

Comment: when we reach […]

Reply: nvm, this is taken care of by […]
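To make the behavioral change concrete: previously each entry contributed a single length observation (the length of the whole cleaned string), whereas computeTextStats now folds one observation per token into the length map, capped at maxCardinality distinct lengths. The following self-contained sketch uses a simplified whitespace tokenizer and a capped-map fold as stand-ins for TextTokenizer and TextStats.additionHelper; it is illustration only, not the library code.

// Illustration of entry-level vs token-level length counting. Stand-in logic only.
object TokenLengthSketch extends App {
  val maxCardinality = 100

  // Simplified stand-in for TextTokenizer.tokenizeStringOpt(...).tokens
  def tokens(text: Option[String]): Seq[String] =
    text.map(_.toLowerCase.split("\\s+").filter(_.nonEmpty).toSeq).getOrElse(Seq.empty[String])

  val entry: Option[String] = Some("Hello world of tokens")

  // Old behavior: one length observation per entry (length of the whole string)
  val entryLengths: Map[Int, Long] =
    entry.map(v => Map(v.length -> 1L)).getOrElse(Map.empty[Int, Long])

  // New behavior: one length observation per token, folded into a map that the real
  // implementation caps at maxCardinality distinct lengths
  val tokenLengths: Map[Int, Long] = tokens(entry)
    .foldLeft(Map.empty[Int, Long]) { (acc, tok) =>
      if (acc.contains(tok.length) || acc.size < maxCardinality)
        acc + (tok.length -> (acc.getOrElse(tok.length, 0L) + 1L))
      else acc
    }

  println(entryLengths) // Map(21 -> 1)
  println(tokenLengths) // Map(5 -> 2, 2 -> 1, 6 -> 1)
}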
@@ -189,8 +210,8 @@ private[op] case class TextStats
   val lengthMean: Double = lengthCounts.foldLeft(0.0)((acc, el) => acc + el._1 * el._2) / lengthSize
   val lengthVariance: Double = lengthCounts.foldLeft(0.0)(
     (acc, el) => acc + el._2 * (el._1 - lengthMean) * (el._1 - lengthMean)
-  )
-  val lengthStdDev: Double = math.sqrt(lengthVariance / lengthSize)
+  ) / lengthSize
+  val lengthStdDev: Double = math.sqrt(lengthVariance)
 }

 private[op] object TextStats {
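For clarity on the fix above: the division by lengthSize moves into lengthVariance, so lengthVariance now holds the population variance itself and lengthStdDev is simply its square root. The standard deviation is numerically unchanged (it was previously computed as the square root of the unnormalized sum divided by lengthSize); what changes is that lengthVariance is no longer the raw sum of squared deviations. Below is a small self-contained sketch of these formulas over a made-up length histogram, not repository code.

// Population mean / variance / stddev from a length-count histogram, matching the
// corrected formulas above. Example counts are made up.
object LengthStatsSketch extends App {
  val lengthCounts: Map[Int, Long] = Map(2 -> 2L, 5 -> 1L, 6 -> 1L)

  val lengthSize: Double = lengthCounts.values.sum.toDouble // 4.0
  val lengthMean: Double =
    lengthCounts.foldLeft(0.0) { case (acc, (len, n)) => acc + len * n } / lengthSize // 3.75
  val lengthVariance: Double =
    lengthCounts.foldLeft(0.0) { case (acc, (len, n)) =>
      acc + n * (len - lengthMean) * (len - lengthMean)
    } / lengthSize // 3.1875
  val lengthStdDev: Double = math.sqrt(lengthVariance) // about 1.785

  println(s"mean=$lengthMean variance=$lengthVariance stdDev=$lengthStdDev")
}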
TextTokenizer.scala
@@ -140,7 +140,7 @@ object TextTokenizer {
   /**
    * Language wise sentence tokenization
    *
-   * @param text text to tokenize
+   * @param textString text to tokenize (in String form)
    * @param languageDetector language detector instance
    * @param analyzer text analyzer instance
    * @param sentenceSplitter sentence splitter instance
@@ -151,8 +151,8 @@ object TextTokenizer {
    * @param minTokenLength minimum token length
    * @return detected language and sentence tokens
    */
-  def tokenize(
-    text: Text,
+  private[op] def tokenizeString(
+    textString: String,
     languageDetector: LanguageDetector = LanguageDetector,
     analyzer: TextAnalyzer = Analyzer,
     sentenceSplitter: Option[SentenceSplitter] = None,

Review thread on "textString: String,":

Comment: Now this function can explode with a NullPointerException if textString is null, while before that could not have happened.

Reply: Hmm, I'm not sure if it's possible for the […]. The actual tokenize call during vectorization is still […]. I agree it's technically less safe, but I don't think it's necessary to have null checking at this point in the flow. We should make sure the data gets created in a safe way, which I think we already do. Are there some specific edge cases I'm missing?

Reply: I think the simplest is to add a null check.

Reply: Now that I think about it more, I'm pretty sure the old tokenize function would also give an NPE if you fed it a sneaky null value. […] I put back the old tokenize function as […], which did indeed throw an NPE. We have tests all over the place (eg. our vectorizer tests and FeatureTypeSparkConverterTest) that make sure we can handle […]. @leahmcguire any opinions on this?

Reply: SomeValue.unapply operates on value, which is Option[String]. The null check is done during the construction of Text when the values are extracted from the Dataframe / RDD. A NullPointerException is indeed unlikely to be thrown.

Reply: The example you provided is not currently possible and also not a fair one :)

Reply: @Jauntbox is this only called from the Option[String] version below? If so, make it private and it is fine.

Reply: In fact, make them both private please.
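As a companion to the thread above, here is a minimal, self-contained sketch of the design point under discussion: the Option[String] entry point is the layer that shields the raw String tokenizer from missing values, so a NullPointerException could only arise if a bare null String bypassed that layer. The functions below are simplified stand-ins, not the repository code.

// Stand-in illustration of why the Option-based wrapper is the null-safety boundary.
object NullSafetySketch extends App {
  def tokenizeString(s: String): Seq[String] =
    s.toLowerCase.split("\\s+").toList // would throw NullPointerException on a null String

  // The Option-based wrapper never hands a null String to tokenizeString.
  def tokenizeStringOpt(sOpt: Option[String]): Seq[String] = sOpt match {
    case Some(s) => tokenizeString(s)
    case None    => Seq.empty
  }

  println(tokenizeStringOpt(Some("Hello world"))) // List(hello, world)
  println(tokenizeStringOpt(None))                // List()
  // A raw null would only be dangerous if it bypassed the Option layer entirely,
  // which the Text construction path is said to prevent.
}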
@@ -162,30 +162,84 @@ object TextTokenizer {
     toLowercase: Boolean = ToLowercase,
     minTokenLength: Int = MinTokenLength
   ): TextTokenizerResult = {
-    text match {
-      case SomeValue(Some(txt)) =>
-        val language =
-          if (!autoDetectLanguage) defaultLanguage
-          else {
-            languageDetector
-              .detectLanguages(txt)
-              .collectFirst { case (lang, confidence) if confidence > autoDetectThreshold => lang }
-              .getOrElse(defaultLanguage)
-          }
-        val lowerTxt = if (toLowercase) txt.toLowerCase else txt
+    val language =
+      if (!autoDetectLanguage) defaultLanguage
+      else {
+        languageDetector
+          .detectLanguages(textString)
+          .collectFirst { case (lang, confidence) if confidence > autoDetectThreshold => lang }
+          .getOrElse(defaultLanguage)
+      }
+    val lowerTxt = if (toLowercase) textString.toLowerCase else textString

-        val sentences = sentenceSplitter.map(_.getSentences(lowerTxt, language))
-          .getOrElse(Seq(lowerTxt))
-          .map { sentence =>
-            val tokens = analyzer.analyze(sentence, language)
-            tokens.filter(_.length >= minTokenLength).toTextList
-          }
-        TextTokenizerResult(language, sentences)
-      case _ =>
-        TextTokenizerResult(defaultLanguage, Seq(TextList.empty))
-    }
+    val sentences = sentenceSplitter.map(_.getSentences(lowerTxt, language))
+      .getOrElse(Seq(lowerTxt))
+      .map { sentence =>
+        val tokens = analyzer.analyze(sentence, language)
+        tokens.filter(_.length >= minTokenLength).toTextList
+      }
+    TextTokenizerResult(language, sentences)
   }

Review thread on "tokens.filter(_.length >= minTokenLength).toTextList":

Comment: Why are we only keeping tokens with length > […]?

Reply: This was existing behavior. It's a configurable parameter (defaulting to 1), so it is not required.
+  /**
+   * Language wise sentence tokenization
+   *
+   * @param textStringOpt text to tokenize (in Option[String] form)
+   * @param languageDetector language detector instance
+   * @param analyzer text analyzer instance
+   * @param sentenceSplitter sentence splitter instance
+   * @param autoDetectLanguage whether to attempt language detection
+   * @param defaultLanguage default language
+   * @param autoDetectThreshold language detection threshold
+   * @param toLowercase whether to convert all characters to lowercase before tokenizing
+   * @param minTokenLength minimum token length
+   * @return detected language and sentence tokens
+   */
+  private[op] def tokenizeStringOpt(
+    textStringOpt: Option[String],
+    languageDetector: LanguageDetector = LanguageDetector,
+    analyzer: TextAnalyzer = Analyzer,
+    sentenceSplitter: Option[SentenceSplitter] = None,
+    autoDetectLanguage: Boolean = AutoDetectLanguage,
+    defaultLanguage: Language = DefaultLanguage,
+    autoDetectThreshold: Double = AutoDetectThreshold,
+    toLowercase: Boolean = ToLowercase,
+    minTokenLength: Int = MinTokenLength
+  ): TextTokenizerResult = {
+    textStringOpt match {
+      case Some(txt) => tokenizeString(txt, languageDetector, analyzer, sentenceSplitter, autoDetectLanguage,
+        defaultLanguage, autoDetectThreshold, toLowercase, minTokenLength)
+      case None => TextTokenizerResult(defaultLanguage, Seq(TextList.empty))
+    }
+  }
+
+  /**
+   * Language wise sentence tokenization
+   *
+   * @param text text to tokenize
+   * @param languageDetector language detector instance
+   * @param analyzer text analyzer instance
+   * @param sentenceSplitter sentence splitter instance
+   * @param autoDetectLanguage whether to attempt language detection
+   * @param defaultLanguage default language
+   * @param autoDetectThreshold language detection threshold
+   * @param toLowercase whether to convert all characters to lowercase before tokenizing
+   * @param minTokenLength minimum token length
+   * @return detected language and sentence tokens
+   */
+  def tokenize(
+    text: Text,
+    languageDetector: LanguageDetector = LanguageDetector,
+    analyzer: TextAnalyzer = Analyzer,
+    sentenceSplitter: Option[SentenceSplitter] = None,
+    autoDetectLanguage: Boolean = AutoDetectLanguage,
+    defaultLanguage: Language = DefaultLanguage,
+    autoDetectThreshold: Double = AutoDetectThreshold,
+    toLowercase: Boolean = ToLowercase,
+    minTokenLength: Int = MinTokenLength
+  ): TextTokenizerResult = tokenizeStringOpt(text.value, languageDetector, analyzer, sentenceSplitter,
+    autoDetectLanguage, defaultLanguage, autoDetectThreshold, toLowercase, minTokenLength)

   /**
    * Text tokenization result
    *
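To summarize how the three entry points now relate, here is a brief usage sketch. The import paths and the .toText conversion are assumptions based on the surrounding codebase and are not shown in this diff; only tokenize(text: Text) stays public, while tokenizeString and tokenizeStringOpt are private[op].

// Usage sketch; import paths and toText are assumptions, not confirmed by this diff.
import com.salesforce.op.features.types._
import com.salesforce.op.utils.text.TextTokenizer

object TokenizeEntryPointsSketch extends App {
  // Public API: Text -> TextTokenizerResult. Internally tokenize(text) delegates to
  // tokenizeStringOpt(text.value), which only calls tokenizeString on Some(str) and
  // returns an empty result (default language, empty token list) on None.
  val result = TextTokenizer.tokenize("Hello world".toText)
  println(result.tokens) // flattened sentence tokens, as used by SmartTextVectorizer.computeTextStats
}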
Review comment: Not relevant to this PR, but I think countStringValues is no longer used.

Reply: OK, I can remove it then.