From da1b27f8795a88e6793960224df0a7f226ee5182 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Tue, 26 Feb 2019 10:36:11 -0800 Subject: [PATCH 1/3] Expose some missing Transmogrifier defaults (#232) --- .../stages/impl/feature/Transmogrifier.scala | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala index a59873d1eb..6be1cba5fb 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala @@ -79,6 +79,11 @@ private[op] trait TransmogrifierDefaults { val MaxCategoricalCardinality = 30 val CircularDateRepresentations: Seq[TimePeriod] = Seq(TimePeriod.HourOfDay, TimePeriod.DayOfWeek, TimePeriod.DayOfMonth, TimePeriod.DayOfYear) + + val DefaultRegion: String = PhoneNumberParser.DefaultRegion + val AutoDetectLanguage: Boolean = TextTokenizer.AutoDetectLanguage + val MinTokenLength: Int = TextTokenizer.MinTokenLength + val ToLowercase: Boolean = TextTokenizer.ToLowercase } private[op] object TransmogrifierDefaults extends TransmogrifierDefaults @@ -178,7 +183,7 @@ private[op] case object Transmogrifier { trackNulls = TrackNulls, trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, label = label) case t if t =:= weakTypeOf[PhoneMap] => val (f, other) = castAs[PhoneMap](g) // TODO make better default - f.vectorize(defaultRegion = PhoneNumberParser.DefaultRegion, others = other, trackNulls = TrackNulls) + f.vectorize(defaultRegion = DefaultRegion, others = other, trackNulls = TrackNulls) case t if t =:= weakTypeOf[PickListMap] => val (f, other) = castAs[PickListMap](g) f.vectorize(topK = TopK, minSupport = MinSupport, cleanText = CleanText, cleanKeys = CleanKeys, @@ -190,15 +195,15 @@ private[op] case object Transmogrifier { case t if t =:= weakTypeOf[TextAreaMap] => val (f, other) = castAs[TextAreaMap](g) f.smartVectorize(maxCategoricalCardinality = MaxCategoricalCardinality, - numHashes = DefaultNumOfFeatures, autoDetectLanguage = TextTokenizer.AutoDetectLanguage, - minTokenLength = TextTokenizer.MinTokenLength, toLowercase = TextTokenizer.ToLowercase, + numHashes = DefaultNumOfFeatures, autoDetectLanguage = AutoDetectLanguage, + minTokenLength = MinTokenLength, toLowercase = ToLowercase, prependFeatureName = PrependFeatureName, cleanText = CleanText, cleanKeys = CleanKeys, others = other, trackNulls = TrackNulls) case t if t =:= weakTypeOf[TextMap] => val (f, other) = castAs[TextMap](g) f.smartVectorize(maxCategoricalCardinality = MaxCategoricalCardinality, - numHashes = DefaultNumOfFeatures, autoDetectLanguage = TextTokenizer.AutoDetectLanguage, - minTokenLength = TextTokenizer.MinTokenLength, toLowercase = TextTokenizer.ToLowercase, + numHashes = DefaultNumOfFeatures, autoDetectLanguage = AutoDetectLanguage, + minTokenLength = MinTokenLength, toLowercase = ToLowercase, prependFeatureName = PrependFeatureName, cleanText = CleanText, cleanKeys = CleanKeys, others = other, trackNulls = TrackNulls) case t if t =:= weakTypeOf[URLMap] => @@ -285,7 +290,7 @@ private[op] case object Transmogrifier { others = other) case t if t =:= weakTypeOf[Phone] => val (f, other) = castAs[Phone](g) - f.vectorize(defaultRegion = PhoneNumberParser.DefaultRegion, others = other) + f.vectorize(defaultRegion = DefaultRegion, others = other) case t if t =:= weakTypeOf[PickList] => val (f, other) = castAs[PickList](g) f.vectorize(topK = TopK, minSupport = MinSupport, cleanText = CleanText, trackNulls = TrackNulls, @@ -294,15 +299,15 @@ private[op] case object Transmogrifier { val (f, other) = castAs[Text](g) f.smartVectorize(maxCategoricalCardinality = MaxCategoricalCardinality, trackNulls = TrackNulls, numHashes = DefaultNumOfFeatures, - hashSpaceStrategy = defaults.HashSpaceStrategy, autoDetectLanguage = TextTokenizer.AutoDetectLanguage, - minTokenLength = TextTokenizer.MinTokenLength, toLowercase = TextTokenizer.ToLowercase, + hashSpaceStrategy = defaults.HashSpaceStrategy, autoDetectLanguage = AutoDetectLanguage, + minTokenLength = MinTokenLength, toLowercase = ToLowercase, prependFeatureName = PrependFeatureName, others = other) case t if t =:= weakTypeOf[TextArea] => val (f, other) = castAs[TextArea](g) f.smartVectorize(maxCategoricalCardinality = MaxCategoricalCardinality, trackNulls = TrackNulls, numHashes = DefaultNumOfFeatures, - hashSpaceStrategy = defaults.HashSpaceStrategy, autoDetectLanguage = TextTokenizer.AutoDetectLanguage, - minTokenLength = TextTokenizer.MinTokenLength, toLowercase = TextTokenizer.ToLowercase, + hashSpaceStrategy = defaults.HashSpaceStrategy, autoDetectLanguage = AutoDetectLanguage, + minTokenLength = MinTokenLength, toLowercase = ToLowercase, prependFeatureName = PrependFeatureName, others = other) case t if t =:= weakTypeOf[URL] => val (f, other) = castAs[URL](g) From f80bfbd068170b1714680e8360302b455926306f Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 27 Feb 2019 22:17:54 -0800 Subject: [PATCH 2/3] Update index.md @tillbe :1st_place_medal: --- docs/talks/index.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/talks/index.md b/docs/talks/index.md index 28beb17772..c42a07f35c 100644 --- a/docs/talks/index.md +++ b/docs/talks/index.md @@ -1,5 +1,8 @@ # Talks +**2019** +* [Automated Machine Learning with TransmogrifAI](https://vaultanalytics.com/podcast/automated-machine-learning-with-transmogrifai/), Till Bergmann, Data Crunch Podcast + **2018** * [AutoML: The Assembly Line of Machine Learning](http://www.dataengconf.com/automl-the-assembly-line-of-machine-learning), Mayukh Bhaowal, DataEngConf * [The Black Swan of Perfectly Interpretable Models](https://www.infoq.com/presentations/salesforce-einstein-ml), Leah McGuire and Mayukh Bhaowal, QCon.ai From 4da94f8c4e8563552ff39490afa964d87413145c Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 27 Feb 2019 22:22:05 -0800 Subject: [PATCH 3/3] Update index.md @Jauntbox :1st_place_medal: --- docs/talks/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/talks/index.md b/docs/talks/index.md index c42a07f35c..8a3ef135ef 100644 --- a/docs/talks/index.md +++ b/docs/talks/index.md @@ -2,6 +2,7 @@ **2019** * [Automated Machine Learning with TransmogrifAI](https://vaultanalytics.com/podcast/automated-machine-learning-with-transmogrifai/), Till Bergmann, Data Crunch Podcast +* [Automated ML Pipelines For Unseen Customer Data](https://www.youtube.com/watch?v=IZyceNOSitI), Kevin Moore, PAPIs.io, [Slides](https://drive.google.com/file/d/1MStBS4tR1yuklCuDCrZHrejNAFk1j_k9/view) **2018** * [AutoML: The Assembly Line of Machine Learning](http://www.dataengconf.com/automl-the-assembly-line-of-machine-learning), Mayukh Bhaowal, DataEngConf