diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala b/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala
index 6213df0e00..e68ff2e2bd 100644
--- a/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala
+++ b/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala
@@ -135,9 +135,10 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams
    * @return Updated instance of feature
    */
   def getUpdatedFeatures(features: Array[OPFeature]): Array[OPFeature] = {
-    val allFeatures = rawFeatures ++ blacklistedFeatures ++ stages.map(_.getOutput())
-    features.map{f => allFeatures.find(_.sameOrigin(f))
-      .getOrElse(throw new IllegalArgumentException(s"feature $f is not a part of this workflow"))
+    val allFeatures = getRawFeatures() ++ getBlacklist() ++ getStages().map(_.getOutput())
+    features.map { f =>
+      allFeatures.find(_.sameOrigin(f))
+        .getOrElse(throw new IllegalArgumentException(s"feature $f is not a part of this workflow"))
     }
   }
diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflowModelReader.scala b/core/src/main/scala/com/salesforce/op/OpWorkflowModelReader.scala
index 3276b960a2..53b448878e 100644
--- a/core/src/main/scala/com/salesforce/op/OpWorkflowModelReader.scala
+++ b/core/src/main/scala/com/salesforce/op/OpWorkflowModelReader.scala
@@ -30,15 +30,17 @@
 
 package com.salesforce.op
 
+import com.salesforce.op.OpWorkflowModelReadWriteShared.{FieldNames => FN}
 import com.salesforce.op.OpWorkflowModelReadWriteShared.FieldNames._
 import com.salesforce.op.features.{FeatureJsonHelper, OPFeature, TransientFeature}
 import com.salesforce.op.filters.{FeatureDistribution, RawFeatureFilterResults}
-import com.salesforce.op.stages.OpPipelineStageReadWriteShared._
+import com.salesforce.op.stages.OpPipelineStageReaderWriter._
 import com.salesforce.op.stages._
 import org.apache.spark.ml.util.MLReader
 import org.json4s.JsonAST.{JArray, JNothing, JValue}
 import org.json4s.jackson.JsonMethods.parse
 
+import scala.collection.mutable.ArrayBuffer
 import scala.util.{Failure, Success, Try}
 
 /**
@@ -50,7 +52,6 @@ import scala.util.{Failure, Success, Try}
  */
 class OpWorkflowModelReader(val workflowOpt: Option[OpWorkflow]) extends MLReader[OpWorkflowModel] {
-
   /**
    * Load a previously trained workflow model from path
    *
@@ -72,7 +73,9 @@ class OpWorkflowModelReader(val workflowOpt: Option[OpWorkflow]) extends MLReade
    * @param path to the trained workflow model
    * @return workflow model
    */
-  def loadJson(json: String, path: String): Try[OpWorkflowModel] = Try(parse(json)).flatMap(loadJson(_, path = path))
+  def loadJson(json: String, path: String): Try[OpWorkflowModel] = {
+    Try(parse(json)).flatMap(loadJson(_, path = path))
+  }
 
   /**
    * Load Workflow instance from json
@@ -81,98 +84,110 @@ class OpWorkflowModelReader(val workflowOpt: Option[OpWorkflow]) extends MLReade
    * @param path to the trained workflow model
    * @return workflow model instance
    */
-  def loadJson(json: JValue, path: String): Try[OpWorkflowModel] = workflowOpt match {
-    case None =>
-      throw new NotImplementedError("Loading models without the original workflow is currently not supported")
-
-    case Some(workflow) =>
-      for {
-        trainParams <- OpParams.fromString((json \ TrainParameters.entryName).extract[String])
-        params <- OpParams.fromString((json \ Parameters.entryName).extract[String])
-        model <- Try(new OpWorkflowModel(uid = (json \ Uid.entryName).extract[String], trainParams))
-        (stages, resultFeatures) <- Try(resolveFeaturesAndStages(workflow, json, path))
-        blacklist <- Try(resolveBlacklist(workflow, json))
-        blacklistMapKeys <- Try(resolveBlacklistMapKeys(json))
-        results <- resolveRawFeatureFilterResults(json)
-      } yield model
-        .setStages(stages.filterNot(_.isInstanceOf[FeatureGeneratorStage[_, _]]))
-        .setFeatures(resultFeatures)
-        .setParameters(params)
-        .setBlacklist(blacklist)
-        .setBlacklistMapKeys(blacklistMapKeys)
-        .setRawFeatureFilterResults(results)
+  def loadJson(json: JValue, path: String): Try[OpWorkflowModel] = {
+    for {
+      trainingParams <- OpParams.fromString((json \ TrainParameters.entryName).extract[String])
+      params <- OpParams.fromString((json \ Parameters.entryName).extract[String])
+      model <- Try(new OpWorkflowModel(uid = (json \ Uid.entryName).extract[String], trainingParams))
+      stages <- loadStages(json, workflowOpt, path)
+      resolvedFeatures <- resolveFeatures(json, stages)
+      resultFeatures <- resolveResultFeatures(json, resolvedFeatures)
+      blacklist <- resolveBlacklist(json, workflowOpt, resolvedFeatures, path)
+      blacklistMapKeys <- resolveBlacklistMapKeys(json)
+      rffResults <- resolveRawFeatureFilterResults(json)
+    } yield model
+      .setStages(stages.filterNot(_.isInstanceOf[FeatureGeneratorStage[_, _]]))
+      .setFeatures(resultFeatures)
+      .setParameters(params)
+      .setBlacklist(blacklist)
+      .setBlacklistMapKeys(blacklistMapKeys)
+      .setRawFeatureFilterResults(rffResults)
   }
 
-  private def resolveBlacklist(workflow: OpWorkflow, json: JValue): Array[OPFeature] = {
-    if ((json \ BlacklistedFeaturesUids.entryName) != JNothing) { // for backwards compatibility
-      val blacklistIds = (json \ BlacklistedFeaturesUids.entryName).extract[JArray].arr
-      val allFeatures = workflow.getRawFeatures() ++ workflow.getBlacklist() ++
-        workflow.getStages().flatMap(_.getInputFeatures()) ++
-        workflow.getResultFeatures()
-      blacklistIds.flatMap(uid => allFeatures.find(_.uid == uid.extract[String])).toArray
-    } else {
-      Array.empty[OPFeature]
-    }
+  private def loadStages(json: JValue, wfOpt: Option[OpWorkflow], path: String): Try[Array[OPStage]] = {
+    wfOpt.map(wf => loadStages(json, Stages, wf, path)).getOrElse(loadStages(json, Stages, path).map(_._1))
   }
 
-  private def resolveBlacklistMapKeys(json: JValue): Map[String, Set[String]] = {
-    (json \ BlacklistedMapKeys.entryName).extractOpt[Map[String, List[String]]] match {
-      case Some(blackMapKeys) => blackMapKeys.map { case (k, vs) => k -> vs.toSet }
-      case None => Map.empty
+  private def loadStages(json: JValue, field: FN, path: String): Try[(Array[OPStage], Array[OPFeature])] = Try {
+    val stagesJs = (json \ field.entryName).extract[JArray].arr
+    val (recoveredStages, recoveredFeatures) = ArrayBuffer.empty[OPStage] -> ArrayBuffer.empty[OPFeature]
+    for {j <- stagesJs} {
+      val stage = new OpPipelineStageReader(recoveredFeatures).loadFromJson(j, path = path).asInstanceOf[OPStage]
+      recoveredStages += stage
+      recoveredFeatures += stage.getOutput()
     }
+    recoveredStages.toArray -> recoveredFeatures.toArray
   }
 
-  private def resolveFeaturesAndStages
-  (
-    workflow: OpWorkflow,
-    json: JValue,
-    path: String
-  ): (Array[OPStage], Array[OPFeature]) = {
-    val stages = loadStages(workflow, json, path)
-    val stagesMap = stages.map(stage => stage.uid -> stage).toMap[String, OPStage]
-    val featuresMap = resolveFeatures(json, stagesMap)
-    resolveStages(stages, featuresMap)
-
-    val resultIds = (json \ ResultFeaturesUids.entryName).extract[Array[String]]
-    val resultFeatures = featuresMap.filterKeys(resultIds.toSet).values
-
-    stages.toArray -> resultFeatures.toArray
-  }
-
-  private def loadStages(workflow: OpWorkflow, json: JValue, path: String): Seq[OPStage] = {
-    val stagesJs = (json \ Stages.entryName).extract[JArray].arr
+  private def loadStages(json: JValue, field: FN, workflow: OpWorkflow, path: String): Try[Array[OPStage]] = Try {
+    val generators = workflow.getRawFeatures().map(_.originStage)
+    val stagesJs = (json \ field.entryName).extract[JArray].arr
     val recoveredStages = stagesJs.flatMap { j =>
-      val stageUidOpt = (j \ Uid.entryName).extractOpt[String]
-      stageUidOpt.map { stageUid =>
-        val originalStage = workflow.getStages().find(_.uid == stageUid)
-        originalStage match {
-          case Some(os) => new OpPipelineStageReader(os).loadFromJson(j, path = path).asInstanceOf[OPStage]
-          case None => throw new RuntimeException(s"Workflow does not contain a stage with uid: $stageUid")
-        }
+      val stageUid = (j \ Uid.entryName).extract[String]
+      val originalStage = workflow.getStages().find(_.uid == stageUid)
+      originalStage match {
+        case Some(os) =>
+          Option(new OpPipelineStageReader(os).loadFromJson(j, path = path)).map(_.asInstanceOf[OPStage])
+        case None if generators.exists(_.uid == stageUid) => None // skip generators, they are already in the workflow
+        case None => throw new RuntimeException(s"Workflow does not contain a stage with uid: $stageUid")
       }
     }
-    val generators = workflow.getRawFeatures().map(_.originStage)
     generators ++ recoveredStages
   }
 
-  private def resolveFeatures(json: JValue, stages: Map[String, OPStage]): Map[String, OPFeature] = {
-    val results = (json \ AllFeatures.entryName).extract[JArray].arr
+  private def resolveFeatures(json: JValue, stages: Array[OPStage]): Try[Array[OPFeature]] = Try {
+    val featuresArr = (json \ AllFeatures.entryName).extract[JArray].arr
+    val stagesMap = stages.map(stage => stage.uid -> stage).toMap[String, OPStage]
+
+    // should have been serialized in topological order
     // so that parent features can be used to construct each new feature
-    results.foldLeft(Map.empty[String, OPFeature])((featMap, feat) =>
-      FeatureJsonHelper.fromJson(feat, stages, featMap) match {
-        case Success(f) => featMap + (f.uid -> f)
+    val featuresMap = featuresArr.foldLeft(Map.empty[String, OPFeature])((featMap, feat) =>
+      FeatureJsonHelper.fromJson(feat, stagesMap, featMap) match {
         case Failure(e) => throw new RuntimeException(s"Error resolving feature: $feat", e)
+        case Success(f) => featMap + (f.uid -> f)
       }
     )
-  }
 
-  private def resolveStages(stages: Seq[OPStage], featuresMap: Map[String, OPFeature]): Unit = {
+    // set input features to stages
     for {stage <- stages} {
       val inputIds = stage.getTransientFeatures().map(_.uid)
       val inFeatures = inputIds.map(id => TransientFeature(featuresMap(id))) // features are order dependent
       stage.set(stage.inputFeatures, inFeatures)
     }
+    featuresMap.values.toArray
+  }
+
+  private def resolveResultFeatures(json: JValue, features: Array[OPFeature]): Try[Array[OPFeature]] = Try {
+    val resultIds = (json \ ResultFeaturesUids.entryName).extract[Array[String]].toSet
+    features.filter(f => resultIds.contains(f.uid))
+  }
+
+  private def resolveBlacklist
+  (
+    json: JValue,
+    wfOpt: Option[OpWorkflow],
+    features: Array[OPFeature],
+    path: String
+  ): Try[Array[OPFeature]] = {
+    if ((json \ BlacklistedFeaturesUids.entryName) != JNothing) { // for backwards compatibility
+      for {
+        feats <- wfOpt
+          .map(wf => Success(wf.getAllFeatures() ++ wf.getBlacklist()))
+          .getOrElse(loadStages(json, BlacklistedStages, path).map(_._2))
+        allFeatures = features ++ feats
+        blacklistIds = (json \ BlacklistedFeaturesUids.entryName).extract[Array[String]]
+      } yield blacklistIds.flatMap(uid => allFeatures.find(_.uid == uid))
+    } else {
+      Success(Array.empty[OPFeature])
+    }
+  }
+
+  private def resolveBlacklistMapKeys(json: JValue): Try[Map[String, Set[String]]] = Try {
+    (json \ BlacklistedMapKeys.entryName).extractOpt[Map[String, List[String]]] match {
+      case Some(blackMapKeys) => blackMapKeys.map { case (k, vs) => k -> vs.toSet }
+      case None => Map.empty
+    }
   }
 
   private def resolveRawFeatureFilterResults(json: JValue): Try[RawFeatureFilterResults] = {
diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflowModelWriter.scala b/core/src/main/scala/com/salesforce/op/OpWorkflowModelWriter.scala
index 89146463c5..9dcdfa0be0 100644
--- a/core/src/main/scala/com/salesforce/op/OpWorkflowModelWriter.scala
+++ b/core/src/main/scala/com/salesforce/op/OpWorkflowModelWriter.scala
@@ -32,7 +32,7 @@ package com.salesforce.op
 
 import com.salesforce.op.features.FeatureJsonHelper
 import com.salesforce.op.filters.RawFeatureFilterResults
-import com.salesforce.op.stages.{OpPipelineStageBase, OpPipelineStageWriter}
+import com.salesforce.op.stages.{OPStage, OpPipelineStageWriter}
 import enumeratum._
 import org.apache.hadoop.fs.Path
 import org.apache.spark.ml.util.MLWriter
@@ -54,8 +54,7 @@ class OpWorkflowModelWriter(val model: OpWorkflowModel) extends MLWriter {
   implicit val jsonFormats: Formats = DefaultFormats
 
   override protected def saveImpl(path: String): Unit = {
-    sc.parallelize(Seq(toJsonString(path)), 1)
-      .saveAsTextFile(OpWorkflowModelReadWriteShared.jsonPath(path))
+    sc.parallelize(Seq(toJsonString(path)), 1).saveAsTextFile(OpWorkflowModelReadWriteShared.jsonPath(path))
   }
 
   /**
@@ -78,6 +77,7 @@ class OpWorkflowModelWriter(val model: OpWorkflowModel) extends MLWriter {
       (FN.ResultFeaturesUids.entryName -> resultFeaturesJArray) ~
       (FN.BlacklistedFeaturesUids.entryName -> blacklistFeaturesJArray()) ~
       (FN.BlacklistedMapKeys.entryName -> blacklistMapKeys()) ~
+      (FN.BlacklistedStages.entryName -> blacklistedStagesJArray(path)) ~
       (FN.Stages.entryName -> stagesJArray(path)) ~
       (FN.AllFeatures.entryName -> allFeaturesJArray) ~
       (FN.Parameters.entryName -> model.getParameters().toJson(pretty = false)) ~
@@ -96,14 +96,36 @@ class OpWorkflowModelWriter(val model: OpWorkflowModel) extends MLWriter {
     JObject(model.getBlacklistMapKeys().map { case (k, vs) => k -> JArray(vs.map(JString).toList) }.toList)
 
   /**
-   * Serialize all the workflow model stages
+   * Serialize all the model stages
    *
    * @param path path to store the spark params for stages
    * @return array of serialized stages
    */
   private def stagesJArray(path: String): JArray = {
-    val stages: Seq[OpPipelineStageBase] = model.getStages()
-    val stagesJson: Seq[JObject] = stages
+    val stages = model.getRawFeatures().map(_.originStage) ++ model.getStages()
+    stagesJArray(stages, path)
+  }
+
+  /**
+   * Serialize all the blacklisted model stages
+   *
+   * @param path path to store the spark params for stages
+   * @return array of serialized stages
+   */
+  private def blacklistedStagesJArray(path: String): JArray = {
+    val blacklistStages = model.getBlacklist().map(_.originStage)
+    stagesJArray(blacklistStages, path)
+  }
+
+  /**
+   * Serialize the given stages
+   *
+   * @param stages the stages to serialize
+   * @param path   path to store the spark params for stages
+   * @return array of serialized stages
+   */
+  private def stagesJArray(stages: Array[OPStage], path: String): JArray = {
+    val stagesJson = stages
       .map(_.write.asInstanceOf[OpPipelineStageWriter].writeToJson(path))
       .filter(_.children.nonEmpty)
     JArray(stagesJson.toList)
   }
@@ -140,6 +162,7 @@ private[op] object OpWorkflowModelReadWriteShared { case object ResultFeaturesUids extends FieldNames("resultFeaturesUids") case object BlacklistedFeaturesUids extends FieldNames("blacklistedFeaturesUids") case object BlacklistedMapKeys extends FieldNames("blacklistedMapKeys") + case object BlacklistedStages extends FieldNames("blacklistedStages") case object Stages extends FieldNames("stages") case object AllFeatures extends FieldNames("allFeatures") case object Parameters extends FieldNames("parameters") diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala index 5e2a018238..ee8d1f1e20 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala @@ -49,11 +49,15 @@ trait RichDateFeature { /** * Convert to DateList feature + * * @return */ def toDateList(): FeatureLike[DateList] = { f.transformWith( - new UnaryLambdaTransformer[Date, DateList](operationName = "dateToList", _.value.toSeq.toDateList) + new UnaryLambdaTransformer[Date, DateList]( + operationName = "dateToList", + RichDateFeatureLambdas.toDateList + ) ) } @@ -70,7 +74,7 @@ trait RichDateFeature { * * @param timePeriod The time period to extract from the timestamp * @param others Other features of same type - * enum from: DayOfMonth, DayOfWeek, DayOfYear, HourOfDay, WeekOfMonth, WeekOfYear + * enum from: DayOfMonth, DayOfWeek, DayOfYear, HourOfDay, WeekOfMonth, WeekOfYear */ def toUnitCircle ( @@ -126,13 +130,14 @@ trait RichDateFeature { /** * Convert to DateTimeList feature + * * @return */ def toDateTimeList(): FeatureLike[DateTimeList] = { f.transformWith( new UnaryLambdaTransformer[DateTime, DateTimeList]( operationName = "dateTimeToList", - _.value.toSeq.toDateTimeList + RichDateFeatureLambdas.toDateTimeList ) ) } @@ -150,7 +155,7 @@ trait RichDateFeature { * * @param timePeriod The time period to extract from the timestamp * @param others Other features of same type - * enum from: DayOfMonth, DayOfWeek, DayOfYear, HourOfDay, WeekOfMonth, WeekOfYear + * enum from: DayOfMonth, DayOfWeek, DayOfYear, HourOfDay, WeekOfMonth, WeekOfYear */ def toUnitCircle( timePeriod: TimePeriod = TimePeriod.HourOfDay, @@ -197,3 +202,9 @@ trait RichDateFeature { } } + +object RichDateFeatureLambdas { + def toDateList: Date => DateList = (x: Date) => x.value.toSeq.toDateList + + def toDateTimeList: DateTime => DateTimeList = (x: DateTime) => x.value.toSeq.toDateTimeList +} diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichFeature.scala index 083caf3032..34f4d397ab 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichFeature.scala @@ -36,7 +36,7 @@ import com.salesforce.op.stages.base.binary.BinaryLambdaTransformer import com.salesforce.op.stages.base.quaternary.QuaternaryLambdaTransformer import com.salesforce.op.stages.base.ternary.TernaryLambdaTransformer import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer -import com.salesforce.op.stages.impl.feature.{AliasTransformer, ToOccurTransformer} +import com.salesforce.op.stages.impl.feature._ import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams import scala.reflect.runtime.universe.TypeTag @@ -73,7 +73,7 @@ trait RichFeature { * @return feature of type A */ def replaceWith(oldVal: A, newVal: A): FeatureLike[A] = { - map[A](a => if 
(oldVal == a) newVal else a) + feature.transformWith(new ReplaceTransformer[A](oldVal, newVal)) } /** @@ -132,40 +132,9 @@ trait RichFeature { * @return feature of type A */ def filter(p: A => Boolean, default: A): FeatureLike[A] = { - feature.transformWith( - new UnaryLambdaTransformer[A, A](operationName = "filter", transformFn = a => if (p(a)) a else default) - ) - } - - /** - * Filter feature[A] using the NOT predicate. - * Filtered out values are replaced with a default. - * - * @param p predicate A => Boolean - * @param default default value if predicate returns false - * @return feature of type A - */ - def filterNot(p: A => Boolean, default: A): FeatureLike[A] = { - filter(a => !p(a), default) + feature.transformWith(new FilterTransformer[A](p, default)) } - /** - * Filter & transform feature[A] => feature[B] using the partial function A => B. - * Filtered out values are replaced with a default. - * - * @param default default value if partial function is not defined - * @param pf partial function A => B - * @return feature of type B - */ - def collect[B <: FeatureType : TypeTag](default: B)(pf: PartialFunction[A, B]) - (implicit ttb: TypeTag[B#Value]): FeatureLike[B] = { - feature.transformWith( - new UnaryLambdaTransformer[A, B]( - operationName = "collect", - transformFn = a => if (pf.isDefinedAt(a)) pf(a) else default - ) - ) - } /** * Tests whether a predicate holds for feature[A] @@ -174,12 +143,7 @@ trait RichFeature { * @return feature[Binary] */ def exists(p: A => Boolean): FeatureLike[Binary] = { - feature.transformWith( - new UnaryLambdaTransformer[A, Binary]( - operationName = "exists", - transformFn = a => new Binary(p(a)) - ) - ) + feature.transformWith(new ExistsTransformer[A](p)) } /** diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala index 92091abb35..55c93d44f2 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala @@ -30,15 +30,16 @@ package com.salesforce.op.dsl +import com.salesforce.op.dsl.RichMapFeatureLambdas._ import com.salesforce.op.features.FeatureLike -import com.salesforce.op.features.types.{BinaryMap, _} -import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer +import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.feature._ import com.salesforce.op.utils.text.Language import org.apache.spark.ml.linalg.Vectors import scala.reflect.runtime.universe._ + trait RichMapFeature { /** @@ -1029,14 +1030,7 @@ trait RichMapFeature { maxPctCardinality: Double = OpOneHotVectorizer.MaxPctCardinality ): FeatureLike[OPVector] = { val domains: Array[FeatureLike[PickListMap]] = (f +: others).map { e => - val transformer = new OPMapTransformer[Email, PickList, EmailMap, PickListMap]( - operationName = "emailToPickListMap", - transformer = new UnaryLambdaTransformer[Email, PickList]( - operationName = "emailToPickList", - transformFn = _.domain.toPickList - ) - ) - transformer.setInput(e).getOutput() + new EmailToPickListMapTransformer().setInput(e).getOutput() } domains.head.vectorize( @@ -1081,14 +1075,7 @@ trait RichMapFeature { maxPctCardinality: Double = OpOneHotVectorizer.MaxPctCardinality ): FeatureLike[OPVector] = { val domains: Array[FeatureLike[PickListMap]] = (f +: others).map { e => - val transformer = - new UnaryLambdaTransformer[URLMap, PickListMap]( - operationName = "urlMapToPickListMap", - transformFn = _.value - .mapValues(v => if 
(v.toURL.isValid) v.toURL.domain else None) - .collect { case (k, Some(v)) => k -> v }.toPickListMap - ) - transformer.setInput(e).getOutput() + new UrlMapToPickListMapTransformer().setInput(e).getOutput() } domains.head.vectorize( @@ -1111,9 +1098,9 @@ trait RichMapFeature { * @return prediction, rawPrediction, probability */ def tupled(): (FeatureLike[RealNN], FeatureLike[OPVector], FeatureLike[OPVector]) = { - (f.map[RealNN](_.prediction.toRealNN), - f.map[OPVector]{ p => Vectors.dense(p.rawPrediction).toOPVector }, - f.map[OPVector]{ p => Vectors.dense(p.probability).toOPVector } + (f.map[RealNN](predictionToRealNN), + f.map[OPVector](predictionToRaw), + f.map[OPVector](predictionToProbability) ) } @@ -1131,3 +1118,15 @@ trait RichMapFeature { } } + +object RichMapFeatureLambdas { + + def predictionToRealNN: Prediction => RealNN = _.prediction.toRealNN + + def predictionToRaw: Prediction => OPVector = p => Vectors.dense(p.rawPrediction).toOPVector + + def predictionToProbability: Prediction => OPVector = p => Vectors.dense(p.probability).toOPVector + +} + + diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala index 0f53f609f9..fa175d8153 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala @@ -30,20 +30,13 @@ package com.salesforce.op.dsl -import java.util.regex.Pattern - +import com.salesforce.op.dsl.RichTextFeatureLambdas._ import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.feature._ import com.salesforce.op.utils.text._ -import org.apache.lucene.analysis.Analyzer -import org.apache.lucene.analysis.Analyzer.TokenStreamComponents -import org.apache.lucene.analysis.pattern.PatternTokenizer import scala.reflect.runtime.universe.TypeTag -import scala.util.Try - - trait RichTextFeature { self: RichFeature => @@ -55,7 +48,7 @@ trait RichTextFeature { * * @return A new MultiPickList feature */ - def toMultiPickList: FeatureLike[MultiPickList] = f.map[MultiPickList](_.value.toSet[String].toMultiPickList) + def toMultiPickList: FeatureLike[MultiPickList] = f.map[MultiPickList](textToMultiPickList) /** @@ -94,8 +87,8 @@ trait RichTextFeature { /** * Apply N-gram Similarity transformer * - * @param that other text feature - * @param nGramSize the size of the n-gram to be used to compute the string distance + * @param that other text feature + * @param nGramSize the size of the n-gram to be used to compute the string distance * @param toLowerCase lowercase before computing similarity * @return ngrammed feature */ @@ -186,7 +179,7 @@ trait RichTextFeature { case (false, true) => val nullIndicators = new TextListNullTransformer[TextList]().setInput(tokenized).getOutput() new VectorsCombiner().setInput(hashedFeatures, nullIndicators).getOutput() - case(false, false) => hashedFeatures + case (false, false) => hashedFeatures } } @@ -372,19 +365,10 @@ trait RichTextFeature { minTokenLength: Int = TextTokenizer.MinTokenLength, toLowercase: Boolean = TextTokenizer.ToLowercase ): FeatureLike[TextList] = { - require(Try(Pattern.compile(pattern)).isSuccess, s"Invalid regex pattern: $pattern") - - // A simple Lucene analyzer with regex Pattern Tokenizer - val analyzer = new LuceneTextAnalyzer(analyzers = lang => new Analyzer { - def createComponents(fieldName: String): TokenStreamComponents = { - val regex = Pattern.compile(pattern) - val source = new 
PatternTokenizer(regex, group) - new TokenStreamComponents(source) - } - }) + tokenize( languageDetector = TextTokenizer.LanguageDetector, - analyzer = analyzer, + analyzer = new LuceneRegexTextAnalyzer(pattern, group), autoDetectLanguage = false, autoDetectThreshold = 1.0, defaultLanguage = Language.Unknown, @@ -437,7 +421,8 @@ trait RichTextFeature { /** * Check if feature is a substring of the companion feature - * @param f2 feature which would contain the first input as a substring + * + * @param f2 feature which would contain the first input as a substring * @param toLowercase lowercase before checking for substrings * @tparam T2 type tag of second feature * @return Binary feature indicating if substring was found @@ -575,17 +560,18 @@ trait RichTextFeature { * * @return email prefix */ - def toEmailPrefix: FeatureLike[Text] = f.map[Text](_.prefix.toText, "prefix") + def toEmailPrefix: FeatureLike[Text] = f.map[Text](emailToPrefix, "prefix") /** * Extract email domains * * @return email domain */ - def toEmailDomain: FeatureLike[Text] = f.map[Text](_.domain.toText, "domain") + def toEmailDomain: FeatureLike[Text] = f.map[Text](emailToDomain, "domain") /** * Check if email is valid + * * @return binary feature containing boolean value of whether email was valid format */ def isValidEmail: FeatureLike[Binary] = f.transformWith(new ValidEmailTransformer()) @@ -614,7 +600,7 @@ trait RichTextFeature { others: Array[FeatureLike[Email]] = Array.empty, maxPctCardinality: Double = OpOneHotVectorizer.MaxPctCardinality ): FeatureLike[OPVector] = { - val domains = (f +: others).map(_.map[PickList](_.domain.toPickList)) + val domains = (f +: others).map(_.map[PickList](emailToPickList)) domains.head.pivot(others = domains.tail, topK = topK, minSupport = minSupport, cleanText = cleanText, trackNulls = trackNulls, maxPctCardinality = maxPctCardinality ) @@ -627,27 +613,19 @@ trait RichTextFeature { /** * Extract url domain, i.e. salesforce.com, data.com etc. */ - def toDomain: FeatureLike[Text] = f.map[Text](_.domain.toText, "urlDomain") + def toDomain: FeatureLike[Text] = f.map[Text](urlToDomain, "urlDomain") /** * Extracts url protocol, i.e. http, https, ftp etc. */ - def toProtocol: FeatureLike[Text] = f.map[Text](_.protocol.toText, "urlProtocol") + def toProtocol: FeatureLike[Text] = f.map[Text](urlToProtocol, "urlProtocol") /** * Verifies if the url is of correct form of "Uniform Resource Identifiers (URI): Generic Syntax" * RFC2396 (http://www.ietf.org/rfc/rfc2396.txt) * Default valid protocols are: http, https, ftp. */ - def isValidUrl: FeatureLike[Binary] = f.exists(_.isValid) - - /** - * Verifies if the url is of correct form of "Uniform Resource Identifiers (URI): Generic Syntax" - * RFC2396 (http://www.ietf.org/rfc/rfc2396.txt) - * - * @param protocols url protocols to consider valid, i.e. http, https, ftp etc. 
- */ - def isValidUrl(protocols: Array[String]): FeatureLike[Binary] = f.exists(_.isValid(protocols)) + def isValidUrl: FeatureLike[Binary] = f.exists(urlIsValid) /** * Converts a sequence of [[URL]] features into a vector, extracting the domains of the valid urls @@ -672,7 +650,7 @@ trait RichTextFeature { others: Array[FeatureLike[URL]] = Array.empty, maxPctCardinality: Double = OpOneHotVectorizer.MaxPctCardinality ): FeatureLike[OPVector] = { - val domains = (f +: others).map(_.map[PickList](v => if (v.isValid) v.domain.toPickList else PickList.empty)) + val domains = (f +: others).map(_.map[PickList](urlToPickList)) domains.head.pivot(others = domains.tail, topK = topK, minSupport = minSupport, cleanText = cleanText, trackNulls = trackNulls, maxPctCardinality = maxPctCardinality ) @@ -719,7 +697,7 @@ trait RichTextFeature { ): FeatureLike[OPVector] = { val feats: Array[FeatureLike[PickList]] = - (f +: others).map(_.detectMimeTypes(typeHint).map[PickList](_.value.toPickList)) + (f +: others).map(_.detectMimeTypes(typeHint).map[PickList](textToPickList)) feats.head.vectorize( topK = topK, minSupport = minSupport, cleanText = cleanText, trackNulls = trackNulls, others = feats.tail, @@ -820,3 +798,25 @@ trait RichTextFeature { } } + +object RichTextFeatureLambdas { + + def emailToPickList: Email => PickList = _.domain.toPickList + + def emailToPrefix: Email => Text = _.prefix.toText + + def emailToDomain: Email => Text = _.domain.toText + + def urlToPickList: URL => PickList = (v: URL) => if (v.isValid) v.domain.toPickList else PickList.empty + + def urlToDomain: URL => Text = _.domain.toText + + def urlToProtocol: URL => Text = _.protocol.toText + + def urlIsValid: URL => Boolean = _.isValid + + def textToPickList: Text => PickList = _.value.toPickList + + def textToMultiPickList: Text => MultiPickList = _.value.toSet[String].toMultiPickList + +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/EmailToPickListMapTransformer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/EmailToPickListMapTransformer.scala new file mode 100644 index 0000000000..748bdb4058 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/EmailToPickListMapTransformer.scala @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.UID +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer + + +class EmailToPickListMapTransformer(uid: String = UID[EmailToPickListMapTransformer]) + extends OPMapTransformer[Email, PickList, EmailMap, PickListMap]( + uid = uid, + operationName = "emailToPickListMap", + transformer = new UnaryLambdaTransformer[Email, PickList]( + operationName = "emailToPickList", + transformFn = EmailToPickListMapTransformer.emailToPickList + ) + ) + +object EmailToPickListMapTransformer { + def emailToPickList: Email => PickList = email => email.domain.toPickList +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/ExistsTransformer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/ExistsTransformer.scala new file mode 100644 index 0000000000..58502c87a6 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/ExistsTransformer.scala @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{Binary, FeatureType} +import com.salesforce.op.stages.base.unary.UnaryTransformer + +import scala.reflect.runtime.universe.TypeTag + + +class ExistsTransformer[I <: FeatureType] +( + p: I => Boolean, + uid: String = UID[ExistsTransformer[_]], + operationName: String = "exists" +)(implicit tti: TypeTag[I]) + extends UnaryTransformer[I, Binary](uid = uid, operationName = operationName) { + + override def transformFn: I => Binary = a => new Binary(p(a)) +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/FilterTransformer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/FilterTransformer.scala new file mode 100644 index 0000000000..38e1fb9025 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/FilterTransformer.scala @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{Binary, FeatureType} +import com.salesforce.op.stages.base.unary.UnaryTransformer +import scala.reflect.runtime.universe.TypeTag + + +class FilterTransformer[I <: FeatureType] +( + p: I => Boolean, + default: I, + uid: String = UID[FilterTransformer[_]], + operationName: String = "filter" +)(implicit tti: TypeTag[I], ttov: TypeTag[I#Value]) + extends UnaryTransformer[I, I](uid = uid, operationName = operationName) { + + override def transformFn: I => I = a => if (p(a)) a else default +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/ReplaceTransformer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/ReplaceTransformer.scala new file mode 100644 index 0000000000..8e396bd47f --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/ReplaceTransformer.scala @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{Binary, FeatureType} +import com.salesforce.op.stages.base.unary.UnaryTransformer + +import scala.reflect.runtime.universe.TypeTag + +class ReplaceTransformer[I <: FeatureType] +( + oldVal: I, + newVal: I, + uid: String = UID[ReplaceTransformer[_]], + operationName: String = "replaceWith" +)(implicit tti: TypeTag[I], ttov: TypeTag[I#Value]) + extends UnaryTransformer[I, I](uid = uid, operationName = operationName) { + + override def transformFn: I => I = a => if (oldVal == a) newVal else a +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/ScalerTransformer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/ScalerTransformer.scala index c6a04e4521..437737d02a 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/ScalerTransformer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/ScalerTransformer.scala @@ -35,29 +35,12 @@ import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryTransformer import com.salesforce.op.utils.json.{JsonLike, JsonUtils} import org.apache.spark.sql.types.{Metadata, MetadataBuilder} +import org.json4s.JsonAST.{JField, JNothing} +import org.json4s.{CustomSerializer, JObject} import scala.reflect.runtime.universe.TypeTag import scala.util.{Failure, Try} - -/** - * A trait to be extended by a case class containing the args needed to define a family of scaling & descaling functions - */ -trait ScalingArgs extends JsonLike - -/** - * Case class for Scaling families that take no parameters - */ -case class EmptyArgs() extends ScalingArgs - -/** - * Parameters need to uniquely define a linear scaling function - * - * @param slope the slope of the linear scaler - * @param intercept the x axis intercept of the linear scaler - */ -case class LinearScalerArgs(slope: Double, intercept: Double) extends ScalingArgs - /** * A trait 
for defining a new family of scaling functions * scalingType: a ScalingType Enum for the scaling name @@ -98,7 +81,7 @@ object Scaler { */ case class LogScaler() extends Scaler { val scalingType: ScalingType = ScalingType.Logarithmic - val args: ScalingArgs = EmptyArgs() + val args: ScalingArgs = EmptyScalerArgs() def scale(v: Double): Double = math.log(v) def descale(v: Double): Double = math.exp(v) } @@ -140,7 +123,7 @@ object ScalerMetadata extends { case t@ScalingType.Linear => JsonUtils.fromString[LinearScalerArgs](args).map(ScalerMetadata(t, _)) case t@ScalingType.Logarithmic => - JsonUtils.fromString[EmptyArgs](args).map(ScalerMetadata(t, _)) + JsonUtils.fromString[EmptyScalerArgs](args).map(ScalerMetadata(t, _)) case t => Failure(new IllegalArgumentException(s"Unsupported scaling type $t")) } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala index a7732355ea..fe43313b2d 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala @@ -34,10 +34,14 @@ import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryTransformer import com.salesforce.op.stages.impl.feature.TextTokenizer.TextTokenizerResult +import com.salesforce.op.stages.{OpPipelineStageReaderWriter, ReaderWriter} import com.salesforce.op.utils.text.{Language, _} import org.apache.spark.ml.param._ +import org.json4s.{JObject, JValue} +import org.json4s.JsonDSL._ import scala.reflect.runtime.universe.TypeTag +import scala.util.Try trait LanguageDetectionParams extends Params { @@ -111,6 +115,7 @@ trait TextTokenizerParams extends LanguageDetectionParams with TextMatchingParam * @param analyzer a text analyzer instance (defaults to a [[LuceneTextAnalyzer]]) * @param uid uid of the stage */ +@ReaderWriter(classOf[TextTokenizerReaderWriter[_ <: Text]]) class TextTokenizer[T <: Text] ( val languageDetector: LanguageDetector = TextTokenizer.LanguageDetector, @@ -124,7 +129,7 @@ class TextTokenizer[T <: Text] object TextTokenizer { val LanguageDetector: LanguageDetector = new OptimaizeLanguageDetector() val Analyzer: TextAnalyzer = new LuceneTextAnalyzer() - val AnalyzerHtmlStrip: TextAnalyzer = new LuceneTextAnalyzer(LuceneTextAnalyzer.withHtmlStripping) + val AnalyzerHtmlStrip: TextAnalyzer = new LuceneHtmlStripTextAnalyzer() val AutoDetectLanguage = false val AutoDetectThreshold = 0.99 val DefaultLanguage: Language = Language.Unknown @@ -194,3 +199,62 @@ object TextTokenizer { def tokens: TextList = sentences.flatMap(_.value).toTextList } } + + +/** + * Special reader/writer class for [[TextTokenizer]] stage + */ +class TextTokenizerReaderWriter[T <: Text] extends OpPipelineStageReaderWriter[TextTokenizer[T]] { + + /** + * Read stage from json + * + * @param stageClass stage class + * @param json json to read stage from + * @return read result + */ + def read(stageClass: Class[TextTokenizer[T]], json: JValue): Try[TextTokenizer[T]] = Try { + val languageDetector = ((json \ "languageDetector").extract[JObject] \ "className").extract[String] match { + case c if c == classOf[OptimaizeLanguageDetector].getName => new OptimaizeLanguageDetector + } + val analyzerJson = (json \ "analyzer").extract[JObject] + val analyzer = (analyzerJson \ "className").extract[String] match { + case c if c == classOf[LuceneRegexTextAnalyzer].getName => + new 
LuceneRegexTextAnalyzer( + pattern = (analyzerJson \ "pattern").extract[String], + group = (analyzerJson \ "group").extract[Int] + ) + case c if c == classOf[LuceneHtmlStripTextAnalyzer].getName => new LuceneHtmlStripTextAnalyzer + case c if c == classOf[LuceneTextAnalyzer].getName => new LuceneTextAnalyzer + case c if c == classOf[OpenNLPAnalyzer].getName => new OpenNLPAnalyzer + case c => throw new RuntimeException(s"Unknown text analyzer class: $c") + } + val tti = FeatureType.featureTypeTag((json \ "tti").extract[String]).asInstanceOf[TypeTag[T]] + + new TextTokenizer[T]( + uid = (json \ "uid").extract[String], + languageDetector = languageDetector, + analyzer = analyzer + )(tti) + } + + /** + * Write stage to json + * + * @param stage stage instance to write + * @return write result + */ + def write(stage: TextTokenizer[T]): Try[JValue] = Try { + val analyzer: JValue = stage.analyzer match { + case r: LuceneRegexTextAnalyzer => + ("className" -> r.getClass.getName) ~ ("pattern" -> r.pattern) ~ ("group" -> r.group) + case _ => + "className" -> stage.analyzer.getClass.getName + } + ("uid" -> stage.uid) ~ + ("tti" -> FeatureType.typeName(stage.tti)) ~ + ("languageDetector" -> ("className" -> stage.languageDetector.getClass.getName)) ~ + ("analyzer" -> analyzer) + } + +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/ToOccurTransformer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/ToOccurTransformer.scala index 6cbe5558d4..066e780a58 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/ToOccurTransformer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/ToOccurTransformer.scala @@ -36,15 +36,6 @@ import com.salesforce.op.stages.base.unary.UnaryTransformer import scala.reflect.runtime.universe.TypeTag -object ToOccurTransformer { - private def defaultMatches[T <: FeatureType](value: T): Boolean = value match { - case num: OPNumeric[_] if num.nonEmpty => num.toDouble.get > 0.0 - case text: Text if text.nonEmpty => text.value.get.length > 0 - case collection: OPCollection => collection.nonEmpty - case _ => false - } -} - /** * Transformer that converts input feature of type I into doolean feature using a user specified function that * maps object type I to a Boolean @@ -56,7 +47,7 @@ object ToOccurTransformer { class ToOccurTransformer[I <: FeatureType] ( uid: String = UID[ToOccurTransformer[I]], - val matchFn: I => Boolean = ToOccurTransformer.defaultMatches[I] _ + val matchFn: I => Boolean = ToOccurTransformer.defaultMatches[I] )(implicit tti: TypeTag[I]) extends UnaryTransformer[I, RealNN](operationName = "toOccur", uid = uid) { @@ -65,3 +56,15 @@ class ToOccurTransformer[I <: FeatureType] def transformFn: I => RealNN = (value: I) => if (matchFn(value)) yes else no } + + +object ToOccurTransformer { + + def defaultMatches[T <: FeatureType]: T => Boolean = { + case num: OPNumeric[_] if num.nonEmpty => num.toDouble.get > 0.0 + case text: Text if text.nonEmpty => text.value.get.length > 0 + case collection: OPCollection => collection.nonEmpty + case _ => false + } + +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/UrlMapToPickListMapTransformer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/UrlMapToPickListMapTransformer.scala new file mode 100644 index 0000000000..eb8c35f2ed --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/UrlMapToPickListMapTransformer.scala @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{Email, EmailMap, PickList, PickListMap, _} +import com.salesforce.op.stages.base.unary.{UnaryLambdaTransformer, UnaryTransformer} + +class UrlMapToPickListMapTransformer(uid: String = UID[UrlMapToPickListMapTransformer]) + extends UnaryTransformer[URLMap, PickListMap](operationName = "urlMapToPickListMap", uid = uid) { + + override def transformFn: URLMap => PickListMap = _.value + .mapValues(v => if (v.toURL.isValid) v.toURL.domain else None) + .collect { case (k, Some(v)) => k -> v }.toPickListMap + +} diff --git a/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala b/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala index 98f9790f06..5580ce83f2 100644 --- a/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala +++ b/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala @@ -32,8 +32,10 @@ package com.salesforce.op.utils.text import java.io.Reader import java.nio.charset.StandardCharsets +import java.util.regex.Pattern import com.salesforce.op.utils.text.Language._ +import org.apache.lucene.analysis.Analyzer.TokenStreamComponents import org.apache.lucene.analysis._ import org.apache.lucene.analysis.ar.ArabicAnalyzer import org.apache.lucene.analysis.bg.BulgarianAnalyzer @@ -64,6 +66,7 @@ import org.apache.lucene.analysis.lt.LithuanianAnalyzer import org.apache.lucene.analysis.lv.LatvianAnalyzer import org.apache.lucene.analysis.nl.DutchAnalyzer import org.apache.lucene.analysis.no.NorwegianAnalyzer +import org.apache.lucene.analysis.pattern.PatternTokenizer import org.apache.lucene.analysis.pt.PortugueseAnalyzer import org.apache.lucene.analysis.ro.RomanianAnalyzer import org.apache.lucene.analysis.ru.RussianAnalyzer @@ -76,17 +79,19 @@ import org.apache.lucene.analysis.tr.TurkishAnalyzer import org.apache.lucene.util.IOUtils import 
scala.collection.mutable.ArrayBuffer +import scala.util.Try /** * Text analyzer implementation using a Lucene analyzer - * - * @param analyzers Lucene analyzer factory to use (defaults to [[LuceneTextAnalyzer]]) */ -class LuceneTextAnalyzer -( - // use lambda to workaround a non serializable analyzer - analyzers: Language => Analyzer = LuceneTextAnalyzer.apply -) extends TextAnalyzer { +class LuceneTextAnalyzer extends TextAnalyzer { + + /** + * Lucene analyzer factory to use (defaults to [[LuceneTextAnalyzer]]) + * @param lang desired language + * @return language specific language analyzer + */ + def analyzers(lang: Language): Analyzer = LuceneTextAnalyzer.apply(lang) /** * Analyze a text and produce tokens @@ -118,6 +123,34 @@ class LuceneTextAnalyzer } +/** + * Text analyzer implementation using a Lucene analyzer with HTML stripping applied + */ +class LuceneHtmlStripTextAnalyzer extends LuceneTextAnalyzer { + override def analyzers(lang: Language): Analyzer = LuceneTextAnalyzer.withHtmlStripping(lang) +} + +/** + * Text analyzer implementation using a Lucene analyzer with Pattern Tokenizer matching + * + * @param pattern is the regular expression + * @param group selects the matching group as the token (default: -1, which is equivalent to "split". + */ +class LuceneRegexTextAnalyzer(val pattern: String, val group: Int = -1) extends LuceneTextAnalyzer { + require(Try(Pattern.compile(pattern)).isSuccess, s"Invalid regex pattern: $pattern") + + private lazy val analyzer: Analyzer = new Analyzer { + def createComponents(fieldName: String): TokenStreamComponents = { + val regex = Pattern.compile(pattern) + val source = new PatternTokenizer(regex, group) + new TokenStreamComponents(source) + } + } + + override def analyzers(lang: Language): Analyzer = analyzer +} + + /** * Creates a Lucene Analyzer for a specific language or falls back to [[StandardAnalyzer]] */ diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala index 4cc62d85a0..d65b5c2352 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala @@ -67,7 +67,6 @@ class OpWorkflowModelReaderWriterTest ) var saveFlowPath: String = _ var saveModelPath: String = _ - val saveFlowPathStable: String = tempDir + "/op-rw-wf-test-" + DateTime.now().getMillis override protected def beforeEach(): Unit = { @@ -142,7 +141,7 @@ class OpWorkflowModelReaderWriterTest } trait SwSingleStageFlow { - val vec = FeatureBuilder.OPVector[Passenger].extract(_ => OPVector.empty).asPredictor + val vec = FeatureBuilder.OPVector[Passenger].extract(OpWorkflowModelReaderWriterTest.emptyVectorFn).asPredictor val scaler = new StandardScaler().setWithStd(false).setWithMean(false) val schema = FeatureSparkTypes.toStructType(vec) val data = spark.createDataFrame(List(Row(Vectors.dense(1.0))).asJava, schema) @@ -157,26 +156,37 @@ class OpWorkflowModelReaderWriterTest val (wfM, jsonModel) = makeModelAndJson(wf) } + trait OldVectorizedFlow extends UIDReset { + val cat = Seq(gender, boarded, height, age, description).transmogrify() + val catHead = cat.map[Real](OpWorkflowModelReaderWriterTest.catHeadFn) + val wf = new OpWorkflow().setParameters(workflowParams).setResultFeatures(catHead) + } + + trait VectorizedFlow extends UIDReset { + val catHead = rawFeatures.transmogrify().map[Real](OpWorkflowModelReaderWriterTest.catHeadFn) + val wf = new 
OpWorkflow().setParameters(workflowParams).setResultFeatures(catHead) + } + "Single Stage OpWorkflowWriter" should "have proper json entries" in new SingleStageFlow { val modelKeys = jsonModel.extract[Map[String, Any]].keys modelKeys should contain theSameElementsAs OpWorkflowModelReadWriteShared.FieldNames.values.map(_.entryName) } - it should "have correct result id" in new SingleStageFlow { - val idsM = (jsonModel \ ResultFeaturesUids.entryName).extract[Array[String]] - idsM should contain theSameElementsAs Array(density.uid) - } - - it should "have a single stage" in new SingleStageFlow { + it should "recover all stages" in new SingleStageFlow { val stagesM = (jsonModel \ Stages.entryName).extract[JArray] - stagesM.values.size shouldBe 1 + stagesM.values.size shouldBe 3 } - it should "have 3 features" in new SingleStageFlow { + it should "recover all features" in new SingleStageFlow { val featsM = (jsonModel \ AllFeatures.entryName).extract[JArray] featsM.values.size shouldBe 3 } + it should "have the correct results feature ids" in new SingleStageFlow { + val idsM = (jsonModel \ ResultFeaturesUids.entryName).extract[Array[String]] + idsM should contain theSameElementsAs Array(density.uid) + } + it should "have correct uid" in new SingleStageFlow { val uidM = (jsonModel \ Uid.entryName).extract[String] uidM shouldBe wf.uid @@ -188,12 +198,12 @@ class OpWorkflowModelReaderWriterTest paramsM.stageParams shouldBe workflowParams.stageParams } - "MultiStage OpWorkflowWriter" should "recover all relevant stages" in new MultiStageFlow { + "MultiStage OpWorkflowWriter" should "recover all stages" in new MultiStageFlow { val stagesM = (jsonModel \ Stages.entryName).extract[JArray] - stagesM.values.size shouldBe 2 + stagesM.values.size shouldBe 4 } - it should "recover all relevant features" in new MultiStageFlow { + it should "recover all features" in new MultiStageFlow { val featsM = (jsonModel \ AllFeatures.entryName).extract[JArray] featsM.values.size shouldBe 4 } @@ -203,9 +213,9 @@ class OpWorkflowModelReaderWriterTest idsM should contain theSameElementsAs Array(density.uid, weight2.uid) } - "Raw feature only OpWorkflowWriter" should "recover no stages" in new RawFeatureFlow { + "Raw feature only OpWorkflowWriter" should "recover a single stage" in new RawFeatureFlow { val stagesM = (jsonModel \ Stages.entryName).extract[JArray] - stagesM.values.length shouldBe 0 + stagesM.values.length shouldBe 1 } it should "recover raw feature in feature list" in new RawFeatureFlow { @@ -220,57 +230,42 @@ class OpWorkflowModelReaderWriterTest Spec[OpWorkflowModelReader] should "load proper single stage workflow" in new SingleStageFlow { wfM.save(saveModelPath) - val wfMR = wf.loadModel(saveModelPath) - compareWorkflowModels(wfMR, wfM) + val wfMR = wf.loadModel(saveModelPath).setReader(wfM.getReader()) + assert(wfMR, wfM) } it should "load proper multiple stage workflow" in new MultiStageFlow { wfM.save(saveModelPath) - val wfMR = wf.loadModel(saveModelPath) - compareWorkflowModels(wfMR, wfM) + val wfMR = wf.loadModel(saveModelPath).setReader(wfM.getReader()) + assert(wfMR, wfM) } it should "load proper raw feature workflow" in new RawFeatureFlow { wfM.save(saveModelPath) - val wfMR = wf.loadModel(saveModelPath) - compareWorkflowModels(wfMR, wfM) + val wfMR = wf.loadModel(saveModelPath).setReader(wfM.getReader()) + assert(wfMR, wfM) } it should "load proper workflow with spark wrapped stages" in new SwSingleStageFlow { wfM.save(saveModelPath) - val wfMR = wf.loadModel(saveModelPath) - 
compareWorkflowModels(wfMR, wfM) + val wfMR = wf.loadModel(saveModelPath).setReader(wfM.getReader()) + assert(wfMR, wfM) } it should "work for models" in new SingleStageFlow { wf.setReader(dataReader) val model = wf.train() model.save(saveFlowPath) - val wfMR = wf.loadModel(saveFlowPath) - compareWorkflowModels(model, wfMR) - } - - trait OldVectorizedFlow extends UIDReset { - val cat = Seq(gender, boarded, height, age, description).transmogrify() - val catHead = cat.map[Real](v => Real(v.value.toArray.headOption)) - val wf = new OpWorkflow() - .setParameters(workflowParams) - .setResultFeatures(catHead) - } - - trait VectorizedFlow extends UIDReset { - val catHead = rawFeatures.transmogrify().map[Real](v => Real(v.value.toArray.headOption)) - val wf = new OpWorkflow() - .setParameters(workflowParams) - .setResultFeatures(catHead) + val wfMR = wf.loadModel(saveFlowPath).setReader(dataReader) + assert(model, wfMR) } it should "load workflow model with vectorized feature" in new VectorizedFlow { wf.setReader(dataReader) val wfM = wf.train() wfM.save(saveFlowPath) - val wfMR = wf.loadModel(saveFlowPath) - compareWorkflowModels(wfMR, wfM) + val wfMR = wf.loadModel(saveFlowPath).setReader(dataReader) + assert(wfMR, wfM) } it should "save a workflow model that has a RawFeatureFilter with correct blacklists" in new VectorizedFlow { @@ -281,48 +276,52 @@ class OpWorkflowModelReaderWriterTest val wfM = wf.train() wfM.save(saveFlowPathStable) wf.getBlacklist().map(_.name) should contain theSameElementsAs - Array("age", "boarded", "description", "gender", "height", "weight") + Seq(age, boarded, description, gender, height, weight).map(_.name) wf.getBlacklistMapKeys() shouldBe - Map("booleanMap" -> Set("Male"), "stringMap" -> Set("Male"), "numericMap" -> Set("Male")) + Map(booleanMap.name -> Set("Male"), stringMap.name -> Set("Male"), numericMap.name -> Set("Male")) - val wfMR = wf.loadModel(saveFlowPathStable) - compareWorkflowModels(wfM, wfMR) + val wfMR = wf.loadModel(saveFlowPathStable).setReader(wfM.getReader()) + assert(wfM, wfMR) } it should "load a workflow model that has a RawFeatureFilter and a different workflow" in new VectorizedFlow { val wfM = wf.loadModel(saveFlowPathStable) wf.getResultFeatures().head.name shouldBe wfM.getResultFeatures().head.name - wf.getResultFeatures().head.history().originFeatures should contain theSameElementsAs - Array("age", "boarded", "booleanMap", "description", "gender", "height", "numericMap", - "stringMap", "survived", "weight") + wf.getResultFeatures().head.history().originFeatures should contain theSameElementsAs rawFeatures.map(_.name) wfM.getResultFeatures().head.history().originFeatures should contain theSameElementsAs - Array("booleanMap", "numericMap", "stringMap", "survived") + Seq(booleanMap, numericMap, stringMap, survived).map(_.name) wfM.getBlacklist().map(_.name) should contain theSameElementsAs - Array("age", "boarded", "description", "gender", "height", "weight") + Seq(age, boarded, description, gender, height, weight).map(_.name) + } + + it should "load a workflow model that has a RawFeatureFilter without workflow" in new VectorizedFlow { + val wfM = OpWorkflowModel.load(saveFlowPathStable) + wf.getResultFeatures().head.name shouldBe wfM.getResultFeatures().head.name + wf.getResultFeatures().head.history().originFeatures should contain theSameElementsAs rawFeatures.map(_.name) + wfM.getResultFeatures().head.history().originFeatures should contain theSameElementsAs + Seq(booleanMap, numericMap, stringMap, survived).map(_.name) + 
wfM.getBlacklist().map(_.name) should contain theSameElementsAs + Seq(age, boarded, description, gender, height, weight).map(_.name) } it should "load model and allow copying it" in new VectorizedFlow { val wfM = wf.loadModel(saveFlowPathStable).setReader(dataReader) - val copy = wfM.copy() - copy.uid shouldBe wfM.uid - copy.trainingParams.toString shouldBe wfM.trainingParams.toString - copy.isWorkflowCV shouldBe wfM.isWorkflowCV - copy.getReader() shouldBe wfM.getReader() - copy.getResultFeatures() shouldBe wfM.getResultFeatures() - copy.getRawFeatures() shouldBe wfM.getRawFeatures() - copy.getBlacklist() shouldBe wfM.getBlacklist() - copy.getBlacklistMapKeys() shouldBe wfM.getBlacklistMapKeys() - copy.getRawFeatureFilterResults() shouldBe wfM.getRawFeatureFilterResults() - copy.getStages().map(_.uid) shouldBe wfM.getStages().map(_.uid) - copy.getParameters().toString shouldBe wfM.getParameters().toString - } - - it should "be able to load a old version of a saved model" in new OldVectorizedFlow { + val copy = wfM.copy().setReader(dataReader) + assert(copy, wfM) + } + + it should "load model without workflow and allow copying it" in { + val wfM = OpWorkflowModel.load(saveFlowPathStable).setReader(dataReader) + val copy = wfM.copy().setReader(dataReader) + assert(copy, wfM) + } + + it should "load an old version of a saved model" in new OldVectorizedFlow { val wfM = wf.loadModel("src/test/resources/OldModelVersion") wfM.getBlacklist().isEmpty shouldBe true } - it should "be able to load a old version of a saved model (v0.5.1)" in new OldVectorizedFlow { + it should "load an old version of a saved model (v0.5.1)" in new OldVectorizedFlow { // note: in these old models, raw feature filter config will be set to the config defaults // but we never re-initialize raw feature filter when loading a model (only scoring, no training) val wfM = wf.loadModel("src/test/resources/OldModelVersion_0_5_1") @@ -330,57 +329,52 @@ class OpWorkflowModelReaderWriterTest wfM.getRawFeatureFilterResults().exclusionReasons shouldBe empty } - it should "error on loading a model without workflow" in { - val error = intercept[RuntimeException](OpWorkflowModel.load(saveFlowPathStable)) - error.getMessage should startWith("Failed to load Workflow from path") - error.getCause.isInstanceOf[NotImplementedError] shouldBe true - error.getCause.getMessage shouldBe "Loading models without the original workflow is currently not supported" - } - - def compareFeatures(f1: Array[OPFeature], f2: Array[OPFeature]): Unit = { + def assert(f1: Array[OPFeature], f2: Array[OPFeature]): Unit = { f1.length shouldBe f2.length f1.sortBy(_.uid) should contain theSameElementsAs f2.sortBy(_.uid) } - // Ordering of stages is important - def compareStages(stages1: Array[OPStage], stages2: Array[OPStage]): Unit = { + def assert(stages1: Array[OPStage], stages2: Array[OPStage]): Unit = { stages1.length shouldBe stages2.length + // Ordering of stages is important stages1.zip(stages2).foreach { case (s1, s2) => { s1.uid shouldBe s2.uid - compareFeatures(s1.getInputFeatures(), s2.getInputFeatures()) + assert(s1.getInputFeatures(), s2.getInputFeatures()) val s1Feats: Array[OPFeature] = Array(s1.getOutput()) val s2Feats: Array[OPFeature] = Array(s2.getOutput()) - compareFeatures(s1Feats, s2Feats) + assert(s1Feats, s2Feats) } } } - def compareWorkflows(wf1: OpWorkflow, wf2: OpWorkflow): Unit = { + def assert(wf1: OpWorkflow, wf2: OpWorkflow): Unit = { wf1.uid shouldBe wf2.uid - compareParams(wf1.getParameters(), wf2.getParameters()) - 
compareFeatures(wf1.getResultFeatures(), wf2.getResultFeatures()) - compareFeatures(wf1.getBlacklist(), wf2.getBlacklist()) - compareFeatures(wf1.getRawFeatures(), wf2.getRawFeatures()) - compareStages(wf1.getStages(), wf2.getStages()) + assert(wf1.getParameters(), wf2.getParameters()) + assert(wf1.getResultFeatures(), wf2.getResultFeatures()) + assert(wf1.getBlacklist(), wf2.getBlacklist()) + assert(wf1.getRawFeatures(), wf2.getRawFeatures()) + assert(wf1.getStages(), wf2.getStages()) wf1.getBlacklistMapKeys() shouldBe wf2.getBlacklistMapKeys() RawFeatureFilterResultsComparison.compare(wf1.getRawFeatureFilterResults(), wf2.getRawFeatureFilterResults()) } - def compareWorkflowModels(wf1: OpWorkflowModel, wf2: OpWorkflowModel): Unit = { - wf1.uid shouldBe wf2.uid - compareParams(wf1.trainingParams, wf2.trainingParams) - compareParams(wf1.getParameters(), wf2.getParameters()) - compareFeatures(wf1.getResultFeatures(), wf2.getResultFeatures()) - compareFeatures(wf1.getBlacklist(), wf2.getBlacklist()) - compareFeatures(wf1.getRawFeatures(), wf2.getRawFeatures()) - compareStages(wf1.getStages(), wf2.getStages()) - wf1.getBlacklistMapKeys() shouldBe wf2.getBlacklistMapKeys() - RawFeatureFilterResultsComparison.compare(wf1.getRawFeatureFilterResults(), wf2.getRawFeatureFilterResults()) + def assert(wfm1: OpWorkflowModel, wfm2: OpWorkflowModel): Unit = { + wfm1.uid shouldBe wfm2.uid + assert(wfm1.trainingParams, wfm2.trainingParams) + assert(wfm1.getParameters(), wfm2.getParameters()) + wfm1.isWorkflowCV shouldBe wfm2.isWorkflowCV + wfm1.getReader() shouldBe wfm2.getReader() + assert(wfm1.getResultFeatures(), wfm2.getResultFeatures()) + assert(wfm1.getRawFeatures(), wfm2.getRawFeatures()) + assert(wfm1.getBlacklist(), wfm2.getBlacklist()) + wfm1.getBlacklistMapKeys() shouldBe wfm2.getBlacklistMapKeys() + assert(wfm1.getStages(), wfm2.getStages()) + RawFeatureFilterResultsComparison.compare(wfm1.getRawFeatureFilterResults(), wfm2.getRawFeatureFilterResults()) } - def compareParams(p1: OpParams, p2: OpParams): Unit = { + def assert(p1: OpParams, p2: OpParams): Unit = { p1.stageParams shouldBe p2.stageParams p1.readerParams.toString() shouldBe p2.readerParams.toString() p1.customParams shouldBe p2.customParams @@ -390,3 +384,8 @@ class OpWorkflowModelReaderWriterTest trait UIDReset { UID.reset() } + +object OpWorkflowModelReaderWriterTest { + def catHeadFn: OPVector => Real = v => Real(v.value.toArray.headOption) + def emptyVectorFn: Passenger => OPVector = _ => OPVector.empty +} diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala index 910607520d..3a51c1b1f7 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala @@ -52,7 +52,7 @@ import org.apache.spark.sql.types.{DoubleType, StringType} import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.joda.time.DateTime import org.junit.runner.RunWith -import org.scalatest.FlatSpec +import org.scalatest.{Assertion, FlatSpec} import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory @@ -543,17 +543,19 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { val expectedScores = expectedScoresDF.select(prediction.name, KeyFieldName).sort(KeyFieldName).collect() model.save(workflowLocation) - // Load and score the model - val loaded = workflow.loadModel(workflowLocation) - val scoresDF = loaded.setInputDataset(ds, keyFn).score() - val scores = 
scoresDF.select(prediction.name, KeyFieldName).sort(KeyFieldName).collect() - - // Compare the scores produced by the loaded model vs original model - scores should contain theSameElementsAs expectedScores + def assertModel(model: OpWorkflowModel): Assertion = { + val scoresDF = model.setInputDataset(ds, keyFn).score() + val scores = scoresDF.select(prediction.name, KeyFieldName).sort(KeyFieldName).collect() + // Compare the scores produced by the loaded model vs original model + scores should contain theSameElementsAs expectedScores + } - // TODO - once supported, load the model without the workflow and score it as well - val error = intercept[RuntimeException](OpWorkflowModel.load(workflowLocation)) - error.getMessage should startWith("Failed to load Workflow from path") + withClue("Expected to load and score model with provided workflow: ") { + assertModel(model = workflow.loadModel(workflowLocation)) + } + withClue("Expected to load and score model without workflow: ") { + assertModel(model = OpWorkflowModel.load(workflowLocation)) + } } } diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index a4add495c6..c2fa855111 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -34,7 +34,7 @@ import com.salesforce.op.features.types._ import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike, OPFeature} import com.salesforce.op.readers.{CustomReader, DataFrameFieldNames, ReaderKey} import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer -import com.salesforce.op.stages.impl.feature.OPMapVectorizerTestHelper.makeTernaryOPMapTransformer +import com.salesforce.op.stages.impl.feature.OPMapVectorizerTestHelper.makeMapifyTransformer import com.salesforce.op.stages.impl.preparators.CorrelationType import com.salesforce.op.test._ import com.salesforce.op.testkit.{RandomData, _} @@ -440,7 +440,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator25, currencyGenerator95, currencyGenerator50, numRows ) - val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + val mapFeature = makeMapifyTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) @@ -513,7 +513,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) - val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + val mapFeature = makeMapifyTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) @@ -570,7 +570,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, 
currencyGenerator2, currencyGenerator3, numRows ) - val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + val mapFeature = makeMapifyTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) @@ -624,7 +624,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) - val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + val mapFeature = makeMapifyTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) @@ -669,7 +669,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator25, currencyGenerator95, currencyGenerator50, numRows ) - val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + val mapFeature = makeMapifyTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) @@ -758,7 +758,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) - val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + val mapFeature = makeMapifyTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) val transformedTrainDf = new OpWorkflow().setResultFeatures(mapFeature).transform(trainDf) @@ -820,7 +820,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with val (trainDf, c1, c2, c3) = generateRandomDfAndFeatures[Currency, Currency, Currency]( currencyGenerator1, currencyGenerator2, currencyGenerator3, numRows ) - val mapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](c1, c2, c3) + val mapFeature = makeMapifyTransformer[Currency, CurrencyMap, Double](c1, c2, c3) // Need to make a raw version of this feature so that RawFeatureFilter will pick it up val mapFeatureRaw = mapFeature.asRaw(isResponse = false) diff --git a/core/src/test/scala/com/salesforce/op/stages/Lambdas.scala b/core/src/test/scala/com/salesforce/op/stages/Lambdas.scala new file mode 100644 index 0000000000..9ffb467bd8 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/Lambdas.scala @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. 
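Reviewer note: the Lambdas object introduced in this new file collects transform functions that the tests previously defined as inline closures. A minimal sketch of the motivation, with illustrative names that are not part of this patch: an inline lambda defined inside a test class captures that class as its outer reference, which breaks stage save/load, whereas a function obtained from a top-level object carries no outer pointer.

import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer

object SerializableFns {
  // defined in a standalone object: no captured test fixture, safe to persist and reload
  def scaleBy10: Real => Real = x => x.v.map(_ * 10.0).toReal
}

val stage = new UnaryLambdaTransformer[Real, Real](
  operationName = "scale",
  transformFn = SerializableFns.scaleBy10 // same pattern as the Lambdas functions below
)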
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages + +import com.salesforce.op.features.types.Real +import com.salesforce.op.features.types._ + +object Lambdas { + def fncUnary: Real => Real = (x: Real) => x.v.map(_ * 0.1234).toReal + + def fncSequence: Seq[DateList] => Real = (x: Seq[DateList]) => { + val v = x.foldLeft(0.0)((a, b) => a + b.value.sum) + Math.round(v / 1E6).toReal + } + + def fncBinarySequence: (Real, Seq[DateList]) => Real = (y: Real, x: Seq[DateList]) => { + val v = x.foldLeft(0.0)((a, b) => a + b.value.sum) + (Math.round(v / 1E6) + y.value.getOrElse(0.0)).toReal + } + + def fncBinary: (Real, Real) => Real = (x: Real, y: Real) => ( + for { + yv <- y.value + xv <- x.value + } yield xv * yv + ).toReal + + def fncTernary: (Real, Real, Real) => Real = (x: Real, y: Real, z: Real) => + (for { + xv <- x.value + yv <- y.value + zv <- z.value + } yield xv * yv + zv).toReal + + def fncQuaternary: (Real, Real, Text, Real) => Real = (x: Real, y: Real, t: Text, z: Real) => + (for { + xv <- x.value + yv <- y.value + tv <- t.value + zv <- z.value + } yield xv * yv + zv * tv.length).toReal + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/OpPipelineStageReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/stages/OpPipelineStageReaderWriterTest.scala index 562ece50b1..8791277d9e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/OpPipelineStageReaderWriterTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/OpPipelineStageReaderWriterTest.scala @@ -32,7 +32,7 @@ package com.salesforce.op.stages import com.salesforce.op.features._ import com.salesforce.op.features.types._ -import com.salesforce.op.stages.OpPipelineStageReadWriteShared._ +import com.salesforce.op.stages.OpPipelineStageReaderWriter._ import com.salesforce.op.test.PassengerSparkFixtureTest import com.salesforce.op.utils.reflection.ReflectionUtils import com.salesforce.op.utils.spark.RichDataset._ @@ -50,7 +50,7 @@ private[stages] abstract class OpPipelineStageReaderWriterTest extends FlatSpec with 
PassengerSparkFixtureTest { val meta = new MetadataBuilder().putString("foo", "bar").build() - + val expectedFeaturesLength = 1 def stage: OpPipelineStageBase with Transformer val expected: Array[Real] val hasOutputName = true @@ -67,9 +67,6 @@ private[stages] abstract class OpPipelineStageReaderWriterTest log.info(pretty(stageJson)) (stageJson \ FN.Uid.entryName).extract[String] shouldBe stage.uid } - it should "write isModel" in { - (stageJson \ FN.IsModel.entryName).extract[Boolean] shouldBe isModel - } it should "write class name" in { (stageJson \ FN.Class.entryName).extract[String] shouldBe stage.getClass.getName } @@ -95,7 +92,7 @@ private[stages] abstract class OpPipelineStageReaderWriterTest } it should "write input features" in { val jArray = ((stageJson \ FN.ParamMap.entryName) \ "inputFeatures").extract[JArray] - jArray.values should have length 1 + jArray.values should have length expectedFeaturesLength val obj = jArray(0).extract[JObject] obj.values.keys shouldBe Set("name", "isResponse", "isRaw", "uid", "typeName", "stages", "originFeatures") } diff --git a/core/src/test/scala/com/salesforce/op/stages/OpPipelineStagesTest.scala b/core/src/test/scala/com/salesforce/op/stages/OpPipelineStagesTest.scala index 83b3327127..8e7c862f56 100644 --- a/core/src/test/scala/com/salesforce/op/stages/OpPipelineStagesTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/OpPipelineStagesTest.scala @@ -47,6 +47,8 @@ import scala.reflect.runtime.universe.TypeTag class OpPipelineStagesTest extends FlatSpec with PassengerSparkFixtureTest with BeforeAndAfterEach with Serializable { + import OpPipelineStagesTest._ + val tfParam = new TransientFeatureArrayParam(name = "foo", parent = "null", doc = "nothing") var stage: TestStage = _ @@ -78,8 +80,12 @@ class OpPipelineStagesTest features(0).isResponse && features(0).isRaw shouldBe true features(1).isResponse || features(1).isRaw shouldBe false - assertThrows[RuntimeException] { features(0).getFeature() } - assertThrows[RuntimeException] { features(1).getFeature() } + assertThrows[RuntimeException] { + features(0).getFeature() + } + assertThrows[RuntimeException] { + features(1).getFeature() + } } it should "encode json properly" in { @@ -89,8 +95,12 @@ class OpPipelineStagesTest tfsRecovered should have length 2 compare(tfsRecovered(0), weight) compare(tfsRecovered(1), height) - assertThrows[RuntimeException] { tfsRecovered(0).getFeature() } - assertThrows[RuntimeException] { tfsRecovered(1).getFeature() } + assertThrows[RuntimeException] { + tfsRecovered(0).getFeature() + } + assertThrows[RuntimeException] { + tfsRecovered(1).getFeature() + } } "Stage" should "set Transient Features properly as an input feature" in { @@ -103,7 +113,9 @@ class OpPipelineStagesTest } it should "be robust when getting features by index" in { - assertThrows[RuntimeException] { stage.getInputFeature(0) } + assertThrows[RuntimeException] { + stage.getInputFeature(0) + } stage.getTransientFeature(0) shouldBe None stage.setInput(height) @@ -122,7 +134,7 @@ class OpPipelineStagesTest val testOp = new com.salesforce.op.stages.base.unary.UnaryLambdaTransformer[Real, Real]( operationName = "test", - transformFn = (i: Real) => i, + transformFn = OpPipelineStagesTest.fnc0, uid = "myID" ) @@ -149,7 +161,11 @@ class OpPipelineStagesTest } } -class TestStage(implicit val tto: TypeTag[RealNN], val ttov: TypeTag[RealNN#Value]) - extends Pipeline with OpPipelineStage1[RealNN, RealNN] { - override def operationName: String = "test" +object OpPipelineStagesTest { + def fnc0: 
Real => Real = x => x + + class TestStage(implicit val tto: TypeTag[RealNN], val ttov: TypeTag[RealNN#Value]) + extends Pipeline with OpPipelineStage1[RealNN, RealNN] { + override def operationName: String = "test" + } } diff --git a/core/src/test/scala/com/salesforce/op/stages/OpTransformerBinaryReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/stages/OpTransformerBinaryReaderWriterTest.scala new file mode 100644 index 0000000000..91ce639439 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/OpTransformerBinaryReaderWriterTest.scala @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.base.binary.BinaryLambdaTransformer +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpTransformerBinaryReaderWriterTest extends OpPipelineStageReaderWriterTest { + override val expectedFeaturesLength = 2 + override val hasOutputName = false + + val stage = + new BinaryLambdaTransformer[Real, Real, Real]( + operationName = "test", + transformFn = Lambdas.fncBinary + ).setInput(weight, age).setMetadata(meta) + + val expected = Array(8600.toReal, 134.toReal, Real.empty, 2574.toReal, Real.empty, 2144.toReal) +} diff --git a/core/src/test/scala/com/salesforce/op/stages/OpTransformerBinarySequenceReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/stages/OpTransformerBinarySequenceReaderWriterTest.scala new file mode 100644 index 0000000000..fc169b2dc0 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/OpTransformerBinarySequenceReaderWriterTest.scala @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. 
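Reviewer note: the Real.empty entries in the expected results of OpTransformerBinaryReaderWriterTest above follow directly from Lambdas.fncBinary, whose for-comprehension over the two optional values yields None as soon as either input is missing. A quick hand check (a sketch, not part of the patch):

import com.salesforce.op.features.types._
import com.salesforce.op.stages.Lambdas

Lambdas.fncBinary(Real(2.0), Real(3.0))  // Real(6.0): both values present, 2 * 3
Lambdas.fncBinary(Real.empty, Real(3.0)) // Real.empty: the for-comprehension short-circuits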
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.base.sequence.BinarySequenceLambdaTransformer +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpTransformerBinarySequenceReaderWriterTest extends OpPipelineStageReaderWriterTest { + override val expectedFeaturesLength = 2 + override val hasOutputName = false + + val stage = + new BinarySequenceLambdaTransformer[Real, DateList, Real]( + operationName = "test", + transformFn = Lambdas.fncBinarySequence + ).setInput(weight, boarded).setMetadata(meta) + + val expected = Array(3114.toReal, 1538.toReal, 0.toReal, 1549.toReal, 1567.toReal, 1538.toReal) +} diff --git a/core/src/test/scala/com/salesforce/op/stages/OpTransformerQuaternaryReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/stages/OpTransformerQuaternaryReaderWriterTest.scala new file mode 100644 index 0000000000..ff839e6e25 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/OpTransformerQuaternaryReaderWriterTest.scala @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
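Reviewer note: a hand check of Lambdas.fncBinarySequence, exercised by OpTransformerBinarySequenceReaderWriterTest above, under assumed toy timestamps (a sketch, not part of the patch). The function folds the DateList values into a sum, rescales by 1e6 with rounding, then adds the Real input:

import com.salesforce.op.features.types._
import com.salesforce.op.stages.Lambdas

val dates = Seq(DateList(Seq(1500000L, 1600000L))) // timestamp sum = 3.1e6
Lambdas.fncBinarySequence(Real(2.0), dates)        // Math.round(3.1) + 2.0 = Real(5.0)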
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.base.quaternary.QuaternaryLambdaTransformer +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpTransformerQuaternaryReaderWriterTest extends OpPipelineStageReaderWriterTest { + override val expectedFeaturesLength = 4 + override val hasOutputName = false + + val stage = + new QuaternaryLambdaTransformer[Real, Real, Text, Real, Real]( + operationName = "test", + transformFn = Lambdas.fncQuaternary, + uid = "uid_1234" + ).setInput(weight, age, description, weight).setMetadata(meta) + + val expected = Array(13244.toReal, Real.empty, Real.empty, 2574.toReal, Real.empty, Real.empty) +} diff --git a/core/src/test/scala/com/salesforce/op/stages/OpTransformerReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/stages/OpTransformerReaderWriterTest.scala index 0e45eb9a8a..3548301670 100644 --- a/core/src/test/scala/com/salesforce/op/stages/OpTransformerReaderWriterTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/OpTransformerReaderWriterTest.scala @@ -30,7 +30,6 @@ package com.salesforce.op.stages -import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer import org.junit.runner.RunWith @@ -42,11 +41,11 @@ class OpTransformerReaderWriterTest extends OpPipelineStageReaderWriterTest { override val hasOutputName = false - lazy val stage = + val stage = new UnaryLambdaTransformer[Real, Real]( operationName = "test", - transformFn = _.v.map(_ * 0.1234).toReal, - uid = UID[UnaryLambdaTransformer[_, _]] + transformFn = Lambdas.fncUnary, + uid = "uid_1234" ).setInput(weight).setMetadata(meta) val expected = Array(21.2248.toReal, 8.2678.toReal, Real.empty, 9.6252.toReal, 11.8464.toReal, 8.2678.toReal) diff --git a/core/src/test/scala/com/salesforce/op/stages/OpTransformerSequenceReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/stages/OpTransformerSequenceReaderWriterTest.scala new file mode 100644 index 0000000000..c6886170bd --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/OpTransformerSequenceReaderWriterTest.scala @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
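Reviewer note: likewise for Lambdas.fncQuaternary in OpTransformerQuaternaryReaderWriterTest above, which multiplies the two reals and adds the last real weighted by the text length; any missing input collapses the whole result to Real.empty. A hand check under assumed toy values (a sketch, not part of the patch):

import com.salesforce.op.features.types._
import com.salesforce.op.stages.Lambdas

Lambdas.fncQuaternary(Real(2.0), Real(3.0), Text("ab"), Real(4.0)) // 2 * 3 + 4 * 2 = Real(14.0)
Lambdas.fncQuaternary(Real(2.0), Real(3.0), Text.empty, Real(4.0)) // Real.empty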
+ * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.base.sequence.SequenceLambdaTransformer +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpTransformerSequenceReaderWriterTest extends OpPipelineStageReaderWriterTest { + override val expectedFeaturesLength = 1 + override val hasOutputName = false + + val stage = + new SequenceLambdaTransformer[DateList, Real]( + operationName = "test", + transformFn = Lambdas.fncSequence, + uid = "uid_1234" + ).setInput(boarded).setMetadata(meta) + + val expected = Array(2942.toReal, 1471.toReal, 0.toReal, 1471.toReal, 1471.toReal, 1471.toReal) +} diff --git a/core/src/test/scala/com/salesforce/op/stages/OpTransformerTernaryReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/stages/OpTransformerTernaryReaderWriterTest.scala new file mode 100644 index 0000000000..3d2cbbe708 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/OpTransformerTernaryReaderWriterTest.scala @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.base.ternary.TernaryLambdaTransformer +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpTransformerTernaryReaderWriterTest extends OpPipelineStageReaderWriterTest { + override val expectedFeaturesLength = 3 + override val hasOutputName = false + + val stage = + new TernaryLambdaTransformer[Real, Real, Real, Real]( + operationName = "test", + transformFn = Lambdas.fncTernary, + uid = "uid_1234" + ).setInput(weight, age, weight).setMetadata(meta) + + val expected = Array(8772.toReal, 201.toReal, Real.empty, 2652.toReal, Real.empty, 2211.toReal) +} diff --git a/core/src/test/scala/com/salesforce/op/stages/TransformersTest.scala b/core/src/test/scala/com/salesforce/op/stages/TransformersTest.scala index e79c2828e9..7365a8ff05 100644 --- a/core/src/test/scala/com/salesforce/op/stages/TransformersTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/TransformersTest.scala @@ -132,15 +132,11 @@ class TransformersTest extends FlatSpec with Matchers with PassengerFeaturesTest val ageMap: FeatureLike[Text] = age.map[Text](_.value.map(_.toString).toText) val heightFilter: FeatureLike[RealNN] = height.filter(_.value.contains(100.0), default = new RealNN(Double.MinValue)) - val heightFilterNot: FeatureLike[RealNN] = - height.filterNot(_.value.contains(100.0), default = new RealNN(0.0)) - val heightCollect: FeatureLike[Real] = - height.collect[Real](Real.empty){ case r if r.v.contains(100.0) => Real(123.0) } val ageExists: FeatureLike[Binary] = age.exists(_.value.contains(100.0)) val heightReplaced: FeatureLike[RealNN] = height.replaceWith(new RealNN(1.0), new RealNN(2.0)) - val all = Seq(ageMap, heightFilter, heightFilterNot, heightCollect, ageExists, heightReplaced) + val all = Seq(ageMap, heightFilter, ageExists, heightReplaced) - all.flatMap(_.parents) shouldBe Array(age, height, height, height, age, height) + all.flatMap(_.parents) shouldBe Array(age, height, age, height) all.forall(_.originStage.isInstanceOf[Transformer]) shouldBe true } it should "allow applying generic feature binary transformations" in { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala index c5a47619f0..6b4fc3105b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala @@ -81,7 +81,7 @@ class DecisionTreeNumericMapBucketizerTest extends OpEstimatorSpec[OPVector, numerics.limit(total).zip(numerics.limit(total)).zip(numerics.limit(total)).zip(labelData) .map { case (((f1, f2), f3), f4) => (f1, f2, f3, f4) } val (data, f1, f2, f3, label) = TestFeatureBuilder[Real, Real, Real, 
RealNN](rawData) - val realMapFeature = makeTernaryOPMapTransformer[Real, RealMap, Double](f1, f2, f3) + val realMapFeature = makeMapifyTransformer[Real, RealMap, Double](f1, f2, f3) lazy val modelLocation = tempDir + "/dt-map-buck-test-model-" + org.joda.time.DateTime.now().getMillis } @@ -104,7 +104,7 @@ class DecisionTreeNumericMapBucketizerTest extends OpEstimatorSpec[OPVector, correlated.zip(currencies.limit(total)).zip(correlated).zip(labelData) .map { case (((f1, f2), f3), f4) => (f1, f2, f3, f4) } val (data, f1, f2, f3, label) = TestFeatureBuilder[Currency, Currency, Currency, RealNN](rawData) - val currencyMapFeature = makeTernaryOPMapTransformer[Currency, CurrencyMap, Double](f1, f2, f3) + val currencyMapFeature = makeMapifyTransformer[Currency, CurrencyMap, Double](f1, f2, f3) val expectedSplits = Array(Double.NegativeInfinity, 15, 26, 91, Double.PositiveInfinity) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DescalerTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DescalerTransformerTest.scala index d7ac12acc0..358b1a0c8e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DescalerTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DescalerTransformerTest.scala @@ -63,14 +63,14 @@ class DescalerTransformerTest extends OpTransformerSpec[Real, DescalerTransforme it should "descale and work in log-scaling workflow" in { val logScaler = new ScalerTransformer[Real, Real]( scalingType = ScalingType.Logarithmic, - scalingArgs = EmptyArgs() + scalingArgs = EmptyScalerArgs() ).setInput(f1) val scaledResponse = logScaler.getOutput() val metadata = logScaler.transform(inputData).schema(scaledResponse.name).metadata ScalerMetadata(metadata) match { case Failure(err) => fail(err) case Success(meta) => - meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyArgs()) + meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyScalerArgs()) } val shifted = scaledResponse.map[Real](v => v.value.map(_ + 1).toReal, operationName = "shift") diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala index 95c147a323..c0d0116630 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala @@ -35,7 +35,7 @@ import com.salesforce.op.features.TransientFeature import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.testkit.RandomText -import com.salesforce.op.utils.spark.OpVectorMetadata +import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.junit.runner.RunWith @@ -45,7 +45,6 @@ import org.apache.spark.sql.functions._ @RunWith(classOf[JUnitRunner]) class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndicesByTransformer] with AttributeAsserts { - val (inputData, transformer) = { val vecData = Seq( Vectors.dense(1.0, 1.0, 0.0), @@ -55,7 +54,9 @@ class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndic val (data, v) = TestFeatureBuilder(vecData) val meta = OpVectorMetadata(v.name, Array(TransientFeature(v).toColumnMetaData()), 
Map.empty).toMetadata val inputData = data.withColumn(v.name, col(v.name).as(v.name, meta)) - val stage = new DropIndicesByTransformer(_.isNullIndicator).setInput(v).setInputSchema(inputData.schema) + val stage = new DropIndicesByTransformer( + DropIndicesByTransformerTest.matchFn + ).setInput(v).setInputSchema(inputData.schema) inputData -> stage } @@ -113,3 +114,7 @@ class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndic } } + +object DropIndicesByTransformerTest { + def matchFn: OpVectorColumnMetadata => Boolean = _.isNullIndicator +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPListTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPListTransformerTest.scala index bc842b7630..85451890b1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPListTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPListTransformerTest.scala @@ -31,55 +31,43 @@ package com.salesforce.op.stages.impl.feature -import com.salesforce.op.features.types.{Email, EmailMap, Integral, IntegralMap, Real, _} -import com.salesforce.op.stages.base.unary.{UnaryLambdaTransformer, UnaryTransformer} +import com.salesforce.op.UID import com.salesforce.op.features.types._ -import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer -import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ -import org.apache.spark.sql.Dataset +import com.salesforce.op.stages.base.unary.UnaryTransformer +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import OPListTransformerTest._ -import com.salesforce.op.UID -/** - * @author ksuchanek - * @since 214 - */ + @RunWith(classOf[JUnitRunner]) -class OPListTransformerTest extends OpTransformerSpec[TextList, TransformerType] { - lazy val (inputData, top) = TestFeatureBuilder("name", - Seq(TextList(Seq("A", "B"))) - ) +class OPListTransformerTest + extends OpTransformerSpec[TextList, OPListTransformer[Text, Text, TextList, TextList]] { - /** - * [[OpTransformer]] instance to be tested - */ - override val transformer: TransformerType = new TransformerType( - transformer = new BaseTransformer(), - operationName = "testUnaryMapWrap").setInput(top) + lazy val (inputData, top) = TestFeatureBuilder("name", Seq( + Seq("A", "B").toTextList + )) - /** - * Expected result of the transformer applied on the Input Dataset - */ - override val expectedResult: Seq[TextList] = Seq( - TextList(Seq("a", "b")) + val transformer: OPListTransformer[Text, Text, TextList, TextList] = + new LowerCaseListTransformer().setInput(top) + + val expectedResult: Seq[TextList] = Seq( + Seq("a", "b").toTextList ) } -object OPListTransformerTest { - type TransformerType = OPListTransformer[Text, Text, TextList, TextList] - - class BaseTransformer extends UnaryTransformer[Text, Text]( - operationName = "testUnary", - uid = UID[BaseTransformer] - ) { - override def transformFn: (Text => Text) = (input: Text) => input.value.map(_.toLowerCase()).toText - } - +class LowerCaseTransformer extends UnaryTransformer[Text, Text]( + operationName = "lowerCaseUnary", + uid = UID[LowerCaseTransformer] +) { + def transformFn: (Text => Text) = (input: Text) => input.value.map(_.toLowerCase()).toText } - - +class LowerCaseListTransformer +( + uid: String = UID[LowerCaseListTransformer], + operationName: 
String = "lowerCaseList" +) extends OPListTransformer[Text, Text, TextList, TextList]( + uid = uid, + operationName = operationName, + transformer = new LowerCaseTransformer +) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapTransformerTest.scala index 4588e0f5c0..82a860d4fd 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapTransformerTest.scala @@ -31,56 +31,43 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.UID -import com.salesforce.op.features.types.{Email, EmailMap, Integral, IntegralMap, Real, _} -import com.salesforce.op.stages.base.unary.{UnaryLambdaTransformer, UnaryTransformer} import com.salesforce.op.features.types._ -import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer -import com.salesforce.op.stages.impl.feature.OPMapTransformerTest.TransformerType -import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ -import org.apache.spark.sql.Dataset +import com.salesforce.op.stages.base.unary.UnaryTransformer +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -/** - * @author ksuchanek - * @since 214 - */ -@RunWith(classOf[JUnitRunner]) -class OPMapTransformerTest extends OpTransformerSpec[IntegralMap, TransformerType] { - import OPMapTransformerTest._ +@RunWith(classOf[JUnitRunner]) +class OPMapTransformerTest + extends OpTransformerSpec[IntegralMap, OPMapTransformer[Email, Integral, EmailMap, IntegralMap]] { - lazy val (inputData, top) = TestFeatureBuilder("name", - Seq( - Map("p1" -> "a@abcd.com", "p2" -> "xy@abcd.com") - ).map(EmailMap(_)) - ) + lazy val (inputData, top) = TestFeatureBuilder("name", Seq( + Map("p1" -> "a@abcd.com", "p2" -> "xy@abcd.com").toEmailMap + )) - /** - * [[OpTransformer]] instance to be tested - */ - override val transformer: TransformerType = new TransformerType( - transformer = new BaseTransformer(), - operationName = "testUnaryMapWrap").setInput(top) + val transformer: OPMapTransformer[Email, Integral, EmailMap, IntegralMap] = + new LengthMapTransformer().setInput(top) - /** - * Expected result of the transformer applied on the Input Dataset - */ - override val expectedResult: Seq[IntegralMap] = Seq( - IntegralMap(Map("p1" -> 10L, "p2" -> 11L)) + val expectedResult: Seq[IntegralMap] = Seq( + Map("p1" -> 10L, "p2" -> 11L).toIntegralMap ) } -object OPMapTransformerTest { - type TransformerType = OPMapTransformer[Email, Integral, EmailMap, IntegralMap] +class LengthTransformer extends UnaryTransformer[Email, Integral]( + operationName = "lengthUnary", + uid = UID[LengthTransformer] +) { + override def transformFn: (Email => Integral) = (input: Email) => input.value.map(_.length).toIntegral +} - class BaseTransformer extends UnaryTransformer[Email, Integral]( - operationName = "testUnary", - uid = UID[BaseTransformer] - ) { - override def transformFn: (Email => Integral) = (input: Email) => input.value.map(_.length).toIntegral - } -} +class LengthMapTransformer +( + uid: String = UID[LengthMapTransformer], + operationName: String = "lengthMap" +) extends OPMapTransformer[Email, Integral, EmailMap, IntegralMap]( + uid = uid, + operationName = operationName, + transformer = new 
LengthTransformer +) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala index 0ba08528be..229941390d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala @@ -32,10 +32,10 @@ package com.salesforce.op.stages.impl.feature import java.util.{Date => JDate} -import com.salesforce.op.OpWorkflow -import com.salesforce.op.features.types._ +import com.salesforce.op.{OpWorkflow, UID} +import com.salesforce.op.features.types.{OPMap, _} import com.salesforce.op.features.{Feature, FeatureLike} -import com.salesforce.op.stages.base.ternary.TernaryLambdaTransformer +import com.salesforce.op.stages.base.ternary.{TernaryLambdaTransformer, TernaryTransformer} import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.testkit._ import com.salesforce.op.utils.spark.RichDataset._ @@ -391,7 +391,7 @@ object OPMapVectorizerTestHelper extends Matchers with AttributeAsserts { summary.getMetadataArray(OpVectorMetadata.ColumnsKey).toList ) // Transformer to construct a single map feature from the individual features - val mapFeature = makeTernaryOPMapTransformer[F, FM, MT](rawF1, rawF2, rawF3) + val mapFeature = makeMapifyTransformer[F, FM, MT](rawF1, rawF2, rawF3) val mapFeatureVector = Transmogrifier.transmogrify(Seq(mapFeature))(TransmogrifierTestDefaults).combine() val transformedMap = new OpWorkflow().setResultFeatures(mapFeatureVector).transform(rawDF) val mapSummary = transformedMap.schema(mapFeatureVector.name).metadata @@ -464,35 +464,46 @@ object OPMapVectorizerTestHelper extends Matchers with AttributeAsserts { // TODO assert metadata } + + /** - * Construct OPMap transformer for raw features + * Construct Mapify transformer for raw features */ - def makeTernaryOPMapTransformer[F <: FeatureType : TypeTag, FM <: OPMap[MT] : TypeTag, MT: TypeTag] + def makeMapifyTransformer[F <: FeatureType : TypeTag, FM <: OPMap[MT] : TypeTag, MT: TypeTag] ( rawF1: FeatureLike[F], rawF2: FeatureLike[F], rawF3: FeatureLike[F] ): Feature[FM] = { - val ftFactory = FeatureTypeFactory[FM]() + val mapTransformer = new MapifyTransformer[F, FM, MT]() + mapTransformer.setInput(rawF1, rawF2, rawF3).getOutput().asInstanceOf[Feature[FM]] + } - // Transformer to construct a single map feature from the individual features - val mapTransformer = new TernaryLambdaTransformer[F, F, F, FM](operationName = "mapify", - transformFn = (f1, f2, f3) => { - // For all the maps, the value in the original feature type is an Option[MT], but can't figure out how - // to specify that here since that's not true in general for FeatureTypes (eg. 
RealNN) - def asMap(v: F, featureName: String): Map[String, Any] = { - v.value match { - case Some(s) => Map(featureName -> s) - case t: Traversable[_] if t.nonEmpty => Map(featureName -> t) - case _ => Map.empty - } - } + - val mapData = asMap(f1, "f1") ++ asMap(f2, "f2") ++ asMap(f3, "f3") - ftFactory.newInstance(mapData) - } - ) - mapTransformer.setInput(rawF1, rawF2, rawF3).getOutput().asInstanceOf[Feature[FM]] + + +class MapifyTransformer[F <: FeatureType, FM <: OPMap[MT], MT] +( + uid: String = UID[MapifyTransformer[F, FM, MT]], + operationName: String = "mapify" +)(implicit tti1: TypeTag[F], + tto: TypeTag[FM], + ttov: TypeTag[FM#Value] +) extends TernaryTransformer[F, F, F, FM](uid = uid, operationName = operationName)(tti1, tti1, tti1, tto, ttov) { + + @transient lazy val ftFactory = FeatureTypeFactory[FM]() + + def asMap(v: F, featureName: String): Map[String, Any] = v.value match { + case Some(s) => Map(featureName -> s) + case t: Traversable[_] if t.nonEmpty => Map(featureName -> t) + case _ => Map.empty + } + + override def transformFn: (F, F, F) => FM = (f1: F, f2: F, f3: F) => { + // For all the maps, the value in the original feature type is an Option[MT], but that constraint + // cannot be expressed here since it does not hold for all FeatureTypes (e.g. RealNN) + val mapData = asMap(f1, "f1") ++ asMap(f2, "f2") ++ asMap(f3, "f3") + ftFactory.newInstance(mapData) } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetTransformerTest.scala index 960647fabc..6e61b458e5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetTransformerTest.scala @@ -31,55 +31,37 @@ package com.salesforce.op.stages.impl.feature -import com.salesforce.op.features.types.{Email, EmailMap, Integral, IntegralMap, Real, _} -import com.salesforce.op.stages.base.unary.{UnaryLambdaTransformer, UnaryTransformer} +import com.salesforce.op.UID import com.salesforce.op.features.types._ -import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer -import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ -import org.apache.spark.sql.Dataset +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import OPSetTransformerTest._ -import com.salesforce.op.UID -/** - * @author ksuchanek - * @since 214 - */ -@RunWith(classOf[JUnitRunner]) -class OPSetTransformerTest extends OpTransformerSpec[MultiPickList, TransformerType] { - lazy val (inputData, top) = TestFeatureBuilder("name", - Seq(MultiPickList(Set("A", "B"))) - ) - /** - * [[OpTransformer]] instance to be tested - */ - override val transformer: TransformerType = new TransformerType( - transformer = new BaseTransformer(), - operationName = "testUnaryMapWrap").setInput(top) +@RunWith(classOf[JUnitRunner]) +class OPSetTransformerTest + extends OpTransformerSpec[MultiPickList, OPSetTransformer[Text, Text, MultiPickList, MultiPickList]] { - /** - * Expected result of the transformer applied on the Input Dataset - */ - override val expectedResult: Seq[MultiPickList] = Seq( - MultiPickList(Set("a", "b")) - ) -} + lazy val (inputData, top) = TestFeatureBuilder("name", Seq( + Set("A", "B").toMultiPickList + )) -object OPSetTransformerTest { 
- type TransformerType = OPSetTransformer[Text, Text, MultiPickList, MultiPickList] + val transformer: OPSetTransformer[Text, Text, MultiPickList, MultiPickList] = + new LowerCaseSetTransformer().setInput(top) - class BaseTransformer extends UnaryTransformer[Text, Text]( - operationName = "testUnary", - uid = UID[BaseTransformer] - ) { - override def transformFn: (Text => Text) = (input: Text) => input.value.map(_.toLowerCase()).toText - } + val expectedResult: Seq[MultiPickList] = Seq( + Set("a", "b").toMultiPickList + ) } - +class LowerCaseSetTransformer +( + uid: String = UID[LowerCaseSetTransformer], + operationName: String = "lowerCaseSet" +) extends OPSetTransformer[Text, Text, MultiPickList, MultiPickList]( + uid = uid, + operationName = operationName, + transformer = new LowerCaseTransformer +) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/PredictionDescalerTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/PredictionDescalerTransformerTest.scala index 354d383bed..68a4d40e87 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/PredictionDescalerTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/PredictionDescalerTransformerTest.scala @@ -67,14 +67,14 @@ class PredictionDescalerTransformerTest extends OpTransformerSpec[Real, Predicti it should "descale and serialize log-scaling workflow" in { val logScaler = new ScalerTransformer[Real, Real]( scalingType = ScalingType.Logarithmic, - scalingArgs = EmptyArgs() + scalingArgs = EmptyScalerArgs() ).setInput(f1) val scaledResponse = logScaler.getOutput() val metadata = logScaler.transform(inputData).schema(scaledResponse.name).metadata ScalerMetadata(metadata) match { case Failure(err) => fail(err) case Success(meta) => - meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyArgs()) + meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyScalerArgs()) } val shifted = scaledResponse.map[Prediction](v => Prediction(v.value.getOrElse(Double.NaN) + 1), operationName = "shift") diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerMetadataTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerMetadataTest.scala index 4f22281929..d883c4089f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerMetadataTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerMetadataTest.scala @@ -55,7 +55,7 @@ class ScalerMetadataTest extends FlatSpec with TestSparkContext { } it should "properly construct ScalerMetaData for a LogScaler" in { - val metadata = ScalerMetadata(scalingType = ScalingType.Logarithmic, scalingArgs = EmptyArgs()).toMetadata() + val metadata = ScalerMetadata(scalingType = ScalingType.Logarithmic, scalingArgs = EmptyScalerArgs()).toMetadata() metadata.getString(ScalerMetadata.scalingTypeName) shouldBe ScalingType.Logarithmic.entryName metadata.getString(ScalerMetadata.scalingArgsName) shouldBe "{}" } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerTest.scala index eaedf13f61..604f4a04e2 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerTest.scala @@ -40,9 +40,11 @@ class ScalerTest extends FlatSpec with TestSparkContext { Spec[Scaler] should "error on invalid data" in { val error = intercept[IllegalArgumentException]( - 
Scaler.apply(scalingType = ScalingType.Linear, args = EmptyArgs()) + Scaler.apply(scalingType = ScalingType.Linear, args = EmptyScalerArgs()) ) - error.getMessage shouldBe "Invalid combination of scaling type 'Linear' and args type 'EmptyArgs'" + error.getMessage shouldBe + s"Invalid combination of scaling type '${ScalingType.Linear}' " + + s"and args type '${EmptyScalerArgs().getClass.getSimpleName}'" } it should "correctly build construct a LinearScaler" in { @@ -53,7 +55,7 @@ class ScalerTest extends FlatSpec with TestSparkContext { } it should "correctly build construct a LogScaler" in { - val linearScaler = Scaler.apply(scalingType = ScalingType.Logarithmic, args = EmptyArgs()) + val linearScaler = Scaler.apply(scalingType = ScalingType.Logarithmic, args = EmptyScalerArgs()) linearScaler shouldBe a[LogScaler] linearScaler.scalingType shouldBe ScalingType.Logarithmic } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerTransformerTest.scala index 12f6238b64..04b60d91e5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/ScalerTransformerTest.scala @@ -66,7 +66,7 @@ class ScalerTransformerTest extends OpTransformerSpec[Real, ScalerTransformer[R it should "log scale numeric fields and produce correct metadata" in { val logScaler = new ScalerTransformer[Real, Real]( - scalingType = ScalingType.Logarithmic, scalingArgs = EmptyArgs() + scalingType = ScalingType.Logarithmic, scalingArgs = EmptyScalerArgs() ).setInput(f1) val scaled: FeatureLike[Real] = logScaler.getOutput() @@ -78,7 +78,7 @@ class ScalerTransformerTest extends OpTransformerSpec[Real, ScalerTransformer[R ScalerMetadata(transformed.schema(scaled.name).metadata) match { case Failure(err) => fail(err) case Success(meta) => - meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyArgs()) + meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyScalerArgs()) } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerRegexTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerRegexTest.scala new file mode 100644 index 0000000000..62c29493b6 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerRegexTest.scala @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class TextTokenizerRegexTest extends OpTransformerSpec[TextList, TextTokenizer[Text]] { + + val (inputData, english) = TestFeatureBuilder( + Seq( + "I've got a lovely bunch of coconuts", + "There they are, all standing in a row", + "Big ones, small ones, some as big as your head", + "Big ones, small\n\nones\n\n, some as big as your head", + "two words", + " ehh, Fluff ", + "" + ).map(_.toText) + ) + + val transformer = + english.tokenizeRegex(pattern = "\\s+", minTokenLength = 5, toLowercase = false) + .originStage.asInstanceOf[TextTokenizer[Text]] + + val expectedResult: Seq[TextList] = Array( + List("lovely", "bunch", "coconuts").toTextList, + List("There", "standing").toTextList, + List("ones,", "small", "ones,").toTextList, + List("Big", "ones,", "small", "\n\nones\n\n,", "head").toTextList, + List("words").toTextList, + List("Fluff").toTextList, + TextList.empty + ) +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/selector/ModelSelectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/selector/ModelSelectorTest.scala index 2c9dc927f8..987569fa81 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/selector/ModelSelectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/selector/ModelSelectorTest.scala @@ -57,7 +57,7 @@ import org.slf4j.LoggerFactory @RunWith(classOf[JUnitRunner]) class ModelSelectorTest extends OpEstimatorSpec[Prediction, SelectedModel, ModelSelector[_, _]] with PredictionEquality with CompareParamGrid { - + import ModelSelectorTest._ val log = LoggerFactory.getLogger(this.getClass) val (seed, smallCount, bigCount) = (1234L, 20, 80) @@ -263,15 +263,17 @@ class ModelSelectorTest extends OpEstimatorSpec[Prediction, SelectedModel, Model } } +object ModelSelectorTest { -class TestEstimator extends BinaryEstimator[RealNN, OPVector, Prediction]("test", UID[TestEstimator]) { - override def fitFn(dataset: Dataset[(Option[Double], Vector)]): TestModel = new TestModel(uid) -} + class TestEstimator extends BinaryEstimator[RealNN, OPVector, Prediction]("test", UID[TestEstimator]) { + override def fitFn(dataset: Dataset[(Option[Double], Vector)]): TestModel = new TestModel(uid) + } -class TestModel(uid: String) extends BinaryModel[RealNN, OPVector, Prediction]("test", uid){ - override def transformFn: (RealNN, OPVector) => Prediction = (l: RealNN, f: OPVector) => { - val pred = l.value.get - val raw = Array(pred, 1 - pred) - Prediction(pred, raw, raw) + class TestModel(uid: String) extends BinaryModel[RealNN, OPVector, Prediction]("test", uid) { + override def transformFn: (RealNN, OPVector) => Prediction = (l: RealNN, f: OPVector) => { + val pred = l.value.get + val raw = Array(pred, 1 - pred) + Prediction(pred, raw, raw) + } } } diff --git a/features/src/main/java/com/salesforce/op/stages/ReaderWriter.java b/features/src/main/java/com/salesforce/op/stages/ReaderWriter.java new file mode 100644 index 0000000000..55adb8349a --- /dev/null +++ b/features/src/main/java/com/salesforce/op/stages/ReaderWriter.java @@ -0,0 +1,21 @@ +package com.salesforce.op.stages; + +import java.lang.annotation.*; + +/** + * Stage class annotation to specify custom reader/writer implementation of [[OpPipelineStageReaderWriter]]. + * Reader/writer implementation must extend the [[OpPipelineStageReaderWriter]] trait + * and have a single no-argument constructor. + */ +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +@Inherited +public @interface ReaderWriter { + + /** + * Reader/writer class extending [[OpPipelineStageReaderWriter]] to use when reading/writing the stage. + * It must extend the [[OpPipelineStageReaderWriter]] trait and have a single no-argument constructor.
+ */ + Class value(); + +} diff --git a/features/src/main/scala/com/salesforce/op/features/FeatureBuilder.scala b/features/src/main/scala/com/salesforce/op/features/FeatureBuilder.scala index f8e2c27f5c..3c2a17760f 100644 --- a/features/src/main/scala/com/salesforce/op/features/FeatureBuilder.scala +++ b/features/src/main/scala/com/salesforce/op/features/FeatureBuilder.scala @@ -185,7 +185,7 @@ object FeatureBuilder { * @param response response feature name * @param nonNullable optional non nullable feature names * @throws IllegalArgumentException if fails to map dataframe field type into a feature type - * @throws RuntimeException if fails to construct a response feature + * @throws RuntimeException if fails to construct a response feature * @return label and other features */ def fromSchema[ResponseType <: FeatureType : WeakTypeTag]( @@ -218,30 +218,45 @@ object FeatureBuilder { } /** - * Builds features from a [[DataFrame]] - * - * @param data input [[DataFrame]] - * @param response response feature name - * @param nonNullable optional non nullable feature names - * @throws IllegalArgumentException if fails to map dataframe field type into a feature type - * @throws RuntimeException if fails to construct a response feature - * @return label and other features - */ + * Builds features from a [[DataFrame]] + * + * @param data input [[DataFrame]] + * @param response response feature name + * @param nonNullable optional non nullable feature names + * @throws IllegalArgumentException if fails to map dataframe field type into a feature type + * @throws RuntimeException if fails to construct a response feature + * @return label and other features + */ def fromDataFrame[ResponseType <: FeatureType : WeakTypeTag](data: DataFrame, response: String, nonNullable: Set[String] = Set.empty): (Feature[ResponseType], Array[Feature[_ <: FeatureType]]) = fromSchema(data.schema, response, nonNullable) def fromRow[O <: FeatureType : WeakTypeTag](implicit name: sourcecode.Name): FeatureBuilderWithExtract[Row, O] = fromRow[O](name.value, None) def fromRow[O <: FeatureType : WeakTypeTag](name: String): FeatureBuilderWithExtract[Row, O] = fromRow[O](name, None) def fromRow[O <: FeatureType : WeakTypeTag](index: Int)(implicit name: sourcecode.Name): FeatureBuilderWithExtract[Row, O] = fromRow[O](name.value, Some(index)) def fromRow[O <: FeatureType : WeakTypeTag](name: String, index: Option[Int]): FeatureBuilderWithExtract[Row, O] = { - val c = FeatureTypeSparkConverter[O]() new FeatureBuilderWithExtract[Row, O]( name = name, - extractFn = (r: Row) => c.fromSpark(index.map(r.get).getOrElse(r.getAny(name))), - extractSource = "(r: Row) => c.fromSpark(index.map(r.get).getOrElse(r.getAny(name)))" + extractFn = FromRowExtractFn[O](index, name), + extractSource = + s"""${classOf[FromRowExtractFn[O]].getName}[${FeatureType.shortTypeName[O]}]($index, "$name")""" ) } + // scalastyle:on } +/** + * Generic value extract function for [[Row]] + * + * @param index optional index of the value to extract from [[Row]] + * @param name name of the value to extract from [[Row]] + * @param tto feature type tag + * @tparam O output feature type + */ +case class FromRowExtractFn[O <: FeatureType](index: Option[Int], name: String) + (implicit val tto: WeakTypeTag[O]) extends Function1[Row, O] with Serializable { + private val c = FeatureTypeSparkConverter[O]()(tto) + def apply(r: Row): O = c.fromSpark(index.map(r.get).getOrElse(r.getAny(name))) +} + /** * Feature Builder allows building features * @@ -256,8 +271,7 @@ final class 
FeatureBuilder[I, O <: FeatureType](val name: String) { * * @param fn a function to extract value of the feature from the raw data */ - def extract(fn: I => O): FeatureBuilderWithExtract[I, O] = - macro FeatureBuilderMacros.extract[I, O] + def extract(fn: I => O): FeatureBuilderWithExtract[I, O] = macro FeatureBuilderMacros.extract[I, O] /** * Feature extract method - a function to extract value of the feature from the raw data. @@ -326,8 +340,9 @@ final class FeatureBuilderWithExtract[I, O <: FeatureType] aggregator = aggregator, outputName = name, outputIsResponse = isResponse, - aggregateWindow = aggregateWindow - )(tti, tto) + aggregateWindow = aggregateWindow, + inputType = Left(tti) + )(tto) originStage.getOutput().asInstanceOf[Feature[O]] } diff --git a/features/src/main/scala/com/salesforce/op/stages/DefaultOpPipelineStageReaderWriter.scala b/features/src/main/scala/com/salesforce/op/stages/DefaultOpPipelineStageReaderWriter.scala new file mode 100644 index 0000000000..389ba93746 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/stages/DefaultOpPipelineStageReaderWriter.scala @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages + +import com.salesforce.op.features.types.FeatureType +import com.salesforce.op.stages.OpPipelineStageReaderWriter._ +import com.salesforce.op.utils.reflection.ReflectionUtils +import org.apache.spark.ml.PipelineStage +import org.json4s.{JObject, JValue} +import org.json4s.jackson.JsonMethods.render +import org.json4s.{Extraction, _} + +import scala.reflect.{ClassTag, ManifestFactory} +import scala.reflect.runtime.universe._ +import scala.util.{Failure, Success, Try} + +/** + * Default reader/writer for stages that uses reflection to reflect stage ctor arguments + * + * @tparam StageType stage type to read/write + */ +final class DefaultOpPipelineStageReaderWriter[StageType <: OpPipelineStageBase] +( + implicit val ct: ClassTag[StageType] +) extends OpPipelineStageReaderWriter[StageType] with OpPipelineStageSerializationFuns { + + /** + * Read stage from json + * + * @param stageClass stage class + * @param json json to read stage from + * @return read result + */ + def read(stageClass: Class[StageType], json: JValue): Try[StageType] = Try { + // Extract all the ctor args + val ctorArgsMap = json.asInstanceOf[JObject].obj + .map { case (argName, j) => argName -> j.extract[AnyValue] }.toMap + + // Make the ctor function used for creating a stage instance + val ctorArgs: (String, Symbol) => Try[Any] = (argName, argSymbol) => { + for { + anyValue <- Try { + ctorArgsMap.getOrElse(argName, throw new RuntimeException( + s"Ctor argument '$argName' was not found for stage class '${stageClass.getName}'")) + } + argInstance = Try { + anyValue match { + // Special handling for Feature Type TypeTags + case AnyValue(AnyValueTypes.TypeTag, value, _) => + Try(FeatureType.featureTypeTag(value.toString)).recoverWith[TypeTag[_]] { case _ => + Try(FeatureType.featureValueTypeTag(value.toString)) + } match { + case Success(featureTypeTag) => featureTypeTag + case Failure(e) => throw new RuntimeException( + s"Unknown type tag '${value.toString}' for ctor argument '$argName'. " + + "Only Feature and Feature Value type tags are supported for serialization.", e) + } + + // Spark wrapped stage is saved using [[SparkWrapperParams]] and loaded later using + // [[SparkDefaultParamsReadWrite]].getAndSetParams returning 'null' here + case AnyValue(AnyValueTypes.SparkWrappedStage, _, _) => null // yes, yes - this should be 'null' + + // Class value argument, e.g. [[Function1]], [[Numeric]] etc. 
+ case AnyValue(AnyValueTypes.ClassInstance, value, _) => + ReflectionUtils.newInstance[Any](value.toString) + + // Value with no ctor arguments should be instantiable by class name + case AnyValue(AnyValueTypes.Value, m: Map[_, _], Some(className)) if m.isEmpty => + ReflectionUtils.newInstance[Any](className) + + // Everything else is read using json4s + case AnyValue(AnyValueTypes.Value, value, valueClass) => + // Create type manifest either using the reflected type tag or serialized value class + val manifest = try { + val ttag = ReflectionUtils.typeTagForType[Any](tpe = argSymbol.info) + ReflectionUtils.manifestForTypeTag[Any](ttag) + } catch { + case _ if valueClass.isDefined => + ManifestFactory.classType[Any](ReflectionUtils.classForName(valueClass.get)) + } + Extraction.decompose(value).extract[Any](formats, manifest) + + } + } + res <- argInstance match { + case Failure(e) => + throw new RuntimeException( + s"Failed to parse argument '$argName' from value '${anyValue.value}'" + + anyValue.valueClass.map(c => s" of class '$c'").getOrElse(""), e) + case ok => ok + } + } yield res + } + + // Reflect stage class instance by class + ctor args + ReflectionUtils.newInstance[StageType](stageClass, ctorArgs) + } + + /** + * Write stage to json + * + * @param stage stage instance to write + * @return write result + */ + def write(stage: StageType): Try[JValue] = Try { + // Reflect all the ctor args + val (_, argsList) = ReflectionUtils.bestCtorWithArgs(stage) + + // Wrap all ctor args into AnyValue container + val args = + for {(argName, argValue) <- argsList} yield { + val anyValue = argValue match { + + // Special handling for Feature Type TypeTags + case t: TypeTag[_] if FeatureType.isFeatureType(t) || FeatureType.isFeatureValueType(t) => + AnyValue(AnyValueTypes.TypeTag, ReflectionUtils.dealisedTypeName(t.tpe), None) + case t: TypeTag[_] => + throw new RuntimeException( + s"Unknown type tag '${t.tpe.toString}'. " + + "Only Feature and Feature Value type tags are supported for serialization." 
+ ) + + // Special handling for function value arguments + case f1: Function1[_, _] + // Maps and other scala collections extend [[Function1]] - skipping them by filtering by package name + if !f1.getClass.getPackage.getName.startsWith("scala") => serializeArgument(argName, f1) + case f2: Function2[_, _, _] => serializeArgument(argName, f2) + case f3: Function3[_, _, _, _] => serializeArgument(argName, f3) + case f4: Function4[_, _, _, _, _] => serializeArgument(argName, f4) + + // Special handling for [[Numeric]] + case n: Numeric[_] => serializeArgument(argName, n) + + // Spark wrapped stage is saved using [[SparkWrapperParams]], so we just writing it's uid here + case Some(v: PipelineStage) => AnyValue(AnyValueTypes.SparkWrappedStage, v.uid, None) + case v: PipelineStage => AnyValue(AnyValueTypes.SparkWrappedStage, v.uid, None) + + // Everything else goes as is and is handled by json4s + case v => + // try serialize value with json4s + val av = AnyValue(AnyValueTypes.Value, v, Option(v).map(_.getClass.getName)) + Try(jsonSerialize(av)) match { + case Success(_) => av + case Failure(e) => + throw new RuntimeException(s"Failed to json4s serialize argument '$argName' with value '$v'", e) + } + + } + argName -> anyValue + } + Extraction.decompose(args.toMap) + } + + + private def jsonSerialize(v: Any): JValue = render(Extraction.decompose(v)) +} diff --git a/features/src/main/scala/com/salesforce/op/stages/FeatureGeneratorStage.scala b/features/src/main/scala/com/salesforce/op/stages/FeatureGeneratorStage.scala index 17346b0d3c..bbbe6d9029 100644 --- a/features/src/main/scala/com/salesforce/op/stages/FeatureGeneratorStage.scala +++ b/features/src/main/scala/com/salesforce/op/stages/FeatureGeneratorStage.scala @@ -32,12 +32,16 @@ package com.salesforce.op.stages import com.salesforce.op.UID import com.salesforce.op.aggregators.{Event, FeatureAggregator, GenericFeatureAggregator} +import com.salesforce.op.features._ import com.salesforce.op.features.types.FeatureType -import com.salesforce.op.features.{Feature, FeatureLike, FeatureUID, OPFeature} +import com.salesforce.op.utils.reflection.ReflectionUtils import com.twitter.algebird.MonoidAggregator import org.apache.spark.ml.PipelineStage import org.apache.spark.util.ClosureUtils import org.joda.time.Duration +import org.json4s.JValue +import org.json4s.JsonAST.JObject +import org.json4s.JsonDSL._ import scala.reflect.runtime.universe.WeakTypeTag import scala.util.Try @@ -53,24 +57,34 @@ import scala.util.Try * (used to determine aggregation window) * @param aggregateWindow time period during which to include features in aggregation * @param uid unique id for stage - * @param tti weak type tag for input feature type + * @param inputType input weak type tag or type name * @param tto weak type tag for output feature type * @tparam I input data type * @tparam O output feature type */ + +@ReaderWriter(classOf[FeatureGeneratorStageReaderWriter[_, _ <: FeatureType]]) final class FeatureGeneratorStage[I, O <: FeatureType] ( val extractFn: I => O, val extractSource: String, val aggregator: MonoidAggregator[Event[O], _, O], - outputName: String, + val outputName: String, override val outputIsResponse: Boolean, val aggregateWindow: Option[Duration] = None, - val uid: String = UID[FeatureGeneratorStage[I, O]] -)( - implicit val tti: WeakTypeTag[I], - val tto: WeakTypeTag[O] -) extends PipelineStage with OpPipelineStage[O] with HasIn1 { + val uid: String = UID[FeatureGeneratorStage[I, O]], + val inputType: Either[WeakTypeTag[I], String] +)(implicit val 
tto: WeakTypeTag[O]) + extends PipelineStage with OpPipelineStage[O] with HasIn1 { + + // This hack is required as Spark can't serialize run-time created TypeTags + // when the stage is recovered on model loading. + // Since Spark tries to serialize Scala RuntimeMirror which is not serializable, + // we recover type tag from its type name upon request + @transient implicit val tti: WeakTypeTag[I] = inputType match { + case Right(typeName) => ReflectionUtils.typeTagForTypeName[I](typeName) + case Left(ttag) => ttag + } setOutputFeatureName(outputName) @@ -107,3 +121,81 @@ final class FeatureGeneratorStage[I, O <: FeatureType] */ override def checkSerializable: Try[Unit] = ClosureUtils.checkSerializable(extractFn) } + + +/** + * Stage reader/writer implementation used to (de)serialize [[FeatureGeneratorStage]] + */ +class FeatureGeneratorStageReaderWriter[I, O <: FeatureType] + extends OpPipelineStageReaderWriter[FeatureGeneratorStage[I, O]] with OpPipelineStageSerializationFuns { + + /** + * Read stage from json + * + * @param stageClass stage class + * @param json json to read stage from + * @return read result + */ + def read(stageClass: Class[FeatureGeneratorStage[I, O]], json: JValue): Try[FeatureGeneratorStage[I, O]] = { + Try { + val tti = (json \ "tti").extract[String] + val tto = FeatureType.featureTypeTag((json \ "tto").extract[String]).asInstanceOf[WeakTypeTag[O]] + + val extractFnJson = json \ "extractFn" + val extractFnClassName = (extractFnJson \ "className").extract[String] + val extractFn = extractFnClassName match { + case c if classOf[FromRowExtractFn[_]].getName == c => + val index = (extractFnJson \ "index").extractOpt[Int] + val name = (extractFnJson \ "name").extract[String] + FromRowExtractFn[O](index, name)(tto).asInstanceOf[Function1[I, O]] + case c => + ReflectionUtils.newInstance[Function1[I, O]](c) + } + + val aggregatorClassName = (json \ "aggregator" \ "className").extract[String] + val aggregator = ReflectionUtils.newInstance[MonoidAggregator[Event[O], _, O]](aggregatorClassName) + + val outputName = (json \ "outputName").extract[String] + val extractSource = (json \ "extractSource").extract[String] + val uid = (json \ "uid").extract[String] + val outputIsResponse = (json \ "outputIsResponse").extract[Boolean] + val aggregateWindow = (json \ "aggregateWindow").extractOpt[Long].map(Duration.millis) + + new FeatureGeneratorStage[I, O](extractFn, extractSource, aggregator, + outputName, outputIsResponse, aggregateWindow, uid, Right(tti))(tto) + } + + } + + /** + * Write stage to json + * + * @param stage stage instance to write + * @return write result + */ + def write(stage: FeatureGeneratorStage[I, O]): Try[JValue] = { + for { + extractFn <- Try { + stage.extractFn match { + case e: FromRowExtractFn[_] => + ("className" -> e.getClass.getName) ~ ("index" -> e.index) ~ ("name" -> e.name) + case e => + ("className" -> serializeArgument("extractFn", e).value.toString) ~ JObject() + } + } + aggregator <- Try( + ("className" -> serializeArgument("aggregator", stage.aggregator).value.toString) ~ JObject() + ) + } yield { + ("tti" -> stage.tti.tpe.typeSymbol.fullName) ~ + ("tto" -> FeatureType.typeName(stage.tto)) ~ + ("aggregator" -> aggregator) ~ + ("extractFn" -> extractFn) ~ + ("outputName" -> stage.outputName) ~ + ("aggregateWindow" -> stage.aggregateWindow.map(_.getMillis)) ~ + ("uid" -> stage.uid) ~ + ("extractSource" -> stage.extractSource) ~ + ("outputIsResponse" -> stage.outputIsResponse) + } + } +} diff --git 
a/features/src/main/scala/com/salesforce/op/stages/HasOut.scala b/features/src/main/scala/com/salesforce/op/stages/HasOut.scala index b8685395c1..f93cbde033 100644 --- a/features/src/main/scala/com/salesforce/op/stages/HasOut.scala +++ b/features/src/main/scala/com/salesforce/op/stages/HasOut.scala @@ -32,12 +32,13 @@ package com.salesforce.op.stages import com.salesforce.op.features._ import com.salesforce.op.features.types.FeatureType +import org.apache.spark.ml.PipelineStage import scala.reflect.runtime.universe.TypeTag -private[op] trait HasOut[O <: FeatureType] { - self: OpPipelineStage[O] => +private[op] trait HasOut[O <: FeatureType] extends OpPipelineStage[O] { + self: PipelineStage => /** * Type tag of the output diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReadWriteShared.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReadWriteShared.scala deleted file mode 100644 index 03583d2dc3..0000000000 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReadWriteShared.scala +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * * Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -package com.salesforce.op.stages - -import com.salesforce.op.features.FeatureDistributionType -import com.salesforce.op.stages.impl.feature.{HashAlgorithm, HashSpaceStrategy, ScalingType, TimePeriod} -import com.salesforce.op.utils.json.{EnumEntrySerializer, SpecialDoubleSerializer} -import enumeratum._ -import org.json4s.ext.JodaTimeSerializers -import org.json4s.{DefaultFormats, Formats} - - -object OpPipelineStageReadWriteShared { - - /** - * Stage json field names - */ - sealed abstract class FieldNames(override val entryName: String) extends EnumEntry - - /** - * Stage json field names - */ - object FieldNames extends Enum[FieldNames] { - val values = findValues - case object IsModel extends FieldNames("isModel") - case object CtorArgs extends FieldNames("ctorArgs") - case object Uid extends FieldNames("uid") - case object Class extends FieldNames("class") - case object ParamMap extends FieldNames("paramMap") - } - - /** - * Any Value Types - */ - sealed abstract class AnyValueTypes extends EnumEntry - - /** - * Any Value Types - */ - object AnyValueTypes extends Enum[AnyValueTypes] { - val values = findValues - case object TypeTag extends AnyValueTypes - case object SparkWrappedStage extends AnyValueTypes - case object Value extends AnyValueTypes - } - - /** - * A container for Any Value - */ - case class AnyValue(`type`: AnyValueTypes, value: Any) - - implicit val formats: Formats = - DefaultFormats ++ - JodaTimeSerializers.all + - EnumEntrySerializer.json4s[AnyValueTypes](AnyValueTypes) + - EnumEntrySerializer.json4s[HashAlgorithm](HashAlgorithm) + - EnumEntrySerializer.json4s[HashSpaceStrategy](HashSpaceStrategy) + - EnumEntrySerializer.json4s[ScalingType](ScalingType) + - EnumEntrySerializer.json4s[TimePeriod](TimePeriod) + - EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType) + - new SpecialDoubleSerializer - -} diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReader.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReader.scala index 561cb0d932..8e0eea4087 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReader.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReader.scala @@ -30,27 +30,38 @@ package com.salesforce.op.stages -import com.salesforce.op.features.types.FeatureType -import com.salesforce.op.stages.OpPipelineStageReadWriteShared._ +import com.salesforce.op.features.OPFeature +import com.salesforce.op.stages.OpPipelineStageReaderWriter._ import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams import com.salesforce.op.utils.reflection.ReflectionUtils import org.apache.hadoop.fs.Path import org.apache.spark.ml.SparkDefaultParamsReadWrite import org.apache.spark.ml.util.MLReader -import org.json4s.JsonAST.{JObject, JValue} +import org.json4s.JsonAST.JValue +import org.json4s._ import org.json4s.jackson.JsonMethods.{compact, render} -import org.json4s.{Extraction, _} -import scala.reflect.runtime.universe._ import scala.util.{Failure, Success, Try} /** * Reads the serialized output of [[OpPipelineStageWriter]] * - * @param originalStage original serialized stage + * @param originalStage original stage instance from the workflow (legacy) + * @param features features loaded so far (used to look up input features for the stage) */ -final class OpPipelineStageReader(val originalStage: OpPipelineStageBase) - extends MLReader[OpPipelineStageBase] { +final class OpPipelineStageReader private +( + val originalStage: 
Option[OpPipelineStageBase], + val features: Seq[OPFeature] +) extends MLReader[OpPipelineStageBase] { + + /** + * Legacy ctor which requires origin stage to be present when loading stages + */ + def this(origStage: OpPipelineStageBase) = + this(Option(origStage), origStage.getInputFeatures().flatMap(_.allFeatures)) + + def this(feats: Seq[OPFeature]) = this(None, feats) /** * Load from disk. File should contain data serialized in json format @@ -77,20 +88,37 @@ final class OpPipelineStageReader(val originalStage: OpPipelineStageBase) * Loads from the json serialized data * * @param jsonStr json string - * @param path to the stored output + * @param path to the stored output * @return OpPipelineStageBase */ def loadFromJsonString(jsonStr: String, path: String): OpPipelineStageBase = { // Load stage json with its params val metadata = SparkDefaultParamsReadWrite.parseMetadata(jsonStr) val (className, metadataJson) = metadata.className -> metadata.metadata + // Check if it's a model - val isModel = (metadataJson \ FieldNames.IsModel.entryName).extract[Boolean] - // In case we stumbled upon model we instantiate it using the class name + ctor args - // otherwise we simply use the provided stage instance. - val stage = if (isModel) loadModel(className, metadataJson) else originalStage - // Recover all stage spark params and it's input features - val inputFeatures = originalStage.getInputFeatures() + val isModelOpt = (metadataJson \ FieldNames.IsModel.entryName).extractOpt[Boolean] + val ctorArgsJson = metadataJson \ FieldNames.CtorArgs.entryName + val stageClass = ReflectionUtils.classForName(className).asInstanceOf[Class[OpPipelineStageBase]] + + val stageTry: Try[OpPipelineStageBase] = isModelOpt match { + // ************************** Legacy mode ************************** + case Some(isModel) => + // In case we stumbled upon a model we instantiate it using the class name + ctor args + // otherwise we simply use the provided stage instance. 
+ if (isModel) new DefaultOpPipelineStageReaderWriter[OpPipelineStageBase]().read(stageClass, ctorArgsJson) + else originalStage.map(Success(_)).getOrElse(Failure(new RuntimeException("Origin stage was not set"))) + // ***************************************************************** + case _ => + // Get the reader instance to load the stage + val reader = readerWriterFor[OpPipelineStageBase](stageClass) + reader.read(stageClass, ctorArgsJson) + } + val stage = stageTry match { + case Failure(err) => throw new RuntimeException(s"Failed to read the stage of type '${stageClass.getName}'", err) + case Success(stg) => stg + } // Update [[SparkWrapperParams]] with path so we can load the [[SparkStageParam]] instance val updatedMetadata = stage match { @@ -102,63 +130,13 @@ final class OpPipelineStageReader(val originalStage: OpPipelineStageBase) // Set all stage params from the metadata SparkDefaultParamsReadWrite.getAndSetParams(stage, updatedMetadata) - val matchingFeatures = stage.getTransientFeatures().map{ f => - inputFeatures.find( i => i.uid == f.uid && i.isResponse == f.isResponse && i.typeName == f.typeName ) - .getOrElse( throw new RuntimeException(s"Feature '${f.uid}' was not found for stage '${stage.uid}'") ) + + // Recover and set all stage input features + val matchingFeatures = stage.getTransientFeatures().map { f => + features.find(i => i.uid == f.uid && i.isResponse == f.isResponse && i.typeName == f.typeName) + .getOrElse(throw new RuntimeException(s"Feature '${f.uid}' was not found for stage '${stage.uid}'")) } stage.setInputFeatureArray(matchingFeatures) } - /** - * Load the model instance from the metadata by instantiating it using a class name + ctor args - */ - private def loadModel(modelClassName: String, metadataJson: JValue): OpPipelineStageBase = { - // Extract all the ctor args - val ctorArgsJson = (metadataJson \ FieldNames.CtorArgs.entryName).asInstanceOf[JObject].obj - val ctorArgsMap = ctorArgsJson.map { case (argName, j) => argName -> j.extract[AnyValue] }.toMap - // Get the model class - - // Make the ctor function used for creating a model instance - val ctorArgs: (String, Symbol) => Try[Any] = (argName, argSymbol) => Try { - val anyValue = ctorArgsMap.getOrElse(argName, - throw new RuntimeException(s"Ctor argument '$argName' was not found for model class '$modelClassName'") - ) - anyValue.`type` match { - // Special handling for Feature Type TypeTags - case AnyValueTypes.TypeTag => - Try(FeatureType.featureTypeTag(anyValue.value.toString)).recoverWith[TypeTag[_]] { case e => - Try(FeatureType.featureValueTypeTag(anyValue.value.toString)) - } match { - case Success(featureTypeTag) => featureTypeTag - case Failure(e) => - throw new RuntimeException( - s"Unknown type tag '${anyValue.value.toString}' for ctor argument '$argName'. 
" + - "Only Feature and Feature Value type tags are supported for serialization.", e - ) - } - - // Spark wrapped stage is saved using [[SparkWrapperParams]] and loaded later using - // [[SparkDefaultParamsReadWrite]].getAndSetParams returning 'null' here - case AnyValueTypes.SparkWrappedStage => { - null // yes, yes - this should be 'null' - } - - // Everything else is read using json4s - case AnyValueTypes.Value => Try { - val ttag = ReflectionUtils.typeTagForType[Any](tpe = argSymbol.info) - val manifest = ReflectionUtils.manifestForTypeTag[Any](ttag) - Extraction.decompose(anyValue.value).extract[Any](formats, manifest) - } match { - case Success(any) => any - case Failure(e) => throw new RuntimeException( - s"Failed to parse argument '$argName' from value '${anyValue.value}'", e) - } - } - } - - // Reflect model class instance by class + ctor args - val modelClass = ReflectionUtils.classForName(modelClassName) - val model = ReflectionUtils.newInstance[OpPipelineStageBase](modelClass, ctorArgs) - model - } } diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala new file mode 100644 index 0000000000..30d60096a5 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages + +import com.salesforce.op.features.FeatureDistributionType +import com.salesforce.op.stages.impl.feature._ +import com.salesforce.op.utils.json.{EnumEntrySerializer, SpecialDoubleSerializer} +import com.salesforce.op.utils.reflection.ReflectionUtils +import enumeratum.{Enum, EnumEntry} +import org.json4s.ext.JodaTimeSerializers +import org.json4s.jackson.Serialization +import org.json4s.{Formats, FullTypeHints, JValue} +import org.slf4j.LoggerFactory + +import scala.reflect.ClassTag +import scala.util.{Failure, Success, Try} + + +/** + * Stage reader/writer implementation used to (de)serialize stages from/to trained models + * + * @tparam StageType stage type to read/write + */ +trait OpPipelineStageReaderWriter[StageType <: OpPipelineStageBase] extends OpPipelineStageReadWriteFormats { + + /** + * Read stage from json + * + * @param stageClass stage class + * @param json json to read stage from + * @return read result + */ + def read(stageClass: Class[StageType], json: JValue): Try[StageType] + + /** + * Write stage to json + * + * @param stage stage instance to write + * @return write result + */ + def write(stage: StageType): Try[JValue] + +} + + +object OpPipelineStageReaderWriter extends OpPipelineStageReadWriteFormats { + + private val log = LoggerFactory.getLogger(OpPipelineStageReaderWriter.getClass) + + /** + * Stage json field names + */ + sealed abstract class FieldNames(override val entryName: String) extends EnumEntry + + /** + * Stage json field names + */ + object FieldNames extends Enum[FieldNames] { + val values = findValues + case object IsModel extends FieldNames("isModel") + case object CtorArgs extends FieldNames("ctorArgs") + case object Uid extends FieldNames("uid") + case object Class extends FieldNames("class") + case object ParamMap extends FieldNames("paramMap") + } + + /** + * Any Value Types + */ + sealed abstract class AnyValueTypes extends EnumEntry + + /** + * Any Value Types + */ + object AnyValueTypes extends Enum[AnyValueTypes] { + val values = findValues + case object TypeTag extends AnyValueTypes + case object SparkWrappedStage extends AnyValueTypes + case object ClassInstance extends AnyValueTypes + case object Value extends AnyValueTypes + } + + /** + * A container for Any Value + */ + case class AnyValue(`type`: AnyValueTypes, value: Any, valueClass: Option[String]) + + /** + * Retrieve reader/writer implementation: either the custom one specified with [[ReaderWriter]] annotation + * or the default one [[DefaultOpPipelineStageReaderWriter]] + * + * @param stageClass stage class + * @tparam StageType stage type + * @return reader/writer implementation + */ + def readerWriterFor[StageType <: OpPipelineStageBase : ClassTag] + ( + stageClass: Class[StageType] + ): OpPipelineStageReaderWriter[StageType] = { + if (!stageClass.isAnnotationPresent(classOf[ReaderWriter])) { + new DefaultOpPipelineStageReaderWriter[StageType]() + } + else { + Try { + val readerWriterClass = stageClass.getAnnotation[ReaderWriter](classOf[ReaderWriter]).value() + ReflectionUtils.newInstance[OpPipelineStageReaderWriter[StageType]](readerWriterClass.getName) + } match { + case Success(readerWriter) => + if (log.isDebugEnabled) { + log.debug(s"Using reader/writer of type '${readerWriter.getClass.getName}'" + + s"to (de)serialize stage of type '${stageClass.getName}'") + } + readerWriter + case Failure(e) => throw new RuntimeException( + s"Failed to create reader/writer instance for stage class ${stageClass.getName}", e) + } + } + } + +} + 
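The OpPipelineStageReadWriteFormats trait below registers json4s FullTypeHints for the scaler args classes. A minimal standalone json4s sketch (types invented for illustration) of why such hints are needed to round-trip a polymorphic value:

    import org.json4s.{Formats, FullTypeHints}
    import org.json4s.jackson.Serialization

    sealed trait Args
    case class Linear(slope: Double, intercept: Double) extends Args
    case class Empty() extends Args

    object TypeHintsDemo extends App {
      implicit val formats: Formats =
        Serialization.formats(FullTypeHints(List(classOf[Args])))
      // The emitted json carries a "jsonClass" field with the concrete class name,
      // so reading back as the base trait recovers the right subclass
      val json = Serialization.write[Args](Linear(slope = 2.0, intercept = 1.0))
      assert(Serialization.read[Args](json) == Linear(2.0, 1.0))
    }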
+ +trait OpPipelineStageReadWriteFormats { + + import OpPipelineStageReaderWriter._ + + val typeHints = FullTypeHints(List( + classOf[EmptyScalerArgs], classOf[LinearScalerArgs] + )) + + implicit val formats: Formats = + Serialization.formats(typeHints) ++ + JodaTimeSerializers.all + + EnumEntrySerializer.json4s[AnyValueTypes](AnyValueTypes) + + EnumEntrySerializer.json4s[HashAlgorithm](HashAlgorithm) + + EnumEntrySerializer.json4s[HashSpaceStrategy](HashSpaceStrategy) + + EnumEntrySerializer.json4s[ScalingType](ScalingType) + + EnumEntrySerializer.json4s[TimePeriod](TimePeriod) + + EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType) + + new SpecialDoubleSerializer + +} + + +private[op] trait OpPipelineStageSerializationFuns { + + import OpPipelineStageReaderWriter._ + + def serializeArgument(argName: String, value: AnyRef): AnyValue = { + try { + val valueClass = value.getClass + // Test that value has no external dependencies and can be constructed without ctor args or is an object + ReflectionUtils.newInstance[AnyRef](valueClass.getName) + AnyValue(AnyValueTypes.ClassInstance, valueClass.getName, Option(valueClass.getName)) + } catch { + case error: Exception => throw new RuntimeException( + s"Argument '$argName' [${value.getClass.getName}] cannot be serialized. " + + s"Make sure ${value.getClass.getName} has either a no-args ctor or is an object, " + + "and does not have any external dependencies, e.g. does not use any out-of-scope variables.", error) + } + } + +} diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala index cc333af2bc..9cdf05ac35 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala @@ -30,19 +30,15 @@ package com.salesforce.op.stages -import com.salesforce.op.features.types.FeatureType -import com.salesforce.op.stages.OpPipelineStageReadWriteShared._ +import com.salesforce.op.stages.OpPipelineStageReaderWriter._ import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams -import com.salesforce.op.utils.reflection.ReflectionUtils import org.apache.hadoop.fs.Path import org.apache.spark.ml.util.MLWriter -import org.apache.spark.ml.{Estimator, Model, PipelineStage, SparkDefaultParamsReadWrite} -import org.json4s.Extraction +import org.apache.spark.ml.{Estimator, SparkDefaultParamsReadWrite} import org.json4s.JsonAST.{JObject, JValue} -import org.json4s.jackson.JsonMethods.{compact, parse, render} +import org.json4s.jackson.JsonMethods.{compact, render} -import scala.reflect.runtime.universe.TypeTag -import scala.util.{Failure, Success, Try} +import scala.util.{Failure, Success} /** * MLWriter class used to write TransmogrifAI stages to disk @@ -68,78 +64,27 @@ final class OpPipelineStageWriter(val stage: OpPipelineStageBase) extends MLWrit * * @return stage metadata json */ - def writeToJson(path: String): JObject = jsonSerialize(writeToMap(path)).asInstanceOf[JObject] - - /** - * Stage metadata map - * - * @return stage metadata map - */ - def writeToMap(path: String): Map[String, Any] = { + def writeToJson(path: String): JObject = { stage match { - case _: Estimator[_] => return Map.empty[String, Any] // no need to serialize estimators + case _: Estimator[_] => return JObject() // no need to serialize estimators case s: SparkWrapperParams[_] => // Set save path for all Spark wrapped stages of type 
[[SparkWrapperParams]] so they can save s.setStageSavePath(path) case _ => } // We produce stage metadata for all the Spark params - val metadataJson = SparkDefaultParamsReadWrite.getMetadataToSave(stage) - // Add isModel indicator - val metadata = parse(metadataJson).extract[Map[String, Any]] + (FieldNames.IsModel.entryName -> isModel) - // In case we stumbled upon a model instance, we also include it's ctor args - // so we can reconstruct the model instance when loading - if (isModel) metadata + (FieldNames.CtorArgs.entryName -> modelCtorArgs().toMap) else metadata - } - - private def isModel: Boolean = stage.isInstanceOf[Model[_]] - - /** - * Extract model ctor args values keyed by their names, so we can reconstruct the model instance when loading. - * See [[OpPipelineStageReader]].OpPipelineStageReader - * - * @return model ctor args values by their names - */ - private def modelCtorArgs(): Seq[(String, AnyValue)] = Try { - // Reflect all model ctor args - val (_, argsList) = ReflectionUtils.bestCtorWithArgs(stage) - - // Wrap all ctor args into AnyValue container - for {(argName, argValue) <- argsList} yield { - val anyValue = argValue match { - // Special handling for Feature Type TypeTags - case t: TypeTag[_] if FeatureType.isFeatureType(t) || FeatureType.isFeatureValueType(t) => - AnyValue(`type` = AnyValueTypes.TypeTag, value = ReflectionUtils.dealisedTypeName(t.tpe)) - case t: TypeTag[_] => - throw new RuntimeException( - s"Unknown type tag '${t.tpe.toString}'. " + - "Only Feature and Feature Value type tags are supported for serialization." - ) + val metadata = SparkDefaultParamsReadWrite.getMetadataToSave(stage) - // Spark wrapped stage is saved using [[SparkWrapperParams]], so we just writing it's uid here - case Some(v: PipelineStage) => AnyValue(AnyValueTypes.SparkWrappedStage, v.uid) - case v: PipelineStage => AnyValue(AnyValueTypes.SparkWrappedStage, v.uid) - - // Everything else goes as is and is handled by json4s - case v => - // try serialize value with json4s - val av = AnyValue(AnyValueTypes.Value, v) - Try(jsonSerialize(av)) match { - case Success(_) => av - case Failure(e) => - throw new RuntimeException(s"Failed to json4s serialize argument '$argName' with value '$v'", e) - } - - } - argName -> anyValue + // Write out the stage using the specified writer instance + val writer = readerWriterFor[OpPipelineStageBase](stage.getClass.asInstanceOf[Class[OpPipelineStageBase]]) + val stageJson: JValue = writer.write(stage) match { + case Failure(err) => throw new RuntimeException(s"Failed to write out stage '${stage.uid}'", err) + case Success(json) => json } - } match { - case Success(args) => args - case Failure(error) => - throw new RuntimeException(s"Failed to extract constructor arguments for model stage '${stage.uid}'. 
" + - "Make sure your model class is a concrete final class with json4s serializable arguments.", error - ) + + // Join metadata & with stage ctor args + val j = metadata.merge(JObject(FieldNames.CtorArgs.entryName -> stageJson)) + render(j).asInstanceOf[JObject] } - private def jsonSerialize(v: Any): JValue = render(Extraction.decompose(v)) } diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala index 8b26237dba..c2010b7018 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala @@ -41,7 +41,6 @@ import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType -import scala.reflect.runtime.universe.TypeTag import scala.util.{Success, Try} diff --git a/features/src/main/scala/com/salesforce/op/stages/base/sequence/BinarySequenceTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/sequence/BinarySequenceTransformer.scala index 3c8ec96861..72ceb9b4a4 100644 --- a/features/src/main/scala/com/salesforce/op/stages/base/sequence/BinarySequenceTransformer.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/sequence/BinarySequenceTransformer.scala @@ -35,9 +35,9 @@ import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStage2N, OpTransformer} import org.apache.spark.ml.Transformer +import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils -import org.apache.spark.sql.functions._ import scala.reflect.runtime.universe.TypeTag import scala.util.Try diff --git a/features/src/main/scala/com/salesforce/op/stages/impl/feature/ScalingArgs.scala b/features/src/main/scala/com/salesforce/op/stages/impl/feature/ScalingArgs.scala new file mode 100644 index 0000000000..9e3e8bfa56 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/stages/impl/feature/ScalingArgs.scala @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.utils.json.JsonLike + + +/** + * A trait to be extended by a case class containing the args needed to define a family of scaling & descaling functions + */ +trait ScalingArgs extends JsonLike + +/** + * Case class for scaling families that take no parameters + */ +case class EmptyScalerArgs() extends ScalingArgs + +/** + * Parameters needed to uniquely define a linear scaling function + * + * @param slope the slope of the linear scaler + * @param intercept the intercept of the linear scaler + */ +case class LinearScalerArgs(slope: Double, intercept: Double) extends ScalingArgs diff --git a/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala b/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala index 4eb621fe46..2b65c92846 100644 --- a/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala +++ b/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala @@ -150,7 +150,8 @@ private[test] trait TransformerSpecCommon[O <: FeatureType, TransformerType <: O */ protected def writeAndRead(stage: TransformerType, savePath: String = stageSavePath): OpPipelineStageBase = { val json = new OpPipelineStageWriter(stage).overwrite().writeToJsonString(savePath) - new OpPipelineStageReader(stage).loadFromJsonString(json, savePath) + val features = stage.getInputFeatures().flatMap(_.allFeatures) + new OpPipelineStageReader(features).loadFromJsonString(json, savePath) } /** diff --git a/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala b/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala index bf3b5d3c65..8179b2b403 100644 --- a/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala +++ b/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala @@ -35,6 +35,8 @@ case object SparkDefaultParamsReadWrite { type Metadata = DefaultParamsReader.Metadata + implicit val formats = DefaultFormats + /** * Helper for [[OpPipelineStageWriter]] which extracts the JSON to save. * This is useful for ensemble models which need to save metadata for many sub-models. 
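For reference, the LinearScalerArgs introduced above carry exactly the parameters needed to apply and invert a linear scaling. A small standalone sketch, assuming the convention scaled = slope * x + intercept with a non-zero slope:

    case class LinearScalerArgs(slope: Double, intercept: Double)

    object LinearScalingDemo extends App {
      val args = LinearScalerArgs(slope = 2.0, intercept = 1.0)
      val scale: Double => Double = x => args.slope * x + args.intercept
      val descale: Double => Double = y => (y - args.intercept) / args.slope
      // descale inverts scale whenever slope != 0
      val x = 3.14
      assert(math.abs(descale(scale(x)) - x) < 1e-12)
    }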
@@ -48,7 +50,7 @@ case object SparkDefaultParamsReadWrite { stage: OpPipelineStageBase, extraMetadata: Option[JObject] = None, paramMap: Option[JValue] = None - ): String = { + ): JObject = { val uid = stage.uid val cls = stage.getClass.getName val params = stage.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]] @@ -66,8 +68,7 @@ case object SparkDefaultParamsReadWrite { case None => basicMetadata } - val metadataJson: String = compact(render(metadata)) - metadataJson + metadata } /** diff --git a/features/src/test/scala/com/salesforce/op/stages/FeatureGeneratorStageTest.scala b/features/src/test/scala/com/salesforce/op/stages/FeatureGeneratorStageTest.scala new file mode 100644 index 0000000000..5be19f31e4 --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/stages/FeatureGeneratorStageTest.scala @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages + +import com.salesforce.op.aggregators.{CutOffTime, Event, MonoidAggregatorDefaults} +import com.salesforce.op.features.Feature +import com.salesforce.op.features.types.{FeatureType, FeatureTypeSparkConverter} +import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.utils.spark.RichRow._ +import org.apache.spark.sql.Row +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + +import scala.reflect.runtime.universe.WeakTypeTag + + +@RunWith(classOf[JUnitRunner]) +class FeatureGeneratorStageTest extends FlatSpec with TestSparkContext { + + val (ds, features) = TestFeatureBuilder.random()() + + type FeaturesAndGenerators = Array[(Feature[_ <: FeatureType], FeatureGeneratorStage[Row, _ <: FeatureType])] + + val featuresAndGenerators: FeaturesAndGenerators = + features + .map(f => f -> f.originStage) + .collect { case (f, fg: FeatureGeneratorStage[Row, _]@unchecked) => f -> fg } + + val rows = ds.collect() + + Spec[FeatureGeneratorStage[_, _]] should "be the origin stage for raw features" in { + features.length shouldBe featuresAndGenerators.length + } + + it should "extract feature values" in assertExtractFeatures(featuresAndGenerators) + + it should "aggregate feature values" in assertAggregateFeatures(featuresAndGenerators) + + it should "serialize to/from json, then extract and aggregate feature values" in { + val recovered: FeaturesAndGenerators = + for {(feature, featureGenerator) <- featuresAndGenerators} yield { + val featureGenJson = featureGenerator.write.asInstanceOf[OpPipelineStageWriter].writeToJsonString("") + val recoveredStage = new OpPipelineStageReader(Seq.empty).loadFromJsonString(featureGenJson, "") + recoveredStage shouldBe a[FeatureGeneratorStage[_, _]] + feature -> recoveredStage.asInstanceOf[FeatureGeneratorStage[Row, _ <: FeatureType]] + } + assertExtractFeatures(recovered) + assertAggregateFeatures(recovered) + } + + def assertExtractFeatures(fgs: FeaturesAndGenerators): Unit = { + for {(feature, featureGenerator) <- fgs} { + rows.foreach { row => + val featureValue = featureGenerator.extractFn(row) + featureValue shouldBe a[FeatureType] + row.getAny(feature.name) shouldBe FeatureTypeSparkConverter.toSpark(featureValue) + } + } + } + + def assertAggregateFeatures(fgs: FeaturesAndGenerators): Unit = { + for {(feature, featureGenerator) <- fgs} { + val fa = featureGenerator.featureAggregator + val expectedValue = fa.extract(rows, timeStampFn = None, cutOffTime = CutOffTime.NoCutoff()) + + val ftt = feature.wtt.asInstanceOf[WeakTypeTag[FeatureType]] + val rowVals = rows.map(r => FeatureTypeSparkConverter()(ftt).fromSpark(r.getAny(feature.name))) + val events = rowVals.map(Event(0L, _)) + val aggr = MonoidAggregatorDefaults.aggregatorOf(ftt) + val aggregatedValue = aggr(events) + + aggregatedValue shouldEqual expectedValue + } + } + +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala index 29597353c0..d92ef5641f 100644 --- a/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala @@ -30,7 +30,7 @@ package com.salesforce.op.stages.base.binary -import com.salesforce.op.features.types._ +import com.salesforce.op.features.types.{Real, _} import
com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @@ -42,10 +42,14 @@ class BinaryTransformerTest extends OpTransformerSpec[Real, BinaryTransformer[Re val sample = Seq(Real(1.0) -> RealNN(0.0), Real(2.0) -> RealNN(2.0), Real.empty -> RealNN(1.0)) val (inputData, f1, f2) = TestFeatureBuilder(sample) - val transformer = new BinaryLambdaTransformer[Real, RealNN, Real](operationName = "bmi", - transformFn = (i1, i2) => new Real(for { v1 <- i1.value; v2 <- i2.value } yield v1 / (v2 * v2)) + val transformer = new BinaryLambdaTransformer[Real, RealNN, Real]( + operationName = "bmi", transformFn = BinaryTransformerTest.fn ).setInput(f1, f2) val expectedResult = Seq(Real(Double.PositiveInfinity), Real(0.5), Real.empty) } + +object BinaryTransformerTest { + def fn: (Real, RealNN) => Real = (i1, i2) => new Real(for {v1 <- i1.value; v2 <- i2.value} yield v1 / (v2 * v2)) +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala index ed5c448d97..0e029e8d55 100644 --- a/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala @@ -48,12 +48,16 @@ class QuaternaryTransformerTest val (inputData, f1, f2, f3, f4) = TestFeatureBuilder(sample) - val transformer = new QuaternaryLambdaTransformer[Real, Integral, Text, Binary, Real](operationName = "quatro", - transformFn = (r, i, t, b) => - (r.v.getOrElse(0.0) + i.toDouble.getOrElse(0.0) + b.toDouble.getOrElse(0.0) + - t.value.map(_.length.toDouble).getOrElse(0.0)).toReal + val transformer = new QuaternaryLambdaTransformer[Real, Integral, Text, Binary, Real]( + operationName = "quatro", transformFn = QuaternaryTransformerTest.fn ).setInput(f1, f2, f3, f4) val expectedResult = Seq(4.toReal, 6.toReal, 11.toReal) } + +object QuaternaryTransformerTest { + def fn: (Real, Integral, Text, Binary) => Real = (r, i, t, b) => + (r.v.getOrElse(0.0) + i.toDouble.getOrElse(0.0) + b.toDouble.getOrElse(0.0) + + t.value.map(_.length.toDouble).getOrElse(0.0)).toReal +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/sequence/BinarySequenceTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/sequence/BinarySequenceTransformerTest.scala index 1602df79c1..6ec1bca0fc 100644 --- a/features/src/test/scala/com/salesforce/op/stages/base/sequence/BinarySequenceTransformerTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/sequence/BinarySequenceTransformerTest.scala @@ -50,8 +50,7 @@ class BinarySequenceTransformerTest val (inputData, f1, f2, f3) = TestFeatureBuilder(sample) val transformer = new BinarySequenceLambdaTransformer[Real, Text, MultiPickList]( - operationName = "realToMultiPicklist", - transformFn = (r, texts) => MultiPickList(texts.map(_.value.get).toSet + r.value.get.toString) + operationName = "realToMultiPicklist", transformFn = BinarySequenceTransformerTest.fn ).setInput(f1, f2, f3) val expectedResult = Seq( @@ -62,3 +61,7 @@ class BinarySequenceTransformerTest ).map(_.toMultiPickList) } +object BinarySequenceTransformerTest { + def fn: (Real, Seq[Text]) => MultiPickList = + (r, texts) => MultiPickList(texts.map(_.value.get).toSet + r.value.get.toString) +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala
b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala index 8d0c880dc3..427fb8f55a 100644 --- a/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala @@ -84,7 +84,7 @@ final class FractionOfResponsesModel private[op] operationName: String, uid: String ) extends SequenceModel[DateList, OPVector](operationName = operationName, uid = uid) { - def transformFn: (Seq[DateList]) => OPVector = row => { + def transformFn: Seq[DateList] => OPVector = row => { val fractions = row.zip(counts).map { case (feature, count) => feature.value.size.toDouble / count } Vectors.dense(fractions.toArray).toOPVector } diff --git a/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala index 2fa455079c..a03e3b1ec3 100644 --- a/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala @@ -47,8 +47,8 @@ class SequenceTransformerTest extends OpTransformerSpec[MultiPickList, SequenceT ) val (inputData, f1, f2) = TestFeatureBuilder(sample) - val transformer = new SequenceLambdaTransformer[Real, MultiPickList](operationName = "realToMultiPicklist", - transformFn = value => MultiPickList(value.flatMap(_.v.map(_.toString)).toSet) + val transformer = new SequenceLambdaTransformer[Real, MultiPickList]( + operationName = "realToMultiPicklist", transformFn = SequenceTransformerTest.fn ).setInput(f1, f2) val expectedResult = Seq( @@ -59,3 +59,7 @@ class SequenceTransformerTest extends OpTransformerSpec[MultiPickList, SequenceT ) } + +object SequenceTransformerTest { + def fn: Seq[Real] => MultiPickList = value => MultiPickList(value.flatMap(_.v.map(_.toString)).toSet) +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala index 3651fbad2b..26bdd38533 100644 --- a/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala @@ -47,10 +47,15 @@ class TernaryTransformerTest extends OpTransformerSpec[Real, TernaryTransformer[ val (inputData, f1, f2, f3) = TestFeatureBuilder(sample) - val transformer = new TernaryLambdaTransformer[Real, Integral, Binary, Real](operationName = "trio", - transformFn = (r, i, b) => (r.v.getOrElse(0.0) + i.toDouble.getOrElse(0.0) + b.toDouble.getOrElse(0.0)).toReal + val transformer = new TernaryLambdaTransformer[Real, Integral, Binary, Real]( + operationName = "trio", transformFn = TernaryTransformerTest.fn ).setInput(f1, f2, f3) val expectedResult = Seq(1.toReal, 5.toReal, 4.toReal) } + +object TernaryTransformerTest { + def fn: (Real, Integral, Binary) => Real = + (r, i, b) => (r.v.getOrElse(0.0) + i.toDouble.getOrElse(0.0) + b.toDouble.getOrElse(0.0)).toReal +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala index 819a211484..d7cbbfcccc 100644 --- a/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala +++
b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala @@ -48,8 +48,7 @@ class UnaryTransformerTest extends OpTransformerSpec[Real, UnaryLambdaTransforme * [[OpTransformer]] instance to be tested */ val transformer = new UnaryLambdaTransformer[Real, Real]( - operationName = "unary", - transformFn = r => r.v.map(_ * 2.0).toReal + operationName = "unary", transformFn = UnaryTransformerTest.fn ).setInput(f1) /** @@ -58,3 +57,7 @@ class UnaryTransformerTest extends OpTransformerSpec[Real, UnaryLambdaTransforme val expectedResult = Seq(Real(2), Real(4), Real(6), Real.empty) } + +object UnaryTransformerTest { + def fn: Real => Real = r => r.v.map(_ * 2.0).toReal +} diff --git a/readers/src/main/scala/com/salesforce/op/readers/JoinedDataReader.scala b/readers/src/main/scala/com/salesforce/op/readers/JoinedDataReader.scala index c52164b575..7b9b8a5ab3 100644 --- a/readers/src/main/scala/com/salesforce/op/readers/JoinedDataReader.scala +++ b/readers/src/main/scala/com/salesforce/op/readers/JoinedDataReader.scala @@ -149,7 +149,7 @@ private[op] abstract class JoinedReader[T, U] )(implicit spark: SparkSession): (DataFrame, Array[String]) = { def getData(r: DataReader[_]): DataFrame = { - val readerFeatures = rawFeatures.filter { f => getGenStage(f).tti.tpe.toString == r.fullTypeName } + val readerFeatures = rawFeatures.filter { f => getGenStage(f).tti.tpe.typeSymbol.fullName == r.fullTypeName } r.generateDataFrame(readerFeatures, opParams) } diff --git a/readers/src/main/scala/com/salesforce/op/readers/Reader.scala b/readers/src/main/scala/com/salesforce/op/readers/Reader.scala index 25fe4ef803..a41c51e612 100644 --- a/readers/src/main/scala/com/salesforce/op/readers/Reader.scala +++ b/readers/src/main/scala/com/salesforce/op/readers/Reader.scala @@ -51,7 +51,7 @@ private[readers] trait ReaderType[T] extends Serializable { * * @return full input type name */ - final def fullTypeName: String = wtt.tpe.toString + final def fullTypeName: String = wtt.tpe.typeSymbol.fullName /** * Short reader input type name diff --git a/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala b/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala index 896320035e..975bbf567a 100644 --- a/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala +++ b/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala @@ -34,32 +34,44 @@ import com.salesforce.op.aggregators.MaxReal import com.salesforce.op.features.types._ import com.salesforce.op.features.{FeatureBuilder, OPFeature} import org.joda.time.Duration +import PassengerFeaturesTestLambdas._ trait PassengerFeaturesTest { - - val age = FeatureBuilder.Real[Passenger].extract(_.getAge.toReal).aggregate(MaxReal).asPredictor - val gender = FeatureBuilder.MultiPickList[Passenger].extract(p => Set(p.getGender).toMultiPickList).asPredictor - val genderPL = FeatureBuilder.PickList[Passenger].extract(p => p.getGender.toPickList).asPredictor - - val height = FeatureBuilder.RealNN[Passenger] - .extract(p => Option(p.getHeight).map(_.toDouble).toRealNN(0.0)) - .window(Duration.millis(300)) - .asPredictor - - val heightNoWindow = FeatureBuilder.Real[Passenger].extract(_.getHeight.toReal).asPredictor - val weight = FeatureBuilder.Real[Passenger].extract(_.getWeight.toReal).asPredictor - val description = FeatureBuilder.Text[Passenger].extract(_.getDescription.toText).asPredictor - val boarded = FeatureBuilder.DateList[Passenger].extract(p => 
Seq(p.getBoarded.toLong).toDateList).asPredictor - val stringMap = FeatureBuilder.TextMap[Passenger].extract(p => p.getStringMap.toTextMap).asPredictor - val numericMap = FeatureBuilder.RealMap[Passenger].extract(p => p.getNumericMap.toRealMap).asPredictor - val booleanMap = FeatureBuilder.BinaryMap[Passenger].extract(p => p.getBooleanMap.toBinaryMap).asPredictor - val survived = FeatureBuilder.Binary[Passenger].extract(p => Option(p.getSurvived).map(_ == 1).toBinary).asResponse - val boardedTime = FeatureBuilder.Date[Passenger].extract(_.getBoarded.toLong.toDate).asPredictor - val boardedTimeAsDateTime = FeatureBuilder.DateTime[Passenger].extract(_.getBoarded.toLong.toDateTime).asPredictor + val age = FeatureBuilder.Real[Passenger].extract(ageFn).aggregate(MaxReal).asPredictor + val gender = FeatureBuilder.MultiPickList[Passenger].extract(genderFn).asPredictor + val genderPL = FeatureBuilder.PickList[Passenger].extract(genderPLFn).asPredictor + val height = FeatureBuilder.RealNN[Passenger].extract(heightFn).window(Duration.millis(300)).asPredictor + val heightNoWindow = FeatureBuilder.Real[Passenger].extract(heightToReal).asPredictor + val weight = FeatureBuilder.Real[Passenger].extract(weightToReal).asPredictor + val description = FeatureBuilder.Text[Passenger].extract(descriptionFn).asPredictor + val boarded = FeatureBuilder.DateList[Passenger].extract(boardedToDL).asPredictor + val stringMap = FeatureBuilder.TextMap[Passenger].extract(stringMapFn).asPredictor + val numericMap = FeatureBuilder.RealMap[Passenger].extract(numericMapFn).asPredictor + val booleanMap = FeatureBuilder.BinaryMap[Passenger].extract(booleanMapFn).asPredictor + val survived = FeatureBuilder.Binary[Passenger].extract(survivedFn).asResponse + val boardedTime = FeatureBuilder.Date[Passenger].extract(boardedTimeFn).asPredictor + val boardedTimeAsDateTime = FeatureBuilder.DateTime[Passenger].extract(boardedDTFn).asPredictor val rawFeatures: Array[OPFeature] = Array( survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap ) } + +object PassengerFeaturesTestLambdas { + def genderFn: Passenger => MultiPickList = p => Set(p.getGender).toMultiPickList + def genderPLFn: Passenger => PickList = p => p.getGender.toPickList + def heightFn: Passenger => RealNN = p => Option(p.getHeight).map(_.toDouble).toRealNN(0.0) + def heightToReal: Passenger => Real = _.getHeight.toReal + def weightToReal: Passenger => Real = _.getWeight.toReal + def descriptionFn: Passenger => Text = _.getDescription.toText + def boardedToDL: Passenger => DateList = p => Seq(p.getBoarded.toLong).toDateList + def stringMapFn: Passenger => TextMap = p => p.getStringMap.toTextMap + def numericMapFn: Passenger => RealMap = p => p.getNumericMap.toRealMap + def booleanMapFn: Passenger => BinaryMap = p => p.getBooleanMap.toBinaryMap + def survivedFn: Passenger => Binary = p => Option(p.getSurvived).map(_ == 1).toBinary + def boardedTimeFn: Passenger => Date = _.getBoarded.toLong.toDate + def boardedDTFn: Passenger => DateTime = _.getBoarded.toLong.toDateTime + def ageFn: Passenger => Real = _.getAge.toReal +} diff --git a/readers/src/test/scala/com/salesforce/op/readers/DataReadersTest.scala b/readers/src/test/scala/com/salesforce/op/readers/DataReadersTest.scala index 2f66c3dee9..007fe5866f 100644 --- a/readers/src/test/scala/com/salesforce/op/readers/DataReadersTest.scala +++ b/readers/src/test/scala/com/salesforce/op/readers/DataReadersTest.scala @@ -175,7 +175,7 @@ class DataReadersTest extends FlatSpec with 
PassengerSparkFixtureTest with TestC } } - aggReaders.foreach( reader => + aggReaders.foreach(reader => Spec(reader.getClass) should "read and aggregate data correctly" in { val data = reader.readDataset().collect() data.foreach(_ shouldBe a[PassengerCaseClass]) @@ -183,13 +183,13 @@ class DataReadersTest extends FlatSpec with PassengerSparkFixtureTest with TestC val aggregatedData = reader.generateDataFrame(rawFeatures = Array(agePredictor, survivedResponse)).collect() aggregatedData.length shouldBe 6 - aggregatedData.collect { case r if r.get(0) == "4" => r} shouldEqual Array(Row("4", 60, false)) + aggregatedData.collect { case r if r.get(0) == "4" => r } shouldEqual Array(Row("4", 60, false)) reader.fullTypeName shouldBe typeOf[PassengerCaseClass].toString } ) - conditionalReaders.foreach( reader => + conditionalReaders.foreach(reader => Spec(reader.getClass) should "read and conditionally aggregate data correctly" in { val data = reader.readDataset().collect() data.foreach(_ shouldBe a[PassengerCaseClass]) @@ -203,4 +203,3 @@ class DataReadersTest extends FlatSpec with PassengerSparkFixtureTest with TestC } ) } - diff --git a/testkit/src/main/scala/com/salesforce/op/test/FeatureAsserts.scala b/testkit/src/main/scala/com/salesforce/op/test/FeatureAsserts.scala index 833f4b3f78..e11ff062e1 100644 --- a/testkit/src/main/scala/com/salesforce/op/test/FeatureAsserts.scala +++ b/testkit/src/main/scala/com/salesforce/op/test/FeatureAsserts.scala @@ -86,4 +86,5 @@ trait FeatureAsserts extends Matchers { fg.uid.startsWith(classOf[FeatureGeneratorStage[I, O]].getSimpleName) shouldBe true } + } diff --git a/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala b/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala index b997cc45db..4b0b46927a 100644 --- a/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala +++ b/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala @@ -284,4 +284,3 @@ class ReflectionUtilsTest extends FlatSpec with Matchers { } } -
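
Note on the new ScalingArgs hierarchy introduced in this diff: the case classes only carry parameters; the actual scale/descale function pair is defined by whichever scaler family consumes the args. As a minimal sketch of the intended linear semantics (the LinearScalerExample object and its scale/descale helpers below are illustrative assumptions, not part of this change):

import com.salesforce.op.stages.impl.feature.LinearScalerArgs

// Illustrative only: a linear scaler y = slope * x + intercept and its inverse,
// parameterized by the LinearScalerArgs case class added in this diff.
object LinearScalerExample {

  // forward scaling function built from the args
  def scale(args: LinearScalerArgs): Double => Double =
    x => args.slope * x + args.intercept

  // descaling (inverse) function; undefined when slope == 0
  def descale(args: LinearScalerArgs): Double => Double =
    y => (y - args.intercept) / args.slope

  def main(argv: Array[String]): Unit = {
    val args = LinearScalerArgs(slope = 2.0, intercept = 1.0)
    val scaled = scale(args)(3.0)         // 2.0 * 3.0 + 1.0 = 7.0
    val restored = descale(args)(scaled)  // (7.0 - 1.0) / 2.0 = 3.0
    assert(restored == 3.0)
  }
}

Because the args are plain case classes extending JsonLike, a scaler stage can persist them as json4s-serializable ctor args and rebuild the identical function pair on load, which fits the ctor-args serialization approach the reader/writer changes in this diff move towards.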