From eb2da768ebcfebe1a9658f40a9a23cb44b1762c3 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 8 Aug 2018 09:32:52 -0700 Subject: [PATCH 01/40] Initial implementation of XGBoost classifier & regressor moddels + upgrade to Spark 2.3.1 --- build.gradle | 8 +- core/build.gradle | 3 + .../OpMultiClassificationEvaluator.scala | 7 +- .../op/filters/FeatureDistribution.scala | 1 - .../op/filters/RawFeatureFilter.scala | 23 +-- .../com/salesforce/op/filters/Summary.scala | 2 +- .../classification/OpLogisticRegression.scala | 5 +- .../classification/OpXGBoostClassifier.scala | 167 ++++++++++++++++++ .../impl/regression/OpXGBoostRegressor.scala | 166 +++++++++++++++++ .../specific/SparkModelConverter.scala | 3 + .../xgboost4j/scala/spark/XGBoostParams.scala | 37 ++++ .../op/filters/PreparedFeaturesTest.scala | 7 +- .../OpDecisionTreeClassifierTest.scala | 2 + .../classification/OpGBTClassifierTest.scala | 2 + .../impl/classification/OpLinearSVCTest.scala | 2 + .../OpLogisticRegressionTest.scala | 2 + ...OpMultilayerPerceptronClassifierTest.scala | 2 + .../classification/OpNaiveBayesTest.scala | 2 + .../OpRandomForestClassifierTest.scala | 2 + .../OpXGBoostClassifierTest.scala | 85 +++++++++ .../OpDecisionTreeRegressorTest.scala | 2 + .../impl/regression/OpGBTRegressorTest.scala | 2 + .../OpGeneralizedLinearRegressionTest.scala | 2 + .../regression/OpLinearRegressionTest.scala | 2 + .../OpRandomForestRegressorTest.scala | 2 + .../regression/OpXGBoostRegressorTest.scala | 79 +++++++++ .../op/aggregators/ExtendedMultiset.scala | 2 + utils/build.gradle | 4 +- 28 files changed, 594 insertions(+), 29 deletions(-) create mode 100644 core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala create mode 100644 core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala create mode 100644 core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala create mode 100644 core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala create mode 100644 core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala diff --git a/build.gradle b/build.gradle index c223a3db77..16a5e7f158 100644 --- a/build.gradle +++ b/build.gradle @@ -61,7 +61,7 @@ configure(allProjs) { scalaCheckVersion = '1.14.0' junitVersion = '4.11' avroVersion = '1.7.7' - sparkVersion = '2.2.1' + sparkVersion = '2.3.1' sparkAvroVersion = '4.0.0' scalaGraphVersion = '1.11.2' scalafmtVersion = '1.0.0-RC1' @@ -70,7 +70,7 @@ configure(allProjs) { json4sVersion = '3.2.11' // matches Spark dependency version jodaTimeVersion = '2.9.4' jodaConvertVersion = '1.8.1' - algebirdVersion = '0.12.3' + algebirdVersion = '0.13.4' jacksonVersion = '2.7.3' luceneVersion = '7.3.0' enumeratumVersion = '1.4.12' @@ -78,12 +78,12 @@ configure(allProjs) { googleLibPhoneNumberVersion = '8.8.5' googleGeoCoderVersion = '2.82' googleCarrierVersion = '1.72' - chillAvroVersion = '0.8.0' + chillAvroVersion = '0.8.4' reflectionsVersion = '0.9.11' collectionsVersion = '3.2.2' optimaizeLangDetectorVersion = '0.7.1' tikaVersion = '1.16' - sparkTestingBaseVersion = '2.2.0_0.8.0' + sparkTestingBaseVersion = '2.3.1_0.10.0' sourceCodeVersion = '0.1.3' pegdownVersion = '1.4.2' commonsValidatorVersion = '1.6' diff --git a/core/build.gradle b/core/build.gradle index 5290a7e4d0..9a4baa3a62 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -22,4 +22,7 @@ dependencies { // Scopt compile "com.github.scopt:scopt_$scalaVersion:$scoptVersion" + + // XGBoost + compile "ml.dmlc:xgboost4j-spark:0.80-SNAPSHOT" } diff --git a/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala b/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala index bd347e8384..7ff4e7a66f 100644 --- a/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala +++ b/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala @@ -32,10 +32,9 @@ package com.salesforce.op.evaluators import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.salesforce.op.UID -import com.salesforce.op.features.types.Prediction -import com.salesforce.op.utils.json.JsonLike import com.twitter.algebird.Monoid._ import com.twitter.algebird.Operators._ +import com.twitter.algebird.Tuple2Semigroup import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.{DoubleArrayParam, IntArrayParam} @@ -230,8 +229,8 @@ private[op] class OpMultiClassificationEvaluator .map(_ -> (new Array[Long](nThresholds), new Array[Long](nThresholds))) .toMap[Label, CorrIncorr] - val agg: MetricsMap = - data.treeAggregate[MetricsMap](zeroValue)(combOp = _ + _, seqOp = _ + computeMetrics(_)) + implicit val sgTuple2 = new Tuple2Semigroup[Array[Long], Array[Long]]() + val agg: MetricsMap = data.treeAggregate[MetricsMap](zeroValue)(combOp = _ + _, seqOp = _ + computeMetrics(_)) val nRows = data.count() ThresholdMetrics( diff --git a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala index 668057f065..3797e9a214 100644 --- a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala +++ b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala @@ -30,7 +30,6 @@ package com.salesforce.op.filters -import com.salesforce.op.features.TransientFeature import com.salesforce.op.stages.impl.feature.{Inclusion, NumericBucketizer} import com.twitter.algebird.Semigroup import com.twitter.algebird.Monoid._ diff --git a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala index cc5da5dcd4..1830cfee4c 100644 --- a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala +++ b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala @@ -30,19 +30,18 @@ package com.salesforce.op.filters -import scala.math.{abs, min} - import com.salesforce.op.OpParams import com.salesforce.op.features.types._ import com.salesforce.op.features.{OPFeature, TransientFeature} +import com.salesforce.op.filters.FeatureDistribution._ +import com.salesforce.op.filters.Summary._ import com.salesforce.op.readers.{DataFrameFieldNames, Reader} -import com.salesforce.op.stages.impl.feature.{HashAlgorithm, Inclusion, NumericBucketizer, TextTokenizer} +import com.salesforce.op.stages.impl.feature.HashAlgorithm import com.salesforce.op.stages.impl.preparators.CorrelationType import com.salesforce.op.utils.spark.RichRow._ -import com.twitter.algebird.Monoid -import com.twitter.algebird.Semigroup import com.twitter.algebird.Monoid._ import com.twitter.algebird.Operators._ +import com.twitter.algebird.Tuple2Semigroup import org.apache.spark.mllib.feature.HashingTF import org.apache.spark.mllib.linalg.{Matrix, Vector} import org.apache.spark.mllib.stat.Statistics @@ -51,6 +50,8 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.slf4j.LoggerFactory +import scala.math.{abs, min} + /** * Specialized stage that will load up data and compute distributions and empty counts on raw features. * This information is then used to compute which raw features should be excluded from the workflow DAG @@ -122,25 +123,27 @@ class RawFeatureFilter[T] None } val predOut = allPredictors.map(TransientFeature(_)) - (respOut, predOut) } - val preparedFeatures: RDD[PreparedFeatures] = - data.rdd.map(PreparedFeatures(_, responses, predictors)) + val preparedFeatures: RDD[PreparedFeatures] = data.rdd.map(PreparedFeatures(_, responses, predictors)) + + implicit val sgTuple2Maps = new Tuple2Semigroup[Map[FeatureKey, Summary], Map[FeatureKey, Summary]]() // Have to use the training summaries do process scoring for comparison val (responseSummaries, predictorSummaries): (Map[FeatureKey, Summary], Map[FeatureKey, Summary]) = allFeatureInfo.map(info => info.responseSummaries -> info.predictorSummaries) .getOrElse(preparedFeatures.map(_.summaries).reduce(_ + _)) val (responseSummariesArr, predictorSummariesArr): (Array[(FeatureKey, Summary)], Array[(FeatureKey, Summary)]) = (responseSummaries.toArray, predictorSummaries.toArray) + + implicit val sgTuple2Feats = new Tuple2Semigroup[Array[FeatureDistribution], Array[FeatureDistribution]]() val (responseDistributions, predictorDistributions): (Array[FeatureDistribution], Array[FeatureDistribution]) = preparedFeatures .map(_.getFeatureDistributions( responseSummaries = responseSummariesArr, predictorSummaries = predictorSummariesArr, bins = bins, - hasher = hasher)) - .reduce(_ + _) // NOTE: resolved semigroup is IndexedSeqSemigroup + hasher = hasher) + ).reduce(_ + _) val correlationInfo: Map[FeatureKey, Map[FeatureKey, Double]] = allFeatureInfo.map(_.correlationInfo).getOrElse { val emptyCorr: Map[FeatureKey, Map[FeatureKey, Double]] = Map() diff --git a/core/src/main/scala/com/salesforce/op/filters/Summary.scala b/core/src/main/scala/com/salesforce/op/filters/Summary.scala index f623a75edc..0d0a259c10 100644 --- a/core/src/main/scala/com/salesforce/op/filters/Summary.scala +++ b/core/src/main/scala/com/salesforce/op/filters/Summary.scala @@ -45,7 +45,7 @@ private[op] case object Summary { val empty: Summary = Summary(Double.PositiveInfinity, Double.NegativeInfinity) implicit val monoid: Monoid[Summary] = new Monoid[Summary] { - override def zero = empty + override def zero = Summary.empty override def plus(l: Summary, r: Summary) = Summary(math.min(l.min, r.min), math.max(l.max, r.max)) } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala index 3b54cf9351..f0c33ad08a 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala @@ -195,8 +195,8 @@ class OpLogisticRegression(uid: String = UID[OpLogisticRegression]) class OpLogisticRegressionModel ( sparkModel: LogisticRegressionModel, - operationName: String = classOf[LogisticRegression].getSimpleName, - uid: String = UID[OpLogisticRegressionModel] + uid: String = UID[OpLogisticRegressionModel], + operationName: String = classOf[LogisticRegression].getSimpleName )( implicit tti1: TypeTag[RealNN], tti2: TypeTag[OPVector], @@ -210,4 +210,3 @@ class OpLogisticRegressionModel @transient lazy val probability2predictionMirror = reflectMethod(getSparkMlStage().get, "probability2prediction") } - diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala new file mode 100644 index 0000000000..6829b48c0d --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait} +import ml.dmlc.xgboost4j.scala.spark.{OpXGBoostClassifierParams, TrackerConf, XGBoostClassificationModel, XGBoostClassifier} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper around XGBoost classifier [[XGBoostClassifier]] + */ +class OpXGBoostClassifier(uid: String = UID[OpXGBoostClassifier]) + extends OpPredictorWrapper[XGBoostClassifier, XGBoostClassificationModel]( + predictor = new XGBoostClassifier(), + uid = uid + ) with OpXGBoostClassifierParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + def setWeightCol(value: String): this.type = set(weightCol, value) + + def setBaseMarginCol(value: String): this.type = set(baseMarginCol, value) + + def setNumClass(value: Int): this.type = set(numClass, value) + + // setters for general params + def setTrackerConf(value: TrackerConf): this.type = set(trackerConf, value) + + def setNumRound(value: Int): this.type = set(numRound, value) + + def setNumWorkers(value: Int): this.type = set(numWorkers, value) + + def setNthread(value: Int): this.type = set(nthread, value) + + def setUseExternalMemory(value: Boolean): this.type = set(useExternalMemory, value) + + def setSilent(value: Int): this.type = set(silent, value) + + def setMissing(value: Float): this.type = set(missing, value) + + def setTimeoutRequestWorkers(value: Long): this.type = set(timeoutRequestWorkers, value) + + def setCheckpointPath(value: String): this.type = set(checkpointPath, value) + + def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + def setSeed(value: Long): this.type = set(seed, value) + + def setEta(value: Double): this.type = set(eta, value) + + def setGamma(value: Double): this.type = set(gamma, value) + + def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + def setMinChildWeight(value: Double): this.type = set(minChildWeight, value) + + def setMaxDeltaStep(value: Double): this.type = set(maxDeltaStep, value) + + def setSubsample(value: Double): this.type = set(subsample, value) + + def setColsampleBytree(value: Double): this.type = set(colsampleBytree, value) + + def setColsampleBylevel(value: Double): this.type = set(colsampleBylevel, value) + + def setLambda(value: Double): this.type = set(lambda, value) + + def setAlpha(value: Double): this.type = set(alpha, value) + + def setTreeMethod(value: String): this.type = set(treeMethod, value) + + def setGrowPolicy(value: String): this.type = set(growPolicy, value) + + def setMaxBins(value: Int): this.type = set(maxBins, value) + + def setSketchEps(value: Double): this.type = set(sketchEps, value) + + def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value) + + def setSampleType(value: String): this.type = set(sampleType, value) + + def setNormalizeType(value: String): this.type = set(normalizeType, value) + + def setRateDrop(value: Double): this.type = set(rateDrop, value) + + def setSkipDrop(value: Double): this.type = set(skipDrop, value) + + def setLambdaBias(value: Double): this.type = set(lambdaBias, value) + + // setters for learning params + def setObjective(value: String): this.type = set(objective, value) + + def setBaseScore(value: Double): this.type = set(baseScore, value) + + def setEvalMetric(value: String): this.type = set(evalMetric, value) + + def setTrainTestRatio(value: Double): this.type = set(trainTestRatio, value) + + def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value) + + def setCustomObj(value: ObjectiveTrait): this.type = set(customObj, value) + + def setCustomEval(value: EvalTrait): this.type = set(customEval, value) + +} + + +/** + * Class that takes in a spark [[XGBoostClassificationModel]] and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param operationName unique name of the operation this stage performs + * @param uid uid to give stage + */ +class OpXGBoostClassificationModel +( + sparkModel: XGBoostClassificationModel, + uid: String = UID[OpXGBoostClassificationModel], + operationName: String = classOf[XGBoostClassifier].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[XGBoostClassificationModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala new file mode 100644 index 0000000000..2a701360cb --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait} +import ml.dmlc.xgboost4j.scala.spark.{OpXGBoostRegressorParams, TrackerConf, XGBoostRegressionModel, XGBoostRegressor} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper around XGBoost regressor [[XGBoostRegressor]] + */ +class OpXGBoostRegressor(uid: String = UID[OpXGBoostRegressor]) + extends OpPredictorWrapper[XGBoostRegressor, XGBoostRegressionModel]( + predictor = new XGBoostRegressor(), + uid = uid + ) with OpXGBoostRegressorParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + def setWeightCol(value: String): this.type = set(weightCol, value) + + def setBaseMarginCol(value: String): this.type = set(baseMarginCol, value) + + def setGroupCol(value: String): this.type = set(groupCol, value) + + // setters for general params + def setTrackerConf(value: TrackerConf): this.type = set(trackerConf, value) + + def setNumRound(value: Int): this.type = set(numRound, value) + + def setNumWorkers(value: Int): this.type = set(numWorkers, value) + + def setNthread(value: Int): this.type = set(nthread, value) + + def setUseExternalMemory(value: Boolean): this.type = set(useExternalMemory, value) + + def setSilent(value: Int): this.type = set(silent, value) + + def setMissing(value: Float): this.type = set(missing, value) + + def setTimeoutRequestWorkers(value: Long): this.type = set(timeoutRequestWorkers, value) + + def setCheckpointPath(value: String): this.type = set(checkpointPath, value) + + def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + def setSeed(value: Long): this.type = set(seed, value) + + def setEta(value: Double): this.type = set(eta, value) + + def setGamma(value: Double): this.type = set(gamma, value) + + def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + def setMinChildWeight(value: Double): this.type = set(minChildWeight, value) + + def setMaxDeltaStep(value: Double): this.type = set(maxDeltaStep, value) + + def setSubsample(value: Double): this.type = set(subsample, value) + + def setColsampleBytree(value: Double): this.type = set(colsampleBytree, value) + + def setColsampleBylevel(value: Double): this.type = set(colsampleBylevel, value) + + def setLambda(value: Double): this.type = set(lambda, value) + + def setAlpha(value: Double): this.type = set(alpha, value) + + def setTreeMethod(value: String): this.type = set(treeMethod, value) + + def setGrowPolicy(value: String): this.type = set(growPolicy, value) + + def setMaxBins(value: Int): this.type = set(maxBins, value) + + def setSketchEps(value: Double): this.type = set(sketchEps, value) + + def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value) + + def setSampleType(value: String): this.type = set(sampleType, value) + + def setNormalizeType(value: String): this.type = set(normalizeType, value) + + def setRateDrop(value: Double): this.type = set(rateDrop, value) + + def setSkipDrop(value: Double): this.type = set(skipDrop, value) + + def setLambdaBias(value: Double): this.type = set(lambdaBias, value) + + // setters for learning params + def setObjective(value: String): this.type = set(objective, value) + + def setBaseScore(value: Double): this.type = set(baseScore, value) + + def setEvalMetric(value: String): this.type = set(evalMetric, value) + + def setTrainTestRatio(value: Double): this.type = set(trainTestRatio, value) + + def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value) + + def setCustomObj(value: ObjectiveTrait): this.type = set(customObj, value) + + def setCustomEval(value: EvalTrait): this.type = set(customEval, value) + +} + + +/** + * Class that takes in a spark [[XGBoostRegressionModel]] and wraps it into an OP model which returns a + * Prediction feature + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpXGBoostRegressionModel +( + sparkModel: XGBoostRegressionModel, + uid: String = UID[OpXGBoostRegressionModel], + operationName: String = classOf[XGBoostRegressor].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[XGBoostRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") +} diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala index 200ebd26f7..2f6d4eee47 100644 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala @@ -34,6 +34,7 @@ import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.base.binary.OpTransformer2 import com.salesforce.op.stages.impl.classification._ import com.salesforce.op.stages.impl.regression._ +import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostRegressionModel} import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.regression._ @@ -94,6 +95,8 @@ object SparkModelConverter { case m: GBTRegressionModel => new OpGBTRegressionModel(m, uid = uid) case m: DecisionTreeRegressionModel => new OpDecisionTreeRegressionModel(m, uid = uid) case m: GeneralizedLinearRegressionModel => new OpGeneralizedLinearRegressionModel(m, uid = uid) + case m: XGBoostClassificationModel => new OpXGBoostClassificationModel(m, uid = uid) + case m: XGBoostRegressionModel => new OpXGBoostRegressionModel(m, uid = uid) case m => throw new RuntimeException(s"model conversion not implemented for model $m") } } diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala new file mode 100644 index 0000000000..b9d054c5d8 --- /dev/null +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package ml.dmlc.xgboost4j.scala.spark + + +trait OpXGBoostClassifierParams extends XGBoostClassifierParams + +trait OpXGBoostRegressorParams extends XGBoostRegressorParams diff --git a/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala b/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala index 55c6be135f..afd2233338 100644 --- a/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala @@ -30,18 +30,16 @@ package com.salesforce.op.filters -import scala.math.round - import com.salesforce.op.stages.impl.preparators.CorrelationType import com.salesforce.op.test.TestSparkContext import com.twitter.algebird.Monoid._ import com.twitter.algebird.Operators._ -import org.apache.spark.mllib.linalg.{Matrix, Vector} +import com.twitter.algebird.Tuple2Semigroup import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.rdd.RDD import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner +import com.salesforce.op.filters.Summary._ @RunWith(classOf[JUnitRunner]) class PreparedFeaturesTest extends FlatSpec with TestSparkContext { @@ -65,6 +63,7 @@ class PreparedFeaturesTest extends FlatSpec with TestSparkContext { responses = Map(responseKey2 -> Right(Seq(-0.5))), predictors = Map(predictorKey2A -> Left(Seq("iv")))) val allPreparedFeatures = Seq(preparedFeatures1, preparedFeatures2, preparedFeatures3) + implicit val sgTuple2 = new Tuple2Semigroup[Map[FeatureKey, Summary], Map[FeatureKey, Summary]]() val (allResponseSummaries, allPredictorSummaries) = allPreparedFeatures.map(_.summaries).reduce(_ + _) val allResponseKeys1 = Array(responseKey1, responseKey2) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala index 7d80f22ad6..dad43b8266 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala @@ -45,6 +45,8 @@ class OpDecisionTreeClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[DecisionTreeClassificationModel], OpPredictorWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]] with PredictionEquality { + override def specName: String = classOf[OpDecisionTreeClassifier].getSimpleName + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala index f531f4790e..d21fedb85c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala @@ -44,6 +44,8 @@ import org.scalatest.junit.JUnitRunner class OpGBTClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GBTClassificationModel], OpPredictorWrapper[GBTClassifier, GBTClassificationModel]] with PredictionEquality { + override def specName: String = classOf[OpGBTClassifier].getSimpleName + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala index 0724b62f73..ad8a690d7d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala @@ -44,6 +44,8 @@ import org.scalatest.junit.JUnitRunner class OpLinearSVCTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LinearSVCModel], OpPredictorWrapper[LinearSVC, LinearSVCModel]] with PredictionEquality { + override def specName: String = classOf[OpLinearSVC].getSimpleName + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala index 95e7375735..3e18333742 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala @@ -44,6 +44,8 @@ import org.scalatest.junit.JUnitRunner class OpLogisticRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LogisticRegressionModel], OpPredictorWrapper[LogisticRegression, LogisticRegressionModel]] with PredictionEquality { + override def specName: String = classOf[OpLogisticRegression].getSimpleName + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala index e932c7c023..f3e83f1e0c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala @@ -45,6 +45,8 @@ class OpMultilayerPerceptronClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[MultilayerPerceptronClassificationModel], OpPredictorWrapper[MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel]] with PredictionEquality { + override def specName: String = classOf[OpMultilayerPerceptronClassifier].getSimpleName + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala index aed8c4d3e8..6ec1312bc5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala @@ -44,6 +44,8 @@ import org.scalatest.junit.JUnitRunner class OpNaiveBayesTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[NaiveBayesModel], OpPredictorWrapper[NaiveBayes, NaiveBayesModel]] with PredictionEquality { + override def specName: String = classOf[OpNaiveBayes].getSimpleName + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala index b2b25b5816..fa02d5347f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala @@ -44,6 +44,8 @@ class OpRandomForestClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[RandomForestClassificationModel], OpPredictorWrapper[RandomForestClassifier, RandomForestClassificationModel]] with PredictionEquality { + override def specName: String = classOf[OpRandomForestClassifier].getSimpleName + lazy val (inputData, rawLabelMulti, featuresMulti) = TestFeatureBuilder[RealNN, OPVector]("labelMulti", "featuresMulti", Seq( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala new file mode 100644 index 0000000000..045b1d64ef --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import ml.dmlc.xgboost4j.scala.spark.{TrackerConf, XGBoost, XGBoostClassificationModel, XGBoostClassifier} +import org.apache.log4j.{Level, Logger} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpXGBoostClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[XGBoostClassificationModel], + OpPredictorWrapper[XGBoostClassifier, XGBoostClassificationModel]] with PredictionEquality { + + override def specName: String = classOf[OpXGBoostClassifier].getSimpleName + + val rawData = Seq( + 1.0 -> Vectors.dense(12.0, 4.3, 1.3), + 0.0 -> Vectors.dense(0.0, 0.3, 0.1), + 0.0 -> Vectors.dense(1.0, 3.9, 4.3), + 1.0 -> Vectors.dense(10.0, 1.3, 0.9), + 1.0 -> Vectors.dense(15.0, 4.7, 1.3), + 0.0 -> Vectors.dense(0.5, 0.9, 10.1), + 1.0 -> Vectors.dense(11.5, 2.3, 1.3), + 0.0 -> Vectors.dense(0.1, 3.3, 0.1) + ).map { case (l, v) => l.toRealNN -> v.toOPVector } + + val (inputData, label, features) = TestFeatureBuilder("label", "features", rawData) + + val estimator = new OpXGBoostClassifier().setInput(label.copy(isResponse = true), features) + estimator.setTrackerConf(TrackerConf(0, "scala")) + estimator.setSilent(1) + + val expectedResult = Seq( + Prediction(1.0), + Prediction(0.0), + Prediction(0.0), + Prediction(1.0), + Prediction(1.0), + Prediction(0.0), + Prediction(1.0), + Prediction(0.0) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator.setAlpha(0.872).setEta(0.99912) + estimator.fit(inputData) + estimator.predictor.getAlpha shouldBe 0.872 + estimator.predictor.getEta shouldBe 0.99912 + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala index 310d93baa0..1f1b20b926 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala @@ -44,6 +44,8 @@ class OpDecisionTreeRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[DecisionTreeRegressionModel], OpPredictorWrapper[DecisionTreeRegressor, DecisionTreeRegressionModel]] with PredictionEquality { + override def specName: String = classOf[OpDecisionTreeRegressor].getSimpleName + val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala index 3022def8d5..fe272c2551 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala @@ -43,6 +43,8 @@ import org.scalatest.junit.JUnitRunner class OpGBTRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GBTRegressionModel], OpPredictorWrapper[GBTRegressor, GBTRegressionModel]] with PredictionEquality { + override def specName: String = classOf[OpGBTRegressor].getSimpleName + val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala index 4d313a45ee..9041a95007 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala @@ -44,6 +44,8 @@ class OpGeneralizedLinearRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GeneralizedLinearRegressionModel], OpPredictorWrapper[GeneralizedLinearRegression, GeneralizedLinearRegressionModel]] with PredictionEquality { + override def specName: String = classOf[OpGeneralizedLinearRegression].getSimpleName + val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala index d21ca781dc..e672b66382 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala @@ -44,6 +44,8 @@ import org.scalatest.junit.JUnitRunner class OpLinearRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LinearRegressionModel], OpPredictorWrapper[LinearRegression, LinearRegressionModel]] with PredictionEquality { + override def specName: String = classOf[OpLinearRegression].getSimpleName + val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala index f989595c7c..e1fac3c60e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala @@ -44,6 +44,8 @@ class OpRandomForestRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[RandomForestRegressionModel], OpPredictorWrapper[RandomForestRegressor, RandomForestRegressionModel]] with PredictionEquality { + override def specName: String = classOf[OpRandomForestRegressor].getSimpleName + val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala new file mode 100644 index 0000000000..b42e8bc81c --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ +import ml.dmlc.xgboost4j.scala.spark.{TrackerConf, XGBoostRegressionModel, XGBoostRegressor} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpXGBoostRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[XGBoostRegressionModel], + OpPredictorWrapper[XGBoostRegressor, XGBoostRegressionModel]] with PredictionEquality { + + override def specName: String = classOf[OpXGBoostRegressor].getSimpleName + + val rawData = Seq( + (10.0, Vectors.dense(1.0, 4.3, 1.3)), + (20.0, Vectors.dense(2.0, 0.3, 0.1)), + (30.0, Vectors.dense(3.0, 3.9, 4.3)), + (40.0, Vectors.dense(4.0, 1.3, 0.9)), + (50.0, Vectors.dense(5.0, 4.7, 1.3)) + ).map { case (l, v) => l.toRealNN -> v.toOPVector } + + val (inputData, label, features) = TestFeatureBuilder("label", "features", rawData) + + val estimator = new OpXGBoostRegressor().setInput(label.copy(isResponse = true), features) + estimator.setTrackerConf(TrackerConf(0, "scala")) + estimator.setSilent(1) + + val expectedResult = Seq( + Prediction(1.9250000715255737), + Prediction(8.780000686645508), + Prediction(8.780000686645508), + Prediction(8.780000686645508), + Prediction(8.780000686645508) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator.setMaxDepth(18).setBaseScore(0.12345).setSkipDrop(0.6234) + estimator.fit(inputData) + estimator.predictor.getMaxDepth shouldBe 18 + estimator.predictor.getBaseScore shouldBe 0.12345 + estimator.predictor.getSkipDrop shouldBe 0.6234 + + } +} diff --git a/features/src/main/scala/com/salesforce/op/aggregators/ExtendedMultiset.scala b/features/src/main/scala/com/salesforce/op/aggregators/ExtendedMultiset.scala index 8718c94fb8..660fb74e23 100644 --- a/features/src/main/scala/com/salesforce/op/aggregators/ExtendedMultiset.scala +++ b/features/src/main/scala/com/salesforce/op/aggregators/ExtendedMultiset.scala @@ -41,6 +41,8 @@ import com.twitter.algebird._ * However, order does not matter, so {a, a, b} and {a, b, a} are the same multiset. */ trait ExtendedMultiset extends MapMonoid[String, Long] with Group[Map[String, Long]] { + override def negate(kv: Map[String, Long]): Map[String, Long] = kv.mapValues { v => -v } + override def minus(x: Map[String, Long], y: Map[String, Long]): Map[String, Long] = { val keys = x.keySet ++ y.keySet val kvPairs = keys map (k => k -> (x.getOrElse(k, 0L) - y.getOrElse(k, 0L))) filter (_._2 != 0L) diff --git a/utils/build.gradle b/utils/build.gradle index 5f12c8accc..f18024ec72 100644 --- a/utils/build.gradle +++ b/utils/build.gradle @@ -16,8 +16,8 @@ dependencies { compile "com.twitter:algebird-core_$scalaVersion:$algebirdVersion" // Twitter Chill - compile ("com.twitter:chill-avro_$scalaVersion:$chillAvroVersion") { exclude group: "org.apache.avro", module: "avro" } - compile "com.twitter:chill-algebird_$scalaVersion:$chillAvroVersion" + compile ("com.twitter:chill-avro_$scalaVersion:$chillVersion") { exclude group: "org.apache.avro", module: "avro" } + compile "com.twitter:chill-algebird_$scalaVersion:$chillVersion" // Lucene - (geo location) compile "org.apache.lucene:lucene-spatial3d:$luceneVersion" From e46a152b317c36fbcf0f334a6732f6b4e4db5001 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 8 Aug 2018 09:34:02 -0700 Subject: [PATCH 02/40] fix property name --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 16a5e7f158..aeed3d6102 100644 --- a/build.gradle +++ b/build.gradle @@ -78,7 +78,7 @@ configure(allProjs) { googleLibPhoneNumberVersion = '8.8.5' googleGeoCoderVersion = '2.82' googleCarrierVersion = '1.72' - chillAvroVersion = '0.8.4' + chillVersion = '0.8.4' reflectionsVersion = '0.9.11' collectionsVersion = '3.2.2' optimaizeLangDetectorVersion = '0.7.1' From 533aa3acccd96f9f9bf4b91de95655a32770e3d9 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 8 Aug 2018 09:39:40 -0700 Subject: [PATCH 03/40] Minor updates --- .../classification/OpXGBoostClassifier.scala | 33 +++++++++---------- .../impl/regression/OpXGBoostRegressor.scala | 33 +++++++++---------- .../xgboost4j/scala/spark/XGBoostParams.scala | 33 +++++++++---------- .../OpXGBoostClassifierTest.scala | 33 +++++++++---------- .../regression/OpXGBoostRegressorTest.scala | 33 +++++++++---------- .../op/utils/spark/OpSparkListener.scala | 4 +-- 6 files changed, 82 insertions(+), 87 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala index 6829b48c0d..6cb8f46ef3 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -5,28 +5,27 @@ * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package com.salesforce.op.stages.impl.classification diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala index 2a701360cb..617e84ae44 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala @@ -5,28 +5,27 @@ * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package com.salesforce.op.stages.impl.regression diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala index b9d054c5d8..4c4de2f1ba 100644 --- a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -5,28 +5,27 @@ * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package ml.dmlc.xgboost4j.scala.spark diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala index 045b1d64ef..2db4ded82a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala @@ -5,28 +5,27 @@ * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package com.salesforce.op.stages.impl.classification diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala index b42e8bc81c..a3e8496b97 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala @@ -5,28 +5,27 @@ * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package com.salesforce.op.stages.impl.regression diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/OpSparkListener.scala b/utils/src/main/scala/com/salesforce/op/utils/spark/OpSparkListener.scala index 38404b87e5..2e3278af6c 100644 --- a/utils/src/main/scala/com/salesforce/op/utils/spark/OpSparkListener.scala +++ b/utils/src/main/scala/com/salesforce/op/utils/spark/OpSparkListener.scala @@ -211,7 +211,7 @@ object StageMetrics { def toMillis(ns: Long): Long = ns / 1000000 // some time values are in nanoseconds so we convert those StageMetrics( stageId = si.stageId, - attemptId = si.attemptId, + attemptId = si.attemptNumber, name = si.name, numTasks = si.numTasks, parentIds = si.parentIds, @@ -221,7 +221,7 @@ object StageMetrics { else if (si.completionTime.isDefined) "succeeded" else "running" }, - // TODO: consider also collection all the accumilables - might be costly + // TODO: consider also collecting all the accumilables - might be costly numAccumulables = si.accumulables.size, failureReason = si.failureReason, submissionTime = si.submissionTime, From 4877aa3d124765b20a02a85ab273a7f30246bab8 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 8 Aug 2018 09:56:50 -0700 Subject: [PATCH 04/40] added maven repo --- core/build.gradle | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/build.gradle b/core/build.gradle index 9a4baa3a62..4ea2fbb582 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -1,3 +1,8 @@ +repositories { + // Needed for XGboost + maven { url 'https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/' } +} + dependencies { compile project(':readers') testRuntime project(':models') From 3a023bd57721e5ed68fb89876a673d6918a7d29d Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 8 Aug 2018 10:55:09 -0700 Subject: [PATCH 05/40] move repo to build.gradle --- build.gradle | 3 +++ core/build.gradle | 5 ----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/build.gradle b/build.gradle index aeed3d6102..499f440a05 100644 --- a/build.gradle +++ b/build.gradle @@ -21,6 +21,9 @@ allprojects { repositories { mavenCentral() maven { url 'https://jitpack.io' } + // Needed for XGboost + // TODO: remove this repo once XGBoost is published into Maven Central or similar + maven { url 'https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/' } } } diff --git a/core/build.gradle b/core/build.gradle index 4ea2fbb582..9a4baa3a62 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -1,8 +1,3 @@ -repositories { - // Needed for XGboost - maven { url 'https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/' } -} - dependencies { compile project(':readers') testRuntime project(':models') From 26c1d897c4c82099ffdf7d2fe143beee45025775 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 8 Aug 2018 21:28:04 -0700 Subject: [PATCH 06/40] quite logging in tests --- core/build.gradle | 2 ++ .../xgboost4j/scala/spark/XGBoostParams.scala | 18 ++++++++++++++++-- core/src/test/resources/application.conf | 4 ++++ .../OpXGBoostClassifierTest.scala | 7 +++---- .../regression/OpXGBoostRegressorTest.scala | 7 ++++--- 5 files changed, 29 insertions(+), 9 deletions(-) create mode 100644 core/src/test/resources/application.conf diff --git a/core/build.gradle b/core/build.gradle index 9a4baa3a62..248a70433c 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -25,4 +25,6 @@ dependencies { // XGBoost compile "ml.dmlc:xgboost4j-spark:0.80-SNAPSHOT" + // Akka slfj4 logging (version matches XGBoost dependency) + testCompile "com.typesafe.akka:akka-slf4j_$scalaVersion:2.3.11" } diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala index 4c4de2f1ba..2d21681301 100644 --- a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -30,7 +30,21 @@ package ml.dmlc.xgboost4j.scala.spark +import ml.dmlc.xgboost4j.scala.spark.params.GeneralParams +import org.apache.log4j.{Level, Logger} -trait OpXGBoostClassifierParams extends XGBoostClassifierParams -trait OpXGBoostRegressorParams extends XGBoostRegressorParams +trait OpXGBoostClassifierParams extends XGBoostClassifierParams with OpXGBoostGeneralParamsDefaults + +trait OpXGBoostRegressorParams extends XGBoostRegressorParams with OpXGBoostGeneralParamsDefaults + +trait OpXGBoostGeneralParamsDefaults { + self: GeneralParams => + setDefault(trackerConf -> TrackerConf(0L, "scala")) +} + +trait OpXGBoostQuietLogging { + Logger.getLogger("akka").setLevel(Level.WARN) + Logger.getLogger("XGBoostSpark").setLevel(Level.WARN) + Logger.getLogger(classOf[XGBoostRegressor]).setLevel(Level.WARN) +} diff --git a/core/src/test/resources/application.conf b/core/src/test/resources/application.conf new file mode 100644 index 0000000000..3f09796cad --- /dev/null +++ b/core/src/test/resources/application.conf @@ -0,0 +1,4 @@ +akka { + loggers = ["akka.event.slf4j.Slf4jLogger"] + loglevel = "WARNING" +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala index 2db4ded82a..71d2240a52 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala @@ -34,8 +34,7 @@ import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.PredictionEquality import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} -import ml.dmlc.xgboost4j.scala.spark.{TrackerConf, XGBoost, XGBoostClassificationModel, XGBoostClassifier} -import org.apache.log4j.{Level, Logger} +import ml.dmlc.xgboost4j.scala.spark.{OpXGBoostQuietLogging, XGBoostClassificationModel, XGBoostClassifier} import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @@ -43,7 +42,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class OpXGBoostClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[XGBoostClassificationModel], - OpPredictorWrapper[XGBoostClassifier, XGBoostClassificationModel]] with PredictionEquality { + OpPredictorWrapper[XGBoostClassifier, XGBoostClassificationModel]] + with PredictionEquality with OpXGBoostQuietLogging { override def specName: String = classOf[OpXGBoostClassifier].getSimpleName @@ -61,7 +61,6 @@ class OpXGBoostClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWra val (inputData, label, features) = TestFeatureBuilder("label", "features", rawData) val estimator = new OpXGBoostClassifier().setInput(label.copy(isResponse = true), features) - estimator.setTrackerConf(TrackerConf(0, "scala")) estimator.setSilent(1) val expectedResult = Seq( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala index a3e8496b97..59e640a22a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala @@ -34,14 +34,16 @@ import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.PredictionEquality import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} import com.salesforce.op.test._ -import ml.dmlc.xgboost4j.scala.spark.{TrackerConf, XGBoostRegressionModel, XGBoostRegressor} +import ml.dmlc.xgboost4j.scala.spark.{OpXGBoostQuietLogging, XGBoostRegressionModel, XGBoostRegressor} import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner + @RunWith(classOf[JUnitRunner]) class OpXGBoostRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[XGBoostRegressionModel], - OpPredictorWrapper[XGBoostRegressor, XGBoostRegressionModel]] with PredictionEquality { + OpPredictorWrapper[XGBoostRegressor, XGBoostRegressionModel]] + with PredictionEquality with OpXGBoostQuietLogging { override def specName: String = classOf[OpXGBoostRegressor].getSimpleName @@ -56,7 +58,6 @@ class OpXGBoostRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrap val (inputData, label, features) = TestFeatureBuilder("label", "features", rawData) val estimator = new OpXGBoostRegressor().setInput(label.copy(isResponse = true), features) - estimator.setTrackerConf(TrackerConf(0, "scala")) estimator.setSilent(1) val expectedResult = Seq( From b355d4c3edf6521b89ea9427fa9002cff709883f Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 8 Aug 2018 22:16:39 -0700 Subject: [PATCH 07/40] update some tests --- .../OpMultilayerPerceptronClassifier.scala | 10 ++++++---- .../xgboost4j/scala/spark/XGBoostParams.scala | 6 +++++- .../classification/OpClassifierModelTest.scala | 13 ++++++++++++- .../impl/regression/OpRegressionModelTest.scala | 17 ++++++++++++----- 4 files changed, 35 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala index f5a85911cb..43a26a8ec1 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala @@ -33,7 +33,7 @@ package com.salesforce.op.stages.impl.classification import com.salesforce.op.UID import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier, OpMultilayerPerceptronClassifierParams} import org.apache.spark.ml.linalg.Vector @@ -128,7 +128,6 @@ class OpMultilayerPerceptronClassifier(uid: String = UID[OpMultilayerPerceptronC * @param uid uid to give stage * @param operationName unique name of the operation this stage performs */ -// TODO in next release of spark this will be probablistic classifier class OpMultilayerPerceptronClassificationModel ( sparkModel: MultilayerPerceptronClassificationModel, @@ -139,9 +138,12 @@ class OpMultilayerPerceptronClassificationModel tti2: TypeTag[OPVector], tto: TypeTag[Prediction], ttov: TypeTag[Prediction#Value] -) extends OpPredictionModel[MultilayerPerceptronClassificationModel]( +) extends OpProbabilisticClassifierModel[MultilayerPerceptronClassificationModel]( sparkModel = sparkModel, uid = uid, operationName = operationName ) { - @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") } diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala index 2d21681301..88c9b41ace 100644 --- a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -40,7 +40,7 @@ trait OpXGBoostRegressorParams extends XGBoostRegressorParams with OpXGBoostGene trait OpXGBoostGeneralParamsDefaults { self: GeneralParams => - setDefault(trackerConf -> TrackerConf(0L, "scala")) + setDefault(trackerConf -> OpXGBoost.DefaultTrackerConf) } trait OpXGBoostQuietLogging { @@ -48,3 +48,7 @@ trait OpXGBoostQuietLogging { Logger.getLogger("XGBoostSpark").setLevel(Level.WARN) Logger.getLogger(classOf[XGBoostRegressor]).setLevel(Level.WARN) } + +case object OpXGBoost { + val DefaultTrackerConf = TrackerConf(workerConnectionTimeout = 0L, "scala") +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala index f010566640..3dd7706f69 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala @@ -34,6 +34,7 @@ import com.salesforce.op.features.types.{Prediction, RealNN} import com.salesforce.op.stages.sparkwrappers.specific.SparkModelConverter._ import com.salesforce.op.test._ import com.salesforce.op.testkit._ +import ml.dmlc.xgboost4j.scala.spark.{OpXGBoost, OpXGBoostQuietLogging, XGBoostClassifier} import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.DataFrame @@ -43,7 +44,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class OpClassifierModelTest extends FlatSpec with TestSparkContext { +class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoostQuietLogging { private val label = RandomIntegral.integrals(0, 2).limit(1000) .map{ v => RealNN(v.value.map(_.toDouble).getOrElse(0.0)) } @@ -133,6 +134,16 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext { compareOutputsPred(spk.transform(rawDF), op.transform(rawDF), 2) } + Spec[OpXGBoostClassifier] should "produce the same values as the spark version" in { + val cl = new XGBoostClassifier() + cl.set(cl.trackerConf, OpXGBoost.DefaultTrackerConf) + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + val spk = cl.fit(rawDF) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) + compareOutputsPred(spk.transform(rawDF), op.transform(rawDF), 2) + } + def compareOutputs(df1: DataFrame, df2: DataFrame): Unit = { def keysStartsWith(name: String, value: Map[String, Double]): Array[Double] = { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala index 79c0b6be0a..ddb80c543e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala @@ -34,6 +34,7 @@ import com.salesforce.op.features.types.{Prediction, RealNN} import com.salesforce.op.stages.sparkwrappers.specific.SparkModelConverter.toOP import com.salesforce.op.test._ import com.salesforce.op.testkit._ +import ml.dmlc.xgboost4j.scala.spark.{OpXGBoost, OpXGBoostQuietLogging, XGBoostRegressor} import org.apache.spark.ml.regression._ import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith @@ -41,7 +42,7 @@ import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class OpRegressionModelTest extends FlatSpec with TestSparkContext { +class OpRegressionModelTest extends FlatSpec with TestSparkContext with OpXGBoostQuietLogging { private val label = RandomIntegral.integrals(0, 2).limit(1000) .map{ v => RealNN(v.value.map(_.toDouble).getOrElse(0.0)) } @@ -58,7 +59,6 @@ class OpRegressionModelTest extends FlatSpec with TestSparkContext { .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -70,7 +70,6 @@ class OpRegressionModelTest extends FlatSpec with TestSparkContext { .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -81,7 +80,6 @@ class OpRegressionModelTest extends FlatSpec with TestSparkContext { .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -92,7 +90,6 @@ class OpRegressionModelTest extends FlatSpec with TestSparkContext { .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -103,7 +100,17 @@ class OpRegressionModelTest extends FlatSpec with TestSparkContext { .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) + compareOutputs(spk.transform(rawDF), op.transform(rawDF)) + } + Spec[OpXGBoostRegressionModel] should "produce the same values as the spark version" in { + val reg = new XGBoostRegressor() + reg.set(reg.trackerConf, OpXGBoost.DefaultTrackerConf) + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + val spk = reg.fit(rawDF) + + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } From 9f7bf85ee86ab76f3ecdfaa598d9089be288b80a Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Wed, 8 Aug 2018 22:28:42 -0700 Subject: [PATCH 08/40] debug stuff --- .../classification/OpClassifierModelTest.scala | 14 +++++--------- .../impl/regression/OpRegressionModelTest.scala | 3 ++- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala index 3dd7706f69..fbcfd5e5cf 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala @@ -52,8 +52,7 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos private val data = label.zip(fv) - private val (rawDF, labelF, featureV) = - TestFeatureBuilder("label", "features", data) + private val (rawDF, labelF, featureV) = TestFeatureBuilder("label", "features", data) Spec[OpDecisionTreeClassificationModel] should "produce the same values as the spark version" in { @@ -63,7 +62,6 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -76,7 +74,6 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -88,7 +85,6 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos .fit(rawDF) val op = toOP(spk, uid = spk.uid).setInput(labelF, featureV) - compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -99,7 +95,6 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -119,7 +114,6 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos .setLabelCol(labelF.name) .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputsPred(spk.transform(rawDF), op.transform(rawDF), 3) } @@ -131,7 +125,7 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos .setLabelCol(labelF.name) .fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputsPred(spk.transform(rawDF), op.transform(rawDF), 2) + compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } Spec[OpXGBoostClassifier] should "produce the same values as the spark version" in { @@ -141,7 +135,7 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos .setLabelCol(labelF.name) val spk = cl.fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) - compareOutputsPred(spk.transform(rawDF), op.transform(rawDF), 2) + compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } def compareOutputs(df1: DataFrame, df2: DataFrame): Unit = { @@ -150,6 +144,8 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos val names = value.keys.filter(_.startsWith(name)).toArray.sorted names.map(value) } + df1.show() + df2.show() val sorted1 = df1.collect().sortBy(_.getAs[Double](4)) val sorted2 = df2.collect().sortBy(_.getAs[Map[String, Double]](2)(Prediction.Keys.PredictionName)) sorted1.zip(sorted2).foreach{ case (r1, r2) => diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala index ddb80c543e..53f6acfffd 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala @@ -62,7 +62,6 @@ class OpRegressionModelTest extends FlatSpec with TestSparkContext with OpXGBoos compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } - Spec[OpLinearRegressionModel] should "produce the same values as the spark version" in { val spk = new LinearRegression() .setFeaturesCol(featureV.name) @@ -115,6 +114,8 @@ class OpRegressionModelTest extends FlatSpec with TestSparkContext with OpXGBoos } def compareOutputs(df1: DataFrame, df2: DataFrame): Unit = { + df1.show() + df2.show() val sorted1 = df1.collect().sortBy(_.getAs[Double](2)) val sorted2 = df2.collect().sortBy(_.getAs[Map[String, Double]](2)(Prediction.Keys.PredictionName)) sorted1.zip(sorted2).foreach{ case (r1, r2) => From 75bec3f8196028de0866cb476c084c6c08b8061c Mon Sep 17 00:00:00 2001 From: Matthew Date: Thu, 9 Aug 2018 13:54:43 -0700 Subject: [PATCH 09/40] remove line --- .../op/stages/impl/classification/OpClassifierModelTest.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala index fbcfd5e5cf..ed854a8b0e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala @@ -65,7 +65,6 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } - Spec[OpLogisticRegressionModel] should "produce the same values as the spark version" in { val spk = new LogisticRegression() .setFamily("multinomial") From 565868ea4a887b3239c628bbc90a11cce7ec2816 Mon Sep 17 00:00:00 2001 From: Matthew Date: Thu, 9 Aug 2018 15:08:00 -0700 Subject: [PATCH 10/40] Fix GeneralizedLinearRegression --- .../OpGeneralizedLinearRegression.scala | 9 ++-- .../regression/OpRegressionModelTest.scala | 2 - .../op/utils/reflection/ReflectionUtils.scala | 46 ++++++++++++------- .../reflection/ReflectionUtilsTest.scala | 27 +++++++++++ 4 files changed, 61 insertions(+), 23 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala index 1ad09555ad..9d64f62560 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala @@ -184,16 +184,15 @@ class OpGeneralizedLinearRegressionModel sparkModel = sparkModel) { @transient lazy private val predictLink = reflectMethod(getSparkMlStage().get, "predictLink") - @transient lazy private val predict = reflectMethod(getSparkMlStage().get, "predict") + @transient lazy private val predict = reflectMethod(getSparkMlStage().get, "predict", argsCount = Some(2)) /** * Function used to convert input to output */ override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { - val raw = predictLink.apply(features.value).asInstanceOf[Double] - val pred = predict.apply(features.value).asInstanceOf[Double] + val offset = 0.0 + val raw = predictLink.apply(features.value, offset).asInstanceOf[Double] + val pred = predict.apply(features.value, offset).asInstanceOf[Double] Prediction(prediction = pred, rawPrediction = raw) } } - - diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala index 53f6acfffd..e3c0fc0703 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala @@ -114,8 +114,6 @@ class OpRegressionModelTest extends FlatSpec with TestSparkContext with OpXGBoos } def compareOutputs(df1: DataFrame, df2: DataFrame): Unit = { - df1.show() - df2.show() val sorted1 = df1.collect().sortBy(_.getAs[Double](2)) val sorted2 = df2.collect().sortBy(_.getAs[Map[String, Double]](2)(Prediction.Keys.PredictionName)) sorted1.zip(sorted2).foreach{ case (r1, r2) => diff --git a/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala b/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala index 0f8645c633..d317e35ef3 100644 --- a/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala +++ b/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala @@ -136,43 +136,57 @@ object ReflectionUtils { /** * Find setter methods for the provided method name - * @param instance class to find method for - * @param setterName name of method to find - * @param classLoader class loader to use - * @tparam T type of instance to copy - * @return reflected method to set type + * @param instance class to find method for + * @param setterName name of method to find + * @param args argument values + * @param argsCount optional number of arguments to match + * @param classLoader class loader to use + * @tparam T type of instance to copy + * @return reflected method to set type */ def reflectSetterMethod[T: ClassTag]( instance: T, setterName: String, - inputs: Seq[Any], + args: Seq[Any], + argsCount: Option[Int] = None, classLoader: ClassLoader = defaultClassLoader ): Any = { - reflectMethod(instance, s"set$setterName", classLoader).apply(inputs: _*) + reflectMethod(instance, s"set$setterName", argsCount, classLoader).apply(args: _*) } /** * Find setter methods for the provided method name - * @param instance class to find method for - * @param methodName name of method to find - * @param classLoader class loader to use - * @tparam T type of instance to copy - * @return reflected method to set type + * @param instance class to find method for + * @param methodName name of method to find + * @param argsCount optional number of arguments to match + * @param classLoader class loader to use + * @tparam T type of instance to copy + * @return reflected method to set type */ def reflectMethod[T: ClassTag]( instance: T, methodName: String, + argsCount: Option[Int] = None, classLoader: ClassLoader = defaultClassLoader ): MethodMirror = { val klazz = instance.getClass val (runtimeMirror, classMirror) = mirrors(klazz, classLoader) val classType = runtimeMirror.classSymbol(klazz).toType val tMembers = classType.members - val methods = tMembers.collect { case m: MethodSymbol if m.isMethod && - termNameStr(m.name).compareToIgnoreCase(methodName) == 0 => m + val methodsWithParams = tMembers.collect { case m: MethodSymbol => m -> m.paramLists.flatten } + val methods = methodsWithParams.collect { + case (m: MethodSymbol, params) if m.isMethod && + termNameStr(m.name).compareToIgnoreCase(methodName) == 0 && + (argsCount.isEmpty || argsCount.contains(params.length)) => m -> params + }.toList.sortBy(-_._2.length).map(_._1) + + methods match { + case method :: _ => + val instanceMirror = runtimeMirror.reflect(instance) + instanceMirror.reflectMethod(method) + case Nil => + throw new RuntimeException(s"Method with name '$methodName' was not found on instance of type: $klazz") } - val instanceMirror = runtimeMirror.reflect(instance) - instanceMirror.reflectMethod(methods.head) } /** diff --git a/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala b/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala index 0ba3f955dd..5bd6c1ef53 100644 --- a/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala +++ b/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala @@ -80,6 +80,10 @@ class TestClassVar { } private def getValue: Int = 2 def getValuePerf: Int = 2 + + def boo(x: Int, y: Int): Int = boo(x + y) + def boo(x: Int): Int = x + def boo(): Int = boo(1) } @RunWith(classOf[JUnitRunner]) @@ -226,5 +230,28 @@ class ReflectionUtilsTest extends FlatSpec with Matchers { elapsedReflect should be <= 10 * actual } + it should "error on reflecting a non existent method" in { + val myClass = new TestClassVar() + val err = intercept[RuntimeException](ReflectionUtils.reflectMethod(myClass, "non_existent")) + err.getMessage shouldBe + s"Method with name 'non_existent' was not found on instance of type: ${myClass.getClass}" + } + + it should "reflect methods with largest number of arguments by default" in { + val myClass = new TestClassVar() + val boo = ReflectionUtils.reflectMethod(myClass, "boo", argsCount = None) + boo(2, 3) shouldBe 5 + } + + it should "reflect methods with various number of arguments" in { + val myClass = new TestClassVar() + val boo = ReflectionUtils.reflectMethod(myClass, "boo", argsCount = Some(0)) + val boo1 = ReflectionUtils.reflectMethod(myClass, "boo", argsCount = Some(1)) + val boo2 = ReflectionUtils.reflectMethod(myClass, "boo", argsCount = Some(2)) + boo() shouldBe 1 + boo1(2) shouldBe 2 + boo2(2, 3) shouldBe 5 + } + } From de7969dcd585122cfd9a0e752838435dabb3c489 Mon Sep 17 00:00:00 2001 From: Matthew Date: Thu, 9 Aug 2018 17:15:51 -0700 Subject: [PATCH 11/40] Make xgboost work --- .../classification/OpXGBoostClassifier.scala | 42 ++++++++++++++++--- .../xgboost4j/scala/spark/XGBoostParams.scala | 29 +++++++++++++ .../com/salesforce/op/ModelInsightsTest.scala | 2 +- .../OpClassifierModelTest.scala | 24 ++++++++--- 4 files changed, 85 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala index 6cb8f46ef3..111c3d2a3c 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -33,12 +33,13 @@ package com.salesforce.op.stages.impl.classification import com.salesforce.op.UID import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod -import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait} -import ml.dmlc.xgboost4j.scala.spark.{OpXGBoostClassifierParams, TrackerConf, XGBoostClassificationModel, XGBoostClassifier} +import ml.dmlc.xgboost4j.scala.spark._ +import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait, ObjectiveTrait} +import org.apache.spark.ml.linalg.Vectors -import scala.reflect.runtime.universe.TypeTag +import scala.reflect.runtime.universe._ /** * Wrapper around XGBoost classifier [[XGBoostClassifier]] @@ -159,8 +160,37 @@ class OpXGBoostClassificationModel tti2: TypeTag[OPVector], tto: TypeTag[Prediction], ttov: TypeTag[Prediction#Value] -) extends OpPredictionModel[XGBoostClassificationModel]( +) extends OpProbabilisticClassifierModel[XGBoostClassificationModel]( sparkModel = sparkModel, uid = uid, operationName = operationName ) { - @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") + import OpXGBoost._ + + protected def predictRawMirror: MethodMirror = throw new NotImplementedError() + protected def raw2probabilityMirror: MethodMirror = throw new NotImplementedError() + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") + + @transient lazy val model: XGBoostClassificationModel = getSparkMlStage().get + @transient lazy val booster: Booster = model.nativeBooster + + override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { + // val appName = dataset.sparkSession.sparkContext.appName +// val cacheInfo = { +// if (model.getUseExternalMemory) { +// s"$appName-${TaskContext.get().stageId()}-dtest_cache-${TaskContext.getPartitionId()}" +// } else { +// null +// } +// } + booster.getFeatureScore() + val cacheInfo = null + val data = OpXGBoost.removeMissingValues(Iterator(features.value.asXGBLabeledPoint), model.getMissing) + val dm = new DMatrix(dataIter = data, cacheInfo = cacheInfo) + val treeLimit: Int = 0 // TODO: instead use model.getTreeLimit once available + val rawPred = booster.predict(dm, outPutMargin = true, treeLimit = treeLimit)(0).map(_.toDouble) + val prob = booster.predict(dm, outPutMargin = false, treeLimit = treeLimit)(0).map(_.toDouble) + val probability = if (model.numClasses == 2) Array(1.0 - prob(0), prob(0)) else prob + val prediction = probability2predictionMirror(Vectors.dense(probability)).asInstanceOf[Double] + Prediction(prediction = prediction, rawPrediction = rawPred, probability = probability) + } } diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala index 88c9b41ace..5ff1f5a685 100644 --- a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -30,8 +30,14 @@ package ml.dmlc.xgboost4j.scala.spark +import ml.dmlc.xgboost4j.LabeledPoint +import ml.dmlc.xgboost4j.scala.spark.DataUtils.MLVectorToXGBLabeledPoint import ml.dmlc.xgboost4j.scala.spark.params.GeneralParams import org.apache.log4j.{Level, Logger} +import org.apache.spark.ml.linalg.Vector + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer trait OpXGBoostClassifierParams extends XGBoostClassifierParams with OpXGBoostGeneralParamsDefaults @@ -51,4 +57,27 @@ trait OpXGBoostQuietLogging { case object OpXGBoost { val DefaultTrackerConf = TrackerConf(workerConnectionTimeout = 0L, "scala") + + implicit class RichMLVectorToXGBLabeledPoint(val v: Vector) extends AnyVal { + def asXGBLabeledPoint: LabeledPoint = MLVectorToXGBLabeledPoint(v).asXGB + } + + /** + * Copied from [[ml.dmlc.xgboost4j.scala.spark.XGBoost.removeMissingValues]] private method + */ + def removeMissingValues(xgbLabelPoints: Iterator[LabeledPoint], missing: Float): Iterator[LabeledPoint] = { + if (!missing.isNaN) { + xgbLabelPoints.map { labeledPoint => + val indices = new ArrayBuffer[Int]() + val values = new ArrayBuffer[Float]() + for ((value, i) <- labeledPoint.values.zipWithIndex if value != missing) { + indices += (if (labeledPoint.indices == null) i else labeledPoint.indices(i)) + values += value + } + labeledPoint.copy(indices = indices.toArray, values = values.toArray) + } + } else { + xgbLabelPoints + } + } } diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 17578c2e5a..f6776468b7 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -63,7 +63,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest { implicit val doubleOptEquality = new Equality[Option[Double]] { def areEqual(a: Option[Double], b: Any): Boolean = b match { case None => a.isEmpty - case s: Option[Double] => (a.exists(_.isNaN) && s.exists(_.isNaN)) || + case s: Option[Double]@unchecked => (a.exists(_.isNaN) && s.exists(_.isNaN)) || (a.nonEmpty && a.toSeq.zip(s.toSeq).forall{ case (n, m) => n == m }) case _ => false } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala index ed854a8b0e..73585a6c57 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala @@ -39,6 +39,7 @@ import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith +import org.scalactic.Equality import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @@ -54,7 +55,6 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos private val (rawDF, labelF, featureV) = TestFeatureBuilder("label", "features", data) - Spec[OpDecisionTreeClassificationModel] should "produce the same values as the spark version" in { val spk = new DecisionTreeClassifier() .setFeaturesCol(featureV.name) @@ -134,17 +134,30 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos .setLabelCol(labelF.name) val spk = cl.fit(rawDF) val op = toOP(spk, spk.uid).setInput(labelF, featureV) + + // ****************************************************** + // TODO: equality tolerance once XGBoost rounding bug in XGBoostClassifier.transform(probabilityUDF) is fixed + implicit val doubleEquality = new Equality[Double] { + def areEqual(a: Double, b: Any): Boolean = b match { + case s: Double => (a.isNaN && s.isNaN) || math.abs(a - s) < 0.0000001 + case _ => false + } + } + implicit val doubleArrayEquality = new Equality[Array[Double]] { + def areEqual(a: Array[Double], b: Any): Boolean = b match { + case s: Array[_] if a.length == s.length => a.zip(s).forall(v => doubleEquality.areEqual(v._1, v._2)) + case _ => false + } + } + // ****************************************************** compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } - def compareOutputs(df1: DataFrame, df2: DataFrame): Unit = { - + def compareOutputs(df1: DataFrame, df2: DataFrame)(implicit arrayEquality: Equality[Array[Double]]): Unit = { def keysStartsWith(name: String, value: Map[String, Double]): Array[Double] = { val names = value.keys.filter(_.startsWith(name)).toArray.sorted names.map(value) } - df1.show() - df2.show() val sorted1 = df1.collect().sortBy(_.getAs[Double](4)) val sorted2 = df2.collect().sortBy(_.getAs[Map[String, Double]](2)(Prediction.Keys.PredictionName)) sorted1.zip(sorted2).foreach{ case (r1, r2) => @@ -155,6 +168,7 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos } } + def compareOutputsPred(df1: DataFrame, df2: DataFrame, predIndex: Int): Unit = { val sorted1 = df1.collect().sortBy(_.getAs[Double](predIndex)) val sorted2 = df2.collect().sortBy(_.getAs[Map[String, Double]](2)(Prediction.Keys.PredictionName)) From ea71e943d632247bd19690a6420ecfd6ecdade9f Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 9 Aug 2018 19:52:33 -0700 Subject: [PATCH 12/40] cleanup --- .../impl/classification/OpLinearSVC.scala | 8 ++++---- .../classification/OpXGBoostClassifier.scala | 18 ++++-------------- .../OpGeneralizedLinearRegression.scala | 4 ++-- .../xgboost4j/scala/spark/XGBoostParams.scala | 18 ++++++++++++++---- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala index 0f3d9381ff..425d43a866 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala @@ -151,15 +151,15 @@ class OpLinearSVCModel ttov: TypeTag[Prediction#Value] ) extends OpPredictorWrapperModel[LinearSVCModel](uid = uid, operationName = operationName, sparkModel = sparkModel) { - @transient private lazy val predictRaw = reflectMethod(getSparkMlStage().get, "predictRaw") - @transient private lazy val predict = reflectMethod(getSparkMlStage().get, "predict") + @transient lazy private val predictRaw = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy private val predict = reflectMethod(getSparkMlStage().get, "predict") /** * Function used to convert input to output */ override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { - val raw = predictRaw.apply(features.value).asInstanceOf[Vector] - val pred = predict.apply(features.value).asInstanceOf[Double] + val raw = predictRaw(features.value).asInstanceOf[Vector] + val pred = predict(features.value).asInstanceOf[Double] Prediction(rawPrediction = raw, prediction = pred) } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala index 111c3d2a3c..173874889c 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -170,22 +170,12 @@ class OpXGBoostClassificationModel @transient lazy val probability2predictionMirror = reflectMethod(getSparkMlStage().get, "probability2prediction") - @transient lazy val model: XGBoostClassificationModel = getSparkMlStage().get - @transient lazy val booster: Booster = model.nativeBooster + private lazy val model = getSparkMlStage().get + private lazy val booster = model.nativeBooster override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { - // val appName = dataset.sparkSession.sparkContext.appName -// val cacheInfo = { -// if (model.getUseExternalMemory) { -// s"$appName-${TaskContext.get().stageId()}-dtest_cache-${TaskContext.getPartitionId()}" -// } else { -// null -// } -// } - booster.getFeatureScore() - val cacheInfo = null - val data = OpXGBoost.removeMissingValues(Iterator(features.value.asXGBLabeledPoint), model.getMissing) - val dm = new DMatrix(dataIter = data, cacheInfo = cacheInfo) + val data = OpXGBoost.removeMissingValues(Iterator(features.value.asXGB), model.getMissing) + val dm = new DMatrix(dataIter = data) val treeLimit: Int = 0 // TODO: instead use model.getTreeLimit once available val rawPred = booster.predict(dm, outPutMargin = true, treeLimit = treeLimit)(0).map(_.toDouble) val prob = booster.predict(dm, outPutMargin = false, treeLimit = treeLimit)(0).map(_.toDouble) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala index 9d64f62560..148d13e02a 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala @@ -191,8 +191,8 @@ class OpGeneralizedLinearRegressionModel */ override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { val offset = 0.0 - val raw = predictLink.apply(features.value, offset).asInstanceOf[Double] - val pred = predict.apply(features.value, offset).asInstanceOf[Double] + val raw = predictLink(features.value, offset).asInstanceOf[Double] + val pred = predict(features.value, offset).asInstanceOf[Double] Prediction(prediction = pred, rawPrediction = raw) } } diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala index 5ff1f5a685..58b510c257 100644 --- a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -34,9 +34,8 @@ import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.spark.DataUtils.MLVectorToXGBLabeledPoint import ml.dmlc.xgboost4j.scala.spark.params.GeneralParams import org.apache.log4j.{Level, Logger} -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} -import scala.collection.mutable import scala.collection.mutable.ArrayBuffer @@ -59,7 +58,18 @@ case object OpXGBoost { val DefaultTrackerConf = TrackerConf(workerConnectionTimeout = 0L, "scala") implicit class RichMLVectorToXGBLabeledPoint(val v: Vector) extends AnyVal { - def asXGBLabeledPoint: LabeledPoint = MLVectorToXGBLabeledPoint(v).asXGB + /** + * Converts a [[Vector]] to a data point with a dummy label. + * + * This is needed for constructing a [[ml.dmlc.xgboost4j.scala.DMatrix]] + * for prediction. + */ + def asXGB: LabeledPoint = v match { + case v: DenseVector => + LabeledPoint(0.0f, null, v.values.map(_.toFloat)) + case v: SparseVector => + LabeledPoint(0.0f, v.indices, v.values.map(_.toFloat)) + } } /** @@ -70,7 +80,7 @@ case object OpXGBoost { xgbLabelPoints.map { labeledPoint => val indices = new ArrayBuffer[Int]() val values = new ArrayBuffer[Float]() - for ((value, i) <- labeledPoint.values.zipWithIndex if value != missing) { + for {(value, i) <- labeledPoint.values.zipWithIndex if value != missing} { indices += (if (labeledPoint.indices == null) i else labeledPoint.indices(i)) values += value } From 42a95cfdf860fb7b21c5246378fed600c8f832c8 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 9 Aug 2018 20:49:10 -0700 Subject: [PATCH 13/40] update test --- .../classification/OpXGBoostClassifier.scala | 5 +++-- .../xgboost4j/scala/spark/XGBoostParams.scala | 1 - .../classification/OpClassifierModelTest.scala | 2 +- .../classification/OpXGBoostClassifierTest.scala | 16 ++++++++-------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala index 173874889c..355cf114cf 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -36,7 +36,7 @@ import com.salesforce.op.stages.impl.CheckIsResponseValues import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod import ml.dmlc.xgboost4j.scala.spark._ -import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait, ObjectiveTrait} +import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait, ObjectiveTrait} import org.apache.spark.ml.linalg.Vectors import scala.reflect.runtime.universe._ @@ -176,7 +176,8 @@ class OpXGBoostClassificationModel override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { val data = OpXGBoost.removeMissingValues(Iterator(features.value.asXGB), model.getMissing) val dm = new DMatrix(dataIter = data) - val treeLimit: Int = 0 // TODO: instead use model.getTreeLimit once available + val treeLimit = 0 // TODO: instead use model.getTreeLimit once available + // TODO: can we avoid two booster.predict calls here? val rawPred = booster.predict(dm, outPutMargin = true, treeLimit = treeLimit)(0).map(_.toDouble) val prob = booster.predict(dm, outPutMargin = false, treeLimit = treeLimit)(0).map(_.toDouble) val probability = if (model.numClasses == 2) Array(1.0 - prob(0), prob(0)) else prob diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala index 58b510c257..456498ff29 100644 --- a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -31,7 +31,6 @@ package ml.dmlc.xgboost4j.scala.spark import ml.dmlc.xgboost4j.LabeledPoint -import ml.dmlc.xgboost4j.scala.spark.DataUtils.MLVectorToXGBLabeledPoint import ml.dmlc.xgboost4j.scala.spark.params.GeneralParams import org.apache.log4j.{Level, Logger} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala index 73585a6c57..fa0b69c0b8 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala @@ -136,7 +136,7 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos val op = toOP(spk, spk.uid).setInput(labelF, featureV) // ****************************************************** - // TODO: equality tolerance once XGBoost rounding bug in XGBoostClassifier.transform(probabilityUDF) is fixed + // TODO: remove equality tolerance once XGBoost rounding bug in XGBoostClassifier.transform(probabilityUDF) is fixed implicit val doubleEquality = new Equality[Double] { def areEqual(a: Double, b: Any): Boolean = b match { case s: Double => (a.isNaN && s.isNaN) || math.abs(a - s) < 0.0000001 diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala index 71d2240a52..50374d6855 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala @@ -64,14 +64,14 @@ class OpXGBoostClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWra estimator.setSilent(1) val expectedResult = Seq( - Prediction(1.0), - Prediction(0.0), - Prediction(0.0), - Prediction(1.0), - Prediction(1.0), - Prediction(0.0), - Prediction(1.0), - Prediction(0.0) + Prediction(1.0, Array(0.6200000047683716), Array(0.3799999952316284, 0.6200000047683716)), + Prediction(0.0, Array(0.3799999952316284), Array(0.6200000047683716, 0.3799999952316284)), + Prediction(0.0, Array(0.3799999952316284), Array(0.6200000047683716, 0.3799999952316284)), + Prediction(1.0, Array(0.6200000047683716), Array(0.3799999952316284, 0.6200000047683716)), + Prediction(1.0, Array(0.6200000047683716), Array(0.3799999952316284, 0.6200000047683716)), + Prediction(0.0, Array(0.3799999952316284), Array(0.6200000047683716, 0.3799999952316284)), + Prediction(1.0, Array(0.6200000047683716), Array(0.3799999952316284, 0.6200000047683716)), + Prediction(0.0, Array(0.3799999952316284), Array(0.6200000047683716, 0.3799999952316284)) ) it should "allow the user to set the desired spark parameters" in { From 07391ada2440e757f81d420a718500a6f022e266 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 9 Aug 2018 21:06:56 -0700 Subject: [PATCH 14/40] Added test --- .../OpClassifierModelTest.scala | 1 - .../OpDecisionTreeClassifierTest.scala | 1 - ...OpMultilayerPerceptronClassifierTest.scala | 24 +++++++------------ .../classification/OpNaiveBayesTest.scala | 5 +--- .../salesforce/op/features/types/Maps.scala | 7 ++++++ 5 files changed, 17 insertions(+), 21 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala index fa0b69c0b8..28f916a28f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala @@ -168,7 +168,6 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos } } - def compareOutputsPred(df1: DataFrame, df2: DataFrame, predIndex: Int): Unit = { val sorted1 = df1.collect().sortBy(_.getAs[Double](predIndex)) val sorted2 = df2.collect().sortBy(_.getAs[Map[String, Double]](2)(Prediction.Keys.PredictionName)) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala index dad43b8266..77a3691ca1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala @@ -73,7 +73,6 @@ class OpDecisionTreeClassifierTest extends OpEstimatorSpec[Prediction, Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)) ) - it should "allow the user to set the desired spark parameters" in { estimator .setMaxDepth(6) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala index f3e83f1e0c..2d6306a419 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala @@ -64,26 +64,20 @@ class OpMultilayerPerceptronClassifierTest extends OpEstimatorSpec[Prediction, .setInput(feature1, feature2) .setLayers(Array(3, 5, 4, 2)) - val expectedResult = Seq( - Prediction(1.0), - Prediction(0.0), - Prediction(0.0), - Prediction(1.0), - Prediction(1.0), - Prediction(0.0), - Prediction(1.0), - Prediction(0.0) + Prediction(1.0, Array(-9.655814651428148,9.202335441336952), Array(6.456683124562021E-9,0.9999999935433168)), + Prediction(0.0, Array(9.475612761543069,-10.617525149157993), Array(0.9999999981221492,1.877850786773977E-9)), + Prediction(0.0, Array(9.715293827870028,-10.885255922155942), Array(0.9999999988694366,1.130563392364822E-9)), + Prediction(1.0, Array(-9.66776357765489,9.215079716735316), Array(6.299199338896916E-9,0.9999999937008006)), + Prediction(1.0, Array(-9.668041712561456,9.215387575592239), Array(6.2955091287182745E-9,0.9999999937044908)), + Prediction(0.0, Array(9.692904797559496,-10.860273756796797), Array(0.9999999988145918,1.1854083109077814E-9)), + Prediction(1.0, Array(-9.667687253240183,9.214995747770411), Array(6.300209139771467E-9,0.9999999936997908)), + Prediction(0.0, Array(9.703097414537668,-10.872171694864653), Array(0.9999999988404908,1.1595091005698914E-9)) ) - it should "allow the user to set the desired spark parameters" in { - estimator - .setMaxIter(50) - .setBlockSize(2) - .setSeed(42) + estimator.setMaxIter(50).setBlockSize(2).setSeed(42) estimator.fit(inputData) - estimator.predictor.getMaxIter shouldBe 50 estimator.predictor.getBlockSize shouldBe 2 estimator.predictor.getSeed shouldBe 42 diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala index 6ec1312bc5..0fe8f5cc6c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala @@ -72,12 +72,9 @@ class OpNaiveBayesTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperMod Prediction(0.0, Array(-4.54, -6.32), Array(0.85, 0.14)) ) - it should "allow the user to set the desired spark parameters" in { - estimator - .setSmoothing(2) + estimator.setSmoothing(2) estimator.fit(inputData) - estimator.predictor.getSmoothing shouldBe 2 } } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Maps.scala b/features/src/main/scala/com/salesforce/op/features/types/Maps.scala index 4834f64815..28f358f958 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Maps.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Maps.scala @@ -337,6 +337,13 @@ class Prediction private[op](value: Map[String, Double]) extends RealMap(value) val probKeys = keysStartsWith(ProbabilityName) if (probKeys.nonEmpty) probKeys.map(value) else Array(value(PredictionName)) } + + override def toString: String = { + val rawPred = rawPrediction.mkString("Array(", ",", ")") + val prob = probability.mkString("Array(", ",", ")") + s"${getClass.getSimpleName}(prediction = $prediction, rawPrediction = $rawPred, probability = $prob)" + } + } object Prediction { object Keys { From fafa9a0b643b6c2555c2949ad806d25a728b69e0 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 9 Aug 2018 21:11:13 -0700 Subject: [PATCH 15/40] Added test --- .../scala/com/salesforce/op/features/types/Maps.scala | 4 ++-- .../com/salesforce/op/features/types/PredictionTest.scala | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/features/src/main/scala/com/salesforce/op/features/types/Maps.scala b/features/src/main/scala/com/salesforce/op/features/types/Maps.scala index 28f358f958..edd3a8749e 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Maps.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Maps.scala @@ -339,8 +339,8 @@ class Prediction private[op](value: Map[String, Double]) extends RealMap(value) } override def toString: String = { - val rawPred = rawPrediction.mkString("Array(", ",", ")") - val prob = probability.mkString("Array(", ",", ")") + val rawPred = rawPrediction.mkString("Array(", ", ", ")") + val prob = probability.mkString("Array(", ", ", ")") s"${getClass.getSimpleName}(prediction = $prediction, rawPrediction = $rawPred, probability = $prob)" } diff --git a/features/src/test/scala/com/salesforce/op/features/types/PredictionTest.scala b/features/src/test/scala/com/salesforce/op/features/types/PredictionTest.scala index 8e57d4deca..b4cc4ad7e7 100644 --- a/features/src/test/scala/com/salesforce/op/features/types/PredictionTest.scala +++ b/features/src/test/scala/com/salesforce/op/features/types/PredictionTest.scala @@ -76,6 +76,14 @@ class PredictionTest extends FlatSpec with TestCommon { Prediction(1.0, Array(2.0, 3.0), Array.empty[Double]).score shouldBe Array(1.0) Prediction(1.0, Array.empty[Double], Array(2.0, 3.0)).score shouldBe Array(2.0, 3.0) } + it should "have a nice .toString method implementation" in { + Prediction(4.0).toString shouldBe + "Prediction(prediction = 4.0, rawPrediction = Array(), probability = Array())" + Prediction(1.0, Array(2.0, 3.0), Array.empty[Double]).toString shouldBe + "Prediction(prediction = 1.0, rawPrediction = Array(2.0, 3.0), probability = Array())" + Prediction(1.0, Array.empty[Double], Array(2.0, 3.0)).toString shouldBe + "Prediction(prediction = 1.0, rawPrediction = Array(), probability = Array(2.0, 3.0))" + } private def assertPredictionError(f: => Unit) = intercept[NonNullableEmptyException](f).getMessage shouldBe From be245a283b25abfb86434e23b3952cbabd2f8e33 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 9 Aug 2018 21:14:32 -0700 Subject: [PATCH 16/40] make stalastyle happy --- .../OpMultilayerPerceptronClassifierTest.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala index 2d6306a419..f04c7e3574 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala @@ -65,14 +65,14 @@ class OpMultilayerPerceptronClassifierTest extends OpEstimatorSpec[Prediction, .setLayers(Array(3, 5, 4, 2)) val expectedResult = Seq( - Prediction(1.0, Array(-9.655814651428148,9.202335441336952), Array(6.456683124562021E-9,0.9999999935433168)), - Prediction(0.0, Array(9.475612761543069,-10.617525149157993), Array(0.9999999981221492,1.877850786773977E-9)), - Prediction(0.0, Array(9.715293827870028,-10.885255922155942), Array(0.9999999988694366,1.130563392364822E-9)), - Prediction(1.0, Array(-9.66776357765489,9.215079716735316), Array(6.299199338896916E-9,0.9999999937008006)), - Prediction(1.0, Array(-9.668041712561456,9.215387575592239), Array(6.2955091287182745E-9,0.9999999937044908)), - Prediction(0.0, Array(9.692904797559496,-10.860273756796797), Array(0.9999999988145918,1.1854083109077814E-9)), - Prediction(1.0, Array(-9.667687253240183,9.214995747770411), Array(6.300209139771467E-9,0.9999999936997908)), - Prediction(0.0, Array(9.703097414537668,-10.872171694864653), Array(0.9999999988404908,1.1595091005698914E-9)) + Prediction(1.0, Array(-9.655814651428148, 9.202335441336952), Array(6.456683124562021E-9, 0.9999999935433168)), + Prediction(0.0, Array(9.475612761543069, -10.617525149157993), Array(0.9999999981221492, 1.877850786773977E-9)), + Prediction(0.0, Array(9.715293827870028, -10.885255922155942), Array(0.9999999988694366, 1.130563392364822E-9)), + Prediction(1.0, Array(-9.66776357765489, 9.215079716735316), Array(6.299199338896916E-9, 0.9999999937008006)), + Prediction(1.0, Array(-9.668041712561456, 9.215387575592239), Array(6.2955091287182745E-9, 0.9999999937044908)), + Prediction(0.0, Array(9.692904797559496, -10.860273756796797), Array(0.9999999988145918, 1.1854083109077814E-9)), + Prediction(1.0, Array(-9.667687253240183, 9.214995747770411), Array(6.300209139771467E-9, 0.9999999936997908)), + Prediction(0.0, Array(9.703097414537668, -10.872171694864653), Array(0.9999999988404908, 1.1595091005698914E-9)) ) it should "allow the user to set the desired spark parameters" in { From 7dbfd6803dc62ce5b10d5187a3219fb0337c67c6 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 9 Aug 2018 21:35:46 -0700 Subject: [PATCH 17/40] fix tests --- .../op/stages/impl/feature/PercentileCalibrator.scala | 1 + .../stages/impl/feature/PercentileCalibratorTest.scala | 10 +++++++--- .../regression/OpGeneralizedLinearRegressionTest.scala | 4 ++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/PercentileCalibrator.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/PercentileCalibrator.scala index fe2f9bd326..f234adebc9 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/PercentileCalibrator.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/PercentileCalibrator.scala @@ -61,6 +61,7 @@ class PercentileCalibrator(uid: String = UID[PercentileCalibrator]) .setNumBuckets($(expectedNumBuckets)) .setRelativeError(0) .setInputCol(dataset.columns(0)) + .setOutputCol(dataset.columns(0) + "-out") val bucketizerModel = estimator.fit(dataset) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/PercentileCalibratorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/PercentileCalibratorTest.scala index 490cdd583d..a6c3887b81 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/PercentileCalibratorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/PercentileCalibratorTest.scala @@ -75,11 +75,15 @@ class PercentileCalibratorTest extends FlatSpec with TestSparkContext { val splits = trans.getMetadata().getSummaryMetadata().getStringArray(PercentileCalibrator.OrigSplitsKey) val scaled = trans.getMetadata().getSummaryMetadata().getStringArray(PercentileCalibrator.ScaledSplitsKey) - splits should contain theSameElementsAs - Array(Double.NegativeInfinity, 0.7231742029971469, 0.9908988967772393, Double.PositiveInfinity).map(_.toString) - scaled should contain theSameElementsAs Array(0.0, 50.0, 99.0, 99.0).map(_.toString) + splits shouldEqual Array( + Double.NegativeInfinity, 0.25329310557439133, 0.7231742029971469, 0.9908988967772393, + Double.PositiveInfinity).map(_.toString) + scaled shouldEqual Array(0.0, 33.0, 66.0, 99.0, 99.0).map(_.toString) } + // Array("-Infinity", "0.25329310557439133", "0.7231742029971469", "0.9908988967772393", "Infinity") did not contain the same elements as ArraySeq("-Infinity", "0.7231742029971469", "0.9908988967772393", "Infinity") + + it should "return a maximum calibrated score of 99" in { val data = (0 until 1000).map(i => i.toLong.toIntegral -> Random.nextDouble.toRealNN) val (scoresDF, f1, f2): (DataFrame, Feature[Integral], Feature[RealNN]) = TestFeatureBuilder(data) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala index 9041a95007..095c5e71fd 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala @@ -72,14 +72,14 @@ class OpGeneralizedLinearRegressionTest extends OpEstimatorSpec[Prediction, .setRegParam(0.1) .setFitIntercept(true) .setTol(1E-4) - .setSolver("normal") + .setSolver("irls") estimator.fit(inputData) estimator.predictor.getMaxIter shouldBe 10 estimator.predictor.getRegParam shouldBe 0.1 estimator.predictor.getFitIntercept shouldBe true estimator.predictor.getTol shouldBe 1E-4 - estimator.predictor.getSolver shouldBe "normal" + estimator.predictor.getSolver shouldBe "irls" } } From a3e63340878da759d387e2d74fca924f3fc9f019 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 9 Aug 2018 21:46:28 -0700 Subject: [PATCH 18/40] cleanup --- .../op/stages/impl/feature/PercentileCalibratorTest.scala | 3 --- 1 file changed, 3 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/PercentileCalibratorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/PercentileCalibratorTest.scala index a6c3887b81..48f3ad5746 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/PercentileCalibratorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/PercentileCalibratorTest.scala @@ -81,9 +81,6 @@ class PercentileCalibratorTest extends FlatSpec with TestSparkContext { scaled shouldEqual Array(0.0, 33.0, 66.0, 99.0, 99.0).map(_.toString) } - // Array("-Infinity", "0.25329310557439133", "0.7231742029971469", "0.9908988967772393", "Infinity") did not contain the same elements as ArraySeq("-Infinity", "0.7231742029971469", "0.9908988967772393", "Infinity") - - it should "return a maximum calibrated score of 99" in { val data = (0 until 1000).map(i => i.toLong.toIntegral -> Random.nextDouble.toRealNN) val (scoresDF, f1, f2): (DataFrame, Feature[Integral], Feature[RealNN]) = TestFeatureBuilder(data) From 897fc5ef5216fba0e70d024a6c8920f9c30d7c82 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 13 Aug 2018 13:16:59 -0700 Subject: [PATCH 19/40] Fixed expected midpoint in DecisionTreeNumericBucketizer to reflect new midpoint algorithm introduced in Spark 2.3.0 --- .../impl/feature/DecisionTreeNumericBucketizerTest.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala index b2d916e8f1..a7622d2c96 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala @@ -171,11 +171,17 @@ class DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector, val (ds, rawBinary, rawCurrency, rawER, label) = TestFeatureBuilder("binary", "currency", "expectedRevenue", "label", rawData) + // Spark changed their split algorithm in 2.3.0 to use the mean, so adjust our expected value here + // https://issues.apache.org/jira/browse/SPARK-16957 + val splitValue = expectedRevenueData + .filter(x => x.nonEmpty && x.value.get > 0.0) + .map(_.value.get).min / 2.0 + val out = rawER.autoBucketize(label.copy(isResponse = true), trackNulls = true, trackInvalid = true) assertBucketizer( bucketizer = out.originStage.asInstanceOf[DecisionTreeNumericBucketizer[_, _ <: OPNumeric[_]]], data = ds, shouldSplit = true, trackNulls = true, trackInvalid = true, - expectedSplits = Array(Double.NegativeInfinity, 0.0, Double.PositiveInfinity), + expectedSplits = Array(Double.NegativeInfinity, splitValue, Double.PositiveInfinity), expectedTolerance = 0.15 ) } From 56a91d9df8fabe7cb2d3473f4fa9e7f3463fb061 Mon Sep 17 00:00:00 2001 From: Matthew Date: Wed, 15 Aug 2018 10:54:56 -0700 Subject: [PATCH 20/40] Update workflow runner test to use tempDir --- core/src/test/resources/RunnerParams.json | 6 +-- .../salesforce/op/OpWorkflowRunnerTest.scala | 46 +++++++++---------- 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/core/src/test/resources/RunnerParams.json b/core/src/test/resources/RunnerParams.json index 8ecfd9ce4b..cd9f3bfb26 100644 --- a/core/src/test/resources/RunnerParams.json +++ b/core/src/test/resources/RunnerParams.json @@ -19,9 +19,9 @@ "partitions": 1 } }, - "modelLocation": "resources/tmp/OpWorkflowRunnerTest/op-runner-test-model", - "writeLocation": "resources/tmp/OpWorkflowRunnerTest/op-runner-test-write", - "metricsLocation": "resources/tmp/OpWorkflowRunnerTest/op-runner-test-metrics", + "modelLocation": "", + "writeLocation": "", + "metricsLocation": "", "customParams" : {}, "customTagName": "myTag", "collectStageMetrics": true, diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala index dd5c67ee69..e196f4c30d 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala @@ -36,34 +36,30 @@ import com.salesforce.op.OpWorkflowRunType._ import com.salesforce.op.evaluators.{BinaryClassificationMetrics, Evaluators} import com.salesforce.op.features.types._ import com.salesforce.op.readers.DataFrameFieldNames._ +import com.salesforce.op.stages.impl.classification.BinaryClassificationModelSelector import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry.LogisticRegression -import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} import com.salesforce.op.test.{PassengerSparkFixtureTest, TestSparkStreamingContext} import com.salesforce.op.utils.spark.AppMetrics import com.salesforce.op.utils.spark.RichDataset._ import org.apache.commons.io.FileUtils import org.junit.runner.RunWith import org.scalactic.source -import org.scalatest.AsyncFlatSpec +import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import org.apache.log4j.Level import scala.collection.JavaConverters._ -import scala.concurrent.Promise +import scala.concurrent.{Future, Promise} import scala.reflect.ClassTag @RunWith(classOf[JUnitRunner]) -class OpWorkflowRunnerTest extends AsyncFlatSpec - with PassengerSparkFixtureTest with TestSparkStreamingContext { +class OpWorkflowRunnerTest extends FlatSpec with PassengerSparkFixtureTest with TestSparkStreamingContext { val log = LoggerFactory.getLogger(this.getClass) - val thisDir = new File("resources/tmp/OpWorkflowRunnerTest/").getCanonicalFile - - override def beforeAll: Unit = try deleteRecursively(thisDir) finally super.beforeAll - override def afterAll: Unit = try deleteRecursively(thisDir) finally super.afterAll + lazy val testDir = tempDir + "/op-runner-test" + lazy val modelLocation = new File( testDir + "/model") private val features = Seq(height, weight, gender, description, age).transmogrify() private val survivedNum = survived.occurs() @@ -73,9 +69,8 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec .setLogisticRegressionRegParam(0) .setInput(survivedNum, features).getOutput() private val workflow = new OpWorkflow().setResultFeatures(pred, raw, survivedNum).setReader(dataReader) - private val evaluator = - Evaluators.BinaryClassification().setLabelCol(survivedNum).setPredictionCol(pred).setRawPredictionCol(raw) - .setProbabilityCol(prob) + private val evaluator = Evaluators.BinaryClassification() + .setLabelCol(survivedNum).setPredictionCol(pred).setRawPredictionCol(raw).setProbabilityCol(prob) val metricsPromise = Promise[AppMetrics]() @@ -92,7 +87,8 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec val invalidParamsLocation = Some(resourceFile(name = "RunnerParamsInvalid.json").getPath) val paramsLocation = Some(resourceFile(name = "RunnerParams.json").getPath) - val testConfig = OpWorkflowRunnerConfig(paramLocation = paramsLocation) + def testConfig: OpWorkflowRunnerConfig = OpWorkflowRunnerConfig(paramLocation = paramsLocation) + .copy(modelLocation = Some(modelLocation.toString)) Spec[OpWorkflowRunner] should "correctly determine if the command line options are valid for each run type" in { assertConf(OpWorkflowRunnerConfig(Train, modelLocation = Some("Test"))) @@ -133,12 +129,10 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec } it should "train a workflow and write the trained model" in { - lazy val modelLocation = new File(thisDir + "/op-runner-test-model") - lazy val modelMetricsLocation = new File(thisDir + "/op-runner-test-metrics/train") + val modelMetricsLocation = new File(testDir + "/train-metrics") val runConfig = testConfig.copy( runType = Train, - modelLocation = Some(modelLocation.toString), metricsLocation = Some(modelMetricsLocation.toString) ) val res = doRun[TrainResult](runConfig, modelLocation, modelMetricsLocation) @@ -146,8 +140,8 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec } it should "score a dataset with a trained model" in { - val scoresLocation = new File(thisDir + "/op-runner-test-write/score") - val scoringMetricsLocation = new File(thisDir + "/op-runner-test-metrics/score") + val scoresLocation = new File(testDir + "/score") + val scoringMetricsLocation = new File(testDir + "/score-metrics") val runConfig = testConfig.copy( runType = Score, @@ -162,8 +156,8 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec } it should "streaming score a dataset with a trained model" in { - val readLocation = new File(thisDir + "/op-runner-test-read/streaming-score") - val scoresLocation = new File(thisDir + "/op-runner-test-write/streaming-score") + val readLocation = new File(testDir + "/streaming-score-in") + val scoresLocation = new File(testDir + "/streaming-score-out") // Prepare streaming input data FileUtils.forceMkdir(readLocation) @@ -185,18 +179,20 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec } it should "evaluate a dataset with a trained model" in { - val metricsLocation = new File(thisDir + "/op-runner-test-metrics/eval") + val scoresLocation = new File(testDir + "/eval-score") + val metricsLocation = new File(testDir + "/eval-metrics") val runConfig = testConfig.copy( runType = Evaluate, + writeLocation = Some(scoresLocation.toString), metricsLocation = Some(metricsLocation.toString) ) - val res = doRun[EvaluateResult](runConfig, metricsLocation) + val res = doRun[EvaluateResult](runConfig, metricsLocation, scoresLocation) res.metrics shouldBe a[BinaryClassificationMetrics] } it should "compute features upto with a workflow" in { - lazy val featuresLocation = new File(thisDir + "/op-runner-test-write/features") + lazy val featuresLocation = new File(testDir + "/features") val runConfig = testConfig.copy( runType = Features, @@ -207,7 +203,6 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec } it should "collect and report metrics on application end" in { - spark.stop() metricsPromise.future.map { metrics => metrics.appId.isEmpty shouldBe false OpWorkflowRunType.withNameInsensitiveOption(metrics.runType).isDefined shouldBe true @@ -217,6 +212,7 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec metrics.appDuration should be >= 0L metrics.stageMetrics.length should be > 0 } + Future(spark.stop()).map(_ => true shouldBe true) } private def assertConf(c: OpWorkflowRunnerConfig)(implicit pos: source.Position) = { From 46911f4926a18b7a70430c3d401afd0814238986 Mon Sep 17 00:00:00 2001 From: Matthew Date: Wed, 15 Aug 2018 11:01:39 -0700 Subject: [PATCH 21/40] flatmap futures --- .../com/salesforce/op/OpWorkflowRunnerTest.scala | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala index 435b37e210..1a1490448e 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala @@ -31,7 +31,6 @@ package com.salesforce.op import java.io.File -import java.nio.file.Paths import com.salesforce.op.OpWorkflowRunType._ import com.salesforce.op.evaluators.{BinaryClassificationMetrics, Evaluators} @@ -45,7 +44,7 @@ import com.salesforce.op.utils.spark.RichDataset._ import org.apache.commons.io.FileUtils import org.junit.runner.RunWith import org.scalactic.source -import org.scalatest.FlatSpec +import org.scalatest.AsyncFlatSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory @@ -55,7 +54,7 @@ import scala.reflect.ClassTag @RunWith(classOf[JUnitRunner]) -class OpWorkflowRunnerTest extends FlatSpec with PassengerSparkFixtureTest with TestSparkStreamingContext { +class OpWorkflowRunnerTest extends AsyncFlatSpec with PassengerSparkFixtureTest with TestSparkStreamingContext { val log = LoggerFactory.getLogger(this.getClass) @@ -204,7 +203,10 @@ class OpWorkflowRunnerTest extends FlatSpec with PassengerSparkFixtureTest with } it should "collect and report metrics on application end" in { - metricsPromise.future.map { metrics => + for { + _ <- Future(spark.stop()) // stop spark to make sure metrics promise completes + metrics <- metricsPromise.future + } yield { metrics.appId.isEmpty shouldBe false OpWorkflowRunType.withNameInsensitiveOption(metrics.runType).isDefined shouldBe true metrics.appName shouldBe "op-test" @@ -213,7 +215,6 @@ class OpWorkflowRunnerTest extends FlatSpec with PassengerSparkFixtureTest with metrics.appDuration should be >= 0L metrics.stageMetrics.length should be > 0 } - Future(spark.stop()).map(_ => true shouldBe true) } private def assertConf(c: OpWorkflowRunnerConfig)(implicit pos: source.Position) = { From 907359126c339f9775749e08ac54b42668abb7d8 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 16 Aug 2018 19:31:13 -0700 Subject: [PATCH 22/40] update with official 0.80 release --- build.gradle | 4 +--- core/build.gradle | 2 +- .../op/stages/impl/classification/OpXGBoostClassifier.scala | 4 ++-- .../op/stages/impl/classification/OpClassifierModelTest.scala | 1 + 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/build.gradle b/build.gradle index c9e2bc660b..386da56dfe 100644 --- a/build.gradle +++ b/build.gradle @@ -19,9 +19,6 @@ plugins { allprojects { repositories { mavenCentral() - // Needed for XGboost - // TODO: remove this repo once XGBoost is published into Maven Central or similar - maven { url 'https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/' } maven { url 'https://dl.bintray.com/salesforce/maven' } } } @@ -91,6 +88,7 @@ configure(allProjs) { commonsValidatorVersion = '1.6' commonsIOVersion = '2.6' scoveragePluginVersion = '1.3.1' + xgboostVersion = '0.80' mainClassName = 'com.salesforce.Main' } diff --git a/core/build.gradle b/core/build.gradle index c232aa810b..e9035d31da 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -24,7 +24,7 @@ dependencies { compile "com.github.scopt:scopt_$scalaVersion:$scoptVersion" // XGBoost - compile "ml.dmlc:xgboost4j-spark:0.80-SNAPSHOT" + compile "ml.dmlc:xgboost4j-spark:$xgboostVersion" // Akka slfj4 logging (version matches XGBoost dependency) testCompile "com.typesafe.akka:akka-slf4j_$scalaVersion:2.3.11" } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala index 355cf114cf..4cbd48755d 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -171,12 +171,12 @@ class OpXGBoostClassificationModel reflectMethod(getSparkMlStage().get, "probability2prediction") private lazy val model = getSparkMlStage().get - private lazy val booster = model.nativeBooster + private lazy val booster = model.nativeBooster + private lazy val treeLimit = model.getTreeLimit.toInt override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { val data = OpXGBoost.removeMissingValues(Iterator(features.value.asXGB), model.getMissing) val dm = new DMatrix(dataIter = data) - val treeLimit = 0 // TODO: instead use model.getTreeLimit once available // TODO: can we avoid two booster.predict calls here? val rawPred = booster.predict(dm, outPutMargin = true, treeLimit = treeLimit)(0).map(_.toDouble) val prob = booster.predict(dm, outPutMargin = false, treeLimit = treeLimit)(0).map(_.toDouble) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala index 28f916a28f..c5de4daabc 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala @@ -137,6 +137,7 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext with OpXGBoos // ****************************************************** // TODO: remove equality tolerance once XGBoost rounding bug in XGBoostClassifier.transform(probabilityUDF) is fixed + // TODO: ETA - will be added in XGBoost version 0.81 implicit val doubleEquality = new Equality[Double] { def areEqual(a: Double, b: Any): Boolean = b match { case s: Double => (a.isNaN && s.isNaN) || math.abs(a - s) < 0.0000001 From c242b8402e5075237dadccba1d4c5d8b8367567e Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 16 Aug 2018 22:10:40 -0700 Subject: [PATCH 23/40] update double opt equality --- core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index f6776468b7..dc5b5220cb 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -63,8 +63,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest { implicit val doubleOptEquality = new Equality[Option[Double]] { def areEqual(a: Option[Double], b: Any): Boolean = b match { case None => a.isEmpty - case s: Option[Double]@unchecked => (a.exists(_.isNaN) && s.exists(_.isNaN)) || - (a.nonEmpty && a.toSeq.zip(s.toSeq).forall{ case (n, m) => n == m }) + case Some(d: Double) => (a.exists(_.isNaN) && d.isNaN) || a.contains(d) case _ => false } } From 2eba329409aaaeb9666832dba15ba88ecc4111d9 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 17 Aug 2018 13:03:21 -0700 Subject: [PATCH 24/40] reuse the internal xgboost method --- .../classification/OpXGBoostClassifier.scala | 2 +- .../xgboost4j/scala/spark/XGBoostParams.scala | 19 +++---------------- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala index 4cbd48755d..6bc78daff2 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -175,7 +175,7 @@ class OpXGBoostClassificationModel private lazy val treeLimit = model.getTreeLimit.toInt override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { - val data = OpXGBoost.removeMissingValues(Iterator(features.value.asXGB), model.getMissing) + val data = removeMissingValues(Iterator(features.value.asXGB), model.getMissing) val dm = new DMatrix(dataIter = data) // TODO: can we avoid two booster.predict calls here? val rawPred = booster.predict(dm, outPutMargin = true, treeLimit = treeLimit)(0).map(_.toDouble) diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala index 456498ff29..6ba50006e9 100644 --- a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -72,21 +72,8 @@ case object OpXGBoost { } /** - * Copied from [[ml.dmlc.xgboost4j.scala.spark.XGBoost.removeMissingValues]] private method + * Hack to access [[ml.dmlc.xgboost4j.scala.spark.XGBoost.removeMissingValues]] private method */ - def removeMissingValues(xgbLabelPoints: Iterator[LabeledPoint], missing: Float): Iterator[LabeledPoint] = { - if (!missing.isNaN) { - xgbLabelPoints.map { labeledPoint => - val indices = new ArrayBuffer[Int]() - val values = new ArrayBuffer[Float]() - for {(value, i) <- labeledPoint.values.zipWithIndex if value != missing} { - indices += (if (labeledPoint.indices == null) i else labeledPoint.indices(i)) - values += value - } - labeledPoint.copy(indices = indices.toArray, values = values.toArray) - } - } else { - xgbLabelPoints - } - } + def removeMissingValues(xgbLabelPoints: Iterator[LabeledPoint], missing: Float): Iterator[LabeledPoint] = + XGBoost.removeMissingValues(xgbLabelPoints, missing) } From 55cd176d777652342ee95ab4d27ad8b6a7deb414 Mon Sep 17 00:00:00 2001 From: Matthew Date: Mon, 20 Aug 2018 15:47:44 -0700 Subject: [PATCH 25/40] organize imports --- .../src/test/scala/com/salesforce/op/ModelInsightsTest.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 25e2d9c924..335457039d 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -30,16 +30,15 @@ package com.salesforce.op -import com.salesforce.op.evaluators.{EvalMetric, EvaluationMetrics} import com.salesforce.op.features.Feature import com.salesforce.op.features.types.{PickList, Real, RealNN} import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, BinaryClassificationModelsToTry, OpLogisticRegression} import com.salesforce.op.stages.impl.preparators._ import com.salesforce.op.stages.impl.regression.{OpLinearRegression, RegressionModelSelector} import com.salesforce.op.stages.impl.selector.ModelSelectorNames.EstimatorType +import com.salesforce.op.stages.impl.selector.SelectedModel import com.salesforce.op.stages.impl.selector.ValidationType._ -import com.salesforce.op.stages.impl.selector.{ModelEvaluation, ProblemType, SelectedModel, ValidationType} -import com.salesforce.op.stages.impl.tuning.{DataSplitter, SplitterSummary} +import com.salesforce.op.stages.impl.tuning.DataSplitter import com.salesforce.op.test.PassengerSparkFixtureTest import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.param.ParamMap From b553bbe6b90d951363df24e48b348d1d9e7fc40d Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Mon, 20 Aug 2018 20:47:15 -0700 Subject: [PATCH 26/40] added xgboost contributions to model insights --- .../com/salesforce/op/ModelInsights.scala | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 06a21eb0f4..e40013ce0d 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -45,6 +45,7 @@ import com.salesforce.op.utils.spark.RichMetadata._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import com.salesforce.op.utils.table.Alignment._ import com.salesforce.op.utils.table.Table +import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostRegressionModel} import org.apache.spark.ml.classification._ import org.apache.spark.ml.regression._ import org.apache.spark.ml.{Model, PipelineStage, Transformer} @@ -606,39 +607,41 @@ case object ModelInsights { } private[op] def getModelContributions(model: Option[Model[_]]): Seq[Seq[Double]] = { - model.map { - case m: SparkWrapperParams[_] => m.getSparkMlStage() match { // TODO add additional models - case Some(m: LogisticRegressionModel) => m.coefficientMatrix.rowIter.toSeq.map(_.toArray.toSeq) - case Some(m: RandomForestClassificationModel) => Seq(m.featureImportances.toArray.toSeq) - case Some(m: NaiveBayesModel) => m.theta.rowIter.toSeq.map(_.toArray.toSeq) - case Some(m: DecisionTreeClassificationModel) => Seq(m.featureImportances.toArray.toSeq) - case Some(m: LinearRegressionModel) => Seq(m.coefficients.toArray.toSeq) - case Some(m: DecisionTreeRegressionModel) => Seq(m.featureImportances.toArray.toSeq) - case Some(m: RandomForestRegressionModel) => Seq(m.featureImportances.toArray.toSeq) - case _ => Seq.empty[Seq[Double]] - } - case _ => Seq.empty[Seq[Double]] - }.getOrElse(Seq.empty[Seq[Double]]) + val stage = model.flatMap { + case m: SparkWrapperParams[_] => m.getSparkMlStage() + case _ => None + } + val contributions = stage.collect { + case m: LogisticRegressionModel => m.coefficientMatrix.rowIter.toSeq.map(_.toArray.toSeq) + case m: RandomForestClassificationModel => Seq(m.featureImportances.toArray.toSeq) + case m: NaiveBayesModel => m.theta.rowIter.toSeq.map(_.toArray.toSeq) + case m: DecisionTreeClassificationModel => Seq(m.featureImportances.toArray.toSeq) + case m: LinearRegressionModel => Seq(m.coefficients.toArray.toSeq) + case m: DecisionTreeRegressionModel => Seq(m.featureImportances.toArray.toSeq) + case m: RandomForestRegressionModel => Seq(m.featureImportances.toArray.toSeq) + case m: XGBoostRegressionModel => Seq(m.nativeBooster.getFeatureScore().values.map(_.toDouble).toSeq) + case m: XGBoostClassificationModel => Seq(m.nativeBooster.getFeatureScore().values.map(_.toDouble).toSeq) + } + contributions.getOrElse(Seq.empty) } private def getModelInfo(model: Option[Model[_]]): Option[ModelSelectorSummary] = { model match { - case Some(m: SelectedModel) => Try(ModelSelectorSummary.fromMetadata(m.getMetadata().getSummaryMetadata())) - .toOption + case Some(m: SelectedModel) => + Try(ModelSelectorSummary.fromMetadata(m.getMetadata().getSummaryMetadata())).toOption case _ => None } } private def getStageInfo(stages: Array[OPStage]): Map[String, Any] = { - def getParams(stage: PipelineStage): Map[String, String] = - stage.extractParamMap().toSeq - .collect{ - case p if p.param.name == OpPipelineStageParamsNames.InputFeatures => - p.param.name -> p.value.asInstanceOf[Array[TransientFeature]].map(_.toJsonString()).mkString(", ") - case p if p.param.name != OpPipelineStageParamsNames.OutputMetadata && - p.param.name != OpPipelineStageParamsNames.InputSchema => p.param.name -> p.value.toString - }.toMap - + def getParams(stage: PipelineStage): Map[String, String] = { + stage.extractParamMap().toSeq.collect { + case p if p.param.name == OpPipelineStageParamsNames.InputFeatures => + p.param.name -> p.value.asInstanceOf[Array[TransientFeature]].map(_.toJsonString()).mkString(", ") + case p if p.param.name != OpPipelineStageParamsNames.OutputMetadata && + p.param.name != OpPipelineStageParamsNames.InputSchema => p.param.name -> p.value.toString + }.toMap + } stages.map { s => val params = s match { case m: Model[_] => getParams(if (m.hasParent) m.parent else m) // try for parent estimator so can get params From b20426a0b093ca0302ab65e061fbf724e77ee623 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 24 Aug 2018 17:18:06 -0700 Subject: [PATCH 27/40] minor fixes --- .../impl/preparators/SanityCheckerTest.scala | 24 +++++++++---------- .../RegressionModelSelectorTest.scala | 2 +- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala index 8efdf37403..e0a95e204f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala @@ -34,17 +34,15 @@ import com.salesforce.op._ import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ import com.salesforce.op.stages.MetadataParam -import com.salesforce.op.stages.impl.feature._ import com.salesforce.op.stages.base.binary.{BinaryEstimator, BinaryModel} -import com.salesforce.op.stages.impl.feature.{HashSpaceStrategy, RealNNVectorizer, SmartTextMapVectorizer} -import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.stages.impl.feature.{HashSpaceStrategy, RealNNVectorizer, SmartTextMapVectorizer, _} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichMetadata._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} -import org.apache.log4j.Level +import org.apache.spark.SparkException import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql.types.Metadata import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.SparkException import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @@ -70,7 +68,9 @@ case class TextRawData @RunWith(classOf[JUnitRunner]) class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OPVector, OPVector], - BinaryEstimator[RealNN, OPVector, OPVector]] with TestSparkContext { + BinaryEstimator[RealNN, OPVector, OPVector]] { + + override def specName: String = Spec[SanityChecker] // loggingLevel(Level.INFO) @@ -87,13 +87,11 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP TextRawData("9", 0.0, Map("beverage" -> "tea")), TextRawData("10", 0.0, Map("beverage" -> "coffee")), TextRawData("11", 0.0, Map("beverage" -> "water")) - ).map( textRawData => - ( - textRawData.id.toText, - textRawData.target.toRealNN, - textRawData.textMap.toTextMap - ) - ) + ).map( textRawData => ( + textRawData.id.toText, + textRawData.target.toRealNN, + textRawData.textMap.toTextMap + )) val (textData, id, target, textMap) = TestFeatureBuilder("id", "target", "textMap", textRawData) val targetResponse: FeatureLike[RealNN] = target.copy(isResponse = true) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelectorTest.scala index 2cb2a1619b..53a14f35f5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelectorTest.scala @@ -72,7 +72,7 @@ class RegressionModelSelectorTest extends FlatSpec with TestSparkContext with Co .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0)) .addGrid(lr.maxIter, Array(10, 100)) .addGrid(lr.regParam, Array(0.0)) - .addGrid(lr.solver, Array("lbfgs")) + .addGrid(lr.solver, Array("l-bfgs")) .build() val rf = new OpRandomForestRegressor() From 39bce3913ad0b3c9606f5101afd81a30faed55f7 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 24 Aug 2018 18:17:59 -0700 Subject: [PATCH 28/40] update test --- .../op/features/types/FeatureTypeTest.scala | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeTest.scala b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeTest.scala index 1efa062895..7fb921341f 100644 --- a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeTest.scala +++ b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeTest.scala @@ -211,12 +211,16 @@ class FeatureTypeTest extends PropSpec with PropertyChecks with TestCommon { property("toString should return a valid string") { forAll(featureTypesVals) { ft => val actual = ft.toString - val v = ft.value match { + val v = ft match { case _ if ft.isEmpty => "" - case Seq(lat: Double, lon: Double, acc: Double) if ft.isInstanceOf[Geolocation] => - f"$lat%.5f, $lon%.5f, ${GeolocationAccuracy.withValue(acc.toInt)}" - case t: TraversableOnce[_] => t.mkString(", ") - case x => x.toString + case g: Geolocation => + f"${g.lat}%.5f, ${g.lon}%.5f, ${g.accuracy}" + case p: Prediction => + val rawPred = p.rawPrediction.mkString("Array(", ", ", ")") + val prob = p.probability.mkString("Array(", ", ", ")") + s"prediction = ${p.prediction}, rawPrediction = $rawPred, probability = $prob" + case SomeValue(v: TraversableOnce[_]) => v.mkString(", ") + case t => t.value.toString } val expected = s"${ft.getClass.getSimpleName}($v)" From a9dd209b50a7147114a121a57169653f36dff9cc Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 24 Aug 2018 18:30:16 -0700 Subject: [PATCH 29/40] update spec name --- .../impl/classification/OpDecisionTreeClassifierTest.scala | 2 +- .../op/stages/impl/classification/OpGBTClassifierTest.scala | 2 +- .../op/stages/impl/classification/OpLinearSVCTest.scala | 2 +- .../stages/impl/classification/OpLogisticRegressionTest.scala | 2 +- .../classification/OpMultilayerPerceptronClassifierTest.scala | 2 +- .../op/stages/impl/classification/OpNaiveBayesTest.scala | 2 +- .../impl/classification/OpRandomForestClassifierTest.scala | 2 +- .../op/stages/impl/classification/OpXGBoostClassifierTest.scala | 2 +- .../op/stages/impl/regression/OpDecisionTreeRegressorTest.scala | 2 +- .../op/stages/impl/regression/OpGBTRegressorTest.scala | 2 +- .../impl/regression/OpGeneralizedLinearRegressionTest.scala | 2 +- .../op/stages/impl/regression/OpLinearRegressionTest.scala | 2 +- .../op/stages/impl/regression/OpRandomForestRegressorTest.scala | 2 +- .../op/stages/impl/regression/OpXGBoostRegressorTest.scala | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala index 77a3691ca1..a7b864b2a5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala @@ -45,7 +45,7 @@ class OpDecisionTreeClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[DecisionTreeClassificationModel], OpPredictorWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]] with PredictionEquality { - override def specName: String = classOf[OpDecisionTreeClassifier].getSimpleName + override def specName: String = Spec[OpDecisionTreeClassifier] val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala index d21fedb85c..7d1fa11e43 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala @@ -44,7 +44,7 @@ import org.scalatest.junit.JUnitRunner class OpGBTClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GBTClassificationModel], OpPredictorWrapper[GBTClassifier, GBTClassificationModel]] with PredictionEquality { - override def specName: String = classOf[OpGBTClassifier].getSimpleName + override def specName: String = Spec[OpGBTClassifier] val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala index ad8a690d7d..c584bdd737 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala @@ -44,7 +44,7 @@ import org.scalatest.junit.JUnitRunner class OpLinearSVCTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LinearSVCModel], OpPredictorWrapper[LinearSVC, LinearSVCModel]] with PredictionEquality { - override def specName: String = classOf[OpLinearSVC].getSimpleName + override def specName: String = Spec[OpLinearSVC] val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala index 3e18333742..a9997d2300 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala @@ -44,7 +44,7 @@ import org.scalatest.junit.JUnitRunner class OpLogisticRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LogisticRegressionModel], OpPredictorWrapper[LogisticRegression, LogisticRegressionModel]] with PredictionEquality { - override def specName: String = classOf[OpLogisticRegression].getSimpleName + override def specName: String = Spec[OpLogisticRegression] val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala index f04c7e3574..f3486972a7 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala @@ -45,7 +45,7 @@ class OpMultilayerPerceptronClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[MultilayerPerceptronClassificationModel], OpPredictorWrapper[MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel]] with PredictionEquality { - override def specName: String = classOf[OpMultilayerPerceptronClassifier].getSimpleName + override def specName: String = Spec[OpMultilayerPerceptronClassifier] val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala index 0fe8f5cc6c..7e24de31ab 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala @@ -44,7 +44,7 @@ import org.scalatest.junit.JUnitRunner class OpNaiveBayesTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[NaiveBayesModel], OpPredictorWrapper[NaiveBayes, NaiveBayesModel]] with PredictionEquality { - override def specName: String = classOf[OpNaiveBayes].getSimpleName + override def specName: String = Spec[OpNaiveBayes] val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala index fa02d5347f..7c9e9d0277 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala @@ -44,7 +44,7 @@ class OpRandomForestClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[RandomForestClassificationModel], OpPredictorWrapper[RandomForestClassifier, RandomForestClassificationModel]] with PredictionEquality { - override def specName: String = classOf[OpRandomForestClassifier].getSimpleName + override def specName: String = Spec[OpRandomForestClassifier] lazy val (inputData, rawLabelMulti, featuresMulti) = TestFeatureBuilder[RealNN, OPVector]("labelMulti", "featuresMulti", diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala index 50374d6855..77760810db 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifierTest.scala @@ -45,7 +45,7 @@ class OpXGBoostClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWra OpPredictorWrapper[XGBoostClassifier, XGBoostClassificationModel]] with PredictionEquality with OpXGBoostQuietLogging { - override def specName: String = classOf[OpXGBoostClassifier].getSimpleName + override def specName: String = Spec[OpXGBoostClassifier] val rawData = Seq( 1.0 -> Vectors.dense(12.0, 4.3, 1.3), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala index 1f1b20b926..0b4554f3a5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala @@ -44,7 +44,7 @@ class OpDecisionTreeRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[DecisionTreeRegressionModel], OpPredictorWrapper[DecisionTreeRegressor, DecisionTreeRegressionModel]] with PredictionEquality { - override def specName: String = classOf[OpDecisionTreeRegressor].getSimpleName + override def specName: String = Spec[OpDecisionTreeRegressor] val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala index fe272c2551..24e896188a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala @@ -43,7 +43,7 @@ import org.scalatest.junit.JUnitRunner class OpGBTRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GBTRegressionModel], OpPredictorWrapper[GBTRegressor, GBTRegressionModel]] with PredictionEquality { - override def specName: String = classOf[OpGBTRegressor].getSimpleName + override def specName: String = Spec[OpGBTRegressor] val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala index 095c5e71fd..c302ca9af2 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala @@ -44,7 +44,7 @@ class OpGeneralizedLinearRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GeneralizedLinearRegressionModel], OpPredictorWrapper[GeneralizedLinearRegression, GeneralizedLinearRegressionModel]] with PredictionEquality { - override def specName: String = classOf[OpGeneralizedLinearRegression].getSimpleName + override def specName: String = Spec[OpGeneralizedLinearRegression] val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala index e672b66382..c7f346699e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala @@ -44,7 +44,7 @@ import org.scalatest.junit.JUnitRunner class OpLinearRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LinearRegressionModel], OpPredictorWrapper[LinearRegression, LinearRegressionModel]] with PredictionEquality { - override def specName: String = classOf[OpLinearRegression].getSimpleName + override def specName: String = Spec[OpLinearRegression] val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala index e1fac3c60e..b605a4a60a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala @@ -44,7 +44,7 @@ class OpRandomForestRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[RandomForestRegressionModel], OpPredictorWrapper[RandomForestRegressor, RandomForestRegressionModel]] with PredictionEquality { - override def specName: String = classOf[OpRandomForestRegressor].getSimpleName + override def specName: String = Spec[OpRandomForestRegressor] val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala index 59e640a22a..db4498638c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressorTest.scala @@ -45,7 +45,7 @@ class OpXGBoostRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrap OpPredictorWrapper[XGBoostRegressor, XGBoostRegressionModel]] with PredictionEquality with OpXGBoostQuietLogging { - override def specName: String = classOf[OpXGBoostRegressor].getSimpleName + override def specName: String = Spec[OpXGBoostRegressor] val rawData = Seq( (10.0, Vectors.dense(1.0, 4.3, 1.3)), From 29571d741b5bd0e4b6b7c55c9a971aaa76c962fc Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 24 Aug 2018 20:14:21 -0700 Subject: [PATCH 30/40] update tests --- .../salesforce/op/OpWorkflowRunnerTest.scala | 24 +++++++++---------- .../op/readers/DataGenerationTest.scala | 6 ++--- .../JoinedDataReaderDataGenerationTest.scala | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala index e7d40f19d2..23ef395cf9 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala @@ -31,6 +31,7 @@ package com.salesforce.op import java.io.File +import java.nio.file.Paths import com.salesforce.op.OpWorkflowRunType._ import com.salesforce.op.evaluators.{BinaryClassificationMetrics, Evaluators} @@ -56,14 +57,13 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec with PassengerSparkFixtureTest val log = LoggerFactory.getLogger(this.getClass) - lazy val testDir = tempDir + "/op-runner-test" - lazy val modelLocation = new File( testDir + "/model") + lazy val testDir = Paths.get(tempDir.toString, "op-runner-test").toFile.getAbsoluteFile + lazy val modelLocation = Paths.get(testDir.toString, "model").toFile.getAbsoluteFile private val features = Seq(height, weight, gender, description, age).transmogrify() private val survivedNum = survived.occurs() - val pred = new OpLogisticRegression().setRegParam(0) - .setInput(survivedNum, features).getOutput() + val pred = new OpLogisticRegression().setRegParam(0).setInput(survivedNum, features).getOutput() private val workflow = new OpWorkflow().setResultFeatures(pred, survivedNum).setReader(dataReader) private val evaluator = Evaluators.BinaryClassification().setLabelCol(survivedNum).setPredictionCol(pred) @@ -124,7 +124,7 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec with PassengerSparkFixtureTest } it should "train a workflow and write the trained model" in { - val modelMetricsLocation = new File(testDir + "/train-metrics") + val modelMetricsLocation = Paths.get(testDir.toString, "train-metrics").toFile.getCanonicalFile val runConfig = testConfig.copy( runType = Train, @@ -135,8 +135,8 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec with PassengerSparkFixtureTest } it should "score a dataset with a trained model" in { - val scoresLocation = new File(testDir + "/score") - val scoringMetricsLocation = new File(testDir + "/score-metrics") + val scoresLocation = Paths.get(testDir.toString, "score").toFile.getCanonicalFile + val scoringMetricsLocation = Paths.get(testDir.toString, "score-metrics").toFile.getCanonicalFile val runConfig = testConfig.copy( runType = Score, @@ -151,8 +151,8 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec with PassengerSparkFixtureTest } it should "streaming score a dataset with a trained model" in { - val readLocation = new File(testDir + "/streaming-score-in") - val scoresLocation = new File(testDir + "/streaming-score-out") + val readLocation = Paths.get(testDir.toString, "streaming-score-in").toFile.getCanonicalFile + val scoresLocation = Paths.get(testDir.toString, "streaming-score-out").toFile.getCanonicalFile // Prepare streaming input data FileUtils.forceMkdir(readLocation) @@ -174,8 +174,8 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec with PassengerSparkFixtureTest } it should "evaluate a dataset with a trained model" in { - val scoresLocation = new File(testDir + "/eval-score") - val metricsLocation = new File(testDir + "/eval-metrics") + val scoresLocation = Paths.get(testDir.toString, "eval-score").toFile.getCanonicalFile + val metricsLocation = Paths.get(testDir.toString, "eval-metrics").toFile.getCanonicalFile val runConfig = testConfig.copy( runType = Evaluate, @@ -187,7 +187,7 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec with PassengerSparkFixtureTest } it should "compute features upto with a workflow" in { - lazy val featuresLocation = new File(testDir + "/features") + lazy val featuresLocation = Paths.get(testDir.toString, "features").toFile.getCanonicalFile val runConfig = testConfig.copy( runType = Features, diff --git a/readers/src/test/scala/com/salesforce/op/readers/DataGenerationTest.scala b/readers/src/test/scala/com/salesforce/op/readers/DataGenerationTest.scala index 0b152ff17b..a7a6065a96 100644 --- a/readers/src/test/scala/com/salesforce/op/readers/DataGenerationTest.scala +++ b/readers/src/test/scala/com/salesforce/op/readers/DataGenerationTest.scala @@ -94,7 +94,7 @@ class DataGenerationTest extends FlatSpec with PassengerSparkFixtureTest { Map("Female" -> "string"), Map("Female" -> 1.0), Map("Female" -> false)), Row("3", null, null, List("Male"), 186, 96, "this is a description", List(1471046600), Map("Male" -> "string"), Map("Male" -> 1.0), Map("Male" -> false)), - Row("4", false, 50, List("Male"), 363, 172, "this is a description stuff", List(1471046400, 1471046300), + Row("4", false, 50, List("Male"), 363, 172, "stuff this is a description", List(1471046300, 1471046400), Map("Male" -> "string string"), Map("Male" -> 2.0), Map("Male" -> false)), Row("5", null, 2, List("Female"), 0.0, 67, "", List(1471046100), Map("Female" -> "string"), Map("Female" -> 1.0), Map("Female" -> false)), @@ -142,8 +142,8 @@ class DataGenerationTest extends FlatSpec with PassengerSparkFixtureTest { Row("5", null, 2, List("Female"), 0.0, 67, "", List(1471046100), Map("Female" -> "string"), Map("Female" -> 1.0), Map("Female" -> false)), Row("6", true, null, null, 0.0, null, null, null, null, null, null), - Row("4", null, 50, List("Male"), 0.0, 248, "this is a description stuff stuff", - List(1471046400, 1471046400, 1471046300), Map("Male" -> "string string string"), + Row("4", null, 50, List("Male"), 0.0, 248, "stuff stuff this is a description", + List(1471046400, 1471046300, 1471046400), Map("Male" -> "string string string"), Map("Male" -> 3.0), Map("Male" -> false)) ) val passenger4 = dataSet.filter(_.get(0) == "4").head diff --git a/readers/src/test/scala/com/salesforce/op/readers/JoinedDataReaderDataGenerationTest.scala b/readers/src/test/scala/com/salesforce/op/readers/JoinedDataReaderDataGenerationTest.scala index 4d0c78ed03..91a43ffc3f 100644 --- a/readers/src/test/scala/com/salesforce/op/readers/JoinedDataReaderDataGenerationTest.scala +++ b/readers/src/test/scala/com/salesforce/op/readers/JoinedDataReaderDataGenerationTest.scala @@ -290,7 +290,7 @@ class JoinedDataReaderDataGenerationTest extends FlatSpec with PassengerSparkFix aggregatedData.collect(description) should contain theSameElementsAs Array(Text.empty, Text.empty, Text.empty, Text(""), - Text("this is a description stuff this is a description stuff this is a description stuff"), + Text("stuff this is a description stuff this is a description stuff this is a description"), Text("this is a description")) aggregatedData.collect(stringMap) should contain theSameElementsAs @@ -300,7 +300,7 @@ class JoinedDataReaderDataGenerationTest extends FlatSpec with PassengerSparkFix aggregatedData.collect(boarded) should contain theSameElementsAs Array(DateList.empty, DateList.empty, DateList(Array(1471046100L)), DateList(Array(1471046400L)), - DateList(Array(1471046400L, 1471046300L, 1471046400L, 1471046300L, 1471046400L, 1471046300L)), + DateList(Array(1471046300L, 1471046400L, 1471046300L, 1471046400L, 1471046300L, 1471046400L)), DateList(Array(1471046600L))) // height has a special integration window so this features tests that things included in other From 9e658c21d237c8eb099e45e66825d843384e5d76 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 24 Aug 2018 20:34:53 -0700 Subject: [PATCH 31/40] final fixes --- .../op/utils/avro/RichGenericRecordTest.scala | 7 ++----- .../op/utils/spark/OpSparkListenerTest.scala | 11 ++++------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/utils/src/test/scala/com/salesforce/op/utils/avro/RichGenericRecordTest.scala b/utils/src/test/scala/com/salesforce/op/utils/avro/RichGenericRecordTest.scala index a2ed6ea496..28396be373 100644 --- a/utils/src/test/scala/com/salesforce/op/utils/avro/RichGenericRecordTest.scala +++ b/utils/src/test/scala/com/salesforce/op/utils/avro/RichGenericRecordTest.scala @@ -39,16 +39,13 @@ import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class RichGenericRecordTest extends FlatSpec - with Matchers - with TestSparkContext - with TestCommon { +class RichGenericRecordTest extends FlatSpec with Matchers with TestSparkContext with TestCommon { import com.salesforce.op.utils.avro.RichGenericRecord._ val dataPath = resourceFile(parent = "../test-data", name = s"PassengerData.avro").getPath val passengerData = AvroInOut.read[GenericRecord](dataPath).getOrElse(throw new Exception("Couldn't read data")) - val firstRow = passengerData.first + val firstRow = passengerData.sortBy(_.get("passengerId").toString.toInt).first Spec[RichGenericRecord] should "get value of Int" in { val id = firstRow.getValue[Int]("passengerId") diff --git a/utils/src/test/scala/com/salesforce/op/utils/spark/OpSparkListenerTest.scala b/utils/src/test/scala/com/salesforce/op/utils/spark/OpSparkListenerTest.scala index ca67ffb646..90794b82d5 100644 --- a/utils/src/test/scala/com/salesforce/op/utils/spark/OpSparkListenerTest.scala +++ b/utils/src/test/scala/com/salesforce/op/utils/spark/OpSparkListenerTest.scala @@ -47,7 +47,7 @@ class OpSparkListenerTest extends FlatSpec with TableDrivenPropertyChecks with T sparkAppender.setName("spark-appender") sparkAppender.setThreshold(Level.INFO) sparkAppender.setLayout(new org.apache.log4j.PatternLayout) - LogManager.getLogger("com.salesforce.op.utils.spark.OpSparkListener").setLevel(Level.INFO) + LogManager.getLogger(classOf[OpSparkListener]).setLevel(Level.INFO) Logger.getRootLogger.addAppender(sparkAppender) sparkAppender } @@ -84,18 +84,15 @@ class OpSparkListenerTest extends FlatSpec with TableDrivenPropertyChecks with T it should "log messages for listener initialization, stage completion, app completion" in { val firstStage = listener.metrics.stageMetrics.head val logPrefix = listener.logPrefix - val logs = sparkLogAppender.logs + val logs = sparkLogAppender.logs.map(_.getMessage.toString) val messages = Table("Spark Log Messages", - "Instantiated spark listener: com.salesforce.op.utils.spark.OpSparkListener. Log Prefix %s".format(logPrefix), + "Instantiated spark listener: %s. Log Prefix %s".format(classOf[OpSparkListener].getName, logPrefix), "%s,APP_TIME_MS:%s".format(logPrefix, listener.metrics.appEndTime - listener.metrics.appStartTime), "%s,STAGE:%s,MEMORY_SPILLED_BYTES:%s,GC_TIME_MS:%s,STAGE_TIME_MS:%s".format( logPrefix, firstStage.name, firstStage.memoryBytesSpilled, firstStage.jvmGCTime, firstStage.executorRunTime ) ) - - forAll(messages) { m => - logs.map(x => x.getMessage.toString).contains(m) shouldBe true - } + forAll(messages) { m => logs.contains(m) shouldBe true } } } From 60565a5b7c87b8543511305325cac6dbaa49307d Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 24 Aug 2018 21:50:05 -0700 Subject: [PATCH 32/40] added enums --- .../impl/classification/BinaryClassificationModelSelector.scala | 1 + .../op/stages/impl/regression/RegressionModelSelector.scala | 1 + 2 files changed, 2 insertions(+) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelector.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelector.scala index 2545e951a6..9a509e968d 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelector.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelector.scala @@ -231,6 +231,7 @@ object BinaryClassificationModelsToTry extends Enum[BinaryClassificationModelsTo case object OpLinearSVC extends BinaryClassificationModelsToTry case object OpDecisionTreeClassifier extends BinaryClassificationModelsToTry case object OpNaiveBayes extends BinaryClassificationModelsToTry + case object OpXGBoostClassifier extends BinaryClassificationModelsToTry case class Custom(private val modeType: Class[_ <: EstimatorType]) extends BinaryClassificationModelsToTry { override val entryName: String = modeType.getSimpleName } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelector.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelector.scala index 1e3a9d3c4c..3b210320a2 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelector.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelector.scala @@ -220,6 +220,7 @@ object RegressionModelsToTry extends Enum[RegressionModelsToTry] { case object OpRandomForestRegressor extends RegressionModelsToTry case object OpGBTRegressor extends RegressionModelsToTry case object OpGeneralizedLinearRegression extends RegressionModelsToTry + case object OpXGBoostRegressor extends RegressionModelsToTry case class Custom(private val modeType: Class[_ <: EstimatorType]) extends RegressionModelsToTry { override val entryName: String = modeType.getSimpleName } From 7e4857cb3e3bca82143d5dbe7262c984371918f7 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 24 Aug 2018 21:55:35 -0700 Subject: [PATCH 33/40] replace spark.sparkContext with sc --- .../BinaryClassificationModelSelectorTest.scala | 4 ++-- .../MultiClassificationModelSelectorTest.scala | 6 +++--- .../op/stages/impl/selector/ModelSelectorTest.scala | 4 ++-- .../com/salesforce/op/utils/spark/RichVectorTest.scala | 2 +- .../com/salesforce/op/utils/io/avro/AvroInOutTest.scala | 2 +- .../scala/com/salesforce/op/utils/spark/RichRDDTest.scala | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala index 1106375360..beef7c5e9c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala @@ -66,12 +66,12 @@ class BinaryClassificationModelSelectorTest extends FlatSpec with TestSparkConte // Generate positive observations following a distribution ~ N((0.0, 0.0, 0.0), I_3) val positiveData = - normalVectorRDD(spark.sparkContext, bigCount, 3, seed = seed) + normalVectorRDD(sc, bigCount, 3, seed = seed) .map(v => 1.0 -> Vectors.dense(v.toArray)) // Generate negative observations following a distribution ~ N((10.0, 10.0, 10.0), I_3) val negativeData = - normalVectorRDD(spark.sparkContext, smallCount, 3, seed = seed) + normalVectorRDD(sc, smallCount, 3, seed = seed) .map(v => 0.0 -> Vectors.dense(v.toArray.map(_ + 10.0))) val stageNames = Array("label_prediction", "label_rawPrediction", "label_probability") diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/MultiClassificationModelSelectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/MultiClassificationModelSelectorTest.scala index 97b390da86..2d91850a67 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/MultiClassificationModelSelectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/MultiClassificationModelSelectorTest.scala @@ -66,17 +66,17 @@ class MultiClassificationModelSelectorTest extends FlatSpec with TestSparkContex // Generate observations of label 1 following a distribution ~ N((-100.0, -100.0, -100.0), I_3) val label0Data = - normalVectorRDD(spark.sparkContext, label0Count, 3, seed = seed) + normalVectorRDD(sc, label0Count, 3, seed = seed) .map(v => 0.0 -> Vectors.dense(v.toArray.map(_ - 100.0))) // Generate observations of label 0 following a distribution ~ N((0.0, 0.0, 0.0), I_3) val label1Data = - normalVectorRDD(spark.sparkContext, label1Count, 3, seed = seed) + normalVectorRDD(sc, label1Count, 3, seed = seed) .map(v => 1.0 -> Vectors.dense(v.toArray)) // Generate observations of label 2 following a distribution ~ N((100.0, 100.0, 100.0), I_3) val label2Data = - normalVectorRDD(spark.sparkContext, label2Count, 3, seed = seed) + normalVectorRDD(sc, label2Count, 3, seed = seed) .map(v => 2.0 -> Vectors.dense(v.toArray.map(_ + 100.0))) val stageNames = Array("label_prediction", "label_rawPrediction", "label_probability") diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/selector/ModelSelectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/selector/ModelSelectorTest.scala index 56c916ddcd..d5dcb7171f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/selector/ModelSelectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/selector/ModelSelectorTest.scala @@ -66,12 +66,12 @@ class ModelSelectorTest extends OpEstimatorSpec[Prediction, SelectedModel, Model // Generate positive observations following a distribution ~ N((0.0, 0.0, 0.0), I_3) val positiveData = - normalVectorRDD(spark.sparkContext, bigCount, 3, seed = seed) + normalVectorRDD(sc, bigCount, 3, seed = seed) .map(v => 1.0 -> Vectors.dense(v.toArray)) // Generate negative observations following a distribution ~ N((10.0, 10.0, 10.0), I_3) val negativeData = - normalVectorRDD(spark.sparkContext, smallCount, 3, seed = seed) + normalVectorRDD(sc, smallCount, 3, seed = seed) .map(v => 0.0 -> Vectors.dense(v.toArray.map(_ + 10.0))) val data = positiveData.union(negativeData).toDF("label", "features") diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala index 889a93cf3e..1528bba3ee 100644 --- a/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala +++ b/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala @@ -52,7 +52,7 @@ class RichVectorTest extends PropSpec with PropertyChecks with TestSparkContext import VectorGenerators._ import com.salesforce.op.utils.spark.RichVector._ - lazy val sparseVevtorsRDDGen = RDDGenerator.genRDD[Vector](spark.sparkContext)(sparseVectorGen) + lazy val sparseVevtorsRDDGen = RDDGenerator.genRDD[Vector](sc)(sparseVectorGen) property("Vectors should error on size mismatch") { forAll(sparseVectorGen) { sparse: SparseVector => diff --git a/utils/src/test/scala/com/salesforce/op/utils/io/avro/AvroInOutTest.scala b/utils/src/test/scala/com/salesforce/op/utils/io/avro/AvroInOutTest.scala index 601b5cabc8..b8ca7c42b1 100644 --- a/utils/src/test/scala/com/salesforce/op/utils/io/avro/AvroInOutTest.scala +++ b/utils/src/test/scala/com/salesforce/op/utils/io/avro/AvroInOutTest.scala @@ -47,7 +47,7 @@ class AvroInOutTest extends FlatSpec with TestSparkContext { val avroSchemaPath = s"$testDataDir/PassengerDataAll.avsc" val avroFilePath = s"$testDataDir/PassengerDataAll.avro" val avroFileRecordCount = 891 - val hdfs: FileSystem = FileSystem.get(spark.sparkContext.hadoopConfiguration) + val hdfs: FileSystem = FileSystem.get(sc.hadoopConfiguration) lazy val avroTemp: String = tempDir + "/avro-inout-test" Spec(AvroInOut.getClass) should "creates RDD from an avro file" in { diff --git a/utils/src/test/scala/com/salesforce/op/utils/spark/RichRDDTest.scala b/utils/src/test/scala/com/salesforce/op/utils/spark/RichRDDTest.scala index 0a35e7de0a..4d161ac7d8 100644 --- a/utils/src/test/scala/com/salesforce/op/utils/spark/RichRDDTest.scala +++ b/utils/src/test/scala/com/salesforce/op/utils/spark/RichRDDTest.scala @@ -48,7 +48,7 @@ import org.scalatest.prop.PropertyChecks class RichRDDTest extends PropSpec with PropertyChecks with TestSparkContext { import com.salesforce.op.utils.spark.RichRDD._ - val data = RDDGenerator.genRDD[(Int, Int)](spark.sparkContext)(Arbitrary.arbitrary[(Int, Int)]) + val data = RDDGenerator.genRDD[(Int, Int)](sc)(Arbitrary.arbitrary[(Int, Int)]) property("save as a text file") { forAll(data) { rdd => From 91d52a84e87de0d3ffc284489d9f298402673da5 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 31 Aug 2018 22:10:28 -0700 Subject: [PATCH 34/40] move version --- build.gradle | 1 + core/build.gradle | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 1a4313a53c..82f777d85d 100644 --- a/build.gradle +++ b/build.gradle @@ -91,6 +91,7 @@ configure(allProjs) { hadrianVersion = '0.8.5' aardpfarkVersion = '0.1.0-SNAPSHOT' xgboostVersion = '0.80' + akkaSlf4jVersion = '2.3.11' mainClassName = 'com.salesforce.Main' } diff --git a/core/build.gradle b/core/build.gradle index 959e4b43d9..35b3dc40d3 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -26,5 +26,5 @@ dependencies { // XGBoost compile "ml.dmlc:xgboost4j-spark:$xgboostVersion" // Akka slfj4 logging (version matches XGBoost dependency) - testCompile "com.typesafe.akka:akka-slf4j_$scalaVersion:2.3.11" + testCompile "com.typesafe.akka:akka-slf4j_$scalaVersion:$akkaSlf4jVersion" } From 76a6d6f5a775369cf3421021c5b6d68d6fd87ae2 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 14 Sep 2018 16:22:51 -0700 Subject: [PATCH 35/40] make it compile --- .../salesforce/op/evaluators/Evaluators.scala | 3 +-- .../op/evaluators/OpBinScoreEvaluator.scala | 22 ++++++++----------- .../evaluators/OpBinScoreEvaluatorTest.scala | 2 +- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/evaluators/Evaluators.scala b/core/src/main/scala/com/salesforce/op/evaluators/Evaluators.scala index a60e97b51d..bea9ee7e25 100644 --- a/core/src/main/scala/com/salesforce/op/evaluators/Evaluators.scala +++ b/core/src/main/scala/com/salesforce/op/evaluators/Evaluators.scala @@ -56,8 +56,7 @@ object Evaluators { * Brier Score for the prediction */ def brierScore(): OpBinScoreEvaluator = - new OpBinScoreEvaluator( - name = BinaryClassEvalMetrics.brierScore, isLargerBetter = true) + new OpBinScoreEvaluator(name = BinaryClassEvalMetrics.brierScore, isLargerBetter = true) /** * Area under ROC diff --git a/core/src/main/scala/com/salesforce/op/evaluators/OpBinScoreEvaluator.scala b/core/src/main/scala/com/salesforce/op/evaluators/OpBinScoreEvaluator.scala index 5be7948180..b0f3605da0 100644 --- a/core/src/main/scala/com/salesforce/op/evaluators/OpBinScoreEvaluator.scala +++ b/core/src/main/scala/com/salesforce/op/evaluators/OpBinScoreEvaluator.scala @@ -31,14 +31,13 @@ package com.salesforce.op.evaluators import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.salesforce.op.UID +import com.twitter.algebird.Operators._ +import com.twitter.algebird.Tuple4Semigroup import org.apache.spark.ml.linalg.Vector -import org.apache.spark.sql.{Dataset, Row} -import org.slf4j.LoggerFactory import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.DoubleType -import com.twitter.algebird.Operators._ -import com.twitter.algebird.Monoid._ -import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Dataset, Row} +import org.slf4j.LoggerFactory /** * @@ -86,16 +85,16 @@ private[op] class OpBinScoreEvaluator val (maxScore, minScore) = scoreAndLabels.map { case (score , _) => (score, score) }.fold(1.0, 0.0) { - case((maxVal, minVal), (scoreMax, scoreMin)) => { + case ((maxVal, minVal), (scoreMax, scoreMin)) => (math.max(maxVal, scoreMax), math.min(minVal, scoreMin)) - } } // Finding stats per bin -> avg score, avg conv rate, // total num of data points and overall brier score. + implicit val sg = new Tuple4Semigroup[Double, Double, Long, Double]() val stats = scoreAndLabels.map { case (score, label) => - (getBinIndex(score, minScore, maxScore), (score, label, 1L, math.pow((score - label), 2))) + (getBinIndex(score, minScore, maxScore), (score, label, 1L, math.pow(score - label, 2))) }.reduceByKey(_ + _).map { case (bin, (scoreSum, labelSum, count, squaredError)) => (bin, scoreSum / count, labelSum / count, count, squaredError) @@ -104,20 +103,17 @@ private[op] class OpBinScoreEvaluator val (averageScore, averageConversionRate, numberOfDataPoints, brierScoreSum, numberOfPoints) = stats.foldLeft((new Array[Double](numBins), new Array[Double](numBins), new Array[Long](numBins), 0.0, 0L)) { case ((score, convRate, dataPoints, brierScoreSum, totalPoints), - (binIndex, avgScore, avgConvRate, counts, squaredError)) => { - + (binIndex, avgScore, avgConvRate, counts, squaredError)) => score(binIndex) = avgScore convRate(binIndex) = avgConvRate dataPoints(binIndex) = counts - (score, convRate, dataPoints, brierScoreSum + squaredError, totalPoints + counts) - } } // binCenters is the center point in each bin. // e.g., for bins [(0.0 - 0.5), (0.5 - 1.0)], bin centers are [0.25, 0.75]. val diff = maxScore - minScore - val binCenters = (for {i <- 0 to numBins-1} yield (minScore + ((diff * i) / numBins) + (diff / (2 * numBins)))) + val binCenters = for {i <- 0 until numBins} yield minScore + ((diff * i) / numBins) + (diff / (2 * numBins)) val metrics = BinaryClassificationBinMetrics( brierScore = brierScoreSum / numberOfPoints, diff --git a/core/src/test/scala/com/salesforce/op/evaluators/OpBinScoreEvaluatorTest.scala b/core/src/test/scala/com/salesforce/op/evaluators/OpBinScoreEvaluatorTest.scala index acf8182bdd..3ea97eebee 100644 --- a/core/src/test/scala/com/salesforce/op/evaluators/OpBinScoreEvaluatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/evaluators/OpBinScoreEvaluatorTest.scala @@ -71,7 +71,7 @@ class OpBinScoreEvaluatorTest extends FlatSpec with TestSparkContext { Spec[OpBinScoreEvaluator] should "return the bin metrics" in { val metrics = new OpBinScoreEvaluator(numBins = 4) - .setLabelCol(label.name).setPredictionCol(prediction.name).evaluateAll(dataset) + .setLabelCol(label.name).setPredictionCol(prediction.name).evaluateAll(dataset) metrics shouldBe BinaryClassificationBinMetrics( 0.09800605366, From 031a37dbbe1d514bb9754f4d8edbaa1984d4a951 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 14 Sep 2018 16:37:18 -0700 Subject: [PATCH 36/40] cleanup --- .../op/stages/impl/classification/OpXGBoostClassifier.scala | 1 - .../scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala index 6bc78daff2..03c83431c0 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -177,7 +177,6 @@ class OpXGBoostClassificationModel override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { val data = removeMissingValues(Iterator(features.value.asXGB), model.getMissing) val dm = new DMatrix(dataIter = data) - // TODO: can we avoid two booster.predict calls here? val rawPred = booster.predict(dm, outPutMargin = true, treeLimit = treeLimit)(0).map(_.toDouble) val prob = booster.predict(dm, outPutMargin = false, treeLimit = treeLimit)(0).map(_.toDouble) val probability = if (model.numClasses == 2) Array(1.0 - prob(0), prob(0)) else prob diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala index 6ba50006e9..ddf79b6dc8 100644 --- a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -64,10 +64,8 @@ case object OpXGBoost { * for prediction. */ def asXGB: LabeledPoint = v match { - case v: DenseVector => - LabeledPoint(0.0f, null, v.values.map(_.toFloat)) - case v: SparseVector => - LabeledPoint(0.0f, v.indices, v.values.map(_.toFloat)) + case v: DenseVector => LabeledPoint(0.0f, null, v.values.map(_.toFloat)) + case v: SparseVector => LabeledPoint(0.0f, v.indices, v.values.map(_.toFloat)) } } From 5ec1a322d47db5ce7471b642052634ddd32260b8 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Thu, 27 Sep 2018 23:19:04 -0700 Subject: [PATCH 37/40] spark 2.3.2 --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 09423fcb65..2d3a348a17 100644 --- a/build.gradle +++ b/build.gradle @@ -60,7 +60,7 @@ configure(allProjs) { scalaCheckVersion = '1.14.0' junitVersion = '4.11' avroVersion = '1.7.7' - sparkVersion = '2.3.1' + sparkVersion = '2.3.2' sparkAvroVersion = '4.0.0' scalaGraphVersion = '1.12.5' scalafmtVersion = '1.5.1' From 9d49d21f876cf5eff73b18aaa04ea7b94a268b9f Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 12 Oct 2018 13:52:42 -0700 Subject: [PATCH 38/40] added docs --- .../classification/OpXGBoostClassifier.scala | 181 ++++++++++++++++++ .../impl/regression/OpXGBoostRegressor.scala | 181 ++++++++++++++++++ 2 files changed, 362 insertions(+) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala index 03c83431c0..66d02274b8 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -55,88 +55,269 @@ class OpXGBoostClassifier(uid: String = UID[OpXGBoostClassifier]) CheckIsResponseValues(in1, in2) } + /** + * Weight column name. If this is not set or empty, we treat all instance weights as 1.0. + */ def setWeightCol(value: String): this.type = set(weightCol, value) + /** + * Initial prediction (aka base margin) column name. + */ def setBaseMarginCol(value: String): this.type = set(baseMarginCol, value) + /** + * Number of classes + */ def setNumClass(value: Int): this.type = set(numClass, value) // setters for general params + + /** + * Rabit tracker configurations. The parameter must be provided as an instance of the + * [[TrackerConf]] class, which has the following definition: + * + * case class TrackerConf(workerConnectionTimeout: Duration, trainingTimeout: Duration, trackerImpl: String) + * + * See below for detailed explanations. + * + * - trackerImpl: Select the implementation of Rabit tracker. + * default: "python" + * + * Choice between "python" or "scala". The former utilizes the Java wrapper of the + * Python Rabit tracker (in dmlc_core), and does not support timeout settings. + * The "scala" version removes Python components, and fully supports timeout settings. + * + * - workerConnectionTimeout: the maximum wait time for all workers to connect to the tracker. + * default: 0 millisecond (no timeout) + * + * The timeout value should take the time of data loading and pre-processing into account, + * due to the lazy execution of Spark's operations. Alternatively, you may force Spark to + * perform data transformation before calling XGBoost.train(), so that this timeout truly + * reflects the connection delay. Set a reasonable timeout value to prevent model + * training/testing from hanging indefinitely, possible due to network issues. + * Note that zero timeout value means to wait indefinitely (equivalent to Duration.Inf). + * Ignored if the tracker implementation is "python". + */ def setTrackerConf(value: TrackerConf): this.type = set(trackerConf, value) + /** + * The number of rounds for boosting + */ def setNumRound(value: Int): this.type = set(numRound, value) + /** + * Number of workers used to train xgboost model. default: 1 + */ def setNumWorkers(value: Int): this.type = set(numWorkers, value) + /** + * Number of threads used by per worker. default 1 + */ def setNthread(value: Int): this.type = set(nthread, value) + /** + * Whether to use external memory as cache. default: false + */ def setUseExternalMemory(value: Boolean): this.type = set(useExternalMemory, value) + /** + * 0 means printing running messages, 1 means silent mode. default: 0 + */ def setSilent(value: Int): this.type = set(silent, value) + /** + * The value treated as missing + */ def setMissing(value: Float): this.type = set(missing, value) + /** + * The maximum time to wait for the job requesting new workers. default: 30 minutes + */ def setTimeoutRequestWorkers(value: Long): this.type = set(timeoutRequestWorkers, value) + /** + * The hdfs folder to load and save checkpoint boosters. default: `empty_string` + */ def setCheckpointPath(value: String): this.type = set(checkpointPath, value) + /** + * Checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that + * the trained model will get checkpointed every 10 iterations. Note: `checkpoint_path` must + * also be set if the checkpoint interval is greater than 0. + */ def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + /** + * Random seed for the C++ part of XGBoost and train/test splitting. + */ def setSeed(value: Long): this.type = set(seed, value) + /** + * Step size shrinkage used in update to prevents overfitting. After each boosting step, we + * can directly get the weights of new features and eta actually shrinks the feature weights + * to make the boosting process more conservative. [default=0.3] range: [0,1] + */ def setEta(value: Double): this.type = set(eta, value) + /** + * Minimum loss reduction required to make a further partition on a leaf node of the tree. + * the larger, the more conservative the algorithm will be. [default=0] range: [0, + * Double.MaxValue] + */ def setGamma(value: Double): this.type = set(gamma, value) + /** + * Maximum depth of a tree, increase this value will make model more complex / likely to be + * overfitting. [default=6] range: [1, Int.MaxValue] + */ def setMaxDepth(value: Int): this.type = set(maxDepth, value) + /** + * Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results + * in a leaf node with the sum of instance weight less than min_child_weight, then the building + * process will give up further partitioning. In linear regression mode, this simply corresponds + * to minimum number of instances needed to be in each node. The larger, the more conservative + * the algorithm will be. [default=1] range: [0, Double.MaxValue] + */ def setMinChildWeight(value: Double): this.type = set(minChildWeight, value) + /** + * Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it + * means there is no constraint. If it is set to a positive value, it can help making the update + * step more conservative. Usually this parameter is not needed, but it might help in logistic + * regression when class is extremely imbalanced. Set it to value of 1-10 might help control the + * update. [default=0] range: [0, Double.MaxValue] + */ def setMaxDeltaStep(value: Double): this.type = set(maxDeltaStep, value) + /** + * Subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly + * collected half of the data instances to grow trees and this will prevent overfitting. + * [default=1] range:(0,1] + */ def setSubsample(value: Double): this.type = set(subsample, value) + /** + * Subsample ratio of columns when constructing each tree. [default=1] range: (0,1] + */ def setColsampleBytree(value: Double): this.type = set(colsampleBytree, value) + /** + * Subsample ratio of columns for each split, in each level. [default=1] range: (0,1] + */ def setColsampleBylevel(value: Double): this.type = set(colsampleBylevel, value) + /** + * L2 regularization term on weights, increase this value will make model more conservative. + * [default=1] + */ def setLambda(value: Double): this.type = set(lambda, value) + /** + * L1 regularization term on weights, increase this value will make model more conservative. + * [default=0] + */ def setAlpha(value: Double): this.type = set(alpha, value) + /** + * The tree construction algorithm used in XGBoost. options: {'auto', 'exact', 'approx'} + * [default='auto'] + */ def setTreeMethod(value: String): this.type = set(treeMethod, value) + /** + * Growth policy for fast histogram algorithm + */ def setGrowPolicy(value: String): this.type = set(growPolicy, value) + /** + * Maximum number of bins in histogram + */ def setMaxBins(value: Int): this.type = set(maxBins, value) + /** + * This is only used for approximate greedy algorithm. + * This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select + * number of bins, this comes with theoretical guarantee with sketch accuracy. + * [default=0.03] range: (0, 1) + */ def setSketchEps(value: Double): this.type = set(sketchEps, value) + /** + * Control the balance of positive and negative weights, useful for unbalanced classes. A typical + * value to consider: sum(negative cases) / sum(positive cases). [default=1] + */ def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value) + /** + * Parameter for Dart booster. + * Type of sampling algorithm. "uniform": dropped trees are selected uniformly. + * "weighted": dropped trees are selected in proportion to weight. [default="uniform"] + */ def setSampleType(value: String): this.type = set(sampleType, value) + /** + * Parameter of Dart booster. + * type of normalization algorithm, options: {'tree', 'forest'}. [default="tree"] + */ def setNormalizeType(value: String): this.type = set(normalizeType, value) + /** + * Parameter of Dart booster. + * dropout rate. [default=0.0] range: [0.0, 1.0] + */ def setRateDrop(value: Double): this.type = set(rateDrop, value) + /** + * Parameter of Dart booster. + * probability of skip dropout. If a dropout is skipped, new trees are added in the same manner + * as gbtree. [default=0.0] range: [0.0, 1.0] + */ def setSkipDrop(value: Double): this.type = set(skipDrop, value) + /** + * Parameter of linear booster + * L2 regularization term on bias, default 0(no L1 reg on bias because it is not important) + */ def setLambdaBias(value: Double): this.type = set(lambdaBias, value) // setters for learning params def setObjective(value: String): this.type = set(objective, value) + /** + * Specify the learning task and the corresponding learning objective. + * options: reg:linear, reg:logistic, binary:logistic, binary:logitraw, count:poisson, + * multi:softmax, multi:softprob, rank:pairwise, reg:gamma. default: reg:linear + */ def setBaseScore(value: Double): this.type = set(baseScore, value) + /** + * Evaluation metrics for validation data, a default metric will be assigned according to + * objective(rmse for regression, and error for classification, mean average precision for + * ranking). options: rmse, mae, logloss, error, merror, mlogloss, auc, aucpr, ndcg, map, + * gamma-deviance + */ def setEvalMetric(value: String): this.type = set(evalMetric, value) + /** + * Fraction of training points to use for testing. + */ def setTrainTestRatio(value: Double): this.type = set(trainTestRatio, value) + /** + * If non-zero, the training will be stopped after a specified number + * of consecutive increases in any evaluation metric. + */ def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value) + /** + * Customized objective function provided by user. default: null + */ def setCustomObj(value: ObjectiveTrait): this.type = set(customObj, value) + /** + * Customized evaluation function provided by user. default: null + */ def setCustomEval(value: EvalTrait): this.type = set(customEval, value) } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala index 617e84ae44..688f34f812 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpXGBoostRegressor.scala @@ -54,88 +54,269 @@ class OpXGBoostRegressor(uid: String = UID[OpXGBoostRegressor]) CheckIsResponseValues(in1, in2) } + /** + * Weight column name. If this is not set or empty, we treat all instance weights as 1.0. + */ def setWeightCol(value: String): this.type = set(weightCol, value) + /** + * Initial prediction (aka base margin) column name. + */ def setBaseMarginCol(value: String): this.type = set(baseMarginCol, value) + /** + * Group column name + */ def setGroupCol(value: String): this.type = set(groupCol, value) // setters for general params + + /** + * Rabit tracker configurations. The parameter must be provided as an instance of the + * [[TrackerConf]] class, which has the following definition: + * + * case class TrackerConf(workerConnectionTimeout: Duration, trainingTimeout: Duration, trackerImpl: String) + * + * See below for detailed explanations. + * + * - trackerImpl: Select the implementation of Rabit tracker. + * default: "python" + * + * Choice between "python" or "scala". The former utilizes the Java wrapper of the + * Python Rabit tracker (in dmlc_core), and does not support timeout settings. + * The "scala" version removes Python components, and fully supports timeout settings. + * + * - workerConnectionTimeout: the maximum wait time for all workers to connect to the tracker. + * default: 0 millisecond (no timeout) + * + * The timeout value should take the time of data loading and pre-processing into account, + * due to the lazy execution of Spark's operations. Alternatively, you may force Spark to + * perform data transformation before calling XGBoost.train(), so that this timeout truly + * reflects the connection delay. Set a reasonable timeout value to prevent model + * training/testing from hanging indefinitely, possible due to network issues. + * Note that zero timeout value means to wait indefinitely (equivalent to Duration.Inf). + * Ignored if the tracker implementation is "python". + */ def setTrackerConf(value: TrackerConf): this.type = set(trackerConf, value) + /** + * The number of rounds for boosting + */ def setNumRound(value: Int): this.type = set(numRound, value) + /** + * Number of workers used to train xgboost model. default: 1 + */ def setNumWorkers(value: Int): this.type = set(numWorkers, value) + /** + * Number of threads used by per worker. default 1 + */ def setNthread(value: Int): this.type = set(nthread, value) + /** + * Whether to use external memory as cache. default: false + */ def setUseExternalMemory(value: Boolean): this.type = set(useExternalMemory, value) + /** + * 0 means printing running messages, 1 means silent mode. default: 0 + */ def setSilent(value: Int): this.type = set(silent, value) + /** + * The value treated as missing + */ def setMissing(value: Float): this.type = set(missing, value) + /** + * The maximum time to wait for the job requesting new workers. default: 30 minutes + */ def setTimeoutRequestWorkers(value: Long): this.type = set(timeoutRequestWorkers, value) + /** + * The hdfs folder to load and save checkpoint boosters. default: `empty_string` + */ def setCheckpointPath(value: String): this.type = set(checkpointPath, value) + /** + * Checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that + * the trained model will get checkpointed every 10 iterations. Note: `checkpoint_path` must + * also be set if the checkpoint interval is greater than 0. + */ def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + /** + * Random seed for the C++ part of XGBoost and train/test splitting. + */ def setSeed(value: Long): this.type = set(seed, value) + /** + * Step size shrinkage used in update to prevents overfitting. After each boosting step, we + * can directly get the weights of new features and eta actually shrinks the feature weights + * to make the boosting process more conservative. [default=0.3] range: [0,1] + */ def setEta(value: Double): this.type = set(eta, value) + /** + * Minimum loss reduction required to make a further partition on a leaf node of the tree. + * the larger, the more conservative the algorithm will be. [default=0] range: [0, + * Double.MaxValue] + */ def setGamma(value: Double): this.type = set(gamma, value) + /** + * Maximum depth of a tree, increase this value will make model more complex / likely to be + * overfitting. [default=6] range: [1, Int.MaxValue] + */ def setMaxDepth(value: Int): this.type = set(maxDepth, value) + /** + * Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results + * in a leaf node with the sum of instance weight less than min_child_weight, then the building + * process will give up further partitioning. In linear regression mode, this simply corresponds + * to minimum number of instances needed to be in each node. The larger, the more conservative + * the algorithm will be. [default=1] range: [0, Double.MaxValue] + */ def setMinChildWeight(value: Double): this.type = set(minChildWeight, value) + /** + * Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it + * means there is no constraint. If it is set to a positive value, it can help making the update + * step more conservative. Usually this parameter is not needed, but it might help in logistic + * regression when class is extremely imbalanced. Set it to value of 1-10 might help control the + * update. [default=0] range: [0, Double.MaxValue] + */ def setMaxDeltaStep(value: Double): this.type = set(maxDeltaStep, value) + /** + * Subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly + * collected half of the data instances to grow trees and this will prevent overfitting. + * [default=1] range:(0,1] + */ def setSubsample(value: Double): this.type = set(subsample, value) + /** + * Subsample ratio of columns when constructing each tree. [default=1] range: (0,1] + */ def setColsampleBytree(value: Double): this.type = set(colsampleBytree, value) + /** + * Subsample ratio of columns for each split, in each level. [default=1] range: (0,1] + */ def setColsampleBylevel(value: Double): this.type = set(colsampleBylevel, value) + /** + * L2 regularization term on weights, increase this value will make model more conservative. + * [default=1] + */ def setLambda(value: Double): this.type = set(lambda, value) + /** + * L1 regularization term on weights, increase this value will make model more conservative. + * [default=0] + */ def setAlpha(value: Double): this.type = set(alpha, value) + /** + * The tree construction algorithm used in XGBoost. options: {'auto', 'exact', 'approx'} + * [default='auto'] + */ def setTreeMethod(value: String): this.type = set(treeMethod, value) + /** + * Growth policy for fast histogram algorithm + */ def setGrowPolicy(value: String): this.type = set(growPolicy, value) + /** + * Maximum number of bins in histogram + */ def setMaxBins(value: Int): this.type = set(maxBins, value) + /** + * This is only used for approximate greedy algorithm. + * This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select + * number of bins, this comes with theoretical guarantee with sketch accuracy. + * [default=0.03] range: (0, 1) + */ def setSketchEps(value: Double): this.type = set(sketchEps, value) + /** + * Control the balance of positive and negative weights, useful for unbalanced classes. A typical + * value to consider: sum(negative cases) / sum(positive cases). [default=1] + */ def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value) + /** + * Parameter for Dart booster. + * Type of sampling algorithm. "uniform": dropped trees are selected uniformly. + * "weighted": dropped trees are selected in proportion to weight. [default="uniform"] + */ def setSampleType(value: String): this.type = set(sampleType, value) + /** + * Parameter of Dart booster. + * type of normalization algorithm, options: {'tree', 'forest'}. [default="tree"] + */ def setNormalizeType(value: String): this.type = set(normalizeType, value) + /** + * Parameter of Dart booster. + * dropout rate. [default=0.0] range: [0.0, 1.0] + */ def setRateDrop(value: Double): this.type = set(rateDrop, value) + /** + * Parameter of Dart booster. + * probability of skip dropout. If a dropout is skipped, new trees are added in the same manner + * as gbtree. [default=0.0] range: [0.0, 1.0] + */ def setSkipDrop(value: Double): this.type = set(skipDrop, value) + /** + * Parameter of linear booster + * L2 regularization term on bias, default 0(no L1 reg on bias because it is not important) + */ def setLambdaBias(value: Double): this.type = set(lambdaBias, value) // setters for learning params def setObjective(value: String): this.type = set(objective, value) + /** + * Specify the learning task and the corresponding learning objective. + * options: reg:linear, reg:logistic, binary:logistic, binary:logitraw, count:poisson, + * multi:softmax, multi:softprob, rank:pairwise, reg:gamma. default: reg:linear + */ def setBaseScore(value: Double): this.type = set(baseScore, value) + /** + * Evaluation metrics for validation data, a default metric will be assigned according to + * objective(rmse for regression, and error for classification, mean average precision for + * ranking). options: rmse, mae, logloss, error, merror, mlogloss, auc, aucpr, ndcg, map, + * gamma-deviance + */ def setEvalMetric(value: String): this.type = set(evalMetric, value) + /** + * Fraction of training points to use for testing. + */ def setTrainTestRatio(value: Double): this.type = set(trainTestRatio, value) + /** + * If non-zero, the training will be stopped after a specified number + * of consecutive increases in any evaluation metric. + */ def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value) + /** + * Customized objective function provided by user. default: null + */ def setCustomObj(value: ObjectiveTrait): this.type = set(customObj, value) + /** + * Customized evaluation function provided by user. default: null + */ def setCustomEval(value: EvalTrait): this.type = set(customEval, value) } From 55e7f279325451e327a0016d0706541e088cbe05 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 12 Oct 2018 14:47:05 -0700 Subject: [PATCH 39/40] remove enums for now --- .../impl/classification/BinaryClassificationModelSelector.scala | 1 - .../op/stages/impl/regression/RegressionModelSelector.scala | 1 - 2 files changed, 2 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelector.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelector.scala index 4507fb4703..4986b3d5b8 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelector.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelector.scala @@ -212,7 +212,6 @@ object BinaryClassificationModelsToTry extends Enum[BinaryClassificationModelsTo case object OpLinearSVC extends BinaryClassificationModelsToTry case object OpDecisionTreeClassifier extends BinaryClassificationModelsToTry case object OpNaiveBayes extends BinaryClassificationModelsToTry - case object OpXGBoostClassifier extends BinaryClassificationModelsToTry case class Custom(private val modeType: Class[_ <: EstimatorType]) extends BinaryClassificationModelsToTry { override val entryName: String = modeType.getSimpleName } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelector.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelector.scala index 33c42d3bae..41a55a43ab 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelector.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/RegressionModelSelector.scala @@ -199,7 +199,6 @@ object RegressionModelsToTry extends Enum[RegressionModelsToTry] { case object OpRandomForestRegressor extends RegressionModelsToTry case object OpGBTRegressor extends RegressionModelsToTry case object OpGeneralizedLinearRegression extends RegressionModelsToTry - case object OpXGBoostRegressor extends RegressionModelsToTry case class Custom(private val modeType: Class[_ <: EstimatorType]) extends RegressionModelsToTry { override val entryName: String = modeType.getSimpleName } From 5a9862188daca821282b24e9737cd21b2be9448c Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 12 Oct 2018 18:24:18 -0700 Subject: [PATCH 40/40] Addressed comments + docs --- .../impl/classification/OpXGBoostClassifier.scala | 13 ++++++++++--- .../dmlc/xgboost4j/scala/spark/XGBoostParams.scala | 14 +++++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala index 66d02274b8..bdfc3d5d41 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpXGBoostClassifier.scala @@ -346,17 +346,24 @@ class OpXGBoostClassificationModel ) { import OpXGBoost._ - protected def predictRawMirror: MethodMirror = throw new NotImplementedError() - protected def raw2probabilityMirror: MethodMirror = throw new NotImplementedError() + protected def predictRawMirror: MethodMirror = + throw new NotImplementedError( + "XGBoost-Spark does not support 'predictRaw'. This might change in upcoming releases.") + + protected def raw2probabilityMirror: MethodMirror = + throw new NotImplementedError( + "XGBoost-Spark does not support 'raw2probability'. This might change in upcoming releases.") + @transient lazy val probability2predictionMirror = reflectMethod(getSparkMlStage().get, "probability2prediction") private lazy val model = getSparkMlStage().get private lazy val booster = model.nativeBooster private lazy val treeLimit = model.getTreeLimit.toInt + private lazy val missing = model.getMissing override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { - val data = removeMissingValues(Iterator(features.value.asXGB), model.getMissing) + val data = removeMissingValues(Iterator(features.value.asXGB), missing) val dm = new DMatrix(dataIter = data) val rawPred = booster.predict(dm, outPutMargin = true, treeLimit = treeLimit)(0).map(_.toDouble) val prob = booster.predict(dm, outPutMargin = false, treeLimit = treeLimit)(0).map(_.toDouble) diff --git a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala index ddf79b6dc8..06cd17a463 100644 --- a/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala +++ b/core/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostParams.scala @@ -37,19 +37,31 @@ import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import scala.collection.mutable.ArrayBuffer - +/** + * Hack to access [[XGBoostClassifierParams]] + */ trait OpXGBoostClassifierParams extends XGBoostClassifierParams with OpXGBoostGeneralParamsDefaults +/** + * Hack to access [[XGBoostRegressorParams]] + */ trait OpXGBoostRegressorParams extends XGBoostRegressorParams with OpXGBoostGeneralParamsDefaults +/** + * XGBoost [[GeneralParams]] defaults + */ trait OpXGBoostGeneralParamsDefaults { self: GeneralParams => setDefault(trackerConf -> OpXGBoost.DefaultTrackerConf) } +/** + * Helper trait to hush XGBoost annoying logging + */ trait OpXGBoostQuietLogging { Logger.getLogger("akka").setLevel(Level.WARN) Logger.getLogger("XGBoostSpark").setLevel(Level.WARN) + Logger.getLogger(classOf[XGBoostClassifier]).setLevel(Level.WARN) Logger.getLogger(classOf[XGBoostRegressor]).setLevel(Level.WARN) }