From df6247860f7ee55c07443935f58fc1e1d745c9d1 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sat, 7 Mar 2020 11:42:05 -0600 Subject: [PATCH] [SPARK-31012][ML][PYSPARK][DOCS] Updating ML API docs for 3.0 changes ### What changes were proposed in this pull request? Updating ML docs for 3.0 changes ### Why are the changes needed? While auditing the 3.0 ML changes, I found that some docs are missing or out of date; this patch updates them. ### Does this PR introduce any user-facing change? Yes, doc changes ### How was this patch tested? Manually built and checked the docs. Closes #27762 from huaxingao/spark-doc. Authored-by: Huaxin Gao Signed-off-by: Sean Owen --- docs/ml-features.md | 2 +- .../BinaryClassificationEvaluator.scala | 3 ++- .../MulticlassClassificationEvaluator.scala | 21 +++++++++++++++++++ .../ml/evaluation/RankingEvaluator.scala | 5 +++++ .../ml/evaluation/RegressionEvaluator.scala | 3 ++- python/pyspark/ml/evaluation.py | 9 ++++---- python/pyspark/ml/feature.py | 2 +- 7 files changed, 37 insertions(+), 8 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 9c05fd5fa1ce2..05ef848aefef8 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1499,7 +1499,7 @@ for more details on the API. The `Imputer` estimator completes missing values in a dataset, either using the mean or the median of the columns in which the missing values are located. The input columns should be of -`DoubleType` or `FloatType`. Currently `Imputer` does not support categorical features and possibly +numeric type. Currently `Imputer` does not support categorical features and possibly creates incorrect values for columns containing categorical features. Imputer can impute custom values other than 'NaN' by `.setMissingValue(custom_value)`. For example, `.setMissingValue(0)` will impute all occurrences of (0). 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index ad3f87c398d1a..82b8e14f010af 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -28,7 +28,8 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType /** - * Evaluator for binary classification, which expects two input columns: rawPrediction and label. + * Evaluator for binary classification, which expects input columns rawPrediction, label and + * an optional weight column. * The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label 1) * or of type vector (length-2 vector of raw predictions, scores, or label probabilities). */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala index 435708186242f..1d6540e970383 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala @@ -81,6 +81,14 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid @Since("3.0.0") def setProbabilityCol(value: String): this.type = set(probabilityCol, value) + /** + * The class whose metric will be computed in `"truePositiveRateByLabel"`, + * `"falsePositiveRateByLabel"`, `"precisionByLabel"`, `"recallByLabel"`, + * `"fMeasureByLabel"`. + * Must be greater than or equal to 0. The default value is 0. 
+ * + * @group param + */ @Since("3.0.0") final val metricLabel: DoubleParam = new DoubleParam(this, "metricLabel", "The class whose metric will be computed in " + @@ -98,6 +106,13 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid setDefault(metricLabel -> 0.0) + /** + * The beta value, which controls precision vs recall weighting, + * used in `"weightedFMeasure"`, `"fMeasureByLabel"`. + * Must be greater than 0. The default value is 1. + * + * @group param + */ @Since("3.0.0") final val beta: DoubleParam = new DoubleParam(this, "beta", "The beta value, which controls precision vs recall weighting, " + @@ -114,6 +129,12 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid setDefault(beta -> 1.0) + /** + * param for eps. log-loss is undefined for p=0 or p=1, so probabilities are clipped to + * max(eps, min(1 - eps, p)). Must be in range (0, 0.5). The default value is 1e-15. + * + * @group param + */ @Since("3.0.0") final val eps: DoubleParam = new DoubleParam(this, "eps", "log-loss is undefined for p=0 or p=1, so probabilities are clipped to " + diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala index ca3a8ebc1659d..8d017eb181f15 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala @@ -59,6 +59,11 @@ class RankingEvaluator (override val uid: String) setDefault(metricName -> "meanAveragePrecision") + /** + * param for ranking position value used in `"meanAveragePrecisionAtK"`, `"precisionAtK"`, + * `"ndcgAtK"`, `"recallAtK"`. Must be > 0. The default value is 10. 
+ * @group param + */ final val k = new IntParam(this, "k", "The ranking position value used in " + s"${supportedMetricNames.filter(_.endsWith("AtK")).mkString("(", "|", ")")} " + diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index 9f32d40d166bd..18a8dda0c76ef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -27,7 +27,8 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} /** - * Evaluator for regression, which expects two input columns: prediction and label. + * Evaluator for regression, which expects input columns prediction, label and + * an optional weight column. */ @Since("1.4.0") final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val uid: String) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 556a2f85c708d..265f02c1a03ac 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -110,7 +110,8 @@ def isLargerBetter(self): class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, HasWeightCol, JavaMLReadable, JavaMLWritable): """ - Evaluator for binary classification, which expects two input columns: rawPrediction and label. + Evaluator for binary classification, which expects input columns rawPrediction, label + and an optional weight column. The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label 1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities). 
@@ -409,9 +410,9 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio """ metricName = Param(Params._dummy(), "metricName", "metric name in evaluation " - "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate|" - "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel|" - "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel|" + "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| " + "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| " + "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| " "logLoss|hammingLoss)", typeConverter=TypeConverters.toString) metricLabel = Param(Params._dummy(), "metricLabel", diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 4c25bb495fdfb..6df2f74bcfc9d 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1467,7 +1467,7 @@ class Imputer(JavaEstimator, _ImputerParams, JavaMLReadable, JavaMLWritable): """ Imputation estimator for completing missing values, either using the mean or the median of the columns in which the missing values are located. The input columns should be of - DoubleType or FloatType. Currently Imputer does not support categorical features and + numeric type. Currently Imputer does not support categorical features and possibly creates incorrect values for a categorical feature. Note that the mean/median value is computed after filtering out missing values.