[SPARK-31012][ML][PYSPARK][DOCS] Updating ML API docs for 3.0 changes

### What changes were proposed in this pull request? Updating ML docs for 3.0 changes. ### Why are the changes needed? While auditing the 3.0 ML changes, I found that some docs are missing or out of date; these need to be updated. ### Does this PR introduce any user-facing change? Yes, doc changes. ### How was this patch tested? Manually built and checked. Closes apache#27762 from huaxingao/spark-doc. Authored-by: Huaxin Gao <huaxing@us.ibm.com> Signed-off-by: Sean Owen <srowen@gmail.com>
sjincho · Apr 14, 2020 · df62478 · df62478
1 parent 8afcb40
commit df62478
Show file tree

Hide file tree

Showing 7 changed files with 37 additions and 8 deletions.
diff --git a/docs/ml-features.md b/docs/ml-features.md
@@ -1499,7 +1499,7 @@ for more details on the API.
 
 The `Imputer` estimator completes missing values in a dataset, either using the mean or the 
 median of the columns in which the missing values are located. The input columns should be of
-`DoubleType` or `FloatType`. Currently `Imputer` does not support categorical features and possibly
+numeric type. Currently `Imputer` does not support categorical features and possibly
 creates incorrect values for columns containing categorical features. Imputer can impute custom values 
 other than 'NaN' by `.setMissingValue(custom_value)`. For example, `.setMissingValue(0)` will impute 
 all occurrences of (0).

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -28,7 +28,8 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.DoubleType
 
 /**
- * Evaluator for binary classification, which expects two input columns: rawPrediction and label.
+ * Evaluator for binary classification, which expects input columns rawPrediction, label and
+ * an optional weight column.
  * The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label 1)
  * or of type vector (length-2 vector of raw predictions, scores, or label probabilities).
  */

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
@@ -81,6 +81,14 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
   @Since("3.0.0")
   def setProbabilityCol(value: String): this.type = set(probabilityCol, value)
 
+  /**
+   * The class whose metric will be computed in `"truePositiveRateByLabel"`,
+   * `"falsePositiveRateByLabel"`, `"precisionByLabel"`, `"recallByLabel"`,
+   * `"fMeasureByLabel"`.
+   * Must be greater than or equal to 0. The default value is 0.
+   *
+   * @group param
+   */
   @Since("3.0.0")
   final val metricLabel: DoubleParam = new DoubleParam(this, "metricLabel",
     "The class whose metric will be computed in " +
@@ -98,6 +106,13 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
 
   setDefault(metricLabel -> 0.0)
 
+  /**
+   * The beta value, which controls precision vs recall weighting,
+   * used in `"weightedFMeasure"`, `"fMeasureByLabel"`.
+   * Must be greater than 0. The default value is 1.
+   *
+   * @group param
+   */
   @Since("3.0.0")
   final val beta: DoubleParam = new DoubleParam(this, "beta",
     "The beta value, which controls precision vs recall weighting, " +
@@ -114,6 +129,12 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
 
   setDefault(beta -> 1.0)
 
+  /**
+   * param for eps. log-loss is undefined for p=0 or p=1, so probabilities are clipped to
+   * max(eps, min(1 - eps, p)). Must be in range (0, 0.5). The default value is 1e-15.
+   *
+   * @group param
+   */
   @Since("3.0.0")
   final val eps: DoubleParam = new DoubleParam(this, "eps",
     "log-loss is undefined for p=0 or p=1, so probabilities are clipped to " +

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala
@@ -59,6 +59,11 @@ class RankingEvaluator (override val uid: String)
 
   setDefault(metricName -> "meanAveragePrecision")
 
+  /**
+   * param for ranking position value used in `"meanAveragePrecisionAtK"`, `"precisionAtK"`,
+   * `"ndcgAtK"`, `"recallAtK"`. Must be &gt; 0. The default value is 10.
+   * @group param
+   */
   final val k = new IntParam(this, "k",
     "The ranking position value used in " +
       s"${supportedMetricNames.filter(_.endsWith("AtK")).mkString("(", "|", ")")}  " +

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -27,7 +27,8 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{DoubleType, FloatType}
 
 /**
- * Evaluator for regression, which expects two input columns: prediction and label.
+ * Evaluator for regression, which expects input columns prediction, label and
+ * an optional weight column.
  */
 @Since("1.4.0")
 final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val uid: String)

diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
@@ -110,7 +110,8 @@ def isLargerBetter(self):
 class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, HasWeightCol,
                                     JavaMLReadable, JavaMLWritable):
     """
-    Evaluator for binary classification, which expects two input columns: rawPrediction and label.
+    Evaluator for binary classification, which expects input columns rawPrediction, label
+    and an optional weight column.
     The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
     1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).
 
@@ -409,9 +410,9 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
     """
     metricName = Param(Params._dummy(), "metricName",
                        "metric name in evaluation "
-                       "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate|"
-                       "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel|"
-                       "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel|"
+                       "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| "
+                       "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| "
+                       "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| "
                        "logLoss|hammingLoss)",
                        typeConverter=TypeConverters.toString)
     metricLabel = Param(Params._dummy(), "metricLabel",

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
@@ -1467,7 +1467,7 @@ class Imputer(JavaEstimator, _ImputerParams, JavaMLReadable, JavaMLWritable):
     """
     Imputation estimator for completing missing values, either using the mean or the median
     of the columns in which the missing values are located. The input columns should be of
-    DoubleType or FloatType. Currently Imputer does not support categorical features and
+    numeric type. Currently Imputer does not support categorical features and
     possibly creates incorrect values for a categorical feature.
 
     Note that the mean/median value is computed after filtering out missing values.