In [2]:
// $example on$
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.Summarizer
// $example off$
import org.apache.spark.sql.SparkSession

// Session handle shared by every later cell in this notebook; getOrCreate()
// hands back an already-running session when one exists instead of starting another.
val spark = SparkSession.builder().appName("SummarizerExample").getOrCreate()

import spark.implicits._
import Summarizer._

spark = org.apache.spark.sql.SparkSession@768a88b7


org.apache.spark.sql.SparkSession@768a88b7

使用 Summarizer 计算输入 DataFrame 中向量列的均值和方差，分别演示带权重列和不带权重列两种情况。

In [3]:
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.Summarizer

// Two sample rows: a dense feature vector paired with its row weight.
val data = Seq(
  (Vectors.dense(2.0, 3.0, 5.0), 1.0),
  (Vectors.dense(4.0, 6.0, 7.0), 2.0)
)

val df = data.toDF("features", "weight")

// Weighted statistics: request both metrics in one "summary" struct column,
// then pull the struct's fields out into a typed (mean, variance) pair.
val (meanVal, varianceVal) = df.select(metrics("mean", "variance")
  .summary($"features", $"weight").as("summary"))
  .select("summary.mean", "summary.variance")
  .as[(Vector, Vector)].first()

println(s"with weight: mean = ${meanVal}, variance = ${varianceVal}")

// Unweighted statistics: the single-metric helpers take only the features column.
val (meanVal2, varianceVal2) = df.select(mean($"features"), variance($"features"))
  .as[(Vector, Vector)].first()

// The second value comes from variance(...), so it is labelled "variance"
// (the label previously read "sum", which misdescribed the printed value).
println(s"without weight: mean = ${meanVal2}, variance = ${varianceVal2}")


with weight: mean = [3.333333333333333,5.0,6.333333333333333], variance = [2.0,4.5,2.0]
without weight: mean = [3.0,4.5,6.0], sum = [2.0,4.5,2.0]


data = List(([2.0,3.0,5.0],1.0), ([4.0,6.0,7.0],2.0))
df = [features: vector, weight: double]
meanVal = [3.333333333333333,5.0,6.333333333333333]
varianceVal = [2.0,4.5,2.0]
meanVal2 = [3.0,4.5,6.0]
varianceVal2 = [2.0,4.5,2.0]


[2.0,4.5,2.0]

⊕ [Spark求统计量的两种方法 - Thinkgamer博客 - CSDN博客](https://blog.csdn.net/Gamer_gyt/article/details/79253420)

主要区别在于：DataFrame 的 describe() 得到的是标准差（stddev），而 Summarizer（以及 mllib 的统计接口）得到的是方差（variance）。这并不矛盾——标准差是方差的平方根，两者可以相互转换。

当然，如果要求四分位数，可以先将 DataFrame 注册为临时视图（例如 df.createOrReplaceTempView("tableName")），再使用 SQL 语句进行查询：

SELECT percentile(col, array(0.25, 0.75)) FROM tableName;


In [5]:
import org.apache.spark.storage._
// Load the example JSON and keep it cached for the cells below.
// persist(StorageLevel.MEMORY_AND_DISK): spill to disk when the data does not fit in memory.
val df = {
  val loaded = spark.read.json("../../data/example.json")
  loaded.persist(StorageLevel.MEMORY_AND_DISK)
}
df.show()
// Left as the cell's final expression so the notebook echoes the summary DataFrame's schema.
df.describe()

+---+-------+-------+----+----------+
|age|chinese|english|math|      name|
+---+-------+-------+----+----------+
| 23|     78|     95|  78|thinkgamer|
| 25|     88|     93|  95|     think|
| 24|     68|     88|  93|     gamer|
+---+-------+-------+----+----------+



df = [age: bigint, chinese: bigint ... 3 more fields]


[summary: string, age: string ... 4 more fields]

In [6]:
// Summary statistics (count, mean, stddev, min, max) restricted to the "age" column.
df.describe("age").show()

+-------+----+
|summary| age|
+-------+----+
|  count|   3|
|   mean|24.0|
| stddev| 1.0|
|    min|  23|
|    max|  25|
+-------+----+

