# 7장. 집계함수

In [2]:
# Load every retail CSV file into a single DataFrame, using the first row as
# the header and letting Spark infer the column types.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("file:///home/ubuntu/ybigta/Dataset_spark/data/retail-data/all/*.csv")
    # DataFrame.coalesce(5) reduces the data to at most 5 partitions.
    # It does NOT filter null values; it is unrelated to null handling.
    .coalesce(5)
)

df.cache()  # mark the DataFrame for caching so repeated queries can reuse it
df.createOrReplaceTempView("dfTable")  # register it as the SQL view "dfTable"
# Print the schema information.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



- 집계는 대부분 pyspark.sql.functions의 함수로 수행하며, 일부 통계는 DataFrame의 stat 속성으로도 구할 수 있다.

## 7.1 > 집계함수

- count : 전체 레코드 수를 구할때
- countDistinct : 고유 레코드 수를 구할때
- approx_count_distinct : 고유 레코드 수의 근사치를 구할 때 (두 번째 인자는 허용할 최대 추정 오류율)
- first / last : DataFrame의 첫번째 값 / 마지막 값
- min / max : 최소 / 최대
- sum : 합
- sumDistinct : 고유값(중복 제거)의 합
- 평균 avg
- 분산과 표준편차 variance / stddev (표본: var_samp / stddev_samp 와 동일) / var_pop / stddev_pop (모집단)
- 비대칭도와 첨도 skewness / kurtosis (분포의 치우침 정도와 꼬리의 두꺼움을 측정)
- 공분산과 상관관계 covar_samp / covar_pop / corr
- 복합데이터 타입의 집계

- count

In [3]:
from pyspark.sql.functions import count, countDistinct, approx_count_distinct

# Show three ways of counting StockCode, one result table each:
#   1. count                 - number of StockCode values
#   2. countDistinct         - exact number of distinct StockCode values
#   3. approx_count_distinct - estimated number of distinct values; 0.1 is the
#      maximum relative standard deviation allowed for the estimate
for stock_code_counter in (
    count("StockCode"),
    countDistinct("StockCode"),
    approx_count_distinct("StockCode", 0.1),
):
    df.select(stock_code_counter).show()

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



- etc

In [9]:
from pyspark.sql.functions import first, last, min, max, sum, sumDistinct

# NOTE: min, max and sum here are the pyspark.sql.functions versions imported
# by this cell, not the Python builtins of the same name.

# First and last StockCode value in the DataFrame.
stock_code_ends = (first("StockCode"), last("StockCode"))
df.select(*stock_code_ends).show()

# Smallest and largest Quantity.
quantity_range = (min("Quantity"), max("Quantity"))
df.select(*quantity_range).show()

# Sum of all Quantity values, and sum over the distinct Quantity values only.
quantity_sums = (sum("Quantity"), sumDistinct("Quantity"))
df.select(*quantity_sums).show()

+-----------------------+----------------------+
|first(StockCode, false)|last(StockCode, false)|
+-----------------------+----------------------+
|                 85123A|                 22138|
+-----------------------+----------------------+

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+

+-------------+----------------------+
|sum(Quantity)|sum(DISTINCT Quantity)|
+-------------+----------------------+
|      5176450|                 29310|
+-------------+----------------------+



- 평균

In [17]:
from pyspark.sql.functions import sum, count, avg, expr

# Compute the average Quantity three ways to show that they agree:
# sum/count by hand, the avg() function, and the SQL mean() expression.
purchase_summary = df.select(
    count("Quantity").alias("total_transaction"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"),
)

# Derive the manual average from the aggregated columns and display all three.
purchase_summary.selectExpr(
    "total_purchases/total_transaction as `sum/count`",
    "avg_purchases",
    "mean_purchases",
).show()

+----------------+----------------+----------------+
|       sum/count|   avg_purchases|  mean_purchases|
+----------------+----------------+----------------+
|9.55224954743324|9.55224954743324|9.55224954743324|
+----------------+----------------+----------------+



- 분산, 표준편차

In [18]:
from pyspark.sql.functions import var_pop, stddev_pop          # population variance / standard deviation
from pyspark.sql.functions import var_samp, stddev_samp        # sample variance / standard deviation

# Apply each dispersion function to Quantity, in the same column order as the
# result table: var_pop, var_samp, stddev_pop, stddev_samp.
dispersion_functions = (var_pop, var_samp, stddev_pop, stddev_samp)
df.select(*[measure("Quantity") for measure in dispersion_functions]).show()

+------------------+------------------+--------------------+---------------------+
| var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+------------------+------------------+--------------------+---------------------+
|47559.303646609354| 47559.39140929905|  218.08095663447864|   218.08115785023486|
+------------------+------------------+--------------------+---------------------+



- 비대칭도와 첨도

In [19]:
from pyspark.sql.functions import skewness, kurtosis

# Skewness and kurtosis of the Quantity column, shown side by side.
quantity_shape = df.select(
    skewness("Quantity"),
    kurtosis("Quantity"),
)
quantity_shape.show()

+--------------------+------------------+
|  skewness(Quantity)|kurtosis(Quantity)|
+--------------------+------------------+
|-0.26407557610527843|119768.05495536518|
+--------------------+------------------+



- 공분산과 상관관계

In [20]:
from pyspark.sql.functions import corr, covar_pop, covar_samp

# Correlation, sample covariance and population covariance between the same
# pair of columns.
# NOTE(review): the printed schema lists InvoiceNo as a string; presumably
# Spark casts it to a numeric type for these functions - confirm that the
# non-numeric invoice numbers are handled as intended.
column_pair = ("InvoiceNo", "Quantity")
df.select(
    corr(*column_pair),
    covar_samp(*column_pair),
    covar_pop(*column_pair),
).show()

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085637639E-4|             1052.7280543913773|            1052.7260778752732|
+-------------------------+-------------------------------+------------------------------+



- 복합데이터 다루기

In [24]:
from pyspark.sql.functions import collect_set, collect_list

# Aggregate Country into complex (array) values:
# collect_set keeps each distinct value once, collect_list keeps every value.
distinct_countries = collect_set("Country")
all_countries = collect_list("Country")
df.agg(distinct_countries, all_countries).show()

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+

