# 5교시 집계 연산

### 목차
* [1. 집계 함수](#1.-집계-함수)
* [2. 그룹 함수](#2.-그룹-함수)
* [참고자료](#참고자료)


In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

spark = (
    SparkSession
    .builder
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# 노트북에서 테이블 형태로 데이터 프레임 출력을 위한 설정을 합니다
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# 공통 데이터 위치
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
work_dir=!pwd
work_dir = work_dir[0]

# 로컬 환경 최적화
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
spark

21/08/21 09:08:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
""" 구매 이력 데이터 """
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"{work_data}/retail-data/all")
    .coalesce(5)
)
df.cache()
df.createOrReplaceTempView("dfTable")

                                                                                

In [3]:
df.show(5, truncate=False)
df.count()

                                                                                

+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate   |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |12/1/2010 8:26|2.55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |12/1/2010 8:26|2.75     |17850     |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
+---------+---------+-----------------------------------

                                                                                

541909

## 1. 집계 함수
### 1.1 로우 수 (count, countDistinct, approx_count_distinct)

In [4]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [5]:
from pyspark.sql.functions import *

df.selectExpr("count(*)").show()
df.where("Description is null").selectExpr("count(1)").show() # 1,454
df.selectExpr("count(Description)").show() # 540,455 + 1,454 = 541,909

+--------+
|count(1)|
+--------+
|  541909|
+--------+

+--------+
|count(1)|
+--------+
|    1454|
+--------+

+------------------+
|count(Description)|
+------------------+
|            540455|
+------------------+



In [6]:
from pyspark.sql.functions import *
# 명시적으로 컬럼을 지정한 경우 해당 컬럼이 널 인 경우 해당 로우는 제외됩니다
df.select(countDistinct("StockCode")).show()
df.selectExpr("count(distinct StockCode)").show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [7]:
from pyspark.sql.functions import *
# 근사치로 구하지만 연산 속도가 빠름
df.select(approx_count_distinct("StockCode", 0.1)).show() # 0.1은 최대 추정 오류율
df.select(approx_count_distinct("StockCode", 0.01)).show() 

                                                                                

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



21/08/21 08:49:41 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 20:>                                                         (0 + 1) / 1]

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            4079|
+--------------------------------+



                                                                                

### 1.2 수치 집계 함수 (first, last, min, max, sum, sumDistinct, avg)

In [8]:
from pyspark.sql.functions import *
df.select(first("StockCode"), last("StockCode")).show(1) # null도 감안하려면 True

df.select(min("Quantity"), max("Quantity")).show(1)
df.select(min("Description"), max("Description")).show(1) # 문자열

df.select(sum("Quantity")).show(1)
df.select(sumDistinct("Quantity")).show(1) # 고유값을 합산

                                                                                

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|          85123A|          22138|
+----------------+---------------+

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



                                                                                

+--------------------+-----------------+
|    min(Description)| max(Description)|
+--------------------+-----------------+
| 4 PURPLE FLOCK D...|wrongly sold sets|
+--------------------+-----------------+

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



### 1.3 통계 집계 함수 (avg, mean, variance, stddev) 
* 표본표준분산 및 편차: variance, stddev
* 모표준분산 및 편차 : var_pop, stddev_pop

In [9]:
from pyspark.sql.functions import *

df.select(
    count("Quantity").alias("total_transcations"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_transcations"),    
).selectExpr(
    "total_purchases / total_transcations",
    "avg_purchases",
    "mean_transcations").show(3)

+--------------------------------------+----------------+-----------------+
|(total_purchases / total_transcations)|   avg_purchases|mean_transcations|
+--------------------------------------+----------------+-----------------+
|                      9.55224954743324|9.55224954743324| 9.55224954743324|
+--------------------------------------+----------------+-----------------+



In [10]:
df.select(
    variance("Quantity")
    , stddev("Quantity")
    , var_samp("Quantity")
    , stddev_samp("Quantity")
    , var_pop("Quantity")
    , stddev_pop("Quantity")
).show()

+------------------+---------------------+------------------+---------------------+------------------+--------------------+
|var_samp(Quantity)|stddev_samp(Quantity)|var_samp(Quantity)|stddev_samp(Quantity)| var_pop(Quantity)|stddev_pop(Quantity)|
+------------------+---------------------+------------------+---------------------+------------------+--------------------+
|47559.391409298696|   218.08115785023404|47559.391409298696|   218.08115785023404|47559.303646609005|  218.08095663447784|
+------------------+---------------------+------------------+---------------------+------------------+--------------------+



#### 1.3.1 분산과 표준편차
+ 표본표준분산 및 편차: variance, stddev
+ 모표준분산 및 편차 : var_pop, stddev_pop

In [11]:
from pyspark.sql.functions import variance, stddev
from pyspark.sql.functions import var_samp, stddev_samp
from pyspark.sql.functions import var_pop, stddev_pop

df.select(variance("Quantity"), stddev("Quantity"),      
          var_samp("Quantity"), stddev_samp("Quantity"), # 위와 동일
          var_pop("Quantity"), stddev_pop("Quantity")).show()

+------------------+---------------------+------------------+---------------------+------------------+--------------------+
|var_samp(Quantity)|stddev_samp(Quantity)|var_samp(Quantity)|stddev_samp(Quantity)| var_pop(Quantity)|stddev_pop(Quantity)|
+------------------+---------------------+------------------+---------------------+------------------+--------------------+
|47559.391409298696|   218.08115785023404|47559.391409298696|   218.08115785023404|47559.303646609005|  218.08095663447784|
+------------------+---------------------+------------------+---------------------+------------------+--------------------+



In [12]:
spark.createDataFrame(df.select("*").take(1)).select(variance("Quantity"), stddev("Quantity"),      
          var_samp("Quantity"), stddev_samp("Quantity"), # 위와 동일
          var_pop("Quantity"), stddev_pop("Quantity")).show() # 1일 때는 NaN이 나옵니다.

[Stage 39:>                                                         (0 + 3) / 3]

+------------------+---------------------+------------------+---------------------+-----------------+--------------------+
|var_samp(Quantity)|stddev_samp(Quantity)|var_samp(Quantity)|stddev_samp(Quantity)|var_pop(Quantity)|stddev_pop(Quantity)|
+------------------+---------------------+------------------+---------------------+-----------------+--------------------+
|              null|                 null|              null|                 null|              0.0|                 0.0|
+------------------+---------------------+------------------+---------------------+-----------------+--------------------+



                                                                                

#### 1.3.2 비대칭도와 첨도
+ 비대칭도와 첨도 : https://www.youtube.com/watch?time_continue=2&v=g9VOhfy2WWY

In [13]:
from pyspark.sql.functions import skewness, kurtosis

df.select(skewness("Quantity"), kurtosis("Quantity")).show()

+-------------------+------------------+
| skewness(Quantity)|kurtosis(Quantity)|
+-------------------+------------------+
|-0.2640755761052783|119768.05495533274|
+-------------------+------------------+



![왜도](images/func1.png)

#### 비대칭도, 왜도 (skewness)
> 왜도는 데이터가 대칭이 아닌 정도입니다. 왜도 값(0, 양수 또는 음수)이 데이터 형상에 대한 정보를 나타냅니다.
데이터가 대칭에 가까울수록 왜도 값이 0에 근접합니다. 그러나 왜도 부족만으로 정규성을 의미하지는 않습니다.

#### 첨도(kurtosis)
> 첨도는 분포의 꼬리가 정규 분포와 어떻게 다른지 나타냅니다. 완전히 정규 분포를 따르는 데이터의 첨도 값은 0입니다.
분포의 첨도 값이 양수이면 분포의 꼬리가 정규 분포보다 두껍다는 것을 나타냅니다
분포의 첨도 값이 음수이면 분포의 꼬리가 정규 분포보다 얇다는 것을 나타냅니다. 

#### [skewness](https://github.com/apache/spark/blob/5a7403623d0525c23ab8ae575e9d1383e3e10635/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L231)
```scala
org.apache.spark.sql.catalyst.expressions.aggregate.CentralMomentAgg

def skewness(columnName: String): Column = skewness(Column(columnName))
def skewness(e: Column): Column = withAggregateFunction { Skewness(e.expr) }
def kurtosis(e: Column): Column = withAggregateFunction { Kurtosis(e.expr) }
def kurtosis(columnName: String): Column = kurtosis(Column(columnName))
```

#### 1.3.3 공분산과 상관관계
+ 표본공분산(cover_samp), 모공분산(cover_pop)

In [14]:
from pyspark.sql.functions import corr, covar_pop, covar_samp

df.select(corr("InvoiceNo", "Quantity"), covar_pop("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity")).show()



+-------------------------+------------------------------+-------------------------------+
|corr(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|
+-------------------------+------------------------------+-------------------------------+
|     4.912186085648426E-4|            1052.7260778770628|              1052.728054393167|
+-------------------------+------------------------------+-------------------------------+



                                                                                

![공분산](images/func2.gif)
          
#### [공분산](https://ko.wikipedia.org/wiki/%EA%B3%B5%EB%B6%84%EC%82%B0) (covariance)
> 공분산(共分散, 영어: covariance)은 2개의 확률변수의 상관정도를 나타내는 값이다.(1개의 변수의 이산정도를 나타내는 분산과는 별개임) 만약 2개의 변수중 하나의 값이 상승하는 경향을 보일 때, 다른 값도 상승하는 경향의 상관관계에 있다면, 공분산의 값은 양수가 될 것이다. 반대로 2개의 변수중 하나의 값이 상승하는 경향을 보일 때, 다른 값이 하강하는 경향을 보인다면 공분산의 값은 음수가 된다. <br>
<br>
![function](images/func3.png)

<br>
단, 100점만점인 두과목의 점수 공분산은 별로 상관성이 부족하지만 100점만점이기 때문에 큰 값이 나오고
10점짜리 두과목의 점수 공분산은 상관성이 아주 높을지만 10점만점이기 때문에 작은값이 나온다

![function](images/func4.png)

#### [상관관계](https://ko.wikipedia.org/wiki/%EC%83%81%EA%B4%80_%EB%B6%84%EC%84%9D) (correlation)
> 상관 분석(Correlation analysis)은 확률론과 통계학에서 두 변수간에 어떤 선형적 관계를 갖고 있는 지를 분석하는 방법이다. 
![function](image/func5.png)

#### 피어슨 상관 계수
> 피어슨 상관 계수란 두 변수 X 와 Y 간의 선형 상관 관계를 계량화한 수치다 . 피어슨 상관 계수는 코시-슈바르츠 부등식에 의해 +1과 -1 사이의 값을 가지며, +1은 완벽한 양의 선형 상관 관계, 0은 선형 상관 관계 없음, -1은 완벽한 음의 선형 상관 관계를 의미한다.

#### Perason's r = X와 Y가 함께 변하는 정도 / X와 Y가 각각 변하는 정도
##### r 값은 X 와 Y 가 완전히 동일하면 +1, 전혀 다르면 0, 반대방향으로 완전히 동일 하면 –1 을 가진다.

### 1.4 복합 데이터 타입의 집계

In [15]:
from pyspark.sql.functions import collect_list, collect_set, size

df.select(collect_list("Country"), collect_set("Country")).show()

+---------------------+--------------------+
|collect_list(Country)|collect_set(Country)|
+---------------------+--------------------+
| [United Kingdom, ...|[Portugal, Italy,...|
+---------------------+--------------------+



In [16]:
df.select(size(collect_list("Country")), size(collect_set("Country"))).show() # 각 컬럼의 복합데이터 사이즈

+---------------------------+--------------------------+
|size(collect_list(Country))|size(collect_set(Country))|
+---------------------------+--------------------------+
|                     541909|                        38|
+---------------------------+--------------------------+



In [17]:
df.select(countDistinct("Country")).show() # 중복없이 카운트

+-----------------------+
|count(DISTINCT Country)|
+-----------------------+
|                     38|
+-----------------------+



#### 복합 데이터 타입의 집계 함수(collect_set, collect_list)의 실용 사례?
> 데이터의 Cardinality 가 충분히 많지 않은 경우에 하나의 컬럼에 담아 처리하고 싶을 때 활용할 수 있습니다.

#### [collect_set](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala#L125)
```scala
org.apache.spark.sql.catalyst.expressions.aggregate.collect.scala
case class CollectSet(
    child: Expression,
    mutableAggBufferOffset: Int = 0,
    inputAggBufferOffset: Int = 0) extends Collect[mutable.HashSet[Any]] {
    ...
}
```

## 2. 그룹 함수

### 2.1 표현식을 이용한 그룹화

In [18]:
from pyspark.sql.functions import count
df.printSchema()
df.groupBy("InvoiceNo", "CustomerId").agg(expr("count(Quantity) as CountOfQuantity")).show(5)

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)





+---------+----------+---------------+
|InvoiceNo|CustomerId|CountOfQuantity|
+---------+----------+---------------+
|   536366|     17850|              2|
|   536367|     13047|             12|
|   536369|     13047|              1|
|   536376|     15291|              2|
|   536387|     16029|              5|
+---------+----------+---------------+
only showing top 5 rows



                                                                                

### 2.2 맵을 이용한 그룹화
> 파이선의 딕셔너리 데이터 타입을 활용하여 집계함수의 표현이 가능 

In [19]:
df.groupBy("InvoiceNo").agg( { "Quantity" : "avg", "UnitPrice" : "stddev_pop" } ).show(5)

+---------+---------------------+------------------+
|InvoiceNo|stddev_pop(UnitPrice)|     avg(Quantity)|
+---------+---------------------+------------------+
|   536370|   3.6916533897428674|             22.45|
|   536380|                  0.0|              24.0|
|   536384|   3.5529802474898813|14.615384615384615|
|   536387|   1.0775602071346178|             288.0|
|   536397|                  0.0|              30.0|
+---------+---------------------+------------------+
only showing top 5 rows



#### 맵을 이용한 그룹화 (agg(key->value))
> 맵을 이용하여 컬럼 단위로 적용할 함수를 전달하는 방식입니다

#### [RelationalGroupedDataset.agg](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala#L168)
```scala
org.apache.spark.sql.RelationalGroupedDataset

def agg(exprs: Map[String, String]): DataFrame = {
    toDF(exprs.map { case (colName, expr) =>
        strToExpr(expr)(df(colName).expr)
    }.toSeq)
}
```
##### 1. 위의 함수 호출 시에 value (key) 형식으로 expression 을 만들어주게 됩니다.
##### 2. map 이 mutable.HashMap  을 넘기면  Compile Error 가 발생함에 유의합니다

In [20]:
# 런타임 시에 맵으로 전달된 함수를 표현식으로 사용할 수 있습니다.
(
    df.groupBy("InvoiceNo")
    .agg(
        {"Quantity":"avg", "Quantity":"stddev_pop"}
    )
)

InvoiceNo,stddev_pop(Quantity)
536370,8.935742834258381
536380,0.0
536384,15.750645708563392
536387,117.57550765359257
536397,18.0
536405,0.0
536407,0.0
536463,0.0
536500,4.019950248448356
536522,1.6046058535136642


### 3. 요약
+ User Defined Aggregation Function, UDAF
+ UDAF를 생성하려면 기본 클래스인 UserDefinedAggregateFunction을 상속
+ UDAF는 현재 스칼라와 자바로만 사용할 수 있음(ver 2.3)
```
inputSchema: UDAF 입력 파라미터의 스키마를 StructType으로 정의 
bufferSchema: UDAF 중간 결과의 스키마를 StructType으로 정의
dataType: 반환될 값의 DataType을 정의
deterministic: UDAF가 동일한 입력값에 대해 항상 동일한 결과를 반환하는지 불리언값으로 정의
initialize: 집계용 버퍼의 값을 초기화하는 로직을 정의
update: 입력받은 로우를 기바느로 내부 버퍼를 업데이트하는 로직을 정의
merge: 두 개의 집계용 버퍼를 병합하는 로직을 정의
evaluate: 집계의 최종 결과를 생성하는 로직을 정의
```

※ Efficient UD(A)Fs with PySpark https://www.inovex.de/blog/efficient-udafs-with-pyspark/

### <font color=blue>1. [중급]</font> 구매 이력 CSV f"{work_data}/retail-data/all" 파일을 읽고
#### 1. 스키마를 출력하세요
#### 2. 데이터 10건을 출력하세요
#### 3. 상품코드(StockCode)의 유일한 값의 갯수를 출력하세요
#### 4. 상품단가(UnitPrice)의 최소, 최대 값을 출력하세요
#### 5. 송장번호(StockCode)별로 송장별총매출금액(TotalInvoicePrice)를 계산하고 내림차순으로 정렬하세요
#### 6. 송장별총매출금액(TotalInvoicePrice)이 최고금액이 송장을 필터하여 검증해 보세요
##### 예를 들어 `select sum(unit-price * quantity) from table where invoiceno = '123456'` 와 같은 쿼리로 검증이 가능합니다

<details><summary>[실습7] 출력 결과 확인 </summary>

> 아래와 유사하게 방식으로 작성 되었다면 정답입니다

```python
df1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"{work_data}/retail-data/all")
)
df1.printSchema()
df1.show()
answer = df1.withColumn("TotalPrice", expr("UnitPrice * Quantity")).groupBy("InvoiceNo").agg(sum("TotalPrice").alias("TotalInvoicePrice"))
answer.printSchema()
display(answer.orderBy(desc("TotalInvoicePrice")).limit(10))

df1.where("InvoiceNo = '581483'").select(sum(expr("UnitPrice * Quantity"))).show()
```

</details>


In [21]:
# 여기에 실습 코드를 작성하고 실행하세요 (Shift+Enter)
df1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"{work_data}/retail-data/all")
)
df1.printSchema()
df1.show()
answer = df1.withColumn("TotalPrice", expr("UnitPrice * Quantity")).groupBy("InvoiceNo").agg(sum("TotalPrice").alias("TotalInvoicePrice"))
answer.printSchema()
display(answer.orderBy(desc("TotalInvoicePrice")).limit(10))

df1.where("InvoiceNo = '581483'").select(sum(expr("UnitPrice * Quantity"))).show()


                                                                                

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|  

                                                                                

InvoiceNo,TotalInvoicePrice
581483,168469.6
541431,77183.6
574941,52940.93999999999
576365,50653.91
556444,38970.0
567423,31698.16
556917,22775.930000000008
572209,22206.0
567381,22104.8
563614,21880.439999999995


+---------------------------+
|sum((UnitPrice * Quantity))|
+---------------------------+
|                   168469.6|
+---------------------------+



### <font color=green>2. [기본]</font> 매출 테이블 f"{work_data}/tbl_purchase.csv" CSV 파일을 읽고
#### 1. 스키마를 출력하세요
#### 2. 데이터 10건을 출력하세요
#### 3. 제품(p_name)별 금액(p_amount) 의 전체 합인 총 매출금액(sum_amount)을 구하세요

<details><summary>[실습2] 출력 결과 확인 </summary>

> 아래와 유사하게 방식으로 작성 되었다면 정답입니다

```python
df2 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"{work_data}/tbl_purchase.csv")
)
df2.printSchema()
df2.show()
answer = df2.groupBy("p_name").agg(sum("p_amount").alias("sum_amount"))
answer.printSchema()
display(answer)

```

</details>


In [22]:
# 여기에 실습 코드를 작성하고 실행하세요 (Shift+Enter)
df2 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"{work_data}/tbl_purchase.csv")
)
df2.printSchema()
df2.show()
answer = df2.groupBy("p_name").agg(sum("p_amount").alias("sum_amount"))
answer.printSchema()
display(answer)

root
 |-- p_time: integer (nullable = true)
 |-- p_uid: integer (nullable = true)
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- p_amount: integer (nullable = true)

+----------+-----+----+-----------+--------+
|    p_time|p_uid|p_id|     p_name|p_amount|
+----------+-----+----+-----------+--------+
|1603651550|    0|1000|GoldStar TV|  100000|
|1603651550|    1|2000|    LG DIOS| 2000000|
|1603694755|    1|2001|    LG Gram| 1800000|
|1603673500|    2|2002|    LG Cyon| 1400000|
|1603652155|    3|2003|      LG TV| 1000000|
|1603674500|    4|2004|LG Computer| 4500000|
|1603665955|    5|2001|    LG Gram| 3500000|
|1603666155|    5|2003|      LG TV| 2500000|
+----------+-----+----+-----------+--------+

root
 |-- p_name: string (nullable = true)
 |-- sum_amount: long (nullable = true)



p_name,sum_amount
LG Cyon,1400000
LG Gram,5300000
LG Computer,4500000
LG TV,3500000
GoldStar TV,100000
LG DIOS,2000000


### <font color=green>3. [기본]</font> 매출 테이블 f"{work_data}/tbl_purchase.csv" CSV 파일을 읽고
#### 1. 스키마를 출력하세요
#### 2. 데이터 10건을 출력하세요
#### 3. 구매 금액의 합이 가장 높은 고객(p_uid)을 구하세요

<details><summary>[실습3] 출력 결과 확인 </summary>

> 아래와 유사하게 방식으로 작성 되었다면 정답입니다

```python
df3 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"{work_data}/tbl_purchase.csv")
)
df3.printSchema()
df3.show()
answer = df2.groupBy("p_uid").agg(sum("p_amount").alias("sum_amount_per_user"))
answer.printSchema()
display(answer)
```

</details>


In [23]:
# 여기에 실습 코드를 작성하고 실행하세요 (Shift+Enter)
df3 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"{work_data}/tbl_purchase.csv")
)
df3.printSchema()
df3.show()
answer = df2.groupBy("p_uid").agg(sum("p_amount").alias("sum_amount_per_user"))
answer.printSchema()
display(answer)

root
 |-- p_time: integer (nullable = true)
 |-- p_uid: integer (nullable = true)
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- p_amount: integer (nullable = true)

+----------+-----+----+-----------+--------+
|    p_time|p_uid|p_id|     p_name|p_amount|
+----------+-----+----+-----------+--------+
|1603651550|    0|1000|GoldStar TV|  100000|
|1603651550|    1|2000|    LG DIOS| 2000000|
|1603694755|    1|2001|    LG Gram| 1800000|
|1603673500|    2|2002|    LG Cyon| 1400000|
|1603652155|    3|2003|      LG TV| 1000000|
|1603674500|    4|2004|LG Computer| 4500000|
|1603665955|    5|2001|    LG Gram| 3500000|
|1603666155|    5|2003|      LG TV| 2500000|
+----------+-----+----+-----------+--------+

root
 |-- p_uid: integer (nullable = true)
 |-- sum_amount_per_user: long (nullable = true)



p_uid,sum_amount_per_user
0,100000
3,1000000
5,6000000
4,4500000
1,3800000
2,1400000


### <font color=red>4. [고급]</font> 샌프란시스코 긴급출동 데이터 CSV 파일인 f"{work_data}/learning-spark/sf-fire-calls.csv"를 읽고
#### 1. 스키마를 출력하세요
#### 2. 데이터를 3건 출력하세요
#### 3. 호출의 종류(CallType)가 어떤 것들이 있는지 출력하세요 (중복제거)
#### 3. 샌프란시스코에서 발생의 가장 빈도수가 높은 종류(CallType)를 구하고 빈도수를 구하세요
#### 4. 샌프란시스코에서 발생하는 최고 빈도수 3건은 무엇인가요? 

<details><summary>[실습3] 출력 결과 확인 </summary>

> 아래와 유사하게 방식으로 작성 되었다면 정답입니다

```python
df3 = (
    spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(f"{work_data}/learning-spark/sf-fire-calls.csv")
)
df3.printSchema()
df3.show(3)
df3.createOrReplaceTempView("fire_calls")
spark.sql("select distinct(CallType) from fire_calls").show(truncate=False)

answer = spark.sql("select CallType, count(CallType) as CallTypeCount from fire_calls group by CallType order by CallTypeCount desc")
display(answer.limit(3))
```

</details>


In [24]:
# 여기에 실습 코드를 작성하고 실행하세요 (Shift+Enter)
df3 = (
    spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(f"{work_data}/learning-spark/sf-fire-calls.csv")
)
df3.printSchema()
df3.show(5)
df3.createOrReplaceTempView("fire_calls")
spark.sql("select distinct(CallType) from fire_calls").show(truncate=False)

answer = spark.sql("select CallType, count(CallType) as CallTypeCount from fire_calls group by CallType order by CallTypeCount desc")
display(answer.limit(3))

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

CallType,CallTypeCount
Medical Incident,113794
Structure Fire,23319
Alarms,19406


### <font color=red>5. [고급]</font> 샌프란시스코 긴급출동 데이터 CSV 파일인 f"{work_data}/learning-spark/sf-fire-calls.csv"를 읽고 다음과 같은 질문도 실습해 보면 재미있을 것 같습니다
#### 1. 2018 년의 모든 화재 신고 유형은 무엇 이었습니까?
#### 2. 2018 년의 몇 월에 화재 신고가 가장 많았습니까?
#### 3. 샌프란시스코에서 2018 년에 가장 많은 화재 신고가 발생한 지역은 어디입니까?
#### 4. 2018 년에 화재 신고에 대한 응답 시간이 가장 나쁜 지역은 어디입니까?
#### 5. 2018 년 중 어느 주에 화재 신고가 가장 많았습니까?
#### 6. 이웃, 우편 번호, 화재 전화 건수간에 상관 관계가 있습니까?
#### 7. Parquet 파일 또는 SQL 테이블을 사용하여이 데이터를 저장하고 다시 읽을 수있는 방법은 무엇입니까?


## 참고자료

#### 1. [Spark Programming Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html)
#### 2. [PySpark SQL Modules Documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)
#### 3. <a href="https://spark.apache.org/docs/3.0.1/api/sql/" target="_blank">PySpark 3.0.1 Builtin Functions</a>
#### 4. [PySpark Search](https://spark.apache.org/docs/latest/api/python/search.html)
#### 5. [Pyspark Functions](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?#module-pyspark.sql.functions)

### SparkContext vs. SparkSession (1/2)
> SparkContext 는 spark.core 프로젝트이고, SparkSession 은 spark.sql 프로젝트이다
아래와 같이 SparkContext 는 Spark 실행에 가장 중심이 되는 객체이고, 병렬화, 브로드캐스팅, 분산 파일 추가 및 종료 등의 스파크 작업을 관장하는 클래스라 볼 수 있습니다

#### 1. [SparkContext.{parallelize, broadcast, addFile, listFiles, addJar, listJars, stop, getOrCreate}](https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkContext.scala)
```scala

'SparkContext : "Main entry point for Spark functionality. A SparkContext represents the connection to a Spark cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster."
@note Only one `SparkContext` should be active per JVM. You must `stop()` the active `SparkContext` before creating a new one.
@param config a Spark Config object describing the application configuration. Any settings in this config overrides the default configs as well as system properties.


'parallelize : "Distribute a local Scala collection to form an RDD."
@note Parallelize acts lazily. If `seq` is a mutable collection and is altered after the call to parallelize and before the first action on the RDD, the resultant RDD will reflect the modified collection. Pass a copy of the argument to avoid this.
@note avoid using `parallelize(Seq())` to create an empty `RDD`. Consider `emptyRDD` for an RDD with no partitions, or `parallelize(Seq[T]())` for an RDD of `T` with empty partitions.
@param seq Scala collection to distribute
@param numSlices number of partitions to divide the collection into
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = withScope {
  assertNotStopped()
  new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
}

'broadcast : "Broadcast a read-only variable to the cluster, returning a [[org.apache.spark.broadcast.Broadcast]] object for reading it in distributed functions. The variable will be sent to each cluster only once."
@param value value to broadcast to the Spark nodes
@return `Broadcast` object, a read-only variable cached on each machine
def broadcast[T: ClassTag](value: T): Broadcast[T] = {
  assertNotStopped()
  require(!classOf[RDD[_]].isAssignableFrom(classTag[T].runtimeClass), "Can not directly broadcast RDDs; instead, call collect() and broadcast the result.")
  val bc = env.broadcastManager.newBroadcast[T](value, isLocal)
  val callSite = getCallSite
  logInfo("Created broadcast " + bc.id + " from " + callSite.shortForm)
  cleaner.foreach(_.registerBroadcastForCleanup(bc))
  bc
}

'addFile : " Add a file to be downloaded with this Spark job on every node. If a file is added during execution, it will not be available until the next TaskSet starts."
@param path can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs, use `SparkFiles.get(fileName)` to find its download location. 
@note A path can be added only once. Subsequent additions of the same path are ignored.
def addFile(path: String): Unit = {
  addFile(path, false)
}

'listFiles : "Returns a list of file paths that are added to resources."
def listFiles(): Seq[String] = addedFiles.keySet.toSeq

'addJar : "Adds a JAR dependency for all tasks to be executed on this `SparkContext` in the future.  If a jar is added during execution, it will not be available until the next TaskSet starts."
@param path can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), an HTTP, HTTPS or FTP URI, or local:/path for a file on every worker node.
@note A path can be added only once. Subsequent additions of the same path are ignored.
def addJar(path: String) { ... }

'listJars : "Returns a list of jar files that are added to resources."
def listJars(): Seq[String] = addedJars.keySet.toSeq


'stop : "Shut down the SparkContext."
def stop(): Unit = { ... }

'getOrCreate(config) : "This function may be used to get or instantiate a SparkContext and register it as a singleton object. Because we can only have one active SparkContext per JVM, this is useful when applications may wish to share a SparkContext."
@param config `SparkConfig` that will be used for initialisation of the `SparkContext`
@return current `SparkContext` (or a new one if it wasn't created before the function call)
def getOrCreate(config: SparkConf): SparkContext = {
  // Synchronize to ensure that multiple create requests don't trigger an exception
  // from assertNoOtherContextIsRunning within setActiveContext
  SPARK_CONTEXT_CONSTRUCTOR_LOCK.synchronized {
    if (activeContext.get() == null) {
      setActiveContext(new SparkContext(config))
    } else {
      if (config.getAll.nonEmpty) { logWarning("Using an existing SparkContext; some configuration may not take effect.") }
    }
    activeContext.get()
  }
}

'getOrCreate() : "This function may be used to get or instantiate a SparkContext and register it as a singleton object. Because we can only have one active SparkContext per JVM, this is useful when applications may wish to share a SparkContext. This method allows not passing a SparkConf (useful if just retrieving)."
@return current `SparkContext` (or a new one if wasn't created before the function call)
def getOrCreate(): SparkContext = {
  SPARK_CONTEXT_CONSTRUCTOR_LOCK.synchronized {
    if (activeContext.get() == null) { setActiveContext(new SparkContext()) }
    activeContext.get()
  }
}
```
> Only one `SparkContext` should be active per JVM. You must `stop()` the active `SparkContext` before creating a new one. <br>
a Spark Config object describing the application configuration. Any settings in this config overrides the default configs as well as system properties.


### SparkContext vs. SparkSession (2/2)
> SparkSession 경우 sparkContext 와 관련을 가지는 객체와 데이터프레임을 다루는 함수들로 구성되어 있습니다.

#### 2. [SparkSession](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala)

#### 2-1 SparkSession.{version, sqlContext, udf}
#### 2-2 SparkSession.{newSession, emptyDataFrame, emptyDataset, createDataFrame}
#### 2-3 SparkSession.{range, table, sql, read, time, stop, close}
#### 2-4 SparkSession.builder.{master, appName, config, getOrCreate}

```scala
'SparkSession : "The entry point to programming Spark with the Dataset and DataFrame API.  In environments that this has been created upfront (e.g. REPL, notebooks), use the builder to get an existing session:"
SparkSession.builder().getOrCreate()
"The builder can also be used to create a new session:"
SparkSession.builder
  .master("local")
  .appName("Word Count")
  .config("spark.some.config.option", "some-value")
  .getOrCreate()
@param sparkContext The Spark context associated with this Spark session.
@param existingSharedState If supplied, use the existing shared state instead of creating a new one.
@param parentSessionState If supplied, inherit all session state (i.e. temporary views, SQL config, UDFs etc) from parent.


'version : "The version of Spark on which this application is running."
def version: String = SPARK_VERSION

'sqlContext : "A wrapped version of this session in the form of a [[SQLContext]], for backward compatibility."
val sqlContext: SQLContext = new SQLContext(this)

'udf : "A collection of methods for registering user-defined functions (UDF).  The following example registers a Scala closure as UDF:"
sparkSession.udf.register("myUDF", (arg1: Int, arg2: String) => arg2 + arg1)
"The following example registers a UDF in Java:"
sparkSession.udf().register("myUDF",
   (Integer arg1, String arg2) -> arg2 + arg1,
   DataTypes.StringType);
@note The user-defined functions must be deterministic. Due to optimization, duplicate invocations may be eliminated or the function may even be invoked more times than it is present in the query.
def udf: UDFRegistration = sessionState.udfRegistration

'newSession : "Start a new session with isolated SQL configurations, temporary tables, registered functions are isolated, but sharing the underlying `SparkContext` and cached data."
@note Other than the `SparkContext`, all shared state is initialized lazily. This method will force the initialization of the shared state to ensure that parent and child sessions are set up with the same shared state. If the underlying catalog implementation is Hive, this will initialize the metastore, which may take some time.
def newSession(): SparkSession = {
  new SparkSession(sparkContext, Some(sharedState), parentSessionState = None, extensions)
}

'emptyDataFrame : "Returns a `DataFrame` with no rows or columns."
lazy val emptyDataFrame: DataFrame = {
  createDataFrame(sparkContext.emptyRDD[Row].setName("empty"), StructType(Nil))
}

'emptyDataset : "Creates a new [[Dataset]] of type T containing zero elements."
def emptyDataset[T: Encoder]: Dataset[T] = {
  val encoder = implicitly[Encoder[T]]
  new Dataset(self, LocalRelation(encoder.schema.toAttributes), encoder)
}

'createDataFrame : "Creates a `DataFrame` from an RDD of Product (e.g. case classes, tuples)."
def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = {
  SparkSession.setActiveSession(this)
  val encoder = Encoders.product[A]
  Dataset.ofRows(self, ExternalRDD(rdd, self)(encoder))
}

'range(end) : "Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements in a range from 0 to `end` (exclusive) with step value 1."
def range(end: Long): Dataset[java.lang.Long] = range(0, end)

'range(start, end) : "Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements in a range from `start` to `end` (exclusive) with step value 1."
def range(start: Long, end: Long): Dataset[java.lang.Long] = {
  range(start, end, step = 1, numPartitions = sparkContext.defaultParallelism)
}

'table(tableName) : "Returns the specified table/view as a `DataFrame`."
@param tableName is either a qualified or unqualified name that designates a table or view. If a database is specified, it identifies the table/view from the database. Otherwise, it first attempts to find a temporary view with the given name and then match the table/view from the current database. Note that, the global temporary view database is also valid here.
def table(tableName: String): DataFrame = {
  table(sessionState.sqlParser.parseMultipartIdentifier(tableName))
}

'sql : "Executes a SQL query using Spark, returning the result as a `DataFrame`. The dialect that is used for SQL parsing can be configured with 'spark.sql.dialect'."
def sql(sqlText: String): DataFrame = {
  val tracker = new QueryPlanningTracker
  val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) {
    sessionState.sqlParser.parsePlan(sqlText)
  }
  Dataset.ofRows(self, plan, tracker)
}

'read : "Returns a [[DataFrameReader]] that can be used to read non-streaming data in as a `DataFrame`."
sparkSession.read.parquet("/path/to/file.parquet")
sparkSession.read.schema(schema).json("/path/to/file.json")
def read: DataFrameReader = new DataFrameReader(self)

'time : "Executes some code block and prints to stdout the time taken to execute the block. This is available in Scala only and is used primarily for interactive testing and debugging."
def time[T](f: => T): T = {
  val start = System.nanoTime()
  val ret = f
  val end = System.nanoTime()
  // scalastyle:off println
  println(s"Time taken: {NANOSECONDS.toMillis(end - start)} ms")
  // scalastyle:on println
  ret
}

'stop : "Stop the underlying `SparkContext`."
def stop(): Unit = { sparkContext.stop() }

'close : "Synonym for `stop()`."
override def close(): Unit = stop()

'SparkSession : ""

object SparkSession extends Logging {
    class Builder extends Logging {
        'appName : " Sets a name for the application, which will be shown in the Spark web UI."
                "If no application name is set, a randomly generated name will be used."
        def appName(name: String): Builder = config("spark.app.name", name)

        'getOrCreate : "Gets an existing [[SparkSession]] or, if there is no existing one, creates a new
                one based on the options set in this builder.
                This method first checks whether there is a valid thread-local SparkSession,
                and if yes, return that one. It then checks whether there is a valid global
                default SparkSession, and if yes, return that one. If no valid global default
                SparkSession exists, the method creates a new SparkSession and assigns the
                newly created SparkSession as the global default.
                In case an existing SparkSession is returned, the config options specified in
                this builder will be applied to the existing SparkSession."
        def getOrCreate(): SparkSession = synchronized {
            assertOnDriver()
        'builder : "Creates a [[SparkSession.Builder]] for constructing a [[SparkSession]]."
        def builder(): Builder = new Builder
    }
}
```