In [3]:
# Import the IPython shell class so that its display setting can be changed below.
from IPython.core.interactiveshell import InteractiveShell
# Show the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [4]:
from datetime import date, datetime
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F

In [5]:
# Load the class-information CSV, treating its first line as the header row.
# No schema is supplied or inferred, so every column is read as a string
# (see the printed schema).
cdf = spark.read.option("header", True).csv('/dataframe/a_class_info.csv')
cdf.printSchema()

                                                                                

root
 |-- class_cd: string (nullable = true)
 |-- school: string (nullable = true)
 |-- class_std_cnt: string (nullable = true)
 |-- loc: string (nullable = true)
 |-- school_type: string (nullable = true)
 |-- teaching_type: string (nullable = true)



## Set Operation

- 집합연산
    - union : 합집합, 결과집합에서 중복되는 행 제거하지 않음
    - unionAll : Spark 2.0부터 deprecated 되었으며 union 사용을 권장, union과 동일하게 동작
    - exceptAll : 차집합, 결과집합에서 중복되는 행 제거하지 않음
    - intersect : 교집합, 결과집합에서 중복되는 행 제거
    - intersectAll : 교집합, 결과집합에서 중복되는 행 제거하지 않음
    

![dataframe](./img/set_operation.jpg)
![dataframe](./img/Union-VS-intersection-.jpg)


### union, unionAll

In [6]:
# union and unionAll both behave like SQL UNION ALL (duplicate rows are kept).
# To get SQL UNION semantics, call distinct() on the result.

# Union of the classes with 30 or more students and the classes with fewer
# than 16 students.
# class_std_cnt was loaded as a string column, so cast it explicitly instead of
# relying on Spark's implicit string-to-number coercion in the comparisons.
std_cnt = cdf.class_std_cnt.cast('int')
cdf.where(std_cnt >= 30) \
    .union(cdf.where(std_cnt < 16)) \
    .show()

                                                                                

+--------+------+-------------+--------+-----------+-------------+
|class_cd|school|class_std_cnt|     loc|school_type|teaching_type|
+--------+------+-------------+--------+-----------+-------------+
|     18K| GOOBU|           31|   Urban|     Public|     Standard|
|     A93| VVTVA|           30|   Urban|     Public| Experimental|
|     YTB| VVTVA|           30|   Urban|     Public| Experimental|
|     Q0E| ZOWMK|           30|   Urban|     Public| Experimental|
|     QA2| ZOWMK|           30|   Urban|     Public|     Standard|
|     ZBH| ZOWMK|           30|   Urban|     Public|     Standard|
|     IQN| CCAAW|           15|Suburban| Non-public| Experimental|
|     197| FBUMG|           14|   Rural| Non-public| Experimental|
|     JGD| FBUMG|           14|   Rural| Non-public| Experimental|
|     MDE|  null|           10|   Rural| Non-public| Experimental|
|     SSP| UUUQX|           15|Suburban| Non-public|     Standard|
|     KR1| VHDHF|           15|   Rural| Non-public| Experimen

### intersect, intersectAll

In [9]:
# Build a new DataFrame containing one extra, duplicated row so that the
# difference between intersect and intersectAll becomes visible.
# The extra row is created with cdf's own schema and appended with union():
# this avoids collecting the whole DataFrame to the driver and avoids mixing
# Row objects with a plain dict during schema inference.
dup_row = {
    'class_cd': 'A33',
    'school': 'CIMBB',
    'class_std_cnt': '19',
    'loc': 'Urban',
    'school_type': 'Non-public',
    'teaching_type': 'Standard',
}
# Order the values by cdf.columns so they line up with the schema positionally.
dup_row_df = spark.createDataFrame(
    [tuple(dup_row[col_name] for col_name in cdf.columns)],
    schema=cdf.schema,
)
temp_df = cdf.union(dup_row_df)

# Intersection of the classes whose school name starts with "C" and the
# classes located in an Urban area.
c_school_classes = temp_df.where(temp_df.school.like('C%'))
urban_classes = temp_df.where(temp_df.loc == 'Urban')

# intersect: duplicate rows are removed from the result.
c_school_classes.intersect(urban_classes).orderBy('class_cd').show()

# intersectAll: duplicate rows are kept in the result.
c_school_classes.intersectAll(urban_classes).orderBy('class_cd').show()

                                                                                

+--------+------+-------------+-----+-----------+-------------+
|class_cd|school|class_std_cnt|  loc|school_type|teaching_type|
+--------+------+-------------+-----+-----------+-------------+
|     1Q1| CUQAM|           28|Urban|     Public|     Standard|
|     A33| CIMBB|           19|Urban| Non-public|     Standard|
|     BFY| CUQAM|           27|Urban|     Public|     Standard|
|     EID| CIMBB|           21|Urban| Non-public|     Standard|
|     HUJ| CIMBB|           17|Urban| Non-public| Experimental|
|     OMI| CUQAM|           28|Urban|     Public|     Standard|
|     PC6| CIMBB|           17|Urban| Non-public|     Standard|
|     X6Z| CUQAM|           24|Urban|     Public| Experimental|
+--------+------+-------------+-----+-----------+-------------+



[Stage 26:>                                                         (0 + 1) / 1]

+--------+------+-------------+-----+-----------+-------------+
|class_cd|school|class_std_cnt|  loc|school_type|teaching_type|
+--------+------+-------------+-----+-----------+-------------+
|     1Q1| CUQAM|           28|Urban|     Public|     Standard|
|     A33| CIMBB|           19|Urban| Non-public|     Standard|
|     A33| CIMBB|           19|Urban| Non-public|     Standard|
|     BFY| CUQAM|           27|Urban|     Public|     Standard|
|     EID| CIMBB|           21|Urban| Non-public|     Standard|
|     HUJ| CIMBB|           17|Urban| Non-public| Experimental|
|     OMI| CUQAM|           28|Urban|     Public|     Standard|
|     PC6| CIMBB|           17|Urban| Non-public|     Standard|
|     X6Z| CUQAM|           24|Urban|     Public| Experimental|
+--------+------+-------------+-----+-----------+-------------+



                                                                                

### exceptAll

In [11]:
# Difference: classes whose school name starts with "C", minus the classes
# located in an Urban area, shown ordered by class code.
(
    temp_df.filter(temp_df['school'].like('C%'))
    .exceptAll(temp_df.filter(temp_df['loc'] == 'Urban'))
    .sort(temp_df['class_cd'])
    .show()
)



+--------+------+-------------+--------+-----------+-------------+
|class_cd|school|class_std_cnt|     loc|school_type|teaching_type|
+--------+------+-------------+--------+-----------+-------------+
|     2B1| CCAAW|           18|Suburban| Non-public| Experimental|
|     EPS| CCAAW|           20|Suburban| Non-public| Experimental|
|     IQN| CCAAW|           15|Suburban| Non-public| Experimental|
|     PGK| CCAAW|           21|Suburban| Non-public|     Standard|
|     UHU| CCAAW|           16|Suburban| Non-public| Experimental|
|     UWK| CCAAW|           19|Suburban| Non-public|     Standard|
+--------+------+-------------+--------+-----------+-------------+



                                                                                