In [1]:
#singleton pattern object builder
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("241206_03_DataFrameAPI").getOrCreate()
spark

24/12/09 10:29:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
df = spark.read.format("csv").load("data/2015-summary.csv", interSchema=True, header=True)
df

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: string]

In [3]:
print(df.schema)
print(df.printSchema())

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,StringType,true)))
root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: string (nullable = true)

None


## Row 클래스, 단일 레코드(행)을 나타내는 객체

Row(DES_COUNTRY_NAME = 'United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

In [4]:
df.take(5)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count='15'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count='344'),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count='15'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count='62')]

In [5]:
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [6]:
#spark DataTable
df.select("DEST_COUNTRY_NAME").show(5)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
|    United States|
|            Egypt|
|    United States|
+-----------------+
only showing top 5 rows



In [7]:
df.count()

256

In [8]:
df.select("DEST_COUNTRY_NAME").count()

256

In [9]:
#중복 확인
df_dup = df.select("DEST_COUNTRY_NAME").dropDuplicates()
df_dup.show(5)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|         Anguilla|
|           Russia|
|         Paraguay|
|          Senegal|
|           Sweden|
+-----------------+
only showing top 5 rows



In [10]:
df_dup = df.select("DEST_COUNTRY_NAME").dropDuplicates().cache()
df_dup.show(5)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|         Anguilla|
|           Russia|
|         Paraguay|
|          Senegal|
|           Sweden|
+-----------------+
only showing top 5 rows



In [11]:
df_dup.distinct().count()

                                                                                

132

In [12]:
df.sort("DEST_COUNTRY_NAME").show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|             Algeria|      United States|    4|
|              Angola|      United States|   15|
|            Anguilla|      United States|   41|
| Antigua and Barbuda|      United States|  126|
|           Argentina|      United States|  180|
|               Aruba|      United States|  346|
|           Australia|      United States|  329|
|             Austria|      United States|   62|
|          Azerbaijan|      United States|   21|
|             Bahrain|      United States|   19|
|            Barbados|      United States|  154|
|             Belgium|      United States|  259|
|              Belize|      United States|  188|
|             Bermuda|      United States|  183|
|             Bolivia|      United States|   30|
|Bonaire, Sint Eus...|      United States|   58|
|              Brazil|      United States|  853|
|British Virgin Is..

In [13]:
from pyspark.sql.functions import *

#새로 컬럼 추가
df3 = df.withColumn('withInCountry',expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"))
df3.show(5)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withInCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
|    United States|            Ireland|  344|        false|
|            Egypt|      United States|   15|        false|
|    United States|              India|   62|        false|
+-----------------+-------------------+-----+-------------+
only showing top 5 rows



In [14]:
#SQL 구문 - CASE WHEN > 수치형 변수 > 명목형 변수로 변환

df4 = df.withColumn('Category',expr("CASE WHEN count <10 THEN 'under' WHEN count >=10 THEN 'upper' END"))
df4.show(5)

+-----------------+-------------------+-----+--------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|Category|
+-----------------+-------------------+-----+--------+
|    United States|            Romania|   15|   upper|
|    United States|            Croatia|    1|   under|
|    United States|            Ireland|  344|   upper|
|            Egypt|      United States|   15|   upper|
|    United States|              India|   62|   upper|
+-----------------+-------------------+-----+--------+
only showing top 5 rows



In [15]:
df5 = df4.withColumn('withInColumn',expr("ORIGIN_COUNTRY_NAME==DEST_COUNTRY_NAME"))
df5.show(5)

+-----------------+-------------------+-----+--------+------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|Category|withInColumn|
+-----------------+-------------------+-----+--------+------------+
|    United States|            Romania|   15|   upper|       false|
|    United States|            Croatia|    1|   under|       false|
|    United States|            Ireland|  344|   upper|       false|
|            Egypt|      United States|   15|   upper|       false|
|    United States|              India|   62|   upper|       false|
+-----------------+-------------------+-----+--------+------------+
only showing top 5 rows



In [16]:
df5.groupBy("withInColumn").count().show()

+------------+-----+
|withInColumn|count|
+------------+-----+
|        true|    1|
|       false|  255|
+------------+-----+



## Projection 과 Filter
```
SELECT a,b,c # projection > column > Transformation select('col name')
FROM Table A 
WHERE a>10 #filter > Row > Transformation where('condition')
```

In [17]:
#filter 
df6 = df5.where('count<5')
df6.show()
df6.count()

+--------------------+-------------------+-----+--------+------------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|Category|withInColumn|
+--------------------+-------------------+-----+--------+------------+
|       United States|            Croatia|    1|   under|       false|
|       United States|          Singapore|    1|   under|       false|
|             Moldova|      United States|    1|   under|       false|
|               Malta|      United States|    1|   under|       false|
|             Algeria|      United States|    4|   under|       false|
|       United States|          Gibraltar|    1|   under|       false|
|Saint Vincent and...|      United States|    1|   under|       false|
|            Suriname|      United States|    1|   under|       false|
|       United States|             Cyprus|    1|   under|       false|
|       United States|           Malaysia|    3|   under|       false|
|            Thailand|      United States|    3|   under|       false|
|     

46

In [18]:
df6 = df5.where('count<5').where("ORIGIN_COUNTRY_NAME != 'United States'")
df6.show()
df6.count()

+-----------------+-------------------+-----+--------+------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|Category|withInColumn|
+-----------------+-------------------+-----+--------+------------+
|    United States|            Croatia|    1|   under|       false|
|    United States|          Singapore|    1|   under|       false|
|    United States|          Gibraltar|    1|   under|       false|
|    United States|             Cyprus|    1|   under|       false|
|    United States|           Malaysia|    3|   under|       false|
|    United States|            Vietnam|    2|   under|       false|
|    United States|            Estonia|    1|   under|       false|
|    United States|            Hungary|    3|   under|       false|
|    United States|           Thailand|    4|   under|       false|
|    United States|            Liberia|    2|   under|       false|
|    United States|              Malta|    2|   under|       false|
|    United States|          Lithuania|    1|   

20

In [19]:
#프로젝션, 필터링 연습 
df5 = df5.withColumn("count", col("count").cast("int")) # count 가 string 이네..? int 로 변경-- cast()
# df5.show(10)

#국내 여행이 아니면서 가장 횟수가 많은 ORIGIN_COUNTRY_NAME top 10을  추출해보세요
top10_trip = df5.where("ORIGIN_COUNTRY_NAME != DEST_COUNTRY_NAME") \
                .groupBy("ORIGIN_COUNTRY_NAME") \
                .sum("count") \
                .sort("sum(count)", ascending=False)
top10_trip.show(10)



+-------------------+----------+
|ORIGIN_COUNTRY_NAME|sum(count)|
+-------------------+----------+
|      United States|     41964|
|             Canada|      8483|
|             Mexico|      7187|
|     United Kingdom|      1970|
|              Japan|      1496|
| Dominican Republic|      1420|
|            Germany|      1336|
|        The Bahamas|       986|
|             France|       952|
|              China|       920|
+-------------------+----------+
only showing top 10 rows





In [20]:
#국내 여행이 아니면서 가장 횟수가 적은 ORIGIN_COUNTRY_NAME top 10을  추출해보세요
top10_trip_least = df5.where("ORIGIN_COUNTRY_NAME != DEST_COUNTRY_NAME")\
                .groupBy("ORIGIN_COUNTRY_NAME").count()\
                .sort("count", ascending=True)
top10_trip_least.show(10)



+-------------------+-----+
|ORIGIN_COUNTRY_NAME|count|
+-------------------+-----+
|            Senegal|    1|
|             Sweden|    1|
|           Paraguay|    1|
|             Russia|    1|
|           Anguilla|    1|
|           Kiribati|    1|
|             Guyana|    1|
|          Singapore|    1|
|           Malaysia|    1|
|        Philippines|    1|
+-------------------+-----+
only showing top 10 rows





In [21]:
#도착국가별 count 총합이 가장 많은 top10을 추출해 보세요
df7 = df5.groupBy("DEST_COUNTRY_NAME").sum("count").sort('sum(count)',ascending = False)
df7.show(10)



+------------------+----------+
| DEST_COUNTRY_NAME|sum(count)|
+------------------+----------+
|     United States|    411352|
|            Canada|      8399|
|            Mexico|      7140|
|    United Kingdom|      2025|
|             Japan|      1548|
|           Germany|      1468|
|Dominican Republic|      1353|
|       South Korea|      1048|
|       The Bahamas|       955|
|            France|       935|
+------------------+----------+
only showing top 10 rows



                                                                                

In [22]:
#카테고리별 분석
df6 = df5.withColumn("count", col("count").cast("int"))
df6.groupby("Category").sum("count").show()

+--------+----------+
|Category|sum(count)|
+--------+----------+
|   under|        91|
|   upper|    453225|
+--------+----------+



In [23]:
#비행 횟수가 1000 이상 필터링
df6.filter(df6['count']>=1000).show()

+------------------+-------------------+------+--------+------------+
| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|Category|withInColumn|
+------------------+-------------------+------+--------+------------+
|            Mexico|      United States|  7140|   upper|       false|
|     United States| Dominican Republic|  1420|   upper|       false|
|     United States|      United States|370002|   upper|        true|
|           Germany|      United States|  1468|   upper|       false|
|            Canada|      United States|  8399|   upper|       false|
|Dominican Republic|      United States|  1353|   upper|       false|
|             Japan|      United States|  1548|   upper|       false|
|     United States|            Germany|  1336|   upper|       false|
|     United States|             Mexico|  7187|   upper|       false|
|    United Kingdom|      United States|  2025|   upper|       false|
|     United States|              Japan|  1496|   upper|       false|
|     United States|

In [24]:
#상위 10 목적지 국가 
df6.groupBy("DEST_COUNTRY_NAME").sum("count").sort("sum(count)", ascending = False).show(10)

+------------------+----------+
| DEST_COUNTRY_NAME|sum(count)|
+------------------+----------+
|     United States|    411352|
|            Canada|      8399|
|            Mexico|      7140|
|    United Kingdom|      2025|
|             Japan|      1548|
|           Germany|      1468|
|Dominican Republic|      1353|
|       South Korea|      1048|
|       The Bahamas|       955|
|            France|       935|
+------------------+----------+
only showing top 10 rows





In [25]:
#spark SQL
df.createOrReplaceTempView("mobility_data")
spark.sql("select * from mobility_data").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [27]:
spark.stop()

# 집계 함수

In [28]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SecondSparkSessionApp").getOrCreate()
spark

In [37]:
df = spark.read.format("csv")\
    .option("header",'true')\
    .option('inferSchema', 'true')\
    .load("data/emp.csv")

In [40]:
df.printSchema()

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = true)
 |-- deptno: integer (nullable = true)



In [41]:
df.show()

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250|1400|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|null|    10|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000|null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|
| 7876| ADAMS|    CLERK|7788|1987-05-23|1100|null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950|null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|null|    10|
| 9292|  JACK|

In [39]:
df.select('ename','deptno').show()

+------+------+
| ename|deptno|
+------+------+
| SMITH|    20|
| ALLEN|    30|
|  WARD|    30|
| JONES|    20|
|MARTIN|    30|
| BLAKE|    30|
| CLARK|    10|
| SCOTT|    20|
|  KING|    10|
|TURNER|    30|
| ADAMS|    20|
| JAMES|    30|
|  FORD|    20|
|MILLER|    10|
|  JACK|    70|
+------+------+



In [42]:
df.select('ename','deptno').where('deptno=20').show()

+-----+------+
|ename|deptno|
+-----+------+
|SMITH|    20|
|JONES|    20|
|SCOTT|    20|
|ADAMS|    20|
| FORD|    20|
+-----+------+



In [79]:
# 카운트 집계
#null 값이 제외
from pyspark.sql.functions import count, countDistinct, approx_count_distinct
df.select(count('job')).show()

+----------+
|count(job)|
+----------+
|        15|
+----------+



In [80]:
#null 값이 포함
df.selectExpr('count(comm)').show()

+-----------+
|count(comm)|
+-----------+
|          4|
+-----------+



In [81]:
df.select('job').distinct().show()

+---------+
|      job|
+---------+
|  ANALYST|
| SALESMAN|
|    CLERK|
|  MANAGER|
|PRESIDENT|
+---------+



In [82]:
df.select('job').distinct().count() #정확도 



5

In [83]:
df.select(countDistinct('job')).show() #근사치

+-------------------+
|count(DISTINCT job)|
+-------------------+
|                  5|
+-------------------+





In [84]:
df.select(approx_count_distinct('job', 0.1)).show() #지정한 오차가 있어도 성능면에서 유리한 연산 

+--------------------------+
|approx_count_distinct(job)|
+--------------------------+
|                         5|
+--------------------------+



In [122]:
from pyspark.sql.functions import *

In [123]:
df.select(first('ename'),last('ename')).show()

+------------+-----------+
|first(ename)|last(ename)|
+------------+-----------+
|       SMITH|       JACK|
+------------+-----------+



In [124]:
# min, max
df.select(min('mgr'), max('mgr')).show()

+--------+--------+
|min(mgr)|max(mgr)|
+--------+--------+
|    7566|    7902|
+--------+--------+



In [125]:
df.select(min('hiredate'), max('hiredate')).show()

+-------------+-------------+
|min(hiredate)|max(hiredate)|
+-------------+-------------+
|   1980-12-17|   1987-05-23|
+-------------+-------------+



In [126]:
df.select(count("empno"), count("*"), max("ename"), min("ename")).show()

+------------+--------+----------+----------+
|count(empno)|count(1)|max(ename)|min(ename)|
+------------+--------+----------+----------+
|          15|      15|      WARD|     ADAMS|
+------------+--------+----------+----------+



In [127]:
df.select(sum("sal")).show() #sal 컬럽의 총합 

+--------+
|sum(sal)|
+--------+
|   32225|
+--------+



In [128]:
# sal 컬럼값의 중복을 제거하고 합산 

df.selectExpr('sum(distinct sal)').show()



+-----------------+
|sum(DISTINCT sal)|
+-----------------+
|            27975|
+-----------------+



                                                                                

In [141]:
#alias 
dfs = df.select(
                count('sal').alias('total_tx'),
                sum('sal').alias('total_salary'),
                avg('sal').alias('avg_salary'),
                expr('mean(sal)').alias('mean_salary')
)
dfs.show()

+--------+------------+------------------+------------------+
|total_tx|total_salary|        avg_salary|       mean_salary|
+--------+------------+------------------+------------------+
|      15|       32225|2148.3333333333335|2148.3333333333335|
+--------+------------+------------------+------------------+



In [146]:
#round (data, 자릿수)
dfs = df.select(
                count('sal').alias('total_tx'),
                sum('sal').alias('total_salary'),
                round(avg('sal'),2).alias('avg_salary'),
                round(expr('mean(sal)'),2).alias('mean_salary'),
        )
dfs.show()

+--------+------------+----------+-----------+
|total_tx|total_salary|avg_salary|mean_salary|
+--------+------------+----------+-----------+
|      15|       32225|   2148.33|    2148.33|
+--------+------------+----------+-----------+



# 그룹화

In [142]:
df.groupBy('job').count().show()

+---------+-----+
|      job|count|
+---------+-----+
|  ANALYST|    2|
| SALESMAN|    4|
|    CLERK|    5|
|  MANAGER|    3|
|PRESIDENT|    1|
+---------+-----+



In [150]:
#agg() 집계 함수 적용

dfs = df.groupBy('job').agg(expr('avg(sal) as SAL_AVG'))
dfs.show()

+---------+------------------+
|      job|           SAL_AVG|
+---------+------------------+
|  ANALYST|            3000.0|
| SALESMAN|            1400.0|
|    CLERK|            1470.0|
|  MANAGER|2758.3333333333335|
|PRESIDENT|            5000.0|
+---------+------------------+



In [166]:
#표준편차 구하기 
# 1. sql.function stddev
# 2.sql expression

#1번째 방법
dfs=df.groupBy('job').agg(round(stddev_pop('sal'),2)).alias('SAL_STDDEV')
dfs.show()

+---------+-------------------------+
|      job|round(stddev_pop(sal), 2)|
+---------+-------------------------+
|  ANALYST|                      0.0|
| SALESMAN|                   154.11|
|    CLERK|                   880.68|
|  MANAGER|                   223.92|
|PRESIDENT|                      0.0|
+---------+-------------------------+



In [167]:
#2번째 방법
dfs = df.groupBy('job').agg(round(expr('stddev_pop(sal) as SAL_STDDEV'),2))
dfs.show()

+---------+-----------------------------------------+
|      job|round(stddev_pop(sal) AS `SAL_STDDEV`, 2)|
+---------+-----------------------------------------+
|  ANALYST|                                      0.0|
| SALESMAN|                                   154.11|
|    CLERK|                                   880.68|
|  MANAGER|                                   223.92|
|PRESIDENT|                                      0.0|
+---------+-----------------------------------------+

