In [1]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists; otherwise create a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [3]:
# Read the data.

path = "data/flight-data/csv/2015-summary.csv"

# Reading is a lazily evaluated transformation, so the number of rows and
# columns is not known at the point where the read is declared.
# The one exception is schema inference: Spark reads a small amount of the
# data up front in order to infer each column's data type.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(path)
)

In [4]:
# take(3) is an action: it returns the first three rows as a list of Row objects.
flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [5]:
"""
  sort: a wide dependency transfomation.
  explain: seeing the dataframe's lineage
"""

flightData2015.sort("count").explain()

== Physical Plan ==
*(2) Sort [count#12 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#12 ASC NULLS FIRST, 200)
   +- *(1) FileScan csv [DEST_COUNTRY_NAME#10,ORIGIN_COUNTRY_NAME#11,count#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/user/Documents/strange-study/ss-spark/week1/youn/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


In [6]:
spark.conf.set("spark.sql.shuffle.partitions", "5")

flightData2015.sort("count").take(2)

%time

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.81 µs


In [7]:
spark.conf.set("spark.sql.shuffle.partitions", "10")

flightData2015.sort("count").take(2)

%time

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs


In [8]:
"""
   스파크는 SQL쿼리를 Dataframe 코드와 같은 실행 계획으로 컴파일 하므로,
   둘 사이의 성능 차이는 없다.
"""

flightData2015.createOrReplaceTempView("flight_data_2015")

In [9]:
# Per-destination row count, written as a SQL query against the temp view.
sqlWay = spark.sql(
    """
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
"""
)

# The same aggregation written with the DataFrame API.
dataFrameWay = (
    flightData2015
    .groupby("DEST_COUNTRY_NAME")
    .count()
)

# Print both physical plans, SQL version first, so they can be compared.
sqlWay.explain()
dataFrameWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#10, 10)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#10] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/user/Documents/strange-study/ss-spark/week1/youn/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>
== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#10, 10)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#10] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/user/Documents/strange-study/ss-spark/week1/youn/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_N

In [10]:
# Import the functions module as a namespace: `from pyspark.sql.functions
# import max` would replace Python's builtin `max` in every later cell.
from pyspark.sql import functions as F

# Maximum of the `count` column via SQL (printed explicitly, because only the
# last expression of a cell is displayed automatically).
print(spark.sql("SELECT max(count) FROM flight_data_2015").take(1))

# The same maximum via the DataFrame API, shown as the cell's result.
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]


[Row(max(count)=370002)]

In [11]:
# Find the top five destination countries by total flight count.

# Import the functions module as a namespace: `from pyspark.sql.functions
# import sum` would replace Python's builtin `sum` in every later cell.
from pyspark.sql import functions as F

top5 = (
    flightData2015
    .groupby("DEST_COUNTRY_NAME")
    .agg(F.sum("count").alias("total_count"))
    .sort(F.desc("total_count"))
    .limit(5)
)

# Physical plan first, then the five result rows.
top5.explain()

top5.show(5)

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[total_count#56L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#10,total_count#56L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[sum(cast(count#12 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#10, 10)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[partial_sum(cast(count#12 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#10,count#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/user/Documents/strange-study/ss-spark/week1/youn/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>
+-----------------+-----------+
|DEST_COUNTRY_NAME|total_count|
+-----------------+-----------+
|    United States|     411352|
|           Canada|       8399|
|           Mexico|       7140|
|   United Kingdom|       2025|
|            Japan|       1548|
+-----------------+-----------+



In [12]:
# Top five destinations by total flight count, written as SQL against the
# flight_data_2015 temp view.
maxSql = spark.sql(
    """
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
"""
)

# Physical plan first, then the result rows.
maxSql.explain()
maxSql.show()

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[aggOrder#72L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#10,destination_total#70L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[sum(cast(count#12 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#10, 10)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[partial_sum(cast(count#12 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#10,count#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/user/Documents/strange-study/ss-spark/week1/youn/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>
+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|            

In [13]:
from pyspark.sql.functions import desc

# DataFrame-API version of the top-five-destinations query.
# Only the physical plan is printed; no rows are collected here.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)


== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#94L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#10,destination_total#94L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[sum(cast(count#12 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#10, 10)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[partial_sum(cast(count#12 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#10,count#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/user/Documents/strange-study/ss-spark/week1/youn/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


'\n  실행계획을 살펴보면, sum이 두 번 발생하는 것을 확인할 수 있는데,\n  sum이 두 번 발생하는 이유는 \n  먼저 파티션 별로 합을 구하고, 파티션끼리 합을 구하기 때문이다.\n  (sum이 commutative를 가지고 있어서라고 설명하는데, \n  교환 법칙이 성립해서 연산 순서를 변경할 수 있다는 것을 말한다.)\n'

In [None]:
"""
  실행계획을 살펴보면, sum이 두 번 발생하는 것을 확인할 수 있는데,
  sum이 두 번 발생하는 이유는 
  먼저 파티션 별로 합을 구하고, 파티션끼리 합을 구하기 때문이다.
  (sum이 commutative를 가지고 있어서라고 설명하는데, 
  교환 법칙이 성립해서 연산 순서를 변경할 수 있다는 것을 말한다.)
"""

In [17]:
# Two ranges over the same interval [2, 10_000_000) with strides 2 and 4.
df1 = spark.range(start=2, end=10_000_000, step=2)
df2 = spark.range(start=2, end=10_000_000, step=4)

# Repartition each side to a different partition count (5 and 6).
step1 = df1.repartition(numPartitions=5)
step12 = df2.repartition(numPartitions=6)

# Scale the first side's ids by 5, join both sides on `id`, then sum the ids
# that survive the join.
step2 = step1.selectExpr("id * 5 as id")
step3 = step2.join(step12, on=["id"])
step4 = step3.selectExpr("sum(id)")

# Show the physical plan of the whole pipeline.
step4.explain()

== Physical Plan ==
*(7) HashAggregate(keys=[], functions=[sum(id#124L)])
+- Exchange SinglePartition
   +- *(6) HashAggregate(keys=[], functions=[partial_sum(id#124L)])
      +- *(6) Project [id#124L]
         +- *(6) SortMergeJoin [id#124L], [id#120L], Inner
            :- *(3) Sort [id#124L ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(id#124L, 10)
            :     +- *(2) Project [(id#118L * 5) AS id#124L]
            :        +- Exchange RoundRobinPartitioning(5)
            :           +- *(1) Range (2, 10000000, step=2, splits=12)
            +- *(5) Sort [id#120L ASC NULLS FIRST], false, 0
               +- Exchange hashpartitioning(id#120L, 10)
                  +- Exchange RoundRobinPartitioning(6)
                     +- *(4) Range (2, 10000000, step=4, splits=12)


In [16]:
# Print the physical plan for step4.
# NOTE(review): this repeats the explain() call at the end of the cell that
# defines step4, and its execution count (16) is lower than that cell's (17),
# so it was run out of order. Consider removing it after confirming it is not needed.
step4.explain()

== Physical Plan ==
*(7) HashAggregate(keys=[], functions=[sum(id#110L)])
+- Exchange SinglePartition
   +- *(6) HashAggregate(keys=[], functions=[partial_sum(id#110L)])
      +- *(6) Project [id#110L]
         +- *(6) SortMergeJoin [id#110L], [id#106L], Inner
            :- *(3) Sort [id#110L ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(id#110L, 10)
            :     +- *(2) Project [(id#104L * 5) AS id#110L]
            :        +- Exchange RoundRobinPartitioning(5)
            :           +- *(1) Range (2, 10000000, step=2, splits=12)
            +- *(5) Sort [id#106L ASC NULLS FIRST], false, 0
               +- Exchange hashpartitioning(id#106L, 10)
                  +- Exchange RoundRobinPartitioning(6)
                     +- *(4) Range (2, 10000000, step=4, splits=12)
