In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

spark = SparkSession\
    .builder\
    .appName("chapter-19-perf")\
    .getOrCreate()

In [2]:
import os
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

file_path = SPARK_BOOK_DATA_PATH + "/data/flight-data/csv/2015-summary.csv"

In [3]:
DF1 = spark.read.format("csv")\
  .option("inferSchema", "true")\
  .option("header", "true")\
  .load(file_path)

In [4]:
DF1.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [5]:
%%time
DF2 = DF1.groupBy("DEST_COUNTRY_NAME").count().collect()    # sys time = 3.38 ms 

CPU times: user 2.05 ms, sys: 3.38 ms, total: 5.43 ms
Wall time: 1.58 s


In [6]:
DF1.cache()

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]

In [7]:
%%time
DF2 = DF1.groupBy("DEST_COUNTRY_NAME").count().collect()    # sys time = 0.086 ms with cache

CPU times: user 8.15 ms, sys: 86 µs, total: 8.24 ms
Wall time: 846 ms


In [8]:
DF1.is_cached

True

In [9]:
DF1.unpersist()

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]

In [12]:
%%time
DF2 = DF1.groupBy("DEST_COUNTRY_NAME").count().collect()    # sys time = 3.8 ms after unpersist()

CPU times: user 3.44 ms, sys: 3.8 ms, total: 7.24 ms
Wall time: 491 ms


### [Apache Spark Optimization Toolkit](https://towardsdatascience.com/apache-spark-optimization-toolkit-17cf3e491992)

In [13]:
from pyspark.sql.functions import spark_partition_id

In [14]:
file_path = SPARK_BOOK_DATA_PATH + "/data/retail-data/by-day/2010-12-01.csv"
# Original loading code that does *not* cache DataFrame
df1 = spark.read.format("csv")\
  .option("inferSchema", "true")\
  .option("header", "true")\
  .load(file_path)

In [15]:
(
df1.withColumn("partition_id", spark_partition_id())
  .groupBy("partition_id")
  .count().show()
)

+------------+-----+
|partition_id|count|
+------------+-----+
|           0| 3108|
+------------+-----+



In [16]:
spark.stop()