#### PySpark Configurations ####

In [1]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) the Spark session for this notebook, connecting to the
# master at spark://spark-master:7077 and pinning the Spark UI to port 4040
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Ansh-Lamba-Apache-Spark-Optimization")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/20 20:07:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Adaptive Query Execution (AQE) is enabled by default; turn it off for this session
spark.conf.set("spark.sql.adaptive.enabled", False)

# Read the setting back to confirm the value now in effect
print(
    'Adaptive Query Execution (AQE) enabled:',
    spark.conf.get("spark.sql.adaptive.enabled"),
)

Adaptive Query Execution (AQE) enabled: false


#### Reading data from CSV file ####

In [4]:
# Root directory of the input datasets; file paths are built from it with f-strings.
# (This only defines the path string - no directory is created here.)
INPUT_DATA_ROOT = "/opt/spark-data/input/ansh-lamba"

In [14]:
# Read the CSV file, taking column names from the header row and letting
# Spark infer the column types
df_big_mart_sales = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(f"{INPUT_DATA_ROOT}/BigMart Sales.csv")
)

In [6]:
# Preview the first 5 rows as a pandas DataFrame
(
    df_big_mart_sales
    .limit(5)
    .toPandas()
)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [7]:
# Print the dataframe's column names, types and nullability as a tree
df_big_mart_sales.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [8]:
# Count the rows in the dataframe and print the total with thousands separators
print(f'Total records: {df_big_mart_sales.count():,}')

Total records: 8,523


In [15]:
# Report how many partitions currently back the dataframe
print(f'Number of Partitions: {df_big_mart_sales.rdd.getNumPartitions():,}')

Number of Partitions: 1


In [13]:
# spark.sql.files.maxPartitionBytes controls the partition size used when reading files.
# To experiment with 128 KB partitions, swap in:
#   spark.conf.set("spark.sql.files.maxPartitionBytes", 128 * 1024)

# Restore the default partition size (128 MB, i.e. 134217728 bytes)
spark.conf.set("spark.sql.files.maxPartitionBytes", 128 * 1024 * 1024)

In [16]:
# Redistribute the rows across 10 partitions.
# Note: this rebinds df_big_mart_sales, so later cells see the repartitioned dataframe.
df_big_mart_sales = df_big_mart_sales.repartition(numPartitions=10)

In [17]:
# Report the partition count again to confirm the repartitioning took effect
print(f'Number of Partitions: {df_big_mart_sales.rdd.getNumPartitions():,}')

Number of Partitions: 10


In [19]:
# Tag each row with the id of the partition it lives in and preview the first 5 rows
(
    df_big_mart_sales
    .withColumn('Partition_Id', spark_partition_id())
    .limit(5)
    .toPandas()
)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Partition_Id
0,DRG48,5.78,Low Fat,0.014555,Soft Drinks,145.2102,OUT046,1997,Small,Tier 1,Supermarket Type1,3062.0142,0
1,FDE24,14.85,Low Fat,0.093445,Baking Goods,141.0812,OUT035,2004,Small,Tier 2,Supermarket Type1,1139.8496,0
2,NCP18,12.15,Low Fat,0.028715,Household,151.9708,OUT018,2009,Medium,Tier 3,Supermarket Type2,3611.2992,0
3,FDX27,20.7,Regular,0.114582,Dairy,94.3436,OUT018,2009,Medium,Tier 3,Supermarket Type2,945.436,0
4,FDU02,13.35,Low Fat,0.102671,Dairy,228.6352,OUT049,1999,Medium,Tier 1,Supermarket Type1,3435.528,0


#### Writing to Parquet file ####

In [21]:
# Root directory for the notebook's output datasets (path string only; nothing is created here).
# No trailing slash: output paths are built as f"{OUTPUT_DATA_ROOT}/<name>", so a trailing
# slash would put a double slash in every path. This also matches INPUT_DATA_ROOT.
OUTPUT_DATA_ROOT = "/opt/spark-data/output/ansh-lamba"

# Save mode used by the writes below. MODES = APPEND, OVERWRITE, ERROR, IGNORE
# OVERWRITE keeps the notebook re-runnable: with APPEND every re-run adds another full
# copy of the data under the same path, so the record counts read back grow on each run.
MODE = "OVERWRITE"

In [22]:
# Persist the repartitioned dataframe as Parquet, using the configured save mode
(
    df_big_mart_sales
    .write
    .format("parquet")
    .mode(MODE)
    .save(f"{OUTPUT_DATA_ROOT}/big-mart-sales-partitions.parquet")
)

                                                                                

#### Reading from Parquet file ####

In [23]:
# Read the Parquet dataset back from disk.
# Parquet files store their own schema and have no header row, so the CSV-only
# 'inferSchema' and 'header' options do not apply and are not set here.
df_big_mart_sales_partition = (
    spark.read
    .format("parquet")
    .load(f"{OUTPUT_DATA_ROOT}/big-mart-sales-partitions.parquet")
)

In [25]:
# Preview the first 5 rows read back from Parquet
(
    df_big_mart_sales_partition
    .limit(5)
    .toPandas()
)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDU21,,Regular,0.134328,Snack Foods,35.0558,OUT019,1985,Small,Tier 1,Grocery Store,33.9558
1,FDN56,5.46,Regular,0.106968,Fruits and Vegetables,142.6786,OUT013,1987,High,Tier 3,Supermarket Type1,2311.6576
2,NCJ17,7.68,Low Fat,0.153178,Health and Hygiene,85.2224,OUT018,2009,Medium,Tier 3,Supermarket Type2,1278.336
3,FDG08,,Regular,0.289523,Fruits and Vegetables,172.0764,OUT019,1985,Small,Tier 1,Grocery Store,171.7764
4,NCZ41,19.85,Low Fat,0.064409,Health and Hygiene,126.7704,OUT035,2004,Small,Tier 2,Supermarket Type1,1752.3856


In [24]:
# Count the rows read back from Parquet and print the total with thousands separators
print(f'Total records: {df_big_mart_sales_partition.count():,}')

[Stage 14:>                                                         (0 + 5) / 5]

Total records: 8,523


                                                                                

In [26]:
# Keep only the rows whose Outlet_Location_Type is "Tier 1"
df_big_mart_sales_partition_filtered = df_big_mart_sales_partition.filter(
    col("Outlet_Location_Type") == "Tier 1"
)

In [27]:
# Preview the first 5 Tier 1 rows
(
    df_big_mart_sales_partition_filtered
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDU21,,Regular,0.134328,Snack Foods,35.0558,OUT019,1985,Small,Tier 1,Grocery Store,33.9558
1,FDG08,,Regular,0.289523,Fruits and Vegetables,172.0764,OUT019,1985,Small,Tier 1,Grocery Store,171.7764
2,FDI60,7.22,Regular,0.038381,Baking Goods,62.351,OUT049,1999,Medium,Tier 1,Supermarket Type1,885.514
3,FDZ12,9.17,Low Fat,0.102979,Baking Goods,144.947,OUT046,1997,Small,Tier 1,Supermarket Type1,4294.41
4,FDZ38,17.6,LF,0.008001,Dairy,170.4422,OUT046,1997,Small,Tier 1,Supermarket Type1,2759.0752


In [28]:
# Count the Tier 1 rows and print the total with thousands separators
print(f'Total records: {df_big_mart_sales_partition_filtered.count():,}')

[Stage 18:>                                                         (0 + 5) / 5]

Total records: 2,388


                                                                                

#### Scanning Optimization ####

In [29]:
# Columns used to partition the Parquet output on disk
partition_by_columns = ["Outlet_Location_Type"]

# Write the dataframe as Parquet, partitioned by the columns above
(
    df_big_mart_sales
    .write
    .format("parquet")
    .partitionBy(*partition_by_columns)
    .mode(MODE)
    .save(f"{OUTPUT_DATA_ROOT}/big-mart-sales-partitions-optimized.parquet")
)

                                                                                

In [32]:
# Read the partitioned Parquet dataset and keep only the Tier 1 rows.
# The filter is on Outlet_Location_Type, the column the data was partitioned by on
# write, which allows Spark to skip the partitions holding other values.
# Parquet stores its own schema and has no header row, so the CSV-only
# 'inferSchema' and 'header' options do not apply and are not set here.
df_big_mart_sales_partition_optimized = (
    spark.read
    .format("parquet")
    .load(f"{OUTPUT_DATA_ROOT}/big-mart-sales-partitions-optimized.parquet")
    .filter(col("Outlet_Location_Type") == "Tier 1")
)

In [33]:
# Preview the first 5 Tier 1 rows read from the partitioned dataset
(
    df_big_mart_sales_partition_optimized
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Type,Item_Outlet_Sales,Outlet_Location_Type
0,DRD60,15.7,Low Fat,0.03729,Soft Drinks,182.7634,OUT049,1999,Medium,Supermarket Type1,3453.5046,Tier 1
1,FDY19,19.75,Low Fat,0.041429,Fruits and Vegetables,117.2466,OUT049,1999,Medium,Supermarket Type1,2239.0854,Tier 1
2,NCE06,,Low Fat,0.160179,Household,160.2894,OUT019,1985,Small,Grocery Store,323.5788,Tier 1
3,NCL31,,Low Fat,0.210596,Others,144.747,OUT019,1985,Small,Grocery Store,143.147,Tier 1
4,FDZ44,,Low Fat,0.06781,Fruits and Vegetables,118.1808,OUT019,1985,Small,Grocery Store,234.3616,Tier 1


#### Optimized Joins ####

In [36]:
# Sample Siblings rows: one tuple per person, ending with the numeric country code
siblings = [
    (1, 'Kwaku Jude', 40, 'M', 10_090.50, 840),
    (2, 'Yaw David', 36, 'M', 9_001.10, 288),
    (3, 'Kofi Baffuor', 34, 'M', 8_200.99, 288),
    (4, 'Abena Salo', 32, 'F', 7_905.00, 288),
    (5, 'Abena Pat', 30, 'F', 7_005.19, 288),
]

# Sample Countries rows: (numeric country code, country name)
countries = [
    (840, 'USA'),
    (288, 'GHANA'),
]

In [40]:
# DDL-style schema strings (column name followed by its type) for the two sample datasets
siblings_schema = (
    'Id INT, Name STRING, Age INT, '
    'Gender STRING, Salary DOUBLE, CountryId INT'
)
countries_schema = (
    'CountryId INT, '
    'CountryName STRING'
)

In [41]:
# Turn the sample rows into Spark dataframes using the schema strings defined above
df_siblings = spark.createDataFrame(siblings, siblings_schema)
df_countries = spark.createDataFrame(countries, countries_schema)

In [42]:
# Preview up to 5 rows of the Siblings dataframe
(
    df_siblings
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,Id,Name,Age,Gender,Salary,CountryId
0,1,Kwaku Jude,40,M,10090.5,840
1,2,Yaw David,36,M,9001.1,288
2,3,Kofi Baffuor,34,M,8200.99,288
3,4,Abena Salo,32,F,7905.0,288
4,5,Abena Pat,30,F,7005.19,288


In [43]:
# Preview up to 5 rows of the Countries dataframe
(
    df_countries
    .limit(5)
    .toPandas()
)

Unnamed: 0,CountryId,CountryName
0,840,USA
1,288,GHANA


In [44]:
# Inner-join Siblings to Countries on CountryId, with no join hint.
# NOTE(review): labelled a merge join in this notebook - confirm the chosen strategy with .explain().
# The result keeps the CountryId column from both sides.
dfs_siblings = df_siblings.join(
    other=df_countries,
    on=df_siblings.CountryId == df_countries.CountryId,
    how="inner",
)

In [45]:
# Preview up to 5 rows of the joined dataframe
(
    dfs_siblings
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,Id,Name,Age,Gender,Salary,CountryId,CountryId.1,CountryName
0,1,Kwaku Jude,40,M,10090.5,840,840,USA
1,2,Yaw David,36,M,9001.1,288,288,GHANA
2,3,Kofi Baffuor,34,M,8200.99,288,288,GHANA
3,4,Abena Salo,32,F,7905.0,288,288,GHANA
4,5,Abena Pat,30,F,7005.19,288,288,GHANA


In [46]:
# Inner-join Siblings to Countries on CountryId, marking the Countries side
# with broadcast() so it is treated as the broadcast side of the join.
# The result keeps the CountryId column from both sides.
dfs_siblings_optimized = df_siblings.join(
    other=broadcast(df_countries),
    on=df_siblings.CountryId == df_countries.CountryId,
    how="inner",
)

In [47]:
# Preview up to 5 rows of the broadcast-joined dataframe
(
    dfs_siblings_optimized
    .limit(5)
    .toPandas()
)

Unnamed: 0,Id,Name,Age,Gender,Salary,CountryId,CountryId.1,CountryName
0,1,Kwaku Jude,40,M,10090.5,840,840,USA
1,2,Yaw David,36,M,9001.1,288,288,GHANA
2,3,Kofi Baffuor,34,M,8200.99,288,288,GHANA
3,4,Abena Salo,32,F,7905.0,288,288,GHANA
4,5,Abena Pat,30,F,7005.19,288,288,GHANA


#### Spark SQL Hints ####

In [48]:
# Register both dataframes as temporary views so they can be queried from Spark SQL
# (an existing view with the same name is replaced)
df_countries.createOrReplaceTempView("tbl_countries")
df_siblings.createOrReplaceTempView("tbl_siblings")

In [51]:
# Inner join of the two temp views in Spark SQL, without any join hint
sql_query = spark.sql(
    """
                      SELECT Id, Name, Age, Gender, Salary, CountryName 
                      FROM tbl_siblings sb 
                      INNER JOIN tbl_countries cs 
                      ON sb.CountryId = cs.CountryId 
                      """
)

# Preview up to 5 rows of the result
(
    sql_query
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,Id,Name,Age,Gender,Salary,CountryName
0,1,Kwaku Jude,40,M,10090.5,USA
1,2,Yaw David,36,M,9001.1,GHANA
2,3,Kofi Baffuor,34,M,8200.99,GHANA
3,4,Abena Salo,32,F,7905.0,GHANA
4,5,Abena Pat,30,F,7005.19,GHANA


In [52]:
# Join Siblings & Countries in Spark SQL - Broadcast JOIN via a SQL hint.
# The hint asks Spark to broadcast the countries side (alias cs).
# Note the '+': a hint must be written as /*+ ... */ directly after SELECT;
# a plain /* ... */ is just an SQL comment and has no effect on the plan.
sql_query_optimized = spark.sql("""
    SELECT /*+ BROADCAST(cs) */ Id, Name, Age, Gender, Salary, CountryName
    FROM tbl_siblings sb
    INNER JOIN tbl_countries cs
    ON sb.CountryId = cs.CountryId
""")

# Show data
sql_query_optimized.limit(5).toPandas()

                                                                                

Unnamed: 0,Id,Name,Age,Gender,Salary,CountryName
0,1,Kwaku Jude,40,M,10090.5,USA
1,2,Yaw David,36,M,9001.1,GHANA
2,3,Kofi Baffuor,34,M,8200.99,GHANA
3,4,Abena Salo,32,F,7905.0,GHANA
4,5,Abena Pat,30,F,7005.19,GHANA


#### Caching & Persistence ####