In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
import warnings
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import time

import os
# Set environment variables (local paths)
os.environ["JAVA_HOME"] = "D:/Programs/Java"
os.environ["HADOOP_HOME"] = "D:/Programs/hadoop"
os.environ["SPARK_HOME"] = "D:/Programs/spark/spark-3.5.6-bin-hadoop3"  # Adjust if different

import findspark
# Initialize findspark
findspark.init("D:/Programs/spark/spark-3.5.6-bin-hadoop3")

In [2]:
# Using default Java Serializer

spark = SparkSession.builder \
    .appName("Optimizations") \
    .master("local[*]") \
    .getOrCreate()

In [3]:
# Using Kryo Serializer

spark = (
    SparkSession.builder
    .appName("SerializationTest")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Setting up serialization is a static configuration
# Which means it can only be set at the time of creation of spark session
# If we try to alter the configuration after the creation of spark session dynamically
# Error!!
# spark.conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")

In [4]:
spark.sparkContext.setLogLevel("ERROR")
# spark.conf.get("spark.serializer") # throws error
# spark.sparkContext.getConf().get("spark.serializer") # this works
print("Serializer in use:", spark.sparkContext.getConf().get("spark.serializer"))
# Serializer in use: org.apache.spark.serializer.JavaSerializer (for java or can be None)
# Serializer in use: org.apache.spark.serializer.KryoSerializer (for kryo)
# For java you also might have to write
# spark.sparkContext.getConf().get("spark.serializer", "org.apache.spark.serializer.JavaSerializer"))

spark

Serializer in use: None


In [5]:
print("Current Serializer:", spark.sparkContext.getConf().get("spark.serializer", "org.apache.spark.serializer.JavaSerializer"))

Current Serializer: org.apache.spark.serializer.JavaSerializer


In [None]:
# Java Serialization

import pyspark.sql.functions as F
import time

start = time.time()

transactions_file = "../../data/transactions.parquet"
df_transactions = spark.read.parquet(transactions_file)

df_transformed = (
    df_transactions
    .filter(F.col("amt") > 10)
    .groupBy("city")
    .agg(F.avg("amt").alias("avg_amt"))
)

df_transformed.collect()  # trigger full computation

end = time.time()
print("Execution time with JavaSerializer:", end - start)

# Execution time with JavaSerializer: 14.865233659744263


[Row(city='san_diego', avg_amt=112.48630473111164),
 Row(city='chicago', avg_amt=112.45431524573912),
 Row(city='denver', avg_amt=112.45875942713126),
 Row(city='boston', avg_amt=112.64896775840012),
 Row(city='seattle', avg_amt=112.5738240796807),
 Row(city='los_angeles', avg_amt=112.65125815635012),
 Row(city='new_york', avg_amt=112.50231368218985),
 Row(city='san_francisco', avg_amt=112.67575526699807),
 Row(city='philadelphia', avg_amt=112.68416006667248),
 Row(city='portland', avg_amt=112.81980096459505)]

Execution time with JavaSerializer: 14.865233659744263


In [None]:
# Kryo Serialization

import pyspark.sql.functions as F
import time

start = time.time()

transactions_file = "../../data/transactions.parquet"
df_transactions = spark.read.parquet(transactions_file)

df_transformed = (
    df_transactions
    .filter(F.col("amt") > 10)
    .groupBy("city")
    .agg(F.avg("amt").alias("avg_amt"))
)

df_transformed.collect()  # trigger full computation

end = time.time()
print("Execution time with KryoSerializer:", end - start)

# Execution time with KryoSerializer: 6.044540643692017

[Row(city='san_diego', avg_amt=112.48630473111164),
 Row(city='chicago', avg_amt=112.45431524573912),
 Row(city='denver', avg_amt=112.45875942713126),
 Row(city='boston', avg_amt=112.64896775840012),
 Row(city='seattle', avg_amt=112.5738240796807),
 Row(city='los_angeles', avg_amt=112.65125815635012),
 Row(city='new_york', avg_amt=112.50231368218985),
 Row(city='san_francisco', avg_amt=112.67575526699807),
 Row(city='philadelphia', avg_amt=112.68416006667248),
 Row(city='portland', avg_amt=112.81980096459505)]

Execution time with KryoSerializer: 6.044540643692017
