In [1]:
# --- Standard library ---
import os
import time
import warnings

# Silence Python warnings for the rest of the session to keep cell output readable.
warnings.filterwarnings("ignore")

# --- Local Spark installation (machine-specific paths; adjust if different) ---
# A single constant feeds both SPARK_HOME and findspark.init() so the two can
# never point at different installations.
SPARK_HOME = "D:/Programs/spark/spark-3.5.6-bin-hadoop3"
os.environ["JAVA_HOME"] = "D:/Programs/Java"
os.environ["HADOOP_HOME"] = "D:/Programs/hadoop"
os.environ["SPARK_HOME"] = SPARK_HOME

# findspark.init() has to run BEFORE the first `import pyspark`: its job is to
# put the chosen installation's Python bindings on sys.path. Called after the
# import (as this cell originally did) it cannot affect which pyspark is loaded.
import findspark
findspark.init(SPARK_HOME)

# --- PySpark (imported only after findspark has prepared sys.path) ---
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

# --- Notebook display behaviour ---
# Echo the value of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Start (or reuse) a local Spark session that runs on every available core

spark = (
    SparkSession.builder
    .appName("ArrowOptimizations")
    .master("local[*]")
    .getOrCreate()
)

# Only ERROR-level Spark log messages should reach the notebook output
spark.sparkContext.setLogLevel("ERROR")
spark

In [3]:
# Look up the session's current Arrow setting for pandas conversion and report it
arrow_enabled = spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")
print("Arrow Enabled:", arrow_enabled)
# Arrow Enabled: false (Disabled by Default)

Arrow Enabled: false


In [4]:
# Relative path to the transactions dataset (parquet)
transactions_file = "../../data/transactions.parquet"
df_transactions = spark.read.parquet(transactions_file)

# Keep rows whose amt exceeds 10, then average amt per city
amount_over_threshold = F.col("amt") > 10
average_amount = F.avg("amt").alias("avg_amt")
df_transformed = (
    df_transactions
    .where(amount_over_threshold)
    .groupBy("city")
    .agg(average_amount)
)

# Preview the first five aggregated rows
df_transformed.show(5)

+---------+------------------+
|     city|           avg_amt|
+---------+------------------+
|san_diego|112.48630473111164|
|  chicago|112.45431524573912|
|   denver|112.45875942713126|
|   boston|112.64896775840012|
|  seattle| 112.5738240796807|
+---------+------------------+
only showing top 5 rows



In [None]:
# Arrow Disabled
# Baseline measurement: convert the aggregated DataFrame to pandas with the
# Arrow-based transfer path explicitly switched off.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
print("Arrow Enabled:", spark.conf.get("spark.sql.execution.arrow.pyspark.enabled"))
# Arrow Enabled: false

# time.perf_counter() is the clock Python provides for measuring short
# durations: it is monotonic and uses the highest available resolution, whereas
# time.time() follows the system clock and can jump if that clock is adjusted.
# NOTE(review): this is the first toPandas() call in the notebook, so the figure
# may include one-time costs that the later Arrow run does not pay -- consider a
# warm-up run before comparing the two numbers.
start = time.perf_counter()
pdf = df_transformed.toPandas()   # Convert to Pandas
end = time.perf_counter()

print("Time without Arrow:", end - start)

# Sample output from a previous run:
# Time without Arrow: 5.1042563915252686

Arrow Enabled: false
Time without Arrow: 5.1042563915252686


In [None]:
# Arrow Enabled
# Same conversion as the baseline cell, but with the Arrow-based transfer path
# switched on for this session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
print("Arrow Enabled:", spark.conf.get("spark.sql.execution.arrow.pyspark.enabled"))
# Arrow Enabled: true

# time.perf_counter() is the clock Python provides for measuring short
# durations: it is monotonic and uses the highest available resolution, whereas
# time.time() follows the system clock and can jump if that clock is adjusted.
# NOTE(review): no cache() on df_transformed is visible in this notebook, so the
# query is presumably re-executed here; the timing then covers query execution
# plus the pandas conversion, not the conversion alone.
start = time.perf_counter()
pdf = df_transformed.toPandas()
end = time.perf_counter()

print("Time with Arrow:", end - start)

# Sample output from a previous run:
# Time with Arrow: 0.3993797302246094

Arrow Enabled: true
Time with Arrow: 0.3993797302246094


In [7]:
# Stop the SparkSession created earlier in the notebook.
spark.stop()