# Columnar Read Optimization

In [1]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .appName("Parquet Columnar Optimization")
    .master("local[*]")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark

In [3]:
# Let's create a simple Python decorator, get_time, to measure execution time.
# If you don't know about Python decorators, check out: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Run ``func`` once, immediately, and print how long the call took.

    Intended to be applied as ``@get_time`` to a zero-argument function:
    the function is executed at decoration time and one line of the form
    ``Execution time: <elapsed> ms`` is printed.

    Args:
        func: Callable taking no arguments. Its return value is discarded.

    Returns:
        ``func`` itself, unchanged, so the decorated name stays bound to a
        callable rather than to ``None``.
    """
    def inner_get_time() -> str:
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments while func is running.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return (f"Execution time: {(end_time - start_time)*1000} ms")
    print(inner_get_time())
    # Hand the function back so the decorated name remains usable afterwards.
    return func

In [1]:
%%sh
# Long listing (-l) of the dataset directory with human-readable sizes (-h),
# sorted by modification time (-t) in reverse order, i.e. oldest first (-r).
ls -lhtr dataset/sales.parquet/

total 118M
-rw-r--r-- 1 jovyan users    0 Oct 12 12:24 _SUCCESS
-rw-r--r-- 1 jovyan users 118M Oct 12 12:24 part-00000-ca408bb4-a0c2-4ae1-a794-5d46f4655fae-c000.snappy.parquet


In [15]:
# Baseline: read the Parquet dataset and let Spark take the schema from the files.
df_sales = (
    spark.read
    .format("parquet")
    .load("dataset/sales.parquet")
)

df_sales.printSchema()


@get_time
def x():
    """Write df_sales to the 'noop' format; executed (and timed) as soon as it is decorated."""
    df_sales.write.format("noop").mode("overwrite").save()

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: string (nullable = true)
 |-- retailer_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- city_id: string (nullable = true)

Execution time: 2433.0270290374756 ms


In [20]:
# Same read, but with the full schema declared up front instead of inferred.
_schema = "transacted_at STRING, trx_id STRING, retailer_id STRING, description STRING, amount STRING, city_id STRING"

df_sales = (
    spark.read
    .schema(_schema)
    .format("parquet")
    .load("dataset/sales.parquet")
)

df_sales.printSchema()


@get_time
def x():
    """Write df_sales to the 'noop' format; executed (and timed) as soon as it is decorated."""
    df_sales.write.format("noop").mode("overwrite").save()

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: string (nullable = true)
 |-- retailer_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- city_id: string (nullable = true)

Execution time: 2173.130989074707 ms


In [21]:
# Declare only the three columns we actually need in the read schema.
_required_schema = "transacted_at STRING, trx_id STRING, amount STRING"

df_sales = (
    spark.read
    .schema(_required_schema)
    .format("parquet")
    .load("dataset/sales.parquet")
)

df_sales.printSchema()


@get_time
def x():
    """Write df_sales to the 'noop' format; executed (and timed) as soon as it is decorated."""
    df_sales.write.format("noop").mode("overwrite").save()

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: string (nullable = true)
 |-- amount: string (nullable = true)

Execution time: 1123.8584518432617 ms


In [22]:
# Declare the full schema, then keep only the needed columns with select().
_schema = "transacted_at STRING, trx_id STRING, retailer_id STRING, description STRING, amount STRING, city_id STRING"

df_sales = (
    spark.read
    .schema(_schema)
    .parquet("dataset/sales.parquet")
    .select("transacted_at", "trx_id", "amount")
)

df_sales.printSchema()


@get_time
def x():
    """Write df_sales to the 'noop' format; executed (and timed) as soon as it is decorated."""
    df_sales.write.format("noop").mode("overwrite").save()

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: string (nullable = true)
 |-- amount: string (nullable = true)

Execution time: 1142.024278640747 ms


In [24]:
# Alternative to select(): declare the full schema and drop() the unwanted columns.
_schema = "transacted_at STRING, trx_id STRING, retailer_id STRING, description STRING, amount STRING, city_id STRING"

df_sales = (
    spark.read
    .schema(_schema)
    .parquet("dataset/sales.parquet")
    .drop("retailer_id", "description", "city_id")
)

df_sales.printSchema()


@get_time
def x():
    """Write df_sales to the 'noop' format; executed (and timed) as soon as it is decorated."""
    df_sales.write.format("noop").mode("overwrite").save()

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: string (nullable = true)
 |-- amount: string (nullable = true)

Execution time: 1156.738519668579 ms
