### Optimize Parquet files

In [1]:
# Create Spark Session

from pyspark.sql import SparkSession

# Get (or create) the session, configured with the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("Optimize Parquet Files")
    .master("local[*]")
    .getOrCreate()
)

# Keep the session as the cell's last expression so the notebook displays it.
spark

In [2]:
# Function to generate random integers between 0 and 10 (inclusive)
import random

def generate_data(cnt):
    """Return a list of ``cnt`` one-element lists, each holding a random int in 0..10.

    Args:
        cnt: Number of rows to generate.

    Returns:
        A list such as ``[[3], [10], [0], ...]`` of length ``cnt``.
    """
    # Candidate values 0 through 10; built once and reused for every draw.
    candidates = range(0, 11)
    return [[random.choice(candidates)] for _ in range(0, cnt)]

In [3]:
# Let's create a simple Python decorator, {get_time}, to measure execution time
# If you don't know about Python decorators, check out: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Run ``func`` once immediately and print how long the call took.

    Meant to be applied as a decorator to a zero-argument function: the
    function is executed at decoration time and a line of the form
    ``Execution time: <milliseconds> ms`` is printed.

    Args:
        func: Callable taking no arguments; its return value is discarded.

    Returns:
        ``func`` itself, unchanged, so the decorated name stays bound to the
        function instead of being rebound to ``None``. Calling it again later
        runs it without timing.
    """
    def inner_get_time() -> str:
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock changes.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return (f"Execution time: {(end_time - start_time)*1000} ms")
    print(inner_get_time())
    return func

In [4]:
# Generate dataframe with 10M random numbers
# Schema string: a single integer column named "values"
_schema = "values int"

# 10,000,000 single-value rows produced by generate_data
_data = generate_data(10_000_000)

# Build the Spark dataframe from the generated rows
df = spark.createDataFrame(_data, schema=_schema)

In [5]:
# Write data in Parquet format

@get_time
def x():
    """Write the original (unsorted) dataframe to location 1 as Parquet."""
    # Overwrite any previous output so this cell can be re-run; without an
    # explicit mode the save fails when the target path already exists.
    df.write.format("parquet").mode("overwrite").save("dataset/num/1/num.parquet")

Execution time: 4398.865699768066 ms


In [6]:
# Sort the data
from pyspark.sql.functions import col, asc
# Order rows by the "values" column, ascending, using the imported asc() helper.
df_fixed = df.orderBy(asc("values"))

In [7]:
# Write in Parquet format
@get_time
def x():
    """Write the sorted dataframe to location 2 as Parquet."""
    # Overwrite any previous output so this cell can be re-run; without an
    # explicit mode the save fails when the target path already exists.
    df_fixed.write.format("parquet").mode("overwrite").save("dataset/num/2/num.parquet")

Execution time: 10929.477453231812 ms


In [8]:
# Read the data from location 1 (non-optimized)
@get_time
def x():
    # Feed the read straight into a "noop"-format write; chaining avoids a
    # local name that would shadow the notebook-level df.
    # NOTE(review): the noop sink presumably forces a full read without
    # persisting anything — confirm against the Spark docs.
    (
        spark.read.parquet("dataset/num/1/num.parquet")
        .write.format("noop")
        .mode("overwrite")
        .save()
    )

Execution time: 791.3680076599121 ms


In [12]:
# Read the data from location 2 (optimized)
@get_time
def x():
    # Feed the read straight into a "noop"-format write; chaining avoids a
    # local name that would shadow the notebook-level df.
    # NOTE(review): the noop sink presumably forces a full read without
    # persisting anything — confirm against the Spark docs.
    (
        spark.read.parquet("dataset/num/2/num.parquet")
        .write.format("noop")
        .mode("overwrite")
        .save()
    )

Execution time: 568.8269138336182 ms
