# Tuning JDBC to Optimize SQL Reads

In [3]:
# Build (or reuse) the local Spark session used by the rest of the notebook

from pyspark.sql import SparkSession

# The SQLite JDBC driver is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Tuning JDBC")
    .config('spark.jars.packages', 'org.xerial:sqlite-jdbc:3.39.3.0')
    .master("local[*]")
    .getOrCreate()
)

# Bare last expression: display the session summary as the cell output
spark

In [8]:
# A small Python decorator, `get_time`, that reports how long a function takes to run.
# New to Python decorators? See: https://www.geeksforgeeks.org/decorators-in-python/
import functools
import time

def get_time(func):
    """Run ``func`` once, immediately, and print its execution time in milliseconds.

    The timed run happens at decoration time, so defining a function under
    ``@get_time`` executes it right away and prints
    ``Execution time: <elapsed> ms``.

    Args:
        func: Zero-argument callable to time.

    Returns:
        The timing wrapper. Calling it runs ``func`` again and returns the
        ``"Execution time: ... ms"`` string without printing it.
    """
    @functools.wraps(func)
    def inner_get_time() -> str:
        # perf_counter() is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted, which would skew the measurement.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return f"Execution time: {(end_time - start_time)*1000} ms"

    print(inner_get_time())
    # Return the wrapper so the decorated name stays callable instead of being
    # silently rebound to None.
    return inner_get_time

In [9]:
# Connection settings for reading the SQLite database over JDBC
table_name: str = "sales_csv"
driver: str = "org.sqlite.JDBC"
db_path: str = "dataset/jdbc/demo-sqlite.db"
# The JDBC URL is the "jdbc:sqlite:" prefix followed by the database file path
jdbc_url: str = f"jdbc:sqlite:{db_path}"

In [34]:
# Baseline: time a full read of the table with no partitioning options set
@get_time
def x():
    """Load the whole table over JDBC, write it to the "noop" format, then print its schema and partition count."""
    jdbc_reader = (
        spark.read
        .format("jdbc")
        .option("driver", driver)
        .option("url", jdbc_url)
        .option("dbtable", table_name)
    )
    sales = jdbc_reader.load()

    # Writing to "noop" presumably forces the full read without storing any output — TODO confirm
    sales.write.format("noop").mode("overwrite").save()
    sales.printSchema()
    partition_count = sales.rdd.getNumPartitions()
    print(f"Number of Partitons: {partition_count}")

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: integer (nullable = true)
 |-- retailer_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- city_id: integer (nullable = true)

Number of Partitons: 1
Execution time: 6098.56104850769 ms


In [25]:
# Find the lower and upper bounds of trx_id so the read can be parallelized on that column
df_full = (
    spark.read
    .format("jdbc")
    .option("driver", driver)
    .option("url", jdbc_url)
    .option("dbtable", table_name)
    .load()
)

# Show the smallest and largest trx_id as a single row
df_full.selectExpr(
    "min(trx_id) as min_trx_id",
    "max(trx_id) as max_trx_id",
).show()

+----------+----------+
|min_trx_id|max_trx_id|
+----------+----------+
|        20|2147474653|
+----------+----------+



In [60]:
# Default parallelism reported by the SparkContext. With the "local[*]" master used for this
# session it presumably equals the number of local cores available — TODO confirm.
spark.sparkContext.defaultParallelism

8

In [42]:
# Time the read again, now split on trx_id using lowerBound/upperBound and numPartitions
@get_time
def x():
    """Load the table over JDBC partitioned on trx_id, write it to the "noop" format, then print its schema and partition count."""
    jdbc_reader = (
        spark.read
        .format("jdbc")
        .option("driver", driver)
        .option("url", jdbc_url)
        .option("dbtable", table_name)
    )
    # Partitioning options: column to split on, its value range, and the number of partitions
    partitioned_reader = (
        jdbc_reader
        .option("partitionColumn", "trx_id")
        .option("lowerBound", 20)
        .option("upperBound", 2147474653)
        .option("numPartitions", 8)
    )
    sales = partitioned_reader.load()

    # Writing to "noop" presumably forces the full read without storing any output — TODO confirm
    sales.write.format("noop").mode("overwrite").save()
    sales.printSchema()
    partition_count = sales.rdd.getNumPartitions()
    print(f"Number of Partitons: {partition_count}")

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: integer (nullable = true)
 |-- retailer_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- city_id: integer (nullable = true)

Number of Partitons: 8
Execution time: 2504.1656494140625 ms


In [59]:
# Time the partitioned read once more, this time also setting the JDBC fetchsize option
@get_time
def x():
    """Load the table over JDBC partitioned on trx_id with a fetchsize of 8000, write it to the "noop" format, then print its schema and partition count."""
    jdbc_reader = (
        spark.read
        .format("jdbc")
        .option("driver", driver)
        .option("url", jdbc_url)
        .option("dbtable", table_name)
    )
    # Same partitioning options as the previous run, plus fetchsize
    tuned_reader = (
        jdbc_reader
        .option("partitionColumn", "trx_id")
        .option("lowerBound", 20)
        .option("upperBound", 2147474653)
        .option("numPartitions", 8)
        .option("fetchsize", 8000)
    )
    sales = tuned_reader.load()

    # Writing to "noop" presumably forces the full read without storing any output — TODO confirm
    sales.write.format("noop").mode("overwrite").save()
    sales.printSchema()
    partition_count = sales.rdd.getNumPartitions()
    print(f"Number of Partitons: {partition_count}")

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: integer (nullable = true)
 |-- retailer_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- city_id: integer (nullable = true)

Number of Partitons: 8
Execution time: 2303.252696990967 ms
