In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = (
    SparkSession.builder.appName("Reading Complex Data Formats").getOrCreate()
)

In [2]:
# Read Parquet Sales data

df_parquet = spark.read.format("parquet").load("data/input/sales_total_parquet/*.parquet")

In [3]:
df_parquet.printSchema()

root
 |-- transacted_at: timestamp (nullable = true)
 |-- trx_id: integer (nullable = true)
 |-- retailer_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- city_id: integer (nullable = true)



In [4]:
df_parquet.show()

+-------------------+----------+-----------+--------------------+-------+----------+
|      transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+-------------------+----------+-----------+--------------------+-------+----------+
|2017-11-25 00:30:00|1995601912| 2077350195|Walgreen       11-25| 197.23| 216510442|
|2017-11-25 00:30:00|1734117021|  644879053|unkn    ppd id: 7...|   8.58| 930259917|
|2017-11-25 00:30:00|1734117022|  847200066|Wal-Mart  ppd id:...|1737.26|1646415505|
|2017-11-25 00:30:00|1734117030| 1953761884|Home Depot     pp...|  384.5| 287177635|
|2017-11-25 00:30:00|1734117089| 1898522855| Target        11-25|  66.33|1855530529|
|2017-11-25 00:30:00|1734117117|  997626433|Sears  ppd id: 85...| 298.87| 957346984|
|2017-11-25 00:30:00|1734117123| 1953761884|unkn   ppd id: 15...|  19.55|  45522086|
|2017-11-25 00:30:00|1734117152| 1429095612|Ikea     arc id: ...|   9.39|1268541279|
|2017-11-25 00:30:00|1734117153|  847200066|unkn        Kings...|

In [5]:
# Read ORC Sales data

df_orc = spark.read.format("orc").load("data/input/sales_total_orc/*.orc")

In [6]:
df_orc.printSchema()

root
 |-- transacted_at: timestamp (nullable = true)
 |-- trx_id: integer (nullable = true)
 |-- retailer_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- city_id: integer (nullable = true)



In [7]:
df_orc.show()

+-------------------+----------+-----------+--------------------+-------+----------+
|      transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+-------------------+----------+-----------+--------------------+-------+----------+
|2017-11-24 19:00:00|1995601912| 2077350195|Walgreen       11-25| 197.23| 216510442|
|2017-11-24 19:00:00|1734117021|  644879053|unkn    ppd id: 7...|   8.58| 930259917|
|2017-11-24 19:00:00|1734117022|  847200066|Wal-Mart  ppd id:...|1737.26|1646415505|
|2017-11-24 19:00:00|1734117030| 1953761884|Home Depot     pp...|  384.5| 287177635|
|2017-11-24 19:00:00|1734117089| 1898522855| Target        11-25|  66.33|1855530529|
|2017-11-24 19:00:00|1734117117|  997626433|Sears  ppd id: 85...| 298.87| 957346984|
|2017-11-24 19:00:00|1734117123| 1953761884|unkn   ppd id: 15...|  19.55|  45522086|
|2017-11-24 19:00:00|1734117152| 1429095612|Ikea     arc id: ...|   9.39|1268541279|
|2017-11-24 19:00:00|1734117153|  847200066|unkn        Kings...|

In [8]:
# Benefits of Columnar Storage

# Lets create a simple Python decorator - {get_time} to get the execution timings
# If you dont know about Python decorators - check out : https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    def inner_get_time() -> str:
        start_time = time.time()
        func()
        end_time = time.time()
        return (f"Execution time: {(end_time - start_time)*1000} ms")
    print(inner_get_time())

In [10]:
@get_time
def x():
    df = spark.read.format("parquet").load("data/input/sales_total_parquet/*.parquet")
    df.count()

Execution time: 2103.475570678711 ms


In [12]:
@get_time
def x():
    df = spark.read.format("parquet").load("data/input/sales_total_parquet/*.parquet")
    df.select("trx_id").count()

Execution time: 969.0468311309814 ms


In [13]:
# BONUS TIP
# RECURSIVE READ
'''
sales_recursive
|__ sales_1\1.parquet
|__ sales_1\sales_2\2.
'''

'\nsales_recursive\n|__ sales_1\x01.parquet\n|__ sales_1\\sales_2\x02.\n'

In [15]:
df_1 = spark.read.format("parquet").load("data/input/sales_recursive/sales_1/1.parquet")
df_1.show()

+-------------------+----------+-----------+--------------------+------+---------+
|      transacted_at|    trx_id|retailer_id|         description|amount|  city_id|
+-------------------+----------+-----------+--------------------+------+---------+
|2017-11-24 19:00:00|1734117121|  644879053|unkn    ppd id: 7...|  8.58|930259917|
|               NULL|      NULL|       NULL|                NULL|  NULL|     NULL|
+-------------------+----------+-----------+--------------------+------+---------+



In [16]:
df_1 = spark.read.format("parquet").load("data/input/sales_recursive/sales_1/sales_2/2.parquet")
df_1.show()

+-------------------+----------+-----------+--------------------+------+--------+
|      transacted_at|    trx_id|retailer_id|         description|amount| city_id|
+-------------------+----------+-----------+--------------------+------+--------+
|2017-11-24 19:00:00|1734117123| 1953761884|Walgreen       11-25| 19.55|45522086|
|               NULL|      NULL|       NULL|                NULL|  NULL|    NULL|
+-------------------+----------+-----------+--------------------+------+--------+



In [18]:
# to recursively read the parquet files within recursive folders
df_all = spark.read.format("parquet").option("recursiveFileLookup", True).load("data/input/sales_recursive/")
df_all.show()

+-------------------+----------+-----------+--------------------+------+---------+
|      transacted_at|    trx_id|retailer_id|         description|amount|  city_id|
+-------------------+----------+-----------+--------------------+------+---------+
|2017-11-24 19:00:00|1734117121|  644879053|unkn    ppd id: 7...|  8.58|930259917|
|               NULL|      NULL|       NULL|                NULL|  NULL|     NULL|
|2017-11-24 19:00:00|1734117123| 1953761884|Walgreen       11-25| 19.55| 45522086|
|               NULL|      NULL|       NULL|                NULL|  NULL|     NULL|
+-------------------+----------+-----------+--------------------+------+---------+



In [19]:
spark.stop()