In [50]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook: local master,
# with the Spark UI pinned to port 4042.
spark = (
    SparkSession.builder
    .appName("modes_and_options")
    .master("local[*]")
    .config("spark.ui.port", "4042")
    .getOrCreate()
)
# Show where the Spark UI is being served, then display the session itself.
print(spark.sparkContext.uiWebUrl)
spark

http://gypsum-gpu171.unity.rc.umass.edu:4042


In [68]:
# Read every Parquet file matching the glob into a single DataFrame.
df_total = (
    spark.read
    .format("parquet")
    .load("data/sales_total_parquet/*.parquet")
)

In [69]:
# Print the column names, types and nullability of the Parquet-backed DataFrame.
df_total.printSchema()

root
 |-- transacted_at: timestamp (nullable = true)
 |-- trx_id: integer (nullable = true)
 |-- retailer_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- city_id: integer (nullable = true)



In [70]:
# Preview the first rows of the Parquet-backed DataFrame as a text table.
df_total.show()

+-------------------+----------+-----------+--------------------+-------+----------+
|      transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+-------------------+----------+-----------+--------------------+-------+----------+
|2017-12-27 19:00:00| 330765426|  887300947|Kroger   ccd id: ...|  33.56|2068475652|
|2017-11-26 21:00:00|1377679664| 1070485878|Amazon.com    ccd...|  24.43|1640819540|
|2017-12-12 23:00:00| 472018705| 2001148981|  unkn      Columbia|   1.24| 481821583|
|2017-05-19 19:00:00|1127671830|  847200066|            Wal-Mart|2155.48|2074005445|
|2017-11-17 21:00:00| 233137169|  847200066|            Wal-Mart|   4.13|2043825401|
|2017-12-15 12:00:00| 603124844|  887300947|Kroger   ccd id: ...|  31.92|1640819540|
|2017-11-08 12:00:00|1591888712|  143327090|  Menard       11-08|   42.3|2043825401|
|2017-12-23 12:00:00|1775468459|  887300947|Kroger  arc id: 1...|  284.8|2055198208|
|2017-09-01 13:00:00|1020833609| 2120842315|Burger King   ccd...|

In [78]:
# Read the ORC sales data: every file matching the glob goes into one DataFrame.
df_orc = (
    spark.read
    .format("orc")
    .load("data/sales_total_orc/*.orc")
)

In [79]:
# Print the column names, types and nullability of the ORC-backed DataFrame.
df_orc.printSchema()

root
 |-- transacted_at: timestamp (nullable = true)
 |-- trx_id: integer (nullable = true)
 |-- retailer_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- city_id: integer (nullable = true)



In [80]:
# Preview the first rows of the ORC-backed DataFrame as a text table.
df_orc.show()

+-------------------+----------+-----------+--------------------+-------+----------+
|      transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+-------------------+----------+-----------+--------------------+-------+----------+
|2017-12-27 19:00:00| 330765426|  887300947|Kroger   ccd id: ...|  33.56|2068475652|
|2017-11-26 21:00:00|1377679664| 1070485878|Amazon.com    ccd...|  24.43|1640819540|
|2017-12-12 23:00:00| 472018705| 2001148981|  unkn      Columbia|   1.24| 481821583|
|2017-05-19 19:00:00|1127671830|  847200066|            Wal-Mart|2155.48|2074005445|
|2017-11-17 21:00:00| 233137169|  847200066|            Wal-Mart|   4.13|2043825401|
|2017-12-15 12:00:00| 603124844|  887300947|Kroger   ccd id: ...|  31.92|1640819540|
|2017-11-08 12:00:00|1591888712|  143327090|  Menard       11-08|   42.3|2043825401|
|2017-12-23 12:00:00|1775468459|  887300947|Kroger  arc id: 1...|  284.8|2055198208|
|2017-09-01 13:00:00|1020833609| 2120842315|Burger King   ccd...|

In [88]:
# Benefits of Columnar Storage

# Let's create a simple Python decorator, `get_time`, to measure execution time.
# If you don't know about Python decorators, check out: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Run ``func`` once, print how long it took, and return ``func``.

    This is a "run at decoration time" helper: applying ``@get_time`` to a
    zero-argument function executes that function immediately and prints
    ``Execution time: <milliseconds> ms``.

    Args:
        func: A callable that takes no arguments. Its return value is ignored.

    Returns:
        The original ``func``, unchanged, so the decorated name stays bound to
        a callable (previously nothing was returned and the name became
        ``None``).
    """
    def inner_get_time() -> str:
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        func()
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        return f"Execution time: {elapsed_ms} ms"

    print(inner_get_time())
    return func

In [90]:
@get_time
def x():
    """Benchmark body: count the rows of the Parquet sales file."""
    sales = spark.read.format("parquet").load("data/sales_data.parquet")
    # No column is selected before the count.
    sales.count()

Execution time: 293.5824394226074 ms


In [91]:
@get_time
def x():
    """Benchmark body: count rows after selecting only the ``trx_id`` column."""
    sales = spark.read.format("parquet").load("data/sales_data.parquet")
    # Only trx_id is selected before the count.
    trx_ids = sales.select("trx_id")
    trx_ids.count()

Execution time: 187.99686431884766 ms


In [94]:
# Read the first Parquet file in data/sales_recursive/ on its own and display it.
df_1 = (
    spark.read
    .format("parquet")
    .load("data/sales_recursive/1.parquet")
)
df_1.show()

+-------------------+----------+-----------+--------------------+------+---------+
|      transacted_at|    trx_id|retailer_id|         description|amount|  city_id|
+-------------------+----------+-----------+--------------------+------+---------+
|2017-11-24 19:00:00|1734117021|  644879053|unkn    ppd id: 7...|  8.58|930259917|
+-------------------+----------+-----------+--------------------+------+---------+



In [95]:
# Read the second Parquet file on its own and display it.
# Note: this rebinds df_1, replacing the DataFrame loaded from 1.parquet.
df_1 = (
    spark.read
    .format("parquet")
    .load("data/sales_recursive/2.parquet")
)
df_1.show()

+-------------------+----------+-----------+--------------------+------+--------+
|      transacted_at|    trx_id|retailer_id|         description|amount| city_id|
+-------------------+----------+-----------+--------------------+------+--------+
|2017-11-24 19:00:00|1734117123| 1953761884|unkn   ppd id: 15...| 19.55|45522086|
+-------------------+----------+-----------+--------------------+------+--------+



In [99]:
# Read the whole data/sales_recursive/ directory with the recursiveFileLookup
# option enabled, then display the combined rows.
df_1 = (
    spark.read
    .format("parquet")
    .option("recursiveFileLookup", True)
    .load("data/sales_recursive/")
)
df_1.show()

+-------------------+----------+-----------+--------------------+------+---------+
|      transacted_at|    trx_id|retailer_id|         description|amount|  city_id|
+-------------------+----------+-----------+--------------------+------+---------+
|2017-11-24 19:00:00|1734117123| 1953761884|unkn   ppd id: 15...| 19.55| 45522086|
|2017-11-24 19:00:00|1734117021|  644879053|unkn    ppd id: 7...|  8.58|930259917|
+-------------------+----------+-----------+--------------------+------+---------+

