In [0]:
# SparkSession
from pyspark.sql import SparkSession

# Create the SparkSession used by the cells below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession
    .builder
    .appName("Reading Complex Data Formats")  # application name for this session
    .master('local[*]')  # run Spark locally (per Spark docs, [*] = one worker thread per logical core)
    .getOrCreate()
)

# Bare expression as the last line: the notebook displays the session object
spark

In [0]:
# Read the emp_perf Parquet file into a DataFrame (format named explicitly, path passed to load)
df_parquet = spark.read.format('parquet').load('/FileStore/tables/emp_perf.parquet')

##### Parquet options
###### Reference: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

In [0]:
# Print the column names, data types and nullability Spark read from the Parquet file
df_parquet.printSchema()

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Joining Date: date (nullable = true)
 |-- Performance Score: double (nullable = true)
 |-- Experience: long (nullable = true)
 |-- Status: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Session: string (nullable = true)



In [0]:
# Print a tabular preview of the Parquet DataFrame's rows
df_parquet.show()

+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
| ID|              Name|Age|Gender|Department|Salary|Joining Date|Performance Score|Experience|  Status|   Location|Session|
+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
|  1|      Cory Escobar| 48|Female|        HR|  5641|  2015-05-03|              2.0|        16|  Active|   New York|  Night|
|  2|   Timothy Sanchez| 25| Other|     Sales|  4249|  2020-11-09|              2.0|        11|Inactive|Los Angeles|Evening|
|  3|      Chad Nichols| 57| Other|     Sales|  3058|  2019-02-12|             NULL|         1|Inactive|   New York|Morning|
|  4|Christine Williams| 58|Female|        IT|  5895|  2017-09-08|              2.0|        13|Inactive|Los Angeles|Evening|
|  5|      Amber Harris| 35| Other|        IT|  4317|  2020-02-15|              5.0|        16|Inactive|   New York|Evening|


In [0]:
# Read the emp_perf ORC file into a DataFrame (same reader API, only the format name changes)
df_orc = spark.read.format('orc').load('/FileStore/tables/emp_perf.orc')

In [0]:
# Print the ORC schema. Per the captured output, every column in this file is
# typed as string, unlike the long/date/double types in the Parquet schema.
df_orc.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: string (nullable = true)
 |-- Joining Date: string (nullable = true)
 |-- Performance Score: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Session: string (nullable = true)



In [0]:
# Print a tabular preview of the ORC DataFrame's rows
df_orc.show()

+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
| ID|              Name|Age|Gender|Department|Salary|Joining Date|Performance Score|Experience|  Status|   Location|Session|
+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
|  1|      Cory Escobar| 48|Female|        HR|  5641|  2015-05-03|              2.0|        16|  Active|   New York|  Night|
|  2|   Timothy Sanchez| 25| Other|     Sales|  4249|  2020-11-09|              2.0|        11|Inactive|Los Angeles|Evening|
|  3|      Chad Nichols| 57| Other|     Sales|  3058|  2019-02-12|             NULL|         1|Inactive|   New York|Morning|
|  4|Christine Williams| 58|Female|        IT|  5895|  2017-09-08|              2.0|        13|Inactive|Los Angeles|Evening|
|  5|      Amber Harris| 35| Other|        IT|  4317|  2020-02-15|              5.0|        16|Inactive|   New York|Evening|


In [0]:
# Benefits of Columnar Storage

# Let's create a simple Python decorator (get_time) that measures execution time.
# Note: the decorated function is executed once, right when it is defined, and its timing is printed.
# If you are new to Python decorators, see: https://www.geeksforgeeks.org/decorators-in-python

import time

def get_time(func):
  """Decorator that times a zero-argument callable and reports milliseconds.

  The decorated function is executed once immediately, at decoration time,
  and its timing line is printed. The timed wrapper is then returned, so the
  decorated name stays callable instead of being rebound to None.

  Args:
    func: Callable taking no arguments. Its return value is discarded.

  Returns:
    A zero-argument wrapper that runs ``func`` and returns the string
    ``'Execution time: <elapsed> ms'``.
  """
  import functools  # local import keeps this cell self-contained

  @functools.wraps(func)  # preserve the wrapped function's __name__/__doc__
  def inner_get_time() -> str:
    # perf_counter is monotonic and high-resolution, so it is the right clock
    # for measuring intervals; time.time() can jump if the system clock changes.
    start_time = time.perf_counter()
    func()
    elapsed_ms = (time.perf_counter() - start_time) * 1000
    return f'Execution time: {elapsed_ms} ms'

  # Run once right away so that defining a decorated function prints its timing.
  print(inner_get_time())
  return inner_get_time

In [0]:
@get_time
def x():
  """Load the emp_perf Parquet file and count its rows with no column selection."""
  spark.read.format('parquet').load('/FileStore/tables/emp_perf.parquet').count()

Execution time: 1224.7345447540283 ms


In [0]:
@get_time
def x():
  """Load the emp_perf Parquet file, keep only the 'Name' column, and count the rows."""
  name_only = (
      spark.read
      .format('parquet')
      .load('/FileStore/tables/emp_perf.parquet')
      .select('Name')
  )
  name_only.count()

Execution time: 745.201587677002 ms


In [0]:
# Recursive read
# sales_recursive
# |__ sales_1/1.parquet
# |__ sales_1/sales_2/2.parquet

In [0]:
# recursiveFileLookup makes the reader also pick up Parquet files that sit in
# nested sub-directories under the given root path.
df_1 = (
    spark.read
    .format('parquet')
    .option('recursiveFileLookup', True)
    .load('data/input/sales_recursive/')
)
df_1.show()