In [1]:
#Create a Spark Session
from pyspark.sql import SparkSession
import findspark

findspark.init()

spark = SparkSession\
            .builder\
            .appName("SparkReadJob")\
            .config("spark.sql.shuffle.partitions", 2)\
            .config("spark.default.parallelism", 2)\
            .config("spark.sql.warehouse.dir", "spark-warehouse") \
            .enableHiveSupport() \
            .master("local[2]")\
            .getOrCreate()

### 04.02 Read Parquet Files into Spark
Read a non-partitioned Parquet file into Spark. Measure the time taken. Also look at the execution plan.

In [4]:
sales_parquet = spark\
                .read\
                .parquet("dummy_hdfs/raw_parquet")

#Display the results
sales_parquet.show(5)

#show the execution plan
print("\n--------------------------EXPLAIN--------------------------")
sales_parquet.explain(True)
print("-------------------------END EXPLAIN-----------------------\n")

+---+--------+--------+----------+--------+-----+---------------+
| ID|Customer| Product|      Date|Quantity| Rate|           Tags|
+---+--------+--------+----------+--------+-----+---------------+
|  1|   Apple|Keyboard|2019/11/21|       5|31.15|Discount:Urgent|
|  2|LinkedIn| Headset|2019/11/25|       5| 36.9|  Urgent:Pickup|
|  3|Facebook|Keyboard|2019/11/24|       5|49.89|           NULL|
|  4|  Google|  Webcam|2019/11/07|       4|34.21|       Discount|
|  5|LinkedIn|  Webcam|2019/11/21|       3|48.69|         Pickup|
+---+--------+--------+----------+--------+-----+---------------+
only showing top 5 rows


--------------------------EXPLAIN--------------------------
== Parsed Logical Plan ==
Relation [ID#0,Customer#1,Product#2,Date#3,Quantity#4,Rate#5,Tags#6] parquet

== Analyzed Logical Plan ==
ID: int, Customer: string, Product: string, Date: string, Quantity: int, Rate: double, Tags: string
Relation [ID#0,Customer#1,Product#2,Date#3,Quantity#4,Rate#5,Tags#6] parquet

== Optimiz

### 04.03. Read Partitioned Data into Spark

In [7]:
sales_partitioned = spark\
                    .read\
                    .parquet("dummy_hdfs/partitioned_parquet/*")

#Display the results
sales_partitioned.show(5)

#show the execution plan
print("\n--------------------------EXPLAIN--------------------------")
sales_partitioned.explain()
print("-------------------------END EXPLAIN-----------------------\n")

+---+--------+----------+--------+-----+--------------------+
| ID|Customer|      Date|Quantity| Rate|                Tags|
+---+--------+----------+--------+-----+--------------------+
|  6|  Google|2019/11/23|       5|40.58|                NULL|
|  8|  Google|2019/11/13|       1|46.79|Urgent:Discount:P...|
| 14|   Apple|2019/11/09|       4|40.27|            Discount|
| 15|   Apple|2019/11/25|       5|38.89|                NULL|
| 20|LinkedIn|2019/11/25|       4|36.77|       Urgent:Pickup|
+---+--------+----------+--------+-----+--------------------+
only showing top 5 rows


--------------------------EXPLAIN--------------------------
== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet [ID#44,Customer#45,Date#46,Quantity#47,Rate#48,Tags#49] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(5 paths)[file:/D:/DS/Big_Data_Analytics/Spark/Big_Data_Analytics_Hadoop_Apache_..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ID:int,Customer:stri

In [9]:
#Read specific partition only
sales_headset = spark\
                    .read\
                    .parquet("dummy_hdfs/partitioned_parquet/Product=Headset")
sales_headset.show(5)

+---+--------+----------+--------+-----+--------------------+
| ID|Customer|      Date|Quantity| Rate|                Tags|
+---+--------+----------+--------+-----+--------------------+
|  2|LinkedIn|2019/11/25|       5| 36.9|       Urgent:Pickup|
| 10|LinkedIn|2019/11/09|       2|26.91|Urgent:Discount:P...|
| 11|Facebook|2019/11/26|       5|45.84|       Urgent:Pickup|
| 12|  Google|2019/11/05|       2|41.17|     Discount:Urgent|
| 17|   Apple|2019/11/09|       4|29.98|     Discount:Urgent|
+---+--------+----------+--------+-----+--------------------+
only showing top 5 rows



### 04.04 Read Bucketed Data into Spark

In [12]:
#Spark does not persist the Hive catalog between multiple Sparksession instances
#You can additionally use a Hive metastore if you want to persist catalog
#across SparkSession instances

#Read the bucketed table directly from disk
sales_bucketed = spark\
                    .read\
                    .parquet("spark-warehouse/product_bucket_table/*")

sales_bucketed.show(5)

#Convert into a temporary view
sales_bucketed.createOrReplaceTempView("product_bucket_table")

spark.sql("SELECT * FROM product_bucket_table WHERE Product='Webcam'").show(5)

+---+--------+--------+----------+--------+-----+---------------+
| ID|Customer| Product|      Date|Quantity| Rate|           Tags|
+---+--------+--------+----------+--------+-----+---------------+
|  1|   Apple|Keyboard|2019/11/21|       5|31.15|Discount:Urgent|
|  3|Facebook|Keyboard|2019/11/24|       5|49.89|           NULL|
|  4|  Google|  Webcam|2019/11/07|       4|34.21|       Discount|
|  5|LinkedIn|  Webcam|2019/11/21|       3|48.69|         Pickup|
|  7|LinkedIn|  Webcam|2019/11/20|       4|37.19|           NULL|
+---+--------+--------+----------+--------+-----+---------------+
only showing top 5 rows

+---+--------+-------+----------+--------+-----+---------------+
| ID|Customer|Product|      Date|Quantity| Rate|           Tags|
+---+--------+-------+----------+--------+-----+---------------+
|  4|  Google| Webcam|2019/11/07|       4|34.21|       Discount|
|  5|LinkedIn| Webcam|2019/11/21|       3|48.69|         Pickup|
|  7|LinkedIn| Webcam|2019/11/20|       4|37.19|        