In [1]:
# Create a Spark session.
# findspark.init() must run BEFORE pyspark is imported: it locates the Spark
# installation and adds it to sys.path so that the pyspark import can succeed.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Local 2-core session; shuffle partitions and default parallelism are both
# set to 2 to match the local[2] master.
spark = (
    SparkSession.builder
    .appName("SparkWriterJob")
    .config("spark.sql.shuffle.partitions", 2)
    .config("spark.default.parallelism", 2)
    .master("local[2]")
    .getOrCreate()
)
print(spark.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/12 14:16:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


3.5.1


### 03.01 Reading Files into Spark

Data can be read into Apache Spark data frames from a variety of data sources. 

Examples:
- A flat file on a local disk
- A file from HDFS
- A Kafka Topic


In this example, we will read a CSV file from an HDFS folder into a Spark DataFrame.

In [2]:
# Load the raw sales-order CSV into a Spark DataFrame.
#   header      -> treat the first line of the file as column names
#   inferSchema -> let Spark derive the column types from the data
raw_sales_data = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv("datasets/sales_orders.csv")
)

# Sanity check: inspect the inferred schema ...
raw_sales_data.printSchema()

# ... and the first 5 rows
raw_sales_data.show(5)

root
 |-- ID: integer (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Rate: double (nullable = true)
 |-- Tags: string (nullable = true)

+---+--------+--------+----------+--------+-----+---------------+
| ID|Customer| Product|      Date|Quantity| Rate|           Tags|
+---+--------+--------+----------+--------+-----+---------------+
|  1|   Apple|Keyboard|2019/11/21|       5|31.15|Discount:Urgent|
|  2|LinkedIn| Headset|2019/11/25|       5| 36.9|  Urgent:Pickup|
|  3|Facebook|Keyboard|2019/11/24|       5|49.89|           NULL|
|  4|  Google|  Webcam|2019/11/07|       4|34.21|       Discount|
|  5|LinkedIn|  Webcam|2019/11/21|       3|48.69|         Pickup|
+---+--------+--------+----------+--------+-----+---------------+
only showing top 5 rows



### 03.02 Writing to HDFS

Write the `raw_sales_data` DataFrame into HDFS as a Parquet file. Use Parquet as the format since it enables splitting and filtering. Use GZIP as the compression codec. 

On completion, verify that the files were written correctly by inspecting the filesystem.

In [3]:
# Write the raw sales data as gzip-compressed Parquet, replacing any
# output left at the target path by a previous run.
(
    raw_sales_data.write
    .mode("overwrite")
    .option("compression", "gzip")
    .parquet("dummy_hdfs/raw_parquet")
)

### 03.03 Write to HDFS with partitioning

Write a partitioned Parquet file in HDFS. Partition will be done by Product. This will create one directory per unique product available in the raw CSV.

In [4]:
# Write gzip-compressed Parquet partitioned by the Product column,
# replacing any output left at the target path by a previous run.
(
    raw_sales_data.write
    .mode("overwrite")
    .option("compression", "gzip")
    .partitionBy("Product")
    .parquet("dummy_hdfs/partitioned_parquet")
)

### 03.04 Writing to Hive with Bucketing

Create a Bucketed Hive table for orders. Bucketing will be done by Product. It will create 3 buckets based on the hash generated by Product. Hive tables can be queried through SQL.

In [5]:

# Save the sales data as a Parquet-backed table, bucketed into 3 buckets by Product.
# mode("overwrite") keeps this cell re-runnable: with the default save mode,
# saveAsTable raises an error when the table already exists.
(
    raw_sales_data.write
    .format("parquet")
    .mode("overwrite")
    .bucketBy(3, "Product")
    .saveAsTable("product_bucket_table")
)

# The table's data files are stored in the spark-warehouse folder
spark.sql("SHOW tables").show(5)

# Read the bucketed data back through SQL
spark.sql("""
        SELECT * FROM product_bucket_table 
        WHERE Product='Mouse'""")\
    .show(5)
# NOTE(review): the session is built without enableHiveSupport(), so presumably
# the table definition lives only in this session's catalog, while the data
# files are persisted to disk -- confirm.


+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default|product_bucket_table|      false|
+---------+--------------------+-----------+

+---+--------+-------+----------+--------+-----+--------------------+
| ID|Customer|Product|      Date|Quantity| Rate|                Tags|
+---+--------+-------+----------+--------+-----+--------------------+
|  6|  Google|  Mouse|2019/11/23|       5|40.58|                NULL|
|  8|  Google|  Mouse|2019/11/13|       1|46.79|Urgent:Discount:P...|
| 14|   Apple|  Mouse|2019/11/09|       4|40.27|            Discount|
| 15|   Apple|  Mouse|2019/11/25|       5|38.89|                NULL|
| 20|LinkedIn|  Mouse|2019/11/25|       4|36.77|       Urgent:Pickup|
+---+--------+-------+----------+--------+-----+--------------------+
only showing top 5 rows



In [None]:
# List the databases known to this Spark session's catalog
spark.catalog.listDatabases()