### The effects of the `multiLine` option when reading JSON

In [1]:
# Build (or reuse) the Spark session shared by the cells below.
# "local[*]" runs Spark in-process on this machine.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Multiline Functionality")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary
spark

In [2]:
# Read the multiline JSON file (records may span several lines, so multiLine is on)
from pyspark.sql.functions import explode

df_json_multiline = (
    spark.read
    .format("json")
    .option("multiLine", True)
    .load("dataset/orders_json/orders_json_multiline.json")
)

df_json_multiline.printSchema()

# Explode the "orders" array so each array element becomes its own row
df_temp = df_json_multiline.withColumn("orders", explode("orders"))

# Write the exploded rows to Parquet (replacing any previous output)
# so the run can be used for performance benchmarking
(
    df_temp.write
    .mode("overwrite")
    .format("parquet")
    .save("dataset/orders_json/output/parquet_1")
)

root
 |-- orders: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- cust_id: string (nullable = true)
 |    |    |-- invoice_num: string (nullable = true)
 |    |    |-- order_date: string (nullable = true)
 |    |    |-- order_id: string (nullable = true)
 |    |    |-- order_lines: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- currency_code: string (nullable = true)
 |    |    |    |    |-- discount: long (nullable = true)
 |    |    |    |    |-- discount_type: string (nullable = true)
 |    |    |    |    |-- prod_id: string (nullable = true)
 |    |    |    |    |-- qty: long (nullable = true)
 |    |    |    |    |-- tax: long (nullable = true)
 |    |    |    |    |-- tax_type: string (nullable = true)
 |    |    |-- store_id: string (nullable = true)
 |    |    |-- system_date: string (nullable = true)



In [3]:
# Check how many partitions back the DataFrame read with multiLine=True.
# NOTE(review): the saved output shows 1 — presumably because a multiline JSON
# file has to be parsed as a whole and cannot be split; confirm against Spark docs.
df_json_multiline.rdd.getNumPartitions()

1

In [5]:
# Read the single-line JSON file (multiLine explicitly off)
from pyspark.sql.functions import explode

df_json_singleline = (
    spark.read
    .format("json")
    .option("multiLine", False)
    .load("dataset/orders_json/orders_json_singleline.json")
)

df_json_singleline.printSchema()

# Explode the "orders" array so each array element becomes its own row
df_temp = df_json_singleline.withColumn("orders", explode("orders"))

# Write the exploded rows to Parquet (replacing any previous output)
# so the run can be used for performance benchmarking
(
    df_temp.write
    .mode("overwrite")
    .format("parquet")
    .save("dataset/orders_json/output/parquet_2")
)

root
 |-- orders: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- cust_id: string (nullable = true)
 |    |    |-- invoice_num: string (nullable = true)
 |    |    |-- order_date: string (nullable = true)
 |    |    |-- order_id: string (nullable = true)
 |    |    |-- order_lines: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- currency_code: string (nullable = true)
 |    |    |    |    |-- discount: long (nullable = true)
 |    |    |    |    |-- discount_type: string (nullable = true)
 |    |    |    |    |-- prod_id: string (nullable = true)
 |    |    |    |    |-- qty: long (nullable = true)
 |    |    |    |    |-- tax: long (nullable = true)
 |    |    |    |    |-- tax_type: string (nullable = true)
 |    |    |-- store_id: string (nullable = true)
 |    |    |-- system_date: string (nullable = true)



In [6]:
# Check how many partitions back the DataFrame read with multiLine=False.
# NOTE(review): the saved output shows 8 — presumably the line-delimited file
# was split across the local cores; the exact count depends on the environment.
df_json_singleline.rdd.getNumPartitions()

8

In [7]:
# Stop the Spark session now that the comparison is finished
spark.stop()