#### PySpark Configurations ####

In [1]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) the SparkSession attached to the standalone cluster master
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Ansh-Lamba-PySpark-Streaming")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Turn on schema inference for streaming sources (used by the readStream cells below,
# which do not pass an explicit schema)
spark.conf.set("spark.sql.streaming.schemaInference", "true")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/19 20:55:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Reading data from JSON file in BATCH mode ####

In [3]:
# Input locations (path constants only; no directory is created here)
# Directory the JSON order files are loaded from (batch read and streaming read)
INPUT_DATA_STREAMING = "/opt/spark-data/input/ansh-lamba/streaming-data"
# Directory passed as `sourceArchiveDir` to the archiving streaming reader
INPUT_DATA_STREAMING_ARCHIVE = "/opt/spark-data/input/ansh-lamba/streaming-data-archive"

In [4]:
# Batch-read a single multi-line JSON file and let Spark infer its schema
df_orders_batch = (
    spark.read
    .format("json")
    .option("multiline", True)
    .load(f"{INPUT_DATA_STREAMING}/CustomerOrdersDay01.json")
)

                                                                                

In [5]:
# Preview the first five rows as a pandas DataFrame so the notebook renders a table
# (plain-text alternative: df_orders_batch.show(5))
(
    df_orders_batch
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,customer,items,metadata,order_id,payment,timestamp
0,"((Toronto, Canada, M5H 2N2), 501, john@example...","[(I100, 25.99, Wireless Mouse, 2), (I101, 15.4...","[(campaign, back_to_school), (channel, email)]",ORD1001,"(Credit Card, TXN7890)",2025-06-01T10:15:00Z


In [6]:
# Print the inferred schema of the batch DataFrame as a tree,
# including the nested struct and array fields
df_orders_batch.printSchema()

root
 |-- customer: struct (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- postal_code: string (nullable = true)
 |    |-- customer_id: long (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- item_id: string (nullable = true)
 |    |    |-- price: double (nullable = true)
 |    |    |-- product_name: string (nullable = true)
 |    |    |-- quantity: long (nullable = true)
 |-- metadata: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- payment: struct (nullable = true)
 |    |-- method: string (nullable = true)
 |    |-- transaction_id: string (nullable = true)
 |-- timestamp: string (nullable = true)


In [7]:
# Promote the nested struct fields to top-level columns. A dotted path keeps only
# its leaf name in the result (e.g. 'customer.address.city' becomes 'city').
# The 'items' and 'metadata' arrays are passed through untouched for the explode step.
df_orders_batch_flattened = df_orders_batch.select(
    'order_id',
    'timestamp',
    'customer.customer_id',
    'customer.name',
    'customer.email',
    'customer.address.city',
    'customer.address.country',
    'customer.address.postal_code',
    'items',
    'payment.method',
    'payment.transaction_id',
    'metadata',
)

In [8]:
# Preview the first five flattened rows as a pandas DataFrame
# (plain-text alternative: df_orders_batch_flattened.show(5))
(
    df_orders_batch_flattened
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,order_id,timestamp,customer_id,name,email,city,country,postal_code,items,method,transaction_id,metadata
0,ORD1001,2025-06-01T10:15:00Z,501,John Doe,john@example.com,Toronto,Canada,M5H 2N2,"[(I100, 25.99, Wireless Mouse, 2), (I101, 15.4...",Credit Card,TXN7890,"[(campaign, back_to_school), (channel, email)]"


In [9]:
# Give every element of the 'items' and 'metadata' arrays its own row.
# explode_outer is used so that an order whose array is null or empty is kept
# (with a null value) rather than dropped.
# NOTE: the two arrays are exploded independently, so each order produces every
# item x metadata combination (2 items x 2 metadata entries -> 4 rows in the
# preview below). Item rows are therefore repeated; de-duplicate before
# aggregating quantity or price.
df_orders_batch_explode = (
    df_orders_batch_flattened
    .withColumn('items', explode_outer('items'))
    .withColumn('metadata', explode_outer('metadata'))
)

In [10]:
# Preview the first five exploded rows as a pandas DataFrame
# (plain-text alternative: df_orders_batch_explode.show(5))
(
    df_orders_batch_explode
    .limit(5)
    .toPandas()
)

Unnamed: 0,order_id,timestamp,customer_id,name,email,city,country,postal_code,items,method,transaction_id,metadata
0,ORD1001,2025-06-01T10:15:00Z,501,John Doe,john@example.com,Toronto,Canada,M5H 2N2,"(I100, 25.99, Wireless Mouse, 2)",Credit Card,TXN7890,"(campaign, back_to_school)"
1,ORD1001,2025-06-01T10:15:00Z,501,John Doe,john@example.com,Toronto,Canada,M5H 2N2,"(I100, 25.99, Wireless Mouse, 2)",Credit Card,TXN7890,"(channel, email)"
2,ORD1001,2025-06-01T10:15:00Z,501,John Doe,john@example.com,Toronto,Canada,M5H 2N2,"(I101, 15.49, USB-C Adapter, 1)",Credit Card,TXN7890,"(campaign, back_to_school)"
3,ORD1001,2025-06-01T10:15:00Z,501,John Doe,john@example.com,Toronto,Canada,M5H 2N2,"(I101, 15.49, USB-C Adapter, 1)",Credit Card,TXN7890,"(channel, email)"


In [11]:
# Final flat layout: unpack the exploded 'items' and 'metadata' structs into
# plain columns next to the order, customer and payment fields
df_orders_batch_exploded = df_orders_batch_explode.select(
    'order_id',
    'timestamp',
    'customer_id',
    'name',
    'email',
    'city',
    'country',
    'postal_code',
    'items.item_id',
    'items.product_name',
    'items.quantity',
    'items.price',
    'method',
    'transaction_id',
    'metadata.key',
    'metadata.value',
)

In [12]:
# Preview the first five rows of the final flat table as a pandas DataFrame
# (plain-text alternative: df_orders_batch_exploded.show(5))
(
    df_orders_batch_exploded
    .limit(5)
    .toPandas()
)

Unnamed: 0,order_id,timestamp,customer_id,name,email,city,country,postal_code,item_id,product_name,quantity,price,method,transaction_id,key,value
0,ORD1001,2025-06-01T10:15:00Z,501,John Doe,john@example.com,Toronto,Canada,M5H 2N2,I100,Wireless Mouse,2,25.99,Credit Card,TXN7890,campaign,back_to_school
1,ORD1001,2025-06-01T10:15:00Z,501,John Doe,john@example.com,Toronto,Canada,M5H 2N2,I100,Wireless Mouse,2,25.99,Credit Card,TXN7890,channel,email
2,ORD1001,2025-06-01T10:15:00Z,501,John Doe,john@example.com,Toronto,Canada,M5H 2N2,I101,USB-C Adapter,1,15.49,Credit Card,TXN7890,campaign,back_to_school
3,ORD1001,2025-06-01T10:15:00Z,501,John Doe,john@example.com,Toronto,Canada,M5H 2N2,I101,USB-C Adapter,1,15.49,Credit Card,TXN7890,channel,email


#### Reading data from JSON file in STREAMING mode ####

In [13]:
"""
# Read JSON file with Infered schema - STREAMING
df_orders_streaming = spark.readStream.format("json") \
                        .option("multiline", True) \
                            .load(f"{INPUT_DATA_STREAMING}/")
"""

'\n# Read JSON file with Infered schema - STREAMING\ndf_orders_streaming = spark.readStream.format("json")                         .option("multiline", True)                             .load(f"{INPUT_DATA_STREAMING}/")\n'

In [14]:
"""
# All Transformations
# Flatten JSON by selecting specific keys
df_orders_streaming_flattened = df_orders_streaming \
    .select('order_id','timestamp','customer.customer_id','customer.name','customer.email','customer.address.city','customer.address.country','customer.address.postal_code','items','payment.method','payment.transaction_id','metadata')

# Explode 'items' & 'metadata' columns in flattened JSON
df_orders_streaming_explode = df_orders_streaming_flattened \
                                .withColumn('items', explode_outer('items')) \
                                    .withColumn('metadata', explode_outer('metadata'))

# Final flattened JSON
df_orders_streaming_exploded = df_orders_streaming_explode \
    .select('order_id','timestamp','customer_id','name','email','city','country','postal_code','items.item_id','items.product_name','items.quantity','items.price','method','transaction_id','metadata.key','metadata.value')
"""

"\n# All Transformations\n# Flatten JSON by selecting specific keys\ndf_orders_streaming_flattened = df_orders_streaming     .select('order_id','timestamp','customer.customer_id','customer.name','customer.email','customer.address.city','customer.address.country','customer.address.postal_code','items','payment.method','payment.transaction_id','metadata')\n\n# Explode 'items' & 'metadata' columns in flattened JSON\ndf_orders_streaming_explode = df_orders_streaming_flattened                                 .withColumn('items', explode_outer('items'))                                     .withColumn('metadata', explode_outer('metadata'))\n\n# Final flattened JSON\ndf_orders_streaming_exploded = df_orders_streaming_explode     .select('order_id','timestamp','customer_id','name','email','city','country','postal_code','items.item_id','items.product_name','items.quantity','items.price','method','transaction_id','metadata.key','metadata.value')\n"

In [15]:
"""
# Create root directory
OUTPUT_DATA_ROOT = "/opt/spark-data/output/ansh-lamba/streaming-data"
CHECK_POINT = "/opt/spark-data/output/ansh-lamba/checkpoint"

MODE = "append"  # MODES = append, update, complete
"""

'\n# Create root directory\nOUTPUT_DATA_ROOT = "/opt/spark-data/output/ansh-lamba/streaming-data"\nCHECK_POINT = "/opt/spark-data/output/ansh-lamba/checkpoint"\n\nMODE = "append"  # MODES = append, update, complete\n'

In [16]:
"""
# Write Streaming data to disk
streaming = df_orders_streaming_exploded \
                .writeStream \
                    .format("parquet") \
                        .option("path", f"{OUTPUT_DATA_ROOT}/") \
                            .option("checkpointLocation", f"{CHECK_POINT}/) \
                                .outputMode(MODE) \
                                    .trigger(once=True) \
                                        .start()
"""

'\n# Write Streaming data to disk\nstreaming = df_orders_streaming_exploded                 .writeStream                     .format("parquet")                         .option("path", f"{OUTPUT_DATA_ROOT}/")                             .option("checkpointLocation", f"{CHECK_POINT}/)                                 .outputMode(MODE)                                     .trigger(once=True)                                         .start()\n'

In [17]:
# Check Streaming status
# streaming.status    # Check status of stream
# spark.streams.active    # List all active streams
# streaming.lastProgress    # Get progress metrics
# print('Streaming Status:', streaming.isActive)    # Check if the stream is active

In [18]:
# Stop Streaming
# streaming.stop()

#### Reading data from JSON file in STREAMING mode - ARCHIVING ####

In [19]:
# Streaming read of every JSON file in the input directory, with an inferred schema.
# cleanSource="archive" asks Spark to move files it has finished processing into
# sourceArchiveDir, so the input directory does not keep accumulating old files.
df_orders_streaming_archive = (
    spark.readStream
    .format("json")
    .option("multiline", True)
    .option("cleanSource", "archive")
    .option("sourceArchiveDir", f"{INPUT_DATA_STREAMING_ARCHIVE}/")
    .load(f"{INPUT_DATA_STREAMING}/")
)

In [20]:
# Same three-step pipeline as the batch section, applied to the streaming DataFrame.

# Step 1 - promote nested struct fields to top-level columns (leaf names are kept)
df_orders_streaming_archive_flattened = df_orders_streaming_archive.select(
    'order_id',
    'timestamp',
    'customer.customer_id',
    'customer.name',
    'customer.email',
    'customer.address.city',
    'customer.address.country',
    'customer.address.postal_code',
    'items',
    'payment.method',
    'payment.transaction_id',
    'metadata',
)

# Step 2 - one row per array element; explode_outer keeps orders whose array is null/empty.
# NOTE: exploding both arrays yields every item x metadata combination per order.
df_orders_streaming_archive_explode = (
    df_orders_streaming_archive_flattened
    .withColumn('items', explode_outer('items'))
    .withColumn('metadata', explode_outer('metadata'))
)

# Step 3 - unpack the exploded structs into plain columns
df_orders_streaming_archive_exploded = df_orders_streaming_archive_explode.select(
    'order_id',
    'timestamp',
    'customer_id',
    'name',
    'email',
    'city',
    'country',
    'postal_code',
    'items.item_id',
    'items.product_name',
    'items.quantity',
    'items.price',
    'method',
    'transaction_id',
    'metadata.key',
    'metadata.value',
)

In [21]:
# Output locations for the streaming write (path constants only; nothing is created here)
# Directory passed as the sink's "path" option
OUTPUT_DATA_ROOT = "/opt/spark-data/output/ansh-lamba/streaming-data"
# Directory passed as the query's "checkpointLocation" option
CHECK_POINT = "/opt/spark-data/output/ansh-lamba/checkpoint"

# Value handed to writeStream.outputMode() in the write cell.
# NOTE(review): Spark's file sinks (parquet is used below) are documented as
# append-only - confirm before switching to update/complete.
MODE = "append"  # MODES = append, update, complete

In [22]:
# Start the streaming query that writes the flattened orders to disk as parquet.
# The checkpoint directory records progress between runs; trigger(once=True)
# requests a single run over the available data rather than a continuous stream.
# start() launches the query and returns its handle, kept in `streaming_archive`.
streaming_archive = (
    df_orders_streaming_archive_exploded
    .writeStream
    .format("parquet")
    .option("path", f"{OUTPUT_DATA_ROOT}/")
    .option("checkpointLocation", f"{CHECK_POINT}/")
    .outputMode(MODE)
    .trigger(once=True)
    .start()
)

26/01/19 20:56:16 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [23]:
# Report whether the archiving query is still running at this moment
is_active = streaming_archive.isActive
print('Streaming Status:', is_active)

Streaming Status: True


In [24]:
# Stop Streaming
# streaming_archive.stop()