In [0]:
# Sample sales orders as tuples:
# (OrderID, OrderDate, CustomerID, Product, Quantity, Price)
data = [
    (1001, "2024-01-15", "C001", "Widget A", 10, 25.50),
    (1002, "2024-01-16", "C002", "Widget B", 5, 15.75),
    (1003, "2024-01-16", "C001", "Widget C", 8, 22.50),
    (1004, "2024-01-17", "C003", "Widget A", 15, 25.50),
    (1005, "2024-01-18", "C004", "Widget D", 7, 30.00),
    (1006, "2024-01-19", "C002", "Widget B", 9, 15.75),
    (1007, "2024-01-20", "C005", "Widget C", 12, 22.50),
    (1008, "2024-01-21", "C003", "Widget A", 10, 25.50)
]

# Column names, listed in the same order as the tuple fields above
columns = ["OrderID", "OrderDate", "CustomerID", "Product", "Quantity", "Price"]

# Create Spark DataFrame (only names are supplied, so column types are inferred)
df = spark.createDataFrame(data, schema=columns)

# Save as CSV file to DBFS (Databricks File System)
output_path_csv = "/dbfs/FileStore/streaming/input/sales_data.csv"

# Use overwrite mode so the cell can be re-run: without an explicit mode the
# writer raises an error when the output path already exists.
df.write.mode("overwrite").option("header", "true").csv(output_path_csv)

print("Sales data successfully written to:", output_path_csv)


Sales data successfully written to: /dbfs/FileStore/streaming/input/sales_data.csv


In [0]:
# Customer records, built as a list of dictionaries (one dict per customer).
# The field names are kept separate from the values so each row reads as a
# compact tuple.
_customer_fields = ("CustomerID", "CustomerName", "Region", "SignupDate")
_customer_rows = [
    ("C001", "John Doe", "North", "2022-07-01"),
    ("C002", "Jane Smith", "South", "2023-02-15"),
    ("C003", "Emily Johnson", "East", "2021-11-20"),
    ("C004", "Michael Brown", "West", "2022-12-05"),
    ("C005", "Linda Davis", "North", "2023-03-10"),
]
customer_data = [dict(zip(_customer_fields, row)) for row in _customer_rows]

# Build the Spark DataFrame from the dictionaries
df_customer = spark.createDataFrame(customer_data)

# Target location for the JSON output on DBFS (Databricks File System)
output_path_json = "/dbfs/FileStore/streaming/input/customer_data.json"

# Overwrite mode keeps this cell re-runnable
customer_writer = df_customer.write.mode("overwrite")
customer_writer.json(output_path_json)

print("Customer data successfully written to:", output_path_json)


Customer data successfully written to: /dbfs/FileStore/streaming/input/customer_data.json


In [0]:
# dbutils.fs.cp("file:/Workspace/Shared/sales_data.csv","dbfs:/FileStore/streaming/input/sales_data.csv")

# dbutils.fs.cp("file:/Workspace/Shared/customer_data.json","dbfs:/FileStore/streaming/input/customer_data.json")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize SparkSession (getOrCreate returns the existing session if one is active)
spark = SparkSession.builder.appName("StructuredStreaming").getOrCreate()

# Define the schema for the CSV data; streaming file sources need an explicit schema
sales_schema = "OrderID INT, OrderDate STRING, CustomerID STRING, Product STRING, Quantity INT, Price DOUBLE"

# Read streaming data from the CSV files.
# The path must match the location the sales data was written to
# ("FileStore", not "FilesStore"), otherwise the stream never sees any input.
df_sales_stream = (
    spark.readStream
    .format("csv")
    .option("header", "true")
    .schema(sales_schema)
    .load("/dbfs/FileStore/streaming/input/sales_data.csv")
)

# Define the schema for the JSON data
customer_schema = "CustomerID STRING, CustomerName STRING, Region STRING, SignupDate STRING"

# Read streaming data from the JSON files
df_customer_stream = (
    spark.readStream
    .format("json")
    .schema(customer_schema)
    .load("/dbfs/FileStore/streaming/input/customer_data.json")
)


In [0]:
# Start a streaming query that prints the raw sales stream to the console sink
sales_console_writer = df_sales_stream.writeStream.format("console")
sales_query = sales_console_writer.start()


In [0]:
# Stop the streaming query currently held in sales_query
sales_query.stop()

In [0]:
from pyspark.sql.functions import col, current_date, datediff, to_timestamp

# Transform the sales data: parse the order date and add a new column for total amount
df_sales_transformed = df_sales_stream.select(
    col("OrderID"),
    # OrderDate holds date-only strings (e.g. "2024-01-15"), so the pattern must
    # not contain a time part; a non-matching pattern fails to parse.
    to_timestamp(col("OrderDate"), "yyyy-MM-dd").alias("OrderDate"),
    col("Product"),
    col("Quantity"),
    col("Price"),
    (col("Quantity") * col("Price")).alias("TotalAmount")
)

print("Applied transformation on sales data")

# Add a watermark to handle late data and perform an aggregation.
# The result column is named "sum(TotalAmount)" because of the dict-style agg.
# NOTE(review): the grouping key is only "Product" (no event-time column or
# window), so the watermark likely does not bound the aggregation state --
# confirm whether a window on OrderDate was intended.
df_sales_aggregated = (
    df_sales_transformed
    .withWatermark("OrderDate", "1 day")
    .groupBy("Product")
    .agg({"TotalAmount": "sum"})
)

print("Aggregated sales data by product")

# Transform the customer data: add a new column with the years since signup.
# SignupDate is formatted like "2022-07-01", i.e. pattern "yyyy-MM-dd".
# The day difference is divided by 365, so the result is a fractional number of years.
df_customer_transformed = df_customer_stream.withColumn(
    "YearsSinceSignup",
    datediff(current_date(), to_timestamp(col("SignupDate"), "yyyy-MM-dd")).cast("int") / 365
)

print("Applied transformations on customer data")

Applied transformation on sales data
Aggregated sales data by product
Applied transformations on customer data


In [0]:
# Console sink for the per-product aggregates (useful while debugging).
# "update" output mode is used for the aggregated stream.
sales_query = (
    df_sales_aggregated.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

print("Started Streaming query to write aggregated sales data to console")

# Console sink for the transformed customer rows, written in "append" mode
customer_query = (
    df_customer_transformed.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

Started Streaming query to write aggregated sales data to console


In [0]:
# Stop both streaming queries, sales first and then customer
for active_query in (sales_query, customer_query):
    active_query.stop()