# Task 2: Predicting sales data using Spark Structured Streaming

Prerequisites: 

- Make sure 'sales_estimation_pipeline_model' is unzipped; otherwise, uncomment the unzip code in section 2.8.
- Update `hostip` (section 2.3) to the IP address of your Kafka broker before testing.

In [1]:

from pyspark.sql.functions import explode
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *

### 2.1 Create SparkSession


In [2]:
import os

# Ask PySpark to pull in the Kafka connector packages when the session starts.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 pyspark-shell'

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

app_name = "31171109 Assignment 3 FIT5202"
master = "local[4]"  # run locally on four cores

spark_conf = SparkConf().setAppName(app_name).setMaster(master)

# Reuse an existing session if one is already running, otherwise create it.
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
sc = spark.sparkContext

# Global checkpoint directory for the Spark context.
checkpoint_directory = "checkpoints"
sc.setCheckpointDir(checkpoint_directory)

### 2.2 Define dataframe loading schema  & Load store CSV file


In [3]:
from pyspark.sql.types import *

# Explicit schema for the static store lookup table (no schema inference).
stores_schema = (
    StructType()
    .add("Store", IntegerType(), True)
    .add("Type", StringType(), True)
    .add("Size", IntegerType(), True)
)

# The CSV has a header row, which is skipped in favour of the schema above.
stores_df = (
    spark.read
    .schema(stores_schema)
    .option("header", True)
    .csv("data/stores.csv")
)

### 2.3 Ingest Kafka data




In [4]:
# with reference to Spark Watermark demo in Week 11
from pyspark.sql.types import *

# Schema of one Kafka message value: an array of records. Every field is read
# as a string except `ts`, which is read as a long.
schema = ArrayType(
    StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in (
                'Store',
                'Date',
                'Temperature',
                'Fuel_Price',
                'MarkDown1',
                'MarkDown2',
                'MarkDown3',
                'MarkDown4',
                'MarkDown5',
                'CPI',
                'Unemployment',
                'IsHoliday',
                'last_weekly_sales',
            )
        ]
        + [StructField('ts', LongType(), True)]
    )
)

In [5]:
topic = "asgn3"

hostip = "192.168.50.193"  # CHECK ME before testing

# Subscribe to the Kafka topic as a streaming DataFrame.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", f"{hostip}:9092")
    .option("subscribe", topic)
    .load()
)

df.printSchema()

# Kafka exposes key and value as binary; convert both to strings.
df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# Parse the JSON text in `value` using `schema`.
df = df.select(
    F.from_json(F.col("value").cast("string"), schema).alias('parsed')
)

df.printSchema()

# One output row per element of the parsed array.
df = df.select(F.explode(F.col("parsed")).alias('unnested'))

# Lift each struct field to a top-level column with the same name.
df_formatted = df.select(
    [
        F.col(f"unnested.{field_name}").alias(field_name)
        for field_name in (
            "Store",
            "Date",
            "Temperature",
            "Fuel_Price",
            "MarkDown1",
            "MarkDown2",
            "MarkDown3",
            "MarkDown4",
            "MarkDown5",
            "CPI",
            "Unemployment",
            "IsHoliday",
            "last_weekly_sales",
            "ts",
        )
    ]
)

df_formatted.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

root
 |-- parsed: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Store: string (nullable = true)
 |    |    |-- Date: string (nullable = true)
 |    |    |-- Temperature: string (nullable = true)
 |    |    |-- Fuel_Price: string (nullable = true)
 |    |    |-- MarkDown1: string (nullable = true)
 |    |    |-- MarkDown2: string (nullable = true)
 |    |    |-- MarkDown3: string (nullable = true)
 |    |    |-- MarkDown4: string (nullable = true)
 |    |    |-- MarkDown5: string (nullable = true)
 |    |    |-- CPI: string (nullable = true)
 |    |    |-- Unemployment: string (nullable = true)
 |    |    |-- IsHoliday: string (nullable = true)
 |    |    |-- last_week

### 2.4 Persist raw data in Parquet format

In [6]:
# Stream the flattened (still string-typed) records into a Parquet file sink.
query_raw = (
    df_formatted.writeStream
    .format("parquet")
    .outputMode("append")
    .options(path="raw_parquet", checkpointLocation="raw_checkpoint")
    .start()
)


In [7]:
# Batch-read whatever the raw sink has written so far and inspect it.
query_raw_df = spark.read.format("parquet").load("raw_parquet")
query_raw_df.printSchema()
query_raw_df.show()

root
 |-- Store: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Temperature: string (nullable = true)
 |-- Fuel_Price: string (nullable = true)
 |-- MarkDown1: string (nullable = true)
 |-- MarkDown2: string (nullable = true)
 |-- MarkDown3: string (nullable = true)
 |-- MarkDown4: string (nullable = true)
 |-- MarkDown5: string (nullable = true)
 |-- CPI: string (nullable = true)
 |-- Unemployment: string (nullable = true)
 |-- IsHoliday: string (nullable = true)
 |-- last_weekly_sales: string (nullable = true)
 |-- ts: long (nullable = true)

+-----+----------+-----------+----------+---------+---------+---------+---------+---------+---------+------------+---------+------------------+----------+
|Store|      Date|Temperature|Fuel_Price|MarkDown1|MarkDown2|MarkDown3|MarkDown4|MarkDown5|      CPI|Unemployment|IsHoliday| last_weekly_sales|        ts|
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+---------+------------+------

In [8]:
# Stop the raw-data file-sink query (adapted from Lab 11).
query_raw.stop()
# Optional cleanup: uncomment to delete the Parquet output and its checkpoint.
# !rm -r raw_parquet
# !rm -r raw_checkpoint

### 2.5 Transform data formats


In [9]:
# Convert the string-typed columns to their working types in one projection.
# Column order is kept; `ts` is converted with to_timestamp instead of a cast.
df_formatted = df_formatted.select(
    *[
        F.col(column_name).cast(target_type).alias(column_name)
        for column_name, target_type in [
            ("Store", IntegerType()),
            ("Date", DateType()),
            ("Temperature", FloatType()),
            ("Fuel_Price", FloatType()),
            ("MarkDown1", FloatType()),
            ("MarkDown2", FloatType()),
            ("MarkDown3", FloatType()),
            ("MarkDown4", FloatType()),
            ("MarkDown5", FloatType()),
            ("CPI", FloatType()),
            ("Unemployment", FloatType()),
            ("IsHoliday", BooleanType()),
            ("last_weekly_sales", FloatType()),
        ]
    ],
    F.to_timestamp(F.col("ts")).alias("ts"),
)

In [10]:
# Confirm the column types after the conversion step.
df_formatted.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Temperature: float (nullable = true)
 |-- Fuel_Price: float (nullable = true)
 |-- MarkDown1: float (nullable = true)
 |-- MarkDown2: float (nullable = true)
 |-- MarkDown3: float (nullable = true)
 |-- MarkDown4: float (nullable = true)
 |-- MarkDown5: float (nullable = true)
 |-- CPI: float (nullable = true)
 |-- Unemployment: float (nullable = true)
 |-- IsHoliday: boolean (nullable = true)
 |-- last_weekly_sales: float (nullable = true)
 |-- ts: timestamp (nullable = true)



### 2.6 Prepare feature columns


In [11]:
from pyspark.sql.functions import month, dayofmonth, dayofyear, weekofyear

# Derive calendar features from the `Date` column.
df_formatted = (
    df_formatted
    .withColumn("Month", month("Date"))
    .withColumn("day_of_month", dayofmonth("Date"))
    .withColumn("day_of_year", dayofyear("Date"))
    .withColumn("week_of_year", weekofyear("Date"))
)


### 2.7 Join the local data


In [12]:
# Inner-join the stream with the static store table to add `Type` and `Size`.
joined_df = df_formatted.join(stores_df, on=["Store"], how="inner")
joined_df.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Temperature: float (nullable = true)
 |-- Fuel_Price: float (nullable = true)
 |-- MarkDown1: float (nullable = true)
 |-- MarkDown2: float (nullable = true)
 |-- MarkDown3: float (nullable = true)
 |-- MarkDown4: float (nullable = true)
 |-- MarkDown5: float (nullable = true)
 |-- CPI: float (nullable = true)
 |-- Unemployment: float (nullable = true)
 |-- IsHoliday: boolean (nullable = true)
 |-- last_weekly_sales: float (nullable = true)
 |-- ts: timestamp (nullable = true)
 |-- Month: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- day_of_year: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Size: integer (nullable = true)



### 2.8 Load ML model & Perform predictions


In [13]:
from pyspark.ml import PipelineModel

## Uncomment this block if the model folder has not been unzipped yet.
# import zipfile as zf
# files = zf.ZipFile("sales_estimation_pipeline_model.zip", 'r')
# files.extractall()
# files.close()

model_path = "sales_estimation_pipeline_model"

# Load the previously fitted pipeline from disk.
model = PipelineModel.load(model_path)

# Apply the pipeline to the joined stream; the result is still a streaming
# DataFrame with the pipeline's output columns (including `prediction`) added.
prediction = model.transform(joined_df)

# Persist the predictions to a Parquet file sink.
predict_query = (
    prediction.writeStream
    .outputMode("append")
    .format("parquet")
    .option("path", "predict_parquet")
    .option("checkpointLocation", "predict_checkpoint")
    .start()
)

# start() returns immediately, so a batch read issued right away can run
# before any micro-batch has been committed (missing path or no Parquet
# schema to infer). Give the query a bounded amount of time to write output;
# awaitTermination also re-raises the error if the query has already failed.
predict_warmup_seconds = 30
predict_query.awaitTermination(predict_warmup_seconds)

parquet_file = spark.read.parquet("predict_parquet")
parquet_file.show()


+-----+----------+-----------+----------+---------+---------+---------+---------+---------+---------+------------+---------+-----------------+-------------------+-----+------------+-----------+------------+----+------+--------+-------------+--------------------+------------------+
|Store|      Date|Temperature|Fuel_Price|MarkDown1|MarkDown2|MarkDown3|MarkDown4|MarkDown5|      CPI|Unemployment|IsHoliday|last_weekly_sales|                 ts|Month|day_of_month|day_of_year|week_of_year|Type|  Size|Type_idx|     Type_vec|            features|        prediction|
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+---------+------------+---------+-----------------+-------------------+-----+------------+-----------+------------+----+------+--------+-------------+--------------------+------------------+
|    1|2011-07-01|      85.55|     3.524|     null|     null|     null|     null|     null|215.18414|       7.962|    false|        1438830.1|2023-05-22 0

In [14]:
# Stop the prediction file-sink query.
predict_query.stop()
# Optional cleanup: uncomment to delete the prediction output and its checkpoint.
# !rm -r predict_parquet
# !rm -r predict_checkpoint

### 2.9 Analyze prediction results

In [15]:
from pyspark.sql.functions import col, window, count

# `goal` is the predicted sales divided by store size; keep only rows whose
# goal is above 8.5.
df_goal = (
    prediction
    .withColumn('goal', col('prediction') / col('Size'))
    .where(col('goal') > 8.5)
)

In [16]:
# Sanity check: the schema should now include the `goal` column.
df_goal.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Temperature: float (nullable = true)
 |-- Fuel_Price: float (nullable = true)
 |-- MarkDown1: float (nullable = true)
 |-- MarkDown2: float (nullable = true)
 |-- MarkDown3: float (nullable = true)
 |-- MarkDown4: float (nullable = true)
 |-- MarkDown5: float (nullable = true)
 |-- CPI: float (nullable = true)
 |-- Unemployment: float (nullable = true)
 |-- IsHoliday: boolean (nullable = true)
 |-- last_weekly_sales: float (nullable = true)
 |-- ts: timestamp (nullable = true)
 |-- Month: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- day_of_year: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Size: integer (nullable = true)
 |-- Type_idx: double (nullable = false)
 |-- Type_vec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)
 |-- goal: double (nullable

In [17]:
# Count qualifying records per store type over 10-second windows that slide
# every 5 seconds, with a 3-second watermark on the event time `ts`.
windowedCounts = (
    df_goal
    .withWatermark("ts", "3 seconds")
    .groupBy(F.window("ts", "10 seconds", "5 seconds"), "Type")
    .agg(F.count("Store").alias("count"))
    .select("window", "Type", "count")
)



In [18]:
# Inspect the shape of the windowed aggregation result.
windowedCounts.printSchema()

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- Type: string (nullable = true)
 |-- count: long (nullable = false)



In [30]:
def show_batch(df, epoch_id):
    """Print a micro-batch to stdout, skipping batches that contain no rows.

    Intended as a ``foreachBatch`` callback.

    Args:
        df: The micro-batch output DataFrame.
        epoch_id: Identifier of the micro-batch (unused).
    """
    # Two actions (count, show) run on the same batch; cache it so the batch
    # is computed once rather than once per action.
    df.persist()
    try:
        if df.count() > 0:
            df.show(truncate=False)
    finally:
        # Release the cached batch even if an action above fails.
        df.unpersist()
        
# Every 5 seconds, hand the rows updated since the last trigger to show_batch.
# No .format(...) is set: foreachBatch is itself the sink and would override
# any format given before it.
query = (
    windowedCounts.writeStream
    .outputMode("update")
    .trigger(processingTime='5 seconds')
    .foreachBatch(show_batch)
    .start()
)



+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:09...|   C|   10|
|{2023-05-22 05:16...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:17...|   A|   19|
|{2023-05-22 05:08...|   C|   10|
|{2023-05-22 05:22...|   A|   14|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:11...|   B|   16|
|{2023-05-22 05:20...|   A|   17|
|{2023-05-22 05:18...|   B|   15|
|{2023-05-22 05:09...|   B|   12|
|{2023-05-22 05:19...|   A|   17|
|{2023-05-22 05:17...|   C|    9|
|{2023-05-22 05:17...|   C|   10|
|{2023-05-22 05:21...|   A|   27|
|{2023-05-22 05:18...|   C|   10|
|{2023-05-22 05:18...|   C|    8|
|{2023-05-22 05:19...|   B|   17|
+--------------------+----+-----+
only showing top 20 rows

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|

In [31]:
# Stop the windowed-count streaming query.
query.stop()

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:09...|   C|   10|
|{2023-05-22 05:16...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:17...|   A|   19|
|{2023-05-22 05:08...|   C|   10|
|{2023-05-22 05:22...|   A|   14|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:11...|   B|   16|
|{2023-05-22 05:20...|   A|   17|
|{2023-05-22 05:18...|   B|   15|
|{2023-05-22 05:09...|   B|   12|
|{2023-05-22 05:19...|   A|   17|
|{2023-05-22 05:17...|   C|    9|
|{2023-05-22 05:17...|   C|   10|
|{2023-05-22 05:21...|   A|   27|
|{2023-05-22 05:18...|   C|   10|
|{2023-05-22 05:18...|   C|    8|
|{2023-05-22 05:19...|   B|   17|
+--------------------+----+-----+
only showing top 20 rows

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:09...|   C|   10|
|{2023-05-22 05:16...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:23...|   A|   14|
|{2023-05-22 05:17...|   A|   19|
|{2023-05-22 05:08...|   C|   10|
|{2023-05-22 05:22...|   A|   14|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:11...|   B|   16|
|{2023-05-22 05:20...|   A|   17|
|{2023-05-22 05:18...|   B|   15|
|{2023-05-22 05:09...|   B|   12|
|{2023-05-22 05:19...|   A|   17|
|{2023-05-22 05:17...|   C|    9|
|{2023-05-22 05:17...|   C|   10|
|{2023-05-22 05:21...|   A|   27|
|{2023-05-22 05:18...|   C|   10|
|{2023-05-22 05:18...|   C|    8|
+--------------------+----+-----+
only showing top 20 rows

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:09...|   C|   10|
|{2023-05-22 05:16...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:23...|   A|   14|
|{2023-05-22 05:17...|   A|   19|
|{2023-05-22 05:08...|   C|   10|
|{2023-05-22 05:22...|   A|   14|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:11...|   B|   16|
|{2023-05-22 05:20...|   A|   17|
|{2023-05-22 05:18...|   B|   15|
|{2023-05-22 05:09...|   B|   12|
|{2023-05-22 05:19...|   A|   17|
|{2023-05-22 05:17...|   C|    9|
|{2023-05-22 05:17...|   C|   10|
|{2023-05-22 05:21...|   A|   27|
|{2023-05-22 05:18...|   C|   10|
|{2023-05-22 05:18...|   C|    8|
+--------------------+----+-----+
only showing top 20 rows

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:09...|   C|   10|
|{2023-05-22 05:16...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:23...|   A|   14|
|{2023-05-22 05:17...|   A|   19|
|{2023-05-22 05:08...|   C|   10|
|{2023-05-22 05:22...|   A|   14|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:11...|   B|   16|
|{2023-05-22 05:20...|   A|   17|
|{2023-05-22 05:18...|   B|   15|
|{2023-05-22 05:09...|   B|   12|
|{2023-05-22 05:19...|   A|   17|
|{2023-05-22 05:17...|   C|    9|
|{2023-05-22 05:17...|   C|   10|
|{2023-05-22 05:21...|   A|   27|
|{2023-05-22 05:18...|   C|   10|
|{2023-05-22 05:18...|   C|    8|
+--------------------+----+-----+
only showing top 20 rows

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:09...|   C|   10|
|{2023-05-22 05:16...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:23...|   A|   14|
|{2023-05-22 05:17...|   A|   19|
|{2023-05-22 05:08...|   C|   10|
|{2023-05-22 05:22...|   A|   14|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:11...|   B|   16|
|{2023-05-22 05:20...|   A|   17|
|{2023-05-22 05:18...|   B|   15|
|{2023-05-22 05:09...|   B|   12|
|{2023-05-22 05:19...|   A|   17|
|{2023-05-22 05:17...|   C|    9|
|{2023-05-22 05:17...|   C|   10|
|{2023-05-22 05:21...|   A|   27|
|{2023-05-22 05:18...|   C|   10|
|{2023-05-22 05:18...|   C|    8|
+--------------------+----+-----+
only showing top 20 rows

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:09...|   C|   10|
|{2023-05-22 05:16...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:23...|   A|   14|
|{2023-05-22 05:17...|   A|   19|
|{2023-05-22 05:08...|   C|   10|
|{2023-05-22 05:22...|   A|   14|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:11...|   B|   16|
|{2023-05-22 05:20...|   A|   17|
|{2023-05-22 05:18...|   B|   15|
|{2023-05-22 05:09...|   B|   12|
|{2023-05-22 05:19...|   A|   17|
|{2023-05-22 05:17...|   C|    9|
|{2023-05-22 05:17...|   C|   10|
|{2023-05-22 05:21...|   A|   27|
|{2023-05-22 05:18...|   C|   10|
|{2023-05-22 05:18...|   C|    8|
+--------------------+----+-----+
only showing top 20 rows

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:09...|   C|   10|
|{2023-05-22 05:16...|   C|   10|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:23...|   A|   14|
|{2023-05-22 05:17...|   A|   19|
|{2023-05-22 05:08...|   C|   10|
|{2023-05-22 05:22...|   A|   14|
|{2023-05-22 05:22...|   C|   10|
|{2023-05-22 05:11...|   B|   16|
|{2023-05-22 05:20...|   A|   17|
|{2023-05-22 05:18...|   B|   15|
|{2023-05-22 05:09...|   B|   12|
|{2023-05-22 05:19...|   A|   17|
|{2023-05-22 05:17...|   C|    9|
|{2023-05-22 05:17...|   C|   10|
|{2023-05-22 05:21...|   A|   27|
|{2023-05-22 05:18...|   C|   10|
|{2023-05-22 05:18...|   C|    8|
+--------------------+----+-----+
only showing top 20 rows

+--------------------+----+-----+
|              window|Type|count|
+--------------------+----+-----+
|{2023-05-22 05:14...|   C|   10|
|{2023-05-22 05:22...|

Note: micro-batches whose result DataFrame is empty (a count of 0) are skipped, so they do not appear in the output above.