In this demonstration, we will use Linear Regression from Spark ML to make sales predictions. Our original CSV file (sources/searchterms.csv) contains the search terms recorded in the years 2020, 2021, and 2022. Using this data, we will try to make sales predictions for the years 2023 and 2024 by leveraging the search counts for each search term.

In [52]:
# Silence the warnings generated by the code in the rest of the notebook.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for ``warnings.warn``."""
    return None


# Ignore everything that goes through the warning filters, and replace
# warnings.warn itself so that direct calls become no-ops as well.
warnings.filterwarnings('ignore')
warnings.warn = warn

In [53]:
# Import Libraries
import findspark
findspark.init()

import os

from pyspark.sql import SparkSession, Row
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StandardScaler, VectorAssembler, StringIndexer
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.regression import LinearRegression

In [54]:
# Get the SparkSession for this notebook (an existing one is reused, otherwise one is created)
spark = (
    SparkSession.builder
    .appName("Predict Sales with Linear Regression on Spark ML")
    .getOrCreate()
)

In [55]:
# Load the search-term CSV (the first row is the header and column types are
# inferred), then preview the first rows and print the resulting schema
csv_path = "sources/searchterms.csv"
sdf = spark.read.csv(csv_path, header=True, inferSchema=True)

sdf.show(5)
sdf.printSchema()

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2020| mobile 6 inch|
| 12|   11|2020| mobile latest|
| 12|   11|2020|   tablet wifi|
| 12|   11|2020|laptop 14 inch|
| 12|   11|2020|     mobile 5g|
+---+-----+----+--------------+
only showing top 5 rows

root
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- searchterm: string (nullable = true)



In [56]:
# "searchterm" is a string column that is used as a model feature further down,
# so encode each term as a numeric index in a new "searchtermnum" column
indexer = StringIndexer(inputCol="searchterm", outputCol="searchtermnum")
indexer_model = indexer.fit(sdf)
sdf = indexer_model.transform(sdf)

In [57]:
# Sanity check: the per-value row counts of the original "searchterm" column and
# of its numerical version "searchtermnum" should line up one-to-one
for column_name in ('searchterm', 'searchtermnum'):
    sdf.groupBy(column_name).count().orderBy('count').show()

+-------------------+-----+
|         searchterm|count|
+-------------------+-----+
|          pen drive|  144|
|ebooks data science|  410|
|     laptop 14 inch|  461|
|      gaming laptop|  499|
|     tablet 10 inch|  715|
|        tablet wifi|  896|
|             laptop|  935|
|      mobile latest| 1327|
|          mobile 5g| 2301|
|      mobile 6 inch| 2312|
+-------------------+-----+

+-------------+-----+
|searchtermnum|count|
+-------------+-----+
|          9.0|  144|
|          8.0|  410|
|          7.0|  461|
|          6.0|  499|
|          5.0|  715|
|          4.0|  896|
|          3.0|  935|
|          2.0| 1327|
|          1.0| 2301|
|          0.0| 2312|
+-------------+-----+



In [58]:
# Use Spark SQL to count how often each search term was searched in each year
sdf.createOrReplaceTempView("sdfview")

# One output row per (searchtermnum, year) pair, sorted by year and then by term index
count_query = "SELECT searchtermnum, COUNT(searchtermnum) as searchcount, year FROM sdfview GROUP BY searchtermnum, year ORDER BY year, searchtermnum"
grouped_sdf = spark.sql(count_query)
grouped_sdf.show(5)

+-------------+-----------+----+
|searchtermnum|searchcount|year|
+-------------+-----------+----+
|          0.0|        243|2020|
|          1.0|        213|2020|
|          2.0|        134|2020|
|          3.0|         92|2020|
|          4.0|         93|2020|
+-------------+-----------+----+
only showing top 5 rows



In [59]:
# Build the pipeline: "searchtermnum" and "year" are the features, "searchcount" is the target
assembler = VectorAssembler(
    inputCols=["searchtermnum", "year"],
    outputCol="features",
)
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)
lr = LinearRegression(
    featuresCol="scaledFeatures",
    labelCol="searchcount",
    regParam=0.1,
)

# Stage order matters: each stage reads the column written by the one before it
pipeline_stages = [assembler, scaler, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [60]:
# Split the grouped counts 70/30 into training and testing data; the fixed seed
# keeps the split repeatable between runs
split_weights = [0.7, 0.3]
training_data, testing_data = grouped_sdf.randomSplit(split_weights, seed=42)

In [61]:
# Train the model: fit the assembler, scaler, and linear-regression stages of
# the pipeline on the training split only
model = pipeline.fit(training_data)

In [62]:
# Make the predictions on the held-out testing split; the fitted pipeline adds the
# "features", "scaledFeatures", and "prediction" columns to each row
predictions = model.transform(testing_data)
predictions.show(5)

+-------------+-----------+----+------------+--------------------+------------------+
|searchtermnum|searchcount|year|    features|      scaledFeatures|        prediction|
+-------------+-----------+----+------------+--------------------+------------------+
|          0.0|       1256|2022|[0.0,2022.0]|[0.0,2557.6501715...| 899.7955508879386|
|          2.0|        134|2020|[2.0,2020.0]|[0.67386211516186...| 296.8953954371973|
|          2.0|        732|2022|[2.0,2022.0]|[0.67386211516186...| 751.3004338201135|
|          3.0|         92|2020|[3.0,2020.0]|[1.01079317274279...|222.64783690322656|
|          4.0|        314|2021|[4.0,2021.0]|[1.34772423032372...|375.60279756074306|
+-------------+-----------+----+------------+--------------------+------------------+
only showing top 5 rows



In [63]:
# Evaluate the test-set predictions with R squared, Root Mean Squared Error, and
# Mean Absolute Error, printing each metric as soon as it is computed
metric_values = {}
for metric_name, metric_label in (("r2", "R Squared ="), ("rmse", "RMSE ="), ("mae", "MAE =")):
    evaluator = RegressionEvaluator(
        labelCol="searchcount",
        predictionCol="prediction",
        metricName=metric_name,
    )
    metric_values[metric_name] = evaluator.evaluate(predictions)
    print(metric_label, metric_values[metric_name])

# Keep the individual results available under their usual names
r2 = metric_values["r2"]
rmse = metric_values["rmse"]
mae = metric_values["mae"]

R Squared = 0.809266318611074
RMSE = 149.54214708055568
MAE = 119.99339563497611


In [64]:
# Let's make sales predictions on the year 2023 and 2024 for each search term.
# One Row per (year, search-term index) pair: indices 0.0 through 9.0 for 2023
# first, followed by the same indices for 2024.
new_data = [
    Row(searchtermnum=float(term_index), year=forecast_year)
    for forecast_year in (2023, 2024)
    for term_index in range(10)
]

# Turn the Rows into a DataFrame with the two feature columns the pipeline expects
sdf_for_new_predictions = spark.createDataFrame(new_data)

# Run the already-fitted pipeline model on the new rows
new_predictions = model.transform(sdf_for_new_predictions)

# Display the sales predictions for 2023 and 2024
new_predictions.show()

+-------------+----+------------+--------------------+------------------+
|searchtermnum|year|    features|      scaledFeatures|        prediction|
+-------------+----+------------+--------------------+------------------+
|          0.0|2023|[0.0,2023.0]|[0.0,2558.9150826...|1126.9980700793676|
|          1.0|2023|[1.0,2023.0]|[0.33693105758093...| 1052.750511545455|
|          2.0|2023|[2.0,2023.0]|[0.67386211516186...| 978.5029530114844|
|          3.0|2023|[3.0,2023.0]|[1.01079317274279...| 904.2553944775718|
|          4.0|2023|[4.0,2023.0]|[1.34772423032372...| 830.0078359436011|
|          5.0|2023|[5.0,2023.0]|[1.68465528790465...| 755.7602774096886|
|          6.0|2023|[6.0,2023.0]|[2.02158634548558...| 681.5127188757178|
|          7.0|2023|[7.0,2023.0]|[2.35851740306651...| 607.2651603418053|
|          8.0|2023|[8.0,2023.0]|[2.69544846064744...| 533.0176018078928|
|          9.0|2023|[9.0,2023.0]|[3.03237951822837...|458.77004327392206|
|          0.0|2024|[0.0,2024.0]|[0.0,

In [65]:
# Save the trained model to the first "model_storage_NN" folder that does not exist yet
base_dir = "model_storage_"
i = 0
dir_path = f"{base_dir}{i:02d}"
while os.path.exists(dir_path):
    # This path already exists; move on to the next zero-padded suffix
    i += 1
    dir_path = f"{base_dir}{i:02d}"

model.write().save(dir_path)

In [66]:
# Let's test the persisted model by loading it back from the folder it was saved to
loaded_model = PipelineModel.load(dir_path)

In [67]:
# Re-run both prediction sets through the reloaded model so the output can be
# compared with the predictions made before the model was saved

# Test split first ...
predictions_from_loaded_model = loaded_model.transform(testing_data)
predictions_from_loaded_model.show(5)

# ... then the 2023/2024 rows
new_predictions_from_loaded_model = loaded_model.transform(sdf_for_new_predictions)
new_predictions_from_loaded_model.show(5)

+-------------+-----------+----+------------+--------------------+------------------+
|searchtermnum|searchcount|year|    features|      scaledFeatures|        prediction|
+-------------+-----------+----+------------+--------------------+------------------+
|          0.0|       1256|2022|[0.0,2022.0]|[0.0,2557.6501715...| 899.7955508879386|
|          2.0|        134|2020|[2.0,2020.0]|[0.67386211516186...| 296.8953954371973|
|          2.0|        732|2022|[2.0,2022.0]|[0.67386211516186...| 751.3004338201135|
|          3.0|         92|2020|[3.0,2020.0]|[1.01079317274279...|222.64783690322656|
|          4.0|        314|2021|[4.0,2021.0]|[1.34772423032372...|375.60279756074306|
+-------------+-----------+----+------------+--------------------+------------------+
only showing top 5 rows

+-------------+----+------------+--------------------+------------------+
|searchtermnum|year|    features|      scaledFeatures|        prediction|
+-------------+----+------------+--------------------

In [68]:
# Stop the SparkSession now that training, prediction, and the save/load check are done
spark.stop()