Rishabh Johri

rjohri@deloitte.com

# Spark MLlib: Predicting Repeat Customer Purchase


This notebook demonstrates how to use **PySpark MLlib** to build a classification model that predicts whether a customer is likely to make a repeat purchase.

We use a toy dataset with features:
- Age
- Total number of orders
- Average order value

The target label is `repeat_purchase` (1 = Yes, 0 = No).


In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [2]:
# Obtain the Spark session used by every later cell; getOrCreate() reuses
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("RepeatPurchasePrediction")
    .getOrCreate()
)

In [3]:
# Load the dataset
# The path defaults to the Colab location this notebook was written against;
# set the CUSTOMER_DATA_PATH environment variable to run the notebook
# elsewhere without editing code.
DATA_PATH = os.environ.get("CUSTOMER_DATA_PATH", "/content/customer_repeat_purchase.csv")

# header=True takes column names from the first line; inferSchema=True lets
# Spark derive column types from the file contents.
data = spark.read.csv(DATA_PATH, header=True, inferSchema=True)

# Fail fast, with a readable message, if the file lacks a column that the
# feature-assembly cell further down selects by name.
REQUIRED_COLUMNS = ["age", "total_orders", "avg_order_value", "repeat_purchase"]
missing_columns = [name for name in REQUIRED_COLUMNS if name not in data.columns]
if missing_columns:
    raise ValueError(f"{DATA_PATH} is missing required columns: {missing_columns}")

data.show()

+-----------+---+------------+---------------+---------------+
|customer_id|age|total_orders|avg_order_value|repeat_purchase|
+-----------+---+------------+---------------+---------------+
|          1| 25|           5|           40.0|              1|
|          2| 34|           2|           20.0|              0|
|          3| 45|          10|           60.0|              1|
|          4| 23|           1|           15.0|              0|
|          5| 38|           6|           55.0|              1|
|          6| 29|           3|           22.0|              0|
|          7| 41|           9|           48.0|              1|
|          8| 36|           4|           30.0|              0|
|          9| 50|          12|           65.0|              1|
|         10| 31|           2|           18.0|              0|
|         11| 27|           5|           35.0|              1|
|         12| 33|           3|           28.0|              0|
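|         13| 47|          11|           62.0|              1|
|         14| 22|           1|           12.0|              0|
|         15| 40|           7|           50.0|              1|
+-----------+---+------------+---------------+---------------+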

In [4]:
# Create feature vector
# Columns packed into the single vector column the classifier reads.
feature_cols = ["age", "total_orders", "avg_order_value"]
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)

# Keep only what training needs: the assembled vector and the target,
# with the target exposed under the name "label".
label_column = col("repeat_purchase").alias("label")
data_prepared = assembler.transform(data).select("features", label_column)
data_prepared.show()

+----------------+-----+
|        features|label|
+----------------+-----+
| [25.0,5.0,40.0]|    1|
| [34.0,2.0,20.0]|    0|
|[45.0,10.0,60.0]|    1|
| [23.0,1.0,15.0]|    0|
| [38.0,6.0,55.0]|    1|
| [29.0,3.0,22.0]|    0|
| [41.0,9.0,48.0]|    1|
| [36.0,4.0,30.0]|    0|
|[50.0,12.0,65.0]|    1|
| [31.0,2.0,18.0]|    0|
| [27.0,5.0,35.0]|    1|
| [33.0,3.0,28.0]|    0|
|[47.0,11.0,62.0]|    1|
| [22.0,1.0,12.0]|    0|
| [40.0,7.0,50.0]|    1|
+----------------+-----+



In [5]:
# Randomly partition the prepared rows into a training part and a test part
# using weights 0.7 / 0.3; the fixed seed keeps the partition repeatable.
train_data, test_data = data_prepared.randomSplit(
    [0.7, 0.3],
    seed=42,
)

In [6]:
# Decision tree that reads the "features" vector and learns the "label" column.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

# Single-stage pipeline wrapping the classifier.
pipeline = Pipeline().setStages([dt])

# Fit on the training split only; the test split is held back for evaluation.
model = pipeline.fit(train_data)

In [7]:
# Score the held-out rows with the fitted pipeline, then display each row's
# features next to its true label and the model's prediction.
predictions = model.transform(test_data)
comparison_view = predictions.select("features", "label", "prediction")
comparison_view.show()

+----------------+-----+----------+
|        features|label|prediction|
+----------------+-----+----------+
| [25.0,5.0,40.0]|    1|       1.0|
| [33.0,3.0,28.0]|    0|       0.0|
| [36.0,4.0,30.0]|    0|       0.0|
| [38.0,6.0,55.0]|    1|       1.0|
|[47.0,11.0,62.0]|    1|       1.0|
|[50.0,12.0,65.0]|    1|       1.0|
+----------------+-----+----------+



In [8]:
# Both evaluators compare the same pair of columns and differ only in the metric.
shared_columns = {"labelCol": "label", "predictionCol": "prediction"}
evaluator_acc = MulticlassClassificationEvaluator(metricName="accuracy", **shared_columns)
evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1", **shared_columns)

# Compute each metric on the test-set predictions.
accuracy = evaluator_acc.evaluate(predictions)
f1 = evaluator_f1.evaluate(predictions)

# Report both scores rounded to two decimals.
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 1.00
F1 Score: 1.00


In [9]:
# Shut down the Spark session created at the top of the notebook. Re-run the
# session-creation cell before executing any Spark cell again.
spark.stop()