0️⃣ Colab Setup – Fast PySpark Installation

In [None]:
# Install PySpark (fast, ~50MB)
!pip install -q pyspark

# Import SparkSession
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder

# Initialize Spark
spark = SparkSession.builder \
    .appName("TP4_SparkMLlib") \
    .getOrCreate()

# Optional: Check Spark version
spark.version


'3.5.1'

1️⃣ Load all CSV files into a single DataFrame

In [None]:
import glob

# Single source of truth for the input location (change path if needed).
# Previously the glob pattern and the path given to Spark disagreed, and the
# glob result was never used.
DATA_GLOB = "/content/drive/MyDrive/data/*.csv"

# Fail fast with a clear message if nothing matches (e.g. Drive not mounted).
csv_files = glob.glob(DATA_GLOB)
if not csv_files:
    raise FileNotFoundError(f"No CSV files match {DATA_GLOB!r} - is Google Drive mounted?")
print(f"Found {len(csv_files)} CSV file(s)")

# Load all CSVs into a single DataFrame; the header row gives column names
# and inferSchema lets Spark detect column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(DATA_GLOB)
)

# Preview first 5 rows
df.show(5)


+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   537226|    22811|SET OF 6 T-LIGHTS...|       6|2010-12-06 08:34:00|     2.95|   15987.0|United Kingdom|
|   537226|    21713|CITRONELLA CANDLE...|       8|2010-12-06 08:34:00|      2.1|   15987.0|United Kingdom|
|   537226|    22927|GREEN GIANT GARDE...|       2|2010-12-06 08:34:00|     5.95|   15987.0|United Kingdom|
|   537226|    20802|SMALL GLASS SUNDA...|       6|2010-12-06 08:34:00|     1.65|   15987.0|United Kingdom|
|   537226|    22052|VINTAGE CARAVAN G...|      25|2010-12-06 08:34:00|     0.42|   15987.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



2️⃣ Fill missing values (NaN) with 0

In [None]:
from pyspark.sql.functions import col  # ⚠️ import col

# Replace nulls with 0 via the DataFrameNaFunctions accessor.
# Note: a numeric fill value is only applied to numeric columns; string
# columns are left unchanged.
df = df.na.fill(0)

3️⃣ Add day_of_week column from InvoiceDate

In [None]:
from pyspark.sql.functions import dayofweek, to_date, col

# Two chained steps:
#   1. truncate InvoiceDate to a proper date type
#   2. derive day_of_week from it (1 = Sunday, 7 = Saturday)
# Note: this cell rebinds `df`, so it is meant to be run once per kernel session.
df = (
    df
    .withColumn("InvoiceDate", to_date(col("InvoiceDate"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("day_of_week", dayofweek(col("InvoiceDate")))
)

# Preview
df.show(5)


+---------+---------+--------------------+--------+-----------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+-----------+
|   537226|    22811|SET OF 6 T-LIGHTS...|       6| 2010-12-06|     2.95|   15987.0|United Kingdom|          2|
|   537226|    21713|CITRONELLA CANDLE...|       8| 2010-12-06|      2.1|   15987.0|United Kingdom|          2|
|   537226|    22927|GREEN GIANT GARDE...|       2| 2010-12-06|     5.95|   15987.0|United Kingdom|          2|
|   537226|    20802|SMALL GLASS SUNDA...|       6| 2010-12-06|     1.65|   15987.0|United Kingdom|          2|
|   537226|    22052|VINTAGE CARAVAN G...|      25| 2010-12-06|     0.42|   15987.0|United Kingdom|          2|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+-----

4️⃣ Split into training and test sets

In [None]:
# Time-based split on InvoiceDate:
#   training = purchases strictly before the cutoff
#   test     = purchases on or after the cutoff
split_date = "2010-12-13"
invoice_date = col("InvoiceDate")

train_df = df.where(invoice_date < split_date)
test_df = df.where(invoice_date >= split_date)

# Check counts
print("Training rows:", train_df.count())
print("Test rows:", test_df.count())


Training rows: 26732
Test rows: 18676


6️⃣ Create a StringIndexer for day_of_week

In [None]:
from pyspark.ml.feature import StringIndexer

# Step 6: StringIndexer mapping day_of_week to a numeric category index
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_indexed",
)

# Learn the category → index mapping from the training data, then apply it
day_index_model = indexer.fit(train_df)
train_indexed = day_index_model.transform(train_df)

# Show first 5 rows
train_indexed.show(5)


+---------+---------+--------------------+--------+-----------+---------+----------+--------------+-----------+-------------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|day_of_week_indexed|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+-----------+-------------------+
|   537226|    22811|SET OF 6 T-LIGHTS...|       6| 2010-12-06|     2.95|   15987.0|United Kingdom|          2|                4.0|
|   537226|    21713|CITRONELLA CANDLE...|       8| 2010-12-06|      2.1|   15987.0|United Kingdom|          2|                4.0|
|   537226|    22927|GREEN GIANT GARDE...|       2| 2010-12-06|     5.95|   15987.0|United Kingdom|          2|                4.0|
|   537226|    20802|SMALL GLASS SUNDA...|       6| 2010-12-06|     1.65|   15987.0|United Kingdom|          2|                4.0|
|   537226|    22052|VINTAGE CARAVAN G...|      25| 2010-12-06|     0.42|   

7️⃣ Discussion question :
Spark's `dayofweek` represents weekdays numerically (1 = Sunday, 2 = Monday, …, 7 = Saturday), and StringIndexer then maps each value to an index ordered by frequency.
But treating these values as numbers makes it seem like Saturday > Monday, which isn’t meaningful.
✅ To fix this, we use OneHotEncoder, which creates a binary vector representation instead of ranking them numerically.

In [None]:
# 2️⃣ OneHotEncoder: turn the category index into a sparse binary vector
# so that no artificial ordering between weekdays is implied
encoder = OneHotEncoder(
    inputCols=["day_of_week_indexed"],
    outputCols=["day_of_week_encoded"],
)
day_encoder_model = encoder.fit(train_indexed)
train_encoded = day_encoder_model.transform(train_indexed)

# Show first 5 rows after OneHotEncoding (full column contents)
train_encoded.show(5, truncate=False)


+---------+---------+------------------------------+--------+-----------+---------+----------+--------------+-----------+-------------------+-------------------+
|InvoiceNo|StockCode|Description                   |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |day_of_week|day_of_week_indexed|day_of_week_encoded|
+---------+---------+------------------------------+--------+-----------+---------+----------+--------------+-----------+-------------------+-------------------+
|537226   |22811    |SET OF 6 T-LIGHTS CACTI       |6       |2010-12-06 |2.95     |15987.0   |United Kingdom|2          |4.0                |(5,[4],[1.0])      |
|537226   |21713    |CITRONELLA CANDLE FLOWERPOT   |8       |2010-12-06 |2.1      |15987.0   |United Kingdom|2          |4.0                |(5,[4],[1.0])      |
|537226   |22927    |GREEN GIANT GARDEN THERMOMETER|2       |2010-12-06 |5.95     |15987.0   |United Kingdom|2          |4.0                |(5,[4],[1.0])      |
|537226   |20802    |SMALL G

8️⃣ Add VectorAssembler for features

In [None]:
from pyspark.ml.feature import VectorAssembler

# Columns combined, in this order, into the single "features" vector
feature_cols = ["UnitPrice", "Quantity", "day_of_week_encoded"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_features = assembler.transform(train_encoded)

# Show the inputs next to the assembled vector for the first 5 rows
train_features.select(*feature_cols, "features").show(5, truncate=False)


+---------+--------+-------------------+---------------------------+
|UnitPrice|Quantity|day_of_week_encoded|features                   |
+---------+--------+-------------------+---------------------------+
|2.95     |6       |(5,[4],[1.0])      |(7,[0,1,6],[2.95,6.0,1.0]) |
|2.1      |8       |(5,[4],[1.0])      |(7,[0,1,6],[2.1,8.0,1.0])  |
|5.95     |2       |(5,[4],[1.0])      |(7,[0,1,6],[5.95,2.0,1.0]) |
|1.65     |6       |(5,[4],[1.0])      |(7,[0,1,6],[1.65,6.0,1.0]) |
|0.42     |25      |(5,[4],[1.0])      |(7,[0,1,6],[0.42,25.0,1.0])|
+---------+--------+-------------------+---------------------------+
only showing top 5 rows



Step 9: Create Pipeline

In [None]:
from pyspark.ml import Pipeline

# Chain the three feature stages so they are fitted and applied as one unit
feature_stages = [indexer, encoder, assembler]
pipeline = Pipeline(stages=feature_stages)

pipeline_model = pipeline.fit(train_df)
train_transformed = pipeline_model.transform(train_df)

# Show first 5 rows of final features
preview_cols = ["UnitPrice", "Quantity", "day_of_week_encoded", "features"]
train_transformed.select(*preview_cols).show(5, truncate=False)


+---------+--------+-------------------+---------------------------+
|UnitPrice|Quantity|day_of_week_encoded|features                   |
+---------+--------+-------------------+---------------------------+
|2.95     |6       |(5,[4],[1.0])      |(7,[0,1,6],[2.95,6.0,1.0]) |
|2.1      |8       |(5,[4],[1.0])      |(7,[0,1,6],[2.1,8.0,1.0])  |
|5.95     |2       |(5,[4],[1.0])      |(7,[0,1,6],[5.95,2.0,1.0]) |
|1.65     |6       |(5,[4],[1.0])      |(7,[0,1,6],[1.65,6.0,1.0]) |
|0.42     |25      |(5,[4],[1.0])      |(7,[0,1,6],[0.42,25.0,1.0])|
+---------+--------+-------------------+---------------------------+
only showing top 5 rows



Step 10: StringIndexer must know the number of unique values
**Problem**: StringIndexer needs to know how many unique categories exist to properly encode them.

**Solution**:

Fit the StringIndexer on the training set, not the test set.

Spark automatically detects the number of unique categories during .fit().

Example :

In [None]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_indexed",
)

# .fit() scans the training data and records the distinct day_of_week values
indexer_model = indexer.fit(train_df)
train_indexed = indexer_model.transform(train_df)

# Show original value next to its index for the first 5 rows
train_indexed.select(["day_of_week", "day_of_week_indexed"]).show(5)


+-----------+-------------------+
|day_of_week|day_of_week_indexed|
+-----------+-------------------+
|          2|                4.0|
|          2|                4.0|
|          2|                4.0|
|          2|                4.0|
|          2|                4.0|
+-----------+-------------------+
only showing top 5 rows



Step 11: Transform training set using pipeline

In [None]:
from pyspark.ml import Pipeline

# Define pipeline: StringIndexer → OneHotEncoder → VectorAssembler
pipeline = Pipeline(stages=[indexer, encoder, assembler])

# Fit on the training set ONLY. Refitting on test_df (as was done before)
# replaces the learned category → index mapping with one derived from the
# test data, which leaks test information and makes the encoding differ
# from the one the downstream model is trained on.
pipeline_model = pipeline.fit(train_df)

# Apply the same fitted transformations to both splits
train_transformed = pipeline_model.transform(train_df)
test_transformed = pipeline_model.transform(test_df)

# Show first 5 rows of each split
train_transformed.show(5, truncate=False)
test_transformed.show(5, truncate=False)

+---------+---------+------------------------------+--------+-----------+---------+----------+--------------+-----------+-------------------+-------------------+---------------------------+
|InvoiceNo|StockCode|Description                   |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |day_of_week|day_of_week_indexed|day_of_week_encoded|features                   |
+---------+---------+------------------------------+--------+-----------+---------+----------+--------------+-----------+-------------------+-------------------+---------------------------+
|537226   |22811    |SET OF 6 T-LIGHTS CACTI       |6       |2010-12-06 |2.95     |15987.0   |United Kingdom|2          |1.0                |(5,[1],[1.0])      |(7,[0,1,3],[2.95,6.0,1.0]) |
|537226   |21713    |CITRONELLA CANDLE FLOWERPOT   |8       |2010-12-06 |2.1      |15987.0   |United Kingdom|2          |1.0                |(5,[1],[1.0])      |(7,[0,1,3],[2.1,8.0,1.0])  |
|537226   |22927    |GREEN GIANT GARDEN THERMOMETE

Step 12: Create KMeans instance

In [None]:
from pyspark.ml.clustering import KMeans

# KMeans estimator: 20 clusters, fixed seed for repeatable initialisation;
# reads the "features" column and writes assignments to "cluster"
kmeans = KMeans(
    k=20,
    seed=42,
    featuresCol="features",
    predictionCol="cluster",
)


Step 13: Fit KMeans on training data

In [None]:
# Learn the cluster centres from the transformed training data
kmeans_model_train = kmeans.fit(train_transformed)

# Assign every training row to its cluster
train_clusters = kmeans_model_train.transform(train_transformed)

# Show first 5 rows with clusters
cluster_preview_cols = ["UnitPrice", "Quantity", "day_of_week_encoded", "features", "cluster"]
train_clusters.select(*cluster_preview_cols).show(5, truncate=False)


+---------+--------+-------------------+---------------------------+-------+
|UnitPrice|Quantity|day_of_week_encoded|features                   |cluster|
+---------+--------+-------------------+---------------------------+-------+
|2.95     |6       |(5,[1],[1.0])      |(7,[0,1,3],[2.95,6.0,1.0]) |11     |
|2.1      |8       |(5,[1],[1.0])      |(7,[0,1,3],[2.1,8.0,1.0])  |11     |
|5.95     |2       |(5,[1],[1.0])      |(7,[0,1,3],[5.95,2.0,1.0]) |11     |
|1.65     |6       |(5,[1],[1.0])      |(7,[0,1,3],[1.65,6.0,1.0]) |11     |
|0.42     |25      |(5,[1],[1.0])      |(7,[0,1,3],[0.42,25.0,1.0])|0      |
+---------+--------+-------------------+---------------------------+-------+
only showing top 5 rows



Step 14: Make predictions on test set

In [None]:
# Build test features with the fitted pipeline model, then assign clusters
# with the KMeans model trained on the training data
test_transformed = pipeline_model.transform(test_df)
test_clusters = kmeans_model_train.transform(test_transformed)

# Show first 5 rows (full column contents)
test_clusters.show(5, truncate=False)


+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+-----------+-------------------+-------------------+---------------------------+-------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |day_of_week|day_of_week_indexed|day_of_week_encoded|features                   |cluster|
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+-----------+-------------------+-------------------+---------------------------+-------+
|539325   |22720    |SET OF 3 CAKE TINS PANTRY DESIGN   |3       |2010-12-17 |4.95     |13004.0   |United Kingdom|6          |3.0                |(5,[3],[1.0])      |(7,[0,1,5],[4.95,3.0,1.0]) |11     |
|539325   |22722    |SET OF 6 SPICE TINS PANTRY DESIGN  |4       |2010-12-17 |3.95     |13004.0   |United Kingdom|6          |3.0                |(5,[3],[1.0])      |(7,[0,1,5],[3.95,4.0,1

Step 15: Compute silhouette score

In [None]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette evaluator using squared Euclidean distance
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="cluster",
    metricName="silhouette",
    distanceMeasure="squaredEuclidean",
)

# Score the cluster assignments produced for the test set
silhouette = evaluator.evaluate(test_clusters)
print(f"Silhouette score: {silhouette}")


Silhouette score: 0.743506843337969
