



*   List item
*   List item





# 2. Build a Clustering Model with Spark Using a Dataset of Your Choice

Import Required Libraries

In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

Initialize Spark Session

In [11]:
# Create (or reuse) the Spark session used by every later cell.
# SparkSession is already imported in the imports cell above.
try:
    spark = (
        SparkSession.builder
        .appName("ClusteringModel")
        .getOrCreate()
    )
    # Keep Spark's own logging quiet so cell outputs stay readable.
    spark.sparkContext.setLogLevel("ERROR")
    print("Spark session initialized successfully.")
except Exception as e:
    print(f"Error initializing Spark session: {e}")
    # Re-raise so the failure shows up as a normal traceback in this cell and
    # stops a "Run All"; calling exit() here would instead end the Python
    # process/kernel without showing where the error came from.
    raise

Spark session initialized successfully.


Load Dataset

Dataset: Wholesale customers data, downloaded from the UCI Machine Learning Repository.

In [12]:
import os
import urllib.request

# Source CSV and the local file it is cached in.
wholesale_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv"
local_path = "wholesale.csv"

# Download only when no local copy exists, so re-running the notebook does not
# hit the network again. The file is fetched to a temporary name and moved
# into place afterwards, so an interrupted download is never mistaken for a
# complete cached copy.
if not os.path.exists(local_path):
    partial_path = local_path + ".part"
    urllib.request.urlretrieve(wholesale_url, partial_path)
    os.replace(partial_path, local_path)

# Read the CSV with a header row and let Spark infer the column types.
data = spark.read.csv(local_path, header=True, inferSchema=True)
data.show(5)

+-------+------+-----+----+-------+------+----------------+----------+
|Channel|Region|Fresh|Milk|Grocery|Frozen|Detergents_Paper|Delicassen|
+-------+------+-----+----+-------+------+----------------+----------+
|      2|     3|12669|9656|   7561|   214|            2674|      1338|
|      2|     3| 7057|9810|   9568|  1762|            3293|      1776|
|      2|     3| 6353|8808|   7684|  2405|            3516|      7844|
|      1|     3|13265|1196|   4221|  6404|             507|      1788|
|      2|     3|22615|5410|   7198|  3915|            1777|      5185|
+-------+------+-----+----+-------+------+----------------+----------+
only showing top 5 rows



Data Preprocessing

In [13]:
# Keep only the six numeric columns used for clustering;
# the Channel and Region columns are dropped here.
feature_cols = [
    'Fresh',
    'Milk',
    'Grocery',
    'Frozen',
    'Detergents_Paper',
    'Delicassen',
]
data = data.select(*feature_cols)

In [14]:
# Combine the selected feature columns into a single vector column
# named "features", which is what the KMeans estimator reads by default.
assembler = (
    VectorAssembler()
    .setInputCols(feature_cols)
    .setOutputCol("features")
)
dataset = assembler.transform(data)

Build & Train the KMeans Model

In [15]:
# Configure KMeans with 3 clusters and a fixed seed so that centroid
# initialisation is repeatable, then fit it on the assembled feature vectors.
kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(dataset)

Make Predictions

In [16]:
# Assign every row to a cluster; transform() appends a "prediction" column.
predictions = model.transform(dataset)

# Preview the cluster assignments for the first five rows.
assigned_clusters = predictions.select("prediction")
assigned_clusters.show(n=5)

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         1|
+----------+
only showing top 5 rows



Evaluate Clustering

In [17]:
# Score the clustering with the silhouette metric. The arguments below are
# the evaluator's defaults, spelled out so the column names it reads are visible.
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette}")

Silhouette Score: 0.6482181662567144


Analyze Cluster Centers

In [18]:
# Print the fitted centroid of each cluster, one per line.
# Each centroid has one coordinate per entry of feature_cols, in that order.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[7390.95845697 4439.76854599 6292.1958457  2495.53412463 2238.65281899
 1158.44807122]
[32768.01333333  4827.68        5723.14666667  5535.92
  1074.12        2066.64      ]
[11849.17857143 24717.10714286 33887.71428571  3409.32142857
 15459.71428571  4483.85714286]


In [19]:
# Shut down the Spark session now that all results have been produced.
spark.stop()