<a href="https://colab.research.google.com/github/stevejj4/Apache-Spark/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Unsupervisedd learning algorithm
# K-means
# Complexity of the iteration
# single linkage
# complete linkage
# average linkage
# ward linkage
# using spark because of big data


In [1]:
!pip install pyspark
!pip install findspark # simplifies the process of using Apache Spark python


Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=409475f25d446a8d9a7c808bee0bab311a700fb9ff9f0f7150e5455805e391ce
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans



In [3]:
# creating a spark session
spark = SparkSession.builder.appName('Clustering using spark').getOrCreate()

In [6]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/customers.csv


--2024-08-25 10:30:01--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/customers.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8909 (8.7K) [text/csv]
Saving to: ‘customers.csv’


2024-08-25 10:30:02 (161 MB/s) - ‘customers.csv’ saved [8909/8909]



In [7]:
# load the data in a csv file
df = spark.read.csv("customers.csv", header=True, inferSchema=True)
df.show(5)

+----------+----+-------+-----------+
|Fresh_Food|Milk|Grocery|Frozen_Food|
+----------+----+-------+-----------+
|     12669|9656|   7561|        214|
|      7057|9810|   9568|       1762|
|      6353|8808|   7684|       2405|
|     13265|1196|   4221|       6404|
|     22615|5410|   7198|       3915|
+----------+----+-------+-----------+
only showing top 5 rows



In [8]:
df.printSchema()

root
 |-- Fresh_Food: integer (nullable = true)
 |-- Milk: integer (nullable = true)
 |-- Grocery: integer (nullable = true)
 |-- Frozen_Food: integer (nullable = true)



In [9]:
# creating a feature vector
feature_cols = ['Fresh_Food','Milk','Grocery','Frozen_Food']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df = assembler.transform(df)
df.show(5)

+----------+----+-------+-----------+--------------------+
|Fresh_Food|Milk|Grocery|Frozen_Food|            features|
+----------+----+-------+-----------+--------------------+
|     12669|9656|   7561|        214|[12669.0,9656.0,7...|
|      7057|9810|   9568|       1762|[7057.0,9810.0,95...|
|      6353|8808|   7684|       2405|[6353.0,8808.0,76...|
|     13265|1196|   4221|       6404|[13265.0,1196.0,4...|
|     22615|5410|   7198|       3915|[22615.0,5410.0,7...|
+----------+----+-------+-----------+--------------------+
only showing top 5 rows



In [10]:
#Algorith is KMeans
number_of_clusters = 3

In [17]:
# In cell ipython-input-14-afdde2110c78
kmeans = KMeans(k = number_of_clusters) # creating a clustering model

# In cell ipython-input-16-afdde2110c78
# Fitting and training the model
model = kmeans.fit(df)

In [18]:
# Make predictions on the dataset
predictions = model.transform(df)

In [19]:
# display the result
predictions.show(5)

+----------+----+-------+-----------+--------------------+----------+
|Fresh_Food|Milk|Grocery|Frozen_Food|            features|prediction|
+----------+----+-------+-----------+--------------------+----------+
|     12669|9656|   7561|        214|[12669.0,9656.0,7...|         0|
|      7057|9810|   9568|       1762|[7057.0,9810.0,95...|         0|
|      6353|8808|   7684|       2405|[6353.0,8808.0,76...|         0|
|     13265|1196|   4221|       6404|[13265.0,1196.0,4...|         0|
|     22615|5410|   7198|       3915|[22615.0,5410.0,7...|         1|
+----------+----+-------+-----------+--------------------+----------+
only showing top 5 rows



In [20]:
# How many customers are there in each cluster
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   74|
|         2|   47|
|         0|  319|
+----------+-----+



In [21]:
spark.stop()