Import Libraries

In [153]:
import pandas as pd 
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans 
from sklearn.preprocessing import StandardScaler
from  sklearn.metrics import silhouette_score

load dataset

In [154]:
data = pd.read_csv("Wholesale customers data.csv")
data

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
...,...,...,...,...,...,...,...,...
435,1,3,29703,12051,16027,13135,182,2204
436,1,3,39228,1431,764,4510,93,2346
437,2,3,14531,15488,30243,437,14841,1867
438,1,3,10290,1981,2232,1038,168,2125


In [155]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Region            440 non-null    int64
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB


Features Scaling

In [156]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

In [165]:
data_scaled

array([[ 1.44865163,  0.59066829,  0.05293319, ..., -0.58936716,
        -0.04356873, -0.06633906],
       [ 1.44865163,  0.59066829, -0.39130197, ..., -0.27013618,
         0.08640684,  0.08915105],
       [ 1.44865163,  0.59066829, -0.44702926, ..., -0.13753572,
         0.13323164,  2.24329255],
       ...,
       [ 1.44865163,  0.59066829,  0.20032554, ..., -0.54337975,
         2.51121768,  0.12145607],
       [-0.69029709,  0.59066829, -0.13538389, ..., -0.41944059,
        -0.56977032,  0.21304614],
       [-0.69029709,  0.59066829, -0.72930698, ..., -0.62009417,
        -0.50488752, -0.52286938]], shape=(440, 8))

Select the best number of k

In [158]:
silhouette_dict = {}

for k in range(3, 9):
    kmeans=KMeans(n_clusters=k,random_state=42).fit(data_scaled)
    labels = kmeans.labels_
    score = silhouette_score(data_scaled,  labels)
    
    print(f"Score for k={k}: {score}")
    silhouette_dict[k] = score

k = max(silhouette_dict, key=silhouette_dict.get)
score = silhouette_dict[k]
print(f"Best k: {k} withs score: {score}")

Score for k=3: 0.3567685389017652
Score for k=4: 0.34819485747460033
Score for k=5: 0.35696445697910095
Score for k=6: 0.3558904934542414
Score for k=7: 0.355326775451717
Score for k=8: 0.3562905573605705
Best k: 5 withs score: 0.35696445697910095


Fits the kmeans with the best k value

In [159]:
kmeans = KMeans(n_clusters=k, random_state=42).fit(data_scaled)

In [160]:
labels= kmeans.labels_

In [161]:
data['cluster'] = labels
data

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,cluster
0,2,3,12669,9656,7561,214,2674,1338,0
1,2,3,7057,9810,9568,1762,3293,1776,0
2,2,3,6353,8808,7684,2405,3516,7844,0
3,1,3,13265,1196,4221,6404,507,1788,1
4,2,3,22615,5410,7198,3915,1777,5185,0
...,...,...,...,...,...,...,...,...,...
435,1,3,29703,12051,16027,13135,182,2204,4
436,1,3,39228,1431,764,4510,93,2346,1
437,2,3,14531,15488,30243,437,14841,1867,0
438,1,3,10290,1981,2232,1038,168,2125,1


Cluster sizes

In [162]:
print(data['cluster'].value_counts())

cluster
1    200
0    126
3     90
4     14
2     10
Name: count, dtype: int64


Each cluster center

In [163]:
kmeans.cluster_centers_[0]

array([ 1.44865163,  0.1699285 , -0.30636283,  0.41750021,  0.65187952,
       -0.3572697 ,  0.67684797,  0.00633175])

In [164]:
data.groupby('cluster').mean()

Unnamed: 0_level_0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2.0,2.674603,8130.031746,8874.071429,14139.150794,1339.47619,6104.936508,1542.706349
1,1.0,3.0,11941.285,3060.57,3497.935,2950.78,755.69,1101.195
2,2.0,2.5,15964.9,34708.5,48536.9,3054.6,24875.2,2942.8
3,1.055556,1.311111,11979.055556,3210.777778,4122.411111,3288.777778,861.344444,1130.755556
4,1.071429,2.785714,44980.428571,13146.785714,11504.0,19013.142857,1516.0,8937.642857
