**Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data.**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, ward, complete, average
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [None]:
# Location of the Mall Customers CSV, kept in one place so it is easy to repoint.
MALL_CSV_PATH = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'

# Read the raw customer table and peek at one randomly drawn row.
mall = pd.read_csv(MALL_CSV_PATH)
mall.sample(n=1)

In [None]:
# Give the two clustering features identifier-friendly names.
# Reassigning (instead of inplace=True) keeps the cell a plain
# "input -> output" step and lets it be chained or re-run safely.
mall = mall.rename(columns={
    'Annual Income (k$)': 'Annual_Income',
    'Spending Score (1-100)': 'Spending_Score',
})

In [None]:
# Show one random row to confirm the renamed columns.
mall.sample(n=1)

In [None]:
# Feature matrix used by every clustering method below.
# .copy() makes it an independent frame: a later cell adds a column to
# mall_clust, and writing into a bare slice of `mall` is a chained assignment
# whose warning is hidden by the global warnings filter.
mall_clust = mall[['Annual_Income', 'Spending_Score']].copy()

In [None]:
# Raw view of the two features before any clustering is applied.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(data=mall_clust, x='Annual_Income', y='Spending_Score', ax=ax)
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
plt.show()

Let's determine the optimal number of clusters using three methods:
- KMeans Clustering
- Agglomerative Clustering
- DBSCAN

# KMeans Clustering

## *Elbow Method*

In [None]:
# Elbow method: fit KMeans for k = 2..10 and measure how tightly the samples
# sit around their nearest centroid for each k.
number_of_cluster = range(2, 11)

# random_state / n_init are pinned so the curve is identical on every
# Restart & Run All (KMeans initialisation is random otherwise).
clusterings = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(mall_clust)
    for k in number_of_cluster
]
centroids = [model.cluster_centers_ for model in clusterings]

# For each k: pairwise Euclidean distances, one row per sample and one
# column per centroid.
D_k = [cdist(mall_clust, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D, axis=1) for D in D_k]  # index of the nearest centroid
dist = [np.min(D, axis=1) for D in D_k]     # distance to the nearest centroid

# Average within-cluster sum of squares. The distances are squared here so
# the value matches its name and the 'Average Within SS' axis label; the
# previous version averaged the raw (unsquared) distances.
avg_withinSS = [np.sum(d ** 2) / mall_clust.shape[0] for d in dist]

In [None]:
# Elbow curve: average within-cluster SS against the number of clusters.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# data vectors positionally (the first positional argument is `data`).
k_values = list(number_of_cluster)

plt.figure(figsize=(19,8))
sns.lineplot(x=k_values, y=avg_withinSS)
sns.scatterplot(x=k_values, y=avg_withinSS)
plt.xticks(k_values)
plt.xlabel('Number Of Cluster')
plt.ylabel('Average Within SS')
plt.show()

- It seems that **the best n_clusters is 5** based on this plot. I will double-check the best n_clusters with the Silhouette Method.

## *Silhouette Method*

In [None]:
# Silhouette method: score the KMeans partition for each k = 2..10.
s_score = []

number_of_cluster = range(2, 11)

for i in number_of_cluster:
    # Pinned random_state / n_init make the scores reproducible across runs;
    # without them the initialisation (and thus the score) varies per run.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    labels = kmeans.fit_predict(mall_clust)
    s_score.append(silhouette_score(mall_clust, labels, metric='euclidean'))

s_score

In [None]:
# Silhouette score against the number of clusters.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# data vectors positionally (the first positional argument is `data`).
k_values = list(number_of_cluster)

plt.figure(figsize=(18,8))
sns.lineplot(x=k_values, y=s_score)
sns.scatterplot(x=k_values, y=s_score)
plt.xticks(k_values)
plt.xlabel('number of clusters')
plt.ylabel('silhouette score')
plt.show()

- From the silhouette scores and the plot, **the best number of clusters is also 5**.

In [None]:
# Final KMeans model with the k chosen by the elbow and silhouette checks.
# random_state / n_init are pinned so cluster ids (and therefore plot colours)
# stay the same on every run.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(mall_clust)

# Work on a copy so the feature frame itself does not gain a label column.
data_mall = mall_clust.copy()
data_mall['cluster'] = kmeans.labels_

In [None]:
# Scatter of the customers coloured by their KMeans cluster.
plt.figure(figsize = (8,8))
sns.scatterplot(x = 'Annual_Income', y = 'Spending_Score', data = data_mall, hue = 'cluster', palette = 'bright')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
# The figure shows the KMeans partition, not a silhouette plot, so the title
# is corrected accordingly.
plt.title('KMeans Clustering (k = 5)')
plt.show()

Let's take a deep dive to see the hidden pattern using Agglomerative Clustering.

# Agglomerative Clustering

In [None]:
# Scatter of the raw features with every point tagged by its row position,
# so that dendrogram leaves can be traced back to individual customers.
plt.figure(figsize=(8,8))

sns.scatterplot(
    x= 'Annual_Income',
    y= 'Spending_Score',
    data= mall_clust
)

plt.xlabel('Annual Income')
plt.ylabel('Spending Score')

# Iterate over every row: the previous range(0, n - 1) stopped one short and
# left the last point unlabelled. Coordinates come from the same frame that
# is plotted; the +0.3 offset keeps the text from covering the marker.
point_coords = zip(mall_clust['Annual_Income'], mall_clust['Spending_Score'])
for i, (income, score) in enumerate(point_coords):
    plt.annotate(i, (income + 0.3, score + 0.3), fontsize=12)

plt.show()

## *Dendrogram*

In [None]:
# Ward-linkage dendrogram of the income / spending features.
linkage_array = ward(mall_clust)

plt.figure(figsize=(8, 8))
# NOTE(review): scipy documents `p` as a parameter of truncate_mode, which is
# not set here, so 25 presumably has no visible effect - confirm the intent.
dendrogram(linkage_array, p=25)

ax = plt.gca()
bounds = ax.get_xbound()

# Dashed horizontal guides, each captioned at the right-hand edge of the plot.
# NOTE(review): the heights are hand-picked; verify they match the captions.
for height, caption in ((300, 'two clusters'), (150, 'three clusters')):
    ax.plot(bounds, [height, height], '--', c='k')
    ax.text(bounds[1], height, caption, va='center', fontdict={'size': 12})

plt.title('Ward')
plt.xlabel('Sample Index')
plt.ylabel('Cluster Distance')
plt.show()

In [None]:
# Average-linkage dendrogram of the income / spending features.
linkage_array = average(mall_clust)

plt.figure(figsize=(8, 8))
# NOTE(review): scipy documents `p` as a parameter of truncate_mode, which is
# not set here, so 25 presumably has no visible effect - confirm the intent.
dendrogram(linkage_array, p=25)

ax = plt.gca()
bounds = ax.get_xbound()

# Dashed horizontal guides, each captioned at the right-hand edge of the plot.
# NOTE(review): the heights are hand-picked; verify they match the captions.
for height, caption in ((50, 'two clusters'), (30, 'three clusters')):
    ax.plot(bounds, [height, height], '--', c='k')
    ax.text(bounds[1], height, caption, va='center', fontdict={'size': 12})

plt.title('Average')
plt.xlabel('Sample Index')
plt.ylabel('Cluster Distance')
plt.show()

In [None]:
# Complete-linkage dendrogram of the income / spending features.
linkage_array = complete(mall_clust)

plt.figure(figsize=(8, 8))
# NOTE(review): scipy documents `p` as a parameter of truncate_mode, which is
# not set here, so 25 presumably has no visible effect - confirm the intent.
dendrogram(linkage_array, p=25)

ax = plt.gca()
bounds = ax.get_xbound()

# Dashed horizontal guides, each captioned at the right-hand edge of the plot.
# NOTE(review): the heights are hand-picked; verify they match the captions.
for height, caption in ((120, 'two clusters'), (80, 'three clusters')):
    ax.plot(bounds, [height, height], '--', c='k')
    ax.text(bounds[1], height, caption, va='center', fontdict={'size': 12})

plt.title('Complete')
plt.xlabel('Sample Index')
plt.ylabel('Cluster Distance')
plt.show()

*Results Comparison*

In [None]:
# Fit a 5-cluster agglomerative model per linkage criterion and store the
# labels on `mall` in a column named after the linkage.
# One loop replaces three copy-pasted stanzas that all reused the name
# `agg_ward`, even for the average- and complete-linkage models.
for linkage in ('ward', 'average', 'complete'):
    agg_model = AgglomerativeClustering(n_clusters=5, linkage=linkage)
    mall[linkage] = agg_model.fit_predict(mall_clust)

mall.head(3)

In [None]:
# Side-by-side scatter plots of the three agglomerative results.
# Every panel gets the same axis labels; previously only the last subplot was
# relabelled, so the first two showed the raw column names.
fig, axes = plt.subplots(1, 3, figsize=(15, 8))

for ax, linkage in zip(axes, ('ward', 'average', 'complete')):
    sns.scatterplot(x='Annual_Income', y='Spending_Score', data=mall,
                    hue=linkage, palette='bright', ax=ax)
    ax.legend(loc=5)  # loc=5 places the legend at the right edge
    ax.set_title(linkage.capitalize())
    ax.set_xlabel('Annual Income')
    ax.set_ylabel('Spending Score')

plt.show()

In [None]:
# Hand-written interpretation of each agglomerative cluster, one row per
# cluster id: plot colour, cluster name, then the reading under each linkage.
# NOTE(review): the descriptions are manual labels; confirm they still match
# the plotted clusters whenever the fits change.
summary_columns = ['Color', 'Name', 'Ward', 'Average', 'Complete']
summary_rows = [
    ('Blue', 'Cluster 0',
     'High Income VS Low Spending', 'High Income VS Low Spending', 'High Income VS High Spending'),
    ('Orange', 'Cluster 1',
     'Medium Income VS Medium Spending', 'Medium Income VS Medium Spending', 'Medium Income VS Medium Spending'),
    ('Green', 'Cluster 2',
     'High Income  VS High Spending', 'High Income  VS High Spending', 'High Income  VS Low Spending'),
    ('Red', 'Cluster 3',
     'Low Income VS High Spending', 'Low Income VS High Spending', 'Low Income VS High Spending'),
    ('Purple', 'Cluster 4',
     'Low Income VS Low Spending', 'Highest Income VS Highest Spending', 'Low Income VS Low Spending'),
]
summary = pd.DataFrame(summary_rows, columns=summary_columns)
summary

# DBSCAN

*Initial Clustering*

In [None]:
# Standardise the features before running the distance-based DBSCAN.
scaler = StandardScaler().fit(mall_clust)
mall_clust_scaled = scaler.transform(mall_clust)

# First attempt: small neighbourhood radius, fairly large core-point threshold.
dbscan = DBSCAN(eps=0.2, min_samples=10).fit(mall_clust_scaled)
cluster = dbscan.labels_

# NOTE(review): every label is passed to the score as-is, so DBSCAN's noise
# label (-1), if present, is scored like an ordinary cluster - confirm intent.
silhouette_score(mall_clust_scaled, cluster)

I decided to start with a low epsilon and a high min samples value, chosen according to how much data is being used.

*Optimizing Minimum Sample And Epsilon*

In [None]:
# Grid search over eps (0.2 .. 0.5) and min_samples (6 .. 9): report the
# labels found, their sizes and the silhouette score for each combination.
for eps in [i/10 for i in range(2,6)]:
    for min_samples in range (6,10):
        print(f'\neps {eps}')
        # Was '\min samples': an invalid escape sequence that printed a
        # stray backslash.
        print(f'min samples {min_samples}')

        dbscan = DBSCAN(eps = eps, min_samples = min_samples)
        labels = dbscan.fit_predict(mall_clust_scaled)

        print(f'clusters present: {np.unique(labels)}')
        # Shift by one so the noise label -1 lands in bin 0.
        print(f'clusters sizes: {np.bincount(labels + 1)}')

        # silhouette_score needs at least two distinct labels and raises
        # otherwise, so report such a combination instead of aborting the search.
        if np.unique(labels).size < 2:
            print('Silhouette Score: undefined (fewer than 2 distinct labels)')
            continue

        score = silhouette_score(mall_clust_scaled, labels)
        print(f'Silhouette Score: {score}')

- The best hyperparameters are **eps: 0.4 and min samples: 8**, because **they give the highest silhouette score**, although some samples are still labelled as noise.

*Final Result DBSCAN*

In [None]:
# Final DBSCAN run with the hyperparameters selected above.
dbscan = DBSCAN(eps = 0.4, min_samples = 8)
labels = dbscan.fit_predict(mall_clust_scaled)

# assign() returns a new frame with the label column attached. This avoids
# item assignment on mall_clust, which was created as a column selection of
# `mall`, and keeps the cell safe to re-run.
mall_clust = mall_clust.assign(cluster=labels)

In [None]:
# Scatter of the customers coloured by their DBSCAN label.
plt.figure(figsize = (8,8))
sns.scatterplot(x = 'Annual_Income', y = 'Spending_Score', data = mall_clust, hue = 'cluster', palette = 'bright')
# Same human-readable axis labels as the other scatter plots in the notebook
# (the defaults would be the raw column names).
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.title('DBSCAN')
plt.show()

In [None]:
# Hand-written interpretation of each DBSCAN label, one row per label:
# plot colour, label name, and its income / spending reading.
# NOTE(review): the descriptions are manual labels; confirm they still match
# the plotted clusters whenever the fit changes.
dbscan_rows = [
    ('Blue', 'Cluster -1', 'Noise'),
    ('Orange', 'Cluster 0', 'Low Income VS High Spending'),
    ('Green', 'Cluster 1', 'Lowest Income  VS Lowest Spending'),
    ('Red', 'Cluster 2', 'Medium Income VS Medium Spending'),
    ('Purple', 'Cluster 3', 'High Income VS Highest Spending'),
    ('Brown', 'Cluster 4', 'High Income VS Low Spending'),
]
dbscan_summary = pd.DataFrame(
    dbscan_rows,
    columns=['Color', 'Cluster DBSCAN', 'Annual Income VS Spending Score'],
)
dbscan_summary