In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then echo them one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the mall-customers CSV from the attached Kaggle dataset and preview the first rows.
df = pd.read_csv('/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')
df.head()

# EDA

In [None]:
# Column names, dtypes and non-null counts of the raw table.
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# How many rows fall under each distinct Gender value.
df.Gender.value_counts()

In [None]:
# Age distribution split by gender; titled so the figure stands on its own.
ax = sns.boxplot(data=df, x='Gender', y='Age')
ax.set_title('Age by Gender')
plt.show()

In [None]:
# Annual income distribution split by gender; titled so the figure stands on its own.
ax = sns.boxplot(data=df, x='Gender', y='Annual Income (k$)')
ax.set_title('Annual Income by Gender')
plt.show()

In [None]:
# Spending score distribution split by gender; titled so the figure stands on its own.
ax = sns.boxplot(data=df, x='Gender', y='Spending Score (1-100)')
ax.set_title('Spending Score by Gender')
plt.show()

# K-means Modelling

## Scaling the data

In [None]:
# Keep only the three numeric columns that will drive the clustering.
cluster_df = df.loc[:, ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Standardise each feature: learn the scaling parameters from the data, then apply them.
scaler = StandardScaler()
cluster_df_scaled = scaler.fit(cluster_df).transform(cluster_df)

# Sanity check: same number of rows and columns as cluster_df.
cluster_df_scaled.shape

In [None]:
# Wrap the scaled values in a DataFrame and restore the feature names in one step.
cluster_df_scaled = pd.DataFrame(
    cluster_df_scaled,
    columns=['Age', 'Annual Income (k$)', 'Spending Score (1-100)'],
)
cluster_df_scaled.head()

## Trial run with 3 clusters

In [None]:
# Exploratory fit with k=3.
# random_state pins the centroid initialisation so the labels are identical on every run;
# n_init is given explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, max_iter=50, n_init=10, random_state=42)
kmeans.fit(cluster_df_scaled)

In [None]:
# Cluster index assigned to each row by the most recent kmeans fit.
kmeans.labels_

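## Hopkins Statistic

The Hopkins statistic measures clustering tendency: values near 0.5 suggest uniformly random data, while values close to 1 suggest the data contain meaningful clusters.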

In [None]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, sample_frac=0.1, random_state=None):
    """Estimate the Hopkins statistic (clustering tendency) of a data set.

    The statistic compares two sets of nearest-neighbour distances:

    * ``u`` - from ``m`` points drawn uniformly inside the bounding box of the
      data to their nearest data point;
    * ``w`` - from ``m`` randomly chosen data points to their nearest *other*
      data point.

    It returns ``H = sum(u) / (sum(u) + sum(w))``. Values around 0.5 indicate
    data that look uniformly random; values approaching 1 indicate clustered
    data.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_rows, n_features)
        Numeric observations, one row per observation.
    sample_frac : float, default 0.1
        Fraction of the rows used as the sample size ``m``. ``m`` is clamped
        to the range ``[1, n_rows]`` so small data sets still work.
    random_state : None, int or numpy.random.Generator, default None
        Seed (or generator) for the random draws. Pass an int for a
        reproducible result.

    Returns
    -------
    float
        The Hopkins statistic in ``[0, 1]``. ``0.0`` is returned (with a
        warning) when every measured distance is zero, e.g. when all rows are
        identical.

    Raises
    ------
    ValueError
        If ``X`` is not 2-dimensional, has no columns, has fewer than two
        rows, or contains NaN / infinite values.
    """
    import warnings
    from scipy.spatial import cKDTree

    data = np.asarray(X, dtype=float)
    if data.ndim != 2 or data.shape[1] == 0:
        raise ValueError("X must be 2-dimensional with at least one column")
    n, d = data.shape
    if n < 2:
        raise ValueError("hopkins() needs at least two observations")
    if not np.isfinite(data).all():
        raise ValueError("X must not contain NaN or infinite values")

    # At least one probe point, so the ratio below is never computed over empty samples.
    m = min(n, max(1, int(sample_frac * n)))

    rng = np.random.default_rng(random_state)
    tree = cKDTree(data)

    # u: uniform probe points inside the data's bounding box. A probe point is not
    # part of the data set, so its nearest neighbour (k=1) is the distance we want.
    lower, upper = data.min(axis=0), data.max(axis=0)
    uniform_points = rng.uniform(lower, upper, size=(m, d))
    ujd, _ = tree.query(uniform_points, k=1)

    # w: real data points sampled without replacement. Each point's nearest
    # neighbour is itself (distance 0), so take the second one (k=2, column 1).
    sample_idx = rng.choice(n, size=m, replace=False)
    w_both, _ = tree.query(data[sample_idx], k=2)
    wjd = w_both[:, 1]

    u_total = float(np.sum(ujd))
    w_total = float(np.sum(wjd))
    denominator = u_total + w_total
    if denominator == 0.0:
        # Degenerate input (all distances are zero): the ratio is undefined.
        warnings.warn("Hopkins statistic is undefined (all distances are zero); returning 0.0")
        return 0.0

    return u_total / denominator

In [None]:
# Hopkins statistic of the scaled feature table.
hopkins(cluster_df_scaled)

## Finding Optimal number of clusters

### Elbow Curve

In [None]:
# Elbow curve: sum of squared distances to the closest centroid (inertia) for each k.
ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    # Fixed random_state and explicit n_init keep the curve identical between runs
    # and across scikit-learn releases.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, n_init=10, random_state=42)
    kmeans.fit(cluster_df_scaled)
    ssd.append(kmeans.inertia_)

# Plot the SSD obtained for each number of clusters.
ssd_df = pd.DataFrame({'Clusters': range_n_clusters, 'SSD': ssd})
sns.lineplot(data=ssd_df, x='Clusters', y='SSD')
plt.title('Elbow Curve')
plt.show()

### Silhouette Score

In [None]:
# Silhouette analysis: mean silhouette score of the K-means partition for each k.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:

    # Initialise and fit K-means; fixed random_state and explicit n_init make the
    # printed scores reproducible between runs and across scikit-learn releases.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, n_init=10, random_state=42)
    kmeans.fit(cluster_df_scaled)

    cluster_labels = kmeans.labels_

    # Mean silhouette score over all rows for this k.
    silhouette_avg = silhouette_score(cluster_df_scaled, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

## Final Model with Optimal clusters

In [None]:
# Final model with k=4.
# random_state fixes the initialisation so the cluster numbering is the same on every
# run; n_init is explicit because its default differs between scikit-learn releases.
# NOTE(review): after pinning the seed, re-check the written cluster descriptions
# further down against the box plots, since they depend on this numbering.
kmeans = KMeans(n_clusters=4, max_iter=50, n_init=10, random_state=42)
kmeans.fit(cluster_df_scaled)

In [None]:
# Attach the K-means label of each row to the original (unscaled) table as a new
# 'cluster_id' column, then preview the result.
df['cluster_id'] = kmeans.labels_
df.head()

## Analyzing the clusters

In [None]:
# Age distribution within each K-means cluster.
ax = sns.boxplot(data=df, x='cluster_id', y='Age')
ax.set_title('Age variation across clusters')
plt.show()

In [None]:
# Annual income distribution within each K-means cluster.
ax = sns.boxplot(data=df, x='cluster_id', y='Annual Income (k$)')
ax.set_title('Annual Income across clusters')
plt.show()

In [None]:
# Spending score distribution within each K-means cluster.
ax = sns.boxplot(data=df, x='cluster_id', y='Spending Score (1-100)')
ax.set_title('Spending Score across clusters')
plt.show()

* Cluster 0 : People in their 50-60s with decent annual income & decent Spending Score
* Cluster 1 : People in their 20s with low annual income but high Spending Score
* Cluster 2 : People in their 30-40s with high annual income but low Spending Score
* Cluster 3 : People in their 30s with high annual income & high Spending Score

Note: K-means numbers its clusters arbitrarily, so check this mapping against the box plots above whenever the notebook is re-run.

# Hierarchical Clustering

### Single linkage

In [None]:
# Agglomerative clustering with single linkage on Euclidean distances.
mergings = linkage(cluster_df_scaled, method="single", metric='euclidean')
dendrogram(mergings)
# Title and axis label so the figure can be told apart from the other dendrogram.
plt.title('Dendrogram - single linkage')
plt.ylabel('Merge distance')
plt.show()

Single linkage does not seem to produce a very useful dendrogram for this data.

### Complete linkage

In [None]:
# Agglomerative clustering with complete linkage on Euclidean distances.
mergings = linkage(cluster_df_scaled, method="complete", metric='euclidean')
dendrogram(mergings)
# Title and axis label so the figure can be told apart from the other dendrogram.
plt.title('Dendrogram - complete linkage')
plt.ylabel('Merge distance')
plt.show()

In [None]:
# Cut the current linkage tree into 4 flat clusters and flatten the result to 1-D.
cluster_labels = cut_tree(mergings, n_clusters=4).ravel()
cluster_labels

In [None]:
# Attach the hierarchical-clustering label of each row to the original table as a new
# 'hcluster_labels' column, then preview the result.
df['hcluster_labels'] = cluster_labels
df.head()

### Analyzing the clusters

In [None]:
# Age distribution within each hierarchical cluster; titled like the K-means plots.
ax = sns.boxplot(x='hcluster_labels', y='Age', data=df)
ax.set_title('Age variation across hierarchical clusters')
plt.show()

In [None]:
# Annual income distribution within each hierarchical cluster; titled like the K-means plots.
ax = sns.boxplot(x='hcluster_labels', y='Annual Income (k$)', data=df)
ax.set_title('Annual Income across hierarchical clusters')
plt.show()

In [None]:
# Spending score distribution within each hierarchical cluster; titled like the K-means plots.
ax = sns.boxplot(x='hcluster_labels', y='Spending Score (1-100)', data=df)
ax.set_title('Spending Score across hierarchical clusters')
plt.show()

* Cluster 0 : People in their 20s with low annual income but decent Spending Score
* Cluster 1 : People in their 50-60s with decent annual income & decent Spending Score
* Cluster 2 : People in their 30s with high annual income & high Spending Score
* Cluster 3 : People in their 30-40s with high annual income but low Spending Score