In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
# Silence every warning for the rest of the notebook to keep cell output clean.
# NOTE(review): this is a blanket filter, so it also hides library deprecation
# and convergence warnings; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the raw customer table from the CSV next to the notebook and preview
# the first rows. `customers` stays the unscaled reference frame; cluster
# labels are attached to it later on.
customers= pd.read_csv('Mall_Customers.csv')
customers.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [4]:
# Count missing values per column before any preprocessing.
customers.isna().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

# Data Preparation

In [None]:
# Work on an independent copy (DataFrame.copy is deep by default) so the
# preprocessing below never alters the raw `customers` frame.
customersml = customers.copy()
customersml.head()

## Removing unusable columns

In [None]:
# CustomerID is only a row identifier, so it is removed before modelling.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# the cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
customersml = customersml.drop(columns='CustomerID', errors='ignore')

In [None]:
# Preview the working frame to confirm CustomerID has been removed.
customersml.head()

## Converting Categorical Variable to Numeric

In [None]:
# Encode Gender as a binary indicator (Male -> 1, Female -> 0).
# The mapping reads from the untouched raw frame rather than from
# `customersml` itself, so re-running the cell does not turn the already
# encoded 0/1 values into NaN.
gender_codes = {'Male': 1, 'Female': 0}
customersml['Gender'] = customers['Gender'].map(gender_codes)

# Series.map yields NaN for labels missing from the mapping; fail here rather
# than letting NaN reach the scaler and the clustering steps.
assert customersml['Gender'].notna().all(), 'Gender contains values other than Male/Female'

In [None]:
# Preview the working frame to confirm Gender is now numeric.
customersml.head()

In [None]:
# Inspect column dtypes and non-null counts of the working frame.
customersml.info()

## Scaling the Variables

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scale the three continuous features so none of them dominates the
# distance computations used for clustering. Gender is left as encoded above.
scaled_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
scaler = MinMaxScaler()
customersml[scaled_columns] = scaler.fit_transform(customersml[scaled_columns])
customersml.head()

## Checking Clustering Feasibility using Hopkins Statistic

In [None]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, random_state=None):
    """Estimate the clustering tendency of ``X`` with a Hopkins-style statistic.

    Two sets of ``m`` points are compared, with ``m`` being 10% of the rows
    (at least one):

    * ``m`` probe points drawn uniformly inside the per-column min/max box of
      the data; for each, the distance to its nearest data point is taken;
    * ``m`` rows sampled from ``X`` without replacement; for each, the
      distance to its nearest *other* data point is taken.

    The result is ``sum(u) / (sum(u) + sum(w))``, where ``u`` are the
    probe-point distances and ``w`` the sampled-row distances. Plain distances
    are summed (they are not raised to the power of the dimension).

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_rows, n_features)
        Numeric feature table.
    random_state : None, int or numpy.random.Generator, optional
        Seed for the random draws. ``None`` (the default) gives a different
        value on every call; pass an integer for a reproducible result.

    Returns
    -------
    float
        The statistic, in [0, 1]. ``0.0`` is returned when it is undefined
        (all distances are zero or NaN).

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or has fewer than two rows.
    """
    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError("hopkins() expects a 2-D table of shape (n_rows, n_features)")
    n, d = data.shape
    if n < 2:
        raise ValueError("hopkins() needs at least two rows")

    # int(0.1 * n) is 0 for fewer than 10 rows, which previously ended in a
    # division by zero; always draw at least one point.
    m = max(1, int(0.1 * n))
    rng = np.random.default_rng(random_state)

    nbrs = NearestNeighbors(n_neighbors=2).fit(data)

    # Uniform probe points inside the bounding box of the data.
    lower, upper = data.min(axis=0), data.max(axis=0)
    probes = rng.uniform(lower, upper, size=(m, d))
    # Probe points are not members of the fitted data, so their nearest data
    # point is neighbour 0. The earlier version read neighbour 1 (the
    # second-nearest), which inflated the numerator and biased H upwards.
    u_dist, _ = nbrs.kneighbors(probes, n_neighbors=1, return_distance=True)

    # Rows sampled from the data are their own nearest neighbour (distance 0),
    # so the nearest *other* point is neighbour 1.
    sampled_rows = rng.choice(n, size=m, replace=False)
    w_dist, _ = nbrs.kneighbors(data[sampled_rows], n_neighbors=2, return_distance=True)

    u_sum = u_dist[:, 0].sum()
    w_sum = w_dist[:, 1].sum()
    total = u_sum + w_sum

    # Undefined statistic (e.g. every row identical): keep the historical
    # convention of returning 0.
    if total == 0 or np.isnan(total):
        return 0.0

    return float(u_sum / total)

In [None]:
# Hopkins statistic of the prepared feature table. hopkins() draws random
# points, so the value varies between runs.
hopkins(customersml)

## Clustering

### Using Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [None]:
# Agglomerative clustering of the prepared features using complete linkage on
# Euclidean distances. `mergings` is the linkage matrix consumed by
# dendrogram() and cut_tree() in the following cells.
mergings= linkage(customersml, method= 'complete', metric= 'euclidean')

In [None]:
# Draw the merge tree on an explicit Axes and label it so the figure can be
# read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
dendrogram(mergings, ax=ax)
ax.set_title('Complete-linkage dendrogram')
ax.set_xlabel('Customers')
ax.set_ylabel('Euclidean merge distance (scaled features)')
plt.show()

In [None]:
# Cut the tree into 3 clusters and flatten the result to a 1-D array holding
# one cluster id per row (preview only; the next cell stores it).
cut_tree(mergings, n_clusters= 3).reshape(-1,)

In [None]:
# Store the 3-cluster hierarchical labels on the raw (unscaled) frame so the
# clusters can later be described in original units.
dendro_labels = cut_tree(mergings, n_clusters=3).ravel()
customers['Dendro_Cluster'] = dendro_labels

In [None]:
# Preview the raw frame with the new Dendro_Cluster column.
customers.head()

### Using Elbow Curve to find optimal clusters

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Candidate cluster counts 1..10. The list is reused by the silhouette cell,
# which (like this one) skips k=1 via clusters[1:].
clusters = list(range(1, 11))
elbow_ks = clusters[1:]

# Within-cluster sum of squared distances (inertia) for each candidate k.
# A fixed random_state makes the curve reproducible across runs.
ssd = []
for num_clusters in elbow_ks:
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(customersml)
    ssd.append(kmeans.inertia_)

# Plot against the actual k values. Previously the nine inertias (k=2..10)
# were drawn at positions 0..8 and ticked 1..10, so every point was labelled
# one cluster too low.
fig, ax = plt.subplots()
ax.plot(elbow_ks, ssd, 'r--x', label='Sum of Squared Distances')
ax.set_xticks(elbow_ks)
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Sum of squared distances')
ax.legend()
plt.show()

The elbow curve suggests that 3 clusters give the best result.

Use the silhouette score to assess this further.

### Using Silhouette Score

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score for k = 2..10 (the score is undefined for a single
# cluster, hence clusters[1:]). A fixed random_state makes the printed scores
# reproducible instead of changing with every run.
for num_clusters in clusters[1:]:
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(customersml)
    sil_score = silhouette_score(X=customersml, labels=kmeans.labels_)
    print(f'Silhouette score for {num_clusters} is {round(sil_score, 4)}')

In [None]:
# Final model with k = 3, the same number of clusters used for the dendrogram
# cut and for the cluster-size summary at the end of the notebook.
# (The earlier comment said k = 4, which contradicted the code.)
# A fixed random_state keeps the cluster assignment stable across runs.

kmeans = KMeans(n_clusters=3, max_iter=50, random_state=42)
kmeans.fit(customersml)


In [None]:
# Attach the fitted K-Means label of each row to the raw (unscaled) frame.
customers['kmeans cluster']= kmeans.labels_

# Attribute Visualization

In [None]:
# One figure per attribute, with the hierarchical and K-Means clusterings side
# by side on a shared y-axis so their distributions can be compared directly.
cluster_type = ['Dendro_Cluster', 'kmeans cluster']
attributes = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

for attribute in attributes:
    fig, axes = plt.subplots(nrows=1, ncols=len(cluster_type), figsize=(20, 5), sharey=True)

    for ax, cluster_col in zip(axes, cluster_type):
        sns.boxplot(x=cluster_col, y=attribute, data=customers, ax=ax)
        ax.set_title(f'{attribute} as per {cluster_col}', fontsize=20)
        # The title already names both variables, so the axis labels are blanked.
        ax.set_xlabel('')
        ax.set_ylabel('')

    # Lay out each figure once, after both panels are drawn, instead of
    # recomputing the layout for every subplot.
    fig.tight_layout()

plt.show()


In [None]:
# Number of customers per cluster id for both clusterings.
# NOTE(review): cluster ids are assigned independently by each method, so the
# same id in the two columns does not necessarily denote the same group.
cluster_ids = np.arange(3)
cluster_size = pd.DataFrame({
    'Cluster Number': cluster_ids,
    'Dendrogram': [(customers['Dendro_Cluster'] == cid).sum() for cid in cluster_ids],
    'K Means': [(customers['kmeans cluster'] == cid).sum() for cid in cluster_ids],
})
cluster_size