In [None]:
# Suppress all warnings so library notices do not clutter the cell outputs.
# NOTE(review): the blanket 'ignore' filter also hides pandas warnings
# (e.g. SettingWithCopyWarning) that can point at real problems.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Data display customization: show every column and never truncate cell text.
pd.set_option('display.max_columns', None)
# None means "no limit"; the legacy -1 sentinel is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [None]:
# To perform Hierarchical clustering
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [None]:
# import all libraries and dependencies for machine learning
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
from math import isnan

# Data Preparation

## Data Loading

In [None]:
# Read the mall-customers CSV from the Kaggle input directory and preview it.
MALL_CSV_PATH = r"/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
mall = pd.read_csv(filepath_or_buffer=MALL_CSV_PATH)
mall.head()

In [None]:
# (rows, columns) of the freshly loaded frame
mall.shape

In [None]:
# Column names, dtypes and non-null counts
mall.info()

In [None]:
# Summary statistics produced by describe() with its default settings
mall.describe()

# Duplicate Check

In [None]:
# Build a de-duplicated copy (all columns compared); `mall` itself is left untouched.
mall_d = mall.drop_duplicates(subset=None)

In [None]:
# Shape after dropping duplicate rows
mall_d.shape

In [None]:
# Shape of the original frame, for comparison with the de-duplicated copy
mall.shape

The shape after running `drop_duplicates` is the same as that of the original dataframe.

Hence we can conclude that there were zero duplicate values in the dataset.

# Data Cleaning

Null Percentage: Columns

In [None]:
# Percentage of missing values per column, tallied by how often each percentage occurs.
null_count_by_column = mall.isnull().sum()
null_pct_by_column = null_count_by_column * 100 / len(mall)
null_pct_by_column.value_counts(ascending=False)

Null Count: Columns

In [None]:
# Number of missing values in each column
mall.isnull().sum()

Null Percentage: Rows

In [None]:
# Percentage of missing cells within each row. A row holds one cell per column,
# so the denominator is the column count (mall.shape[1]), not the row count.
null_pct_by_row = mall.isnull().sum(axis=1) * 100 / mall.shape[1]
null_pct_by_row.value_counts(ascending=False)

Null Count: Rows

In [None]:
# Tally of rows by the number of missing cells they contain
mall.isnull().sum(axis=1).value_counts(ascending=False)

There are no missing / Null values either in columns or rows

# Exploratory Data Analytics

Univariate Analysis

**Gender**

In [None]:
# Bar chart of the number of customers per gender, each bar labelled with its count.
plt.figure(figsize = (5,5))
ax = sns.countplot(x='Gender', data= mall)
for p in ax.patches:
    # Anchor the label at the bar centre; scaling get_x() by 1.01 shifts labels
    # by an amount that depends on the bar's position on the axis.
    ax.annotate('{:.0f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom')
plt.xticks(rotation=90)
plt.show()

The data is not balanced: 27% more females than males have participated.

**Age**

In [None]:
 
# Bar chart of the number of customers per age, each bar labelled with its count.
plt.figure(figsize = (20,5))
ax = sns.countplot(x='Age', data= mall)
for p in ax.patches:
    # Anchor the label at the bar centre; scaling get_x() by 1.01 pushes the
    # labels further right the further right the bar sits.
    ax.annotate('{:.0f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom')

plt.show()

The customers' ages range from 18 to 70.

**Annual Income (k$)**

In [None]:
# Bar chart of the number of customers per annual-income value, each bar labelled with its count.
plt.figure(figsize = (25,5))
ax = sns.countplot(x='Annual Income (k$)', data= mall)
for p in ax.patches:
    # Anchor the label at the bar centre; scaling get_x() by 1.01 pushes the
    # labels further right the further right the bar sits.
    ax.annotate('{:.0f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom')

plt.show()

The customers' Annual Income (k$) ranges from 15 to 137.

**Spending Score (1-100)**

In [None]:
# Bar chart of the number of customers per spending-score value, each bar labelled with its count.
plt.figure(figsize = (27,5))
ax = sns.countplot(x='Spending Score (1-100)', data= mall)
for p in ax.patches:
    # Anchor the label at the bar centre; scaling get_x() by 1.01 pushes the
    # labels further right the further right the bar sits.
    ax.annotate('{:.0f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom')

plt.show()

The customers' Spending Score (1-100) ranges from 1 to 99.

In [None]:
# Let's check the correlation coefficients to see which variables are highly correlated

plt.figure(figsize = (5,5))
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps non-numeric columns such as 'Gender' from raising in pandas >= 2.0,
# where DataFrame.corr() no longer drops them silently.
numeric_corr = mall.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot = True, cmap="rainbow")
plt.savefig('Correlation')
plt.show()

- Age and Spending Score (1-100) are moderately correlated with correlation of -0.33

In [None]:
# Pairwise scatter plots (lower triangle only) with KDE curves on the diagonal.
pair_grid = sns.pairplot(data=mall, diag_kind="kde", corner=True)
plt.show()

## Outlier Analysis

In [None]:
# Summary statistics before outlier treatment (baseline for the later comparison)
mall.describe()

In [None]:
# One violin plot per numeric feature, side by side, to inspect spread and outliers.
f, axes = plt.subplots(1,3, figsize=(15,5))
for panel, feature in zip(axes, ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']):
    s = sns.violinplot(y=mall[feature], ax=panel)
    panel.set_title(feature)
plt.show()


There is an outlier in the Annual Income (k$) field, but Age and Spending Score (1-100) have no outliers.

## We use Percentile Capping (Winsorization) for outliers handling

In [None]:
# Winsorize annual income: cap values at the 1st and 99th percentiles.
# (The names Q1/Q3 are kept for compatibility; they hold the 1% and 99% quantiles.)
Q3 = mall['Annual Income (k$)'].quantile(0.99)
Q1 = mall['Annual Income (k$)'].quantile(0.01)
# Series.clip replaces the chained-indexing assignments
# (`mall[col][mask] = value`), which raise SettingWithCopyWarning and do not
# modify `mall` at all when pandas copy-on-write is enabled.
mall['Annual Income (k$)'] = mall['Annual Income (k$)'].clip(lower=Q1, upper=Q3)

In [None]:
# Summary statistics after outlier treatment (compare with the earlier describe() output)
mall.describe()

In [None]:
# Same three violin plots as before the capping step, redrawn on the current data.
f, axes = plt.subplots(1,3, figsize=(15,5))
for panel, feature in zip(axes, ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']):
    s = sns.violinplot(y=mall[feature], ax=panel)
    panel.set_title(feature)
plt.show()

In [None]:
# Dropping CustomerID,Gender field to form cluster

# `drop(..., inplace=True)` returns None, so the original left `mall_c` set to
# None. Dropping without inplace yields a real frame, and errors='ignore'
# lets the cell be re-run without a KeyError once the columns are gone.
mall = mall.drop(columns=['CustomerID', 'Gender'], errors='ignore')
mall_c = mall

In [None]:
# Preview the frame after CustomerID and Gender have been dropped
mall.head()

# Hopkins Statistics Test

The Hopkins statistic (introduced by Brian Hopkins and John Gordon Skellam) is a way of measuring the cluster tendency of a data set. It acts as a statistical hypothesis test where the null hypothesis is that the data is generated by a Poisson point process and is thus uniformly randomly distributed. A value close to 1 tends to indicate the data is highly clustered, random data will tend to result in values around 0.5, and uniformly distributed data will tend to result in values close to 0.

• If the value is between {0.01, ...,0.3}, the data is regularly spaced.

• If the value is around 0.5, it is random.

• If the value is between {0.7, ..., 0.99}, it has a high tendency to cluster.

In [None]:
def hopkins(X, random_state=None):
    """Estimate the Hopkins statistic of ``X`` to gauge its cluster tendency.

    ``m`` points are drawn uniformly from the bounding box of the data and
    ``m`` real observations are sampled without replacement. With ``u`` the
    distances from the uniform points to their nearest observation and ``w``
    the distances from the sampled observations to their nearest *other*
    observation, the statistic is ``sum(u) / (sum(u) + sum(w))``.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like of numbers
        Observations in rows, features in columns.
    random_state : int or None, default None
        Seed for the random draws. With ``None`` the generator is seeded from
        fresh entropy, so repeated calls give different scores.

    Returns
    -------
    float
        Value in [0, 1]. ``0`` is returned when the statistic is undefined
        (all distances are zero or the result is NaN).

    Raises
    ------
    ValueError
        If ``X`` has fewer than two observations.
    """
    data = np.asarray(X, dtype=float)
    n, d = data.shape
    if n < 2:
        raise ValueError("hopkins() needs at least two observations")
    # Sample 10% of the rows, but never zero of them (n < 10 used to give m == 0
    # and a ZeroDivisionError).
    m = max(1, int(0.1 * n))
    rng = np.random.RandomState(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(data)

    sampled_rows = rng.choice(n, size=m, replace=False)
    synthetic = rng.uniform(data.min(axis=0), data.max(axis=0), size=(m, d))

    # Uniform points are not part of the data, so their nearest neighbour is
    # column 0 (the original took column 1, i.e. the second-nearest point).
    u_dist, _ = nbrs.kneighbors(synthetic, 1, return_distance=True)
    # Sampled observations find themselves at distance 0 in column 0, so the
    # nearest *other* observation is column 1.
    w_dist, _ = nbrs.kneighbors(data[sampled_rows], 2, return_distance=True)
    ujd = u_dist[:, 0]
    wjd = w_dist[:, 1]

    total = ujd.sum() + wjd.sum()
    if total == 0 or isnan(total):
        # Degenerate input: report the distances and fall back to 0 as before.
        print(ujd, wjd)
        return 0

    return float(ujd.sum() / total)

In [None]:
# Hopkins score of the clustering features, rounded to two decimals.
# hopkins() draws random samples, so the value can differ between runs.
Hopkins_score=round(hopkins(mall),2)

In [None]:
# Only call the score "good" when it actually indicates cluster tendency;
# 0.7 is the lower bound of the "high tendency to cluster" band.
if Hopkins_score >= 0.7:
    print("{} is a good Hopkins score for Clustering.".format(Hopkins_score))
else:
    print("{} is a weak Hopkins score; the data shows little tendency to cluster.".format(Hopkins_score))

# Rescaling the Features

Most software packages use SVD to compute the principal components and assume that the data is scaled and centred, so it is important to do standardisation/normalisation. There are two common ways of rescaling:

- Min-Max scaling
- Standardisation (mean-0, sigma-1)

Here, we will use Standardisation Scaling.

In [None]:
# Standardisation: learn each column's mean and standard deviation, then
# rescale the columns to zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(mall)
mall_scaled = scaler.transform(mall)

In [None]:
# Preview the first rows only instead of dumping the whole scaled array.
mall_scaled[:5]

In [None]:
# Wrap the scaled array in a DataFrame. Labels and index are taken from the
# frame that was scaled, so they cannot drift out of sync with its column order.
mall_df1 = pd.DataFrame(mall_scaled, columns=mall.columns, index=mall.index)
mall_df1.head()


# Model Building

## Hierarchical Clustering

Hierarchical clustering involves creating clusters that have a predetermined ordering from top to bottom. For example, all files and folders on the hard disk are organized in a hierarchy. There are two types of hierarchical clustering,

- Divisive
- Agglomerative.

## Single Linkage:

In single linkage hierarchical clustering, the distance between two clusters is defined as the shortest distance between two points in each cluster. For example, the distance between clusters “r” and “s” to the left is equal to the length of the arrow between their two closest points.

A fundamental step for any unsupervised algorithm is to determine the optimal number of clusters into which the data may be clustered. The Elbow Method is one of the most popular methods to determine this optimal value of k.

In [None]:
# Single linkage: build the merge tree on the scaled features and draw its dendrogram.
single_fig, single_ax = plt.subplots(figsize=(20, 10))
mergings = linkage(mall_df1, metric='euclidean', method='single')
dendrogram(mergings, ax=single_ax)
plt.show()

Looking at the above dendrogram, it looks good to proceed with 4 clusters.

## Complete Linkage

In complete linkage hierarchical clustering, the distance between two clusters is defined as the longest distance between two points in each cluster. For example, the distance between clusters “r” and “s” to the left is equal to the length of the arrow between their two furthest points.

In [None]:
# Complete linkage: build the merge tree on the scaled features and draw its dendrogram.
complete_fig, complete_ax = plt.subplots(figsize=(20, 10))
mergings = linkage(mall_df1, metric='euclidean', method='complete')
dendrogram(mergings, ax=complete_ax)
plt.show()

We will opt for 4 clusters.

In [None]:
# Cut the current linkage tree into 4 clusters and flatten the (n, 1) result to 1-D.
tree_cut = cut_tree(mergings, n_clusters=4)
cluster_labels = tree_cut.ravel()
cluster_labels

In [None]:
# Attach each customer's cluster label to the unscaled customer table.

mall_d = mall_d.assign(Cluster_Id=cluster_labels)
mall_d.head()

In [None]:
## Number of customers in each cluster, smallest cluster first
cluster_sizes = mall_d['Cluster_Id'].value_counts(ascending=True)
cluster_sizes

In [None]:
# Column labels of the labelled customer table
mall_d.columns

It seems there is a good number of customers in each cluster.

In [None]:
# Three stacked scatter plots, one per feature pair, coloured by cluster.
plt.figure(figsize = (20,15))
feature_pairs = [
    ('Age', 'Annual Income (k$)'),
    ('Annual Income (k$)', 'Spending Score (1-100)'),
    ('Spending Score (1-100)', 'Age'),
]
for position, (x_col, y_col) in enumerate(feature_pairs, start=1):
    plt.subplot(3, 1, position)
    sns.scatterplot(x=x_col, y=y_col, hue='Cluster_Id', data=mall_d, legend='full', palette="Set1")
plt.show()

In [None]:
 #Violin plot on Original attributes to visualize the spread of the data

# One panel per feature, each showing that feature's distribution within every cluster.
fig, axes = plt.subplots(1,3, figsize=(20,5))
for panel, feature in zip(axes, ('Age', 'Annual Income (k$)', 'Spending Score (1-100)')):
    sns.violinplot(x='Cluster_Id', y=feature, data=mall_d, ax=panel)
plt.show()

In [None]:
# Preview the customer table together with its Cluster_Id column
mall_d.head()

In [None]:
# Per-cluster mean of each numeric feature.
profile_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
mall_d.groupby('Cluster_Id')[profile_features].mean()

Cluster 0  are those people whose 
- Avg Age : 54
- Avg Annual Income (k$) : 47.7k
- Avg Spending Score (1-100) : 40 

We can label them Medium Spender 

In [None]:
# Customers assigned to cluster 0.
in_cluster_0 = mall_d['Cluster_Id'].eq(0)
group_0 = mall_d.loc[in_cluster_0]
group_0.head()

In [None]:
# Compare the three numeric features between genders within cluster 0.
fig, axes = plt.subplots(1,3, figsize=(20,5))
for panel, feature in zip(axes, ('Age', 'Annual Income (k$)', 'Spending Score (1-100)')):
    sns.violinplot(x='Gender', y=feature, data=group_0, ax=panel)
plt.show()

- Mean Age of this cluster for Male is more than Females
- Males earn more than females
- Mean Spending Score (1-100) is same for both gender 

Cluster 1  are those people whose 
- Avg Age : 25
- Avg Annual Income (k$) : 40 k
- Avg Spending Score (1-100) : 60 

We can label them Large Spender

In [None]:
# Customers assigned to cluster 1.
in_cluster_1 = mall_d['Cluster_Id'].eq(1)
group_1 = mall_d.loc[in_cluster_1]
group_1.head()

In [None]:
# Compare the three numeric features between genders within cluster 1.
fig, axes = plt.subplots(1,3, figsize=(20,5))
for panel, feature in zip(axes, ('Age', 'Annual Income (k$)', 'Spending Score (1-100)')):
    sns.violinplot(x='Gender', y=feature, data=group_1, ax=panel)
plt.show()

- Mean Age of this cluster are same for both genders 
- Males earn more than females
- Mean Spending Score (1-100) is more for males 

Cluster 2 are those people whose 
- Avg Age : 32
- Avg Annual Income (k$) : 86 k
- Avg Spending Score (1-100) : 81

We can label them Extra Spender

In [None]:
# Customers assigned to cluster 2.
in_cluster_2 = mall_d['Cluster_Id'].eq(2)
group_2 = mall_d.loc[in_cluster_2]
group_2.head()

In [None]:
# Compare the three numeric features between genders within cluster 2.
fig, axes = plt.subplots(1,3, figsize=(20,5))
for panel, feature in zip(axes, ('Age', 'Annual Income (k$)', 'Spending Score (1-100)')):
    sns.violinplot(x='Gender', y=feature, data=group_2, ax=panel)
plt.show()

- Age range for males are higher than females 
- Males earn more than females
- Mean Spending Score (1-100) is more for males 

Cluster 3 are those people whose 
- Avg Age : 40
- Avg Annual Income (k$) : 86.5 k
- Avg Spending Score (1-100) : 19

We can label them Low Spender

In [None]:
# Customers assigned to cluster 3.
in_cluster_3 = mall_d['Cluster_Id'].eq(3)
group_3 = mall_d.loc[in_cluster_3]
group_3.head()

In [None]:
# Compare the three numeric features between genders within cluster 3.
fig, axes = plt.subplots(1,3, figsize=(20,5))
for panel, feature in zip(axes, ('Age', 'Annual Income (k$)', 'Spending Score (1-100)')):
    sns.violinplot(x='Gender', y=feature, data=group_3, ax=panel)
plt.show()

- Age range for males are higher than females 
- Annual Income range for males are lower than females 
- Mean Spending Score (1-100) is more for females 

In [None]:
# Per-cluster mean of each numeric feature, repeated here next to the final recommendations.
summary_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
mall_d.groupby('Cluster_Id')[summary_features].mean()

Final Points 

- Target Cluster 1 with more offers.
- Reward Cluster 2 customers for being loyal.
- Improve the services to attract Cluster 3.
- Target Cluster 0 with better employee support.