In [None]:
import pandas as pd
import numpy as np

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found there.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Mall Customers dataset from the read-only Kaggle input directory.
DATA_PATH = "/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
df.head()

In [None]:
df.describe()

25% of customers are aged 28 or younger, 50% are 36 or younger (the median), and 75% are 49 or younger.

# Histogram of numerical features

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Keep only the non-object columns; the first four of them fill a 2x2 grid.
df_plots = df.select_dtypes(exclude="object")

fig = make_subplots(rows=2, cols=2, subplot_titles=df_plots.columns)

for position in range(4):
    # position 0..3 -> (row, col) in reading order; plotly grids are 1-based
    grid_row, grid_col = divmod(position, 2)
    column_name = df_plots.columns[position]
    fig.append_trace(go.Histogram(x=df[column_name]), grid_row + 1, grid_col + 1)

# Last expression of the cell: the returned figure is what gets displayed.
fig.update_layout(height=900, width=1200, title_text="Numerical Attributes")

Most distributions seem to follow a somewhat Gaussian distribution.

Let's write the above code as a function so we can use it later.

In [None]:
def plot_hist_num(df, n_cols=3, cell_height=300, cell_width=300):
    """Show one histogram per non-object column of ``df`` on a subplot grid.

    The grid is sized from the number of columns found, so the function works
    for any frame instead of only frames with exactly three numeric columns.
    With three columns and the default arguments the figure is a single row of
    three 300x300 subplots (300 high, 900 wide).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose non-object columns are plotted.
    n_cols : int, default 3
        Maximum number of subplots per row.
    cell_height, cell_width : int, default 300
        Pixel size reserved for each subplot.

    Raises
    ------
    ValueError
        If ``n_cols`` is smaller than 1 or ``df`` has no non-object column.
    """
    if n_cols < 1:
        raise ValueError("n_cols must be at least 1")

    df_plots = df.select_dtypes(exclude="object")
    column_names = list(df_plots.columns)
    if not column_names:
        raise ValueError("plot_hist_num needs at least one non-object column")

    grid_cols = min(n_cols, len(column_names))
    grid_rows = -(-len(column_names) // grid_cols)  # ceiling division without math.ceil

    fig = make_subplots(rows=grid_rows, cols=grid_cols,
                        subplot_titles=[str(name) for name in column_names])

    for position, name in enumerate(column_names):
        # position -> (row, col) in reading order; plotly grids are 1-based
        grid_row, grid_col = divmod(position, grid_cols)
        fig.add_trace(go.Histogram(x=df_plots[name]), row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(height=cell_height * grid_rows,
                      width=cell_width * grid_cols,
                      title_text="Numerical Attributes")
    fig.show()

# Missing values

In [None]:
df.isnull().sum()

# Outliers

We will check this with the skewness value, which measures how asymmetric a distribution is (a symmetric distribution such as the Gaussian has a skewness of 0). As a rule of thumb, the value should lie between -1 and +1; any major deviation from this range indicates the presence of extreme values.

In [None]:
# numeric_only=True restricts the computation to numeric columns. Without it,
# pandas >= 2.0 raises a TypeError as soon as the frame still contains a
# non-numeric column (presumably Gender at this point - it is only dropped later).
df.skew(numeric_only=True)

Since the skewness values lie in the desired range, no changes will be made to the dataset.

Source: https://stats.stackexchange.com/questions/328109/k-means-does-it-make-sense-to-remove-the-outliers-after-clustering-the-datasets#:~:text=4%20Answers&text=K%2Dmeans%20can%20be%20quite,means%2C%20or%20you%20use%20DBSCAN.

If there were outliers, we would need to treat them, because K Means is sensitive to outliers. We would have two options:

1. Remove outlier first and then apply your clustering algorithm (for this step itself you may use clustering algorithms!). Please note that k-means itself is not a Soft Clustering algorithm so it does not model the overlaps. For that you may use algorithms like Fuzzy C-Means. There you can define an overlap by clusters for which the memberships of a sample are closer than a threshold.

2. Ignore the outlier removal and just use more robust variations of K-means, e.g. K-medoids or K-Medians, to reduce the effect of outliers.

# Dropping unnecessary features

We will drop the CustomerID column (a row identifier) as well as the Gender column, so that only the three numeric features remain.

In [None]:
# errors='ignore' makes this cell safe to re-run: once the columns are gone,
# a second in-place drop would otherwise raise a KeyError.
df.drop(columns=['CustomerID', 'Gender'], inplace=True, errors='ignore')
df.head(2)

# Standardizing the dataset using Power Transformer

In [None]:
df.shape

In [None]:
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()

# PowerTransformer() takes input of the form {array-like, sparse matrix, dataframe}
# of shape (n_samples, n_features). The frame's values already have that shape, so
# they are passed as-is: the former reshape(-1, 3) hard-coded the number of features
# and would have silently mixed values from different rows if the column count changed.
df_transformed = pt.fit_transform(df.to_numpy())

In [None]:
# Wrap the transformed array in a DataFrame (same column labels as df) so it can be plotted
pd_df_transformed = pd.DataFrame(data=df_transformed, columns=df.columns)

# Plot the histograms to see how the distributions changed
plot_hist_num(pd_df_transformed)

In [None]:
pd_df_transformed.describe()

Features are now standardized and have a gaussian distribution.

# Standardizing the dataset using Quantile Transformer

Source: https://scikit-learn.org/stable/modules/preprocessing.html#non-linear-transformation

Two types of transformations are available: quantile transforms and power transforms.

QuantileTransformer applies a non-linear transformation such that the probability density function of each feature will be mapped to a uniform distribution. As RobustScaler, QuantileTransformer is robust to outliers in the sense that adding or removing outliers in the training set will yield approximately the same transformation on held out data.

<b> Effect of different transformations on different types of distributions </b>

<img src="https://scikit-learn.org/stable/_images/sphx_glr_plot_map_data_to_normal_0011.png">

We will also use Quantile transformer on our dataset to see if it performs better than Power Transformer.

In [None]:
from sklearn.preprocessing import QuantileTransformer

# n_quantiles defaults to 1000; cap it at the number of rows so that scikit-learn
# does not have to warn and clamp the value itself when the frame is smaller than that.
qt = QuantileTransformer(n_quantiles=min(1000, len(df)), random_state=0)

# QuantileTransformer() takes input of the form {array-like, sparse matrix, dataframe}
# of shape (n_samples, n_features). The frame's values already have that shape, so no
# reshape with a hard-coded feature count is needed.
df_quantile_transformed = qt.fit_transform(df.to_numpy())

In [None]:
# Wrap the quantile-transformed array in a DataFrame (same column labels as df) so it can be plotted
pd_df_quantile_transformed = pd.DataFrame(data=df_quantile_transformed, columns=df.columns)

# Plot the histograms to see how the distributions changed
plot_hist_num(pd_df_quantile_transformed)

Power Transformer seems to have done a better job and hence we will stick with it.

# T-SNE

Source: https://jakevdp.github.io/PythonDataScienceHandbook/05.11-k-means.html

We can use the t-distributed stochastic neighbor embedding (t-SNE) algorithm to pre-process the data before performing clustering. t-SNE is a nonlinear embedding algorithm that is particularly adept at preserving points within clusters. 

In [None]:
# Disabled experiment: the t-SNE projection below is wrapped in a string literal,
# so this cell does not import or run anything.
'''
from sklearn.manifold import TSNE

# Project the data: this step will take several seconds
tsne = TSNE(n_components=2, init='random', random_state=0)

#Fit_transform() accpets input of the type array, shape (n_samples, n_features) 
sne_df_transformed = tsne.fit_transform(df_transformed)
'''

In [None]:
# Disabled experiment: the t-SNE scatter plot below is wrapped in a string literal,
# so this cell draws nothing.
'''
#plot the clusters obtained from t-SNE
fig = go.Figure(data=go.Scatter(x=sne_df_transformed.T[0],
                                y=sne_df_transformed.T[1],
                                mode='markers')) 

fig.update_layout(title='t-SNE distribution of data')
fig.show()
'''

# MDS for visualization

t-SNE is framed as a visualization tool rather than a pre-processing or analysis tool.

Source: https://stats.stackexchange.com/questions/351474/does-it-make-sense-to-run-dbscan-on-the-output-from-t-sne

T-SNE is a manifold technique and as such does not preserve distances; therefore it is not recommended to run distance-based (e.g. k-means) or density-based (e.g. DBSCAN) clustering algorithms on the output of T-SNE. If you want a dimensional reduction algorithm that does preserve distances, you can use PCA  instead of T-SNE.

In [None]:
from sklearn.manifold import MDS

# MDS starts from a random configuration; fixing random_state keeps the 2-D
# embedding (and every scatter plot drawn on it) identical between runs.
mds = MDS(n_components=2, random_state=0)

mds_df_transformed = mds.fit_transform(df_transformed)

In [None]:
# Plot the 2-D MDS embedding of the transformed data (no cluster colouring yet)
mds_scatter = go.Scatter(
    x=mds_df_transformed[:, 0],
    y=mds_df_transformed[:, 1],
    mode='markers',
)
fig = go.Figure(data=mds_scatter)

fig.update_layout(title='MDS Transformed data for visualization')
fig.show()


# Clustering

We will try different clustering techniques and see which one does best using the Silhouette score.

## 1. K Means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import plotly.express as px

In [None]:
# Average silhouette score of K Means for k = 2..9.
# random_state and n_init are fixed so that the curve - and the k read off it -
# is the same on every run and across scikit-learn versions.
candidate_ks = list(range(2, 10))
silhouette_k_means = []

for k in candidate_ks:
    k_test = KMeans(n_clusters=k, n_init=10, random_state=0)
    cluster_labels = k_test.fit_predict(df_transformed)
    silhouette_k_means.append(silhouette_score(df_transformed, cluster_labels))

px.line(x=candidate_ks, y=silhouette_k_means,
        labels={'x': 'Number of clusters (k)', 'y': 'Average silhouette score'})

We will choose the number of clusters as 6 based on the above graph

In [None]:
# silhouette_score_compiled maps each model name to its silhouette score.
# db_score_compiled maps each model name to its Davies Bouldin score.
silhouette_score_compiled = dict()
db_score_compiled = dict()

In [None]:
#plug in optimal number of clusters
# `metrics` is imported here because this cell needs it on a fresh kernel; the only
# other import of it sits further down, in the DBSCAN section.
from sklearn import metrics

# random_state fixes the cluster numbering between runs, which the per-cluster
# interpretation at the end of the notebook depends on (re-check that text against
# the output if the seed is changed). n_init is explicit so that results do not
# depend on the scikit-learn version's default.
k_means = KMeans(n_clusters=6, n_init=10, random_state=0)
kmeans_labels = k_means.fit_predict(df_transformed)
silhouette_score_compiled['K Means'] = silhouette_score(df_transformed, kmeans_labels)
db_score_compiled['K Means'] = metrics.davies_bouldin_score(df_transformed, kmeans_labels)
print(silhouette_score_compiled)

In [None]:
# Colour the 2-D MDS embedding by the K Means label of each point
kmeans_scatter = go.Scatter(
    x=mds_df_transformed[:, 0],
    y=mds_df_transformed[:, 1],
    mode='markers',
    marker_color=kmeans_labels,
    text=kmeans_labels,
)
fig = go.Figure(data=kmeans_scatter)

fig.update_layout(title='K Means')
fig.show()


## 2. Mean Shift

In [None]:
# `metrics` is imported here because this cell needs it on a fresh kernel; the only
# other import of it sits further down, in the DBSCAN section.
from sklearn import metrics
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth

# Estimate the kernel bandwidth from the data, then cluster with it.
est_bandwidth = estimate_bandwidth(df_transformed, quantile=0.1, n_samples=10000)
ms = MeanShift(bandwidth=est_bandwidth)
ms_labels = ms.fit_predict(df_transformed)
silhouette_score_compiled['Mean Shift'] = silhouette_score(df_transformed, ms_labels)
db_score_compiled['Mean Shift'] = metrics.davies_bouldin_score(df_transformed, ms_labels)
print(silhouette_score_compiled)

In [None]:
# Colour the 2-D MDS embedding by the Mean Shift label of each point
ms_scatter = go.Scatter(
    x=mds_df_transformed[:, 0],
    y=mds_df_transformed[:, 1],
    mode='markers',
    marker_color=ms_labels,
    text=ms_labels,
)
fig = go.Figure(data=ms_scatter)

fig.update_layout(title='Mean Shift')
fig.show()


## 3. DBSCAN

In [None]:
from sklearn.cluster import DBSCAN 
from matplotlib import pyplot as plt
from sklearn.neighbors import NearestNeighbors

#### Choosing optimal epsilon value

Source: https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc#:~:text=In%20layman's%20terms%2C%20we%20find,and%20select%20that%20as%20epsilon.

We must provide a value for epsilon which defines the maximum distance between two points. The following paper, describes an approach for automatically determining the optimal value for Eps:
https://iopscience.iop.org/article/10.1088/1755-1315/31/1/012012/pdf

In layman’s terms, we find a suitable value for epsilon by calculating the distance to the nearest n points for each point, sorting and plotting the results. Then we look to see where the change is most pronounced (think of the angle between your arm and forearm) and select that as epsilon.

We can calculate the distance from each point to its closest neighbour using the NearestNeighbors. The point itself is included in n_neighbors. The kneighbors method returns two arrays, one which contains the distance to the closest n_neighbors points and the other which contains the index for each of those points.

In [None]:
# n_neighbors=2 because each point is returned as its own nearest neighbour:
# column 0 holds that self-match, column 1 the distance to the closest other point.
nbrs = NearestNeighbors(n_neighbors=2).fit(df_transformed)
distances, indices = nbrs.kneighbors(df_transformed)

# Sort the nearest-neighbour distances in ascending order and plot the curve
distances = np.sort(distances[:, 1])
plt.plot(distances)

The optimal value for epsilon will be found at the point of maximum curvature of this curve. Rather than fixing eps and min_samples from the plot alone, we use it as a starting point and select both values with the grid search below.

#### Choosing optimal 'minPts'

Source: https://stackoverflow.com/questions/12893492/choosing-eps-and-minpts-for-dbscan-r

A low minPts means it will build more clusters from noise, so don't choose it too small.

minPts is best set by a domain expert who understands the data well. Unfortunately, in many cases we do not have that domain knowledge, especially after the data has been normalized. One heuristic approach is to use ln(n), where n is the total number of points to be clustered.

In [None]:
np.log(len(df_transformed))

In [None]:
# we will select the optimal values using grid search method
from sklearn import metrics

# One record per (eps, min_samples) pair; the frame is built once at the end.
# DataFrame.append no longer exists in pandas >= 2.0, and growing a frame row by
# row was quadratic anyway.
db_records = []
n_points = len(df_transformed)

for eps_step in range(1, 12):
    # round() keeps eps at 0.2, 0.4, ... instead of float artefacts such as 0.6000000000000001
    eps = round(eps_step * 0.2, 1)
    for min_samples in range(1, 12):
        clusters = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(df_transformed)
        n_labels = len(np.unique(clusters))
        # NOTE: DBSCAN labels noise points -1, so n_labels and both scores treat
        # noise as one more cluster. The upper bound skips the degenerate case of
        # every point being its own cluster, for which the scores are undefined.
        if 2 < n_labels < n_points:
            db_records.append({
                'Eps': eps,
                'Min_Samples': min_samples,
                'Number of Cluster': n_labels,
                'Silhouette Score': metrics.silhouette_score(df_transformed, clusters),
                'Davies Bouldin Score': metrics.davies_bouldin_score(df_transformed, clusters),
            })

db_results = pd.DataFrame(
    db_records,
    columns=['Eps', 'Min_Samples', 'Number of Cluster', 'Silhouette Score', 'Davies Bouldin Score'],
)

In [None]:
db_results.sort_values('Silhouette Score',ascending=False)[:5]

In [None]:
# Final DBSCAN model: eps=0.6 and min_samples=6
dbscan = DBSCAN(min_samples=6, eps=0.6)
dbscan_labels = dbscan.fit_predict(df_transformed)

# Record both quality scores under the model's name
silhouette_score_compiled.update({'DBSCAN': silhouette_score(df_transformed, dbscan_labels)})
db_score_compiled.update({'DBSCAN': metrics.davies_bouldin_score(df_transformed, dbscan_labels)})
print(silhouette_score_compiled)

In [None]:
# Colour the 2-D MDS embedding by the DBSCAN label of each point
dbscan_scatter = go.Scatter(
    x=mds_df_transformed[:, 0],
    y=mds_df_transformed[:, 1],
    mode='markers',
    marker_color=dbscan_labels,
    text=dbscan_labels,
)
fig = go.Figure(data=dbscan_scatter)

fig.update_layout(title='DBSCAN')
fig.show()


## 4. Gaussian Mixture Models (GMMs)

#### Optimal n_components

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn import metrics

We will also compute Davies Bouldin score. The metric is Davies Bouldin that is defined as the average similarity measure of each cluster with its most similar cluster, where similarity is the ratio of within-cluster distances to between-cluster distances. The minimum score is zero, with lower values indicating better clustering.

In [None]:
# Grid search over covariance type and number of mixture components.
parameters = ['full', 'tied', 'diag', 'spherical']
# Start at 2: a single component can never pass the ">= 2 labels" check below.
n_clusters = np.arange(2, 10)

# One record per configuration; the frame is built once at the end.
# DataFrame.append no longer exists in pandas >= 2.0.
gmm_records = []
for covariance_type in parameters:
    for n_components in n_clusters:
        gmm_cluster = GaussianMixture(n_components=int(n_components),
                                      covariance_type=covariance_type,
                                      random_state=123)
        clusters = gmm_cluster.fit_predict(df_transformed)
        # Some components may end up with no points assigned; both scores need at least two labels.
        if len(np.unique(clusters)) >= 2:
            gmm_records.append({
                'Covariance Type': covariance_type,
                'Number of Cluster': int(n_components),
                'Silhouette Score': metrics.silhouette_score(df_transformed, clusters),
                'Davies Bouldin Score': metrics.davies_bouldin_score(df_transformed, clusters),
            })

results_ = pd.DataFrame(
    gmm_records,
    columns=['Covariance Type', 'Number of Cluster', 'Silhouette Score', 'Davies Bouldin Score'],
)

In [None]:
results_.sort_values('Silhouette Score',ascending=False)[:5]

In [None]:
# Same random_state as the grid search above, so that the configuration picked
# from its results table is the model that is actually fitted and scored here.
gmm_labels = GaussianMixture(n_components=7, covariance_type='tied',
                             random_state=123).fit_predict(df_transformed)
silhouette_score_compiled['GMM'] = silhouette_score(df_transformed, gmm_labels)
db_score_compiled['GMM'] = metrics.davies_bouldin_score(df_transformed, gmm_labels)
print(silhouette_score_compiled)

In [None]:
# Colour the 2-D MDS embedding by the GMM label of each point
gmm_scatter = go.Scatter(
    x=mds_df_transformed[:, 0],
    y=mds_df_transformed[:, 1],
    mode='markers',
    marker_color=gmm_labels,
    text=gmm_labels,
)
fig = go.Figure(data=gmm_scatter)

fig.update_layout(title='GMM')
fig.show()


## 5. Agglomerative Hierarchical Clustering

Hierarchical clustering is a clustering technique that aims to create a tree-like clustering hierarchy within the data. With this model, we can use a dendrogram to determine n_clusters.

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Grid search over linkage type and number of clusters.
parameters = ['ward', 'complete', 'average', 'single']
# Start at 2: a single cluster can never pass the ">= 2 labels" check below.
n_clusters = np.arange(2, 10)

# One record per configuration; the frame is built once at the end.
# DataFrame.append no longer exists in pandas >= 2.0.
agh_records = []
for linkage in parameters:
    for n_cluster in n_clusters:
        agh_cluster = AgglomerativeClustering(n_clusters=int(n_cluster), linkage=linkage)
        clusters = agh_cluster.fit_predict(df_transformed)
        # Both scores need at least two distinct labels.
        if len(np.unique(clusters)) >= 2:
            agh_records.append({
                'Linkage Type': linkage,
                'Number of Cluster': int(n_cluster),
                'Silhouette Score': metrics.silhouette_score(df_transformed, clusters),
                'Davies Bouldin Score': metrics.davies_bouldin_score(df_transformed, clusters),
            })

agh_cluster_results_ = pd.DataFrame(
    agh_records,
    columns=['Linkage Type', 'Number of Cluster', 'Silhouette Score', 'Davies Bouldin Score'],
)

In [None]:
agh_cluster_results_.sort_values('Silhouette Score',ascending=False)[:5]

In [None]:
# Final agglomerative model: 8 clusters with average linkage
agh_model = AgglomerativeClustering(linkage='average', n_clusters=8)
agh_labels = agh_model.fit_predict(df_transformed)

# Record both quality scores under the model's name
silhouette_score_compiled.update(
    {'Agglomerative Hierarchical Clustering': silhouette_score(df_transformed, agh_labels)}
)
db_score_compiled.update(
    {'Agglomerative Hierarchical Clustering': metrics.davies_bouldin_score(df_transformed, agh_labels)}
)
print(silhouette_score_compiled)

In [None]:
# Colour the 2-D MDS embedding by the agglomerative clustering label of each point
agh_scatter = go.Scatter(
    x=mds_df_transformed[:, 0],
    y=mds_df_transformed[:, 1],
    mode='markers',
    marker_color=agh_labels,
    text=agh_labels,
)
fig = go.Figure(data=agh_scatter)

fig.update_layout(title='Agglomerative Hierarchical Clustering')
fig.show()


# Compare the results

In [None]:
# One row per algorithm for each metric, joined on the algorithm name
ss_df = pd.DataFrame(list(silhouette_score_compiled.items()), columns=['Algo', 'Silhouette Score'])
db_df = pd.DataFrame(list(db_score_compiled.items()), columns=['Algo', 'Davies Bouldin Score'])
final_results = ss_df.merge(db_df, on="Algo")

# Best silhouette score first
final_results.sort_values(by='Silhouette Score', ascending=False)

K-Means has the best Silhouette and the second best Davies Bouldin score. For this reason, K-Means Algorithm is more suitable for customer segmentation. Thus we have 6 customer types. Let’s try to understand behaviours or labels of customers.

# Understanding the results

In [None]:
# Attach the K Means labels to the feature frame as a new column.
# NOTE: this adds a column to `df` in place, so cells above that read every
# column of `df` will see it if they are re-run without reloading the data.
df['Final Clusters']=kmeans_labels
df.head(4)

In [None]:
df['Final Clusters'].value_counts().index.sort_values(ascending=True)

In [None]:
# Cluster ids in ascending order, then the rounded mean of each feature per cluster
cluster_k = list(df['Final Clusters'].value_counts().index.sort_values(ascending=True))
members_by_cluster = [df[df['Final Clusters'] == cluster_id] for cluster_id in cluster_k]

age = [round(members['Age'].mean(), 0) for members in members_by_cluster]
income = [round(members['Annual Income (k$)'].mean(), 0) for members in members_by_cluster]
spend = [round(members['Spending Score (1-100)'].mean(), 0) for members in members_by_cluster]

In [None]:
# Summary table: one row per cluster with its mean age, income and spending score
df_cluster_result = pd.DataFrame({
    'CLuster': cluster_k,
    'Age': age,
    'Income(k$)': income,
    'Spending score': spend,
})
df_cluster_result

In [None]:
# One box plot per summary feature, laid out side by side on a single row
df_plots = df_cluster_result[["Age", "Income(k$)", "Spending score"]]
fig = make_subplots(rows=1, cols=3, subplot_titles=df_plots.columns)

for grid_col, column_name in enumerate(df_plots.columns, start=1):
    fig.append_trace(go.Box(x=df_cluster_result[column_name]), 1, grid_col)

fig.update_layout(height=300, width=900, title_text="Boxplot of features of final Clusters")
fig.show()

- Cluster 0: Young age, high income and highest spending score - Marketing campaigns towards these groups to keep encouraging them to spend
- Cluster 1: Senior age, low income, low spending score
- Cluster 2: Senior age, high income, low spending score - More marketing campaigns targeted towards these people as income is high but spending is very low
- Cluster 3: Young age, medium income, medium spending score
- Cluster 4: Senior age, medium income, medium spending score
- Cluster 5: Young age, low income, high spending score - Marketing campaigns towards these groups to keep encouraging them to spend

In [None]:
# 3-D view of the customers in the original feature space, coloured by final cluster.
trace1 = go.Scatter3d(
    x=df['Age'],
    y=df['Spending Score (1-100)'],
    z=df['Annual Income (k$)'],
    mode='markers',
    marker=dict(
        color=df['Final Clusters'],
        size=10,
        line=dict(
            color=df['Final Clusters'],
            width=12
        ),
        opacity=0.8
    )
)
data1 = [trace1]

layout = go.Layout(
    # The title now names the three plotted axes; the previous text described a different dataset.
    title='Customer segments: Age vs Spending Score vs Annual Income',
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=40  # leave room for the title, which a zero top margin clips
    ),
    scene=dict(
        xaxis=dict(title='Age'),
        yaxis=dict(title='Spending Score'),
        zaxis=dict(title='Annual Income')
    )
)

fig = go.Figure(data=data1, layout=layout)
fig.show("notebook")


## Source:
https://github.com/muhammetbektas/Unsupervised-Learning/blob/master/Segmentation_of_Credit_Card_Users.ipynb