## Algorithms

1.  DBSCAN
2.  K-Means
3.  Gaussian Mixture Models
4.  BIRCH

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the '/kaggle/input' directory tree and print the full path of every
# file found, one path per line.
input_tree = os.walk('/kaggle/input')
file_paths = (
    os.path.join(folder, name)
    for folder, _, names in input_tree
    for name in names
)
for file_path in file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [None]:
from warnings import filterwarnings
# Install a global "ignore" filter so no warnings are shown for the rest of the
# session (presumably to keep cell output uncluttered).
# NOTE(review): this also hides deprecation and convergence warnings emitted
# by the libraries used below.
filterwarnings("ignore")

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, Birch
from sklearn.mixture import GaussianMixture

In [None]:
# Global plot appearance: dark grid on a light-grey axes background, with the
# "notebook" context using slightly enlarged fonts and 1.2pt lines.
style_overrides = {"axes.facecolor": ".85"}
context_overrides = {"lines.linewidth": 1.2}
sns.set_style('darkgrid', style_overrides)
sns.set_context("notebook", font_scale=1.2, rc=context_overrides)

## EDA

In [None]:
# Load the mall-customers CSV into a DataFrame.
csv_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
data = pd.read_csv(csv_path)

In [None]:
# Dataset dimensions as a (rows, columns) tuple.
data.shape

In [None]:
# Preview the first rows of the dataset (head() defaults to five rows).
data.head()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Print a summary of the columns: dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
data.describe()

## PCA

In [None]:
# Feature matrix used by every model below: all columns except the customer
# identifier and the gender column.
X = data.drop(columns=['CustomerID', 'Gender'])

In [None]:
# Project the features onto their first two principal components; the 2-D
# projection is only used to draw the clusters, not to fit the models.
pca = PCA(n_components=2).fit(X)
X_pca = pca.transform(X)

In [None]:
# Scatter the samples in the plane of the two principal components.
fig, axs = plt.subplots(figsize=(10, 10))
pc1 = X_pca[:, 0]
pc2 = X_pca[:, 1]
sns.scatterplot(x=pc1, y=pc2, edgecolor=None, alpha=0.8, ax=axs)
axs.set_xlabel('PC1')
axs.set_ylabel('PC2')
axs.set_title('Scatter plot')
plt.show()

In [None]:
# Fraction of the total variance captured by each of the two retained
# principal components.
pca.explained_variance_ratio_

## DBSCAN

In [None]:
# Density-based clustering of the feature matrix. Fitting stores one label per
# sample on the estimator; DBSCAN uses -1 for samples it treats as noise.
model = DBSCAN(min_samples=3, eps=12)
labels = model.fit(X).labels_

In [None]:
# Distinct labels assigned by DBSCAN (a value of -1 denotes noise points).
np.unique(labels)

In [None]:
# Plot the DBSCAN clusters in the plane of the two principal components.
#
# The number of labels DBSCAN produces depends on the data and on
# eps/min_samples, so the palette is sized from the labels actually present
# instead of assuming a fixed count: black is reserved for noise (label -1),
# each cluster takes the next named colour, and extra 'husl' colours are
# appended if there are more clusters than named colours.
cluster_ids = np.unique(labels)
n_clusters = int(np.sum(cluster_ids != -1))
cluster_colors = list(sns.xkcd_palette(
    ['greenish teal', 'cyan', 'red pink', 'amber', 'purple', 'red orange']))
if n_clusters > len(cluster_colors):
    cluster_colors += list(
        sns.color_palette('husl', n_clusters - len(cluster_colors)))
noise_colors = list(sns.xkcd_palette(['black'])) if -1 in cluster_ids else []
# np.unique sorts its output, so the noise label (if any) comes first and the
# colours line up with the sorted hue levels.
palette = noise_colors + cluster_colors[:n_clusters]

fig, axs = plt.subplots(figsize=[10,10])
sns.scatterplot(x=X_pca[:,0],
                y=X_pca[:,1],
                hue=labels,
                palette=palette,
                edgecolor=None,
                alpha=0.8,
                ax=axs)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Clusters')
plt.show()

## K-Means

In [None]:
# Fit K-Means for every cluster count from 2 to 10 and collect the inertia of
# each fitted model for the elbow plot.
inertia = []
for n in range(2, 11):
    model = KMeans(algorithm='elkan', random_state=21, n_clusters=n).fit(X)
    inertia.append(model.inertia_)

In [None]:
# Elbow plot: inertia against the number of clusters tried (2 through 10),
# drawn as markers with a connecting line in a second colour.
cluster_counts = np.arange(2, 11)
plt.figure(figsize=(12, 6))
plt.plot(cluster_counts, inertia, 'o', c=sns.xkcd_rgb['red pink'])
plt.plot(cluster_counts, inertia, '-', c=sns.xkcd_rgb['greenish teal'], alpha=0.8)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

### Five clusters

In [None]:
# K-Means with five clusters; keep the cluster index assigned to each sample.
model = KMeans(n_clusters=5, algorithm='elkan', random_state=21)
labels = model.fit(X).labels_

In [None]:
# Draw the five-cluster assignment in the plane of the two principal
# components, one 'husl' colour per cluster.
fig, axs = plt.subplots(figsize=(10, 10))
five_colors = sns.color_palette('husl', 5)
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=labels,
    palette=five_colors,
    edgecolor=None,
    alpha=0.8,
    ax=axs,
)
axs.set_xlabel('PC1')
axs.set_ylabel('PC2')
axs.set_title('Clusters')
plt.show()

### Six clusters

In [None]:
# K-Means with six clusters; keep the cluster index assigned to each sample.
model = KMeans(n_clusters=6, algorithm='elkan', random_state=21)
labels = model.fit(X).labels_

In [None]:
# Draw the six-cluster assignment in the plane of the two principal
# components, one 'husl' colour per cluster.
fig, axs = plt.subplots(figsize=(10, 10))
six_colors = sns.color_palette('husl', 6)
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=labels,
    palette=six_colors,
    edgecolor=None,
    alpha=0.8,
    ax=axs,
)
axs.set_xlabel('PC1')
axs.set_ylabel('PC2')
axs.set_title('Clusters')
plt.show()

## Gaussian Mixtures

In [None]:
# Fit a Gaussian mixture for every component count from 2 to 10 and record,
# for each fit: the silhouette score of the hard assignments, the exponential
# of the mean per-sample log-likelihood, and the AIC and BIC.
#
# random_state is fixed (the same seed the K-Means cells use) because the
# mixture initialisation is otherwise random, which makes these curves -- and
# the choice of component count based on them -- change from run to run.
score = []
ppl = []
aic = []
bic = []
for n in range(2, 11):
    model = GaussianMixture(n_components=n, random_state=21)
    model.fit(X)
    score.append(silhouette_score(X, model.predict(X)))
    # NOTE(review): `ppl` is only referenced by a disabled plot further down.
    ppl.append(np.exp(model.score(X)))
    aic.append(model.aic(X))
    bic.append(model.bic(X))

In [None]:
# Two stacked panels over the component counts tried (2 through 10):
# silhouette score on top, AIC and BIC underneath.
fig, axs = plt.subplots(2, 1, figsize=(12, 12))
silhouette_ax, criteria_ax = axs
component_counts = np.arange(2, 11)

# Top panel: silhouette score as markers plus a connecting line.
silhouette_ax.plot(component_counts, score, 'o', c=sns.xkcd_rgb['red pink'])
silhouette_ax.plot(component_counts, score, '-', c=sns.xkcd_rgb['greenish teal'], alpha=0.8)
silhouette_ax.set_xlabel('Number of Clusters')
silhouette_ax.set_ylabel('Silhouette Score')
# axs[0].set_title('Silhouette Score')

# Bottom panel: AIC and BIC curves, told apart by the legend.
criteria_ax.plot(component_counts, aic, '-', c=sns.xkcd_rgb['amber'], label='AIC')
criteria_ax.plot(component_counts, bic, '-', c=sns.xkcd_rgb['greenish teal'], label='BIC')
criteria_ax.set_ylabel('')
criteria_ax.set_xlabel('number of clusters')
criteria_ax.legend(loc='best')

# Disabled perplexity panel (would need a third subplot row):
# axs[2].plot(np.arange(2 , 11) , ppl , '-' ,c=sns.xkcd_rgb['greenish teal'])
# axs[2].set_ylabel('perplexity')
# axs[2].set_xlabel('number of clusters')

plt.show()

### Five clusters

In [None]:
# Gaussian mixture with five components; `labels` holds the most likely
# component for each sample.
# random_state is fixed (the same seed the K-Means cells use) so the
# clustering, its plot and its silhouette score are reproducible between runs.
model = GaussianMixture(n_components=5, random_state=21)
model.fit(X)
labels = model.predict(X)

In [None]:
# Draw the five-component assignment in the plane of the two principal
# components, one 'husl' colour per component.
fig, axs = plt.subplots(figsize=(10, 10))
five_colors = sns.color_palette('husl', 5)
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=labels,
    palette=five_colors,
    edgecolor=None,
    alpha=0.8,
    ax=axs,
)
axs.set_xlabel('PC1')
axs.set_ylabel('PC2')
axs.set_title('Clusters')
plt.show()

In [None]:
# Silhouette score of the five-component mixture's hard assignments on X
# (displayed as the cell's output rather than printed explicitly).
silhouette_score(X, model.predict(X))

### Six clusters

In [None]:
# Gaussian mixture with six components; `labels` holds the most likely
# component for each sample.
# random_state is fixed (the same seed the K-Means cells use) so the
# clustering, its plot and its silhouette score are reproducible between runs.
model = GaussianMixture(n_components=6, random_state=21)
model.fit(X)
labels = model.predict(X)

In [None]:
# Draw the six-component assignment in the plane of the two principal
# components, one 'husl' colour per component.
fig, axs = plt.subplots(figsize=(10, 10))
six_colors = sns.color_palette('husl', 6)
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=labels,
    palette=six_colors,
    edgecolor=None,
    alpha=0.8,
    ax=axs,
)
axs.set_xlabel('PC1')
axs.set_ylabel('PC2')
axs.set_title('Clusters')
plt.show()

In [None]:
# Silhouette score of the six-component mixture's hard assignments on X
# (displayed as the cell's output rather than printed explicitly).
silhouette_score(X, model.predict(X))

## BIRCH

In [None]:
# BIRCH clustering with five final clusters; `labels` holds the cluster
# predicted for each sample of X.
model = Birch(n_clusters=5).fit(X)
labels = model.predict(X)

In [None]:
# Distinct cluster labels predicted by the BIRCH model.
np.unique(labels)

In [None]:
# Draw the BIRCH assignment in the plane of the two principal components,
# using a five-colour 'husl' palette.
fig, axs = plt.subplots(figsize=(10, 10))
five_colors = sns.color_palette('husl', 5)
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=labels,
    palette=five_colors,
    edgecolor=None,
    alpha=0.8,
    ax=axs,
)
axs.set_xlabel('PC1')
axs.set_ylabel('PC2')
axs.set_title('Clusters')
plt.show()

## References

*  https://www.mygreatlearning.com/blog/dbscan-algorithm/
*  https://blog.floydhub.com/introduction-to-k-means-clustering-in-python-with-scikit-learn/
*  https://www.analyticsvidhya.com/blog/2019/10/gaussian-mixture-models-clustering/
*  https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html
*  https://dl.acm.org/doi/10.1145/235968.233324
*  https://machinelearningmastery.com/clustering-algorithms-with-python/