In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk visits them, and print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the mall-customers CSV relative to the notebook's working directory.
DATA_PATH = "../input/customer-segmentation/Mall_Customers.csv"

# Load the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# CustomerID is only a row identifier and carries no information for clustering.
# Reassign instead of mutating in place, and ignore a missing column, so this
# cell can be re-run without raising KeyError.
df = df.drop(columns="CustomerID", errors="ignore")
df.head()

In [None]:
# Pairwise correlation of the numeric columns only. "Genre" is still a string
# column at this point; pandas >= 2.0 raises on non-numeric columns in corr()
# instead of silently skipping them, so select the numeric columns explicitly.
df.select_dtypes(include="number").corr()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Heatmap of the correlations between the numeric features. The string column
# "Genre" is excluded explicitly because pandas >= 2.0 no longer drops
# non-numeric columns automatically in corr().
plt.figure(figsize=(13,10))
numeric_corr = df.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, cmap="viridis")

In [None]:
# One-hot encode "Genre" (dropping the first level leaves a single Genre_Male
# indicator). dtype=int keeps the indicator numeric: recent pandas versions
# default to bool dummies, which would turn df.values into an object array.
# The guard makes the cell safe to re-run after "Genre" has been replaced.
if "Genre" in df.columns:
    df = pd.get_dummies(data=df, columns=["Genre"], drop_first=True, dtype=int)
df

In [None]:
# Feature matrix handed to the clustering algorithms: every column of df as a NumPy array.
X = df.to_numpy()
X.shape

## 1. KMeans Clustering

<font color="blue">
1.1. How KMeans Clustering Works:

In [None]:
from IPython.display import Image
# Remote illustration of the K-Means iterations discussed in the text below.
url="https://i.stack.imgur.com/FQhxk.jpg"
# Render the picture as the cell output at 800x800.
# NOTE(review): the URL is passed positionally (as the first argument) rather than
# as url=...; IPython may treat the two forms differently (embedding vs. linking),
# so the call is left exactly as written -- confirm before changing.
Image(url,width=800, height=800)

<font color="blue">
In step 1 of the algorithm, each observation is randomly assigned to a cluster.

In step 2a of the algorithm, the centroid of each cluster is computed; the centroids are shown as large colored disks in the top-right panel of the figure.

Initially these centroids are almost overlapping as we can see from the figure because initial cluster assignments are chosen randomly.

In step 2b of the algorithm (bottom-left of the figure above), each observation is assigned to the nearest centroid.

In the bottom-center panel of the figure above, step 2a is performed once again, which leads to new cluster centroids.

We keep repeating these two steps until the cluster assignments stop changing, i.e. until no data point is reassigned to a different cluster centroid.

At the bottom-right, we have the results obtained after about 10 iterations

<font color="blue">
1.2. Implementation of the Algorithm

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit K-Means for k = 1..19 and record the inertia
# (within-cluster sum of squared distances) of each fit.
# random_state makes the curve reproducible across runs; n_init=10 pins the
# number of restarts, whose default differs between scikit-learn versions.
loss = []
cluster_range = range(1, 20)
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, init="k-means++", n_init=10, random_state=42)
    kmeans.fit(X)
    loss.append(kmeans.inertia_)

sns.set_style("darkgrid")
plt.figure(figsize=(12, 10))
plt.plot(cluster_range, loss)
plt.title("Elbow Method")
plt.xlabel("Number of Cluster")
plt.ylabel("loss")
plt.show()
# As we can see, the curve flattens out around 5 clusters, so 5 is a good choice

In [None]:
# Inertia of whichever model `kmeans` currently refers to.
# NOTE(review): when the notebook is run top to bottom, that is the last model
# fitted by the elbow loop (its largest k), not the 5-cluster model -- confirm
# this is the value intended here.
print(kmeans.inertia_)

In [None]:
# Final model with the 5 clusters suggested by the elbow plot.
# K-Means starts from random centroids, so without a fixed random_state the
# cluster numbering (and occasionally the clusters themselves) changes between
# runs, which would invalidate any remark that refers to a cluster by number.
# n_init=10 pins the number of restarts, whose default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=5, init="k-means++", n_init=10, random_state=42)
my_clusters = kmeans.fit_predict(X)

In [None]:
# Wrap the K-Means labels in a single-column frame so they can be joined to the features.
cluster_df = pd.DataFrame({"KMeans Clusters": my_clusters})
cluster_df

In [None]:
# Place the K-Means labels next to the features (column-wise concatenation on the row index).
new_df = pd.concat(objs=[df, cluster_df], axis="columns")
new_df

<font color="blue">
We can easily check the centroid values by averaging the features in each cluster as follows

In [None]:
# Average of every feature within each K-Means cluster.
new_df.groupby(by="KMeans Clusters").agg("mean")
# It is apparent that Annual Income and Spending Score play an important role in how the clusters are formed

In [None]:
# Centroids stored on the fitted K-Means model, for comparison with the
# per-cluster feature means computed from new_df.
kmeans.cluster_centers_

In [None]:
# Annual income against spending score, one colour per K-Means cluster.
plt.figure(figsize=(12, 10))
sns.scatterplot(
    data=new_df,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    hue="KMeans Clusters",
    palette="magma",
)
# From this plot we can say that if the mall runs ads, it has a higher chance of selling its
# products to the customers in the high-spending clusters (numbered 0 and 3 in the original run)

In [None]:
# Spending score for each value of the Genre_Male indicator, coloured by K-Means cluster.
plt.figure(figsize=(12, 10))
sns.scatterplot(
    data=new_df,
    x="Genre_Male",
    y="Spending Score (1-100)",
    hue="KMeans Clusters",
    palette="viridis",
)


In [None]:
# Age against spending score, coloured by K-Means cluster.
plt.figure(figsize=(12, 10))
sns.scatterplot(
    data=new_df,
    x="Age",
    y="Spending Score (1-100)",
    hue="KMeans Clusters",
    palette="viridis",
)

In [None]:
# Importing Axes3D registers the "3d" projection on older Matplotlib releases;
# newer releases register it automatically, so the name itself is unused.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# 3D scatter of spending score, age and annual income, coloured by K-Means cluster.
fig = plt.figure(1, figsize=(15, 10))
plt.clf()
# Create the 3D axes through the figure: instantiating Axes3D(fig, ...) directly
# no longer attaches the axes to the figure on recent Matplotlib (blank plot).
ax = fig.add_axes([0, 0, .95, 1], projection="3d")
ax.view_init(elev=48, azim=134)
ax.set_xlabel('Spending Score (1-100)')
ax.set_ylabel('Age')
ax.set_zlabel('Annual Income (k$) ')
# Columns 2, 0 and 1 of X are plotted on the x, y and z axes, matching the labels above.
# np.float was removed in NumPy 1.24; the builtin float is the equivalent dtype.
ax.scatter(X[:, 2], X[:, 0], X[:, 1], c=my_clusters.astype(float))
plt.show()
# Here we can better see how the three features combine within the different clusters

## 2. Hierarchical Clustering

<font color="blue">
Hierarchical clustering (also called hierarchical cluster analysis or HCA) is a method of cluster analysis which seeks to build a hierarchy of clusters.

Strategies for hierarchical clustering generally fall into two types:[1]

1.Agglomerative: This is a "bottom-up" approach: each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy.

2.Divisive: This is a "top-down" approach: all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy.

In general, the merges and splits are determined in a greedy manner. The results of hierarchical clustering[2] are usually presented in a dendrogram.

<font color="blue">
2.1. Using a Dendrogram to Find the Optimal Number of Clusters

In [None]:
from scipy.cluster import hierarchy

# Compute the Ward linkage of the observations first, then draw its dendrogram.
ward_links = hierarchy.linkage(X, method="ward")
hier = hierarchy.dendrogram(ward_links)
plt.title("Dendogram of Hierarchical Clustering")
plt.xlabel("Observation Points")
plt.ylabel("Euclidean Distance")
plt.show()  # cutting the tree into 3 or 5 clusters looks like the better option

<font color="blue">
2.2. Using Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (bottom-up) clustering into 5 clusters with Ward linkage.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4;
# Ward linkage only supports Euclidean distance, which is also the default, so
# omitting the argument gives the same model on every scikit-learn version.
ac = AgglomerativeClustering(n_clusters=5, linkage="ward")
agglomerative_clusters = ac.fit_predict(X)
agglomerative_clusters

In [None]:
# Wrap the agglomerative labels in a single-column frame so they can be joined to new_df.
df3 = pd.DataFrame({"Agglomerative Clusters": agglomerative_clusters})
df3

In [None]:
# Add the agglomerative labels next to the features and the K-Means labels.
# Any existing "Agglomerative Clusters" column is dropped first so that
# re-running this cell replaces the column instead of appending a duplicate.
new_df = pd.concat(
    [new_df.drop(columns="Agglomerative Clusters", errors="ignore"), df3],
    axis=1,
)
new_df

## 3. Comparison of Both Algorithms

In [None]:
# Side-by-side view of the two clusterings on the income / spending-score plane.
fig, axes = plt.subplots(1, 2, figsize=(18, 10), sharey=True)
fig.suptitle('Distribution of Cluster')

# KMeans Clustering (left panel)
sns.scatterplot(ax=axes[0], x=new_df["Annual Income (k$)"], y=new_df["Spending Score (1-100)"],hue=new_df["KMeans Clusters"],palette="viridis")
axes[0].set_title("According to KMeans Clusters")
# Agglomerative Clustering (right panel)
sns.scatterplot(ax=axes[1], x=new_df["Annual Income (k$)"], y=new_df["Spending Score (1-100)"],hue=new_df["Agglomerative Clusters"],palette="viridis")
# The title belongs to the right-hand axes; setting it on axes[0] overwrote the
# K-Means title and left the agglomerative panel untitled.
axes[1].set_title("According to Agglomerative Clusters")
plt.show()


<font color="blue">
Above, we compare the clusters produced by the two algorithms with each other. Both of them seem to create approximately the same clusters, regardless of the differences in values across the entire dataset.