In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure in the notebook.
sns.set_theme()

In [None]:
# Read the mall-customer CSV (path is relative to the notebook's working
# directory) and preview the first rows.
csv_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(csv_path)
df.head()

# EDA

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

## CustomerID

In [None]:
# CustomerID is presumably just a row identifier, so it is removed before
# clustering.  errors='ignore' keeps the cell idempotent: re-running it after
# the column is already gone no longer raises KeyError.
df = df.drop(columns='CustomerID', errors='ignore')
df.head()

## Gender

In [None]:
# Number of customers per gender.
sns.countplot(data=df, x='Gender');

## Age

In [None]:
# Age distribution with a kernel-density overlay.
sns.histplot(data=df, x='Age', kde=True);

## Annual Income (k$)

In [None]:
# Shorten the column name so it can be reached with attribute access (df.Income).
df.rename(columns={'Annual Income (k$)': 'Income'}, inplace=True)

In [None]:
# Income distribution with a kernel-density overlay.
sns.histplot(data=df, x='Income', kde=True);

In [None]:
# Income against age, coloured by gender.
sns.scatterplot(data=df, x='Age', y='Income', hue='Gender');

## Spending Score (1-100)

In [None]:
# Shorten the column name so it can be reached with attribute access (df.Score).
df.rename(columns={'Spending Score (1-100)': 'Score'}, inplace=True)

In [None]:
# Spending-score distribution with a kernel-density overlay.
sns.histplot(data=df, x='Score', kde=True);

In [None]:
# Income against spending score.
sns.scatterplot(data=df, x='Score', y='Income');

In [None]:
# Age against spending score.
sns.scatterplot(data=df, x='Score', y='Age');

In [None]:
# Income against spending score (same plot as two cells earlier), shown
# directly above the discussion of the visible clusters.
sns.scatterplot(data=df, x='Score', y='Income');

This is the most informative visualization so far: we can observe roughly five clusters:
1. Low Score, Low Income
2. Low Score, High Income
3. Mid Score, Mid Income
4. High Score, Low Income
5. High Score, High Income

**Considering only these two features (Income and Score), we can build our first clustering model.**

In [None]:
# Feature matrix for clustering: column 0 is Income, column 1 is Score.
X = df[['Income', 'Score']].to_numpy()

# Clustering using K-means

In [None]:
from sklearn.cluster import KMeans

# Fit K-means with five clusters on the Income/Score matrix.
# n_init is pinned to 10 (the historical default): scikit-learn 1.4 changed the
# default to 'auto', which runs k-means++ only once, and 1.2/1.3 emit a
# FutureWarning when it is left unset.  Pinning it keeps results identical
# across versions.
km5 = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0)
y_pred = km5.fit_predict(X)

In [None]:
# Scatter plot of the fitted clusters: one series (and colour) per cluster label.
plt.figure(figsize=(10, 10))
cluster_colours = ['red', 'blue', 'green', 'yellow', 'brown']
for label, colour in enumerate(cluster_colours):
    members = X[y_pred == label]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=f'Cluster {label + 1}')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

**We can try different numbers of clusters and use the Elbow Method to find the optimal one.**

## Elbow Method

In [None]:
# Within-cluster sum of squares (inertia) for k = 1..10.
k_range = range(1, 11)
inertia = []

for k in k_range:
    # n_init is pinned to 10 so the curve does not depend on the installed
    # scikit-learn version (the default became 'auto' in 1.4).
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Plot inertia against k; candidate "elbows" are where the curve flattens.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=list(k_range), y=inertia, ax=ax)
ax.set_title('The Elbow Method')
ax.set_xlabel('No of clusters "k"')
ax.set_ylabel('Inertia')

# Mark the two candidate elbows.  Arrow tips are read from the computed
# inertia values so they always land on the curve; only the label positions
# (in data coordinates, tuned for this dataset) stay hard-coded.
label_positions = {3: (4, 150000), 5: (5, 80000)}  # k -> text position
for elbow_k, text_xy in label_positions.items():
    tip = (elbow_k, inertia[k_range.index(elbow_k)])
    ax.annotate('Possible Elbow Point', xy=tip, xytext=text_xy, xycoords='data',
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3', color='blue', lw=2))

plt.show()


### Try 3 Clusters

In [None]:
from sklearn.cluster import KMeans

# Fit K-means with three clusters (the other candidate from the elbow plot).
# n_init is pinned to 10 so the result does not depend on the installed
# scikit-learn version (the default became 'auto' in 1.4).
km3 = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=0)
y_pred = km3.fit_predict(X)

In [None]:
# Scatter plot of the three fitted clusters: one series per cluster label.
plt.figure(figsize=(10, 10))
cluster_colours = ['red', 'blue', 'green']
for label, colour in enumerate(cluster_colours):
    members = X[y_pred == label]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=f'Cluster {label + 1}')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

### Because this is unsupervised learning, there is no hard metric for judging performance or choosing the best number of clusters. We do, however, have two good candidates (3 or 5 clusters), and a business expert can choose the one best suited to this case study

## Try all features

In [None]:
# One-hot encode the categorical column(s); drop_first removes the redundant
# dummy.  dtype=int forces 0/1 integers: pandas >= 2.0 would otherwise emit a
# bool column, which makes X.values an object-dtype array further down.
X = pd.get_dummies(df, drop_first=True, dtype=int)
X.head()

In [None]:
from sklearn.cluster import KMeans

# Fit K-means with five clusters on every feature.
# NOTE(review): the features are used unscaled, so the 0/1 gender dummy
# probably has far less influence on the Euclidean distance than the other
# columns -- consider standardizing first.
# n_init is pinned to 10 so the result does not depend on the installed
# scikit-learn version (the default became 'auto' in 1.4).
km5 = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0)
# np.asarray accepts both the DataFrame and the ndarray that X becomes in a
# later cell, so this cell can be re-run without an AttributeError.
y_pred = km5.fit_predict(np.asarray(X))

In [None]:
# Display the encoded feature frame again.
X

In [None]:
# Convert to a NumPy array so the boolean-mask indexing in the plot below works.
# np.asarray is a no-op on an ndarray, so re-running this cell is safe
# (X.values would raise AttributeError the second time).
# Column order is presumably Age, Income, Score, Gender dummy -- TODO confirm.
X = np.asarray(X)

In [None]:
# Scatter plot of the all-feature clusters, projected onto columns 1 and 2 of X.
plt.figure(figsize=(10, 10))
cluster_colours = ['red', 'blue', 'green', 'yellow', 'brown']
for label, colour in enumerate(cluster_colours):
    members = X[y_pred == label]
    plt.scatter(members[:, 1], members[:, 2], s=100, c=colour,
                label=f'Cluster {label + 1}')

plt.title('Clusters of customers')
plt.xlabel('Income')
plt.ylabel('Score')
plt.legend()
plt.show()

# Clustering using Hierarchical Clustering

## Using the dendrogram to find the optimal number of clusters

In [None]:
# Back to the two-feature matrix: column 0 is Income, column 1 is Score.
X = df[['Income', 'Score']].to_numpy()

In [None]:
import scipy.cluster.hierarchy as sch

# Build the Ward-linkage merge tree first, then draw it as a dendrogram.
ward_links = sch.linkage(X, method='ward')

plt.figure(figsize=(20, 10))
dendrogram = sch.dendrogram(ward_links)

plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()

### The dendrogram also suggests either 3 or 5 clusters

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering with Ward linkage and five clusters.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed `metric`).  Ward linkage only supports Euclidean distance, which is
# also the default, so omitting the argument works on every version.
hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_hc = hc.fit_predict(X)

In [None]:
# Scatter plot of the agglomerative clusters: one series per cluster label.
plt.figure(figsize=(10, 10))

cluster_colours = ['red', 'blue', 'green', 'cyan', 'magenta']
for label, colour in enumerate(cluster_colours):
    members = X[y_hc == label]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=f'Cluster {label + 1}')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

# Wrap Up All in One Place


In [None]:
def _scatter_clusters(ax, X, labels, colours, title):
    """Draw one scatter series per cluster label on ``ax`` using columns 0 and 1 of ``X``."""
    for idx, colour in enumerate(colours):
        members = X[labels == idx]
        ax.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                   label=f'Cluster {idx + 1}')
    ax.set_title(title)
    ax.set_xlabel('Income')
    ax.set_ylabel('Score')
    ax.legend()


def clustering(X=None):
    """Fit K-means and Ward agglomerative clustering (5 clusters each) and plot both.

    Parameters
    ----------
    X : array-like with at least two columns, optional
        Feature matrix.  Column 0 is drawn on the x-axis (labelled 'Income')
        and column 1 on the y-axis (labelled 'Score').  When omitted, the
        Income and Score columns of the notebook-level ``df`` are used; they
        are looked up at call time, so later changes to ``df`` are picked up.

    Returns
    -------
    None
        The two-panel figure is displayed with ``plt.show()``.
    """
    from sklearn.cluster import AgglomerativeClustering, KMeans

    if X is None:
        X = df[['Income', 'Score']].values
    # Boolean-mask indexing below needs an ndarray; this also lets callers
    # pass a DataFrame.
    X = np.asarray(X)

    fig = plt.figure(figsize=(20, 15))

    # K-means.  n_init is pinned to 10 so the result does not depend on the
    # installed scikit-learn version (the default became 'auto' in 1.4).
    km5 = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0)
    y_pred = km5.fit_predict(X)
    _scatter_clusters(fig.add_subplot(221), X, y_pred,
                      ['red', 'blue', 'green', 'yellow', 'brown'],
                      'Kmeans Clustering of customers')

    # Agglomerative.  The `affinity` keyword was removed in scikit-learn 1.4;
    # Ward linkage uses Euclidean distance, which is the default anyway.
    hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
    y_hc = hc.fit_predict(X)
    _scatter_clusters(fig.add_subplot(222), X, y_hc,
                      ['red', 'blue', 'green', 'cyan', 'magenta'],
                      'Agglomerative Clusters of customers')

    plt.show()

In [None]:
# Run the K-means vs. agglomerative comparison on the default Income/Score features.
clustering()