# Mall Customer Segmentation

In [None]:
#import the libraries
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt #data visualization 
import seaborn as sns  #phyton library for visualization

import os
# Load the mall customer records from the relative input directory into a DataFrame.
csv_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(csv_path)

The dataset has no labels, so we will perform unsupervised learning using the K-Means clustering algorithm.

In [None]:
# Preview the first rows of the loaded customer table (shown as the cell result)
df.head()

In [None]:
# Print the column names, dtypes and non-null counts
df.info()
# Summary statistics of the table; as the last expression it is displayed as the cell result
df.describe()

In [None]:
# Count the missing (NaN/None) entries in each column of the dataset
df.isna().sum()

There are no null or missing values in any column of the dataset.

## Data Visualization

In [None]:
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn after this point
plt.style.use('fivethirtyeight')

In [None]:
# Distribution of each numeric feature: one panel per column, drawn as a
# density-normalised histogram with a KDE curve on top.
#
# sns.distplot is deprecated and scheduled for removal from seaborn, so the
# documented replacement sns.histplot(kde=True, stat='density') is used instead
# (requires seaborn >= 0.11). kde_kws=dict(cut=3) extends the KDE curve past the
# data range the way distplot did.
fig, axes = plt.subplots(1, 3, figsize=(15, 6))
# Spacing only needs to be set once for the whole figure, not once per panel.
fig.subplots_adjust(hspace=0.5, wspace=0.5)
for ax, x in zip(axes, ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']):
    sns.histplot(df[x], bins=20, kde=True, stat='density', kde_kws=dict(cut=3), ax=ax)
    ax.set_title('Distplot of {}'.format(x))
plt.show()

In [None]:
# 3x3 grid of regression plots: every feature on the x axis against every
# feature on the y axis (the diagonal plots a feature against itself).
plt.figure(1, figsize=(15, 7))
features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
feature_pairs = [(x_col, y_col) for x_col in features for y_col in features]
for n, (x, y) in enumerate(feature_pairs, start=1):
    plt.subplot(3, 3, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.regplot(x=x, y=y, data=df)
    # Shorten multi-word column names to their first two words for the axis label.
    words = y.split()
    if len(words) > 1:
        plt.ylabel(words[0] + ' ' + words[1])
    else:
        plt.ylabel(y)
plt.show()

Annual Income and Spending Score show the most promising relationship: plotted against each other, the samples appear to fall into five groups that we can cluster.

## Feature Selection

In [None]:
# Keep only the two columns used for clustering and expose them as a NumPy array.
feature_columns = ["Annual Income (k$)", "Spending Score (1-100)"]
X = df.loc[:, feature_columns].values

# Clustering using K-Means

## Conventional

In [None]:
from sklearn.cluster import KMeans

# Elbow curve on the unscaled features: fit one KMeans model per candidate
# cluster count and plot its score.
Ks = range(1, 20)
# random_state fixes the centroid initialisation so the curve is reproducible
# across "Restart & Run All"; n_init is pinned explicitly because its default
# changed between scikit-learn versions.
KMNs = [KMeans(n_clusters=i, n_init=10, random_state=42) for i in Ks]
# KMeans.score returns the negative of the K-means objective (inertia), so the
# curve rises towards 0 as the number of clusters grows.
score = [model.fit(X).score(X) for model in KMNs]
plt.plot(Ks, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Fit the 5-cluster model on the unscaled features. A fixed random_state keeps
# the cluster ids (and therefore the colours in later plots) stable between
# runs; n_init is pinned because its default differs across scikit-learn versions.
KMN = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X)
KMNLabels = KMN.labels_
KMNClusterCenters = KMN.cluster_centers_
YPredict = KMN.predict(X)
print("KMN Labels : ", KMNLabels[:10])
print("KMN Cluster Centre : ", KMNClusterCenters)
print("Y Predict : ", YPredict[:10])
print("Silhouette Score: ", silhouette_score(X, YPredict))

# Build the plotting frame under its own name instead of rebinding `df`:
# overwriting `df` with a two-column frame makes the earlier cells that read
# df['Age'] fail when they are re-run.
df_unscaled = pd.DataFrame(X, columns=["Annual Income (k$)", "Spending Score (1-100)"])
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
plt.subplot(121)
sns.swarmplot(x=KMNLabels, y=df_unscaled["Annual Income (k$)"])
plt.subplot(122)
sns.swarmplot(x=KMNLabels, y=df_unscaled["Spending Score (1-100)"])
plt.show()

In [None]:
# Visualizing all the clusters: one scatter series per cluster id, then the centroids.
cluster_styles = [
    ('red', 'Cluster 1'),
    ('blue', 'Cluster 2'),
    ('green', 'Cluster 3'),
    ('cyan', 'Cluster 4'),
    ('magenta', 'Cluster 5'),
]
for cluster_id, (color, legend_label) in enumerate(cluster_styles):
    members = X[YPredict == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color, label=legend_label)
centroids = KMN.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

## With MinMax Scaler

In [None]:
df2 = pd.read_csv('../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')

from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the two clustering columns and transform those same
# columns in one step; `minmax_scale` stays bound to the fitted scaler.
minmax_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
minmax_scale = MinMaxScaler()
X2 = minmax_scale.fit_transform(df2[minmax_columns])

In [None]:
from sklearn.cluster import KMeans

# Elbow curve on the MinMax-scaled features: fit one KMeans model per
# candidate cluster count and plot its score.
Ks2 = range(1, 20)
# random_state fixes the centroid initialisation so the curve is reproducible;
# n_init is pinned explicitly because its default changed between
# scikit-learn versions.
KMNs2 = [KMeans(n_clusters=i, n_init=10, random_state=42) for i in Ks2]
# KMeans.score returns the negative of the K-means objective (inertia).
score2 = [model.fit(X2).score(X2) for model in KMNs2]
plt.plot(Ks2, score2)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

In [None]:
# Fit the 5-cluster model on the MinMax-scaled features. A fixed random_state
# keeps the cluster ids stable between runs; n_init is pinned because its
# default differs across scikit-learn versions.
KMN2 = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X2)
KMNLabels2 = KMN2.labels_
KMNClusterCenters2 = KMN2.cluster_centers_
YPredict2 = KMN2.predict(X2)
print("KMN Labels : ", KMNLabels2[:10])
print("KMN Cluster Centre : ", KMNClusterCenters2)
print("Y Predict : ", YPredict2[:10])
print("Silhouette Score: ", silhouette_score(X2, YPredict2))


# Per-cluster spread of each scaled feature.
df_minmax = pd.DataFrame(X2, columns=["Annual Income (k$)", "Spending Score (1-100)"])
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
plt.subplot(121)
sns.swarmplot(x=KMNLabels2, y=df_minmax["Annual Income (k$)"])
plt.subplot(122)
sns.swarmplot(x=KMNLabels2, y=df_minmax["Spending Score (1-100)"])
plt.show()

In [None]:
# Visualizing all the clusters (MinMax-scaled features): one scatter series
# per cluster id, then the centroids.
cluster_styles2 = [
    ('red', 'Cluster 1'),
    ('blue', 'Cluster 2'),
    ('green', 'Cluster 3'),
    ('cyan', 'Cluster 4'),
    ('magenta', 'Cluster 5'),
]
for cluster_id, (color, legend_label) in enumerate(cluster_styles2):
    members = X2[YPredict2 == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color, label=legend_label)
centroids2 = KMN2.cluster_centers_
plt.scatter(centroids2[:, 0], centroids2[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

## With Standard Scaler

In [None]:
df3 = pd.read_csv('../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')

from sklearn.preprocessing import StandardScaler
# Fit the scaler on the two clustering columns and transform those same
# columns in one step; `std_scale` stays bound to the fitted scaler.
standard_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
std_scale = StandardScaler()
X3 = std_scale.fit_transform(df3[standard_columns])

In [None]:
from sklearn.cluster import KMeans

# Elbow curve on the standard-scaled features: fit one KMeans model per
# candidate cluster count and plot its score.
Ks3 = range(1, 20)
# random_state fixes the centroid initialisation so the curve is reproducible;
# n_init is pinned explicitly because its default changed between
# scikit-learn versions.
KMNs3 = [KMeans(n_clusters=i, n_init=10, random_state=42) for i in Ks3]
# KMeans.score returns the negative of the K-means objective (inertia).
score3 = [model.fit(X3).score(X3) for model in KMNs3]
plt.plot(Ks3, score3)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

In [None]:
# Fit the 5-cluster model on the standard-scaled features. A fixed
# random_state keeps the cluster ids stable between runs; n_init is pinned
# because its default differs across scikit-learn versions.
KMN3 = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X3)
KMNLabels3 = KMN3.labels_
KMNClusterCenters3 = KMN3.cluster_centers_
YPredict3 = KMN3.predict(X3)
print("KMN Labels : ", KMNLabels3[:10])
print("KMN Cluster Centre : ", KMNClusterCenters3)
print("Y Predict : ", YPredict3[:10])
print("Silhouette Score: ", silhouette_score(X3, YPredict3))

# Per-cluster spread of each scaled feature.
df_std = pd.DataFrame(X3, columns=["Annual Income (k$)", "Spending Score (1-100)"])
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
plt.subplot(121)
sns.swarmplot(x=KMNLabels3, y=df_std["Annual Income (k$)"])
plt.subplot(122)
sns.swarmplot(x=KMNLabels3, y=df_std["Spending Score (1-100)"])
plt.show()

In [None]:
# Visualizing all the clusters (standard-scaled features): one scatter series
# per cluster id, then the centroids.
cluster_styles3 = [
    ('red', 'Cluster 1'),
    ('blue', 'Cluster 2'),
    ('green', 'Cluster 3'),
    ('cyan', 'Cluster 4'),
    ('magenta', 'Cluster 5'),
]
for cluster_id, (color, legend_label) in enumerate(cluster_styles3):
    members = X3[YPredict3 == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color, label=legend_label)
centroids3 = KMN3.cluster_centers_
plt.scatter(centroids3[:, 0], centroids3[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()