# Mall customer segmentation by K-Means

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Load the mall-customer CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/mall-customers/Mall_Customers.csv')

In [None]:
# Preview the first five rows (5 is the default for head()).
df.head()

In [None]:
# The source file labels the gender column 'Genre'; give it the intended name.
df = df.rename({'Genre': 'Gender'}, axis='columns')

In [None]:
# Preview the first five rows again to confirm the column rename.
df.head()

In [None]:
# (number of rows, number of columns) of the customer table
df.shape

In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

# EDA

Annual income and spending score have a higher standard deviation than age, meaning that they are more dispersed and may help clustering.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

If we look at the pairplot of annual income against spending score, it is obvious that the customers can be divided into 5 groups.

Combining age and spending score, the customers can be divided into 2 groups, top-left and bottom-right: the purchasing power of younger customers is relatively higher.

In [None]:
# CustomerID is only an identifier, so it is excluded from the feature table.
feature = df.drop('CustomerID', axis=1)

In [None]:
# Pairwise scatter plots of the feature columns.
sns.pairplot(data=feature, kind='scatter')

In [None]:
# Share of each gender among the customers, drawn as a pie chart.
gender_count = df['Gender'].value_counts()
wedge_style = {'linewidth': 3, 'edgecolor': 'white'}  # white border separates the wedges
plt.pie(
    x=gender_count.values,
    labels=gender_count.index,
    labeldistance=1.2,
    wedgeprops=wedge_style,
)

If we take a deeper look at the individual features, the spending score has 3 peaks (0-20, 40-60, 80-100), while annual income tends to be right-skewed.

In [None]:
# Histogram + KDE of each numeric feature, one panel per feature.
# sns.distplot is deprecated (seaborn >= 0.11) and scheduled for removal;
# histplot(kde=True, stat='density') is the documented replacement and gives
# the same density-normalised histogram with a KDE overlay.
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
fig, axes = plt.subplots(1, len(numeric_columns), figsize=(20, 10))
# Spacing is the same for every panel, so set it once instead of per iteration.
fig.subplots_adjust(hspace=0.5, wspace=0.2)
for ax, x in zip(axes, numeric_columns):
    sns.histplot(df[x], bins=15, kde=True, stat='density', ax=ax)
    ax.set_title('{} Distribution'.format(x))
plt.show()

# Clustering by k-means

In [None]:
from sklearn.cluster import KMeans

K-means clustering is a type of unsupervised learning; it is usually used when we do not know the groups/categories of the data in advance. The algorithm assigns each data point to one of K groups based on feature similarity.

It is useful to find groups which have not been explicitly labeled in the data. This can be used to confirm business assumptions about what types of groups exist or to identify unknown groups in complex data sets.

In [None]:
# Turn the non-numeric Gender feature into a number: Male=0, Female=1.
# Mapping and casting yields a true integer column; assigning through .loc
# left an object-dtype column that merely held ints.
# The 0/1 pass-through keys keep the cell safe to re-run on an already encoded
# column. Any unexpected label becomes NaN and makes astype(int) raise, so bad
# data is surfaced here rather than inside KMeans.
gender_codes = {'Male': 0, 'Female': 1, 0: 0, 1: 1}
feature['Gender'] = feature['Gender'].map(gender_codes).astype(int)

# Elbow Method
- Objective: Find optimal value of k in KMeans

- Principle:

    Plot the value of the cost function produced by different values of k. As k increases, each data point can be assigned to a nearer cluster, so the distances between the points and their respective centroids decrease. However, the improvement in the sum of squared errors (SSE) shrinks and starts to level off as k increases. The bend where this happens resembles an "elbow".


- Terms:

    1. Distortion
Average of the squared Euclidean distances from each point to the centroid of its cluster

    2. Inertia
Sum of squared distances of samples to their closest cluster center


The elbow mainly appears when k is between 3 and 5, so we should try k = 3, 4 and 5.

In [None]:
# Fit K-Means for k = 1..10 and record the inertia of each fit for the elbow plot.
# random_state fixes the centroid initialisation so the curve is reproducible
# across runs; n_init is set explicitly because its default differs between
# scikit-learn versions.
inertias = []
for i in range(1, 11):
    km = KMeans(n_clusters=i, n_init=10, random_state=42).fit(feature)
    inertias.append(km.inertia_)

In [None]:
# Elbow curve: inertia against the number of clusters tried (k = 1..10).
fig, ax = plt.subplots()
ax.plot(range(1, 11), inertias, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Inertia')
ax.set_title('The Elbow Method using Inertia')
plt.show()

In [None]:
# Show the raw inertia values, one per k tried in the elbow loop
inertias

# Modelling 

In [None]:
# 3 clusters
# Number of customers in each group
# fit_predict already fits the model, so the estimator is fitted only once
# (it used to be fitted with .fit() and then again by .fit_predict()).
# random_state makes the cluster labels reproducible between runs.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
y_km = km.fit_predict(feature)
n_cluster, km_count = np.unique(y_km, return_counts=True)
plt.bar(n_cluster, km_count)
plt.ylabel('No of customer')
plt.xlabel('Clustering')
plt.title('Customer segmentation by 3 groups')
plt.show()

In [None]:
# Spending score against annual income, coloured by the cluster labels in y_km.
fig, ax = plt.subplots()
ax.scatter(
    x=df['Annual Income (k$)'],
    y=df['Spending Score (1-100)'],
    c=y_km,
    s=100,
)
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
ax.set_title('Customer segmentation by 3 groups')

plt.show()

In [None]:
# 4 clusters
# Number of customers in each group
# fit_predict already fits the model, so the estimator is fitted only once
# (it used to be fitted with .fit() and then again by .fit_predict()).
# random_state makes the cluster labels reproducible between runs.
km = KMeans(n_clusters=4, n_init=10, random_state=42)
y_km = km.fit_predict(feature)
n_cluster, km_count = np.unique(y_km, return_counts=True)
plt.bar(n_cluster, km_count)
plt.ylabel('No of customer')
plt.xlabel('Clustering')
plt.title('Customer segmentation by 4 groups')
plt.show()


In [None]:
# Spending score against annual income, coloured by the cluster labels in y_km.
fig, ax = plt.subplots()
ax.scatter(
    x=df['Annual Income (k$)'],
    y=df['Spending Score (1-100)'],
    c=y_km,
    s=100,
)
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
ax.set_title('Customer segmentation by 4 groups')

plt.show()

In [None]:
# 5 clusters
# Number of customers in each group
# fit_predict already fits the model, so the estimator is fitted only once
# (it used to be fitted with .fit() and then again by .fit_predict()).
# random_state makes the cluster labels reproducible between runs.
km = KMeans(n_clusters=5, n_init=10, random_state=42)
y_km = km.fit_predict(feature)
n_cluster, km_count = np.unique(y_km, return_counts=True)
plt.bar(n_cluster, km_count)
plt.ylabel('No of customer')
plt.xlabel('Clustering')
plt.title('Customer segmentation by 5 groups')
plt.show()

In [None]:
# Spending score against annual income, coloured by the cluster labels in y_km.
fig, ax = plt.subplots()
ax.scatter(
    x=df['Annual Income (k$)'],
    y=df['Spending Score (1-100)'],
    c=y_km,
    s=100,
)
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
ax.set_title('Customer segmentation by 5 groups')

plt.show()