In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Mall customer segmentation using K-Means clustering
**We are going to segment customer data collected from a mall. The dataset is small and easy to work with, has no missing values and almost no outliers (which we will ignore).**

**Dataset overview:**

1. **Customer ID: Id of the customer; this field is used as the index because it is only an identifier, not a useful feature**
2. **Gender: customer gender - female / male**
3. **Age: age of customer, we've got customers with age from 18 to 70 years**
4. **Annual Income: income of customer, will be renamed to income only, values from 13 to 137**
5. **Spending Score: Score assigned by the mall based on customer behavior and spending nature, values from 1 to 99**

# Import Libraries

In [None]:
import numpy as np # useful python library for linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
%matplotlib notebook

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from mpl_toolkits import mplot3d

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Show up to 200 rows when a DataFrame is displayed. The fully qualified option
# name is used because the bare 'max_rows' pattern is ambiguous on pandas
# versions that also define 'styler.render.max_rows', where it raises OptionError.
pd.set_option('display.max_rows', 200)

# Load the mall customer data with CustomerID as the index
# (it is only an identifier, not a feature).
df = pd.read_csv(
    '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv',
    index_col='CustomerID',
)
df.head(10)  # quick glimpse of the data

In [None]:
# Print the index range, column dtypes, non-null counts and memory usage of df.
df.info() # getting to know the data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe() # various statistical analysis of the data

# Data Visualization

In [None]:
"""
Visualization of Gender distribution in data
Left figure- bar plot
Right figure- pie chart

"""

plt.figure(figsize = (15 , 6))

# Left panel: horizontal bars with the number of customers per gender.
plt.subplot(1,2,1)
sns.countplot(y='Gender',data=df,palette='colorblind')
ax=plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Right panel: share of each gender. The wedge labels are taken from the
# value_counts() index so each label always matches its own count, instead of
# relying on a hard-coded ['Female','Male'] order.
gender_counts=df['Gender'].value_counts()
plt.subplot(1,2,2)
plt.pie(x=gender_counts,labels=gender_counts.index,autopct='%1.2f%%')
ax=plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()

In [None]:
# Distribution of the numerical columns: one distplot per column, side by side.

plt.figure(figsize = (15 , 6))
numeric_features = ['Age' , 'Annual Income (k$)' , 'Spending Score (1-100)']
for position, feature in enumerate(numeric_features, start=1):
    plt.subplot(1 , 3 , position)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) once the minimum seaborn version allows it.
    sns.distplot(df[feature] , bins = 20)
    plt.title('Distplot of {}'.format(feature))
    ax=plt.gca()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
# tight_layout() recomputes the spacing between the panels, so no manual
# subplots_adjust() call is needed inside the loop.
plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots of every column except Gender, coloured by Gender.
feature_columns = df.columns.drop('Gender')
pair_grid = sns.PairGrid(
    data=df,
    vars=feature_columns,
    hue='Gender',
    palette='colorblind',
    layout_pad=True,
)
pair_grid.map_offdiag(sns.scatterplot)  # only the off-diagonal panels are drawn
pair_grid.add_legend()
pair_grid.fig.set_size_inches(15, 15)
pair_grid

**The above graph is an important part of clustering as it can uncover various patterns among the features and help us determine the optimal choice of clusters**


**You can see there seems to be 2 groups of customers by age vs score (top left quarter vs bottom right quarter), where diagonal is delimiting them.**

**What is more important is actually chart Income vs Score where we can see 5 different groups of customers (corners & center). What does it mean? We've probably found ideal way to cluster our customers based on income and score!**

# Income & Score by Age

In [None]:
# Mean income (top) and mean spending score (bottom) per age, split by gender.
plt.figure(figsize=(15,8))

# ci=None switches the error bars off outright. The previous ci=0 still made
# seaborn bootstrap every bar only to draw a zero-width interval.
plt.subplot(2,1,1)
sns.barplot(x=df['Age'], y=df['Annual Income (k$)'], hue=df['Gender'], ci=None)
plt.title('Income by Age')
plt.xlabel('')  # the shared 'Age' label is shown on the bottom panel only

plt.subplot(2,1,2)
sns.barplot(x=df['Age'], y=df['Spending Score (1-100)'], hue=df['Gender'], ci=None)
plt.title('Score by Age')

# Keep the lower title from colliding with the upper panel's tick labels.
plt.tight_layout()
plt.show()

**Last, check whether there is a significant trend (increasing/decreasing) in Income or Score by Age. What is a bit odd is that 18-year-olds have almost the same score as 60-year-olds.**

**You may notice that income seems to be highest for the 25-50 age group compared to the others and, similarly, the score is higher for people aged 20-40 compared to the others.**

In [None]:
plt.figure(figsize=(10,10))
# Correlate the numeric columns only. 'Gender' holds strings, which
# DataFrame.corr() rejects on pandas >= 2.0 (older versions silently dropped
# non-numeric columns), so select the numeric columns explicitly.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), cmap = 'Wistia', annot = True)
plt.title('Heatmap for the Data', fontsize = 20)
plt.show()

**The above graph shows the correlation between the different attributes of the Mall Customer Segmentation dataset. The heat map renders the most correlated features in orange and the least correlated features in yellow.**

**We can clearly see that these attributes are only weakly correlated with each other, so none of them is redundant; that's why we will proceed with all of the features.**

# Model Building

# Segmentation using Age and Spending Score Data

In [None]:
#Segmentation using Age and Spending Score Data

X1=df[['Age' , 'Spending Score (1-100)']]

# Rescale both features with MinMaxScaler so they share a common range before
# K-Means computes distances between points.
scaler=MinMaxScaler()
X1=scaler.fit_transform(X1)

# Fit K-Means for k = 1..15 and record the inertia of each fit for the elbow plot.
# `algorithm` is left at its default: the explicit 'auto' value was deprecated in
# scikit-learn 1.1 and is rejected by later releases.
error1=[]
for i in range(1,16):
    clf1=KMeans(n_clusters = i ,init='k-means++',n_init = 10,max_iter=300,tol=0.0001,random_state=0)
    clf1.fit(X1)
    error1.append(clf1.inertia_)

In [None]:
"""
Elbow Method to find optimal number of clusters
"""

# Inertia against the number of clusters (k = 1..15) for the Age / Spending Score model.
k_values = range(1, 16)

plt.figure(figsize=(15, 7))
sns.lineplot(x=k_values, y=error1, ci=None)
plt.xticks(ticks=k_values, labels=k_values)
ax = plt.gca()
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.title('Segmentation using Age and Spending Score (Elbow Method)')

**Looking at the above graph it is difficult to pinpoint the best K for the K means clustering, hence we might have to do it by intuition**

In [None]:
# Density distribution of the data: joint KDE of Spending Score (x) against Age (y).
sns.jointplot(x='Spending Score (1-100)',y='Age',data=df,kind='kde')

**The data seems to be concentrated in two major spots. Hence we should try using K=2 for our model.**

In [None]:
# Final model for Age / Spending Score with K=2.
# `algorithm` is left at its default: the explicit 'auto' value was deprecated in
# scikit-learn 1.1 and is rejected by later releases.
clf1=KMeans(n_clusters = 2 ,init='k-means++',n_init = 10,max_iter=300,tol=0.0001,random_state=0)
clf1.fit(X1)

labels1=clf1.labels_
centroids1=clf1.cluster_centers_  # centres are in the scaled feature space of X1
df['labels1']=labels1  # store the cluster label of each customer on the frame

# Scatter of the original (unscaled) values, coloured by cluster label.
plt.figure(figsize=(12,12))
sns.scatterplot(x='Age',y='Spending Score (1-100)',data=df,hue='labels1',s=100,palette='colorblind',legend=False)
ax=plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title('Segmentation using Age and Spending Score (Cluster Visualisation)')

**Looking at the above clusters two conclusions can be made**

**1.Young people (except a few outliers) having an average-high spending score**

**2. The older a person gets, the less he/she spends: middle-aged and old people have a low-to-average spending score**


# Segmentation using Annual Income and Spending Score

In [None]:
#Segmentation using Annual Income (k$) and Spending Score

# KMeans and MinMaxScaler are already imported in the imports cell at the top of
# the notebook, so they are not re-imported here.
X2=df[['Annual Income (k$)' , 'Spending Score (1-100)']]

# Rescale both features with MinMaxScaler so they share a common range before
# K-Means computes distances between points.
scaler=MinMaxScaler()
X2=scaler.fit_transform(X2)

# Fit K-Means for k = 1..15 and record the inertia of each fit for the elbow plot.
# `algorithm` is left at its default: the explicit 'auto' value was deprecated in
# scikit-learn 1.1 and is rejected by later releases.
error2=[]
for i in range(1,16):
    clf2=KMeans(n_clusters = i ,init='k-means++',n_init = 10,max_iter=300,tol=0.0001,random_state=0)
    clf2.fit(X2)
    error2.append(clf2.inertia_)

In [None]:
# Inertia against the number of clusters (k = 1..15) for the Income / Spending Score model.
k_values = range(1, 16)

plt.figure(figsize=(15, 7))
sns.lineplot(x=k_values, y=error2, ci=None)
plt.xticks(ticks=k_values, labels=k_values)
ax = plt.gca()
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.title('Segmentation using Annual Income (k$) and Spending Score (Elbow Method)')

**From the above plot it is clear that the optimal clusters should be 5**

In [None]:
# Density distribution of the data: joint KDE of Spending Score (x) against Annual Income (y).
sns.jointplot(x='Spending Score (1-100)',y='Annual Income (k$)',data=df,kind='kde')

**Majority data seems to be concentrated in the middle with a few data clustered in the four corners**

In [None]:
# Final model for Annual Income / Spending Score with K=5.
# `algorithm` is left at its default: the explicit 'auto' value was deprecated in
# scikit-learn 1.1 and is rejected by later releases.
clf2=KMeans(n_clusters = 5 ,init='k-means++',n_init = 10,max_iter=300,tol=0.0001,random_state=0)
clf2.fit(X2)

labels2=clf2.labels_
centroids2=clf2.cluster_centers_  # centres are in the scaled feature space of X2
df['labels2']=labels2  # store the cluster label of each customer on the frame

# Scatter of the original (unscaled) values, one colour per cluster label.
plt.figure(figsize=(12,12))
sns.scatterplot(x='Annual Income (k$)',y='Spending Score (1-100)',data=df,hue='labels2',palette=sns.color_palette("hls", 5),s=100,legend='full')
ax=plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title('Segmentation using Annual Income (k$) and Spending Score (Cluster Visualisation)')

**This Clustering Analysis gives us a very clear insight about the different segments of the customers in the Mall. There are clearly Five segments of Customers based on their Annual Income and Spending Score which are reportedly the best factors/attributes to determine the segments of a customer in a Mall.**


* **Poor and not-spender - customers with low income and low spending score (cluster #4)**
* **Poor and spender - customers with low income, but spending a lot (cluster #1)**
* **Neutral - customers with mid income and mid spending score (cluster #0)**
* **Rich and not-spender - customers with high income and low spending score (cluster #2)**
* **Rich and spender - customers with high income and high spending score (cluster #3)**

# Segmentation using Annual Income and Age

In [None]:
#Segmentation using Annual Income (k$) and Age

# KMeans and MinMaxScaler are already imported in the imports cell at the top of
# the notebook, so they are not re-imported here.
X3=df[['Annual Income (k$)' , 'Age']]

# Rescale both features with MinMaxScaler so they share a common range before
# K-Means computes distances between points.
scaler=MinMaxScaler()
X3=scaler.fit_transform(X3)

# Fit K-Means for k = 1..15 and record the inertia of each fit for the elbow plot.
# `algorithm` is left at its default: the explicit 'auto' value was deprecated in
# scikit-learn 1.1 and is rejected by later releases.
error3=[]
for i in range(1,16):
    clf3=KMeans(n_clusters = i ,init='k-means++',n_init = 10,max_iter=300,tol=0.0001,random_state=0)
    clf3.fit(X3)
    error3.append(clf3.inertia_)

In [None]:
# Inertia against the number of clusters (k = 1..15) for the Income / Age model.
k_values = range(1, 16)

plt.figure(figsize=(15, 7))
sns.lineplot(x=k_values, y=error3, ci=None)
plt.xticks(ticks=k_values, labels=k_values)
ax = plt.gca()
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.title('Segmentation using Annual Income (k$) and Age (Elbow Method)')

**Optimal clusters are 3**

In [None]:
# Density distribution of the data: joint KDE of Age (x) against Annual Income (y).
sns.jointplot(x='Age',y='Annual Income (k$)',data=df,kind='kde')

In [None]:
# Final model for Annual Income / Age with K=3.
# `algorithm` is left at its default: the explicit 'auto' value was deprecated in
# scikit-learn 1.1 and is rejected by later releases.
clf3=KMeans(n_clusters = 3 ,init='k-means++',n_init = 10,max_iter=300,tol=0.0001,random_state=0)
clf3.fit(X3)

labels3=clf3.labels_
centroids3=clf3.cluster_centers_  # centres are in the scaled feature space of X3
df['labels3']=labels3  # store the cluster label of each customer on the frame

# Scatter of the original (unscaled) values, coloured by cluster label.
plt.figure(figsize=(12,12))
sns.scatterplot(y='Annual Income (k$)',x='Age',data=df,hue='labels3',s=100,palette='colorblind',legend='full')
ax=plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title('Segmentation using Annual Income (k$) and Age (Cluster Visualisation)')

**From the above scatter plot we can group the data based on annual income and age into three groups**
* **Young low earners (cluster #0)**
* **Middle aged high earners (cluster #2)**
* **Old low earners (cluster #1)**

**As the age increases the annual income first increases and is maximum for middle aged people (30-50) and then decreases**

# Model combining all features

In [None]:
#Segmentation using Annual Income (k$),Age and Spending Score (1-100)

X4=df[['Annual Income (k$)' , 'Age','Spending Score (1-100)']]

# Rescale all three features with MinMaxScaler so they share a common range
# before K-Means computes distances between points.
scaler=MinMaxScaler()
X4=scaler.fit_transform(X4)

# Fit K-Means for k = 1..15 and record the inertia of each fit for the elbow plot.
# `algorithm` is left at its default: the explicit 'auto' value was deprecated in
# scikit-learn 1.1 and is rejected by later releases.
error4=[]
for i in range(1,16):
    clf4=KMeans(n_clusters = i ,init='k-means++',n_init = 10,max_iter=300,tol=0.0001,random_state=0)
    clf4.fit(X4)
    error4.append(clf4.inertia_)

In [None]:
# Inertia against the number of clusters (k = 1..15) for the three-feature model.
k_values = range(1, 16)

plt.figure(figsize=(15, 7))
sns.lineplot(x=k_values, y=error4, ci=None)
plt.xticks(ticks=k_values, labels=k_values)
ax = plt.gca()
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.title('Segmentation using Annual Income (k$), Age and Spending Score (1-100) (Elbow Method)')

In [None]:
# Final model on all three scaled features with K=4.
# `algorithm` is left at its default: the explicit 'auto' value was deprecated in
# scikit-learn 1.1 and is rejected by later releases.
clf4=KMeans(n_clusters = 4 ,init='k-means++',n_init = 10,max_iter=300,tol=0.0001,random_state=0)
clf4.fit(X4)

labels4=clf4.labels_
centroids4=clf4.cluster_centers_  # centres are in the scaled feature space of X4
df['labels4']=labels4  # store the cluster label of each customer on the frame

# 3D scatter of the original (unscaled) values, coloured by cluster label.
fig=plt.figure(figsize=(15,15))
ax=plt.axes(projection='3d')
z=np.array(df['Age'])
x=np.array(df['Spending Score (1-100)'])
y=np.array(df['Annual Income (k$)'])

ax.set_xlabel('Spending Score (1-100)')
ax.set_ylabel('Annual Income (k$)')
ax.set_zlabel('Age')
# Title added so this figure stands alone like the other cluster plots.
ax.set_title('Segmentation using Annual Income (k$), Age and Spending Score (1-100) (Cluster Visualisation)')

ax.scatter3D(x,y,z,c=np.array(df['labels4']),cmap='rainbow',s=100)