In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D 

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the mall customer table and display its (rows, columns) shape.
csv_path = "../input/mall-customers/Mall_Customers.csv"
df = pd.read_csv(csv_path)
df.shape

#### Data inspection

In [None]:
# preview the first rows to see which columns are available
df.head()

In [None]:
# column dtypes and non-null counts
df.info()

"Genre" is categorical feature and others are numerical

In [None]:
# summary statistics of the numeric columns
df.describe()

#### CustomerID is only an identifier for each entry. The segmentation analysis will be done based on the other features

In [None]:
# number of missing values per column
df.isna().sum()

#### There are no missing entries

#### Univariate analysis

In [None]:
# check for any outliers before trying kmeans
# One horizontal box plot per numeric feature on a 2x2 grid.

numeric_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

f, axes = plt.subplots(2, 2, figsize=(12,6))
panels = axes.ravel()

for ax, col in zip(panels, numeric_cols):
    # pass the vector by keyword: a positional vector is deprecated in
    # seaborn 0.11 and is interpreted as `data` (not `x`) from 0.12 onwards
    sns.boxplot(x=df[col], ax=ax)

# only three features are plotted, so hide the unused fourth panel
for ax in panels[len(numeric_cols):]:
    ax.set_visible(False)

plt.show()

In [None]:
# keep only the customers whose annual income is below 130 k$,
# which removes the outlier seen in the income box plot

income_below_cutoff = df['Annual Income (k$)'] < 130
df = df[income_below_cutoff]

In [None]:
# cross check for any outliers
# Same box plots as before, redrawn on the current contents of df.

numeric_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

f, axes = plt.subplots(2, 2, figsize=(12,6))
panels = axes.ravel()

for ax, col in zip(panels, numeric_cols):
    # pass the vector by keyword: a positional vector is deprecated in
    # seaborn 0.11 and is interpreted as `data` (not `x`) from 0.12 onwards
    sns.boxplot(x=df[col], ax=ax)

# only three features are plotted, so hide the unused fourth panel
for ax in panels[len(numeric_cols):]:
    ax.set_visible(False)

plt.show()

In [None]:
# more females than males in the mall customer table

# name the column by keyword: a positional vector is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from 0.12 onwards
sns.countplot(x='Genre', data=df);

#### Bivariate analysis

In [None]:


# pairwise scatter plots of the numeric variables;
# no strong correlation is visible between them

numeric_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
sns.pairplot(df[numeric_cols])
plt.show()

#### There is no strong correlation between the features. Confirm with a heatmap of the correlation matrix

In [None]:
# annotated heatmap of the correlation matrix of the numeric features
numeric_features = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
correlations = numeric_features.corr()
sns.heatmap(correlations, annot=True)
plt.show()

In [None]:
# bivariate analysis with gender:
# the distributions of age, income and spending score are similar for male and female

f, axes = plt.subplots(2, 2, figsize=(12,6))

for position, col in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)']):
    # fill the 2x2 grid row by row
    grid_row, grid_col = divmod(position, 2)
    sns.boxplot(data=df, x='Genre', y=col, ax=axes[grid_row][grid_col])
plt.show()

#### The medians of age, income and spending score are nearly the same for male and female customers.

In [None]:
# standardize the three numeric features with StandardScaler,
# keeping the original row index and column names

feature_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
scaler = StandardScaler()
df2_scaled = scaler.fit_transform(df[feature_cols])
df2 = pd.DataFrame(df2_scaled, index=df.index, columns=feature_cols)

In [None]:
# put the identifier and gender columns next to the scaled numeric features
id_and_gender = df[['CustomerID', 'Genre']]
final_df = pd.concat([id_and_gender, df2], axis=1)

In [None]:
# preview the combined frame (identifier, gender and scaled features)
final_df.head()

#### final_df is the dataframe to analyse with kmeans

#### Segmentation of Age and annual income

In [None]:
# Find out the optimum number of clusters based on inertia (elbow method)

# .copy() gives an independent frame: a 'cluster' column is added to df2
# later, and writing into a slice of final_df raises SettingWithCopyWarning
df2 = final_df[['Age', 'Annual Income (k$)']].copy()

inertia = []

for n in range(1, 11):
    # 'algorithm' is left at its default: the 'auto' value was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where passing it raises an error
    df2_auto = KMeans(n_clusters=n, init='k-means++', n_init=10, max_iter=300,
                      tol=0.0001, random_state=111)
    df2_auto.fit(df2)
    inertia.append(df2_auto.inertia_)

# inertia against the number of clusters: markers plus a faint connecting line
plt.figure(1, figsize=(15, 6))
plt.plot(np.arange(1, 11), inertia, 'o')
plt.plot(np.arange(1, 11), inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# 3 clusters appears to be the optimum choice

# 'algorithm' is left at its default: the 'auto' value was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises an error
df2_auto = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300,
                  tol=0.0001, random_state=111)
df2_auto.fit(df2)

In [None]:
# attach the fitted cluster label of each row; assign() builds a new frame,
# so nothing is written into a slice of final_df (no SettingWithCopyWarning)
df2 = df2.assign(cluster=df2_auto.labels_)

In [None]:
# preview the features together with their cluster labels
df2.head()

In [None]:
# scatter plot of age against annual income, colored by cluster label
sns.scatterplot(x='Age', y='Annual Income (k$)', hue='cluster', data=df2)
plt.show()

#### The age vs. income scatter plot is segmented into 3 clusters

#### Segmentation of Age and spending

In [None]:
# Find out the optimum number of clusters based on inertia (elbow method)

# .copy() gives an independent frame: a 'cluster' column is added to df2
# later, and writing into a slice of final_df raises SettingWithCopyWarning
df2 = final_df[['Age', 'Spending Score (1-100)']].copy()

inertia = []

for n in range(1, 11):
    # 'algorithm' is left at its default: the 'auto' value was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where passing it raises an error
    df2_auto = KMeans(n_clusters=n, init='k-means++', n_init=10, max_iter=300,
                      tol=0.0001, random_state=111)
    df2_auto.fit(df2)
    inertia.append(df2_auto.inertia_)

# inertia against the number of clusters: markers plus a faint connecting line
plt.figure(1, figsize=(15, 6))
plt.plot(np.arange(1, 11), inertia, 'o')
plt.plot(np.arange(1, 11), inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# 3 clusters appears to be the optimum choice

# 'algorithm' is left at its default: the 'auto' value was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises an error
df2_auto = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300,
                  tol=0.0001, random_state=111)
df2_auto.fit(df2)

In [None]:
# attach the fitted cluster label of each row; assign() builds a new frame,
# so nothing is written into a slice of final_df (no SettingWithCopyWarning)
df2 = df2.assign(cluster=df2_auto.labels_)

In [None]:
# visualize the distribution of age and spending score across clusters
sns.scatterplot(data=df2, x='Age', y='Spending Score (1-100)', hue='cluster')
plt.show()

#### The age vs. spending score scatter plot is segmented into 3 clusters

#### Segmentation of Income and Spending score

In [None]:
# Find out the optimum number of clusters based on inertia (elbow method)

# .copy() gives an independent frame: a 'cluster' column is added to df2
# later, and writing into a slice of final_df raises SettingWithCopyWarning
df2 = final_df[['Annual Income (k$)', 'Spending Score (1-100)']].copy()

inertia = []

for n in range(1, 11):
    # 'algorithm' is left at its default: the 'auto' value was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where passing it raises an error
    df2_auto = KMeans(n_clusters=n, init='k-means++', n_init=10, max_iter=300,
                      tol=0.0001, random_state=111)
    df2_auto.fit(df2)
    inertia.append(df2_auto.inertia_)

# inertia against the number of clusters: markers plus a faint connecting line
plt.figure(1, figsize=(15, 6))
plt.plot(np.arange(1, 11), inertia, 'o')
plt.plot(np.arange(1, 11), inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# 4 clusters appears to be the optimum choice

# 'algorithm' is left at its default: the 'auto' value was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises an error
df2_auto = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300,
                  tol=0.0001, random_state=111)
df2_auto.fit(df2)

In [None]:
# attach the fitted cluster label of each row; assign() builds a new frame,
# so nothing is written into a slice of final_df (no SettingWithCopyWarning)
df2 = df2.assign(cluster=df2_auto.labels_)

In [None]:
# visualize the distribution of income and spending score across clusters
sns.scatterplot(data=df2, x='Annual Income (k$)', y='Spending Score (1-100)', hue='cluster')
plt.show()

#### The income vs. spending score scatter plot is segmented into 4 clusters

#### Segmentation of Age, Income and Spending score

In [None]:
# Find out the optimum number of clusters based on inertia (elbow method)

# .copy() gives an independent frame: 'cluster' and 'color' columns are added
# to df2 later, and writing into a slice of final_df raises SettingWithCopyWarning
df2 = final_df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].copy()

inertia = []

for n in range(1, 11):
    # 'algorithm' is left at its default: the 'auto' value was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where passing it raises an error
    df2_auto = KMeans(n_clusters=n, init='k-means++', n_init=10, max_iter=300,
                      tol=0.0001, random_state=111)
    df2_auto.fit(df2)
    inertia.append(df2_auto.inertia_)

# inertia against the number of clusters: markers plus a faint connecting line
plt.figure(1, figsize=(15, 6))
plt.plot(np.arange(1, 11), inertia, 'o')
plt.plot(np.arange(1, 11), inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# 4 clusters appears to be the optimum choice

# 'algorithm' is left at its default: the 'auto' value was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises an error
df2_auto = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300,
                  tol=0.0001, random_state=111)
df2_auto.fit(df2)

In [None]:
# attach the fitted cluster label of each row; assign() builds a new frame,
# so nothing is written into a slice of final_df (no SettingWithCopyWarning)
df2 = df2.assign(cluster=df2_auto.labels_)

In [None]:
# translate each cluster label into a plotting color; assign() builds a new
# frame instead of writing into a possible slice (no SettingWithCopyWarning)
cluster_colors = {3:'red', 2:'blue', 1:'green', 0:'yellow'}
df2 = df2.assign(color=df2['cluster'].map(cluster_colors))

In [None]:
# preview the features with their cluster label and plotting color
df2.head()

In [None]:
# visualize age, income and spending score in 3D, one color per cluster

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x, y, z = (df2[name] for name in ('Age', 'Annual Income (k$)', 'Spending Score (1-100)'))
m = df2['color']

ax.scatter(x, y, z, color=m)

ax.set_xlabel("Age")
ax.set_ylabel("Annual Income (k$)")
ax.set_zlabel("Spending Score (1-100)")

plt.show()

#### The 3D scatter plot of age, income and spending score is segmented into 4 clusters, each drawn in its own color