In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Emit one path per line; skip directories that contain no files.
    if filenames:
        print('\n'.join(os.path.join(dirname, filename) for filename in filenames))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

## Importing Dataset

In [None]:
# Load the mall-customers CSV from the attached Kaggle dataset into a DataFrame.
dataset = pd.read_csv('/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
dataset

In [None]:
# Print a per-column summary (dtype and non-null count) to check for missing values.
dataset.info()

* The data is complete: there are no null values in any column.

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

## Features

### Age

In [None]:
# Age: histogram with a KDE overlay (wide left panel) next to a box plot (narrow right panel).
fig, axs = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(14, 6),
    gridspec_kw={'width_ratios': [3, 1], 'wspace': 0.3},
)
sns.histplot(data=dataset, x='Age', bins=10, kde=True, color='blue', ax=axs[0])
sns.boxplot(data=dataset, y='Age', color='orange', ax=axs[1])

plt.show()

* We can see that the age ranges between 18 and 70. The distribution is right skewed showing that the proportion of young people is higher.

### Annual Income (k$)

In [None]:
# Annual income: histogram with a KDE overlay (wide left panel) next to a box plot (narrow right panel).
fig, axs = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(14, 6),
    gridspec_kw={'width_ratios': [3, 1], 'wspace': 0.3},
)
sns.histplot(data=dataset, x='Annual Income (k$)', bins=15, kde=True, color='blue', ax=axs[0])
sns.boxplot(data=dataset, y='Annual Income (k$)', color='orange', ax=axs[1])

plt.show()

* The distribution of Annual Income is also right-skewed, and the largest group of customers earns around 60-80k$ annually.

### Spending Score

In [None]:
# Spending score: histogram with a KDE overlay (wide left panel) next to a box plot (narrow right panel).
fig, axs = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(14, 6),
    gridspec_kw={'width_ratios': [3, 1], 'wspace': 0.3},
)
sns.histplot(data=dataset, x='Spending Score (1-100)', bins=10, kde=True, color='blue', ax=axs[0])
sns.boxplot(data=dataset, y='Spending Score (1-100)', color='orange', ax=axs[1])

plt.show()

* The spending scores are symmetrically distributed and a huge proportion of customers have a spending score between 40 and 60.

### Gender

In [None]:
# Gender overview: a count plot of customers per gender, followed by one violin
# plot per continuous feature, split by gender.
# The features are named explicitly rather than looked up by column position, so
# the panels do not silently change if the CSV's column order differs.
continuous_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

plt.figure(figsize = (15,10))
# The horizontal gap is a figure-level setting, so apply it once instead of on
# every loop iteration.
plt.subplots_adjust(wspace=0.3)

plt.subplot(2, 2, 1)
sns.countplot(data=dataset, x='Gender')
for i, feature in enumerate(continuous_features):
    # Panels 2-4 of the 2x2 grid hold the three violin plots.
    plt.subplot(2, 2, i+2)
    sns.violinplot(data=dataset, y=feature, x='Gender')

plt.show()

* We can see that the number of females is higher than the number of males, and that the distributions of the continuous features are very similar for both genders.

## Spending Score V/S Age using K-Means Clustering

In [None]:
# Feature frame for the first clustering: all rows, restricted to age and spending score.
x = dataset.loc[:,['Age','Spending Score (1-100)']]

In [None]:
# Elbow method: fit K-Means for k = 1..10 on the (Age, Spending Score) features
# and plot the within-cluster sum of squares (WCSS, `inertia_`) against k.
plt.figure(figsize = (7,5))

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i,
                    init = 'k-means++',
                    # Pin the number of restarts: scikit-learn's default changed
                    # from 10 to 'auto' (a single run with k-means++), so leaving
                    # it implicit makes the curve depend on the installed version
                    # and emits a FutureWarning on 1.2/1.3.
                    n_init = 10,
                    random_state = 42)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)
plt.plot(range(1,11), wcss)
plt.title("Elbow Method")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")

plt.show()

* The elbow bends at k = 4, i.e. WCSS decreases much more slowly after 4 clusters than it does before, and hence 4 is the best number of clusters in this case.

In [None]:
# Final K-Means model with 4 clusters on the (Age, Spending Score) features.
# `n_init` is pinned to 10 because scikit-learn's default changed from 10 to
# 'auto' (a single run with k-means++); without it the cluster assignment can
# differ between library versions.
kmeans = KMeans(n_clusters = 4,
                  init = 'k-means++',
                  n_init = 10,
                  random_state = 42)
# Fit the model and keep the cluster index assigned to each customer.
x_kmeans = kmeans.fit_predict(x)

In [None]:
# Relabel cluster 0 as 4 so the ids shown in the plots start at 1 instead of 0.
x_kmeans = np.where(x_kmeans == 0, 4, x_kmeans)

# Attach each customer's cluster id and gender to the feature frame for plotting.
x['Cluster'] = x_kmeans.tolist()
x['Gender'] = dataset['Gender']

fig, axs = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(16, 7),
    gridspec_kw={'width_ratios': [2, 1]},
)

# Left panel: customers in the Age / Spending Score plane, coloured by cluster.
sns.scatterplot(
    data=x,
    x='Age',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette=['gold', 'blue', 'green', 'red'],
    s=70,
    ax=axs[0],
)
axs[0].set_title('Clusters - Spending Score V/S Age')

# Right panel: number of customers per cluster, split by gender.
sns.countplot(data=x, y='Cluster', hue='Gender', ax=axs[1])
axs[1].set_title('Gender Ratio within clusters')

plt.show()

* We can divide our data in 4 clusters based on Age and Spending score and the 4 clusters can be characterised as:
1. Yellow - Young customers with high spending score
2. Blue - Young customers with medium spending score
3. Green - Old customers with medium spending score
4. Red - Customers with low spending score

* All the clusters have nearly equal number of customers.

* The gender ratio within the clusters is also balanced. In the yellow and blue clusters the number of females is a little higher, which is expected because there are more females than males overall; in the red and green clusters the numbers are almost equal.

## Spending Score V/S Annual Income using Hierarchical Clustering

In [None]:
# Feature frame for the second clustering: all rows, restricted to annual income and spending score.
z = dataset.loc[:,['Annual Income (k$)','Spending Score (1-100)']]

In [None]:
# WCSS (`inertia_`) of K-Means for k = 1..10 on the (Annual Income, Spending Score)
# features; the values feed the elbow curve drawn next to the dendrogram.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i,
                    init = 'k-means++',
                    # Pin the number of restarts: scikit-learn's default changed
                    # from 10 to 'auto' (a single run with k-means++), so leaving
                    # it implicit makes the curve depend on the installed version
                    # and emits a FutureWarning on 1.2/1.3.
                    n_init = 10,
                    random_state = 42)
    kmeans.fit(z)
    wcss.append(kmeans.inertia_)

In [None]:
# Two diagnostics side by side for choosing the number of clusters.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: dendrogram built from Ward linkage over the feature rows.
ward_linkage = sch.linkage(z, method = 'ward')
dendrogram = sch.dendrogram(ward_linkage, ax=axs[0])
axs[0].set_title('Dendrogram')
axs[0].set_xlabel('Observation Points')
axs[0].set_ylabel('Distances')

# Right panel: elbow curve from the WCSS values computed for k = 1..10.
axs[1].plot(range(1,11), wcss)
axs[1].set_title('Elbow Method')
axs[1].set_xlabel('Number of Clusters')
axs[1].set_ylabel("WCSS")

plt.show()

* From the dendrogram we can find the best number of clusters by looking for the longest vertical distance that is not crossed by any extended horizontal line. In our case the longest such distance lies below the fourth horizontal line from the top (i.e. the second orange line from the top); a cut at that height crosses five vertical lines, and hence the number of clusters should be 5.
* It is also evident from the WCSS curve that the number of clusters should be 5 as the elbow bends at k=5.

In [None]:
# Agglomerative (hierarchical) clustering into 5 clusters using Ward linkage.
# The `affinity` keyword is intentionally not passed: it was deprecated in
# scikit-learn 1.2 and removed in 1.4 (renamed to `metric`), so supplying it
# raises a TypeError on current releases. Ward linkage only works with Euclidean
# distance, which is the default, so the clustering is the same on every version.
hc = AgglomerativeClustering(n_clusters = 5, linkage = 'ward')
# Fit the model and keep the cluster index assigned to each customer.
z_hc = hc.fit_predict(z)

In [None]:
# Relabel cluster 0 as 5 so the ids shown in the plots start at 1 instead of 0.
z_hc = np.where(z_hc == 0, 5, z_hc)

# Attach each customer's cluster id and gender to the feature frame for plotting.
z['Cluster'] = z_hc.tolist()
z['Gender'] = dataset['Gender']

fig, axs = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(16, 7),
    gridspec_kw={'width_ratios': [2, 1]},
)

# Left panel: customers in the Annual Income / Spending Score plane, coloured by cluster.
sns.scatterplot(
    data=z,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette=['gold', 'blue', 'green', 'magenta', 'red'],
    s=70,
    ax=axs[0],
)
axs[0].set_title('Clusters - Spending Score V/S Annual Income')

# Right panel: number of customers per cluster, split by gender.
sns.countplot(data=z, y='Cluster', hue='Gender', ax=axs[1])
axs[1].set_title('Gender Ratio within clusters')

plt.show()

* We can divide our data in 5 clusters based on Annual Income and Spending score and the 5 clusters can be characterised as:
1. Yellow - Customers with average annual income and average spending score
2. Blue - Customers with high annual income and high spending score
3. Green - Customers with low annual income and high spending score
4. Magenta - Customers with low annual income and low spending score
5. Red - Customers with high annual income and low spending score

* The majority of the customers belong to the yellow cluster.

* In the yellow cluster the number of females is much higher, but in the rest of the clusters the numbers are almost equal.