In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing libraries
import matplotlib.pyplot as plt
import seaborn as sns

#importing sklearn libraries
import sklearn
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import plotly.express as px

## Step 1: Reading and understanding data:

In [None]:
# Load the mall customer records from the Kaggle input directory and preview the first rows
customers_csv = '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
mall_df = pd.read_csv(customers_csv)
mall_df.head()

In [None]:
# Check the shape of the dataset: (number of rows, number of columns)

mall_df.shape

In [None]:
# Print a summary of the dataset: column names, non-null counts and dtypes

mall_df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns

mall_df.describe()

In [None]:
# List all column labels of the dataset

mall_df.columns

## Step 2: Data Cleaning

In [None]:
# Count the null values in each column

mall_df.isnull().sum()

In [None]:
# Percentage of null values in each column (null count relative to the number of rows)

100*mall_df.isnull().sum()/len(mall_df)

In [None]:
# Count the rows whose 'CustomerID' repeats the ID of an earlier row (duplicate customers)

mall_df.duplicated(subset = 'CustomerID').sum()

## Step 3: Exploratory Data Analysis (EDA)

In [None]:
# Box plots: one panel per numeric feature to inspect spread and outliers
features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
fig, axes = plt.subplots(1, len(features), figsize=(15, 5))
for ax, feature in zip(axes, features):
    # Pass the series by keyword: the meaning of the first positional
    # argument of sns.boxplot is not the same across seaborn releases.
    sns.boxplot(x=mall_df[feature], ax=ax)
    ax.tick_params(axis='x', labelrotation=20)
plt.show()

In [None]:
# Distribution plots: density-scaled histogram with a KDE overlay per numeric feature
features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
fig, axes = plt.subplots(1, len(features), figsize=(15, 5))
for ax, feature in zip(axes, features):
    # sns.distplot is deprecated; a density histogram plus KDE drawn with
    # histplot is its replacement (cut=3 extends the KDE past the data
    # range the way distplot did).
    sns.histplot(x=mall_df[feature], kde=True, stat='density',
                 kde_kws=dict(cut=3), ax=ax)
    ax.tick_params(axis='x', labelrotation=20)
plt.show()

In [None]:
# Box plots of every numeric feature split by gender, one panel per feature
plt.figure(figsize=(15, 5))

for panel, column in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)'], start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(x=mall_df.Gender, y=mall_df[column])
    plt.title(column)

plt.show()

In [None]:
# Scatter plots of each pair of numeric features, coloured by gender.
# Panels are listed in left-to-right order, and the cell ends with plt.show()
# so the notebook does not echo the repr of the last Axes object.
feature_pairs = [
    ('Age', 'Spending Score (1-100)'),                 # left panel
    ('Annual Income (k$)', 'Spending Score (1-100)'),  # middle panel
    ('Age', 'Annual Income (k$)'),                     # right panel
]
fig, axes = plt.subplots(1, len(feature_pairs), figsize=(15, 5))
for ax, (x_col, y_col) in zip(axes, feature_pairs):
    sns.scatterplot(data=mall_df, x=x_col, y=y_col, hue='Gender', ax=ax)
plt.show()

## Step 4: Preparing Data

In [None]:
# Create a new dataframe holding only the numeric features used for clustering.
# The features are selected by name (rather than dropping 'CustomerID' and
# 'Gender') so the result stays the same three columns even when this cell is
# re-run after extra columns such as 'cluster_labels' were added to mall_df.
mall_df1 = mall_df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].copy()
mall_df1.head()

In [None]:
# Optional conversion of the Gender variable (Male -> 1, Female -> 0).
# Kept disabled: 'Gender' is not among the columns of mall_df1, so it is not
# used for clustering. The code is commented out instead of being wrapped in a
# string literal, so the cell no longer echoes that string as its output.
#
# variablelist =  ['Gender']
#
# # Defining the map function
# def binary_map(x):
#     return x.map({'Male': 1, "Female": 0})
#
# # Applying the function to the columns
# mall_df1[variablelist] = mall_df1[variablelist].apply(binary_map)
# mall_df1.head()

### Scaling Data

In [None]:
# Create the scaler object (StandardScaler standardizes every feature to zero mean and unit variance)
scaler= StandardScaler()

# Fit the scaler on the numeric features and transform them in a single step;
# the result is a NumPy array, whose shape is displayed as a sanity check
mall_df1_scaled= scaler.fit_transform(mall_df1)
mall_df1_scaled.shape

In [None]:
# Wrap the scaled values in a DataFrame again. Column names and index are
# taken from mall_df1 (the frame that was scaled) instead of being hard-coded,
# so the labels always match the data that went into the scaler.
mall_df1_scaled = pd.DataFrame(mall_df1_scaled, columns=mall_df1.columns, index=mall_df1.index)
mall_df1_scaled.head()

## Step 5: Hopkins Statistic

In [None]:
# Hopkins statistic: measures the clustering tendency of a dataset

from random import sample
from numpy.random import uniform
from math import isnan
from sklearn.neighbors import NearestNeighbors
 
def hopkins(X, random_state=None):
    """Estimate the Hopkins statistic (clustering tendency) of a dataset.

    The statistic compares nearest-neighbour distances of uniformly random
    probe points (u) with those of randomly sampled real observations (w):
    ``H = sum(u) / (sum(u) + sum(w))``. Values near 0.5 indicate data that
    looks uniformly random; values approaching 1 indicate a strong tendency
    to cluster.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Numeric observations, one row per observation and one column per
        feature.
    random_state : int or None, optional
        Seed for a private random generator, which makes the result
        reproducible. With ``None`` (default) NumPy's global random state is
        used.

    Returns
    -------
    float
        The Hopkins statistic, or ``0.0`` when it is undefined (all measured
        distances are zero or not finite).

    Raises
    ------
    ValueError
        If ``X`` is not 2-dimensional or holds fewer than two observations.
    """
    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError("X must be 2-dimensional (rows = observations, columns = features)")
    n, d = data.shape
    if n < 2:
        raise ValueError("X must contain at least two observations")

    # Probe with roughly 10% of the observations, but never fewer than one:
    # with zero probes the ratio below would be 0 / 0.
    m = max(1, int(0.1 * n))

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(data)

    # u: distance from each uniform probe point (drawn inside the bounding box
    # of the data) to its nearest real observation. The probe is not part of
    # the fitted data, so the first neighbour is the one we want.
    probes = rng.uniform(data.min(axis=0), data.max(axis=0), size=(m, d))
    u_dist, _ = nbrs.kneighbors(probes, n_neighbors=1, return_distance=True)

    # w: distance from each sampled real observation to its nearest *other*
    # observation. The first neighbour is the observation itself (distance
    # 0), hence the second neighbour is used.
    sampled_rows = rng.choice(n, size=m, replace=False)
    w_dist, _ = nbrs.kneighbors(data[sampled_rows], n_neighbors=2, return_distance=True)

    u_total = float(u_dist[:, 0].sum())
    w_total = float(w_dist[:, 1].sum())
    total = u_total + w_total
    if total == 0 or not np.isfinite(total):
        # Statistic is undefined (e.g. every observation is identical)
        return 0.0

    return u_total / total

In [None]:
# Evaluate the Hopkins statistic on the scaled features and print it rounded to 2 decimals
print('Hopkins statistics is: ', round(hopkins(mall_df1_scaled),2))

## Step 6: Hierarchical Clustering

In [None]:
# Single linkage (shown for comparison only). The tree is stored under its own
# name so that `mergings`, the variable that is later cut into clusters,
# always holds the complete-linkage tree regardless of the order in which the
# two dendrogram cells are run.
mergings_single = linkage(mall_df1_scaled, method="single", metric='euclidean')
dendrogram(mergings_single)
plt.show()

In [None]:
# Complete linkage: hierarchical clustering of the scaled features with Euclidean distance,
# followed by the dendrogram of the resulting merge tree
mergings = linkage(mall_df1_scaled, method="complete", metric='euclidean')
dendrogram(mergings)
plt.show()

Based on the complete-linkage dendrogram, we cut the tree into 4 clusters.

In [None]:
# Cut the tree into 4 clusters and flatten the result into a 1-D array of labels
cluster_labels = cut_tree(mergings, n_clusters=4).reshape(-1, )
cluster_labels

In [None]:
# Attach the cluster label of every customer to the original dataframe as a new column
# (this modifies mall_df in place)
mall_df['cluster_labels'] = cluster_labels
mall_df.head()

In [None]:
# Number of customers in each cluster
mall_df['cluster_labels'].value_counts()

In [None]:
# Box plot of the age distribution within each cluster
age_ax = sns.boxplot(data=mall_df, x='cluster_labels', y='Age')
age_ax.set_title('Age')
plt.show()

In [None]:
# Box plot of the annual income distribution within each cluster
income_ax = sns.boxplot(data=mall_df, x='cluster_labels', y='Annual Income (k$)')
income_ax.set_title('Annual Income (k$)')
plt.show()

In [None]:
# Box plot of the spending score distribution within each cluster
spending_ax = sns.boxplot(data=mall_df, x='cluster_labels', y='Spending Score (1-100)')
spending_ax.set_title('Spending Score (1-100)')
plt.show()

In [None]:
# Number of customers of each gender per cluster, with rotated x tick labels
gender_count_ax = sns.countplot(x='cluster_labels', hue='Gender', data=mall_df)
gender_count_ax.tick_params(axis='x', rotation=45)

In [None]:
# Scatter plot: Spending Score vs Age, coloured by cluster.
# The cluster ids are cast to strings so that Plotly draws one discrete colour
# per cluster instead of a continuous colour bar for the integer labels.

fig = px.scatter(
    mall_df.assign(cluster_labels=mall_df["cluster_labels"].astype(str)),
    x="Spending Score (1-100)",
    y="Age",
    color="cluster_labels",
)
fig.show()

In [None]:
# Scatter plot: Spending Score vs Annual Income, coloured by cluster.
# The cluster ids are cast to strings so that Plotly draws one discrete colour
# per cluster instead of a continuous colour bar for the integer labels.

fig = px.scatter(
    mall_df.assign(cluster_labels=mall_df["cluster_labels"].astype(str)),
    x="Spending Score (1-100)",
    y="Annual Income (k$)",
    color="cluster_labels",
)
fig.show()

In [None]:
# Scatter plot: Annual Income vs Age, coloured by cluster.
# The cluster ids are cast to strings so that Plotly draws one discrete colour
# per cluster instead of a continuous colour bar for the integer labels.

fig = px.scatter(
    mall_df.assign(cluster_labels=mall_df["cluster_labels"].astype(str)),
    x="Annual Income (k$)",
    y="Age",
    color="cluster_labels",
)
fig.show()

In [None]:
# Group the customers by their assigned cluster label for per-cluster summaries
mall_grouped= mall_df.groupby('cluster_labels')

In [None]:
# Mean of every numeric feature per cluster, sorted ascending by Age, then
# Annual Income, then Spending Score.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of column names is deprecated and raises an error in pandas >= 2.0.
feature_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
mall_grouped[feature_cols].mean().sort_values(by=feature_cols, ascending=[True, True, True])

## Conclusion:

* Cluster 0 belongs to low-income, high-spending-score customers who are mostly young.
* Cluster 1 belongs to average-income, average-spending-score customers who are mostly older.
* Cluster 2 belongs to high-income, high-spending-score customers who are in their 30s.
* Cluster 3 belongs to high-income, low-spending-score customers who are middle-aged.