# Table of Contents

[01. Import Library](#01)<br>

[02. Load Data](#02)<br>

[03. Exploratory Data Analysis (EDA)](#03)<br>
&nbsp;&nbsp;&nbsp;[3.1. Use matplotlib](#3.1)<br>
&nbsp;&nbsp;&nbsp;[3.2. Use plotly_express](#3.2)<br>
&nbsp;&nbsp;&nbsp;[3.3. Use seaborn](#3.3)<br>

[04. Clustering](#04)<br>
&nbsp;&nbsp;&nbsp;[4.1. K-means](#4.1)<br>
&nbsp;&nbsp;&nbsp;[4.2. Hierarchical clustering](#4.2)<br>
&nbsp;&nbsp;&nbsp;[4.3. K-prototypes](#4.3)<br>

# 01. Import Library<a id='01'></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pandas_profiling
import plotly_express as px
import matplotlib.pyplot as plt

import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

from scipy import stats
from scipy.stats import norm, skew 
import scipy.cluster.hierarchy as shc


from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import DistanceMetric

from kmodes.kprototypes import KPrototypes

# !pip install gower
# import gower

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 02. Load Data <a id='02'></a>

In [None]:
# Show which files ship with the dataset
print("List of files:", os.listdir('/kaggle/input/customer-segmentation-tutorial-in-python'))

# Load data
df = pd.read_csv('/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')
print("\nData shape:", df.shape)
print("\nData columns:", df.columns)

# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called on its own; passing it to print() would append a stray "None" to the output.
print("\nData info:")
df.info()

print("\nData:\n\n", df.head())

# 03. Exploratory Data Analysis (EDA)<a id='03'></a>

In [None]:
# Correlation
# Only numeric columns are correlated: DataFrame.corr() raises on non-numeric columns
# (such as 'Gender') in pandas >= 2.0, while older releases silently dropped them.
# select_dtypes gives the same result on both.
df_corr = df.select_dtypes(include='number').corr()
df_corr

In [None]:
# Colour the correlation table as a heat map; axis=None applies one gradient to the whole table
df_corr.style.background_gradient(
    axis=None,
    cmap='coolwarm',
)

In [None]:
# Build the pandas-profiling report for the raw data and show it as the cell output
profile = df.profile_report()
profile

## 3.1. Use matplotlib<a id='3.1'></a>

In [None]:
# Histogram of annual income, drawn through the explicit Axes interface

# plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.hist(df['Annual Income (k$)'], bins=100)

# Title and axis labels
ax.set_title('Annual Income')
ax.set_xlabel('k$')
ax.set_ylabel('Frequency')

plt.show()

In [None]:
# Scatter plot: annual income (y) against age (x)
fig, ax = plt.subplots()
ax.scatter(x=df['Age'], y=df['Annual Income (k$)'])
ax.set_title('Annual Income by Age', fontsize=16)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Annual Income', fontsize=12)
plt.show()

In [None]:
# Scatter plot: spending score (y) against annual income (x)
fig, ax = plt.subplots()
ax.scatter(x=df['Annual Income (k$)'], y=df['Spending Score (1-100)'])
ax.set_title('Annual Income by Spending Score (1-100)', fontsize=16)
ax.set_xlabel('Annual Income', fontsize=12)
ax.set_ylabel('Spending Score (1-100)', fontsize=12)
plt.show()

In [None]:
# QQ-plot: ordered annual-income values against the quantiles of a normal distribution
fig, ax = plt.subplots()
res = stats.probplot(
    df['Annual Income (k$)'],
    dist='norm',  # scipy's default reference distribution, spelled out
    plot=plt,     # draw on the current pyplot figure created above
)
plt.show()

## 3.2. Use plotly_express<a id='3.2'></a>

In [None]:
# Income vs. spending score, with each point coloured by the customer's age
px.scatter(
    data_frame=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    color='Age',
)

In [None]:
# Income vs. spending score, with each point coloured by the customer's gender
px.scatter(
    data_frame=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    color='Gender',
)

In [None]:
# Box plot of spending score per gender (rows sorted so the groups appear in a stable order)
spending_by_gender = df[['Gender', 'Spending Score (1-100)']].sort_values(by='Gender')
px.box(
    spending_by_gender,
    x='Gender',
    y='Spending Score (1-100)',
    color='Gender',
)

In [None]:
# Box plot of spending score per age (rows sorted by age first)
spending_by_age = df[['Age', 'Spending Score (1-100)']].sort_values(by='Age')
px.box(
    spending_by_age,
    x='Age',
    y='Spending Score (1-100)',
    color='Age',
)

In [None]:
# Box plot of annual income per age (rows sorted by age first)
income_by_age = df[['Age', 'Annual Income (k$)']].sort_values(by='Age')
px.box(
    income_by_age,
    x='Age',
    y='Annual Income (k$)',
    color='Age',
)

## 3.3. Use seaborn<a id='3.3'></a>

In [None]:
# Kernel-density estimate of annual income, one facet column per gender
sns.displot(
    data=df,
    x="Annual Income (k$)",
    col="Gender",
    kind="kde",
)

In [None]:
# Kernel-density estimate of spending score, one facet column per gender
sns.displot(
    data=df,
    x="Spending Score (1-100)",
    col="Gender",
    kind="kde",
)

# 04. Clustering<a id='04'></a>

In [None]:
# Scale the numerical variables.
# NOTE(review): sklearn.preprocessing.normalize defaults to norm='l2', axis=1, i.e. it
# rescales every row (customer) to unit length rather than scaling each column —
# confirm this is the intended preprocessing for the clustering below.
var_num = ['Age', 'Spending Score (1-100)', 'Annual Income (k$)']
df_scaled = pd.DataFrame(normalize(df[var_num]), columns=var_num)

# Put the categorical Gender column back next to the scaled numeric columns
df_scaled_full = pd.concat([df_scaled, df[['Gender']]], axis=1)
print(df_scaled_full.shape)

# gower (disabled experiment, kept for reference)
# df_gower = gower.gower_matrix(df_scaled_full)
# print(df_gower.shape)
# df_gower_mean = df_gower.mean(0)
# df_gower_mean = df_gower_mean.reshape(-1,1)
# print(df_gower_mean.shape)
# df_gower_mean

## 4.1. K-means<a id='4.1'></a>

In [None]:
# Elbow curve for K-means on Age and Spending Score

X = df_scaled[['Age', 'Spending Score (1-100)']].values
inertia = []

k = 30  # largest number of clusters tried

for n in range(1, k + 1):
    # `algorithm='full'` was deprecated in scikit-learn 1.1 and removed in 1.3, so the
    # parameter is left at its default to keep the cell running on old and new releases.
    algorithm = KMeans(n_clusters=n,
                       init='k-means++',
                       n_init=n,  # number of random restarts grows with the cluster count
                       max_iter=300,
                       tol=0.0001,
                       random_state=123)
    algorithm.fit(X)
    # inertia_ is the within-cluster sum of squared distances for this n
    inertia.append(algorithm.inertia_)

# Plot inertia against the number of clusters (markers plus a faint connecting line)
cluster_counts = np.arange(1, k + 1)
plt.figure(1, figsize=(15, 6))
plt.plot(cluster_counts, inertia, 'o')
plt.plot(cluster_counts, inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Elbow curve for K-means on Age, Spending Score and Annual Income
X = df_scaled[['Age', 'Spending Score (1-100)', 'Annual Income (k$)']].values
inertia = []

k = 30  # largest number of clusters tried

for n in range(1, k + 1):
    # `algorithm='full'` was deprecated in scikit-learn 1.1 and removed in 1.3, so the
    # parameter is left at its default to keep the cell running on old and new releases.
    algorithm = KMeans(n_clusters=n,
                       init='k-means++',
                       n_init=n,  # number of random restarts grows with the cluster count
                       max_iter=300,
                       tol=0.0001,
                       random_state=123)
    algorithm.fit(X)
    # inertia_ is the within-cluster sum of squared distances for this n
    inertia.append(algorithm.inertia_)

# Plot inertia against the number of clusters (markers plus a faint connecting line)
cluster_counts = np.arange(1, k + 1)
plt.figure(1, figsize=(15, 6))
plt.plot(cluster_counts, inertia, 'o')
plt.plot(cluster_counts, inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Final K-means model with 7 clusters, fitted on the feature matrix X built in the previous cell.
# `algorithm='full'` (removed in scikit-learn 1.3) is omitted so the default solver is used.
k_cluster = KMeans(n_clusters=7,
                   init='k-means++',
                   n_init=k,  # reuse the elbow-search upper bound as the number of restarts
                   max_iter=300,
                   tol=0.0001,
                   random_state=123)
k_cluster.fit(X)
# Read the centroids from the model fitted here; `algorithm` is the last model of the
# elbow loop (n_clusters == k) and would give centroids of the wrong clustering.
centroids = k_cluster.cluster_centers_

In [None]:
# K-means result: income vs. spending score, coloured by cluster label (as strings, so colours are discrete)
px.scatter(
    data_frame=df_scaled,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    color=k_cluster.labels_.astype(str),
)

In [None]:
# K-means result: age vs. spending score, coloured by cluster label (as strings, so colours are discrete)
px.scatter(
    data_frame=df_scaled,
    x='Age',
    y='Spending Score (1-100)',
    color=k_cluster.labels_.astype(str),
)

## 4.2. Hierarchical clustering<a id='4.2'></a>

In [None]:
# Ward-linkage dendrogram of the scaled numeric features
ward_links = shc.linkage(df_scaled, method='ward')
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(ward_links)

In [None]:
"""
The x-axis contains the samples and y-axis represents the distance between these samples. 
The vertical line with maximum distance is the blue line 
and hence we can decide a threshold of 2 and cut the dendrogram:
"""
# Redraw the Ward dendrogram and mark the chosen cut height with a dashed red line
linkage_matrix = shc.linkage(df_scaled, method='ward')
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(linkage_matrix)
plt.axhline(y=2, color='r', linestyle='--')

In [None]:
# Agglomerative clustering into 5 groups with Ward linkage.
# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in 1.4. Ward
# linkage only supports Euclidean distance, which is the default, so the argument is
# dropped and the cell runs on both old and new releases.
cluster = AgglomerativeClustering(n_clusters=5, linkage='ward')
cluster.fit_predict(df_scaled)

In [None]:
# Hierarchical result: income vs. spending score, coloured by cluster label (as strings, so colours are discrete)
px.scatter(
    data_frame=df_scaled,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    color=cluster.labels_.astype(str),
)

In [None]:
# Hierarchical result: age vs. income, coloured by cluster label (as strings, so colours are discrete)
px.scatter(
    data_frame=df_scaled,
    x='Age',
    y='Annual Income (k$)',
    color=cluster.labels_.astype(str),
)

## 4.3. K-prototypes<a id='4.3'></a>

In [None]:
# K-prototypes on the scaled numeric features plus the one-hot encoded Gender column.
# random_state is fixed so the clustering is reproducible, as in the K-means cells.
kproto = KPrototypes(n_clusters=5, init='Cao', random_state=123)
df_scaled_full_dummy = pd.get_dummies(df_scaled_full, columns=["Gender"])

# The categorical columns are the Gender_* indicators created by get_dummies, which sit
# after the numeric columns. The previous hard-coded [0, 1] pointed at the first two
# columns of the frame (numeric features) instead, so the positions are now looked up by name.
categorical_idx = [pos for pos, col in enumerate(df_scaled_full_dummy.columns)
                   if col.startswith("Gender_")]
clusters = kproto.fit_predict(df_scaled_full_dummy, categorical=categorical_idx)

# join data with labels
labels = pd.DataFrame(clusters)
labeledCustomers = pd.concat((df_scaled_full, labels), axis=1)
labeledCustomers = labeledCustomers.rename({0: 'labels'}, axis=1)

In [None]:
# K-prototypes result: age vs. income, coloured by cluster label (as strings, so colours are discrete)
px.scatter(
    data_frame=df_scaled,
    x='Age',
    y='Annual Income (k$)',
    color=labeledCustomers["labels"].astype(str),
)