In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input directory and print its full path, one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Pre_Processing Stages #

In [None]:
#Load the Mall Customers CSV from the Kaggle input directory and preview its first rows
import pandas as pd
dataset = pd.read_csv('/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')
dataset.head()

In [None]:
# Print pandas' concise summary of the DataFrame
dataset.info()

In [None]:
# Show the descriptive statistics produced by DataFrame.describe()
dataset.describe()

In [None]:
# For each column, report whether it contains at least one missing value
dataset.isnull().any()

In [None]:
#Clean up the column names and drop the unused identifier column
# Trim each header and replace spaces with underscores so columns are easy to reference.
dataset.columns = [header.strip().replace(' ', '_') for header in dataset.columns]
dataset = (
    dataset
    # Shorten the two headers that still carry their unit/range suffix.
    .rename(columns={'Annual_Income_(k$)': 'Annual_Income', 'Spending_Score_(1-100)': 'Spending_Score'})
    # errors='ignore' keeps this cell re-runnable: on a second run 'CustomerID' is
    # already gone and a plain drop() would raise KeyError.
    .drop(columns='CustomerID', errors='ignore')
)
dataset.head()

In [None]:
#Explore data distribution for numerical columns
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning from here on (this also hides library deprecation notices)
warnings.filterwarnings('ignore')
# Use seaborn's 'white' style for the plots that follow
sns.set(style='white')
# Clear the current matplotlib figure before plotting
plt.clf()

# Make a function to create plots
def create_plot(columns, data=None):
    """Draw a box plot and a distribution plot side by side for each column.

    Parameters
    ----------
    columns : iterable of str
        Names of the numerical columns to plot; one subplot row is created per name.
    data : pandas.DataFrame, optional
        Frame that holds the columns. Defaults to the notebook-level ``dataset``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``. Nothing is drawn when
        ``columns`` is empty.
    """
    columns = list(columns)
    if not columns:
        # plt.subplots() cannot build a grid with zero rows, so there is nothing to draw.
        return None
    if data is None:
        data = dataset

    n_rows = len(columns)
    # One row per requested column instead of a fixed 3-row grid (which raised IndexError
    # for more than three columns). The height scales so three columns still give (9, 8);
    # squeeze=False keeps `axs` two-dimensional even for a single column.
    fig, axs = plt.subplots(n_rows, 2, figsize=(9, 8 * n_rows / 3), squeeze=False)
    for row, col in enumerate(columns):
        values = data[col]
        # Pass the values by keyword: newer seaborn releases accept only `data` positionally.
        sns.boxplot(x=values, ax=axs[row][0])
        # histplot(kde=True) is the supported replacement for the deprecated distplot().
        sns.histplot(x=values, kde=True, stat='density', ax=axs[row][1])
        axs[row][0].set_title('mean = %.2f\n median = %.2f\n std = %.2f'%(values.mean(), values.median(), values.std()))
    plt.tight_layout()
    plt.show()

# Plot the three numerical columns of the dataset with create_plot
numerical_columns = ['Age','Annual_Income','Spending_Score']
create_plot(numerical_columns)


From these three pairs of plots, we can conclude:
* The average age of customers who visit the mall is 38 years, the average annual income is 61.50 k$, and the average spending score is 50.20
* There are outliers in the 'Annual Income' column
* None of the three numeric columns shows a significant skew (their distributions are roughly normal)

Now, we will treat the outliers

In [None]:
#Handle outliers in the Annual_Income column with the interquartile range (IQR) method
# Start from the first and third quartiles of the income values.
annual_income = dataset['Annual_Income']
Q1, Q3 = (annual_income.quantile(q) for q in (0.25, 0.75))

print('Q1 = {} and Q3 = {}'.format(Q1, Q3))

In [None]:
# Values further than 1.5 * IQR outside the quartiles are treated as outliers.
IQR = Q3 - Q1
fence_width = 1.5 * IQR
outlier_lowerLimit = Q1 - fence_width
outlier_upperLimit = Q3 + fence_width
print('outlier_lowerLimit = {} dan outlier_upperLimit = {}'.format(outlier_lowerLimit, outlier_upperLimit))
# A negative lower limit means no value can fall below it (assuming incomes are never
# negative), so outliers can only occur on the high side.

In [None]:
# Select the rows whose income lies outside the two outlier limits.
income_values = dataset['Annual_Income']
above_upper = income_values > outlier_upperLimit
below_lower = income_values < outlier_lowerLimit
outliers = dataset[above_upper | below_lower]
outliers

> Because there are only 2 outliers, I handle them by simply deleting those two rows

In [None]:
print('Initial dataset size = ', dataset.shape)
# Build the outlier mask once, then keep only the rows inside the limits.
is_outlier = (dataset['Annual_Income'] > outlier_upperLimit) | (dataset['Annual_Income'] < outlier_lowerLimit)
# .copy() makes dataset_final an independent frame, so the later in-place column
# assignment (Gender encoding) does not write into a slice of `dataset`.
dataset_final = dataset.loc[~is_outlier].copy()
print('Final dataset size = ', dataset_final.shape)

In [None]:
#Save the outlier-free dataset so it can be reloaded after the modeling stage
# NOTE: the DataFrame index is written as well (no index=False); the cell that reloads
# this file drops it again as 'Unnamed: 0'.
dataset_final.to_csv('Mall Customers Drop.csv')

In [None]:
#Visualize how many customers of each gender visited the Mall
fig, axs = plt.subplots(1, 1, figsize=(9, 7))

#Create a plot
# Bars are ordered from the most to the least frequent gender.
gender_order = dataset_final['Gender'].value_counts().head(10).index
# Use the data=/x= keywords: newer seaborn releases no longer take the plotted
# variable as the first positional argument.
sns.countplot(data=dataset_final, x='Gender', order=gender_order, ax=axs)
axs.set_title('Countplot Gender', fontsize = 20)
plt.xticks(rotation = 0)
#Write each bar's count just above the bar
for bar in axs.patches:
    axs.annotate(format(bar.get_height(), '.0f'),
                 (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                 ha = 'center',
                 va = 'center',
                 xytext = (0, 10),
                 textcoords = 'offset points')

# Setting Plot
# The counts are annotated on the bars, so the spines and the y axis are hidden.
sns.despine(right=True, top = True, left = True)
axs.yaxis.set_visible(False)
# (The former plt.setp(axs) call set nothing; it only printed the list of Axes properties.)
plt.tight_layout()
plt.show()

> From the gender column countplot, it can be seen that the dominant gender of Mall visitors is Female

In [None]:
#Encode category column
from sklearn.preprocessing import LabelEncoder

# Work on an explicit copy: dataset_final was produced by boolean-filtering `dataset`,
# and assigning a column onto such a slice triggers pandas' SettingWithCopyWarning.
dataset_final = dataset_final.copy()

category_column = ['Gender']
# Keep each fitted encoder so the integer codes can be mapped back to the original
# labels later with inverse_transform().
label_encoders = {}
for col in category_column:
    encoder = LabelEncoder()
    dataset_final[col] = encoder.fit_transform(dataset_final[col])
    label_encoders[col] = encoder

# Print the data final
print(dataset_final)

In [None]:
#Save the encoded dataset for the modeling stage
# NOTE: the DataFrame index is written as well (no index=False); the modeling stage
# drops it again as 'Unnamed: 0' after reloading.
dataset_final.to_csv('Mall Customers Clean.csv')

# Modelling Process Stage
We use the K-Prototypes clustering method (a K-Means-style algorithm that handles both numerical and categorical columns)

In [None]:
#Reload the encoded dataset that the preprocessing stage saved to disk
data = pd.read_csv('Mall Customers Clean.csv')
data.head()

In [None]:
# Remove the saved index column, which read_csv() loads as 'Unnamed: 0'.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data = data.drop(columns = 'Unnamed: 0', errors = 'ignore')
data.head()

In [None]:
#Finding the best k value with Elbow Plot
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes #for both category and numeric clustering
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='white')

# Position of the categorical feature. 'Gender' is the only categorical column, so
# resolve its index by name instead of hard-coding 0.
categorical_idx = [data.columns.get_loc('Gender')]

# Iteration to get cost value
cost = {}
for k in range(1,10):
    kproto = KPrototypes(n_clusters = k,random_state=75)
    # Only the fitted cost is needed here, so fit() replaces fit_predict(),
    # whose predicted labels were discarded.
    kproto.fit(data, categorical=categorical_idx)
    cost[k]= kproto.cost_

# Elbow Plot Visualization
ax = sns.pointplot(x=list(cost.keys()), y=list(cost.values()), color = 'red')
ax.set(xlabel='Number of clusters (k)', ylabel='Cost')
plt.title('Elbow Plot\n', fontsize = 15)
plt.show()


> We can see that the optimum **k** value is **6**

In [None]:
# Fit the clustering model with k = 6
import pickle

kproto = KPrototypes(n_clusters=6, random_state = 75)
# 'Gender' is the categorical feature; its column position is looked up by name.
kproto = kproto.fit(data, categorical=[data.columns.get_loc('Gender')])

# Save Model
# The context manager closes the file handle even if pickling fails.
with open('cluster.pkl', 'wb') as model_file:
    pickle.dump(kproto, model_file)

In [None]:
# Determine each customer's cluster
# 'Gender' is the categorical feature; its column position is looked up by name
# rather than hard-coded.
clusters = kproto.predict(data, categorical=[data.columns.get_loc('Gender')])
print('Customer Segmentation: {}\n'.format(clusters))

> Note that the labels take 6 distinct values (0-5), one for each of the 6 clusters

In [None]:
# Reload the outlier-free dataset saved earlier and discard the saved index column
df = (
    pd.read_csv('Mall Customers Drop.csv')
    .drop(columns='Unnamed: 0')
)
df.head()

In [None]:
# Attach each customer's cluster label to a copy of the outlier-free data
df_final = df.assign(cluster=clusters)
print(df_final.head())

In [None]:
# Print the customers that belong to each of the six clusters in turn
for cluster_id in range(6):
    members = df_final.loc[df_final['cluster'] == cluster_id]
    print('\nCustomer cluster: {}\n'.format(cluster_id))
    print(members)


In [None]:
# Box plot of every numerical feature, split by cluster
columns_numeric = ['Age','Annual_Income','Spending_Score']

for feature in columns_numeric:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df_final, x='cluster', y=feature, ax=ax)
    ax.set_title('\nBox Plot {}\n'.format(feature), fontsize=15)
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white')

# Count plot of every categorical feature, split by cluster
columns_categorical = ['Gender']

for feature in columns_categorical:
    fig, ax = plt.subplots(figsize=(9, 7))
    sns.countplot(data=df_final, x='cluster', hue=feature, ax=ax)
    ax.set_title('\nCount Plot {}\n'.format(feature), fontsize=15)
    ax.legend(loc="upper center")
    # Write each bar's count just above the bar
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate(format(bar_height, '.0f'),
                    (bar_centre, bar_height),
                    ha='center',
                    va='center',
                    xytext=(0, 10),
                    textcoords='offset points')

    # The counts are annotated on the bars, so the spines and the y axis are hidden
    sns.despine(right=True, top=True, left=True)
    ax.axes.yaxis.set_visible(False)
    plt.show()

From the results of observations on the boxplot and countplot of each cluster above, the clustering can be made as follows:

* Cluster 0: Silver Society Member, the content of this cluster is the general public with an average annual income of 25 k (dollars). This cluster has an age range of about 20 - 68 years with an average of 45 years where women dominate. In addition, this cluster has a low spending_score (5-40).
* Cluster 1: Gold Society Member, the content of this cluster is the elderly general public with an average annual income of 55 k (dollars). This cluster has an age range of about 45 - 70 years with an average of 55 years where women dominate. In addition, this cluster has a moderate spending_score (30-60).
* Cluster 2: Diamond Student Member, the contents of this cluster are students with an average annual income of 25 k (dollars). This cluster has an age range of about 20 - 35 years with an average of 22 years where women dominate. In addition, this cluster has a high spending_score (60-98).
* Cluster 3: Silver Entrepreneur Member, the contents of this cluster are entrepreneurs with an average annual income of 80 k (dollars). This cluster has an age range of about 20 - 55 years with an average of 43 years where males predominate. In addition, this cluster has a low spending_score (5-40).
* Cluster 4: Gold Young Entrepreneur Member, the contents of this cluster are young entrepreneurs with an average annual income of 60 k (dollars). This cluster has an age range of about 20 - 55 years with an average of 43 years where women dominate. In addition, this cluster has a moderate spending_score (30-60).
* Cluster 5: Diamond Young Entrepreneur Member, the contents of this cluster are young entrepreneurs with an average annual income of 75 k (dollars). This cluster has an age range of about 25 - 37 years with an average of 31 years where women dominate. In addition, this cluster has a high spending_score (65-90).

Note: To simplify naming, the cluster level is based more on the spending_score value (silver = low, gold = medium, diamond = high)

In [None]:
# Translate each numeric cluster id into its segment name
segment_names = {
    0: 'Silver Society Member',
    1: 'Gold Society Member',
    2: 'Diamond Student Member',
    3: 'Silver Enterpreneur Member',
    4: 'Gold Young Enterpreneur Member',
    5: 'Diamond Young Enterpreneur Member',
}
df_final['Segmentation'] = df_final['cluster'].map(segment_names)

print(df_final.head())

# END
# Thank you. May be useful