In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the mall-customer CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print column dtypes and non-null counts; used below to check for missing values.
data.info()

Hence, there are no null values in the dataset.

# EDA

In [None]:
# Column groups reused by the EDA, correlation, outlier and encoding cells below.
numerical_features = ["Spending Score (1-100)", "Annual Income (k$)", 'Age']
# Columns that are label / one-hot encoded before modelling.
categorical_features = ['Gender']

In [None]:
def plot_num_cat(feature, target, figsize=None):
    """Overlay one KDE curve of a numerical column for each level of a grouping column.

    Reads the notebook-level ``data`` frame and shows the figure immediately.

    Parameters
    ----------
    feature : str
        Numerical column whose density is drawn.
    target : str
        Column whose unique values define the groups (one curve per value).
    figsize : tuple, optional
        Figure size in inches. Defaults to ``(10, 6)`` when omitted.
    """
    # Honour the caller's figsize (it used to be ignored); keep the old size as the default.
    fig = plt.figure(figsize=figsize if figsize is not None else (10, 6))

    # unique() keeps first-appearance order, so the curves and legend labels line up.
    levels = data[target].unique()
    for value in levels:
        sns.kdeplot(data[data[target]==value][feature])

    fig.legend(labels=levels)
    plt.title('{} distribution based on {}'.format(feature, target))
    plt.show()
    
def plot_num_num(feature, target):
    """Scatter two numerical columns of ``data`` with a fitted regression line.

    Parameters
    ----------
    feature : str
        Column plotted on the x-axis.
    target : str
        Column plotted on the y-axis.
    """
    regression_kwargs = dict(x=feature, y=target, data=data, color='#244747')
    sns.regplot(**regression_kwargs)
    plt.show()
    
def plot_cat_cat(feature, target):
    """Show how two categorical columns of ``data`` nest, as a plotly sunburst chart.

    ``feature`` is the first level of the sunburst path and ``target`` the second;
    wedge sizes are the number of rows in each (feature, target) combination.

    Parameters
    ----------
    feature : str
        Column used for the first path level.
    target : str
        Column used for the second path level.
    """
    # size() counts the rows of each group directly; reset_index(name='count') gives the
    # feature / target / count columns the sunburst needs (no set passed to agg()).
    plot_data = data.groupby([feature, target]).size().reset_index(name='count')

    fig = px.sunburst(plot_data, path=[feature, target], values='count',
                      title='Affect of {} on Customer {}'.format(feature, target), width = 600, height = 600)

    fig.update_layout(plot_bgcolor='white', title_font_family='Calibri Black', title_font_color='#221f1f',
                      title_font_size=22, title_x=0.5)
    # Label every wedge with its name and its share of the parent wedge.
    fig.update_traces(textinfo = 'label + percent parent')
    fig.show()

In [None]:
# looking at gender distribution
# Values and labels are taken from the same value_counts() result, so every wedge is
# guaranteed to carry its own label (the labels used to come from a separately ordered
# groupby, which only matched the values by coincidence of ordering).
gender_counts = data['Gender'].value_counts()
plt.figure(figsize=(5, 5))
patches, texts, autotexts = plt.pie(gender_counts, autopct='%1.2f%%',
                                    labels=gender_counts.index,
                                    shadow=True, startangle=90,
                                    # one offset per wedge rather than a fixed pair
                                    explode=[0.05] * len(gender_counts),
                                    colors=['#91b8bd', '#244747']);
plt.setp(texts, size=15);
plt.setp(autotexts, size=15, color='white');
plt.text(-1.65, 1.3, 'Male & Female Distribution', fontfamily='serif', fontsize=17, fontweight='bold');
plt.text(-1.65, 1.15, 'Females are slightly more than Males', fontfamily='serif', fontsize=12);
plt.show()

In [None]:
# Looking at annual income distribution
# displot is a figure-level function and always creates its own figure, so the size is
# passed through height/aspect (5in tall x aspect 2 = 10x5in). A preceding plt.figure()
# only left an empty extra figure behind and never resized the plot.
sns.displot(x="Annual Income (k$)", data=data, kde=True, bins=20, color='#244747',
            height=5, aspect=2);
# Heading placed in data coordinates of the histogram axes.
plt.text(0, 28, 'Annual Income Distribution', fontfamily='serif', fontsize=17, fontweight='bold');

In [None]:
# looking at spending score distribution
# displot builds its own figure, so the intended 10x10in size is passed via height/aspect
# instead of a plt.figure() call that only produced a stray empty figure.
# `palette` is dropped: without a `hue` variable it is ignored, and `color` already sets
# the bar colour.
sns.displot(x="Spending Score (1-100)", data=data, kde=True, bins=20, color='#244747',
            height=10, aspect=1);
# Heading placed in data coordinates of the histogram axes.
plt.text(-20, 23, 'Spending Score Distribution', fontfamily='serif', fontsize=17, fontweight='bold');
plt.show()

In [None]:
# looking at age distribution
plt.figure(figsize=(10,5))
# histplot is axes-level, so it draws into the figure created above.
# `palette` is dropped: without a `hue` variable it is ignored; `color` sets the bar colour.
sns.histplot(x="Age", data=data, kde=True, bins=20, color='#244747');
# Heading placed in data coordinates of the histogram axes.
plt.text(15, 25, 'Age Distribution', fontfamily='serif', fontsize=17, fontweight='bold');

In [None]:
# One KDE comparison per (numerical feature, categorical feature) pair.
for feature, target in itertools.product(numerical_features, categorical_features):
    plot_num_cat(feature, target)

In [None]:
# Heading text, followed by one regression plot for every unordered pair of numerical features.
plt.figtext(0.1, 1, "How Numerical variables relate to each other", fontfamily='serif', fontsize=14, fontweight='bold')
numeric_pairs = itertools.combinations(numerical_features, 2)
for feature, target in numeric_pairs:
    plot_num_num(feature, target)


**Observations**
* Spending Score - Females tend to spend more than males, and most scores lie between 40 and 60
* Annual Income - Males have a slightly higher income, and most customers have an income between 40 and 90 (k$)
* Age - Females are usually older

# CORRELATION

### Label encoding categorical features for correlation

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

In [None]:
# Label-encode every categorical feature on a copy of the raw data (used for the
# correlation plots below) and persist each fitted encoder and its class names.
df = data.copy()
path = '/kaggle/working'

# create directory to save label encoding models
# (done once, up front; exist_ok makes the cell safe to re-run)
encoder_dir = os.path.join(path, "TextEncoding")
os.makedirs(encoder_dir, exist_ok=True)

for feature in categorical_features:
    # perform label encoding
    le = LabelEncoder()
    le.fit(df[feature])

    # save the encoder; the with-block closes the file handle, which used to be left open
    with open(os.path.join(encoder_dir, "le_{}.sav".format(feature)), 'wb') as encoder_file:
        joblib.dump(le, encoder_file)

    # transform training data
    df[feature] = le.transform(df[feature])

    # get classes & remove first column to avoid the dummy variable trap
    columns = [feature + ' ' + str(label) for label in le.classes_][1:]

    # save classes
    with open(os.path.join(encoder_dir, "le_{}_classes.sav".format(feature)), 'wb') as classes_file:
        joblib.dump(columns, classes_file)

### Bivariate Analysis Correlation plot for numerical features

In [None]:
# Spearman rank correlation between the numerical features, rounded for the annotations.
numeric_corr = data[numerical_features].corr(method='spearman').round(2)
plt.figure(figsize=(5, 3))
sns.heatmap(numeric_corr, annot=True, mask=None, cmap='GnBu')
plt.show()

### Bivariate Analysis Correlation plot with the Categorical variables

In [None]:
# Same heatmap, now on the label-encoded frame so the categorical columns are included.
corr_columns = categorical_features + numerical_features
encoded_corr = df[corr_columns].corr(method='spearman').round(2)
plt.figure(figsize=(5, 3))
sns.heatmap(encoded_corr, annot=True, mask=None, cmap='GnBu')
plt.show()

**Observations**
* Older customers usually have a lower Spending Score

# Analyzing features using VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Calculating VIF
# variance_inflation_factor regresses each column on the remaining columns of the matrix
# it is given and does not add an intercept itself. An explicit constant column is
# therefore appended to the design matrix; without it the reported VIFs are inflated.
vif_features = categorical_features + numerical_features
temp = df.dropna()
design = temp[vif_features].assign(const=1.0)

# Only the real features are reported; the constant column is skipped.
vif = pd.DataFrame({
    "variables": vif_features,
    "VIF": [variance_inflation_factor(design.values, i) for i in range(len(vif_features))],
})
print(vif)

All features can be included

# Looking at Outliers

In [None]:
# Reshape the numerical columns to long format (variable / value) so that seaborn can
# draw one box per variable on a shared axis.
NumericData = data[list(numerical_features)]
NumericMelt = pd.melt(NumericData)

plt.figure(figsize=(10,5))
plt.figtext(0.1, 1, "Boxplots for Numerical variables", fontfamily='serif', fontsize=17, fontweight='bold')
box_colors = ['#244247', '#91b8bd', 'gray']
bp = sns.boxplot(data=NumericMelt, x='variable', y='value', palette=box_colors)
bp.set_xticklabels(bp.get_xticklabels(), rotation=0)
plt.show()

In [None]:
# Percentage of outliers present in each variable.
# A value counts as an outlier when it lies more than 1.5 * IQR below Q1 or above Q3.
outlier_percentage = {}
for feature in numerical_features:
    # Quantiles and comparisons do not depend on row order, so no sort is needed.
    tempData = data[feature]
    Q1, Q3 = tempData.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    Lower_range = Q1 - (1.5 * IQR)
    Upper_range = Q3 + (1.5 * IQR)
    # Use the fences computed above instead of recomputing them inline.
    is_outlier = (tempData < Lower_range) | (tempData > Upper_range)
    outlier_percentage[feature] = round((is_outlier.sum()/tempData.shape[0])*100,2)
outlier_percentage

# Handling Categorical Features (Label and One Hot Encoding)

In [None]:
# Encode every categorical feature of a fresh copy of the raw data:
#   * all features are label encoded (encoder and class names are persisted);
#   * features with more than two classes are additionally one-hot encoded, dropping the
#     first column to avoid the dummy variable trap.
# The encoded columns are collected in `encodedData`.
df = data.copy()
path = '/kaggle/working'

# create directory to save the encoding models
# (done once, up front; exist_ok makes the cell safe to re-run)
encoder_dir = os.path.join(path, "TextEncoding")
os.makedirs(encoder_dir, exist_ok=True)

encoded_frames = []
for feature in categorical_features:

    # perform label encoding
    le = LabelEncoder()
    le.fit(df[feature])
    # save the encoder; with-blocks close the file handles, which used to be left open
    with open(os.path.join(encoder_dir, "le_{}.sav".format(feature)), 'wb') as encoder_file:
        joblib.dump(le, encoder_file)

    # transform training data
    df[feature] = le.transform(df[feature])

    # get classes & remove first column to avoid the dummy variable trap
    columns = [feature + ' ' + str(label) for label in le.classes_][1:]

    # save classes (no need to load them straight back: `columns` is already in memory)
    with open(os.path.join(encoder_dir, "le_{}_classes.sav".format(feature)), 'wb') as classes_file:
        joblib.dump(columns, classes_file)

    if len(le.classes_)>2:
        # `sparse` was renamed to `sparse_output` in newer scikit-learn and later removed;
        # try the new keyword first and fall back to the old one on older releases.
        try:
            ohe = OneHotEncoder(sparse_output=False)
        except TypeError:
            ohe = OneHotEncoder(sparse=False)

        # perform one hot encoding
        ohe.fit(df[[feature]])
        # save the encoder
        with open(os.path.join(encoder_dir, "ohe_{}.sav".format(feature)), 'wb') as ohe_file:
            joblib.dump(ohe, ohe_file)

        # transform training data, removing the first encoded column to avoid the
        # dummy variable trap; reuse df's index so the column-wise concat below aligns
        # row by row even when df does not have a default RangeIndex
        tempData = pd.DataFrame(ohe.transform(df[[feature]])[:, 1:],
                                columns=columns, index=df.index)
    else:
        # two classes or fewer: the label-encoded column is used as-is
        tempData = df[[feature]]

    encoded_frames.append(tempData)

# create dataframe with all the encoded categorical features in a single concat
# (an empty frame when there are no categorical features at all)
if encoded_frames:
    encodedData = pd.concat(encoded_frames, axis=1)
else:
    encodedData = pd.DataFrame(index=df.index)

In [None]:
# Build the modelling frame: numerical columns side by side with the encoded categoricals.
numeric_part = df[numerical_features]
df = pd.concat([numeric_part, encodedData], axis=1)
df.info()

# Training ML Model

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, rand_score
from sklearn.mixture import GaussianMixture
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

In [None]:
# Rescaling (currently disabled). Note: StandardScaler standardises to zero mean and unit variance, not to [0,1]
#scaler = StandardScaler()
#scaler.fit(train_data[feature_cols])
#train_data[feature_cols] = scaler.transform(train_data[feature_cols])

# Model 1: KMeans

In [None]:
# First K-Means run: use every column of df except Gender and Age.
excluded_cols = ('Gender', 'Age')
feature_cols = [col for col in df.columns if col not in excluded_cols]
train_data = df.copy()[feature_cols]
print('features used- ', feature_cols)

In [None]:
# Using ELBOW Method to figure out number of clusters
# Fit K-Means for each candidate k and record inertia and silhouette score.
cluster_range = range(2, 11)
inertia=[]
silhouetteScore = []
for i in cluster_range:
    # n_init is pinned explicitly: its default differs between scikit-learn releases,
    # which would otherwise make these curves depend on the installed version.
    kmeans = KMeans(n_clusters= i, init='k-means++', n_init=10, random_state=0)
    kmeans.fit(train_data)
    inertia.append(kmeans.inertia_)
    silhouetteScore.append(silhouette_score(train_data, kmeans.predict(train_data)))

# Heading, guidance text and a hand-made colour legend, placed in figure coordinates.
fig, ax1 = plt.subplots(figsize=(8, 5))
fig.text(0.1, 1, 'Spending Score (1-100) and Annual Income (k$)', fontfamily='serif', fontsize=12, fontweight='bold')
fig.text(0.1, 0.95, 'We want to select a point where Inertia is low & Silhouette Score is high, and the number of clusters is not overwhelming for the business.',
         fontfamily='serif',fontsize=10)
fig.text(1.4, 1, 'Inertia', fontweight="bold", fontfamily='serif', fontsize=15, color='#244747')
fig.text(1.51, 1, "|", fontweight="bold", fontfamily='serif', fontsize=15, color='black')
fig.text(1.53, 1, 'Silhouette Score', fontweight="bold", fontfamily='serif', fontsize=15, color='#91b8bd')

# Inertia on the left axis (line plus point markers).
ax1.plot(cluster_range, inertia, '-', color='#244747', linewidth=5)
ax1.plot(cluster_range, inertia, 'o', color='#91b8bd')
ax1.set_ylabel('Inertia')

# Silhouette score on a second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
ax2.plot(cluster_range, silhouetteScore, '-', color='#91b8bd', linewidth=5)
ax2.plot(cluster_range, silhouetteScore, 'o', color='#244747', alpha=0.8)
ax2.set_ylabel('Silhouette Score')

plt.xlabel('Number of clusters')
plt.show()

The elbow is at 5 and the Silhouette Score is also highest at 5, hence n_clusters=5

In [None]:
# Final K-Means model on the selected features, with the cluster count chosen above.
# n_init is set explicitly because its default differs between scikit-learn releases;
# pinning it keeps the clustering independent of the installed version.
model = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=111, algorithm='elkan')
y = model.fit_predict(train_data[feature_cols])

In [None]:
# Visualizing all the clusters
# Scatter of the first two selected features, one colour per cluster label.
plt.figure(figsize=(10,5))
sns.scatterplot(x=train_data[feature_cols[0]], y=train_data[feature_cols[1]],
                hue=y, palette=sns.color_palette('hls', len(np.unique(y))), s=100)
# The title has no placeholders, so the former .format(...) call did nothing and is removed.
plt.title('Cluster of Customers', size=15, pad=10)
plt.xlabel(feature_cols[0], size=12)
plt.ylabel(feature_cols[1], size=12)
plt.legend(loc=0, bbox_to_anchor=[1,1])
plt.show()

# Model Interpretation 
* Cluster 0 -> Earning low but Spending is high
* Cluster 1 -> average in terms of Earning and Spending 
* Cluster 2 -> Earning high and also Spending high
* Cluster 3 -> Earning high but Spending less
* Cluster 4 -> Earning less and Spending less

In [None]:
# Second K-Means run: keep every column of df except Gender (Age is now included).
feature_cols = list(filter(lambda col: col not in ['Gender'], df.columns))
train_data = df.copy()[feature_cols]
print('features used- ', feature_cols)

In [None]:
# Using ELBOW Method to figure out number of clusters
# Fit K-Means for each candidate k and record inertia and silhouette score.
cluster_range = range(2, 11)
inertia=[]
silhouetteScore = []
for i in cluster_range:
    # n_init is pinned explicitly: its default differs between scikit-learn releases,
    # which would otherwise make these curves depend on the installed version.
    kmeans = KMeans(n_clusters= i, init='k-means++', n_init=10, random_state=0)
    kmeans.fit(train_data)
    inertia.append(kmeans.inertia_)
    silhouetteScore.append(silhouette_score(train_data, kmeans.predict(train_data)))

# Heading, guidance text and a hand-made colour legend, placed in figure coordinates.
fig, ax1 = plt.subplots(figsize=(8, 5))
# The heading is built from the features actually used; the previous hard-coded text
# named only two features although this run clusters on more.
fig.text(0.1, 1, ', '.join(feature_cols), fontfamily='serif', fontsize=12, fontweight='bold')
fig.text(0.1, 0.95, 'We want to select a point where Inertia is low & Silhouette Score is high, and the number of clusters is not overwhelming for the business.',
         fontfamily='serif',fontsize=10)
fig.text(1.4, 1, 'Inertia', fontweight="bold", fontfamily='serif', fontsize=15, color='#244747')
fig.text(1.51, 1, "|", fontweight="bold", fontfamily='serif', fontsize=15, color='black')
fig.text(1.53, 1, 'Silhouette Score', fontweight="bold", fontfamily='serif', fontsize=15, color='#91b8bd')

# Inertia on the left axis (line plus point markers).
ax1.plot(cluster_range, inertia, '-', color='#244747', linewidth=5)
ax1.plot(cluster_range, inertia, 'o', color='#91b8bd')
ax1.set_ylabel('Inertia')

# Silhouette score on a second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
ax2.plot(cluster_range, silhouetteScore, '-', color='#91b8bd', linewidth=5)
ax2.plot(cluster_range, silhouetteScore, 'o', color='#244747', alpha=0.8)
ax2.set_ylabel('Silhouette Score')

plt.xlabel('Number of clusters')
plt.show()

The elbow is at 6 and the Silhouette Score is also highest at 6, hence n_clusters=6

In [None]:
# Final K-Means model on the selected features, with the cluster count chosen above.
# n_init is set explicitly because its default differs between scikit-learn releases;
# pinning it keeps the clustering independent of the installed version.
model = KMeans(n_clusters=6, init='k-means++', n_init=10, random_state=19, algorithm='elkan')
y = model.fit_predict(train_data)

In [None]:
# 3-D view of the clusters.
# Cluster ids are labels, not quantities: they are passed as strings so plotly uses a
# discrete colour per cluster instead of a continuous colour scale, and they are no
# longer mapped to marker size (which suggested that higher-numbered clusters are "bigger").
fig = px.scatter_3d(train_data, x="Annual Income (k$)", y="Spending Score (1-100)", z="Age",
                    color=y.astype(str), labels={'color': 'cluster'}, opacity=0.8)
fig.show()

## Interpreting model

In [None]:
# Attach the cluster labels, then summarise every feature per cluster by mean and median.
train_data = train_data.assign(cluster=y)
summary_stats = ['mean', 'median']
train_data.groupby(['cluster']).agg(summary_stats).reset_index()

* Cluster 0 - Moderate spending score, Moderate income, young age - Valuable
* Cluster 1 - Low spending score, High income, moderate age - Targets
* Cluster 2 - High spending score, High income, young age - Most Valuable
* Cluster 3 - Moderate spending score, Moderate income, old age - Less Valuable
* Cluster 4 - High spending score, low income, young age - More Valuable
* Cluster 5 - Low spending score, low income, moderate age - Least Valuable

In [None]:
# Copy the cluster labels onto the original frame (aligned on the row index) so the
# plotting helpers, which read `data`, can break the clusters down by Gender.
data = data.assign(cluster=train_data['cluster'])
plot_cat_cat(feature='cluster', target='Gender')

In [None]:
# Per-cluster density of each numerical feature.
cluster_profile_features = ('Age', 'Annual Income (k$)', 'Spending Score (1-100)')
for feature in cluster_profile_features:
    plot_num_cat(feature, target='cluster')

**Observations-**
* Cluster 0 - Valuable
    * Age- 10 to 40
    * Spending Score- 30 to 60
    * Annual Income- 30 to 70
* Cluster 1 - Targets
    * Age- 30 to 60
    * Spending Score- 0 to 30
    * Annual Income- 60 to 110
* Cluster 2 - Most Valuable
    * Age- 20 to 45
    * Spending Score- 60 to 100
    * Annual Income- 60 to 110
* Cluster 3 - Less Valuable
    * Age- 40 to 70
    * Spending Score- 30 to 60
    * Annual Income- 30 to 70
* Cluster 4 - More Valuable
    * Age- 10 to 40
    * Spending Score- 60 to 100
    * Annual Income- 10 to 40
* Cluster 5 - Least Valuable
    * Age- 20 to 65
    * Spending Score- 0 to 40
    * Annual Income- 10 to 40

# Model 2: Gaussian Mixture

In [None]:
# Gaussian Mixture input: all columns of df apart from Gender.
feature_cols = [col for col in df.columns.tolist() if col not in ['Gender']]
train_data = df.copy()[feature_cols]
print('features used- ', feature_cols)

In [None]:
# Number of clusters is determined using elbow method above
# random_state is fixed so that the mixture initialisation, and therefore the cluster
# assignment, is reproducible across runs.
model = GaussianMixture(n_components=6, random_state=0)
y = model.fit_predict(train_data[feature_cols])

In [None]:
# Visualizing all the clusters
# Scatter of the first two selected features, one colour per mixture component.
plt.figure(figsize=(10,5))
# The palette size follows the number of labels actually produced; the previous fixed
# 5-colour palette did not match the 6-component model.
sns.scatterplot(x=train_data[feature_cols[0]], y=train_data[feature_cols[1]],
                hue=y, palette=sns.color_palette('hls', len(np.unique(y))), s=100)
# The title has no placeholders, so the former .format(...) call did nothing and is removed.
plt.title('Cluster of Customers', size=15, pad=10)
plt.xlabel(feature_cols[0], size=12)
plt.ylabel(feature_cols[1], size=12)
plt.legend(loc=0, bbox_to_anchor=[1,1])
plt.show()

# Model 3: Hierarchical Model

In [None]:
# Hierarchical clustering input: drop Gender and Age, keep the remaining columns of df.
dropped = {'Gender', 'Age'}
feature_cols = [name for name in df.columns if name not in dropped]
train_data = df.copy()[feature_cols]
print('features used- ', feature_cols)

In [None]:
# Ward-linkage dendrogram of the selected features with two candidate cut heights.
plt.figure(figsize = (12, 5))
plt.text(5, 465, 'Spending Score (1-100) and Annual Income (k$)', fontfamily='serif', fontsize=15, fontweight='bold')
plt.text(5, 440, 'The no. of clusters is the no. of vertical lines in the dendrogram cut by a horizontal line that can transverse the maximum distance vertically without intersecting a cluster.',
         fontfamily='serif',fontsize=12)
dendo = dendrogram(linkage(train_data[feature_cols], method = 'ward'))
# axhline draws a horizontal line across the whole axes, so the cut lines no longer
# depend on a hard-coded number of points matching the width of the dendrogram.
for cut_height in (115, 240):
    plt.axhline(y=cut_height, color='r')
plt.text(5, -50, 'Here, we can have either 5 clusters or 3 clusters',
         fontfamily='serif',fontsize=12)
plt.show()

In [None]:
# Agglomerative clustering with the cluster count read off the dendrogram.
# The `affinity` keyword is not passed: it was removed in newer scikit-learn releases
# (replaced by `metric`), and ward linkage only supports Euclidean distance, which is
# already the default. This form works on old and new versions alike.
model = AgglomerativeClustering(n_clusters = 5, linkage='ward')
y = model.fit_predict(train_data[feature_cols])

In [None]:
# Visualizing all the clusters
# Scatter of the first two selected features, one colour per cluster label.
plt.figure(figsize=(10,5))
# Palette size follows the labels actually produced, so it stays correct if n_clusters changes.
sns.scatterplot(x=train_data[feature_cols[0]], y=train_data[feature_cols[1]],
                hue=y, palette=sns.color_palette('hls', len(np.unique(y))), s=100)
# The title has no placeholders, so the former .format(...) call did nothing and is removed.
plt.title('Cluster of Customers', size=15, pad=10)
plt.xlabel(feature_cols[0], size=12)
plt.ylabel(feature_cols[1], size=12)
plt.legend(loc=0, bbox_to_anchor=[1,1])
plt.show()

# Model 4: DBSCAN

In [None]:
# DBSCAN input: every column of df other than Gender and Age, in original column order.
feature_cols = []
for column_name in df.columns:
    if column_name not in ['Gender', 'Age']:
        feature_cols.append(column_name)
train_data = df.copy()[feature_cols]
print('features used- ', feature_cols)

In [None]:
from sklearn.neighbors import NearestNeighbors
# finding nearest points distance for every row in data
# (two neighbours are requested per row; the second distance column is the one plotted)
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(train_data)
distances, indices = nbrs.kneighbors(train_data)

# Plotting K-distance Graph
# Only the second column is plotted, so only that column is sorted.
distances = np.sort(distances[:, 1])
plt.figure(figsize=(10,5))
# Heading typo fixed ("TK-distance" -> "K-distance").
plt.text(-10, 17, 'K-distance Graph', fontfamily='serif', fontsize=15, fontweight='bold')
plt.text(-10, 16, 'The optimum value of epsilon is at the point of maximum curvature in the K-Distance Graph, which is 6 in this case.',
        fontfamily='serif', fontsize=12)
plt.plot(distances)
plt.xlabel('Data Points sorted by distance', fontsize=14)
plt.ylabel('Epsilon', fontsize=14)
plt.show()

In [None]:
# DBSCAN with the epsilon taken from the K-distance graph and a minimum of 3 samples per core point.
dbscan_params = dict(eps=6, min_samples=3)
model = DBSCAN(**dbscan_params)
y = model.fit_predict(train_data)

In [None]:
# Visualizing all the clusters
# Scatter of the first two selected features, one colour per DBSCAN label.
plt.figure(figsize=(10,5))
sns.scatterplot(x=train_data[feature_cols[0]], y=train_data[feature_cols[1]],
                hue=y, palette=sns.color_palette('hls', len(np.unique(y))), s=100)
# The title has no placeholders, so the former .format(...) call did nothing and is removed.
plt.title('Cluster of Customers', size=15, pad=10)
plt.xlabel(feature_cols[0], size=12)
plt.ylabel(feature_cols[1], size=12)
plt.legend(loc=0, bbox_to_anchor=[1,1])
plt.show()