## Introduction and Motivation


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabasz_score
import matplotlib.cm as cm

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the Mall_Customers CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Show the column labels; the cells below select columns by these names.
df.columns

In [None]:
# One-hot encode Gender into two indicator columns.
# get_dummies orders its output columns by sorted category value, and the
# assignment below maps them onto 'female' and 'male' by position.
# NOTE(review): this assumes Gender has exactly two categories that sort as
# female-then-male (e.g. 'Female', 'Male') — confirm against the data.
# dtype=int pins the indicators to integers; the default dummy dtype differs
# between pandas versions (bool on pandas 2.x).
df[['female', 'male']] = pd.get_dummies(df['Gender'], dtype=int)
df.head()

In [None]:
# Bar chart of the number of customers in each Gender category.
# countplot tallies the rows per category; the previous barplot call plotted a
# constant height of 2 and passed its vectors positionally, which newer
# seaborn releases reject.
sns.countplot(x=df['Gender']);
# Headcounts taken from the indicator columns, displayed as the cell output.
df['male'].sum(), df['female'].sum()

In [None]:
# One row per numeric feature: density histogram with KDE on the left,
# violin plot on the right.
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
f, axes = plt.subplots(len(numeric_columns), 2, figsize=(5, 10))
for row, column in enumerate(numeric_columns):
    # histplot(kde=True, stat='density') replaces the deprecated distplot.
    sns.histplot(df[column], kde=True, stat='density', ax=axes[row, 0])
    # Pass the vector as x= so the violin is drawn horizontally regardless of
    # how the seaborn version interprets the first positional argument.
    sns.violinplot(x=df[column], ax=axes[row, 1])
# Keep axis labels of neighbouring panels from overlapping in the narrow figure.
f.tight_layout()

In [None]:
# Pairwise relationships between the three numeric features.
# The trailing semicolon suppresses the repr of the returned object.
sns.pairplot(df[['Age','Annual Income (k$)','Spending Score (1-100)']]);

# Implementing and evaluating K-means clustering
* The features used for k-means clustering are 'Age', 'Annual Income (k$)', 'Spending Score (1-100)' and 'female'.
* Inertia is computed for each fit to show the effect of the number of clusters.
* Evaluating 'k' with the elbow method
* Evaluating 'k' with silhouette analysis

In [None]:
# Feature matrix for clustering: the three numeric columns plus the 'female'
# indicator column derived from Gender in an earlier cell.
X = df[['Age', 'Annual Income (k$)','Spending Score (1-100)','female']]

In [None]:
# Fit K-means for k = 1..14 and keep each fit's inertia and label assignment
# so the elbow and the cluster-quality scores can be examined afterwards.
# Both lists are positional: index i holds the result for k = i + 1.
inertia = []
labels = []
for n_clus in range(1, 15):
    # A fixed random_state makes the centroid initialisation (and therefore
    # every downstream score and plot) reproducible across runs; n_init is set
    # explicitly because its default differs between scikit-learn versions.
    estimator = KMeans(n_clusters=n_clus, n_init=10, random_state=42)
    estimator.fit(X)
    labels.append(estimator.labels_)
    inertia.append(estimator.inertia_)

In [None]:
# Elbow plot: inertia against the number of clusters.
# inertia[i] belongs to k = i + 1, so the x axis starts at 1.
# x/y are passed by keyword because newer seaborn releases reject positional
# data vectors.
sns.lineplot(x=list(range(1, len(inertia) + 1)), y=inertia)
plt.xlabel('n_cluster')
plt.ylabel('Inertia');

According to the plot, the sum of squared distances drops significantly from k = 2 onward and becomes flatter at around k = 6; however, the elbow is still not obvious.

In [None]:
# Score each clustering with the average silhouette coefficient and the
# Calinski-Harabasz index.
# labels[i] holds the fit for k = i + 1, so the slice labels[2:] begins with
# the k = 3 fit; the counter therefore starts at 3 so that every printed k
# matches the clustering actually being scored.
silhouette_avg = []
calinski_score = []
for n_clusters, label_set in enumerate(labels[2:], start=3):
    # Compute each score once and reuse it for both the list and the printout.
    current_silhouette = silhouette_score(X, label_set)
    current_calinski = calinski_harabasz_score(X, label_set)
    silhouette_avg.append(current_silhouette)
    calinski_score.append(current_calinski)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", current_silhouette, "The average calinski harabasz score is :", current_calinski)

## Silhouette plots for the labels at each *k* (the number of clusters)
reference: 
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

In [None]:
# Draw one silhouette plot per candidate number of clusters (k = 2..10).
for n_cluster in range(2, 11):
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(7, 7)

    # The silhouette coefficient can range from -1 to 1; the x axis is
    # restricted to [-0.1, 1] here.
    ax1.set_xlim([-0.1, 1])
    # The (n_cluster + 1) * 10 is for inserting blank space between the
    # silhouette plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_cluster + 1) * 10])

    # labels is positional with labels[i] holding the fit for k = i + 1,
    # so the assignment for k = n_cluster lives at index n_cluster - 1.
    cluster_labels = labels[n_cluster - 1]
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_cluster):
        # Collect the silhouette values of the samples in cluster i, sorted
        # ascending so the filled region forms the usual knife shape.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_cluster)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for the next cluster, leaving a 10-row gap
        y_lower = y_upper + 10

    # State k in the title so the nine figures can be told apart.
    ax1.set_title("The silhouette plot for the various clusters (n_cluster = %d)." % n_cluster)
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line at the average silhouette score of this clustering.
    # The average is the mean of the per-sample values computed above, so it
    # always refers to the same labels that are being plotted.
    ax1.axvline(x=sample_silhouette_values.mean(), color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.show()

## Plot Calinski-Harabasz scores for various numbers of clusters

In [None]:
# calinski_score is built from labels[2:], and labels[i] holds the fit for
# k = i + 1, so the first score belongs to k = 3. Derive the x axis from that
# starting point and the number of scores instead of a hard-coded range.
cluster_counts = list(range(3, 3 + len(calinski_score)))
# x/y are passed by keyword because newer seaborn releases reject positional
# data vectors.
sns.lineplot(x=cluster_counts, y=calinski_score)
plt.title('Calinski Harabasz scores vs. n_cluster')
plt.xlabel('n_cluster')
plt.ylabel('Calinski Harabasz scores');

## Conclusion
> Silhouette analysis suggests that an `n_cluster` value of 5 is a good pick for the given data, because the sizes of the silhouette plots fluctuate less and the average score is higher.

> Calinski-Harabasz analysis points to the same conclusion: 5 clusters is the best choice, because its high score reflects denser and better-separated clusters.