In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.cluster import KMeans
import plotly.graph_objs as go
import plotly as py
py.offline.init_notebook_mode(connected = True)
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.metrics import silhouette_score

# Introduction

K-Means clustering is an unsupervised learning method and one of the simplest and most popular algorithms in machine learning. (Unsupervised algorithms use only the input vectors, without any output/label feature.)

The K-Means algorithm groups similar data points together.
1. First, randomly select K centroids (K is the number of clusters we have specified).
2. Assign each data point to the cluster of its nearest centroid; this is how the algorithm discovers the underlying grouping in the data.
3. Update the position of each centroid to the mean of the data points assigned to it.
4. Repeat the steps above until the positions of the centroids stabilise.

Cluster refers to a collection of data points aggregated together because of certain similarities. 

Every data point is allocated to exactly one cluster in a way that reduces the within-cluster sum of squares.

K-means Overview
Before diving into the dataset, let us briefly discuss how k-means works:
The process begins with k centroids initialised at random.
These centroids are used to assign each point to its nearest cluster.
The mean of all points within the cluster is then used to update the position of the centroids.
The above steps are repeated until the values of the centroids stabilise.

# Load Dataset

In [None]:
# Read the mall-customer CSV, report its shape, then preview the first rows.
csv_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(csv_path)
print(df.shape)
df.head()

In [None]:
# Summarise the column dtypes and non-null counts of the raw frame.
df.info()

* **Dataset has 200 rows and 5 columns which are CustomerID, Gender, Age, Annual Income (k$) and Spending Score (1-100)**
* **Dataset has no missing value**

# Explore Data

**First we will visualize the data as a whole to look for patterns between features**

In [None]:
# Pairwise view of the features, coloured by gender.
pair_columns = ['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']
g = sns.PairGrid(
    df[pair_columns],
    hue="Gender",
    palette='Set1',
    height=3,
    aspect=1.7,
)

# Scatter plots above the diagonal, regression fits below it, KDEs on it.
g.map_upper(sns.scatterplot)
g.map_lower(sns.regplot)
g.map_diag(sns.kdeplot)

# Rotate every x tick label by 45 degrees.
x_tick_labels = [tick for ax in g.axes.flat for tick in ax.get_xticklabels()]
plt.setp(x_tick_labels, rotation=45)

g.add_legend()
g.set(alpha=0.5);

* **We can see a pattern between Spending Score (1-100) and Annual Income (k$)**

In [None]:
# One regression panel per gender: annual income against age.
g = sns.FacetGrid(data=df, col="Gender", height=5).map(
    sns.regplot, "Age", "Annual Income (k$)"
);

**Males have a slightly lower annual income than females.**

In [None]:
# One regression panel per gender: spending score against age.
g = sns.FacetGrid(data=df, col="Gender", height=5).map(
    sns.regplot, "Age", "Spending Score (1-100)"
);

**For both males and females, the spending score decreases as age increases.**

**According to the charts above, "Gender" is not related to the pattern, so we will not use "Gender" in clustering.**

**So we are going to drop the columns 'CustomerID' and 'Gender'**

In [None]:
# Build the clustering frame by removing the identifier and gender columns.
dropped_columns = ['CustomerID', 'Gender']
data = df.drop(dropped_columns, axis="columns")

In [None]:
# Preview the first rows of the frame left after dropping 'CustomerID' and 'Gender'.
data.head()

**Display the data in the two-dimensional space of Annual Income and Spending Score.**

In [None]:
# Raw (unclustered) customers: annual income against spending score.
income_fig, income_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    ax=income_ax,
);

In [None]:
# Plot the clustering result for every number of clusters from 1 to max_cluster.
def plot_(max_cluster, df):
    """Fit K-Means for k = 1..max_cluster and draw one scatter panel per k.

    Parameters
    ----------
    max_cluster : int
        Largest number of clusters to try; one subplot is drawn for each k.
    df : pandas.DataFrame
        Must contain "Annual Income (k$)" and "Spending Score (1-100)".
        Every column except 'label' is used as a clustering feature.

    Side effects
    ------------
    Writes the 1-based labels of the most recent fit into ``df['label']``,
    so after the call the column holds the labels for k = max_cluster.
    """
    x_col = "Annual Income (k$)"
    y_col = "Spending Score (1-100)"

    # Three panels per row; grow the grid instead of hard-coding 3x3 so that
    # max_cluster > 9 does not raise on plt.subplot.
    n_cols = 3
    n_rows = max(1, -(-max_cluster // n_cols))

    plt.figure(figsize=(15, 5 * n_rows))
    for i in range(1, max_cluster + 1):
        # Never cluster on 'label': it is the output of a previous fit
        # (an earlier iteration or an earlier call), not a feature.
        features = df.drop(columns=['label'], errors='ignore')

        # Fit data
        k = KMeans(n_clusters=i, init='random', random_state=0)
        k.fit(features)

        # Create new column
        df['label'] = k.labels_ + 1

        # Locate the centroid coordinates by column name rather than by
        # position, so the panel stays correct if the column order changes.
        x_idx = features.columns.get_loc(x_col)
        y_idx = features.columns.get_loc(y_col)

        # Visualize scatter plot
        plt.subplot(n_rows, n_cols, i)
        plt.scatter(x=k.cluster_centers_[:, x_idx], y=k.cluster_centers_[:, y_idx],
                    s=300, c='orange', alpha=0.5)

        sns.scatterplot(data=df, x=x_col, y=y_col, hue="label", palette='Set1')

plot_(9,data)

**According to the figure, the optimal number of clusters should be around 5 - 6 groups.**

**For more information we will use Elbow methods to determine this optimal value of k.**


**Now we use Scikit-Learn KMeans and the Elbow method to determine the optimal value of k.**

In [None]:
# Fit K-Means for k = 2..10, print the silhouette score of each fit and
# collect the inertia values for the elbow plot.

# `data` may already carry a 'label' column written by an earlier cell;
# it is a clustering output, so it must not be used as a feature here.
features = data.drop(columns=['label'], errors='ignore')

cluster_range = list(range(2, 11))
chosen_k = 5  # number of clusters highlighted on the elbow plot

inertia = []
for i in cluster_range:

    # initialise kmeans
    k = KMeans(n_clusters=i, max_iter=100, random_state=0)
    k.fit(features)

    inertia.append(k.inertia_)

    cluster_labels = k.labels_

    # silhouette score
    silhouette_avg = silhouette_score(features, cluster_labels)
    print("For n_clusters={}, the silhouette score is {:.4f}".format(i, silhouette_avg))

# Visualize the elbow plot and mark the chosen number of clusters (the corner
# of the curve). The marker position is looked up from chosen_k instead of a
# hard-coded list index.
plt.figure(figsize=(12,6))
sns.lineplot(x=cluster_range, y=inertia)
plt.scatter(chosen_k, inertia[cluster_range.index(chosen_k)], s = 50, c = 'red', marker='D')
plt.title('The Elbow Method')
plt.xlabel('No. of Clusters')
plt.ylabel('Inertia');
plt.show()

**In this case we choose K = 5**

We use the Silhouette Score to help decide the number of clusters; it is computed as the mean Silhouette Coefficient of all samples. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.

In the silhouette plot, the thickness of each band represents the number of samples in that cluster. If there are too many or too few clusters, some bands will appear much narrower than the others.

In [None]:
# Silhouette plot for the chosen number of clusters.
n_clusters = 5

k = KMeans(n_clusters=n_clusters, init = 'random', random_state=0)

visualizer = SilhouetteVisualizer(k, colors='yellowbrick',)
# Fit on the feature columns only: a 'label' column left in `data` by an
# earlier cell is a clustering output and would distort the distances.
visualizer.fit(data.drop(columns=['label'], errors='ignore'))
visualizer.show()
print("For n_clusters={}, the silhouette score is {:.4f}".format(n_clusters, visualizer.silhouette_score_))

In [None]:
# Cluster on the feature columns only; any 'label' column already present in
# `data` comes from an earlier fit and must not be treated as a feature.
label = k.fit_predict(data.drop(columns=['label'], errors='ignore'))

# Assign 1-based cluster labels to the original frame
df['label'] = label+1
df.head(10)

In [None]:
# Age against spending score, coloured by cluster label.
# Plot from `df`, which holds the labels assigned in the previous cell;
# `data['label']` at this point still holds labels from an earlier,
# different fit. The former sns.color_palette("pastel") call was dropped:
# it only returns a palette and the plot passes palette='Set1' explicitly.
plt.figure(figsize=(8,6))
sns.scatterplot(data=df, x="Age", y="Spending Score (1-100)", hue ='label', palette= 'Set1');

In [None]:
# Pairwise feature plots coloured by cluster. 'CustomerID' is an identifier,
# not a feature, so it is excluded from the grid.
sns.pairplot(df.drop(columns=['CustomerID']), hue="label", palette= 'Set1');

In [None]:
# Colour by the labels stored on `df`; `data['label']` at this point still
# holds labels from an earlier, different fit.
sns.scatterplot(x="Age", y="Spending Score (1-100)", hue="label", data=df, palette= 'Set1');

We choose 5 groups and define them as follows:
1. Low Annual Income, Low Spending Score
2. Low Annual Income, High Spending Score
3. Middle Annual Income, Middle Spending Score
4. High Annual Income, Low Spending Score
5. High Annual Income, High Spending Score

In practice, the number of clusters K depends on the project or strategy we focus on, such as a marketing campaign, a business objective or a policy.

In [None]:
# Refit the final 5-cluster model and store its labels on `data` for the 3D plot.
# Fit on the feature columns only: a 'label' column already present in `data`
# is the output of an earlier fit and must not be clustered on.
features = data.drop(columns=['label'], errors='ignore')

k = KMeans(n_clusters=5, init = 'random', random_state=0)
k.fit(features)

# Labels are stored 0-based here (k.labels_ as returned by scikit-learn).
data['label'] = k.labels_

In [None]:
# Interactive 3D scatter of the customers, coloured by cluster label.
marker_style = dict(
    color=data['label'],
    size=5,
    line=dict(color=data['label'], width=12),
    opacity=0.7,
)

trace1 = go.Scatter3d(
    x=data['Age'],
    y=data['Spending Score (1-100)'],
    z=data['Annual Income (k$)'],
    mode='markers',
    marker=marker_style,
)
data_ = [trace1]

# Axis captions for the 3D scene, keyed by the plotly scene-axis name.
axis_titles = {
    'xaxis': 'Age',
    'yaxis': 'Spending Score',
    'zaxis': 'Annual Income',
}
layout = go.Layout(
    title='K Mean Clustering',
    scene={axis: dict(title=caption) for axis, caption in axis_titles.items()},
)

fig = go.Figure(data=data_, layout=layout)
py.offline.iplot(fig)