In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the read-only Kaggle input
# directory, then print the paths one per line in traversal order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Customer Segmentation

1. Importing the data
2. Analyzing the data
3. Exploratory Data Analysis (Visualization)
4. Clustering using K-Means
5. Finding the K value - Elbow Method, Silhouette Score Method
6. 3D Plot of Clusters


## 1. Importing the data

In [None]:
# Load the mall customers dataset from the attached Kaggle input into a DataFrame.
customer_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
)

## 2. Analyzing the data

In [None]:
# Preview the first few records (head() shows five rows by default) to get an
# idea of the values in each column.

customer_data.head()

* We have 5 features. All of them are self-explanatory except Spending Score, which the data description defines as a score you assign to each customer based on parameters you define, such as customer behavior and purchasing data.

In [None]:
# Print each column's dtype and non-null count, which shows the different data
# types and whether any values are missing.

customer_data.info()

* There are 4 numeric columns and 1 categorical column.

In [None]:
# Summary statistics for the numeric columns.
customer_data.describe()

## 3. Exploratory Data Analysis

In [None]:
# importing libraries for visualization

import matplotlib.pyplot as plt
import seaborn as sns

### Analyzing the Gender of customers

In [None]:
# Plotting the count of customers for each Gender.

# Name the column via the `x` keyword: recent seaborn releases treat the first
# positional argument as `data`, so a bare Series passed positionally is no
# longer guaranteed to be interpreted as the variable to count.
sns.countplot(data=customer_data, x="Gender")
plt.title("Count of Customers based on Gender")
plt.show()

* There are more Female customers coming into the mall.

In [None]:
# Representing the above analysis in a pie chart.

# Take the wedge labels from the counts themselves rather than hard-coding
# ["Female", "Male"]: value_counts() orders by frequency, so a fixed label list
# would silently mislabel the wedges if the ordering were ever different.
gender_counts = customer_data["Gender"].value_counts()
plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%',
        shadow=True)
plt.title("% Customers based on Gender")
plt.show()

* 56% of the customers are Female
* 44% of the customers are Male

### Analyzing the Age of customers

In [None]:
# Histogram (with a KDE overlay) to show the distribution of Age.

# sns.distplot is deprecated; histplot with kde=True on a density axis is the
# replacement given in seaborn's migration guidance. cut=3 extends the KDE
# curve past the data range the same way distplot did.
sns.histplot(data=customer_data, x="Age", kde=True, stat="density",
             kde_kws=dict(cut=3))
plt.show()

In [None]:
# Box plot of Age along the x axis.
# The column is passed via the `x` keyword because recent seaborn releases
# treat the first positional argument as `data`, so a bare Series is no longer
# guaranteed to be drawn on the x axis.
sns.boxplot(x=customer_data["Age"])
plt.show()

* The maximum Age is 70

In [None]:
# Histogram (with a KDE overlay) to show the distribution of Annual Income.
# sns.distplot is deprecated; histplot with kde=True on a density axis is the
# replacement given in seaborn's migration guidance. cut=3 extends the KDE
# curve past the data range the same way distplot did.
sns.histplot(data=customer_data, x="Annual Income (k$)", kde=True,
             stat="density", kde_kws=dict(cut=3))
plt.show()

* The Annual income seems to be normally distributed

In [None]:
# Box plot of Annual Income along the x axis.
# The column is passed via the `x` keyword because recent seaborn releases
# treat the first positional argument as `data`, so a bare Series is no longer
# guaranteed to be drawn on the x axis.
sns.boxplot(x=customer_data["Annual Income (k$)"])
plt.show()

* There is an outlier though with ~140K income.

### Analyzing the Spending Score of customers

In [None]:
# Histogram (with a KDE overlay) to show the distribution of Spending Score.
# sns.distplot is deprecated; histplot with kde=True on a density axis is the
# replacement given in seaborn's migration guidance. cut=3 extends the KDE
# curve past the data range the same way distplot did.
sns.histplot(data=customer_data, x="Spending Score (1-100)", kde=True,
             stat="density", kde_kws=dict(cut=3))
plt.show()

In [None]:
# Box plot of Spending Score along the x axis.
# The column is passed via the `x` keyword because recent seaborn releases
# treat the first positional argument as `data`, so a bare Series is no longer
# guaranteed to be drawn on the x axis.
sns.boxplot(x=customer_data["Spending Score (1-100)"])
plt.show()

### K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

### Finding the optimum value of K

**1. Elbow Method**

In [None]:
# Candidate numbers of clusters to evaluate: 2 through 10.
k = np.arange(2,11)

# Select the three clustering features once instead of on every iteration.
features = customer_data[["Age","Annual Income (k$)","Spending Score (1-100)"]]

# Inertia of the fitted model for each candidate K, in the same order as `k`.
inertia = []

for i in k:
    # n_init is pinned explicitly: scikit-learn's default changed between
    # releases, and leaving it implicit makes the curve version-dependent.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(features)
    inertia.append(kmeans.inertia_)

plt.plot(k,inertia,"o-")
plt.xticks(k)
plt.xlabel("K Value")
plt.ylabel("Inertia")
plt.title("Finding the value of K - Elbow Method")
plt.show()

* We can see that the Inertia decreases sharply until K = 6, and the slope flattens after that.
* **Inertia** is the performance metric used here: it is the sum of squared distances between each instance and its closest centroid.
* Having said that, we need a way to validate if using K value of 6 results in optimum solution or not.

**2. Silhouette Method**

* Silhouette Score is the mean Silhouette Coefficient over all the instances.
* An instance's Silhouette Coefficient is equal to $(b - a) / \max(a, b)$, where $a$ is the mean distance to the other instances in the same cluster and $b$ is the mean distance to the instances of the nearest other cluster (the cluster, excluding the instance's own, that minimizes $b$).
* Silhouette Coefficient can vary between -1 and +1.
* A coefficient close to +1 means that the instance is well inside its own cluster and far from other clusters, while a coefficient close to 0 means that it is close to a cluster boundary, and finally a coefficient close to -1 means that the instance may have been assigned to the wrong cluster.

In [None]:
from sklearn.metrics import silhouette_score

# Candidate numbers of clusters to evaluate: 2 through 10.
n = np.arange(2,11)

# Select the three clustering features once; they are used both to fit the
# model and to score the resulting labels.
features = customer_data[["Age","Annual Income (k$)","Spending Score (1-100)"]]

# Silhouette score of the fitted model for each candidate K, in the same order as `n`.
score=[]
for i in n:
    # n_init is pinned explicitly: scikit-learn's default changed between
    # releases, and leaving it implicit makes the scores version-dependent.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(features)
    score.append(silhouette_score(features, kmeans.labels_))

# Plot against this cell's own `n` rather than a variable left over from
# another cell, so the cell does not depend on hidden kernel state.
plt.plot(n,score,"*-")
plt.xlabel("K Value")
plt.ylabel("Silhouette Score")
plt.title("Finding the value of K - Silhouette Method")
plt.xticks(n)
plt.show()

* We can see above that the Silhouette Score reaches its maximum at a K value of 6, hence we will use 6 as our number of clusters.

### 3D Plot of Clusters

Source: https://www.kaggle.com/kushal1996/customer-segmentation-k-means-analysis

In [None]:
# Fit the final K-Means model with the chosen K = 6 on the three features.
# n_init is pinned explicitly: scikit-learn's default changed between releases,
# and leaving it implicit makes the final segmentation version-dependent.
# NOTE(review): the K-selection cells use random_state=42 while this fit uses
# 111 — confirm whether the different seed is intentional.
model = KMeans(n_clusters = 6, n_init = 10, random_state= 111)
model.fit(customer_data[["Age","Annual Income (k$)","Spending Score (1-100)"]])

# Cluster index assigned to each customer, and the fitted cluster centers.
labels = model.labels_
centroids = model.cluster_centers_

In [None]:
import plotly as py
import plotly.graph_objs as go

In [None]:
# Attach each customer's cluster assignment to the frame so the markers can be
# colored by cluster.
customer_data['label'] =  labels

# One 3D marker per customer, positioned by Age / Spending Score / Annual Income
# and colored (fill and outline) by its cluster label.
trace1 = go.Scatter3d(
    x= customer_data['Age'],
    y= customer_data['Spending Score (1-100)'],
    z= customer_data['Annual Income (k$)'],
    mode='markers',
    marker=dict(
        color = customer_data['label'],
        size= 20,
        line=dict(
            color= customer_data['label'],
            width= 12
        ),
        opacity=0.8
    )
)
data = [trace1]

# Axis titles follow the x/y/z assignment of the trace above.
layout = go.Layout(
    title= 'Clusters',
    scene = dict(
            xaxis = dict(title  = 'Age'),
            yaxis = dict(title  = 'Spending Score'),
            zaxis = dict(title  = 'Annual Income')
        )
)
fig = go.Figure(data=data, layout=layout)

# Display through the figure's own show() method, which uses plotly's renderer
# framework, instead of the legacy plotly.offline.iplot entry point.
fig.show()