In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# Any results you write to the current directory are saved as output.

**Data insights**

Read the data and store it in a dataframe

In [None]:
# Load the mall-customer CSV from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
df = pd.read_csv(csv_path)
df.head()

Check whether there are any null values.

In [None]:
# Flag, per column, whether it contains at least one missing value.
df.isna().any()

The dataset has 200 rows and 5 features (columns).

In [None]:
# Dimensions of the dataframe as (rows, columns).
df.shape
      

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

From `describe()` we see that the mean spending score is about 50, which is not that good. The spending score reflects the quality of a customer; it is based on customer behaviour and purchasing data.
The average age of the customers is about 38.

Check whether there are any categorical variables; if so, they have to be converted to numeric before they can be used.

In [None]:
# Column dtypes: any non-numeric (object) column would need encoding before it could be used as a feature.
df.dtypes

Taking two features: 
1. Annual Income (index location: 3)
2. Spending Score (index location: 4)

In [None]:
# Feature matrix for clustering: all rows, columns at zero-based positions 3 and 4
# (annual income and spending score), extracted as a NumPy array.
X = df.iloc[:, [3, 4]].values

In [None]:
# Confirm that .values produced a NumPy array rather than a DataFrame.
type(X)

In [None]:
# Display the feature matrix: one row per customer, columns = [annual income, spending score].
X

In [None]:
# Visualise the raw data points before clustering.
# Axis labels use the same units as the final cluster plot: income is in k$ and
# the spending score is on a 1-100 scale (its mean is about 50), not "out of 10".
plt.scatter(X[:, 0], X[:, 1], marker='+')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()

In [None]:
# Second feature column of X: the spending score (dataframe column position 4).
X[:,1]

In [None]:
# First feature column of X: the annual income (dataframe column position 3).
X[:, 0]

In some cases, if the initialization of clusters is not appropriate, K-Means can result in arbitrarily bad clusters. This is where K-Means++ helps. It specifies a procedure to initialize the cluster centers before moving forward with the standard k-means clustering algorithm.

Using the K-Means++ algorithm, we optimize the step where we randomly pick the cluster centroids. With K-Means++ initialization we are more likely to find a solution that is competitive with the optimal K-Means solution.
The steps to initialize the centroids using K-Means++ are:

1. The first cluster center is chosen uniformly at random from the data points that we want to cluster. This is similar to what we do in K-Means, but instead of randomly picking all the centroids, we pick just one centroid here.
2. Next, we compute the distance $D(x)$ of each data point $x$ from the nearest cluster center that has already been chosen.
3. Then, we choose the new cluster center from the data points, with the probability of picking $x$ being proportional to $D(x)^2$.
4. We then repeat steps 2 and 3 until $k$ cluster centers have been chosen.

In [None]:
# Elbow method: fit K-Means (with k-means++ initialisation) for several values
# of k and record the sum of squared errors (SSE) of each fit, so that the
# optimum number of clusters can be read off the resulting curve.
from sklearn.cluster import KMeans

# Candidate cluster counts are 1..10. The upper bound of 10 is a fixed,
# conventional choice that keeps the elbow plot readable.
SSE = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=0).fit(X)
    # inertia_ is the sum of squared distances from each sample to its closest
    # cluster centre, i.e. the within-cluster SSE for this value of k.
    SSE.append(kmeans.inertia_)

# NOTE: after the loop, `kmeans` still refers to the last model fitted (k = 10).

In [None]:
# SSE (inertia) recorded for k = 1..10, in that order.
SSE

In [None]:
# Visualise the elbow method to pick the optimal value of k:
# plot the SSE recorded for each candidate number of clusters (1..10).
# The values live in the list `SSE`; the name `wcss` was never defined in this
# notebook and raised a NameError.
plt.plot(range(1, 11), SSE)
plt.title('The Elbow Method')
plt.xlabel('No of clusters')
plt.ylabel('SSE')
plt.show()

In [None]:
# Reading the elbow curve, the last clear bend is at k = 5.
# A wider range such as range(1, 21) shows the same behaviour, but with a larger range
# the elbow is harder to see, which is why range(1, 11) is usually preferred.
# So the final model is built with k = 5.

# Model build
kmeansmodel = KMeans(n_clusters = 5, init ='k-means++', random_state = 0)
y_kmeans = kmeansmodel.fit_predict(X)

# fit_predict() fits the model on X and returns the cluster label assigned to each row.
# kmeansmodel is the fitted model; y_kmeans holds the per-row cluster labels.
# How and where this model is deployed in production depends on the tooling in use.
# This use case is very common in the banking/financial-services industry (e.g. credit cards)
# and in retail, for customer segmentation.


In [None]:
# Check the container type of the labels returned by fit_predict().
type(y_kmeans)

In [None]:
# Cluster label (0-4) assigned to each row of X.
y_kmeans

In [None]:
# Visualise all the clusters together with their centroids.

# One colour per cluster label 0..4; legend entries are numbered from 1.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    members = y_kmeans == cluster_id  # boolean mask of the rows in this cluster
    plt.scatter(X[members, 0], X[members, 1], s=100, c=color,
                label='Cluster {}'.format(cluster_id + 1))

# Centroids must come from the final 5-cluster model (kmeansmodel). `kmeans` is
# the model left over from the elbow loop and was fitted with 10 clusters, so it
# would draw 10 centroids that do not belong to the clusters plotted here.
centroids = kmeansmodel.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

### Model interpretation
# NOTE(review): which label ends up on which group depends on the fitted model;
# confirm each description below against the rendered plot.
# Cluster 1 (red)     -> earning high but spending less
# Cluster 2 (blue)    -> average in terms of earning and spending
# Cluster 3 (green)   -> earning high and also spending high [TARGET SET]
# Cluster 4 (cyan)    -> earning less but spending more
# Cluster 5 (magenta) -> earning less, spending less

# Cluster 3 can be put into an alerting system that emails these customers daily,
# as they are the easiest to convert; the other clusters can be contacted less
# often, e.g. once a week or once a month.

In [None]:
# For each cluster label 0..4, print the annual income (column 0) and then the
# spending score (column 1) of the customers assigned to that cluster.
for cluster_id in range(5):
    for column in range(2):
        tag = '[{}, {}] = '.format(cluster_id, column)
        print(tag, X[y_kmeans == cluster_id, column])


In [None]:
# Centroid coordinates of the final 5-cluster model. Use kmeansmodel here:
# `kmeans` is the model left over from the elbow loop, fitted with 10 clusters.
print(kmeansmodel.cluster_centers_[:, 0])  # annual income of each centroid
print(kmeansmodel.cluster_centers_[:, 1])  # spending score of each centroid

Source: [https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-k-means-clustering/](https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-k-means-clustering/)