In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.cluster import KMeans

## Data Collection & Analysis

In [None]:
# Read the mall-customers CSV from the Kaggle input directory into a DataFrame
DATA_PATH = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
customer_data = pd.read_csv(DATA_PATH)

In [None]:
# preview the first 5 rows of the dataframe
customer_data.head(n=5)

In [None]:
# (number of rows, number of columns) of the dataframe
customer_data.shape

# observation from the output: data is available for 200 customers

In [None]:
# print a summary of the dataframe (columns, non-null counts, dtypes)
customer_data.info()

## Missing Values

In [None]:
# count the missing (null) values in each column
customer_data.isnull().sum()

### Feature Selection
#### Choosing the Annual Income Column & Spending Score column

In [None]:
# Build the feature matrix from the dataframe columns at positions 3 and 4
# x-axis : annual income
# y-axis : spending score
FEATURE_COLUMN_POSITIONS = [3, 4]
X = customer_data.iloc[:, FEATURE_COLUMN_POSITIONS].values

## Choosing the number of clusters
### WCSS  ->  Within-Cluster Sum of Squares

In [None]:
# Fit KMeans for every candidate number of clusters (1 through 10) and record its WCSS,
# i.e. the sum of squared distances of each data point from the centroid of its cluster.
# init='k-means++' selects the initial cluster centers in a smart way to speed up convergence.

wcss = []

for n_clusters in range(1, 11):
    # fit() returns the fitted estimator itself, so it can be chained onto construction
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(X)

    # inertia_ : sum of squared distances of samples to their closest cluster center
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow graph: WCSS against the number of clusters (1 through 10)

sns.set()
fig, ax = plt.subplots()
ax.plot(range(1, 11), wcss)
ax.set_title('The Elbow Point Graph')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS')
plt.show()

## Training the k-Means Clustering Model using k=5

In [None]:
# Final model: 5 clusters, k-means++ initialisation, fixed seed for reproducibility
N_CLUSTERS = 5
kmeans = KMeans(n_clusters=N_CLUSTERS, init='k-means++', random_state=0)

# Fit the model and get, for each data point, the label of the cluster it was assigned to
Y = kmeans.fit_predict(X)

print(Y)

## Visualizing all the Clusters

In [None]:
# Coordinates of the fitted cluster centers, one row per cluster (annual income, spending score)
kmeans.cluster_centers_

In [None]:
# Scatter plot of every customer, coloured by the cluster it was assigned to,
# together with the cluster centroids.
# x-axis : annual income
# y-axis : spending score

plt.figure(figsize=(8,8))

# One colour per cluster label; list position i is used for label i (shown as "Cluster i+1")
cluster_colors = ['green', 'red', 'yellow', 'violet', 'blue']
for cluster_index, cluster_color in enumerate(cluster_colors):
    members = X[Y == cluster_index]  # rows of X assigned to this cluster
    plt.scatter(members[:, 0], members[:, 1], s=50, c=cluster_color,
                label='Cluster %d' % (cluster_index + 1))

# plot the centroids
# cluster_centers_ : ndarray of shape (n_clusters, n_features) holding the coordinates of the cluster centers
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s=100, c='cyan', label='Centroids')

plt.title('Customer Groups')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
# The label= arguments above are only rendered once a legend is requested
plt.legend()
plt.show()

### Measuring performance of clusters - silhouette_score

In [None]:
# Metrics for clustering algorithms
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette analysis: for each candidate number of clusters, fit KMeans, print the
# average silhouette score, and draw a two-panel figure (per-sample silhouette plot on
# the left, the clustered data with its centers on the right).

# Based on the elbow plot we choose various candidates for number of clusters
clusters = [4,5,6,7,8,9]

for cluster in clusters:
    # creating a sub-plot with 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)  # 1 row, 2 columns
    fig.set_size_inches(18, 7)

    # The silhouette coefficient can range from -1 (worst) to 1 (best).
    # The x-axis is limited to [-0.2, 1], so any bar below -0.2 would be cut off.
    ax1.set_xlim([-0.2, 1])

    # Leave blank space between the silhouette bands of the individual clusters:
    # 10 rows before each band plus a margin at the top.
    ax1.set_ylim([0, len(X) + (cluster + 1) * 10])

    # NOTE(review): random_state=1 here differs from the seeds used by the other KMeans
    # fits in this notebook (42 and 0), so the k=5 partition scored here is not
    # guaranteed to be the same one that was trained and plotted earlier.
    km_cluster = KMeans(n_clusters=cluster, random_state= 1)
    cluster_labels = km_cluster.fit_predict(X)

    # silhouette_score gives the average value over all the samples: a perspective
    # into the density and separation of the clusters formed
    silhoutte_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters = ", cluster, " The average silhouette_score is :", silhoutte_avg)

    # Compute the silhouette score of each individual sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    # plotting the silhouette graph, one horizontal band per cluster
    y_lower = 10
    for i in range(cluster):
        # Silhouette scores of the samples belonging to cluster i. Boolean-mask
        # indexing returns a copy, so the in-place sort does not touch
        # sample_silhouette_values.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / cluster)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette band with its cluster number at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start of the next band: leave a gap of 10 rows
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line marks the average silhouette score of all the values
    ax1.axvline(x=silhoutte_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot showing the actual clusters formed, coloured like the silhouette bands
    colors = cm.nipy_spectral(cluster_labels.astype(float) / cluster)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = km_cluster.cluster_centers_

    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Write each cluster's number inside its white circle
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    # Title the figure explicitly instead of relying on pyplot's "current figure"
    fig.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % cluster),
                 fontsize=14, fontweight='bold')

    # Show each figure inside the loop so it appears right after its printed score
    # and is not kept open until every candidate has been processed
    plt.show()

In [None]:
# Note: a high silhouette value indicates that a sample is well matched to its own cluster and poorly matched to neighboring clusters.
# Observation: n_clusters = 5 has the highest silhouette score