# Customer Segmentation using Clustering Algorithms

#### In this notebook, we first estimate an appropriate number of customer clusters and then analyze the customers on the basis of various features using that optimum number of clusters. Then, we would like to identify the targeted cluster/group. For this, we will use the KMeans and Hierarchical clustering algorithms. The dataset has a number of features, and we shall study the customers' expenditure habits.

In [None]:
# Notebook author identifier (module-level metadata).
__author__ = 'rsh'

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the mall customer records from the CSV file into a DataFrame.
csv_path = './mall_customers.csv'
df = pd.read_csv(csv_path)

In [None]:
# Report the (rows, columns) shape, then preview the first ten rows.
print(f'Shape of the dataset {df.shape}')
df.head(10)

In [None]:
# Short replacement names, matched to the existing headers by position.
columns = ['id', 'gender', 'age', 'annual_income', 'spending_score']

# Map each current header to its replacement and rename the frame in place.
col_dict = dict(zip(df.columns.values, columns))
df.rename(columns=col_dict, inplace=True)

In [None]:
# Display the current column labels of the DataFrame.
df.columns

In [None]:
# Preview the final rows of the DataFrame (tail() with its default row count).
df.tail()

In [None]:
# Plots of Age and Spending Score distributions.
# Create exactly one axes per histogram so the figure grid matches what is
# drawn (a 1x3 grid combined with 1x2 subplot calls leaves stray axes behind).
fig, (age_ax, score_ax) = plt.subplots(1, 2, figsize=(20, 5))

age_ax.hist(df.age)
age_ax.set_title('Age Distribution')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Count')

score_ax.hist(df.spending_score)
score_ax.set_title('Spending Score Distribution')
score_ax.set_xlabel('Spending score')
score_ax.set_ylabel('Count')

plt.show()

In [None]:
# Count the gender type
gender_counts = df.gender.value_counts()
print (gender_counts)

# Take the bar labels from the counts' own index: value_counts() orders its
# result by frequency, so hard-coded labels can end up on the wrong bars.
plt.bar(gender_counts.index.tolist(), gender_counts.values)
plt.title('Gender distribution')
plt.xlabel('Gender')
plt.ylabel('Counts')
plt.show()

In [None]:
# Pairwise scatter plots of age, annual income and spending score,
# drawn side by side on a single 1x3 figure.
panels = [
    (df.age, df.spending_score, 'Age vs Spending Score', 'Age', 'Spending Score'),
    (df.age, df.annual_income, 'Age vs Annual Income', 'Age', 'Annual Income'),
    (df.annual_income, df.spending_score, 'Annual Income vs Spending Score', 'Annual Income', 'Spending Score'),
]

fig, axes = plt.subplots(1, 3, figsize=(20, 5))
for ax, (xs, ys, heading, x_name, y_name) in zip(axes, panels):
    ax.scatter(xs, ys)
    ax.set_title(heading)
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)

plt.show()

In [None]:
# Analyzing Annual Income vs Spending Score
# Positional indices of the two feature columns; later cells reuse them
# to look up the axis labels.
col1_index, col2_index = 3, 4
feature_positions = [col1_index, col2_index]

# Keep every row, but only the two selected columns.
data = df.iloc[:, feature_positions]
data.shape

In [None]:
from sklearn.cluster import KMeans

# Collect the within-cluster sum of squares (KMeans inertia) for k = 1..10;
# these values feed the elbow plot.
# n_init is pinned explicitly because scikit-learn's default for it differs
# between releases, which would otherwise make the curve version-dependent.
wcss = []
for i in range(1, 11):
    classifier = KMeans(n_clusters=i, n_init=10, random_state=101)
    classifier.fit(data)
    wcss.append(classifier.inertia_)

In [None]:
# Elbow Method: plot WCSS against each candidate number of clusters.
candidate_counts = range(1, 11)

fig, ax = plt.subplots()
ax.plot(candidate_counts, wcss)
ax.set_title('The Elbow Method')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('wcss')
plt.show()

#### Based on the Elbow method, number of clusters = 5 is found to be the best option

In [None]:
# Using number of clusters = 5 in the KMeans algorithm.
# n_init is pinned explicitly because scikit-learn's default for it differs
# between releases; fixing it keeps the fitted labels reproducible.
clusterModel = KMeans(n_clusters = 5, n_init = 10, random_state=101)

# Fit the model and get the cluster label assigned to each row of `data`.
y_means = clusterModel.fit_predict(data)

print (y_means)

print (clusterModel.cluster_centers_)

In [None]:
# Scatter the two features, one colour per KMeans label, then overlay the
# fitted cluster centres.
plt.figure(figsize=(12,5))

points = data.values
cluster_styles = [
    ('purple', 'category1'),
    ('blue', 'category2'),
    ('red', 'category3'),
    ('green', 'category4'),
    ('orange', 'category5'),
]
for cluster_id, (colour, legend_name) in enumerate(cluster_styles):
    members = points[y_means == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s = 100, c = colour, label = legend_name)

centres = clusterModel.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s = 250, c = 'black', label = 'centeroid')

plt.title('Segmentation using KMeans Clustering (Annual Income vs Spending Score)')
plt.xlabel(df.columns[col1_index])
plt.ylabel(df.columns[col2_index])
plt.legend()
plt.show()

## Clustering using Hierarchical Clustering Algorithm

In [None]:
import scipy.cluster.hierarchy as sch

In [None]:
# Compute the Ward linkage over the two selected features, then draw the
# dendrogram built from it.
ward_links = sch.linkage(data.values, method='ward')
dgram = sch.dendrogram(ward_links)

plt.title('Dendogram')
plt.xlabel('Clusters')
plt.ylabel('Distance')
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into 5 groups with Ward linkage.
# The distance argument is left at its default (Euclidean, the only metric
# Ward linkage accepts): the `affinity` keyword was deprecated and later
# removed from scikit-learn in favour of `metric`, so omitting it keeps this
# cell working across versions.
cluster = AgglomerativeClustering(n_clusters=5, linkage='ward')

# Fit on the two selected features; the returned label array is displayed.
cluster.fit_predict(data)

In [None]:
# Scatter the two features, coloured by the agglomerative cluster labels.
points = data.values
x_vals, y_vals = points[:, 0], points[:, 1]

plt.figure(figsize=(10, 6))
plt.scatter(x_vals, y_vals, c=cluster.labels_, cmap='rainbow')

plt.title('Customer Segmentation using Agglomerative Clustering')
plt.xlabel(df.columns[col1_index])
plt.ylabel(df.columns[col2_index])
plt.show()