In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import zscore

In [2]:
# Load the customer spend data from CSV into a DataFrame and preview ten rows.
custData = pd.read_csv(
    "Cust_Spend_Data.csv",
)
custData.head(n=10)

Unnamed: 0,Cust_ID,Name,Avg_Mthly_Spend,No_Of_Visits,Apparel_Items,FnS_Items,Staples_Items
0,1,A,10000,2,1,1,0
1,2,B,7000,3,0,10,9
2,3,C,7000,7,1,3,4
3,4,D,6500,5,1,1,4
4,5,E,6000,6,0,12,3
5,6,F,4000,3,0,1,8
6,7,G,2500,5,0,11,2
7,8,H,2500,3,0,1,1
8,9,I,2000,2,0,2,2
9,10,J,1000,4,0,1,7


In [3]:
# (rows, columns) of the loaded frame.
custData.shape

(10, 7)

In [4]:
# Column dtypes: shows which columns are numeric and can be used for clustering.
custData.dtypes

Cust_ID             int64
Name               object
Avg_Mthly_Spend     int64
No_Of_Visits        int64
Apparel_Items       int64
FnS_Items           int64
Staples_Items       int64
dtype: object

In [5]:
# Keep all rows but only the columns from position 2 onwards, i.e. drop the
# two leading identifier columns (Cust_ID, Name) and keep the numeric attributes.
# .copy() detaches the result from custData so that adding a column to it later
# does not raise SettingWithCopyWarning or risk writing through to the raw frame.
custDataAttr = custData.iloc[:, 2:].copy()
custDataAttr.head()

Unnamed: 0,Avg_Mthly_Spend,No_Of_Visits,Apparel_Items,FnS_Items,Staples_Items
0,10000,2,1,1,0
1,7000,3,0,10,9
2,7000,7,1,3,4
3,6500,5,1,1,4
4,6000,6,0,12,3


In [6]:
# Standardise every attribute column with scipy's zscore so that features on
# large scales (e.g. Avg_Mthly_Spend) do not dominate Euclidean distances.
# A later cell adds a 'labels' column to custDataAttr; it is dropped here (when
# present) so that re-running this cell never scales cluster labels as a feature.
custDataScaled = custDataAttr.drop(columns=['labels'], errors='ignore').apply(zscore)
custDataScaled.head(10)

Unnamed: 0,Avg_Mthly_Spend,No_Of_Visits,Apparel_Items,FnS_Items,Staples_Items
0,1.886498,-1.240347,1.527525,-0.741433,-1.380131
1,0.787567,-0.620174,-0.654654,1.280656,1.725164
2,0.787567,1.860521,1.527525,-0.29208,0.0
3,0.604412,0.620174,1.527525,-0.741433,0.0
4,0.421257,1.240347,-0.654654,1.730009,-0.345033
5,-0.311364,-0.620174,-0.654654,-0.741433,1.380131
6,-0.860829,0.620174,-0.654654,1.505333,-0.690066
7,-0.860829,-0.620174,-0.654654,-0.741433,-1.035098
8,-1.043984,-1.240347,-0.654654,-0.516756,-0.690066
9,-1.410294,0.0,-0.654654,-0.741433,1.035098


In [11]:
# Pairwise scatter plots of the scaled attributes, with a KDE of each attribute
# on the diagonal. The trailing semicolon suppresses the PairGrid repr so the
# cell shows only the figure.
sns.pairplot(custDataScaled, height=2, aspect=2, diag_kind='kde');

<seaborn.axisgrid.PairGrid at 0x13f5714c0>

In [12]:
from sklearn.cluster import AgglomerativeClustering

In [15]:
# Two-cluster agglomerative model using Euclidean distance and average linkage.
# Constructed here only; fitting happens in the next cell.
model = AgglomerativeClustering(
    n_clusters=2,
    metric='euclidean',
    linkage='average',
)

In [16]:
# Fit the clustering on the standardised attributes; labels are read from
# model.labels_ afterwards.
model.fit(custDataScaled)

In [17]:
# Attach each customer's cluster label to the unscaled attributes.
# assign() returns a new frame rather than writing into the existing one, which
# avoids SettingWithCopyWarning (custDataAttr originates from a slice of
# custData) and gives the same result if the cell is re-run.
custDataAttr = custDataAttr.assign(labels=model.labels_)
custDataAttr.head(10)

Unnamed: 0,Avg_Mthly_Spend,No_Of_Visits,Apparel_Items,FnS_Items,Staples_Items,labels
0,10000,2,1,1,0,1
1,7000,3,0,10,9,0
2,7000,7,1,3,4,1
3,6500,5,1,1,4,1
4,6000,6,0,12,3,0
5,4000,3,0,1,8,0
6,2500,5,0,11,2,0
7,2500,3,0,1,1,0
8,2000,2,0,2,2,0
9,1000,4,0,1,7,0


In [18]:
# Group the labelled customers by their assigned cluster; aggregates are
# computed from this GroupBy object in later cells.
custDataClust = custDataAttr.groupby(by=['labels'])
custDataClust

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x141cae030>

In [20]:
# Per-cluster mean of every attribute, used to profile the clusters.
custDataClust.mean()

Unnamed: 0_level_0,Avg_Mthly_Spend,No_Of_Visits,Apparel_Items,FnS_Items,Staples_Items
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3571.428571,3.714286,0.0,5.428571,4.571429
1,7833.333333,4.666667,1.0,1.666667,2.666667


In [22]:
from scipy.cluster.hierarchy import cophenet,dendrogram,linkage

In [24]:
from scipy.spatial.distance import pdist # pairwise distances between observations (condensed form)

In [37]:
# Cophenetic correlation: how well the merge heights in the dendrogram agree
# with the original pairwise distances between points in feature space.
# Values closer to 1 mean the hierarchy preserves those distances better.

# Average-linkage hierarchy on the standardised attributes.
Z = linkage(custDataScaled, method='average', metric='euclidean')
c, coph_dists = cophenet(Z, Y=pdist(custDataScaled))
c

0.8681149436293064

In [38]:
# Dendrogram for the linkage matrix Z built in the preceding cell (average linkage).
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram (average linkage)')
plt.xlabel('sample index')
plt.ylabel('Distance')
# The former fixed threshold of 40 is far above any merge height reachable with
# ten z-scored rows, so every branch received the same colour. Cutting at 70% of
# the tallest merge (scipy's documented default) colours the sub-clusters.
dendrogram(Z, leaf_rotation=90., color_threshold=0.7 * Z[:, 2].max(), leaf_font_size=8.)
plt.tight_layout()

In [39]:
# Cophenetic correlation: how well the merge heights in the dendrogram agree
# with the original pairwise distances between points in feature space.
# Values closer to 1 mean the hierarchy preserves those distances better.

# Complete-linkage hierarchy on the standardised attributes.
Z = linkage(custDataScaled, method='complete', metric='euclidean')
c, coph_dists = cophenet(Z, Y=pdist(custDataScaled))
c

0.8606955190809153

In [40]:
# Dendrogram for the linkage matrix Z built in the preceding cell (complete linkage).
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram (complete linkage)')
plt.xlabel('sample index')
plt.ylabel('Distance')
# The former fixed threshold of 90 is far above any merge height reachable with
# ten z-scored rows, so every branch received the same colour. Cutting at 70% of
# the tallest merge (scipy's documented default) colours the sub-clusters.
dendrogram(Z, leaf_rotation=90., color_threshold=0.7 * Z[:, 2].max(), leaf_font_size=10.)
plt.tight_layout()

In [41]:
# Cophenetic correlation: how well the merge heights in the dendrogram agree
# with the original pairwise distances between points in feature space.
# Values closer to 1 mean the hierarchy preserves those distances better.

# Ward-linkage hierarchy on the standardised attributes.
Z = linkage(custDataScaled, method='ward', metric='euclidean')
c, coph_dists = cophenet(Z, Y=pdist(custDataScaled))
c

0.8453818941339526

In [42]:
# Dendrogram for the linkage matrix Z built in the preceding cell (Ward linkage).
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram (ward linkage)')
plt.xlabel('sample index')
plt.ylabel('Distance')
# The former fixed threshold of 600 is far above any merge height reachable with
# ten z-scored rows, so every branch received the same colour. Cutting at 70% of
# the tallest merge (scipy's documented default) colours the sub-clusters.
dendrogram(Z, leaf_rotation=90., color_threshold=0.7 * Z[:, 2].max(), leaf_font_size=10.)
plt.tight_layout()