# Mall Customer Segmentation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the mall-customers CSV and preview its first three rows.
DATA_PATH = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(DATA_PATH)
df.head(3)

In [None]:
# Print a concise summary of the DataFrame (pandas DataFrame.info).
df.info()

In [None]:
# CustomerID is only a row identifier, so it is removed before the analysis.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns='CustomerID', errors='ignore')

In [None]:
# Distribution of the binary categorical variable Gender:
# absolute counts on the left, proportions on the right.
fig, (ax_count, ax_pie) = plt.subplots(1, 2, figsize=(12, 4))

# seaborn >= 0.12 only accepts the plotted variable as a keyword argument
sns.countplot(x='Gender', data=df, ax=ax_count)
ax_count.set_title('gender value_counts')

df['Gender'].value_counts().plot(kind='pie', autopct='%.1f%%', ax=ax_pie)
ax_pie.set_title('gender proportion');

In [None]:
# Encode Gender as a single 0/1 indicator column named 'Male' (1 = Male).
# An explicit integer column is used because on pandas >= 2 get_dummies returns
# booleans, and mixing bool with integer columns turns df.values into an
# object-dtype array.
# The guard keeps the cell idempotent: once 'Gender' has been replaced,
# re-running it leaves df untouched instead of raising a KeyError.
if 'Gender' in df.columns:
    df = (
        df
        .assign(Male=(df['Gender'] == 'Male').astype(int))
        .drop(columns='Gender')
    )
df.head(3)

In [None]:
# Grid of pairwise plots for the columns of df (seaborn pairplot).
sns.pairplot(df);

The relationship between annual income and spending score seems interesting. Let's explore a little further, by bringing the other variables into the conversation.

In [None]:
# Spending score vs. annual income, coloured by the 'Male' indicator;
# the legend is placed outside the axes on the right.
ax = sns.scatterplot(
    data=df,
    x='Spending Score (1-100)',
    y='Annual Income (k$)',
    hue='Male',
)
ax.legend(loc=(1.1, 0.7));

In [None]:
# Spending score vs. annual income, coloured by Age;
# the legend is placed outside the axes on the right.
ax = sns.scatterplot(
    data=df,
    x='Spending Score (1-100)',
    y='Annual Income (k$)',
    hue='Age',
)
ax.legend(loc=(1.1, 0.7));

There doesn't seem to be a strong relationship between Gender and Spending Score, although with Age there does seem to be a degree of negative correlation, with older people tending to spend less. The correlation heatmap below confirms this.

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='1.1f');

### Clustering


### K-means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from matplotlib import cm

#### Clustering According to Annual Income and Spending Score

In [None]:
# Elbow method on (Annual Income, Spending Score): record the inertia
# (within-cluster SSE) of k-means for k = 1..10.
x = df[['Annual Income (k$)', 'Spending Score (1-100)']].values
cluster_counts = list(range(1, 11))
inertia = []

for k in cluster_counts:
    # n_init is pinned so the result does not depend on the scikit-learn
    # version's default number of restarts
    km = KMeans(n_clusters=k, n_init=10, random_state=33)
    km.fit(x)
    inertia.append(km.inertia_)

# seaborn >= 0.12 only accepts x/y as keyword arguments
sns.lineplot(x=cluster_counts, y=inertia)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster SSE)');

With the lineplot above we are using the 'elbow' method to determine the optimal number of clusters. The graph shows how the within-cluster SSE (Sum of Squared Errors) decreases as the number of clusters increases. The point where the line makes an 'elbow' is the optimal number of clusters. In the above case that number is five.

In [None]:
# Final 5-cluster k-means model on (Annual Income, Spending Score).
# n_init is pinned so the result does not depend on the scikit-learn
# version's default number of restarts.
km = KMeans(n_clusters=5, n_init=10, random_state=33)
clusters = km.fit_predict(x)
centers = km.cluster_centers_

# Cluster assignments (seaborn >= 0.12 only accepts x/y as keyword arguments)
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Same plot with the cluster centroids overlaid in red
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
sns.scatterplot(x=centers[:, 0], y=centers[:, 1], color='red')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Clusters plus Cluster Centroids')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot (code adapted from Sebastian Raschka's book
# 'Python Machine Learning'): each cluster is drawn as a band of its samples'
# sorted silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

The silhouette graph above (code taken from Sebastian Raschka's book 'Python Machine Learning') plots a measure of how tightly grouped the samples in the clusters are. The silhouette coefficient takes values between -1 and 1; the closer to 1, the better the clustering. The vertical red line depicts the average coefficient across all samples.

In the above case the silhouettes aren't close to zero, and the clustering is considered rather good.


#### Clustering According to Age and Spending Score

In [None]:
# Elbow method on (Age, Spending Score): record the inertia
# (within-cluster SSE) of k-means for k = 1..10.
x = df[['Age', 'Spending Score (1-100)']].values
ind = np.arange(1, 11)
inertia = []

# k-means with various cluster numbers
for k in ind:
    # n_init is pinned so the result does not depend on the scikit-learn
    # version's default number of restarts
    km = KMeans(n_clusters=int(k), n_init=10, random_state=33)
    km.fit(x)
    inertia.append(km.inertia_)

# plot inertia (seaborn >= 0.12 only accepts x/y as keyword arguments)
sns.lineplot(x=ind, y=inertia)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster SSE)');

In [None]:
# Final 4-cluster k-means model on (Age, Spending Score).
# n_init is pinned so the result does not depend on the scikit-learn
# version's default number of restarts.
km = KMeans(n_clusters=4, n_init=10, random_state=33)
clusters = km.fit_predict(x)
centers = km.cluster_centers_

# Column 1 of x (Spending Score) goes on the horizontal axis and column 0 (Age)
# on the vertical one. seaborn >= 0.12 only accepts x/y as keyword arguments.
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 1], y=x[:, 0], hue=clusters)
plt.xlabel('Spending Score (1-100)')
plt.ylabel('Age')
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Same plot with the cluster centers overlaid in red
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 1], y=x[:, 0], hue=clusters)
sns.scatterplot(x=centers[:, 1], y=centers[:, 0], color='red')
plt.xlabel('Spending Score (1-100)')
plt.ylabel('Age')
plt.title('Clusters plus Cluster Centers')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

### Clustering using all variables (no standardization of data)

In [None]:
# Elbow method on all columns of df, unscaled: record the inertia
# (within-cluster SSE) of k-means for k = 1..10.
x = df.values
cluster_counts = list(range(1, 11))
inertia = []

for k in cluster_counts:
    # n_init is pinned so the result does not depend on the scikit-learn
    # version's default number of restarts
    km = KMeans(n_clusters=k, n_init=10, random_state=33)
    km.fit(x)
    inertia.append(km.inertia_)

# seaborn >= 0.12 only accepts x/y as keyword arguments
sns.lineplot(x=cluster_counts, y=inertia)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster SSE)');

In [None]:
# 6-cluster k-means model on the current feature matrix x.
# n_init is pinned so the result does not depend on the scikit-learn
# version's default number of restarts.
km = KMeans(n_clusters=6, n_init=10, random_state=33)
clusters = km.fit_predict(x)

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

### Clustering using all variables (standardized data)

In [None]:
from sklearn.preprocessing import StandardScaler
# Elbow method on all columns of df after standardization: record the inertia
# (within-cluster SSE) of k-means for k = 1..10.
sc = StandardScaler()
x = sc.fit_transform(df.values)
cluster_counts = list(range(1, 11))
inertia = []

for k in cluster_counts:
    # n_init is pinned so the result does not depend on the scikit-learn
    # version's default number of restarts
    km = KMeans(n_clusters=k, n_init=10, random_state=33)
    km.fit(x)
    inertia.append(km.inertia_)

# seaborn >= 0.12 only accepts x/y as keyword arguments
sns.lineplot(x=cluster_counts, y=inertia)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster SSE)');

The 'elbow' graph is vague, and although eight seems to be the optimal number of clusters, we will try with six, seven, and eight clusters, and compare the silhouette graphs.

In [None]:
# six clusters
# n_init is pinned so the result does not depend on the scikit-learn
# version's default number of restarts.
km = KMeans(n_clusters=6, n_init=10, random_state=33)
clusters = km.fit_predict(x)

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
# titled so the six/seven/eight-cluster figures can be told apart
plt.title('Silhouette Graph (6 clusters)')
plt.show();

In [None]:
# seven clusters
# n_init is pinned so the result does not depend on the scikit-learn
# version's default number of restarts.
km = KMeans(n_clusters=7, n_init=10, random_state=33)
clusters = km.fit_predict(x)

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
# titled so the six/seven/eight-cluster figures can be told apart
plt.title('Silhouette Graph (7 clusters)')
plt.show();

In [None]:
# eight clusters
# n_init is pinned so the result does not depend on the scikit-learn
# version's default number of restarts.
km = KMeans(n_clusters=8, n_init=10, random_state=33)
clusters = km.fit_predict(x)

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
# titled so the six/seven/eight-cluster figures can be told apart
plt.title('Silhouette Graph (8 clusters)')
plt.show();

The silhouette graphs confirm that eight is the best number of clusters, although, as is usually the case with real-world data that have more than two or three dimensions, the results aren't perfect.

# PCA

In [None]:
from sklearn.decomposition import PCA

### PCA on raw data

In [None]:
# Project the unscaled columns of df onto their first two principal components.
pca = PCA(n_components=2)
x = pca.fit_transform(df.values)

# Plain projection first (seaborn >= 0.12 only accepts x/y as keyword arguments)
sns.scatterplot(x=x[:, 0], y=x[:, 1]);

# Then the same projection coloured by each original column in turn;
# the second item of each pair is the legend anchor used for that figure.
hue_columns = [
    ('Spending Score (1-100)', (1.01, 0.64)),
    ('Annual Income (k$)', (1.01, 0.64)),
    ('Age', (1.01, 0.57)),
    ('Male', (1.01, 0.64)),
]
for column, legend_loc in hue_columns:
    plt.figure()
    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=df[column])
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend(loc=legend_loc);

In [None]:
# elbow graph to determine optimal n_clusters:
# inertia (within-cluster SSE) of k-means on x for k = 1..10
cluster_counts = list(range(1, 11))
inertia = []
for k in cluster_counts:
    # n_init is pinned so the result does not depend on the scikit-learn
    # version's default number of restarts
    km = KMeans(n_clusters=k, n_init=10, random_state=33)
    km.fit(x)
    inertia.append(km.inertia_)

# seaborn >= 0.12 only accepts x/y as keyword arguments
sns.lineplot(x=cluster_counts, y=inertia)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster SSE)');

In [None]:
# 5-cluster k-means model on the current two-column matrix x.
# n_init is pinned so the result does not depend on the scikit-learn
# version's default number of restarts.
km = KMeans(n_clusters=5, n_init=10, random_state=33)
clusters = km.fit_predict(x)
centers = km.cluster_centers_

# Cluster assignments (seaborn >= 0.12 only accepts x/y as keyword arguments)
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Same plot with the cluster centers overlaid in red
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
sns.scatterplot(x=centers[:, 0], y=centers[:, 1], color='red')
plt.title('Clusters plus Cluster Centers')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

### PCA on standardized data

In [None]:
# Standardize the columns of df, then project them onto their first two
# principal components.
pca = PCA(n_components=2)
sc = StandardScaler()
x = pca.fit_transform(sc.fit_transform(df.values))

# Plain projection first (seaborn >= 0.12 only accepts x/y as keyword arguments)
sns.scatterplot(x=x[:, 0], y=x[:, 1]);

# Then the same projection coloured by each original column in turn;
# the second item of each pair is the legend anchor used for that figure.
hue_columns = [
    ('Spending Score (1-100)', (1.01, 0.64)),
    ('Annual Income (k$)', (1.01, 0.64)),
    ('Age', (1.01, 0.57)),
    ('Male', (1.01, 0.64)),
]
for column, legend_loc in hue_columns:
    plt.figure()
    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=df[column])
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend(loc=legend_loc);

In [None]:
# elbow graph to determine optimal n_clusters:
# inertia (within-cluster SSE) of k-means on x for k = 1..10
cluster_counts = list(range(1, 11))
inertia = []
for k in cluster_counts:
    # n_init is pinned so the result does not depend on the scikit-learn
    # version's default number of restarts
    km = KMeans(n_clusters=k, n_init=10, random_state=33)
    km.fit(x)
    inertia.append(km.inertia_)

# seaborn >= 0.12 only accepts x/y as keyword arguments
sns.lineplot(x=cluster_counts, y=inertia)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster SSE)');

Which cluster number is best, four or six? Let's try both.

In [None]:
# 4-cluster k-means model on the current two-column matrix x.
# n_init is pinned so the result does not depend on the scikit-learn
# version's default number of restarts.
km = KMeans(n_clusters=4, n_init=10, random_state=33)
clusters = km.fit_predict(x)
centers = km.cluster_centers_

# Cluster assignments (seaborn >= 0.12 only accepts x/y as keyword arguments)
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Same plot with the cluster centers overlaid in red
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
sns.scatterplot(x=centers[:, 0], y=centers[:, 1], color='red')
plt.title('Clusters plus Cluster Centers')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

In [None]:
# 6-cluster k-means model on the current two-column matrix x.
# n_init is pinned so the result does not depend on the scikit-learn
# version's default number of restarts.
km = KMeans(n_clusters=6, n_init=10, random_state=33)
clusters = km.fit_predict(x)
centers = km.cluster_centers_

# Cluster assignments (seaborn >= 0.12 only accepts x/y as keyword arguments)
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Same plot with the cluster centers overlaid in red
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
sns.scatterplot(x=centers[:, 0], y=centers[:, 1], color='red')
plt.title('Clusters plus Cluster Centers')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

Clearly four clusters is a better option.

So, should we keep the clustering on raw PCA data, or standardized PCA data? In theory, we should keep the standardized one, but in practice the raw version seems better, both from visual inspection and from the silhouette figure. (Perhaps it worked that way because the data aren't very complex).

Now let's move on to hierarchical clustering.


### Hierarchical Clustering (Agglomerative)

In [None]:
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage
from sklearn.cluster import AgglomerativeClustering as agglo

#### Clustering According to Annual Income and Spending Score

In [None]:
# Ward-linkage dendrogram of the (Annual Income, Spending Score) features.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
x = df[feature_columns].values
ward_links = linkage(x, method='ward')
dend = dendrogram(ward_links)
plt.show();

In [None]:
# Agglomerative clustering into five groups.
# The 'affinity' argument was deprecated in scikit-learn 1.2 and removed in
# 1.4; Euclidean distance is the default, so the argument is simply omitted.
# NOTE(review): the dendrogram in the previous cell was built with Ward linkage
# while this model uses complete linkage — confirm the cluster count carries over.
ag = agglo(n_clusters=5, linkage='complete')
clusters = ag.fit_predict(x)

# Cluster assignments (seaborn >= 0.12 only accepts x/y as keyword arguments)
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

#### Clustering According to Age and Spending Score

In [None]:
# Ward-linkage dendrogram of the (Age, Spending Score) features.
feature_columns = ['Age', 'Spending Score (1-100)']
x = df[feature_columns].values
ward_links = linkage(x, method='ward')
den = dendrogram(ward_links)
plt.show();

From the above dendrogram it is unclear whether the optimal number of clusters is three or four. Although four clusters seem to be best, we will try both options.

In [None]:
# four clusters

# The 'affinity' argument was deprecated in scikit-learn 1.2 and removed in
# 1.4; Euclidean distance is the default, so the argument is simply omitted.
# NOTE(review): the dendrogram cell above was built with Ward linkage while this
# model uses complete linkage — confirm the cluster count carries over.
ag = agglo(n_clusters=4, linkage='complete')
clusters = ag.fit_predict(x)

# Column 1 of x (Spending Score) goes on the horizontal axis and column 0 (Age)
# on the vertical one. seaborn >= 0.12 only accepts x/y as keyword arguments.
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 1], y=x[:, 0], hue=clusters)
plt.xlabel('Spending Score (1-100)')
plt.ylabel('Age')
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

In [None]:
# three clusters
# The 'affinity' argument was deprecated in scikit-learn 1.2 and removed in
# 1.4; Euclidean distance is the default, so the argument is simply omitted.
# NOTE(review): the dendrogram cell above was built with Ward linkage while this
# model uses complete linkage — confirm the cluster count carries over.
ag = agglo(n_clusters=3, linkage='complete')
clusters = ag.fit_predict(x)

# Column 1 of x (Spending Score) goes on the horizontal axis and column 0 (Age)
# on the vertical one. seaborn >= 0.12 only accepts x/y as keyword arguments.
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 1], y=x[:, 0], hue=clusters)
plt.xlabel('Spending Score (1-100)')
plt.ylabel('Age')
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

As we see in the silhouette graph, the three-cluster solution is the worse of the two options, so we stick to four clusters, which was also suggested by the 'elbow' method used in k-means.

Now, as with k-means, we will explore the scenario where we don't want to cluster based on a specific attribute, but want to use them all.


### Clustering using all variables (no standardization of data)

In [None]:
# Ward-linkage dendrogram over all columns of df.
# The values are used as they are in df; no scaling is applied in this cell.
x = df.values
ward_links = linkage(x, method='ward')
den = dendrogram(ward_links)
plt.show();

In [None]:
# the dendrogram suggests six clusters
# The 'affinity' argument was deprecated in scikit-learn 1.2 and removed in
# 1.4; Euclidean distance is the default, so the argument is simply omitted.
# NOTE(review): the dendrogram was built with Ward linkage while this model
# uses complete linkage — confirm the cluster count carries over.
ag = agglo(n_clusters=6, linkage='complete')
clusters = ag.fit_predict(x)

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

### PCA on raw data

In [None]:
# Project the unscaled columns of df onto two principal components and draw
# the Ward-linkage dendrogram of the projection.
pca = PCA(n_components=2)
raw_features = df.values
x = pca.fit_transform(raw_features)
ward_links = linkage(x, method='ward')
dend = dendrogram(ward_links)
plt.show();

In [None]:
# dendrogram suggests five clusters
# NOTE(review): the dendrogram was built with Ward linkage while this model
# uses complete linkage — confirm the cluster count carries over.
ag = agglo(n_clusters=5, linkage='complete')
clusters = ag.fit_predict(x)

# Cluster assignments (seaborn >= 0.12 only accepts x/y as keyword arguments)
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

### PCA on standardized data

In [None]:
# Standardize the columns of df, project them onto two principal components,
# and draw the Ward-linkage dendrogram of the projection.
sc = StandardScaler()
pca = PCA(n_components=2)
scaled_features = sc.fit_transform(df.values)
x = pca.fit_transform(scaled_features)
ward_links = linkage(x, method='ward')
dend = dendrogram(ward_links)
plt.show();

In [None]:
# dendrogram suggests four clusters
# NOTE(review): the dendrogram was built with Ward linkage while this model
# uses complete linkage — confirm the cluster count carries over.
ag = agglo(n_clusters=4, linkage='complete')
clusters = ag.fit_predict(x)

# Cluster assignments (seaborn >= 0.12 only accepts x/y as keyword arguments)
plt.figure(figsize=(7, 5))
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=clusters)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Clusters')
plt.legend(loc=(1.05, 0.7));

# Silhouette plot: each cluster is drawn as a band of its samples' sorted
# silhouette coefficients, stacked along the y-axis.
sils = silhouette_samples(x, clusters, metric='euclidean')
labels = np.unique(clusters)
n_clusters = len(labels)
silhouette_avg = sils.mean()

plt.figure(figsize=(6, 5))
yticks = []
band_start = 0
for idx, label in enumerate(labels):
    band = np.sort(sils[clusters == label])
    band_stop = band_start + band.size
    plt.barh(range(band_start, band_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(idx / n_clusters))
    yticks.append((band_start + band_stop) / 2.)  # tick at the middle of the band
    band_start = band_stop
# dashed red line = mean silhouette coefficient over all samples
plt.axvline(silhouette_avg, color='red', linestyle='--')
plt.yticks(yticks, labels + 1)  # display clusters as 1..k instead of 0..k-1
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.title('Silhouette Graph')
plt.show();

Agglomerative clustering yielded some interesting results, but k-means seems better, and for this project is picked as the best option.