In [None]:
#import needed libraries 
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans, MeanShift,estimate_bandwidth
import seaborn as sns
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from bokeh.plotting import figure,show,output_notebook
output_notebook()


%matplotlib inline

In [None]:
# Read the data set from the CSV file next to the notebook, then preview
# its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='./custdata.csv')
df.head(n=5)

In [None]:
# Print a summary of the frame (index, columns, non-null counts, dtypes, memory usage).
df.info()

### Normalize the columns that can be normalized

In [None]:
# Normalize the Fresh, Milk, Grocery, Frozen, Detergents_Paper and Delicassen
# columns. sklearn's normalize() works per sample by default (axis=1), i.e. each
# row of the selected columns is rescaled to unit L2 norm.
# NOTE(review): the columns are selected by name; they are expected to be the
# frame's columns at positions 2-7 -- confirm against df.head().
cols = ['Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen']

# Work on a copy: a plain `df_norm = df` only aliases the frame, so writing the
# normalized values would silently overwrite the raw data in `df` as well.
df_norm = df.copy()

# Whole-column replacement (rather than positional in-place writes) lets the
# columns take the float dtype of the normalized values.
df_norm[cols] = normalize(df[cols])

### Fit k-means with 9 clusters to start off

In [None]:
# Fit k-means with 9 clusters on the normalized frame. random_state pins the
# centroid initialisation so that Restart & Run All reproduces the same clusters.
km = KMeans(n_clusters=9, random_state=0)
km.fit(df_norm)

# Coordinates of the fitted cluster centres, shown as the cell output.
centers = km.cluster_centers_
centers


In [None]:
# Silhouette score (Euclidean distances) of the labels assigned by the fitted
# k-means model on the normalized frame.
labels = km.labels_
silhouette_score(X=df_norm, labels=labels, metric='euclidean')

### Silhouette score as a measure of number of clusters

In [None]:
# Silhouette score on the normalized frame for every cluster count k = 2..15.
# random_state pins each k-means initialisation so the resulting curve (and the
# k read off it) is reproducible across reruns.
scores = []
for k in range(2, 16):
    km = KMeans(n_clusters=k, random_state=0)
    km.fit(df_norm)
    labels = km.labels_
    score = silhouette_score(df_norm, labels, metric='euclidean')
    scores.append(score)

In [None]:
# Silhouette score against the number of clusters, full feature set.
# Author's observation: without reducing dimensions / features the ideal value
# for k would be 9 -- re-check against the curve after rerunning.
plt.plot(range(2, 16), scores)
plt.title('Silhouette score vs. number of clusters (normalized features)')
plt.xlabel('Number of clusters k')
plt.ylabel('Silhouette score');  # trailing ';' keeps the object repr out of the cell output

In [None]:
# Grid of pairwise plots for the columns of df.
sns.pairplot(data=df)

In [None]:
# Reduce df_norm to its first two principal components.
# NOTE(review): only columns 2-7 of df_norm were rescaled; the remaining columns
# enter the PCA on their original scale -- confirm that this is intended.
pca = PCA(n_components=2)
pca.fit(df_norm)
X_df = pca.transform(df_norm)

In [None]:
# Fraction of the total variance explained by each of the first two components.
# print is called as a function with a single argument: the print statement is
# a SyntaxError on Python 3, while this form produces the same output on
# Python 2 and Python 3.
print("First component: " + str(pca.explained_variance_ratio_[0]))
print("Second component: " + str(pca.explained_variance_ratio_[1]))

In [None]:
# Fit k-means with 8 clusters on the 2-D PCA projection. random_state pins the
# centroid initialisation so that Restart & Run All reproduces the same clusters.
km = KMeans(n_clusters=8, random_state=0)
km.fit(X_df)

# Coordinates of the fitted cluster centres, shown as the cell output.
centers = km.cluster_centers_
centers

In [None]:
# Silhouette score (Euclidean distances) of the labels assigned by the fitted
# k-means model on the PCA projection.
labels = km.labels_
silhouette_score(X=X_df, labels=labels, metric='euclidean')

In [None]:
# Silhouette score on the 2-D PCA projection for every cluster count k = 2..15.
# random_state pins each k-means initialisation so the resulting curve (and the
# k read off it) is reproducible across reruns.
scores = []
for k in range(2, 16):
    km = KMeans(n_clusters=k, random_state=0)
    km.fit(X_df)
    labels = km.labels_
    score = silhouette_score(X_df, labels, metric='euclidean')
    scores.append(score)

In [None]:
# Silhouette score against the number of clusters, 2-D PCA projection.
# Author's observation: 8 is the optimal value for the number of clusters --
# re-check against the curve after rerunning.
plt.plot(range(2, 16), scores)
plt.title('Silhouette score vs. number of clusters (2-D PCA projection)')
plt.xlabel('Number of clusters k')
plt.ylabel('Silhouette score');  # trailing ';' keeps the object repr out of the cell output

In [None]:
# Scatter plot of the 2-D PCA projection with the current `centers` drawn on top.
p = figure(title="Clusters in dataset", tools='')

# Every projected sample as a small marker.
p.circle(x=X_df[:, 0], y=X_df[:, 1], size=5)

# Cluster centres as large, translucent green discs.
p.circle(
    x=centers[:, 0],
    y=centers[:, 1],
    size=100,
    color='green',
    alpha=0.4,
)

show(p)

In [None]:
# Estimate a kernel bandwidth from the PCA projection (quantile=0.3), then fit
# mean shift with that bandwidth and binned seeding.
bandwidth = estimate_bandwidth(X_df, quantile=0.3)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X_df)  # fit() returns the estimator itself
ms

In [None]:
# Silhouette score (Euclidean distances) of the labels assigned by the fitted
# mean-shift model on the PCA projection.
labels = ms.labels_
silhouette_score(X=X_df, labels=labels, metric='euclidean')

In [None]:
# Take the cluster centres found by mean shift and draw them over the
# 2-D PCA projection of the samples.
centers = ms.cluster_centers_

p = figure(title="Clusters in dataset", tools='')

# Every projected sample as a small marker.
p.circle(x=X_df[:, 0], y=X_df[:, 1], size=5)

# Mean-shift centres as large, translucent green discs.
p.circle(
    x=centers[:, 0],
    y=centers[:, 1],
    size=100,
    color='green',
    alpha=0.4,
)

show(p)