In [39]:
from sklearn.cluster import KMeans
import numpy as np
from bokeh.io import output_notebook, show
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure

In [40]:
# Configure Bokeh to render plots inline in this notebook when show() is called.
output_notebook()

In [41]:
from pandas import read_csv

# Load the wholesale-customers CSV and name the columns that later cells pull out of it.
dataset = read_csv('Wholesale customers data.csv')
columns = [
    "Channel",
    "Region",
    "Fresh",
    "Milk",
    "Grocery",
    "Frozen",
    "Detergents_Paper",
    "Delicassen",
]

In [42]:
# Pull the listed columns out of the DataFrame as a plain 2-D NumPy array
# (column order follows `columns`), then echo it as the cell output.
data = dataset.loc[:, columns].values
data

array([[    2,     3, 12669, ...,   214,  2674,  1338],
       [    2,     3,  7057, ...,  1762,  3293,  1776],
       [    2,     3,  6353, ...,  2405,  3516,  7844],
       ..., 
       [    2,     3, 14531, ...,   437, 14841,  1867],
       [    1,     3, 10290, ...,  1038,   168,  2125],
       [    1,     3,  2787, ...,    65,   477,    52]], dtype=int64)

In [43]:
# Keep only the Milk and Grocery attributes. Their positions are looked up by
# name in `columns` (Milk -> 3, Grocery -> 4) rather than hard-coded.
milk_idx, grocery_idx = columns.index("Milk"), columns.index("Grocery")
finaldata = data[:, [milk_idx, grocery_idx]]
finaldata

array([[ 9656,  7561],
       [ 9810,  9568],
       [ 8808,  7684],
       [ 1196,  4221],
       [ 5410,  7198],
       [ 8259,  5126],
       [ 3199,  6975],
       [ 4956,  9426],
       [ 3648,  6192],
       [11093, 18881],
       [ 5403, 12974],
       [ 1124,  4523],
       [12319, 11757],
       [ 6208, 14982],
       [ 9465, 12091],
       [ 1114,  3821],
       [ 8816, 12121],
       [ 6157,  2933],
       [ 6327, 10099],
       [ 2495,  9464],
       [ 4519,  4602],
       [  871,  2010],
       [ 1917,  4469],
       [36423, 22019],
       [ 9776, 13792],
       [ 4230,  7595],
       [  961,  2861],
       [  803,  3045],
       [20484, 25957],
       [ 2100,  2609],
       [ 3610, 11107],
       [ 4339,  3133],
       [ 1318,  2886],
       [ 4786,  7326],
       [ 1979,  2262],
       [ 5491, 11091],
       [ 4362,  5428],
       [10556, 12477],
       [15729, 16709],
       [  555,   902],
       [ 4332,  4757],
       [ 3065,  5956],
       [ 7555, 14961],
       [110

In [44]:
# Implementing K-Means clustering algorithm on the (Milk, Grocery) samples.
# random_state pins the centroid initialisation so the cluster ids -- and therefore
# the red/green/blue colours keyed on those ids in the plotting cells -- are the same
# on every Restart & Run All. n_init is spelled out because its default differs
# between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(finaldata)
kmeans.labels_

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       2, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 2, 0, 0, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 1, 2, 1, 0, 1, 2, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1, 1,
       2, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 2,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 2, 1, 1, 0, 0, 2, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 2, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1,

In [57]:
# Custom hover tooltip. `$x` / `$y` are Bokeh's special fields for the cursor position
# in data space, i.e. the Milk / Grocery coordinates under the mouse.
hover = HoverTool(tooltips=[("(Milk, Groceries)", "($x, $y)")])
TOOLS = "box_select,lasso_select,box_zoom,wheel_zoom,pan,help,hover"

# The "hover" name inside TOOLS makes figure() create a *default* hover tool, which
# would leave the configured `hover` instance above unused. Build this figure without
# the default one and attach the custom tool instead. TOOLS itself is left untouched
# because later cells pass it to other figures.
p1_tool_names = ",".join(name for name in TOOLS.split(",") if name != "hover")
p1 = figure(width=500, height=500, title='K-Means Clustering',
            x_axis_label="Milk", y_axis_label="Grocery", tools=p1_tool_names)
p1.add_tools(hover)

In [58]:
# Split the fitted cluster centres into separate x (Milk) and y (Grocery) lists.
centroid_x = [center[0] for center in kmeans.cluster_centers_]
centroid_y = [center[1] for center in kmeans.cluster_centers_]

# Mark each centre with a hollow circle-cross; colours are given in cluster-id order.
p1.circle_cross(
    x=centroid_x,
    y=centroid_y,
    size=25,
    fill_alpha=0,
    line_width=2,
    color=['red', 'green', 'blue'],
)

In [59]:
# Draw every sample in the colour of its K-Means cluster (0 = red, 1 = green, 2 = blue).
# A single glyph call with a per-point colour list replaces one p1.circle() call -- and
# therefore one glyph renderer -- per sample; the rendered points are the same.
kmeans_palette = ["red", "green", "blue"]
kmeans_point_colors = [kmeans_palette[label] for label in kmeans.labels_]
# Trailing ';' keeps the returned renderer from being echoed as cell output.
p1.circle(x=finaldata[:, 0], y=finaldata[:, 1], size=5, color=kmeans_point_colors);

In [60]:
#show(p1)

In [61]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage only supports Euclidean distance, which is also the estimator's default,
# so the distance is not passed explicitly. The former `affinity=` keyword was deprecated
# in scikit-learn 1.2 and removed in 1.4 (renamed `metric`); omitting it keeps this cell
# working on both old and new releases.
hierarchical = AgglomerativeClustering(n_clusters=10, linkage='ward')
hierarchical.fit(finaldata)
labels = hierarchical.labels_
print(labels)  # Cluster id assigned to each sample; shows which ids were generated

[1 3 1 0 1 1 0 3 0 4 3 0 3 3 3 0 3 0 3 0 0 0 0 2 3 0 0 0 6 0 3 0 0 1 0 3 0
 3 2 0 0 0 3 4 3 6 4 8 3 6 0 0 0 3 0 0 6 3 0 3 0 8 1 4 0 6 0 3 1 0 0 4 0 3
 3 0 0 6 0 0 0 3 3 0 0 9 5 3 0 0 0 0 6 0 3 0 0 0 0 0 3 4 3 0 0 0 3 4 1 6 0
 4 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 3 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 6 0 0
 0 0 0 0 0 1 0 4 3 0 3 3 3 0 0 6 1 4 3 0 0 0 3 2 1 4 0 3 1 1 1 0 3 2 3 2 0
 0 0 3 3 3 0 0 0 4 0 1 1 3 0 0 4 6 3 0 0 4 0 1 0 4 0 6 0 1 3 4 6 0 3 0 0 3
 0 0 0 0 3 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 3 3 0 0 0 0 0 6 0 3 1 0 1 0 0
 1 0 0 0 0 4 2 4 0 4 0 0 0 1 0 0 0 0 0 0 3 0 1 0 0 3 0 0 0 0 0 0 0 0 3 0 0
 0 0 3 0 1 4 3 3 4 3 4 0 0 2 0 0 4 0 0 2 0 0 0 2 0 0 0 3 0 2 0 0 0 0 0 6 0
 7 0 3 0 0 0 0 3 3 1 4 0 0 3 3 0 4 0 4 0 4 0 0 0 4 1 0 0 0 0 0 0 3 0 0 0 0
 0 0 0 3 0 0 3 0 0 3 0 0 1 0 2 0 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 0 1 1 0 0 1
 4 0 0 0 0 3 1 0 3 3 3 4 0 3 3 0 0 3 0 3 1 1 0 3 1 0 0 0 4 0 6 0 0]


In [62]:
# Canvas for the agglomerative-clustering scatter plot (Milk on x, Grocery on y).
p2 = figure(
    width=500,
    height=500,
    title='Agglomerative Clustering',
    x_axis_label="Milk",
    y_axis_label="Grocery",
    tools=TOOLS,
)

In [63]:
# Colour each sample by its agglomerative cluster id.
# The model is fitted with n_clusters=10, so ids 0-9 occur. The palette therefore holds
# ten colours: the first seven keep the original id -> colour mapping, and ids 7-9 get
# their own colours so that no sample is left off the plot.
hier_palette = [
    "red", "green", "blue", "purple", "yellow",
    "orange", "black", "cyan", "brown", "pink",
]
# The modulo only matters if n_clusters is later raised above the palette size; colours
# then repeat instead of raising IndexError.
hier_point_colors = [hier_palette[label % len(hier_palette)] for label in hierarchical.labels_]
# One vectorised glyph call for all samples; trailing ';' suppresses the renderer echo.
p2.circle(x=finaldata[:, 0], y=finaldata[:, 1], size=5, color=hier_point_colors);

In [67]:
#show(p2)

In [64]:
from bokeh.layouts import gridplot
# NOTE(review): p1 and p2 were already constructed with the earlier TOOLS value, so
# rebinding TOOLS here does not change their toolbars; it can only affect figures
# created after this cell.
TOOLS = "box_select,lasso_select,box_zoom,wheel_zoom,pan,help"
# Place the K-Means and agglomerative figures side by side in one row and render them.
p = gridplot([[p1, p2]])
show(p)