In [1]:
from bokeh.models import ColumnDataSource, HoverTool, CustomJS, Select, Slider
from bokeh.plotting import figure
from bokeh.io       import output_notebook, show
from bokeh.layouts  import column, row
from bokeh.palettes import inferno

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn import preprocessing

import pandas as pd
import numpy as np

# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

## Method 1: K-Means Clustering

In [2]:
# Load the data and standardize it. The raw frame is kept under its own name so
# that re-running this cell never scales already-scaled data.
raw_df = pd.read_csv('Wholesale customers data.csv')
C = raw_df.columns.values
df = pd.DataFrame(preprocessing.scale(raw_df), columns=C)  # center to mean and standardize to unit variance

# Fixed seed so the K-Means labels (and therefore the point colors) are identical
# on every Restart & Run All. n_init is pinned to 10 because its default value
# differs between scikit-learn releases.
KMEANS_SEED = 0
KMEANS_N_INIT = 10

colors = []
cols = []
for K in range(2, 21):
    kmeans = KMeans(n_clusters=K, n_init=KMEANS_N_INIT, random_state=KMEANS_SEED).fit(df)
    palette = inferno(K)  # build the K-color palette once per K, not once per data point
    colors.append([palette[label] for label in kmeans.labels_])
    cols.append(str(K))

colors = np.asarray(colors)
# One column per K (named '2' ... '20'); each column holds the color of every data point for that K.
KMeansColors = pd.DataFrame(colors.T, columns=cols)

Explanation of the code in the above cell: The K-Means clustering algorithm is run for every number of means from ***K***=2 up to ***K***=20. For each ***K***, the cluster label assigned to each data point is computed, and that label is used to assign a color to the data point.

Finally, a DataFrame is constructed, whose **columns store the color legend for each *K***. These columns are used while interacting with the Slider in the plot.

In [3]:
# Data sources shared with the JavaScript callbacks:
#   CDS            - every standardized feature column (source for the axis selectors)
#   colorCDSKMeans - one color column per K, named '2' ... '20'
#   PlotCDS        - the columns the scatter glyph actually draws from
CDS = ColumnDataSource(df)
colorCDSKMeans = ColumnDataSource(KMeansColors)
# Both axes start on 'Channel' and the colors on K=2, matching the widgets' initial values below.
PlotCDS = ColumnDataSource(data={'x':df['Channel'].values,
                                 'y':df['Channel'].values,
                                 'legend':KMeansColors['2'].values})

# Slider callback: copy the color column for the selected K into the plotted source.
colorChangerKMeans=CustomJS(args=dict(s1=colorCDSKMeans, s2=PlotCDS), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var Num_means = cb_obj.value;
    d2.legend = d1[Num_means.toString()];
    s2.change.emit();
""")

# X-axis selector callback: copy the chosen feature column into 'x'.
xAxisDataKMeans=CustomJS(args=dict(s1=CDS, s2=PlotCDS), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var dimension = cb_obj.value;
    d2.x = d1[dimension];
    s2.change.emit();
""")

# Y-axis selector callback: copy the chosen feature column into 'y'.
yAxisDataKMeans=CustomJS(args=dict(s1=CDS, s2=PlotCDS), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var dimension = cb_obj.value;
    d2.y = d1[dimension];
    s2.change.emit();
""")

p = figure(title="Scatter plot, K-Means Clustering", width=600, height=500)
p.scatter(x='x', y='y', source=PlotCDS, fill_color='legend', size=7)

# Callbacks are attached with js_on_change rather than the `callback=` keyword,
# which newer Bokeh releases no longer accept on widgets.
mS=Slider(title="Number of Means", start=2, end=20, value=2)
mS.js_on_change('value', colorChangerKMeans)

xS=Select(title="X-Axis", value="Channel", options=list(df.columns.values))
xS.js_on_change('value', xAxisDataKMeans)

yS=Select(title="Y-Axis", value="Channel", options=list(df.columns.values))
yS.js_on_change('value', yAxisDataKMeans)

layout = row(p, column(mS, xS, yS))
show(layout)

The above plot only conveys some information about how pairs of dimensions interact with each other. For a better visualisation, dimensionality reduction techniques can be used to see whether they help in understanding the structure of the data.

The plot below uses PCA to reduce the data to two dimensions and then plots the result.

In [4]:
# Project the standardized data onto its first two principal components.
# fit_transform fits the model and returns the projection in one call.
pca = PCA(n_components=2)
pcaDF = pd.DataFrame(pca.fit_transform(df), columns=['x','y'])

# Plotted source: PCA coordinates plus the K-Means colors for the slider's initial value (K=2).
PCADataKMeans = ColumnDataSource(data={'x':pcaDF['x'].values, 'y':pcaDF['y'].values , 'legend':KMeansColors['2'].values})

# Slider callback: copy the color column for the selected K into the plotted source.
colorChangerKMeansPCA=CustomJS(args=dict(s1=colorCDSKMeans, s2=PCADataKMeans), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var Num_means = cb_obj.value;
    d2.legend = d1[Num_means.toString()];
    s2.change.emit();
""")

p = figure(title="Scatter plot with PCA, using KMeans Clustering", width=600, height=500)
p.scatter(x='x', y='y', source=PCADataKMeans, fill_color='legend', size=7)

# The callback is attached with js_on_change rather than the `callback=` keyword,
# which newer Bokeh releases no longer accept on widgets.
mS=Slider(title="Number of Means", start=2, end=20, value=2)
mS.js_on_change('value', colorChangerKMeansPCA)

layoutPCAKMeans = row(p, mS)
show(layoutPCAKMeans)

## Method 2: Agglomerative Clustering

In [5]:
# Run agglomerative clustering for every cluster count from 2 to 20 and record,
# for each count, the color assigned to every data point.
colors = []
cols = []
for N in range(2, 21):  # 2 clusters to 20 clusters
    agglo = AgglomerativeClustering(n_clusters=N).fit(df)
    palette = inferno(N)  # build the N-color palette once per N, not once per data point
    colors.append([palette[label] for label in agglo.labels_])
    cols.append(str(N))

colors = np.asarray(colors)
# One column per cluster count (named '2' ... '20'); each column holds the color of every data point.
AggloColors = pd.DataFrame(colors.T, columns=cols)

In [6]:
# colorCDSAgglo holds one color column per cluster count ('2' ... '20');
# PlotCDS2 holds the columns the scatter glyph actually draws from.
colorCDSAgglo = ColumnDataSource(AggloColors)
# Both axes start on 'Channel' and the colors on 2 clusters, matching the widgets' initial values below.
PlotCDS2 = ColumnDataSource(data={'x':df['Channel'].values,
                                  'y':df['Channel'].values,
                                  'legend':AggloColors['2'].values})

# Slider callback: copy the color column for the selected cluster count into the plotted source.
colorChangerAgglo=CustomJS(args=dict(s1=colorCDSAgglo, s2=PlotCDS2), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var Num_means = cb_obj.value;
    d2.legend = d1[Num_means.toString()];
    s2.change.emit();
""")

# X-axis selector callback: copy the chosen feature column (from CDS) into 'x'.
xAxisDataAgglo=CustomJS(args=dict(s1=CDS, s2=PlotCDS2), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var dimension = cb_obj.value;
    d2.x = d1[dimension];
    s2.change.emit();
""")

# Y-axis selector callback: copy the chosen feature column (from CDS) into 'y'.
yAxisDataAgglo=CustomJS(args=dict(s1=CDS, s2=PlotCDS2), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var dimension = cb_obj.value;
    d2.y = d1[dimension]
    s2.change.emit();
""")

p = figure(title="Scatter plot, using Agglomerative CLustering", width=600, height=500)
p.scatter(x='x', y='y', source=PlotCDS2, fill_color='legend', size=7)

# Callbacks are attached with js_on_change rather than the `callback=` keyword,
# which newer Bokeh releases no longer accept on widgets.
mS=Slider(title="Number of Means", start=2, end=20, value=2)
mS.js_on_change('value', colorChangerAgglo)

xS=Select(title="X-Axis", value="Channel", options=list(df.columns.values))
xS.js_on_change('value', xAxisDataAgglo)

yS=Select(title="Y-Axis", value="Channel", options=list(df.columns.values))
yS.js_on_change('value', yAxisDataAgglo)

layout = row(p, column(mS, xS, yS))
show(layout)

In [7]:
# Plotted source: PCA coordinates plus the agglomerative-clustering colors for
# the slider's initial value (2 clusters).
PCADataAgglo = ColumnDataSource(data={'x':pcaDF['x'].values, 'y':pcaDF['y'].values , 'legend':AggloColors['2'].values})

# Slider callback: copy the color column for the selected cluster count into the plotted source.
colorChangerAggloPCA=CustomJS(args=dict(s1=colorCDSAgglo, s2=PCADataAgglo), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var Num_means = cb_obj.value;
    d2.legend = d1[Num_means.toString()];
    s2.change.emit();
""")

p = figure(title="Scatter plot with PCA, using Agglomerative Clustering", width=600, height=500)
p.scatter(x='x', y='y', source=PCADataAgglo, fill_color='legend', size=7)

# The callback is attached with js_on_change rather than the `callback=` keyword,
# which newer Bokeh releases no longer accept on widgets.
mS=Slider(title="Number of Means", start=2, end=20, value=2)
mS.js_on_change('value', colorChangerAggloPCA)

layoutPCAAgglo = row(p, mS)
show(layoutPCAAgglo)

## Final comparison between the two chosen methods

In [8]:
# A single slider drives both plots: the callback copies the color column for the
# selected value into the 'legend' column of each plot's data source.
#   s1/s2 - agglomerative colors and the source plotted by p2
#   s3/s4 - K-Means colors and the source plotted by p1
colorChangerComparision=CustomJS(args=dict(s1=colorCDSAgglo, 
                                           s2=PCADataAgglo,
                                           s3=colorCDSKMeans,
                                           s4=PCADataKMeans), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var d3 = s3.data;
    var d4 = s4.data;
    var Num_means = cb_obj.value;
    d2.legend = d1[Num_means.toString()];
    d4.legend = d3[Num_means.toString()];
    s2.change.emit();
    s4.change.emit();
""")

p1 = figure(title="KMeans Clustering", width=450, height=400)
p1.scatter(x='x', y='y', source=PCADataKMeans, fill_color='legend', size=7)

p2 = figure(title="Agglomerative Clustering", width=450, height=400)
p2.scatter(x='x', y='y', source=PCADataAgglo, fill_color='legend', size=7)

# The callback is attached with js_on_change rather than the `callback=` keyword,
# which newer Bokeh releases no longer accept on widgets.
slider = Slider(title="Number of Means", start=2, end=20, value=2)
slider.js_on_change('value', colorChangerComparision)

layoutComp = column(slider, row(p1, p2))
show(layoutComp)