In [64]:
from bokeh.models   import ColumnDataSource, HoverTool, CustomJS, Select
from bokeh.plotting import figure
from bokeh.io       import output_notebook, show
from bokeh.layouts  import column, row

import pandas as pd
import numpy as np

# Configure Bokeh so that subsequent show() calls render inline in notebook cells.
output_notebook()

In [65]:
# The complete dataset; it is the reference against which each
# missing-value treatment further down is compared.
C = pd.read_csv('Wholesale customers data.csv')

# Per-column mean and median of the complete data.
Cmean, Cmed = C.mean(), C.median()

In [66]:
# Variant of the dataset that contains missing values; the replacement
# methods later in the notebook all start from this frame.
Missing = pd.read_csv(filepath_or_buffer='Wholesale customers data-missing.csv')

### Compare the descriptive statistics of the original data and the data with missing values

In [67]:
# Per-column statistics of the incomplete data (pandas skips NaN by default).
Missing_mean, Missing_med = Missing.mean(), Missing.median()

# One row per dataset, one column per attribute: the transposed layout is
# easier to handle once the frames are wrapped in ColumnDataSource objects.
Means = pd.DataFrame(dict(Original=Cmean, Missing=Missing_mean)).transpose()
Meds = pd.DataFrame(dict(Original=Cmed, Missing=Missing_med)).transpose()

In [68]:
# Display the table of column means: one row per dataset, one column per attribute.
Means

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
Missing,1.322727,2.543182,12000.297727,5795.719818,7972.655251,3071.931818,2891.719178,1525.71754
Original,1.322727,2.543182,12000.297727,5796.265909,7951.277273,3071.931818,2881.493182,1524.870455


In [69]:
# Display the table of column medians: one row per dataset, one column per attribute.
Meds

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
Missing,1.0,3.0,8504.0,3620.0,4755.5,1526.0,820.5,964.0
Original,1.0,3.0,8504.0,3627.0,4755.5,1526.0,816.5,965.5


## Make a scatter plot with two drop-down menus for selecting the attributes to be plotted against each other

In [70]:
# Scatter plot of the data with missing values; two drop-down menus choose
# which attribute is drawn on each axis.

# Every column of the Missing frame (NaN entries included); the callbacks
# copy columns out of this source.
CDS = ColumnDataSource(Missing)
# The two columns currently drawn; both axes start on 'Channel'.
PlotCDS = ColumnDataSource(data={'x': Missing['Channel'].values,
                                 'y': Missing['Channel'].values})

# JavaScript shared by both selectors. %s is replaced by the PlotCDS column
# ('x' or 'y') that the selector controls; cb_obj is the Select widget whose
# value just changed.
AXIS_JS = """
    var d1 = s1.data;
    var d2 = s2.data;
    var dimension = cb_obj.value;
    d2.%s = d1[dimension];
    s2.change.emit();
"""

xCallback = CustomJS(args=dict(s1=CDS, s2=PlotCDS), code=AXIS_JS % 'x')
yCallback = CustomJS(args=dict(s1=CDS, s2=PlotCDS), code=AXIS_JS % 'y')

p = figure(title="Scatter plot",
           width=400, height=400)
p.scatter(x='x', y='y', source=PlotCDS)

# Offer exactly the columns held by CDS, so every menu entry is a valid key
# in the callback (independent of any other frame in the notebook).
axis_options = list(Missing.columns)

# The `callback=` keyword of Select was removed in Bokeh 2.0; js_on_change
# attaches the same CustomJS to the widget's `value` property and works on
# both old and new Bokeh versions.
xS = Select(title="X-Axis", value="Channel", options=axis_options)
xS.js_on_change('value', xCallback)
yS = Select(title="Y-Axis", value="Channel", options=axis_options)
yS.js_on_change('value', yCallback)

layout = row(p, column(xS, yS))
show(layout)

# Replacement Methods

### Method 1: Replace missing values with a fixed scalar (zero is chosen here) and plot a comparison of the mean and median

In [76]:
# Method 1: fill every missing entry with the constant 0, then compare the
# resulting column means/medians with those of the complete data.
Replaced = Missing.fillna(value=0)
Replaced_mean = Replaced.mean(axis=0)
Replaced_med  = Replaced.median(axis=0)

# take transpose for easier handling through CDS objects
# NOTE: this rebinds Means/Meds, replacing the Original-vs-Missing frames
# built earlier in the notebook.
Means = pd.DataFrame({'Original':Cmean, 'Replaced':Replaced_mean}).T
Meds  = pd.DataFrame({'Original':Cmed,  'Replaced':Replaced_med}).T

# Create CDS objects for Mean and Median DataFrames
# (full tables, read by the callback) ...
MeanCDS = ColumnDataSource(Means)
MedCDS  = ColumnDataSource(Meds)
# ... and the single attribute currently drawn in each bar chart.
meanPlotCDS = ColumnDataSource(data={'x':Means.index.values, 'y':Means['Channel'].values})
medPlotCDS  = ColumnDataSource(data={'x':Meds.index.values, 'y':Meds['Channel'].values})

# On a menu change, copy the selected attribute's column into both bar charts.
Callback = CustomJS(args=dict(s1=MeanCDS, s2=meanPlotCDS, s3=MedCDS, s4=medPlotCDS), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var d3 = s3.data;
    var d4 = s4.data;
    var dimension = cb_obj.value;
    d2.y = d1[dimension];
    d4.y = d3[dimension];
    s2.change.emit();
    s4.change.emit();
""")

meanplot=figure(title="Comparision of Means", 
         width=400, height=500, 
         x_range=list(Means.index))
meanplot.vbar(x='x', top='y', bottom=0, source=meanPlotCDS, width=0.5)
meanplot.add_tools(HoverTool(tooltips=[("Value", "@y{0.000}"), ("Category", "@x")]))

# The median chart takes its categories from the frame it actually plots.
medplot=figure(title="Comparision of Medians", 
         width=400, height=500, 
         x_range=list(Meds.index))
medplot.vbar(x='x', top='y', bottom=0, source=medPlotCDS, width=0.5)
medplot.add_tools(HoverTool(tooltips=[("Value", "@y{0.000}"), ("Category", "@x")]))

# Select(callback=...) was removed in Bokeh 2.0; js_on_change attaches the same
# CustomJS to the `value` property and works on old and new versions alike.
Dimension=Select(title="Dimension", value="Channel", options=list(Means.columns))
Dimension.js_on_change('value', Callback)

layout = column(Dimension, row(meanplot, medplot))
show(layout)

### Method 2: Replace missing values with the mean of the column

In [77]:
# Method 2: fill each column's missing entries with that column's mean
# (computed over the available values), then compare with the complete data.
Replaced = Missing.fillna(Missing_mean)
Replaced_mean = Replaced.mean()
Replaced_med  = Replaced.median(axis=0)

# take transpose for easier handling through CDS objects
Means2 = pd.DataFrame({'Original':Cmean, 'Replaced':Replaced_mean}).T
Meds2  = pd.DataFrame({'Original':Cmed,  'Replaced':Replaced_med}).T

# Create CDS objects for Mean and Median DataFrames
# (full tables, read by the callback) ...
MeanCDS2 = ColumnDataSource(Means2)
MedCDS2  = ColumnDataSource(Meds2)
# ... and the single attribute currently drawn in each bar chart.
meanPlotCDS2 = ColumnDataSource(data={'x':Means2.index.values, 'y':Means2['Channel'].values})
medPlotCDS2  = ColumnDataSource(data={'x':Meds2.index.values, 'y':Meds2['Channel'].values})

# On a menu change, copy the selected attribute's column into both bar charts.
Callback = CustomJS(args=dict(s1=MeanCDS2, s2=meanPlotCDS2, s3=MedCDS2, s4=medPlotCDS2), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var d3 = s3.data;
    var d4 = s4.data;
    var dimension = cb_obj.value;
    d2.y = d1[dimension];
    d4.y = d3[dimension];
    s2.change.emit();
    s4.change.emit();
""")

# Categories come from this cell's own frames (Means2/Meds2), so the cell no
# longer depends on whichever cell last assigned the global `Means`.
meanplot=figure(title="Comparision of Means", 
         width=400, height=500, 
         x_range=list(Means2.index))
meanplot.vbar(x='x', top='y', bottom=0, source=meanPlotCDS2, width=0.5)
meanplot.add_tools(HoverTool(tooltips=[("Value", "@y{0.000}"), ("Category", "@x")]))

medplot=figure(title="Comparision of Medians", 
         width=400, height=500, 
         x_range=list(Meds2.index))
medplot.vbar(x='x', top='y', bottom=0, source=medPlotCDS2, width=0.5)
medplot.add_tools(HoverTool(tooltips=[("Value", "@y{0.000}"), ("Category", "@x")]))

# Select(callback=...) was removed in Bokeh 2.0; js_on_change attaches the same
# CustomJS to the `value` property and works on old and new versions alike.
Dimension=Select(title="Dimension", value="Channel", options=list(Means2.columns))
Dimension.js_on_change('value', Callback)

layout = column(Dimension, row(meanplot, medplot))
show(layout)

### Method 3 : Replace missing values with the median of the column

In [78]:
# Method 3: fill each column's missing entries with that column's median
# (computed over the available values), then compare with the complete data.
Replaced = Missing.fillna(Missing_med)
Replaced_mean = Replaced.mean()
Replaced_med  = Replaced.median(axis=0)

# take transpose for easier handling through CDS objects
Means3 = pd.DataFrame({'Original':Cmean, 'Replaced':Replaced_mean}).T
Meds3  = pd.DataFrame({'Original':Cmed,  'Replaced':Replaced_med}).T

# Create CDS objects for Mean and Median DataFrames
# (full tables, read by the callback) ...
MeanCDS3 = ColumnDataSource(Means3)
MedCDS3  = ColumnDataSource(Meds3)
# ... and the single attribute currently drawn in each bar chart.
meanPlotCDS3 = ColumnDataSource(data={'x':Means3.index.values, 'y':Means3['Channel'].values})
medPlotCDS3  = ColumnDataSource(data={'x':Meds3.index.values, 'y':Meds3['Channel'].values})

# On a menu change, copy the selected attribute's column into both bar charts.
Callback = CustomJS(args=dict(s1=MeanCDS3, s2=meanPlotCDS3, s3=MedCDS3, s4=medPlotCDS3), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var d3 = s3.data;
    var d4 = s4.data;
    var dimension = cb_obj.value;
    d2.y = d1[dimension];
    d4.y = d3[dimension];
    s2.change.emit();
    s4.change.emit();
""")

# Categories come from this cell's own frames (Means3/Meds3), so the cell no
# longer depends on whichever cell last assigned the global `Means`.
meanplot=figure(title="Comparision of Means", 
         width=400, height=500, 
         x_range=list(Means3.index))
meanplot.vbar(x='x', top='y', bottom=0, source=meanPlotCDS3, width=0.5)
meanplot.add_tools(HoverTool(tooltips=[("Value", "@y{0.000}"), ("Category", "@x")]))

medplot=figure(title="Comparision of Medians", 
         width=400, height=500, 
         x_range=list(Meds3.index))
medplot.vbar(x='x', top='y', bottom=0, source=medPlotCDS3, width=0.5)
medplot.add_tools(HoverTool(tooltips=[("Value", "@y{0.000}"), ("Category", "@x")]))

# Select(callback=...) was removed in Bokeh 2.0; js_on_change attaches the same
# CustomJS to the `value` property and works on old and new versions alike.
Dimension=Select(title="Dimension", value="Channel", options=list(Means3.columns))
Dimension.js_on_change('value', Callback)

layout = column(Dimension, row(meanplot, medplot))
show(layout)

### Method 4: Remove the rows that contain missing values

In [79]:
# Method 4: discard every row that has at least one missing value, then
# compare the statistics of the remaining rows with the complete data.
# (The previous code called fillna(Missing_med) here, which merely repeated
# Method 3 instead of removing rows.)
# The variable names and the 'Replaced' row label are kept so this cell
# mirrors the other methods' plots.
Replaced = Missing.dropna()
Replaced_mean = Replaced.mean()
Replaced_med  = Replaced.median(axis=0)

# take transpose for easier handling through CDS objects
Means4 = pd.DataFrame({'Original':Cmean, 'Replaced':Replaced_mean}).T
Meds4  = pd.DataFrame({'Original':Cmed,  'Replaced':Replaced_med}).T

# Create CDS objects for Mean and Median DataFrames
# (full tables, read by the callback) ...
MeanCDS4 = ColumnDataSource(Means4)
MedCDS4  = ColumnDataSource(Meds4)
# ... and the single attribute currently drawn in each bar chart.
meanPlotCDS4 = ColumnDataSource(data={'x':Means4.index.values, 'y':Means4['Channel'].values})
medPlotCDS4  = ColumnDataSource(data={'x':Meds4.index.values, 'y':Meds4['Channel'].values})

# On a menu change, copy the selected attribute's column into both bar charts.
Callback = CustomJS(args=dict(s1=MeanCDS4, s2=meanPlotCDS4, s3=MedCDS4, s4=medPlotCDS4), code="""
    var d1 = s1.data;
    var d2 = s2.data;
    var d3 = s3.data;
    var d4 = s4.data;
    var dimension = cb_obj.value;
    d2.y = d1[dimension];
    d4.y = d3[dimension];
    s2.change.emit();
    s4.change.emit();
""")

# Categories come from this cell's own frames (Means4/Meds4), so the cell no
# longer depends on whichever cell last assigned the global `Means`.
meanplot=figure(title="Comparision of Means", 
         width=400, height=500, 
         x_range=list(Means4.index))
meanplot.vbar(x='x', top='y', bottom=0, source=meanPlotCDS4, width=0.5)
meanplot.add_tools(HoverTool(tooltips=[("Value", "@y{0.000}"), ("Category", "@x")]))

medplot=figure(title="Comparision of Medians", 
         width=400, height=500, 
         x_range=list(Meds4.index))
medplot.vbar(x='x', top='y', bottom=0, source=medPlotCDS4, width=0.5)
medplot.add_tools(HoverTool(tooltips=[("Value", "@y{0.000}"), ("Category", "@x")]))

# Select(callback=...) was removed in Bokeh 2.0; js_on_change attaches the same
# CustomJS to the `value` property and works on old and new versions alike.
Dimension=Select(title="Dimension", value="Channel", options=list(Means4.columns))
Dimension.js_on_change('value', Callback)

layout = column(Dimension, row(meanplot, medplot))
show(layout)