In [73]:
from bokeh.io import output_notebook, show
from bokeh.layouts import column, row, widgetbox
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource, LabelSet, CustomJS, Slider, Range1d
from bokeh.models.widgets import Select, Panel, Tabs
import pandas as pd
import os
import numpy as np
from scipy import stats
from scipy.optimize import curve_fit
import math
import copy
import csv

In [74]:
# Configure Bokeh so that later show() calls render inline in the notebook.
output_notebook()

In [75]:
# Load the complete dataset and the copy in which some cells were blanked out.
# Both files are expected to have the same rows in the same order.
with open('Wholesale customers data.csv', 'rt') as csvfile:
    original_data = np.array(list(csv.reader(csvfile, delimiter=',')))

with open('Wholesale customers data-missing.csv', 'rt') as csvfile:
    modified_data = np.array(list(csv.reader(csvfile, delimiter=',')))

# Separate the header row from the data and scale every value down by 1000.
# The builtin float replaces the np.float alias, which NumPy 1.24 removed.
columns = original_data[0]
original_data = original_data[1:, :].astype(float) / 1000
modified_data = modified_data[1:, :]

# A blank cell marks a missing value. Blanks are tracked with an explicit mask
# so that a genuine 0 in the data is never mistaken for a missing entry.
missing_mask = modified_data == ''
cols = missing_mask.sum(axis=0).astype(int)  # number of missing values per column

# Numeric copy of the modified data; missing cells hold the placeholder 0.
test_data = np.zeros(modified_data.shape, dtype=float)
test_data[~missing_mask] = modified_data[~missing_mask].astype(float)
test_data = test_data / 1000

# (row, column) position of every missing cell, in row-major order.
missing_positions = np.argwhere(missing_mask)
missing_indices = [missing_positions[:, 0].tolist(), missing_positions[:, 1].tolist()]
total_missing = len(missing_positions)

# One entry per missing cell: the row as read (placeholder 0 in the missing
# cell) and the matching row of the complete dataset. Sized from the data
# instead of a fixed 6-row buffer.
missing_rows = test_data[missing_positions[:, 0]]
missing_rows_fromOriginal = original_data[missing_positions[:, 0]]

# Drop every row that has at least one missing value.
test_data = np.delete(test_data, missing_indices[0], axis=0)

print('~~Missing Values~~')
print('Column Name:'.ljust(20),'Number of Missing Values:')
for k in range(len(cols)):
    print(columns[k].ljust(20),cols[k])

~~Missing Values~~
Column Name:         Number of Missing Values:
Channel              0
Region               0
Fresh                0
Milk                 1
Grocery              2
Frozen               0
Detergents_Paper     2
Delicassen           1


In [76]:
#Methods to handle missing values:
#Method 1 - Ignore Values (ready to plot)

#Method 2 - Mean
column_mean = np.mean(test_data, axis=0)

#Method 3 - Median
column_median = np.median(test_data, axis=0)

#Method 4 - Mode
# np.ravel gives one mode per column whether stats.mode returns a
# (1, n_columns) array (older SciPy) or an (n_columns,) array (SciPy >= 1.11).
# Indexing the result with [0][0] only works for the older shape.
column_mode = np.ravel(stats.mode(test_data, axis=0)[0])

#Method 5 - Fit to function
#~~~~Column 4~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Divisor applied to every summed squared error.
# NOTE(review): the plot titles call the result "MSE", but it is the sum of
# squared errors divided by this constant, not by the number of samples.
error_divide = 100


def _exponential_model(x, a, b, c):
    """Exponential model with offset: a * exp(-b * x) + c."""
    return a * np.exp(-b * x) + c


def _linear_model(x, a, b):
    """Straight-line model: a * x + b."""
    return a * x + b


def _fit_and_plot(x_idx, y_idx, model, kind, n_points=500):
    """Fit one column of test_data against another and plot the result.

    Parameters
    ----------
    x_idx, y_idx : int
        0-based test_data column indices of the predictor and the target.
    model : callable
        Model handed to curve_fit, called as model(x, *params).
    kind : str
        Model description shown in the plot title.
    n_points : int
        Number of points on the drawn fit curve.

    Returns
    -------
    tuple
        (popt, pcov, x_grid, y_grid, err, fig, source)
    """
    x_vals = test_data[:, x_idx]
    y_vals = test_data[:, y_idx]
    popt, pcov = curve_fit(model, x_vals, y_vals)
    x_grid = np.linspace(min(x_vals), max(x_vals), n_points)
    y_grid = model(x_grid, *popt)
    # Summed squared residuals over all rows, scaled by error_divide.
    residuals = model(x_vals, *popt) - y_vals
    err = float(np.sum(residuals ** 2)) / error_divide
    # Titles use 1-based column numbers.
    fig = figure(plot_width = 500, plot_height = 300,
                 title='Relation with Column ' + str(x_idx + 1) + '. MSE in ' + kind + ' fit: ' + str(int(err)))
    source = ColumnDataSource(data=dict(xAxis=x_vals, yAxis=y_vals))
    fig.scatter(x='xAxis', y='yAxis', source=source, size= 8)
    fig.line(x=x_grid, y=y_grid, line_width = 2, color = 'red')
    return popt, pcov, x_grid, y_grid, err, fig, source


# The per-pair model names are kept because the imputation cell calls them by name.
func_c32 = func_c35 = func_c37 = _exponential_model
func_c34 = func_c36 = _linear_model

#---With column 2
popt_c32, pcov, x_c32, y_c32, err_c32, p_c32, source_c32 = _fit_and_plot(2, 3, func_c32, 'exponential')

#---With column 4
popt_c34, pcov, x_c34, y_c34, err_c34, p_c34, source_c34 = _fit_and_plot(4, 3, func_c34, 'linear')

#---With column 5
popt_c35, pcov, x_c35, y_c35, err_c35, p_c35, source_c35 = _fit_and_plot(5, 3, func_c35, 'exponential', n_points=5000)

#---With column 6
popt_c36, pcov, x_c36, y_c36, err_c36, p_c36, source_c36 = _fit_and_plot(6, 3, func_c36, 'linear')

#---With column 7
popt_c37, pcov, x_c37, y_c37, err_c37, p_c37, source_c37 = _fit_and_plot(7, 3, func_c37, 'exponential')

layout_c3 = row(p_c32,p_c34,p_c35,p_c36,p_c37)
show(layout_c3)

### Plots for estimating Column 4 using other columns
Column 5 is used to estimate column 4, with an MSE of 112<br>
X-Axes: Values of corresponding columns<br>
Y-Axes: Values of column 4

In [77]:
#~~~~Column 5~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _exponential_model(x, a, b, c):
    """Exponential model with offset: a * exp(-b * x) + c."""
    return a * np.exp(-b * x) + c


def _linear_model(x, a, b):
    """Straight-line model: a * x + b."""
    return a * x + b


def _fit_and_plot(x_idx, y_idx, model, kind, n_points=500):
    """Fit one column of test_data against another and plot the result.

    Parameters
    ----------
    x_idx, y_idx : int
        0-based test_data column indices of the predictor and the target.
    model : callable
        Model handed to curve_fit, called as model(x, *params).
    kind : str
        Model description shown in the plot title.
    n_points : int
        Number of points on the drawn fit curve.

    Returns
    -------
    tuple
        (popt, pcov, x_grid, y_grid, err, fig, source)
    """
    x_vals = test_data[:, x_idx]
    y_vals = test_data[:, y_idx]
    popt, pcov = curve_fit(model, x_vals, y_vals)
    x_grid = np.linspace(min(x_vals), max(x_vals), n_points)
    y_grid = model(x_grid, *popt)
    # Summed squared residuals over all rows, scaled by error_divide (set in
    # the column 4 cell). NOTE(review): titled "MSE" but not divided by the
    # number of samples.
    residuals = model(x_vals, *popt) - y_vals
    err = float(np.sum(residuals ** 2)) / error_divide
    # Titles use 1-based column numbers.
    fig = figure(plot_width = 500, plot_height = 300,
                 title='Relation with Column ' + str(x_idx + 1) + '. MSE in ' + kind + ' fit: ' + str(int(err)))
    source = ColumnDataSource(data=dict(xAxis=x_vals, yAxis=y_vals))
    fig.scatter(x='xAxis', y='yAxis', source=source, size= 8)
    fig.line(x=x_grid, y=y_grid, line_width = 2, color = 'red')
    return popt, pcov, x_grid, y_grid, err, fig, source


# The per-pair model names are kept because the imputation cell calls them by name.
func_c42 = func_c45 = func_c47 = _exponential_model
func_c43 = func_c46 = _linear_model

#---With column 2
popt_c42, pcov, x_c42, y_c42, err_c42, p_c42, source_c42 = _fit_and_plot(2, 4, func_c42, 'exponential')

#---With column 3
popt_c43, pcov, x_c43, y_c43, err_c43, p_c43, source_c43 = _fit_and_plot(3, 4, func_c43, 'linear')

#---With column 5
popt_c45, pcov, x_c45, y_c45, err_c45, p_c45, source_c45 = _fit_and_plot(5, 4, func_c45, 'exponential', n_points=5000)

#---With column 6
popt_c46, pcov, x_c46, y_c46, err_c46, p_c46, source_c46 = _fit_and_plot(6, 4, func_c46, 'linear')

#---With column 7
popt_c47, pcov, x_c47, y_c47, err_c47, p_c47, source_c47 = _fit_and_plot(7, 4, func_c47, 'exponential')

layout_c4 = row(p_c42,p_c43,p_c45,p_c46,p_c47)
show(layout_c4)

### Plots for estimating Column 5 using other columns
Column 7 is used to estimate column 5, with an MSE of 57<br>
X-Axes: Values of corresponding columns<br>
Y-Axes: Values of column 5

In [78]:
#~~~~Column 7~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _exponential_model(x, a, b, c):
    """Exponential model with offset: a * exp(-b * x) + c."""
    return a * np.exp(-b * x) + c


def _linear_model(x, a, b):
    """Straight-line model: a * x + b."""
    return a * x + b


def _fit_and_plot(x_idx, y_idx, model, kind, n_points=500):
    """Fit one column of test_data against another and plot the result.

    Parameters
    ----------
    x_idx, y_idx : int
        0-based test_data column indices of the predictor and the target.
    model : callable
        Model handed to curve_fit, called as model(x, *params).
    kind : str
        Model description shown in the plot title.
    n_points : int
        Number of points on the drawn fit curve.

    Returns
    -------
    tuple
        (popt, pcov, x_grid, y_grid, err, fig, source)
    """
    x_vals = test_data[:, x_idx]
    y_vals = test_data[:, y_idx]
    popt, pcov = curve_fit(model, x_vals, y_vals)
    x_grid = np.linspace(min(x_vals), max(x_vals), n_points)
    y_grid = model(x_grid, *popt)
    # Summed squared residuals over all rows, scaled by error_divide (set in
    # the column 4 cell). NOTE(review): titled "MSE" but not divided by the
    # number of samples.
    residuals = model(x_vals, *popt) - y_vals
    err = float(np.sum(residuals ** 2)) / error_divide
    # Titles use 1-based column numbers, derived from the index so that the
    # predictor test_data[:, 2] is labelled "Column 3" (it was mislabelled
    # "Column 2" in this cell).
    fig = figure(plot_width = 500, plot_height = 300,
                 title='Relation with Column ' + str(x_idx + 1) + '. MSE in ' + kind + ' fit: ' + str(int(err)))
    source = ColumnDataSource(data=dict(xAxis=x_vals, yAxis=y_vals))
    fig.scatter(x='xAxis', y='yAxis', source=source, size= 8)
    fig.line(x=x_grid, y=y_grid, line_width = 2, color = 'red')
    return popt, pcov, x_grid, y_grid, err, fig, source


# The per-pair model names are kept because the imputation cell calls them by name.
func_c62 = func_c65 = func_c67 = _exponential_model
func_c63 = func_c64 = _linear_model

#---With column 2
popt_c62, pcov, x_c62, y_c62, err_c62, p_c62, source_c62 = _fit_and_plot(2, 6, func_c62, 'exponential')

#---With column 3
popt_c63, pcov, x_c63, y_c63, err_c63, p_c63, source_c63 = _fit_and_plot(3, 6, func_c63, 'linear')

#---With column 4
popt_c64, pcov, x_c64, y_c64, err_c64, p_c64, source_c64 = _fit_and_plot(4, 6, func_c64, 'linear', n_points=5000)

#---With column 5
popt_c65, pcov, x_c65, y_c65, err_c65, p_c65, source_c65 = _fit_and_plot(5, 6, func_c65, 'exponential')

#---With column 7
popt_c67, pcov, x_c67, y_c67, err_c67, p_c67, source_c67 = _fit_and_plot(7, 6, func_c67, 'exponential')

layout_c6 = row(p_c62,p_c63,p_c64,p_c65,p_c67)
show(layout_c6)

### Plots for estimating Column 7 using other columns
Column 5 is used to estimate column 7, with an MSE of 14<br>
X-Axes: Values of corresponding columns<br>
Y-Axes: Values of column 7

In [79]:
#~~~~Column 8~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _exponential_model(x, a, b, c):
    """Exponential model with offset: a * exp(-b * x) + c."""
    return a * np.exp(-b * x) + c


def _linear_model(x, a, b):
    """Straight-line model: a * x + b."""
    return a * x + b


def _fit_and_plot(x_idx, y_idx, model, kind, n_points=500):
    """Fit one column of test_data against another and plot the result.

    Parameters
    ----------
    x_idx, y_idx : int
        0-based test_data column indices of the predictor and the target.
    model : callable
        Model handed to curve_fit, called as model(x, *params).
    kind : str
        Model description shown in the plot title.
    n_points : int
        Number of points on the drawn fit curve.

    Returns
    -------
    tuple
        (popt, pcov, x_grid, y_grid, err, fig, source)
    """
    x_vals = test_data[:, x_idx]
    y_vals = test_data[:, y_idx]
    popt, pcov = curve_fit(model, x_vals, y_vals)
    x_grid = np.linspace(min(x_vals), max(x_vals), n_points)
    y_grid = model(x_grid, *popt)
    # Summed squared residuals over all rows, scaled by error_divide (set in
    # the column 4 cell). NOTE(review): titled "MSE" but not divided by the
    # number of samples.
    residuals = model(x_vals, *popt) - y_vals
    err = float(np.sum(residuals ** 2)) / error_divide
    # Titles use 1-based column numbers.
    fig = figure(plot_width = 500, plot_height = 300,
                 title='Relation with Column ' + str(x_idx + 1) + '. MSE in ' + kind + ' fit: ' + str(int(err)))
    source = ColumnDataSource(data=dict(xAxis=x_vals, yAxis=y_vals))
    fig.scatter(x='xAxis', y='yAxis', source=source, size= 8)
    fig.line(x=x_grid, y=y_grid, line_width = 2, color = 'red')
    return popt, pcov, x_grid, y_grid, err, fig, source


# The per-pair model names are kept because the imputation cell calls them by name.
func_c74 = func_c76 = _exponential_model
func_c72 = func_c73 = func_c75 = _linear_model

#---With column 2
popt_c72, pcov, x_c72, y_c72, err_c72, p_c72, source_c72 = _fit_and_plot(2, 7, func_c72, 'linear')

#---With column 3
popt_c73, pcov, x_c73, y_c73, err_c73, p_c73, source_c73 = _fit_and_plot(3, 7, func_c73, 'linear')

#---With column 4
popt_c74, pcov, x_c74, y_c74, err_c74, p_c74, source_c74 = _fit_and_plot(4, 7, func_c74, 'exponential', n_points=5000)

#---With column 5
popt_c75, pcov, x_c75, y_c75, err_c75, p_c75, source_c75 = _fit_and_plot(5, 7, func_c75, 'linear')

#---With column 6
popt_c76, pcov, x_c76, y_c76, err_c76, p_c76, source_c76 = _fit_and_plot(6, 7, func_c76, 'exponential')

layout_c7 = row(p_c72,p_c73,p_c74,p_c75,p_c76)
show(layout_c7)

### Plots for estimating Column 8 using other columns
Column 4 is used to estimate column 8, with an MSE of 29<br>
X-Axes: Values of corresponding columns<br>
Y-Axes: Values of column 8

In [80]:
# Start every imputation method from the rows as read, in which the missing
# cell holds the placeholder 0.
filledRows_mean = missing_rows.copy()
filledRows_median = missing_rows.copy()
filledRows_mode = missing_rows.copy()
filledRows_fit = missing_rows.copy()

# Model used to impute each target column:
# target column -> (model, fitted parameters, predictor column), all 0-based.
fit_for_column = {
    3: (func_c34, popt_c34, 4),
    4: (func_c46, popt_c46, 6),
    6: (func_c64, popt_c64, 4),
    7: (func_c73, popt_c73, 3),
}

# Entry i of missing_indices[1] is the column of the missing cell in row i of
# missing_rows. Iterate over the recorded cells instead of a fixed count of 6.
for i, column_num in enumerate(missing_indices[1]):
    filledRows_mean[i][column_num] = column_mean[column_num]
    filledRows_median[i][column_num] = column_median[column_num]
    filledRows_mode[i][column_num] = column_mode[column_num]
    if column_num in fit_for_column:
        fit_model, fit_params, predictor = fit_for_column[column_num]
        # NOTE(review): if the predictor cell of the same row were also
        # missing, the model would be evaluated on the placeholder 0.
        filledRows_fit[i][column_num] = fit_model(missing_rows[i][predictor], *fit_params)
    else:
        # No model was fitted for this column; the fit value keeps its placeholder.
        print('No fitted model for column index {}; row {} keeps its placeholder value'.format(column_num, i))

# 0-based indices of the columns that contain imputed cells.
imputed_col_ids = (3, 4, 6, 7)

# Complete-dataset values of each imputed column, used for both axes so the
# points lie on the line y = x.
original_columns = {}
for col_id in imputed_col_ids:
    original_columns['x_c%d' % col_id] = original_data[:, col_id]
    original_columns['y_c%d' % col_id] = original_data[:, col_id]
source_originalData = ColumnDataSource(data=original_columns)

# Full filled-in tables for every imputation method; read by the selection callback.
data_source = ColumnDataSource(data={
    'meanData': filledRows_mean,
    'medianData': filledRows_median,
    'modeData': filledRows_mode,
    'fitData': filledRows_fit,
})

# Rows of missing_rows whose imputed cell lies in each column; these points
# are drawn larger and in red.
highlighted_rows = {3: (2,), 4: (0, 1), 6: (3, 5), 7: (4,)}

# Original vs. currently displayed filled-in values (mean imputation at start).
plot_columns = {}
for col_id in imputed_col_ids:
    plot_columns['original_c%d' % col_id] = missing_rows_fromOriginal[:, col_id]
    plot_columns['filledRows_c%d' % col_id] = filledRows_mean[:, col_id]
for col_id in imputed_col_ids:
    marked = highlighted_rows[col_id]
    plot_columns['color_c%d' % col_id] = ['red' if row_id in marked else 'navy' for row_id in range(6)]
    plot_columns['size_c%d' % col_id] = [8 if row_id in marked else 4 for row_id in range(6)]
plot_source = ColumnDataSource(data=plot_columns)

# Horizontal connector for each imputed cell, running from the filled-in
# (mean) value to the original value at the height of the filled-in value.
# Entries are (x key, y key, row in missing_rows, column).
connector_specs = [
    ('c3_x', 'c3_y', 2, 3),
    ('c4_x1', 'c4_y1', 0, 4),
    ('c4_x2', 'c4_y2', 1, 4),
    ('c6_x1', 'c6_y1', 3, 6),
    ('c6_x2', 'c6_y2', 5, 6),
    ('c7_x', 'c7_y', 4, 7),
]
line_columns = {}
for x_key, y_key, row_id, col_id in connector_specs:
    filled_value = filledRows_mean[row_id, col_id]
    line_columns[x_key] = [filled_value, missing_rows_fromOriginal[row_id, col_id]]
    line_columns[y_key] = [filled_value, filled_value]
line_source = ColumnDataSource(data=line_columns)

# Browser-side callback for the imputation drop-down: copies the filled-in
# values of the selected method into plot_source and moves the connectors in
# line_source to match.
callback_select = CustomJS(args=dict(plot_source=plot_source,data_source=data_source,line_source=line_source), code="""
    // Every name is declared with var so the callback creates no implicit globals.
    var data = data_source.data;
    var plot_data = plot_source.data;
    var line_data = line_source.data;
    var c3 = plot_data['filledRows_c3'];
    var c4 = plot_data['filledRows_c4'];
    var c6 = plot_data['filledRows_c6'];
    var c7 = plot_data['filledRows_c7'];

    // Map the drop-down label to the data_source column of that method.
    var columnFor = {
        'Mean': 'meanData',
        'Median': 'medianData',
        'Mode': 'modeData',
        'Fit to model': 'fitData'
    };
    var filled = data[columnFor[cb_obj.value]];

    if (filled !== undefined) {
        // Each table is read as a flat row-major array of 6 rows x 8 columns.
        for (var i = 0; i < 6; i++) {
            c3[i] = filled[(i*8)+3];
            c4[i] = filled[(i*8)+4];
            c6[i] = filled[(i*8)+6];
            c7[i] = filled[(i*8)+7];
        }
    }

    // Move each connector to the newly selected filled-in value.
    line_data.c3_x[0] = c3[2];
    line_data.c3_y = [c3[2],c3[2]];
    line_data.c4_x1[0] = c4[0];
    line_data.c4_y1 = [c4[0],c4[0]];
    line_data.c4_x2[0] = c4[1];
    line_data.c4_y2 = [c4[1],c4[1]];
    line_data.c6_x1[0] = c6[3];
    line_data.c6_y1 = [c6[3],c6[3]];
    line_data.c6_x2[0] = c6[5];
    line_data.c6_y2 = [c6[5],c6[5]];
    line_data.c7_x[0] = c7[4];
    line_data.c7_y = [c7[4],c7[4]];

    plot_source.trigger('change');
    line_source.trigger('change');
    """)


# Shared size of every plot in the tabs, and the fixed axis limits of the
# comparison plots.
widths = 460
heights = 330
xRange = 10
yRange = 10
lowerRange = -1


def _difference_figure(col_id):
    """Create the comparison figure for 0-based column col_id.

    Adds the original-data scatter and the scatter of the rows that had a
    missing value; the title uses the 1-based column number.
    """
    key = 'c' + str(col_id)
    fig = figure(plot_width = widths, plot_height = heights,
                 title='Column ' + str(col_id + 1) + ': Difference from original values',
                 tools = ['pan', 'wheel_zoom', 'reset', 'resize','box_zoom','tap','previewsave','crosshair'])
    fig.scatter(x='x_' + key, y='y_' + key, source=source_originalData, size = 4, color='navy')
    fig.scatter(x='original_' + key, y='filledRows_' + key, source=plot_source,
                size='size_' + key, color='color_' + key)
    return fig


def _candidate_fills(row_id, col_id):
    """Values proposed for one missing cell by mean, median, mode and fit, in that order."""
    tables = (filledRows_mean, filledRows_median, filledRows_mode, filledRows_fit)
    return [table[row_id, col_id] for table in tables]


# Each figure gets, in this order: a vertical line spanning the candidate
# fills at the original value, the y = x reference line, and the connector(s)
# driven by line_source.
plot_c3 = _difference_figure(3)
points_c32 = _candidate_fills(2, 3)
plot_c3.line(x=[missing_rows_fromOriginal[2,3]] * 2, y=[max(points_c32), min(points_c32)])
diag_c3 = [min(test_data[:,3]), max(test_data[:,3])]
plot_c3.line(x=diag_c3, y=list(diag_c3))
plot_c3.line(x='c3_x', y='c3_y', source=line_source)

plot_c4 = _difference_figure(4)
points_c40 = _candidate_fills(0, 4)
points_c41 = _candidate_fills(1, 4)
plot_c4.line(x=[missing_rows_fromOriginal[0,4]] * 2, y=[max(points_c40), min(points_c40)])
plot_c4.line(x=[missing_rows_fromOriginal[1,4]] * 2, y=[max(points_c41), min(points_c41)])
# This reference line starts at the minimum of original_data, unlike the other
# three plots, which use test_data for both ends.
diag_c4 = [min(original_data[:,4]), max(test_data[:,4])]
plot_c4.line(x=diag_c4, y=list(diag_c4))
plot_c4.line(x='c4_x1', y='c4_y1', source=line_source)
plot_c4.line(x='c4_x2', y='c4_y2', source=line_source)

plot_c6 = _difference_figure(6)
points_c63 = _candidate_fills(3, 6)
points_c65 = _candidate_fills(5, 6)
plot_c6.line(x=[missing_rows_fromOriginal[3,6]] * 2, y=[max(points_c63), min(points_c63)])
plot_c6.line(x=[missing_rows_fromOriginal[5,6]] * 2, y=[max(points_c65), min(points_c65)])
diag_c6 = [min(test_data[:,6]), max(test_data[:,6])]
plot_c6.line(x=diag_c6, y=list(diag_c6))
plot_c6.line(x='c6_x1', y='c6_y1', source=line_source)
plot_c6.line(x='c6_x2', y='c6_y2', source=line_source)

plot_c7 = _difference_figure(7)
points_c7 = _candidate_fills(4, 7)
plot_c7.line(x=[missing_rows_fromOriginal[4,7]] * 2, y=[min(points_c7), max(points_c7)])
diag_c7 = [min(test_data[:,7]), max(test_data[:,7])]
plot_c7.line(x=diag_c7, y=list(diag_c7))
plot_c7.line(x='c7_x', y='c7_y', source=line_source)

# Axis labels and fixed ranges are identical for all four figures.
for comparison_fig in (plot_c3, plot_c4, plot_c6, plot_c7):
    comparison_fig.xaxis.axis_label = 'Original Column Data'
    comparison_fig.yaxis.axis_label = 'Filled-in Column Data'

for comparison_fig in (plot_c3, plot_c4, plot_c6, plot_c7):
    comparison_fig.x_range = Range1d(lowerRange, xRange)
    comparison_fig.y_range = Range1d(lowerRange, yRange)

# Disabled debug dump of the original, missing and filled-in rows. Kept as
# comments instead of a bare string expression that is evaluated on every run.
# np.set_printoptions(precision=4)
# np.set_printoptions(suppress=True)
# print(missing_rows_fromOriginal,'Original-----------\n')
# print(missing_rows,'Missing-----------\n')
# print(filledRows_mean,'Mean-----------\n')
# print(filledRows_median,'Median-----------\n')
# print(filledRows_mode,'Mode-----------\n')
# print(filledRows_fit,'Fit-----------\n')

# Drop-down that switches the displayed imputation method through callback_select.
select = Select(title="Select Imputation Method", value='Mean', options=['Mean','Median','Mode','Fit to model'], callback=callback_select)

# First tab: the drop-down above a 2 x 2 grid of comparison plots.
final_layout = column(select,row(plot_c3,plot_c4),row(plot_c6,plot_c7))
tab1 = Panel(child=final_layout, title='Data Plots')

# Filled-in tables in the order the error plots list them: Mean, Median, Mode, Fit.
imputation_tables = (filledRows_mean, filledRows_median, filledRows_mode, filledRows_fit)


def _error_panel(title, error_values, y_top):
    """Build one error plot: a labelled point per imputation method.

    Parameters
    ----------
    title : str
        Figure title.
    error_values : list
        Error of the Mean, Median, Mode and Fit methods, in that order.
    y_top : number
        Upper limit of the y axis.

    Returns
    -------
    tuple
        (source, fig, labels)
    """
    source = ColumnDataSource(data=dict(
        x = [1,2,3,4],
        y = error_values,
        errors = ['Mean', 'Median', 'Mode', 'Fit']
    ))
    fig = figure(plot_width=widths, plot_height=heights, title=title)
    fig.xaxis.visible = False
    labels = LabelSet(x='x', y='y', text='errors', level='glyph', source=source,
                      x_offset=1, y_offset=1, render_mode='canvas')
    fig.add_layout(labels)
    fig.line(x='x', y='y', source=source, line_width=2, color="navy", alpha=0.3)
    fig.circle(x='x', y='y', source=source, size=8, color="navy")
    fig.x_range=Range1d(0.5,4.5)
    fig.y_range=Range1d(0,y_top)
    return source, fig, labels


# Columns with one missing cell: absolute difference from the original value.
c3_raw = missing_rows_fromOriginal[2,3]
c3_errorSource, c3_error, labels_c3 = _error_panel(
    'Column 4 mean error in estimation',
    [abs(c3_raw-table[2,3]) for table in imputation_tables],
    6)

# Columns with two missing cells: the two signed errors are summed before the
# absolute value is taken, then halved.
# NOTE(review): errors of opposite sign cancel here; confirm this is intended
# rather than the mean of the two absolute errors.
c4_raw = missing_rows_fromOriginal[0,4] + missing_rows_fromOriginal[1,4]
c4_errorSource, c4_error, labels_c4 = _error_panel(
    'Column 5 mean error in estimation',
    [abs(c4_raw-table[0,4]-table[1,4])/2 for table in imputation_tables],
    5.5)

c6_raw = missing_rows_fromOriginal[3,6] + missing_rows_fromOriginal[5,6]
c6_errorSource, c6_error, labels_c6 = _error_panel(
    'Column 7 mean error in estimation',
    [abs(c6_raw-table[3,6]-table[5,6])/2 for table in imputation_tables],
    5.5)

c7_raw = missing_rows_fromOriginal[4,7]
c7_errorSource, c7_error, labels_c7 = _error_panel(
    'Column 8 mean error in estimation',
    [abs(c7_raw-table[4,7]) for table in imputation_tables],
    5.5)

# Second tab: 2 x 2 grid of error plots.
final_layout_error = column(row(c3_error,c4_error),row(c6_error,c7_error))
tab2 = Panel(child=final_layout_error, title="Error Plots")

# Combine the data plots and the error plots into one tabbed view and render it.
tabs = Tabs(tabs=[ tab1, tab2 ])

show(tabs)

#### The plots above represent the original columns on the x-axis, and the filled-in columns on the y-axis. 
The red circles are values that were filled in, with the thin blue line representing the range of values that they take over the different methods of imputation. The intersection of the y = x line with the range line represents the original data point in the raw data. The data is scaled down by 1000 to prevent overflow in curve fitting calculations.
<br>The error plots show that model fitting gives good accuracy in all cases, and that the median also estimates the missing data well in most cases. The mode is the worst imputation method for this data.