In [80]:
# Notebook-level metadata: author, license, version and contact address.
__author__ = "Siddharth Chandrasekaran"
__license__ = "GPL"
__version__ = "1.0.1"
__email__ = "schandraseka@umass.edu"


from bokeh.core.properties import field
from bokeh.io import curdoc
from bokeh.layouts import layout
from bokeh.models import (ColumnDataSource, HoverTool, SingleIntervalTicker, Slider, CategoricalColorMapper, Button, FactorRange, LinearColorMapper, ColorBar)
from bokeh.palettes import Category10, Category20, Plasma256, Paired, linear_palette, Viridis256
from bokeh.plotting import figure	
from bokeh.io import output_notebook
from bokeh.charts import HeatMap, show, bins
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neighbors import kneighbors_graph
from sklearn import cluster
from bokeh.palettes import Spectral6
from bokeh.transform import factor_cmap
from bokeh.layouts import widgetbox
from bokeh.models.widgets import Button, RadioButtonGroup, Select, Slider
from functools import partial
from bokeh.models.widgets import DataTable
from bokeh.models.glyphs import HBar
from sklearn import linear_model
from itertools import product
import time

# --- Load the survey and recode its categorical answers ----------------------
data = pd.read_csv("nutrition_raw_anonymized_data.csv")
# 'Innie'/'Outie' answers are rewritten as 'No'/'Yes', presumably so the same
# No/Yes -> 0/1 recode used below also covers that column.
data.replace(('Innie', 'Outie'), ('No', 'Yes'), inplace=True)

# Positional column groups. Columns 0, 4 and 27 fall outside every slice.
# NOTE(review): presumably those columns are excluded on purpose -- confirm
# against the CSV header.
diseases = data.iloc[:, 1:4].replace(('No', 'Yes'), (0, 1))
habits = data.iloc[:, 5:27].replace(('No', 'Yes'), (0, 1))
foodhabits = data.iloc[:, 28:1093]

# Predictors (habits + food habits) and the full table with the disease
# columns placed first.
train_data = pd.concat([habits, foodhabits], axis=1)
normalized_data = pd.concat([diseases, train_data], axis=1)

# --- Pairwise correlations, reshaped to long form ----------------------------
correlation_matrix = normalized_data.corr()
cols = pd.Series(correlation_matrix.columns.tolist())
# Label the row axis 'Feature1' and the column axis 'Feature2' so that, once
# stacked, every row reads (Feature1, Feature2, Correlation).
correlation_matrix = correlation_matrix.assign(Feature1=cols.values).set_index('Feature1')
correlation_matrix.columns.name = 'Feature2'
correlation_matrix = correlation_matrix.stack().to_frame('Correlation').reset_index()

# Keep only the (disease, non-disease feature) pairs.
correlation_matrix = correlation_matrix.loc[
    correlation_matrix['Feature1'].isin(list(diseases.columns))
    & ~correlation_matrix['Feature2'].isin(list(diseases.columns))
]

display(correlation_matrix.head(1500))

# --- Heat map of disease-vs-feature correlations ------------------------------
# Nine-step palette; the mapper spreads it linearly between the smallest and
# largest correlation present in the filtered table.
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
mapper = LinearColorMapper(palette=colors,
                           low=correlation_matrix['Correlation'].min(),
                           high=correlation_matrix['Correlation'].max())
source = ColumnDataSource(correlation_matrix)
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

# A categorical axis only draws glyphs whose coordinate is one of its factors,
# so each range must be built from the column plotted on that axis: Feature1
# on x, Feature2 on y. `unique()` keeps first-appearance order, so the axis
# order is the same on every run (a `set` gives an arbitrary order).
x_factors = list(correlation_matrix['Feature1'].unique())
y_factors = list(correlation_matrix['Feature2'].unique())

p = figure(title="Heat map of correlation between the diseases and the various other fields",
           plot_width=900, plot_height=400,
           x_range=x_factors, y_range=y_factors, tools=TOOLS)
p.rect(x="Feature1", y="Feature2", width=1, height=1, source=source,
       fill_color={'field': 'Correlation', 'transform': mapper}, line_color=None)

# Show which pair and which value the cursor is over.
p.select_one(HoverTool).tooltips = [
    ("pair", "@Feature1 / @Feature2"),
    ("correlation", "@Correlation"),
]

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = 22 / 21  # radians, roughly 60 degrees
color_bar = ColorBar(color_mapper=mapper)

p.add_layout(color_bar)

# Bind the result to its own name: assigning it to `layout` would shadow the
# imported `bokeh.layouts.layout` function and make this cell fail when it is
# re-run in the same kernel.
heatmap_layout = layout([
    [p]
])
# NOTE(review): every run of this cell adds another root to curdoc(); in a
# long-lived kernel, stale roots from earlier runs may still be validated.
curdoc().add_root(heatmap_layout)
# NOTE(review): the document title mentions a "wholesale dataset" although this
# notebook loads nutrition data -- confirm the intended title.
curdoc().title = "690V Assignment - Clustering wholesale dataset"
show(heatmap_layout)




Unnamed: 0,Feature1,Feature2,Correlation
3,cancer,ever_smoked,0.057117
4,cancer,currently_smoke,-0.075771
5,cancer,smoke_often,0.188982
6,cancer,smoke_rarely,-0.251691
7,cancer,never_smoked,-0.057117
8,cancer,quit_smoking,0.119289
9,cancer,left_hand,-0.069473
10,cancer,right_hand,0.152001
11,cancer,readingMath,0.057117
12,cancer,mathReading,-0.069400


E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: Feature1, Feature2 [renderer: GlyphRenderer(id='a438a228-e478-4883-b559-b3b9d0cf0e3b', ...)]
W-1005 (SNAPPED_TOOLBAR_ANNOTATIONS): Snapped toolbars and annotations on the same side MAY overlap visually: Figure(id='333fb298-9a3e-4e6e-9de3-28ff8720b3aa', ...)
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: Feature1, Feature2 [renderer: GlyphRenderer(id='2dd3af9d-9ea7-4fd5-ae98-6ee6a2493961', ...)]
W-1005 (SNAPPED_TOOLBAR_ANNOTATIONS): Snapped toolbars and annotations on the same side MAY overlap visually: Figure(id='f788e87b-9837-4ac9-b0f2-21f699ab9448', ...)
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: Feature1, Feature2 [renderer: GlyphRenderer(id='8698b79b-9a2d-4604-85fe-ce989f783ecb', ...)]
W-1005 (SNAPPED_TOOLBAR_ANNOTATIONS): Snapped toolbars and annotations on the same side MAY overlap visually: Figure(id='18477f57-fec0-4961-98cf-f36b39feaa8d', ...)
W-1001 (NO_DATA_RENDERERS

In [41]:
# Prediction targets: an alias of (not a copy of) the `diseases` frame.
test_data = diseases
def compute_error(y_hat, y):
    """Return the mean absolute error between predictions and targets.

    Args:
        y_hat: predicted values; must support element-wise subtraction with ``y``.
        y: reference values.

    Returns:
        The mean of ``|y_hat - y|``, as computed by the result's ``.mean()``.
    """
    absolute_residuals = np.abs(y_hat - y)
    return absolute_residuals.mean()


#Parse param grid
def parse_param_grid(param_grid):
    """Yield every parameter combination described by a list of grids.

    Args:
        param_grid: iterable of dicts, each mapping a parameter name to the
            sequence of values to try for that parameter.

    Yields:
        One ``dict`` per combination (name -> single value). Within a grid the
        names are taken in sorted order and the last name varies fastest.
        An empty grid dict yields a single empty ``dict``.
    """
    for grid in param_grid:
        if not grid:
            # Unpacking zip(*[]) into two names would raise ValueError; an
            # empty grid has exactly one combination: no parameters at all.
            yield {}
            continue
        keys, values = zip(*sorted(grid.items()))
        for combination in product(*values):
            yield dict(zip(keys, combination))

#Returns the single best result based on out of sample error
def get_nbest_result(results, n):
    """Return the ``(error, params)`` entry with the lowest error.

    Args:
        results: sequence of tuples whose first element is the error score.
            The sequence is not modified.
        n: accepted for backward compatibility and currently unused; the
            function returns one entry, not a list, regardless of ``n``.

    Returns:
        The entry with the smallest first element. When several entries tie,
        the one that appears first in ``results`` is returned.

    Raises:
        IndexError: if ``results`` is empty.
    """
    if not results:
        raise IndexError("get_nbest_result: no results to choose from")
    # min() keeps the first of equal scores (the same winner as a stable sort)
    # without reordering the caller's list.
    return min(results, key=lambda entry: entry[0])

#Grid Search CV custom
def cross_validation(basemodel, cv, X, y, paramgridIterator):
    """Score each parameter combination with shuffled K-fold cross-validation.

    Args:
        basemodel: estimator exposing ``set_params``, ``fit`` and ``predict``.
            ``set_params`` is applied to this object directly, so after the
            call it holds the last combination that was tried.
        cv: number of folds, passed to ``KFold(n_splits=cv)``.
        X: feature table (DataFrame or array-like), one row per sample.
        y: targets (Series or array-like), aligned with the rows of ``X``.
        paramgridIterator: iterable of keyword-argument dicts for
            ``basemodel.set_params``.

    Returns:
        A list of ``(mean_fold_error, params)`` tuples in the order the
        combinations were produced; the error is ``compute_error`` averaged
        over the folds.
    """
    # Work on plain arrays so fold indices are always positional, whatever
    # index labels X and y carry. ``DataFrame.as_matrix()`` no longer exists
    # in current pandas, and it never existed on plain arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    # Fixed seed: every parameter combination is scored on the same folds.
    kf = KFold(n_splits=cv, random_state=1, shuffle=True)

    result = []
    for param in paramgridIterator:
        start_time = time.time()
        print("Starting : "+ str(param))
        # Configure once per combination; each fold refits the same estimator.
        model = basemodel.set_params(**param)
        validation_folds_score = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            fittedmodel = model.fit(X_train, y_train)
            ycap = fittedmodel.predict(X_test)
            validation_folds_score.append(compute_error(ycap, y_test))
        result.append((sum(validation_folds_score)/float(len(validation_folds_score)), param))
        print("--- %s seconds ---" % (time.time() - start_time))
    return result

# Fit one class-balanced, L2-penalised logistic regression per target column.
# The solver is chosen by the 3-fold cross-validated mean absolute error.
for column in test_data:
    y = test_data[column]
    X = train_data

    # Candidate settings: each key maps to the list of values to try.
    param_grid = {
        "penalty": ["l2"],
        "class_weight": ["balanced"],
        "solver": ["liblinear", "newton-cg"],
    }
    paramgridIterator = parse_param_grid([param_grid])

    lr = linear_model.LogisticRegression()
    cv_scores = cross_validation(lr, 3, X, y, paramgridIterator)
    nbesttreemodel = get_nbest_result(cv_scores, 1)
    best_params = nbesttreemodel[1]
    print((best_params))

    # Refit on the full data with the winning settings and show the weights.
    fittedmodel = lr.set_params(**best_params).fit(X, y)
    display(fittedmodel.coef_)


Starting : {'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
--- 0.055269718170166016 seconds ---
Starting : {'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'newton-cg'}
--- 0.27640628814697266 seconds ---
{'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}


array([[  7.10172024e-05,  -2.34964740e-05,   5.19808984e-05, ...,
          8.76329312e-06,  -1.29333068e-05,  -4.64647209e-06]])

Starting : {'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
--- 0.05665302276611328 seconds ---
Starting : {'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'newton-cg'}
--- 0.2936105728149414 seconds ---
{'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}


array([[  1.74446681e-04,   8.47136365e-06,  -4.89196706e-05, ...,
         -9.73940618e-06,  -2.37617914e-05,  -3.88845742e-07]])

Starting : {'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
--- 0.05323600769042969 seconds ---
Starting : {'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'newton-cg'}
--- 0.5605061054229736 seconds ---
{'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}


array([[  5.11383770e-04,   7.92727958e-05,  -8.18406573e-06, ...,
          1.36894233e-05,  -2.98575012e-05,  -4.53410788e-06]])