In [67]:
from sklearn.cluster import KMeans
import numpy as np
from bokeh.models import ColumnDataSource,Legend,HoverTool
from bokeh.io import output_notebook, show, push_notebook
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure
from ipywidgets import interact

In [68]:
output_notebook()

In [69]:
from pandas import read_csv
dataset = read_csv('nutrition_raw_anonymized_data.csv')

First, I've tried to spot features which may contribute to diabetes, heart disease or cancer. The aim is to figure out if things like high Cholestorol, Carbohydrates, sweet or fat intake leads to one of the ailments

In [70]:
newdf = dataset[['cancer', 'diabetes', 'heart_disease', 'GROUP_SWEETS_TOTAL_GRAMS', 'DT_CHOL', 'DT_CARB', 'SOLID_FATS']].copy()
newdf.loc[: , "diabetes"] = newdf.loc[: , "diabetes"].replace(to_replace=['Yes','No'], value=[2,1])
newdf.loc[: , "cancer"] = newdf.loc[: , "cancer"].replace(to_replace=['Yes','No'], value=[2,1])
newdf.loc[: , "heart_disease"] = newdf.loc[: , "heart_disease"].replace(to_replace=['Yes','No'], value=[2,1])

In [71]:
newdf

Unnamed: 0,cancer,diabetes,heart_disease,GROUP_SWEETS_TOTAL_GRAMS,DT_CHOL,DT_CARB,SOLID_FATS
0,2,1,1,0.247,507.92,62.1,61.19
1,1,2,2,578.3,269.81,197.57,40.76
2,2,2,2,649.85,259.12,254.19,54.01
3,1,1,1,196.51,224.06,377.33,25.88
4,2,1,1,125.91,165.06,201.19,23.6
5,1,1,1,52.7,71.13,167.93,17.21
6,2,1,1,64.94,71.6,142.38,15.74
7,2,1,2,155.29,338.24,147.1,35.3
8,2,2,2,27.74,118.31,133.87,23.41
9,2,1,1,4.35,379.11,249.77,42.53


In [72]:
source = ColumnDataSource(data=dict(x=newdf['DT_CHOL'],y=newdf['cancer']))

In [73]:
x_w = 450
y_w = 450
#x=newdf['cancer']
#py=['Grocery','Fresh','Frozen','Delicassen','Detergents_Paper']
TOOLS = "box_select,lasso_select,help"
p = figure(tools=TOOLS, plot_width=x_w, plot_height=y_w, title=None)
p.xaxis.axis_label= 'x'
p.yaxis.axis_label= 'y'
fig = p.circle('x', 'y', source=source)

In [74]:
def update(x,y):
    fig.data_source.data['x'] = newdf[x]
    fig.data_source.data['y'] = newdf[y]
    p.xaxis.axis_label = x
    p.yaxis.axis_label = y
    push_notebook()

In [75]:
#Making use of an interactive visualization to see if inferences can be drawn
interact(update,x=['DT_CHOL','DT_CARB','GROUP_SWEETS_TOTAL_GRAMS','SOLID_FATS',],y=['cancer','diabetes','heart_disease'])
show(p,notebook_handle=True)

As it happens, there are very few clear indications of an association. What we can clearly make out is that people with a low sugar/cholesterol/fat/carbohydrate intake do not usually get the ailments.

In [76]:
#Using the Apriori algorithm to find association rules related to smoking
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [77]:
dataset1 = dataset[['cancer','ever_smoked','currently_smoke','smoke_often', 'smoke_rarely', 'never_smoked']]
df = dataset1.replace(to_replace=['Yes','No'], value=[1,0])
df

Unnamed: 0,cancer,ever_smoked,currently_smoke,smoke_often,smoke_rarely,never_smoked
0,1,1,1,1,0,0
1,0,1,1,0,1,0
2,1,0,0,0,0,1
3,0,0,0,0,0,1
4,1,0,0,0,0,1
5,0,1,1,0,1,0
6,1,0,0,0,0,1
7,1,0,0,0,0,1
8,1,1,0,0,0,0
9,1,0,0,0,0,1


In [78]:
frequent_itemsets = apriori(df, min_support=0.07, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,(cancer),(ever_smoked),0.518519,0.321429,1.084821
1,(ever_smoked),(cancer),0.296296,0.5625,1.084821
2,(ever_smoked),(currently_smoke),0.296296,0.3125,3.375
3,(currently_smoke),(ever_smoked),0.092593,1.0,3.375


A Market Basket analysis of the features related to smoking reveals that ever_smoked->cancer has the highest confidence and is thus the most reliable of the association rules generated. Correspondingly, cancer->ever_smoked is also related.
Results are further corroborated by the fact that currently_smoke->ever_smoked has a confidence of 1 (which makes sense that a person who smokes currently will obviously be classified as a part of 'ever_smoked')

In [79]:
#Using K-Means Clustering to find the feature that has the greatest relation to diabetes.
maxval=0
for i in range (28,1092):
    newdata = dataset.as_matrix(columns=dataset.columns[i:i+1])
    kmeans1 = KMeans(n_clusters=2)
    kmeans1.fit(newdata)
    km_labels = kmeans1.labels_
    #Calculating the indexes assigned to the first cluster
    clus0 = [x[0] for x, value in np.ndenumerate(km_labels) if value==0]
    y1=0
    n1=0
    #Calculating the features in the first cluster with diabetes value as Yes
    for j in clus0:
        if dataset.loc[j:j].diabetes.any() == "Yes":
            y1=y1+1
    if len(clus0)>0:
        n1=(len(clus0))-y1
        #Computing a score to compare different features based on the presence of diabetic people in the cluster
        #54 represents the total number of individuals and 15 represents the total number of individuals who has diabetes
        yscore1=y1*(len(clus0))/15*54
    
    y2=0
    n2=0
    #Calculating the features in the second cluster with diabetes value as Yes
    clus1 = [x[0] for x, value in np.ndenumerate(km_labels) if value==1]
    for j in clus1:
        if dataset.loc[j:j].diabetes.any() == "Yes":
            y2=y2+1
    if len(clus1)>0:
        n2=(len(clus1))-y2
        yscore2=y2*(len(clus1))/15*54
    
    yscore = yscore1+yscore2
    
    if yscore>maxval:
        maxval = y1/len(clus0)
        print(dataset.columns[i])
        print("Yscore is %d" %(yscore))
    print("y1 %d/%d y2 %d/%d " %(y1,len(clus0),y2,len(clus1)))
    #print("Yes = %d" %y)
    #print("No = %d" %n)

    

BREAKFASTSANDWICHQUAN
Yscore is 2448
y1 13/52 y2 2/2 
EGGSFREQ
Yscore is 1933
y1 13/39 y2 2/15 
EGGSQUAN
Yscore is 1810
y1 11/41 y2 4/13 
YOGURTFREQ
Yscore is 1468
y1 8/30 y2 7/24 
YOGURTQUAN
Yscore is 1450
y1 7/29 y2 8/25 
COTTAGECHEESEFREQ
Yscore is 1861
y1 11/43 y2 4/11 
COTTAGECHEESEQUAN
Yscore is 2052
y1 2/12 y2 13/42 
CREAMCHEESEFREQ
Yscore is 1609
y1 11/33 y2 4/21 
CREAMCHEESEQUAN
Yscore is 1846
y1 12/39 y2 3/15 
SLICEDCHEESEFREQ
Yscore is 1566
y1 9/37 y2 6/17 
SLICEDCHEESEQUAN
Yscore is 2041
y1 12/45 y2 3/9 
COLDCEREALFREQ
Yscore is 1684
y1 11/36 y2 4/18 
COLDCEREALQUAN
Yscore is 1728
y1 10/42 y2 5/12 
WHOLEGRAINCEREALFREQ
Yscore is 1674
y1 10/39 y2 5/15 
WHOLEGRAINCEREALQUAN
Yscore is 1512
y1 10/30 y2 5/24 
GRITSFREQ
Yscore is 2250
y1 13/47 y2 2/7 
GRITSQUAN
Yscore is 2300
y1 14/45 y2 1/9 
MILKONCEREALFREQ
Yscore is 1566
y1 9/37 y2 6/17 
MILKONCEREALQUAN
Yscore is 3283
y1 15/54 y2 0/0 
BROWNRICEFREQ
Yscore is 1461
y1 8/28 y2 7/26 
BROWNRICEQUAN
Yscore is 1620
y1 12/32 y2 3/22 

COOKIESFREQ
Yscore is 1461
y1 8/28 y2 7/26 
COOKIESQUAN
Yscore is 1638
y1 5/17 y2 10/37 
PUMPKINPIEFREQ
Yscore is 1468
y1 6/26 y2 9/28 
PUMPKINPIEQUAN
Yscore is 2440
y1 1/6 y2 14/48 
OTHERPIESFREQ
Yscore is 1634
y1 4/20 y2 11/34 
OTHERPIESQUAN
Yscore is 2408
y1 13/51 y2 2/3 
ICECREAMFREQ
Yscore is 1566
y1 5/21 y2 10/33 
ICECREAMQUAN
Yscore is 1584
y1 5/20 y2 10/34 
PUDDINGFREQ
Yscore is 1483
y1 7/20 y2 8/34 
PUDDINGQUAN
Yscore is 2170
y1 13/45 y2 2/9 
SAUCEICECREAMFREQ
Yscore is 2368
y1 13/50 y2 2/4 
SAUCEICECREAMQUAN
Yscore is 2289
y1 13/48 y2 2/6 
POPSICLESFREQ
Yscore is 1530
y1 5/23 y2 10/31 
POPSICLESQUAN
Yscore is 1735
y1 4/16 y2 11/38 
CHOCOLATECANDYFREQ
Yscore is 1620
y1 10/36 y2 5/18 
CHOCOLATECANDYQUAN
Yscore is 1764
y1 5/10 y2 10/44 
OTHERCANDIESFREQ
Yscore is 1490
y1 7/18 y2 8/36 
OTHERCANDIESQUAN
Yscore is 1634
y1 4/20 y2 11/34 
MARGARINEFREQ
Yscore is 2210
y1 13/46 y2 2/8 
MARGARINEQUAN
Yscore is 2131
y1 13/44 y2 2/10 
BUTTERFREQ
Yscore is 1461
y1 7/26 y2 8/28 
BUTTERQUAN


BCOMPLEXTYPEVITSAMOUNT
Yscore is 2754
y1 15/51 y2 0/3 
BCOMPLEXTYPEVITSYEARS
Yscore is 2808
y1 15/52 y2 0/2 
ANTIOXIDANTCOMBOAMOUNT
Yscore is 2862
y1 15/53 y2 0/1 
ANTIOXIDANTCOMBOYEARS
Yscore is 2862
y1 15/53 y2 0/1 
VITAMINAAMOUNT
Yscore is 2862
y1 15/53 y2 0/1 
VITAMINAYEARS
Yscore is 2862
y1 15/53 y2 0/1 
VITAMINB6AMOUNT
Yscore is 2674
y1 14/53 y2 1/1 
VITAMINB6YEARS
Yscore is 2627
y1 14/52 y2 1/2 
VITAMINB12AMOUNT
Yscore is 2534
y1 14/50 y2 1/4 
VITAMINB12YEARS
Yscore is 2627
y1 14/52 y2 1/2 
VITAMINCAMOUNT
Yscore is 2289
y1 13/48 y2 2/6 
VITAMINCYEARS
Yscore is 2250
y1 13/47 y2 2/7 
VITAMINDAMOUNT
Yscore is 1846
y1 12/39 y2 3/15 
VITAMINDYEARS
Yscore is 1879
y1 3/14 y2 12/40 
VITAMINEAMOUNT
Yscore is 2534
y1 14/50 y2 1/4 
VITAMINEYEARS
Yscore is 2581
y1 14/51 y2 1/3 
FOLICACIDAMOUNT
Yscore is 2448
y1 13/52 y2 2/2 
FOLICACIDYEARS
Yscore is 2448
y1 13/52 y2 2/2 
CALCIUMAMOUNT
Yscore is 1911
y1 12/41 y2 3/13 
CALCIUMYEARS
Yscore is 1976
y1 12/43 y2 3/11 
IRONAMOUNT
Yscore is 2581
y1

F_TOTAL
Yscore is 1810
y1 11/41 y2 4/13 
F_CITMLB
Yscore is 2091
y1 13/43 y2 2/11 
F_OTHER
Yscore is 2368
y1 13/50 y2 2/4 
F_JUICE
Yscore is 2329
y1 13/49 y2 2/5 
F_WHOLE
Yscore is 2250
y1 13/47 y2 2/7 
V_TOTAL
Yscore is 1861
y1 11/43 y2 4/11 
V_DRKGR
Yscore is 2674
y1 14/53 y2 1/1 
V_REDOR_TOTAL
Yscore is 1911
y1 11/45 y2 4/9 
V_REDOR_TOMATO
Yscore is 2448
y1 13/52 y2 2/2 
V_REDOR_OTHER
Yscore is 2203
y1 12/50 y2 3/4 
V_STARCHY_TOTAL
Yscore is 2674
y1 14/53 y2 1/1 
V_STARCHY_POTATO
Yscore is 2674
y1 14/53 y2 1/1 
V_STARCHY_OTHER
Yscore is 1944
y1 12/42 y2 3/12 
V_OTHER
Yscore is 2131
y1 2/10 y2 13/44 
V_LEGUMES
Yscore is 2674
y1 14/53 y2 1/1 
G_TOTAL
Yscore is 1659
y1 11/35 y2 4/19 
G_WHOLE
Yscore is 1584
y1 10/34 y2 5/20 
G_REFINED
Yscore is 1785
y1 4/14 y2 11/40 
PF_TOTAL
Yscore is 1836
y1 4/12 y2 11/42 
PF_MPS_TOTAL
Yscore is 1656
y1 5/16 y2 10/38 
PF_MEAT
Yscore is 1501
y1 9/31 y2 6/23 
PF_CUREDMEAT
Yscore is 2073
y1 12/46 y2 3/8 
PF_ORGAN
Yscore is 2674
y1 14/53 y2 1/1 
PF_POULT


GROUP_CORN_BREAD_HUSH_PUPPIES_TOTAL_GRAMS
Yscore is 2808
y1 15/52 y2 0/2 
GROUP_SANDWICH_BUNS_TOTAL_GRAMS
Yscore is 2916
y1 15/54 y2 0/0 
GROUP_SANDWICH_BUNS_WHITE_TOTAL_GRAMS
Yscore is 2627
y1 14/52 y2 1/2 
GROUP_SANDWICH_BUNS_MULTIGRAIN_TOTAL_GRAMS
Yscore is 2808
y1 15/52 y2 0/2 
GROUP_SANDWICH_BUNS_WHOLEGRAIN_TOTAL_GRAMS
Yscore is 2862
y1 15/53 y2 0/1 
GROUP_SANDWICH_BUNS_MIX_TOTAL_GRAMS
Yscore is 2627
y1 1/2 y2 14/52 
GROUP_BAGELS_ENG_MUFFIN_PITA_TOTAL_GRAMS
Yscore is 5536
y1 15/54 y2 0/0 
GROUP_BAGELS_WHITE_TOTAL_GRAMS
Yscore is 2646
y1 15/49 y2 0/5 
GROUP_BAGELS_MULTIGRAIN_TOTAL_GRAMS
Yscore is 2862
y1 15/53 y2 0/1 
GROUP_BAGELS_WHOLEGRAIN_TOTAL_GRAMS
Yscore is 2862
y1 15/53 y2 0/1 
GROUP_BAGELS_MIX_TOTAL_GRAMS
Yscore is 2329
y1 2/5 y2 13/49 
GROUP_TORTILLAS_TOTAL_GRAMS
Yscore is 5209
y1 15/54 y2 0/0 
GROUP_TORTILLAS_CORN_TOTAL_GRAMS
Yscore is 2646
y1 15/49 y2 0/5 
GROUP_TORTILLAS_FLOUR_WHEAT_TOTAL_GRAMS
Yscore is 2289
y1 13/48 y2 2/6 
GROUP_TORTILLA_MIX_TOTAL_GRAMS
Yscore is 258

y1 15/52 y2 0/2 
GROUP_POPCORN_CARAMEL_TOTAL_GRAMS
Yscore is 2916
y1 15/54 y2 0/0 
GROUP_WHOLE_GRAIN_CRACKERS_TOTAL_GRAMS
Yscore is 2916
y1 15/54 y2 0/0 
GROUP_WHOLE_GRAIN_CRACKERS_LOW_FAT_TOTAL_GRAMS
Yscore is 2534
y1 14/50 y2 1/4 
GROUP_WHOLE_GRAIN_CRACKERS_REGULAR_TOTAL_GRAMS
Yscore is 2627
y1 14/52 y2 1/2 
GROUP_OTHER_CRACKERS_PRETZELS_NOT_WHOLEGRAIN_TOTAL_GRAMS
Yscore is 2923
y1 15/54 y2 0/0 
GROUP_OTHER_CRACKERS_PRETZELS_LOW_FAT_TOTAL_GRAMS
Yscore is 2862
y1 15/53 y2 0/1 
GROUP_OTHER_CRACKERS_FILLED_PRETZELS_REGULAR_TOTAL_GRAMS
Yscore is 2368
y1 13/50 y2 2/4 
GROUP_TORTILLA_OR_CORN_CHIPS_CORN_NUTS_TOTAL_GRAMS
Yscore is 2329
y1 13/49 y2 2/5 
GROUP_CORN_PUFFS_TWISTS_SOY_POTATO_CHIPS_TOTAL_GRAMS
Yscore is 2408
y1 13/51 y2 2/3 
GROUP_DONUTS_TOTAL_GRAMS
Yscore is 2203
y1 3/4 y2 12/50 
GROUP_CAKE_CUPCAKES_TOTAL_GRAMS
Yscore is 5076
y1 15/54 y2 0/0 
GROUP_CAKE_LOW_SUGAR_TOTAL_GRAMS
Yscore is 2862
y1 15/53 y2 0/1 
GROUP_CAKE_LOW_FAT_TOTAL_GRAMS
Yscore is 2674
y1 14/53 y2 1/1 
GROUP_CAKE_

GROUP_COFFEE_DONT_DRINK_TOTAL_GRAMS
Yscore is 2926
y1 15/54 y2 0/0 
GROUP_HOT_TEA_DECAF_TOTAL_GRAMS
Yscore is 2862
y1 15/53 y2 0/1 
GROUP_HOT_TEA_CAFFEINE_TOTAL_GRAMS
Yscore is 2300
y1 14/45 y2 1/9 
GROUP_HOT_TEA_BOTH_KINDS_TOTAL_GRAMS
Yscore is 2674
y1 14/53 y2 1/1 
GROUP_HOT_TEA_DONT_DRINK_TOTAL_GRAMS
Yscore is 2919
y1 15/54 y2 0/0 
GROUP_CREAM_OR_HALF_N_HALF_TOTAL_GRAMS
Yscore is 2919
y1 15/54 y2 0/0 
GROUP_NON_DAIRY_CREAMER_LIQUID_TOTAL_GRAMS
Yscore is 2919
y1 15/54 y2 0/0 
GROUP_CONDENSED_MILK_TOTAL_GRAMS
Yscore is 2919
y1 15/54 y2 0/0 
GROUP_SUGAR_OR_HONEY_TOTAL_GRAMS
Yscore is 2919
y1 15/54 y2 0/0 
GROUP_COOKING_FAT_POP_MIX_TOTAL_GRAMS
Yscore is 2919
y1 15/54 y2 0/0 
GROUP_NON_STICK_SPRAY_SR27_TOTAL_GRAMS
Yscore is 2919
y1 15/54 y2 0/0 
GROUP_COOK_FAT_BUTTER_OR_GHEE_TOTAL_GRAMS
Yscore is 2170
y1 13/45 y2 2/9 
GROUP_COOK_FAT_BUTTER_MARGARINE_BLEND_TOTAL_GRAMS
Yscore is 2627
y1 1/2 y2 14/52 
GROUP_COOK_FAT_MARGARINE_STICK_TOTAL_GRAMS
Yscore is 2674
y1 14/53 y2 1/1 
GROUP_COOK_FAT_

The Cluster analysis above doesn't provide the most accurate results in terms of which feature directly leads to diabetes. However the feature MAYO shows the highest relation to diabetes, perhaps meaning that people consuming more mayonaisse are likelier to be diabetic.

In [80]:
columnsdata = ["diabetes", "MAYOTYPE"]
newdf = dataset[['diabetes', 'MAYOTYPE']].copy()
newdf.loc[: , "diabetes"] = newdf.loc[: , "diabetes"].replace(to_replace=['Yes','No'], value=[2,1])
finaldata = newdf[list(columnsdata)].values

In [81]:
kmeans = KMeans(n_clusters=2)
kmeans.fit(finaldata)
kmeans.labels_

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1])

In [82]:
#hover = HoverTool(tooltips=[("$index", "$index"),("(Milk, Groceries)", "(@x, @y)")])
hover = HoverTool(tooltips=[("(diabetes, MAYOTYPE)", "($x, $y)")])
TOOLS = "box_select,lasso_select,box_zoom,wheel_zoom,pan,help,hover"
p1 = figure(width=500, height=500, title='K-Means Clustering', x_axis_label = "diabetes", y_axis_label = "MAYOTYPE", tools=TOOLS)

In [83]:
centroid_x = []
centroid_y = []

for entry in kmeans.cluster_centers_:
    centroid_x.append(entry[0])
    centroid_y.append(entry[1])

p1.circle_cross(x=centroid_x, y=centroid_y, size=25, fill_alpha=0, line_width=2, color=['red', 'blue'])

In [84]:
i=0
for sample in finaldata:
    if kmeans.labels_[i] == 0:
        p1.circle(x=sample[0], y=sample[1], size=5, color="red")
    if kmeans.labels_[i] == 1:
        p1.circle(x=sample[0], y=sample[1], size=5, color="blue")
    i += 1

In [85]:
show(p1)

However, what looked to be a more promising relation after clustering, does not turn out to be as useful after visualization