In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import colorlover as cl

from tabula import read_pdf

# py.init_notebook_mode(connected=True)
tls.set_config_file(world_readable=False,
                             sharing='private')

pdf_url = 'https://globalassets.starbucks.com/assets/94fbcc2ab1e24359850fa1870fc988bc.pdf'

data = read_pdf(pdf_url, pages='all',output_format='dataframe',
                area=[76.87,72.73,531.76,657.89], guess=False, pandas_options={'header':None})

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,,,,,,,,,,,,,,,,,,,,
1,Coffee,,,,,,,,,,,,,,,,,,,
2,Brewed Coffee - Dark Roast,Short,,,236 mL,3.0,0.1,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.3,0%,0%,0%,0%,130.0
3,Brewed Coffee - Dark Roast,Tall,,,354 mL,4.0,0.1,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.5,0%,0%,0%,0%,193.0
4,Brewed Coffee - Dark Roast,Grande,,,473 mL,5.0,0.1,0.0,0.0,0.0,10.0,0.0,0.0,0.0,1.0,0%,0%,0%,0%,260.0


In [2]:
columns={
    0: 'Product Name',
    1: 'Size',
    2: 'Milk',
    3: 'Whip',
    4: 'Serving Size',
    5: 'Calories',
    6: 'Total Fat',
    7: 'Saturated Fat',
    8: 'Trans Fat',
    9: 'Cholesterol',
    10: 'Sodium',
    11: 'Total Carbs',
    12: 'Dietary Fiber',
    13: 'Sugar',
    14: 'Protein',
    15: 'Vitamin A',
    16: 'Vitamin C',
    17: 'Calcium',
    18: 'Iron',
    19: 'Caffeine'
}
starbucks = data.copy()
starbucks = starbucks.rename(columns=columns)

In [3]:
starbucks.drop(columns=[
    'Serving Size',
    'Vitamin A',
    'Vitamin C',
    'Calcium',
    'Iron'
], inplace=True)

In [4]:
starbucks.dropna(thresh=13, inplace=True)

In [5]:
starbucks['Product Name'].replace(np.NaN, '?', inplace=True)
starbucks['Caffeine'].replace(np.NaN, '?', inplace=True)

In [6]:
starbucks = starbucks[starbucks['Product Name'] != '?']
starbucks = starbucks[starbucks['Caffeine'] != '?']

In [7]:
starbucks['Milk'].replace(np.NaN, 'N/A', inplace=True)
starbucks['Whip'].replace(np.NaN, 'N/A', inplace=True)

starbucks.isna().sum()

Product Name     0
Size             0
Milk             0
Whip             0
Calories         0
Total Fat        0
Saturated Fat    0
Trans Fat        0
Cholesterol      0
Sodium           0
Total Carbs      0
Dietary Fiber    0
Sugar            0
Protein          0
Caffeine         0
dtype: int64

In [8]:
starbucks['Caffeine'].replace('40+', 40, inplace=True)
starbucks['Caffeine'].replace('45-55', 50, inplace=True)
starbucks['Caffeine'].replace('90-110', 100, inplace=True)
starbucks['Caffeine'].replace('35-45', 40, inplace=True)
starbucks['Caffeine'].replace('70-85', 77.5, inplace=True)
starbucks['Caffeine'].replace('20-25', 22.5, inplace=True)
starbucks['Caffeine'].replace('25-30', 27.5, inplace=True)
starbucks['Caffeine'].replace('40-45', 42.5, inplace=True)
starbucks['Caffeine'].replace('45-50', 47.5, inplace=True)
starbucks['Caffeine'].replace('50-55', 52.5, inplace=True)
starbucks['Caffeine'].replace('1--15', 7.5, inplace=True)
starbucks['Caffeine'].replace('16-25', 20.5, inplace=True)
### For sodium, the strings were parsed incorrectly by tabula, 
### so what I infer is just the numbers inbetween the characters of the string
### Seems reasonable
starbucks['Sodium'].replace('M4il5k', 45, inplace=True)
starbucks['Sodium'].replace('Se1r0v0in', 100, inplace=True)
starbucks['Sodium'].replace('Ca9lo0ri', 90, inplace=True)
starbucks['Sodium'].replace('To1t3a0l', 130, inplace=True)
starbucks['Sodium'].replace('Siz5e', 5, inplace=True)
starbucks['Sodium'].replace('Ch1o0les', 10, inplace=True)
starbucks['Sodium'].replace('So1d0iu', 10, inplace=True)
starbucks['Dietary Fiber'].replace('g)\r0', 10, inplace=True)

starbucks['Caffeine'] = starbucks['Caffeine'].astype(float,copy=False)
starbucks['Sodium'] = starbucks['Sodium'].astype(float,copy=False)
starbucks['Dietary Fiber'] = starbucks['Dietary Fiber'].astype(float, copy=False)

In [9]:
barely_any_caffeine = starbucks.loc[(starbucks['Caffeine'] >= 0) & (starbucks['Caffeine'] <= 49.999)]
low_caffeine = starbucks.loc[(starbucks['Caffeine'] <= 100.0) & (starbucks['Caffeine'] >= 50.0)]
medium_caffeine = starbucks.loc[(starbucks['Caffeine'] <= 199.999) & (starbucks['Caffeine'] >= 100.001)]
high_caffeine = starbucks.loc[starbucks['Caffeine'] >= 200.0]


print('Shape of barely caffeinated drinks {}'.format(barely_any_caffeine.shape))
print('Shape of low caffeinated drinks {}'.format(low_caffeine.shape))
print('Shape of medium caffeinated drinks {}'.format(medium_caffeine.shape))
print('Shape of highly caffeinated drinks {}'.format(high_caffeine.shape))

Shape of barely caffeinated drinks (491, 15)
Shape of low caffeinated drinks (319, 15)
Shape of medium caffeinated drinks (317, 15)
Shape of highly caffeinated drinks (94, 15)


In [10]:
shapes = [barely_any_caffeine.shape[0], low_caffeine.shape[0], medium_caffeine.shape[0], high_caffeine.shape[0]] 

sum_shapes = sum(shapes)
print('Expected shape for Starbucks data {}'.format(starbucks.shape))
print('Are the shapes of the split dataframes and the original frame the same? \n{}'.format(sum_shapes==starbucks.shape[0]))


Expected shape for Starbucks data (1221, 15)
Are the shapes of the split dataframes and the original frame the same? 
True


In [11]:
ylgn = [[0,'rgb(255,255,229)'], [0.125,'rgb(247,252,185)'], [0.25,'rgb(217,240,163)'], [0.375,'rgb(173,221,142)'], [0.5,'rgb(120,198,121)'], [0.625,'rgb(65,171,93)'], [0.75,'rgb(35,132,67)'], [0.875,'rgb(0,104,55)'], [1,'rgb(0,69,41)']]

print(ylgn)

gnbr = [[0,'rgb(0,90,50)'],[0.125,'rgb(35,132,67)'],[0.25, 'rgb(65,171,93)']
       ,[0.375,'rgb(120,198,121)'],[0.5,'rgb(246,232,195)'],[0.625,'rgb(223,194,125)'],
        [0.75,'rgb(191,129,45)'], [0.875,'rgb(140,81,10)'],[1,'rgb(84,48,5)']]

print(gnbr)

[[0, 'rgb(255,255,229)'], [0.125, 'rgb(247,252,185)'], [0.25, 'rgb(217,240,163)'], [0.375, 'rgb(173,221,142)'], [0.5, 'rgb(120,198,121)'], [0.625, 'rgb(65,171,93)'], [0.75, 'rgb(35,132,67)'], [0.875, 'rgb(0,104,55)'], [1, 'rgb(0,69,41)']]
[[0, 'rgb(0,90,50)'], [0.125, 'rgb(35,132,67)'], [0.25, 'rgb(65,171,93)'], [0.375, 'rgb(120,198,121)'], [0.5, 'rgb(246,232,195)'], [0.625, 'rgb(223,194,125)'], [0.75, 'rgb(191,129,45)'], [0.875, 'rgb(140,81,10)'], [1, 'rgb(84,48,5)']]


In [12]:
starbucks_copy = starbucks.drop(columns='Trans Fat')
starbucks_copy.columns

Index(['Product Name', 'Size', 'Milk', 'Whip', 'Calories', 'Total Fat',
       'Saturated Fat', 'Cholesterol', 'Sodium', 'Total Carbs',
       'Dietary Fiber', 'Sugar', 'Protein', 'Caffeine'],
      dtype='object')

In [13]:
## starbucks drinks pearson correlation
## for some reason not all the columns are being represented in 
## the heatmap

## i don't think i will include a heatmap at this time. 


# import plotly.figure_factory as ff

# sbux_corr = tea_latte.corr().values
# z_text = np.around(sbux_corr, decimals = 2)

# data = [
    
#     go.Heatmap(
#         z=sbux_corr,
#         x=tea_latte.columns.values,
#         y=tea_latte.columns[::-1].values,
#         colorscale=gnbr,
#         opacity=0.9,
#         zmax=1,
#         zmin=-1, 
#         showscale=True
        
#     )
# ]

# layout = go.Layout(
#     title='Pearson Correlation of Starbucks Beverage Nutrition Facts',
#     xaxis=dict(ticks='', nticks=225),
#     yaxis=dict(ticks=''),
#     width = 600, height = 600,
# )

# f = go.Figure(data=data, layout=layout)

# # f = ff.create_annotated_heatmap(
# #     z = sbux_corr,
# #     x = starbucks_copy.columns.values,
# #     y = starbucks_copy.columns.values,
# #     annotation_text = z_text,
# #     colorscale = 'Viridis'
# # )

# plot_url = py.plot(f, filename='sbux_heatmap')
# plot_url

In [15]:
tea_latte = starbucks.copy()
tea_latte = tea_latte[tea_latte['Product Name'].isin([
    
    'Chai Tea Latte',
    'Iced Chai Tea Latte',
    'Green Tea Latte',
    'Iced Green Tea Latte',
    'London Fog Tea Latte',
    'English Breakfast Tea Latte'
    
])]

espresso = starbucks.copy()
espresso = espresso[espresso['Product Name'].isin([
    
    'Caffè Americano',
    'Iced Caffè Americano',
    'Caffè Latte',
    'Iced Caffè Latte',
    'Caffè Mocha',
    'Iced Caffè Mocha',
    'Cappuccino',
    'Caramel Macchiato',
    'Iced Caramel Macchiato',
    'Cinnamon Dolce Latte',
    'Espresso',
    'Espresso Macchiato',
    'Flat White',
    'Latte Macchiato',
    'Starbucks Doubleshot on Ice',
    'White Chocolate Mocha',
    'Iced White Chocolate Mocha',
    
])]

fraps = starbucks.copy()
fraps = fraps[fraps['Product Name'].str.contains('Frappuccino')]

refreshers = starbucks.copy()
refreshers = refreshers[refreshers['Product Name'].str.contains('Refreshers')]

print('Tea Latte Beverages Dataframe\'s values : {}'.format(tea_latte.shape[0]))
print('---------')
print('Espresso Beverages Dataframe\'s values : {}'.format(espresso.shape[0]))
print('---------')
print('Frappuccino Beverages Dataframe\'s values: {}'.format(fraps.shape[0]))
print('---------')
print('Refresher Beverages Dataframe\'s values: {}'.format(refreshers.shape[0]))

Tea Latte Beverages Dataframe's values : 90
---------
Espresso Beverages Dataframe's values : 235
---------
Frappuccino Beverages Dataframe's values: 336
---------
Refresher Beverages Dataframe's values: 6


In [26]:
favorites = starbucks.copy()
favorites = favorites.sort_values(by='Sugar',axis=0,ascending=False)
favorites.head()

Unnamed: 0,Product Name,Size,Milk,Whip,Calories,Total Fat,Saturated Fat,Trans Fat,Cholesterol,Sodium,Total Carbs,Dietary Fiber,Sugar,Protein,Caffeine
833,Java Chip Frappuccino® Blended Beverage,Venti®,Nonfat,Whip,560.0,17.0,11.0,0.4,45.0,350.0,96,3.0,89.0,8,145.0
978,Green Tea Crème Frappuccino® Blended Beverage,Venti®,Nonfat,Whip,500.0,12.0,7.0,0.3,45.0,320.0,91,2.0,88.0,9,95.0
980,Green Tea Crème Frappuccino® Blended Beverage,Venti®,2%,Whip,530.0,15.0,10.0,0.5,55.0,330.0,91,2.0,88.0,8,95.0
747,Caffè Vanilla Frappuccino® Blended Beverage,Venti®,Whole,Whip,530.0,16.0,10.0,0.5,55.0,300.0,91,0.0,88.0,6,125.0
745,Caffè Vanilla Frappuccino® Blended Beverage,Venti®,2%,Whip,510.0,14.0,8.0,0.4,55.0,300.0,91,0.0,88.0,6,125.0


In [53]:
#Fraps Bar Plots

sugar = go.Bar(

    y = fraps['Sugar'].values,
    x = fraps['Product Name'].values,
    marker=dict(
        color = fraps['Sugar'].values,
        colorscale = ylgn,
        reversescale = False
    ),
    orientation = 'v',
    name='Sugar'
)

calories = go.Bar(
    
    y = fraps['Calories'].values,
    x = fraps['Product Name'].values,
    marker = dict(
        color = fraps['Calories'].values,
        colorscale = gnbr,
        reversescale = False,
    ),
        orientation = 'v',
    name='Calories'
)

caffeine = go.Bar(

    y = fraps['Caffeine'],
    x = fraps['Product Name'].values,
    marker = dict(
        color = fraps['Caffeine'],
        colorscale = 'Greens',
        reversescale = False
    ),
    orientation = 'v',
    name='Caffeine'
)


layout = go.Layout(
    title = 'What\'s in my favorite Frappuccinos?',
    width = 1600,
    height = 600
)

f = go.Figure(data=[sugar,calories,caffeine])
f['layout'].update(layout)
plot_url = py.plot(f, filename='barplot', auto_show=False)
plot_url

'https://plot.ly/~Ruwai/7'

In [54]:
sugar = go.Bar(

    y = tea_latte['Sugar'].values,
    x = tea_latte['Product Name'].values,
    marker=dict(
        color = tea_latte['Sugar'].values,
        colorscale = ylgn,
        reversescale = False
    ),
    orientation = 'v',
    name='Sugar'
)

calories = go.Bar(
    
    y = tea_latte['Calories'].values,
    x = tea_latte['Product Name'].values,
    marker = dict(
        color = tea_latte['Calories'].values,
        colorscale = gnbr,
        reversescale = False,
    ),
        orientation = 'v',
    name='Calories'
)

caffeine = go.Bar(

    y = tea_latte['Caffeine'],
    x = tea_latte['Product Name'].values,
    marker = dict(
        color = tea_latte['Caffeine'],
        colorscale = 'Greens',
        reversescale = False
    ),
    orientation = 'v',
    name='Caffeine'
)


layout = go.Layout(
    title = 'What\'s in my favorite Teas and Lattes?',
    width = 1600,
    height = 600
)

f = go.Figure(data=[sugar,calories,caffeine])
f['layout'].update(layout)
plot_url = py.plot(f, filename='teabarplot', auto_show=False)
plot_url

'https://plot.ly/~Ruwai/17'

In [57]:
sugar = go.Bar(

    y = espresso['Sugar'].values,
    x = espresso['Product Name'].values,
    marker=dict(
        color = espresso['Sugar'].values,
        colorscale = ylgn,
        reversescale = False
    ),
    orientation = 'v',
    name='Sugar'
)

calories = go.Bar(
    
    y = espresso['Calories'].values,
    x = espresso['Product Name'].values,
    marker = dict(
        color = espresso['Calories'].values,
        colorscale = gnbr,
        reversescale = False,
    ),
        orientation = 'v',
    name='Calories'
)

caffeine = go.Bar(

    y = espresso['Caffeine'],
    x = espresso['Product Name'].values,
    marker = dict(
        color = espresso['Caffeine'],
        colorscale = 'Greens',
        reversescale = False
    ),
    orientation = 'v',
    name='Caffeine'
)


layout = go.Layout(
    title = 'What\'s in my favorite Espresso beverages?',
    width = 1600,
    height = 600
)

f = go.Figure(data=[sugar,calories,caffeine])
f['layout'].update(layout)
plot_url = py.plot(f, filename='espressobarplot', auto_show=False)
plot_url

'https://plot.ly/~Ruwai/19'

In [58]:
sugar = go.Bar(

    y = refreshers['Sugar'].values,
    x = refreshers['Product Name'].values,
    marker=dict(
        color = refreshers['Sugar'].values,
        colorscale = ylgn,
        reversescale = False
    ),
    orientation = 'v',
    name='Sugar'
)

calories = go.Bar(
    
    y = refreshers['Calories'].values,
    x = refreshers['Product Name'].values,
    marker = dict(
        color = refreshers['Calories'].values,
        colorscale = gnbr,
        reversescale = False,
    ),
        orientation = 'v',
    name='Calories'
)

caffeine = go.Bar(

    y = refreshers['Caffeine'],
    x = refreshers['Product Name'].values,
    marker = dict(
        color = tea_latte['Caffeine'],
        colorscale = 'Greens',
        reversescale = False
    ),
    orientation = 'v',
    name='Caffeine'
)


layout = go.Layout(
    title = 'What\'s in my favorite Teas and Lattes?',
    width = 1600,
    height = 600
)

f = go.Figure(data=[sugar,calories,caffeine])
f['layout'].update(layout)
plot_url = py.plot(f, filename='refresherbarplot', auto_show=False)
plot_url

'https://plot.ly/~Ruwai/21'

In [64]:
#Tea Latte 3D Scatter Plot
#Calories Total Fat
trace1 = go.Scatter3d(
    x = tea_latte['Calories'].values,
    y = tea_latte['Product Name'].values,
    z = tea_latte['Milk'].values,
    text = tea_latte['Size'].values,
    mode = 'markers',
    marker = dict(
        sizemode = 'diameter',
        color = tea_latte['Total Fat'].values,
        colorscale = ylgn,
        colorbar = dict(
            title = 'Total Fat',
        ),
        line = dict(color='rgb(255,255,255)')
    ),
    name='Tea Latte'
)


data = [trace1]
layout = go.Layout(
    scene = dict(
    xaxis = dict(
        title='Calories'
    )),
    title='Tea Lattes, measuring Calories and Total Fat given their size and milk',
    height = 900,
    width = 900,
)
f = dict(data=data, layout=layout)
plot_url = py.plot(f, filename='3DCaloriesTeas', auto_show = False)
plot_url


'https://plot.ly/~Ruwai/5'

In [65]:
#Espresso Scatter Plot
#Caffeine Calories
trace1 = go.Scatter3d(
    x = espresso['Caffeine'].values,
    y = espresso['Product Name'].values,
    z = espresso['Milk'].values,
    text = espresso['Size'].values,
    mode = 'markers',
    marker = dict(
        sizemode = 'diameter',
        color = espresso['Dietary Fiber'].values,
        colorscale = gnbr,
        colorbar = dict(
            title = 'Dietary Fiber',
            tickmode='array',
            ticks = 'inside',
            tickvals= espresso['Dietary Fiber'].values,
            ticktext= espresso['Dietary Fiber'].values,
        ),
        line = dict(color='rgb(255,255,255)'),
        showscale=True
    )
)

data = [trace1]
layout = go.Layout(
    scene = dict(
    xaxis = dict(
        title = 'Caffeine'
    )),
    height = 700,
    width = 1000,
    title='Espresso beverages, measuring Caffeine and Dietary Fiber given their size and milk'
)
f = dict(data=data, layout=layout)
py.plot(f, filename='3DEspressosCaff')

'https://plot.ly/~Ruwai/13'

In [63]:
#Espresso Scatter Plot

trace1 = go.Scatter3d(
    x = fraps['Sugar'].values,
    y = fraps['Product Name'].values,
    z = fraps['Milk'].values,
    ids = fraps['Milk'].values,
    text = fraps['Size'].values,
    mode = 'markers',
    marker = dict(
        sizemode = 'diameter',
        color = fraps['Calories'].values,
        colorscale = ylgn,
        colorbar = dict(
            title = 'Calories',
            tickmode='array',
            ticks = 'inside',
            tickvals= fraps['Calories'].values,
            ticktext= fraps['Calories'].values,
        ),
        line = dict(color='rgb(255,255,255)'),
        showscale=True
    )
)

data = [trace1]
layout = go.Layout(
    scene = dict(
    xaxis = dict(
    title = 'Sugar'
    )),
    height = 700,
    width = 1000,
    title='Frappuccino beverages, measuring Sugar and Calories given their size and milk'
)
f = dict(data=data, layout=layout)
py.plot(f, filename='3DFrapsSugar')

'https://plot.ly/~Ruwai/15'

In [62]:
# trace1 = go.Scatter3d(
#     x = refreshers['Sugar'].values,
#     y = refreshers['Product Name'].values,
#     z = refreshers['Milk'].values,
#     text = refreshers['Size'].values,
#     mode = 'markers',
#     marker = dict(
#         sizemode = 'diameter',
#         color = refreshers['Calories'].values,
#         colorscale = ylgn,
#         colorbar = dict(
#             title = 'Calories',
#             tickmode='array',
#             ticks = 'inside',
#             tickvals= refreshers['Calories'].values,
#             ticktext= refreshers['Calories'].values,
#         ),
#         line = dict(color='rgb(255,255,255)'),
#         showscale=True
#     )
# )

# data = [trace1]
# layout = go.Layout(
#     scene = dict(
#     xaxis = dict(
#     title = 'Sugar'
#     ),),
#     height = 700,
#     width = 1000,
#     title='Refresher beverages, measuring Sugar and Calories given their size and milk'
# )
# f = dict(data=data, layout=layout)
# py.plot(f, filename='3DFrapsSugar')

'https://plot.ly/~Ruwai/15'

In [80]:
## this one is for the caffeines

trace1 = go.Scatter3d(
    x = low_caffeine['Calories'].values,
    y = low_caffeine['Caffeine'].values,
    z = low_caffeine['Size'].values,
    text = low_caffeine['Product Name'].values,
    mode = 'markers',
    marker = dict(
        sizemode = 'diameter',
        color = low_caffeine['Sugar'].values,
        colorscale = gnbr,
        colorbar = dict(
            title = 'Sugar',
            tickmode='array',
            ticks = 'inside',
        ),
        line = dict(color='rgb(255,255,255)'),
        showscale=True
    ),
    name='Low Caffeine'
)

trace2 = go.Scatter3d(
    x = medium_caffeine['Calories'].values,
    y = medium_caffeine['Caffeine'].values,
    z = medium_caffeine['Size'].values,
    text = medium_caffeine['Product Name'].values,
    mode = 'markers',
    marker = dict(
        sizemode = 'diameter',
        color = medium_caffeine['Sugar'].values,
        colorscale = gnbr,
        colorbar = dict(
            title = 'Sugar',
            tickmode='array',
            ticks = 'inside',
        ),
        line = dict(color='rgb(255,255,255)'),
        showscale=True
    ),
    name ='Medium Caffeine'
)

trace3 = go.Scatter3d(
    x = high_caffeine['Calories'].values,
    y = high_caffeine['Caffeine'].values,
    z = high_caffeine['Size'].values,
    text = high_caffeine['Product Name'].values,
    mode = 'markers',
    marker = dict(
        sizemode = 'diameter',
        color = high_caffeine['Sugar'].values,
        colorscale = gnbr,
        colorbar = dict(
            title = 'Sugar',
            tickmode='array',
            ticks = 'inside',
        ),
        line = dict(color='rgb(255,255,255)'),
        showscale=True
    ),
    name = 'High Caffeine'
)

data = [trace1,trace2,trace3]

# data = [trace1,trace2,trace3,trace4]
layout = go.Layout(
    scene = dict(
    xaxis = dict(
    title = 'Calories'
    ),
    yaxis = dict(
    title = 'Caffeine'
    ),
    zaxis = dict(
    title = 'Size'
    )),
    height = 700,
    width = 1000,
    title='Caffeinated Beverages measuring Caffeine and Calories by their size'
)
f = dict(data=data, layout=layout)
py.plot(f, filename='3DFrapsSugar')

'https://plot.ly/~Ruwai/15'