In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import colorlover as cl

from tabula import read_pdf

# Store plotly's sharing defaults: uploads are private and not world readable.
tls.set_config_file(sharing='private', world_readable=False)

# Source PDF that the nutrition table is extracted from.
pdf_url = 'https://globalassets.starbucks.com/assets/94fbcc2ab1e24359850fa1870fc988bc.pdf'

# Fixed extraction rectangle handed to tabula as `area`; guess=False makes
# tabula use this region instead of auto-detecting the table.
table_area = [76.87, 72.73, 531.76, 657.89]

data = read_pdf(
    pdf_url,
    pages='all',
    output_format='dataframe',
    area=table_area,
    guess=False,
    # header=None: no row is promoted to a header, so the columns are
    # labelled by position (0, 1, 2, ...).
    pandas_options={'header': None},
)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,,,,,,,,,,,,,,,,,,,,
1,Coffee,,,,,,,,,,,,,,,,,,,
2,Brewed Coffee - Dark Roast,Short,,,236 mL,3.0,0.1,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.3,0%,0%,0%,0%,130.0
3,Brewed Coffee - Dark Roast,Tall,,,354 mL,4.0,0.1,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.5,0%,0%,0%,0%,193.0
4,Brewed Coffee - Dark Roast,Grande,,,473 mL,5.0,0.1,0.0,0.0,0.0,10.0,0.0,0.0,0.0,1.0,0%,0%,0%,0%,260.0


In [18]:
# Positional column label produced by the PDF extraction -> nutrition field.
# The list order is the column order: index 0 is 'Product Name', 19 is 'Caffeine'.
columns = dict(enumerate([
    'Product Name',
    'Size',
    'Milk',
    'Whip',
    'Serving Size',
    'Calories',
    'Total Fat',
    'Saturated Fat',
    'Trans Fat',
    'Cholesterol',
    'Sodium',
    'Total Carbs',
    'Dietary Fiber',
    'Sugar',
    'Protein',
    'Vitamin A',
    'Vitamin C',
    'Calcium',
    'Iron',
    'Caffeine',
]))

# rename() hands back a new frame, so the raw `data` extraction stays untouched.
starbucks = data.rename(columns=columns)

In [19]:
# Discard the serving-size and vitamin/mineral columns.
unused_columns = ['Serving Size', 'Vitamin A', 'Vitamin C', 'Calcium', 'Iron']
starbucks = starbucks.drop(columns=unused_columns)

In [20]:
# Keep only the rows that have at least 13 non-missing values.
starbucks = starbucks.dropna(axis='index', thresh=13)

In [21]:
# Tag rows that lack a product name or a caffeine value with a '?' sentinel so
# the next cell can filter them out.
# Built with fillna + assign instead of `starbucks[col].replace(np.NaN, ...,
# inplace=True)`: the `np.NaN` alias no longer exists in NumPy 2.0, and an
# in-place call on a selected column does not write back to the frame under
# pandas Copy-on-Write.
starbucks = starbucks.assign(**{
    'Product Name': starbucks['Product Name'].fillna('?'),
    'Caffeine': starbucks['Caffeine'].fillna('?'),
})

In [22]:
# Remove every row that carries the '?' placeholder in either the product
# name or the caffeine column.
starbucks = starbucks[
    starbucks['Product Name'].ne('?') & starbucks['Caffeine'].ne('?')
]

In [23]:
# Drinks without a milk or whip option get the explicit label 'N/A'.
# fillna + assign replaces the chained `replace(np.NaN, ..., inplace=True)`
# calls: `np.NaN` is gone in NumPy 2.0 and an in-place call on a selected
# column does not reach the frame under pandas Copy-on-Write.
starbucks = starbucks.assign(
    Milk=starbucks['Milk'].fillna('N/A'),
    Whip=starbucks['Whip'].fillna('N/A'),
)

# Per-column count of remaining missing values.
starbucks.isna().sum()

Product Name     0
Size             0
Milk             0
Whip             0
Calories         0
Total Fat        0
Saturated Fat    0
Trans Fat        0
Cholesterol      0
Sodium           0
Total Carbs      0
Dietary Fiber    0
Sugar            0
Protein          0
Caffeine         0
dtype: int64

In [24]:
# Caffeine is given as a range (or "N+") for some drinks; each such string is
# mapped to a single representative number (the midpoint for ranges, the
# lower bound for '40+').
# NOTE(review): '1--15' maps to 7.5, which is not the midpoint of 1 and 15 (8.0);
# the source string itself looks garbled, so the value is kept as the author chose it.
caffeine_fixes = {
    '40+': 40,
    '45-55': 50,
    '90-110': 100,
    '35-45': 40,
    '70-85': 77.5,
    '20-25': 22.5,
    '25-30': 27.5,
    '40-45': 42.5,
    '45-50': 47.5,
    '50-55': 52.5,
    '1--15': 7.5,
    '16-25': 20.5,
}

# For sodium, tabula parsed some cells incorrectly and interleaved digits with
# letters of header text. The value used is the digits read in order from
# between the letters, which seems a reasonable inference.
sodium_fixes = {
    'M4il5k': 45,
    'Se1r0v0in': 100,
    'Ca9lo0ri': 90,
    'To1t3a0l': 130,
    'Siz5e': 5,
    'Ch1o0les': 10,
    'So1d0iu': 10,
}

# One mapping-based replace per column, applied through assign() so the frame
# is really updated. The previous `starbucks[col].replace(..., inplace=True)`
# calls act on a selected column and do not write back under pandas
# Copy-on-Write. astype(float) still fails loudly if an unmapped string remains.
starbucks = starbucks.assign(
    Caffeine=starbucks['Caffeine'].replace(caffeine_fixes).astype(float),
    Sodium=starbucks['Sodium'].replace(sodium_fixes).astype(float),
)

In [25]:
# Split the drinks into four caffeine buckets that tile [0, inf) with no gaps:
#   barely: 0 <= c < 50     low: 50 <= c <= 100
#   medium: 100 < c < 200   high: c >= 200
# Strict/non-strict comparisons replace the earlier 49.999 / 100.001 / 199.999
# bounds, which silently dropped any value falling between two buckets. Every
# value the old bounds placed lands in the same bucket here.
caffeine = starbucks['Caffeine']

barely_any_caffeine = starbucks.loc[(caffeine >= 0) & (caffeine < 50.0)]
low_caffeine = starbucks.loc[(caffeine >= 50.0) & (caffeine <= 100.0)]
medium_caffeine = starbucks.loc[(caffeine > 100.0) & (caffeine < 200.0)]
high_caffeine = starbucks.loc[caffeine >= 200.0]


print('Shape of barely caffeinated drinks {}'.format(barely_any_caffeine.shape))
print('Shape of low caffeinated drinks {}'.format(low_caffeine.shape))
print('Shape of medium caffeinated drinks {}'.format(medium_caffeine.shape))
print('Shape of highly caffeinated drinks {}'.format(high_caffeine.shape))

Shape of barely caffeinated drinks (491, 15)
Shape of low caffeinated drinks (319, 15)
Shape of medium caffeinated drinks (317, 15)
Shape of highly caffeinated drinks (94, 15)


In [26]:
# Row count of each caffeine bucket, lowest bucket first.
shapes = [
    len(bucket)
    for bucket in (barely_any_caffeine, low_caffeine, medium_caffeine, high_caffeine)
]

# The buckets are meant to partition the frame, so their sizes should add up
# to the number of rows in `starbucks`.
sum_shapes = sum(shapes)
print('Expected shape for Starbucks data {}'.format(starbucks.shape))
print('Are the shapes of the split dataframes and the original frame the same? \n{}'.format(
    sum_shapes == len(starbucks)
))


Expected shape for Starbucks data (1221, 15)
Are the shapes of the split dataframes and the original frame the same? 
True


In [59]:
# Both colorscales share the same nine evenly spaced stops between 0 and 1;
# each scale is a list of [stop, 'rgb(...)'] pairs.
_stops = [0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1]

_ylgn_colors = [
    'rgb(255,255,229)',
    'rgb(247,252,185)',
    'rgb(217,240,163)',
    'rgb(173,221,142)',
    'rgb(120,198,121)',
    'rgb(65,171,93)',
    'rgb(35,132,67)',
    'rgb(0,104,55)',
    'rgb(0,69,41)',
]
ylgn = [[stop, color] for stop, color in zip(_stops, _ylgn_colors)]

print(ylgn)

_gnbr_colors = [
    'rgb(0,90,50)',
    'rgb(35,132,67)',
    'rgb(65,171,93)',
    'rgb(120,198,121)',
    'rgb(246,232,195)',
    'rgb(223,194,125)',
    'rgb(191,129,45)',
    'rgb(140,81,10)',
    'rgb(84,48,5)',
]
gnbr = [[stop, color] for stop, color in zip(_stops, _gnbr_colors)]

print(gnbr)

[[0, 'rgb(255,255,229)'], [0.125, 'rgb(247,252,185)'], [0.25, 'rgb(217,240,163)'], [0.375, 'rgb(173,221,142)'], [0.5, 'rgb(120,198,121)'], [0.625, 'rgb(65,171,93)'], [0.75, 'rgb(35,132,67)'], [0.875, 'rgb(0,104,55)'], [1, 'rgb(0,69,41)']]
[[0, 'rgb(0,90,50)'], [0.125, 'rgb(35,132,67)'], [0.25, 'rgb(65,171,93)'], [0.375, 'rgb(120,198,121)'], [0.5, 'rgb(246,232,195)'], [0.625, 'rgb(223,194,125)'], [0.75, 'rgb(191,129,45)'], [0.875, 'rgb(140,81,10)'], [1, 'rgb(84,48,5)']]


In [65]:
# Frame used for the correlation heatmap: everything except 'Trans Fat'.
starbucks_copy = starbucks.drop(labels='Trans Fat', axis='columns')
starbucks_copy.columns

Index(['Product Name', 'Size', 'Milk', 'Whip', 'Calories', 'Total Fat',
       'Saturated Fat', 'Cholesterol', 'Sodium', 'Total Carbs',
       'Dietary Fiber', 'Sugar', 'Protein', 'Caffeine'],
      dtype='object')

In [74]:
## Pearson correlation of the Starbucks drink nutrition columns.
## Only numeric columns take part in a correlation, so the matrix is smaller
## than starbucks_copy. The axis labels therefore come from the correlation
## matrix itself: labelling with every column of starbucks_copy (including
## 'Product Name', 'Size', 'Milk', 'Whip') shifted the names against the cells
## and made some columns look as if they were missing.
corr = starbucks_copy.select_dtypes(include='number').corr()

# Named heatmap_data rather than `data` so the raw extracted frame that
# `starbucks` is built from is not overwritten by a list of traces.
heatmap_data = [
    go.Heatmap(
        z=corr.values,
        x=corr.columns.values,
        y=corr.index.values,
        colorscale=gnbr,
        opacity=0.9,
        # Pin the colour range to the full correlation range.
        zmax=1,
        zmin=-1,
    )
]

layout = go.Layout(
    title='Pearson Correlation of Starbucks Beverage Nutrition Facts',
    xaxis=dict(ticks='', nticks=225),
    yaxis=dict(ticks=''),
    width=1000, height=1000,
)

f = go.Figure(data=heatmap_data, layout=layout)
plot_url = py.plot(f, filename='sbux_heatmap', auto_open=False)
plot_url

'https://plot.ly/~Ruwai/1'

In [33]:
# Product names that belong to the tea-latte group.
tea_latte_names = [
    'Chai Tea Latte',
    'Iced Chai Tea Latte',
    'Green Tea Latte',
    'Iced Green Tea Latte',
    'London Fog Tea Latte',
    'English Breakfast Tea Latte',
]

# Product names that belong to the espresso group.
espresso_names = [
    'Caffè Americano',
    'Iced Caffè Americano',
    'Caffè Latte',
    'Iced Caffè Latte',
    'Caffè Mocha',
    'Iced Caffè Mocha',
    'Cappuccino',
    'Caramel Macchiato',
    'Iced Caramel Macchiato',
    'Cinnamon Dolce Latte',
    'Espresso',
    'Espresso Macchiato',
    'Flat White',
    'Latte Macchiato',
    'Starbucks Doubleshot on Ice',
    'White Chocolate Mocha',
    'Iced White Chocolate Mocha',
]

# Boolean-mask selection returns new frames; `starbucks` itself is untouched.
product_name = starbucks['Product Name']
tea_latte = starbucks.loc[product_name.isin(tea_latte_names)]
espresso = starbucks.loc[product_name.isin(espresso_names)]
# Frappuccinos are matched by substring rather than by an explicit list.
fraps = starbucks.loc[product_name.str.contains('Frappuccino')]

# Report the number of rows in each group, separated by a dashed line.
group_reports = [
    ("Tea Latte Beverages Dataframe's values : {}", tea_latte),
    ("Espresso Beverages Dataframe's values : {}", espresso),
    ("Frappuccino Beverages Dataframe's values: {}", fraps),
]
print('\n---------\n'.join(
    template.format(len(frame)) for template, frame in group_reports
))

Tea Latte Beverages Dataframe's values : 90
---------
Espresso Beverages Dataframe's values : 235
---------
Frappuccino Beverages Dataframe's values: 336


In [46]:
# Frappuccino bar plots: sugar and calories for every Frappuccino row.

# Sugar per drink, coloured by the sugar value on the yellow-green scale.
trace = go.Bar(
    name='Sugar',  # legend label, so the two bar series can be told apart
    x=fraps['Product Name'].values,
    y=fraps['Sugar'].values,
    marker=dict(
        color=fraps['Sugar'].values,
        colorscale=ylgn,
        reversescale=False,
    ),
    orientation='v',
)

# Calories per drink, coloured by the calorie value on the green-brown scale.
trace1 = go.Bar(
    name='Calories',
    x=fraps['Product Name'].values,
    y=fraps['Calories'].values,
    marker=dict(
        color=fraps['Calories'].values,
        colorscale=gnbr,
        reversescale=False,
    ),
    orientation='v',
)

layout = go.Layout(
    title='How much sugar is in my Frappuccino?',
    width=1600,
    height=600,
)

f = go.Figure(data=[trace, trace1], layout=layout)
# The option is spelled `auto_open`, as in the heatmap cell; the earlier
# `auto_show=False` was not that option, so it did not suppress the browser.
plot_url = py.plot(f, filename='barplot', auto_open=False)
plot_url

'https://plot.ly/~Ruwai/7'

In [52]:
#Tea Latte 3D Scatter Plot

# Hover label carries both size and milk, because the chart title promises
# both; previously only the size was shown.
tea_latte_hover = (
    tea_latte['Size'].astype(str) + ', ' + tea_latte['Milk'].astype(str)
).values

trace1 = go.Scatter3d(
    x=tea_latte['Sugar'].values,
    y=tea_latte['Product Name'].values,
    z=tea_latte['Total Fat'].values,
    text=tea_latte_hover,
    mode='markers',
    marker=dict(
        sizemode='diameter',
        # Marker colour encodes the same quantity as the z axis (total fat).
        color=tea_latte['Total Fat'].values,
        colorscale=ylgn,
        colorbar=dict(
            title='Total Fat',
        ),
        line=dict(color='rgb(255,255,255)'),
    ),
)

# Named tea_latte_data rather than `data` so the raw extracted frame that
# `starbucks` is built from is not overwritten by a list of traces.
tea_latte_data = [trace1]
layout = go.Layout(
    title='Tea Lattes, measuring Total Fat and Sugar given their size and milk option',
    height=900,
    width=900,
)
f = dict(data=tea_latte_data, layout=layout)
# The option is spelled `auto_open`, as in the heatmap cell; the earlier
# `auto_show=False` was not that option, so it did not suppress the browser.
plot_url = py.plot(f, filename='3DCaloriesTeas', auto_open=False)
plot_url


'https://plot.ly/~Ruwai/5'

In [51]:
#Espresso Scatter Plot

# Hover label carries both size and milk, because the chart title refers to
# the milk choice; previously only the size was shown.
espresso_hover = (
    espresso['Size'].astype(str) + ', ' + espresso['Milk'].astype(str)
).values

trace1 = go.Scatter3d(
    x=espresso['Calories'].values,
    y=espresso['Product Name'].values,
    z=espresso['Caffeine'].values,
    text=espresso_hover,
    mode='markers',
    marker=dict(
        sizemode='diameter',
        color=espresso['Caffeine'].values,
        colorscale=ylgn,
        colorbar=dict(
            # The colour encodes caffeine, so the colorbar is labelled
            # 'Caffeine'; it used to read 'Calories'.
            title='Caffeine',
        ),
        line=dict(color='rgb(255,255,255)'),
    ),
)

# Named espresso_data rather than `data` so the raw extracted frame that
# `starbucks` is built from is not overwritten by a list of traces.
espresso_data = [trace1]
layout = dict(
    height=900,
    width=900,
    title='Espressos and their Calorie given their Milk choice',
)
f = dict(data=espresso_data, layout=layout)
# auto_open=False matches the heatmap cell: the URL is returned without
# opening a browser tab on every run.
py.plot(f, filename='3DEspressosCals', auto_open=False)

'https://plot.ly/~Ruwai/9'