# In This Notebook

Exploratory data analysis of the beer objects from the brewery-db API.

# Setup

In [2]:
import os

from bkcharts import BoxPlot, Histogram, output_notebook, show
from bokeh.models import Range1d
import numpy as np
import pandas as pd

In [3]:
# Configure plot output to display inline in this notebook, so that later
# show() calls render in the cell output.
output_notebook()

In [4]:
# Directory holding the brewery-db working files. This is a relative path, so
# it is resolved against the kernel's current working directory.
wrk = '../../../data/wrk/brewery-db/'

# Load Data

In [5]:
# Locate beers.csv inside the working directory, make the path absolute,
# and load the file into a DataFrame.
beers_csv = os.path.join(wrk, 'beers.csv')
fpath = os.path.abspath(beers_csv)
beers = pd.read_csv(fpath)

In [6]:
# Preview only the first few rows: asking for 1000 rows bloats the saved
# notebook output without adding anything to a quick look at the columns.
beers.head(10)

Unnamed: 0,abv,beerVariationId,createDate,description,foodPairings,glasswareId,ibu,id,ingredients,isOrganic,...,originalGravity,servingTemperature,servingTemperatureDisplay,socialAccounts,srmId,status,statusDisplay,styleId,updateDate,year
0,11.10,,2013-01-20 21:38:15,Hop Heads this one's for you! Checking in wit...,,5.0,,cBLTUw,,N,...,,,,,33.0,verified,Verified,43.0,2016-08-17 16:05:54,
1,,,2012-01-03 02:42:36,,,,,PvaOWz,,N,...,,,,,,verified,Verified,,2012-03-22 13:04:49,
2,,,2014-05-06 13:08:12,,,,,DyNPlC,,N,...,,,,"[{u'handle': u'161807', u'createDate': u'2015-...",,verified,Verified,30.0,2015-12-17 11:48:16,
3,,,2012-01-03 02:42:37,,,,,oQR5YM,,N,...,,,,,,verified,Verified,42.0,2012-03-22 13:04:50,
4,11.50,,2012-01-03 02:42:36,,,,,hNPjUO,,N,...,,,,"[{u'handle': u'40812', u'createDate': u'2014-1...",,verified,Verified,35.0,2014-10-09 14:32:53,
5,6.30,,2015-04-16 19:04:45,Stephen Fraley was discussing the Civil War wh...,,,32.0,piINy9,,N,...,,,,,,verified,Verified,18.0,2015-04-16 19:04:45,
6,3.50,,2012-01-03 02:42:37,,,,,Qg6dpg,,N,...,,,,"[{u'handle': u'89239', u'createDate': u'2015-0...",,verified,Verified,42.0,2015-05-22 15:22:58,
7,11.40,,2012-01-03 02:42:36,Der Rauch Gott (Imperial Smoked Ale),,,,ySo6qj,,N,...,,,,,,verified,Verified,129.0,2015-12-16 08:15:01,
8,5.00,,2012-01-03 02:42:36,"A refreshing, golden-hued European-style pilsn...",,4.0,32.0,bClbFg,,N,...,,,,"[{u'handle': u'141288', u'createDate': u'2014-...",,verified,Verified,76.0,2015-04-10 19:05:51,
9,6.90,,2014-06-03 11:51:34,Man Eating Sharks Infest the Atlantic!\r\n\r\n...,,5.0,,9wNKio,,N,...,,,,,,verified,Verified,30.0,2015-12-17 12:58:57,


# Describe Data

In [7]:
# Column dtypes and non-null counts, to see how sparsely each field is populated.
beers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16091 entries, 0 to 16090
Data columns (total 23 columns):
abv                          13263 non-null float64
beerVariationId              97 non-null object
createDate                   16091 non-null object
description                  9747 non-null object
foodPairings                 809 non-null object
glasswareId                  4137 non-null float64
ibu                          5902 non-null float64
id                           16091 non-null object
ingredients                  440 non-null object
isOrganic                    16091 non-null object
labels                       5551 non-null object
name                         16091 non-null object
nameDisplay                  16091 non-null object
originalGravity              1192 non-null float64
servingTemperature           930 non-null object
servingTemperatureDisplay    930 non-null object
socialAccounts               3867 non-null object
srmId                        1577 non

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
beers.describe()

Unnamed: 0,abv,glasswareId,ibu,originalGravity,srmId,styleId,year
count,13263.0,4137.0,5902.0,1192.0,1577.0,15396.0,178.0
mean,6.600605,5.339618,41.110826,1.061978,17.253012,58.959795,2012.185393
std,3.423505,1.489635,26.827465,0.022852,13.531315,40.031253,4.028768
min,0.01,1.0,1.0,1.0,1.0,1.0,1994.0
25%,5.1,5.0,22.0,1.05,6.0,30.0,2011.0
50%,6.0,5.0,33.45,1.058,12.0,43.0,2013.0
75%,7.5,5.0,55.45,1.07,28.0,83.0,2015.0
max,308.0,14.0,430.0,1.452,41.0,170.0,2017.0


# Transform Data

In [9]:
def transform_row(row):
    """Return a copy of one beer record with normalized field types.

    Parameters
    ----------
    row : pandas.Series
        A single beer record. It must contain the labels ``createDate``
        and ``isOrganic``.

    Returns
    -------
    pandas.Series
        A new Series in which ``createDate`` has been parsed with
        ``pd.to_datetime`` and ``isOrganic`` is 0 for ``'N'`` and 1 for
        ``'Y'``. Any other ``isOrganic`` value is passed through unchanged.
        The input ``row`` is left untouched.
    """
    # Work on a copy: pandas does not support functions passed to
    # DataFrame.apply mutating the object they are handed.
    out = row.copy()
    # Bracket indexing always targets the labelled element, whereas
    # attribute-style assignment only does so when the label already exists.
    out['createDate'] = pd.to_datetime(row['createDate'])
    organic_codes = {'N': 0, 'Y': 1}
    flag = row['isOrganic']
    out['isOrganic'] = organic_codes.get(flag, flag)
    return out

# Column-wise equivalent of running transform_row on every record: parse
# createDate once for the whole column and recode isOrganic ('N' -> 0,
# 'Y' -> 1, anything else left as-is). This avoids calling a Python function
# and pd.to_datetime separately for each row.
organic_codes = {'N': 0, 'Y': 1}
beers = beers.assign(
    createDate=pd.to_datetime(beers['createDate']),
    isOrganic=beers['isOrganic'].map(lambda flag: organic_codes.get(flag, flag)),
)
# Year in which each beer was added to the database.
beers['createYear'] = beers['createDate'].dt.year

# Exploratory Data Analysis

## ABV

The ABV of the beers appears to have a positively (right-) skewed distribution, with a median of 6.0 ABV and a mean of 6.60 ABV. The mean sits above the median because a relatively small number of high-ABV beers stretch the right tail of the distribution. In fact, there are 2 beers, in particular, with greater than 40.0 ABV; the reported maximum of 308.0 cannot be a real percentage, so these are most likely data-entry errors and candidates for removal. There are also 6,344 beers with no description. These might also need to be removed.

In [18]:
# ABV summary, adding the 2.5th and 97.5th percentiles to bracket the central 95% of values.
beers.abv.describe(percentiles=[.025,.25,.5,.75,.975])

count    13263.000000
mean         6.600605
std          3.423505
min          0.010000
2.5%         4.000000
25%          5.100000
50%          6.000000
75%          7.500000
97.5%       11.500000
max        308.000000
Name: abv, dtype: float64

In [19]:
# Histogram of the non-null ABV values. The x-axis is limited to 0-50, which
# leaves the most extreme values (the maximum is 308) off the plot.
abv_values = beers.abv[beers.abv.notnull()]
hist = Histogram(
    abv_values,
    title='ABV Distribution',
    bins=100,
    x_range=Range1d(0, 50),
)
show(hist)

In [20]:
# Beers with an ABV above 11.4, listed from lowest to highest ABV.
# A missing ABV never satisfies the comparison, so those rows drop out
# without a separate null check.
above_threshold = beers.abv.gt(11.4)
high_abv = beers[above_threshold]
high_abv.sort_values(by='abv')

Unnamed: 0,abv,beerVariationId,createDate,description,foodPairings,glasswareId,ibu,id,ingredients,isOrganic,...,servingTemperature,servingTemperatureDisplay,socialAccounts,srmId,status,statusDisplay,styleId,updateDate,year,createYear
4,11.50,,2012-01-03 02:42:36,,,,,hNPjUO,,0,...,,,"[{u'handle': u'40812', u'createDate': u'2014-1...",,verified,Verified,35.0,2014-10-09 14:32:53,,2012
10576,11.50,,2012-01-03 02:43:50,"In Britain, seasonal brews for winter are high...",,,,wYUVVF,,0,...,,,,,verified,Verified,34.0,2013-11-25 00:14:57,2004.0,2012
10537,11.50,,2012-01-03 02:43:50,Named for the witch who guarded the fountain o...,,5.0,115.0,s0l856,,0,...,,,"[{u'handle': u'67745', u'createDate': u'2014-0...",41.0,verified,Verified,43.0,2014-08-08 10:29:29,,2012
10382,11.50,,2014-09-10 00:59:15,Our Smoked Wee Heavy is loaded with layers of ...,,2.0,,rhAJd4,,0,...,cellar,Cellar - (12-14C/54-57F),"[{u'handle': u'230836', u'createDate': u'2015-...",30.0,verified,Verified,15.0,2016-06-29 18:18:13,,2014
1534,11.50,,2016-05-24 13:20:34,Barrel-Aged Imperial Stout aged in Woodford Re...,,,,6VmqMF,,0,...,,,,,verified,Verified,43.0,2016-05-24 13:20:34,,2016
1616,11.50,,2015-09-01 03:27:09,,,,,6GlduG,,0,...,,,"[{u'handle': u'964055', u'createDate': u'2015-...",,verified,Verified,14.0,2015-12-18 05:06:40,,2015
8901,11.50,,2012-03-20 02:10:23,"Latin for ""wheat,"" Triticus is a strong and da...",,,57.0,ezUwcQ,,0,...,,,"[{u'handle': u'130038', u'createDate': u'2015-...",,verified,Verified,125.0,2015-12-16 16:08:50,,2012
8786,11.50,,2017-01-03 21:27:15,Bourbon Barrel-Aged Imperial Last Snow\r\n\r\n...,,,40.0,x705yC,,0,...,,,,,verified,Verified,158.0,2017-01-03 21:27:15,,2017
8193,11.50,,2013-11-07 05:43:49,,,,75.0,rsraBJ,,0,...,,,,,verified,Verified,41.0,2015-12-17 06:07:44,2013.0,2013
7757,11.50,,2017-06-08 13:32:01,"Imperial stout brewed with chocolate, black tr...",,,91.0,aBTOxN,,0,...,,,,,verified,Verified,43.0,2017-06-08 13:33:19,,2017


## IBU

In [22]:
# IBU summary, adding the 2.5th and 97.5th percentiles to bracket the central 95% of values.
beers.ibu.describe(percentiles=[.025,.25,.5,.75,.975])

count    5902.000000
mean       41.110826
std        26.827465
min         1.000000
2.5%        9.000000
25%        22.000000
50%        33.450000
75%        55.450000
97.5%     100.000000
max       430.000000
Name: ibu, dtype: float64

In [23]:
# Histogram of the non-null IBU values. The x-axis spans 0-150: the median IBU
# is about 33, the 75th percentile about 55 and the 97.5th percentile 100, so
# a 0-50 axis would hide more than a quarter of the beers. The most extreme
# values (the maximum is 430) are still left off the plot.
hist = Histogram(beers[pd.notnull(beers.ibu)].ibu, title = 'IBU Distribution', bins=100,
              x_range=Range1d(0,150))
show(hist)

In [21]:
# Among the beers that have no description, count the non-null values in each column.
no_description = beers.description.isnull()
beers[no_description].count()

abv                          4431
beerVariationId                35
createDate                   6344
description                     0
foodPairings                   22
glasswareId                   792
ibu                          1066
id                           6344
ingredients                    30
isOrganic                    6344
labels                       1689
name                         6344
nameDisplay                  6344
originalGravity               128
servingTemperature            135
servingTemperatureDisplay     135
socialAccounts               1165
srmId                         238
status                       6344
statusDisplay                6344
styleId                      5738
updateDate                   6344
year                           77
createYear                   6344
dtype: int64

In [40]:
# Number of beers per year in which they were added to the database
count = beers.groupby('createYear')['id'].count()

In [41]:
# Display the per-year beer counts.
count

createYear
2012    4754
2013    1337
2014    2221
2015    3912
2016    1987
2017    1199
Name: id, dtype: int64