# Our Dataset

Collecting beer recipes for machine learning

## Outline
* What we want to know about beer
* What beer data is out there
* What is beerXML
  * Load and print a beerXML file
  * Compare brewtoad and brewersfriend: presence/absense of tags
* How much did we get
  * Count recipes
* How did we represent it
  * Print a DataFrame  
* What does the beer recipe landscape look like
  * \# recipes/styles
  * what's cloned the most often?
  * most common malt, hop, yeast names
  * batch sizes
* Sneak preview: beer measurables (IBU, ABV, SRM)

In [55]:
import pandas as pd
import glob
from lxml import etree
from pprint import pprint

In [43]:
recipes_folder = "../recipes/brewtoad_recipes/"
files = glob.glob(recipes_folder + "*snpa*")
files[0:50]

['../recipes/brewtoad_recipes/snpa-clone-17.xml',
 '../recipes/brewtoad_recipes/snpa-clone-attempt-with-hops-on-hand-and-extra-15l-5gal-mash-vers.xml',
 '../recipes/brewtoad_recipes/snpa-klon-nr-2-836cfe.xml',
 '../recipes/brewtoad_recipes/snpa-15.xml',
 '../recipes/brewtoad_recipes/snpa-14.xml',
 '../recipes/brewtoad_recipes/mnir-snpa-clone-00499c.xml',
 '../recipes/brewtoad_recipes/snpa-clone-tweaked.xml',
 '../recipes/brewtoad_recipes/snpa-161007.xml',
 '../recipes/brewtoad_recipes/snpa-2-ccd326.xml',
 '../recipes/brewtoad_recipes/snpa-clone-16.xml',
 '../recipes/brewtoad_recipes/snpa-clone-28.xml',
 '../recipes/brewtoad_recipes/snpa-clone-14.xml',
 '../recipes/brewtoad_recipes/snpa-clone-90e7aa.xml',
 '../recipes/brewtoad_recipes/snpa-ipa-1.xml',
 '../recipes/brewtoad_recipes/snpabyanders.xml',
 '../recipes/brewtoad_recipes/snpa2-9d878c.xml',
 '../recipes/brewtoad_recipes/snpa-9.xml',
 '../recipes/brewtoad_recipes/snpa-16.xml',
 '../recipes/brewtoad_recipes/snpa-8.xml',
 '../recipe

In [44]:
len(files)

166

In [101]:
bt_snpa_example = "../recipes/brewtoad_recipes/snpaclone.xml"

In [102]:
with open(bt_snpa_example, "r") as f:
    beerxml = f.read()

In [103]:
pprint(beerxml)

('<?xml version="1.0" encoding="UTF-8"?>\n'
 '<RECIPES>\n'
 '  <RECIPE>\n'
 '    <NAME>SNPAclone</NAME>\n'
 '    <STYLE>\n'
 '      <STYLE_GUIDE>BJCP</STYLE_GUIDE>\n'
 '      <VERSION>1</VERSION>\n'
 '      <NAME>American Pale Ale</NAME>\n'
 '      <STYLE_LETTER>A</STYLE_LETTER>\n'
 '      <CATEGORY_NUMBER>10</CATEGORY_NUMBER>\n'
 '      <TYPE>Ale</TYPE>\n'
 '      <OG_MIN>1.045</OG_MIN>\n'
 '      <OG_MAX>1.06</OG_MAX>\n'
 '      <FG_MIN>1.01</FG_MIN>\n'
 '      <FG_MAX>1.015</FG_MAX>\n'
 '      <IBU_MIN>30.0</IBU_MIN>\n'
 '      <IBU_MAX>45.0</IBU_MAX>\n'
 '      <COLOR_MIN>5.0</COLOR_MIN>\n'
 '      <COLOR_MAX>14.0</COLOR_MAX>\n'
 '      <ABV_MIN>4.5</ABV_MIN>\n'
 '      <ABV_MAX>6.2</ABV_MAX>\n'
 '    </STYLE>\n'
 '    <FERMENTABLES>\n'
 '      <FERMENTABLE>\n'
 '        <NAME>2-Row (US)</NAME>\n'
 '        <ORIGIN></ORIGIN>\n'
 '        <TYPE>Base Malt</TYPE>\n'
 '        <YIELD>79.89635068019865</YIELD>\n'
 '        <AMOUNT>4.535923700000001</AMOUNT>\n'
 '        <DISPLAY_AMOUNT>

In [104]:
xml = etree.parse(bt_snpa_example)
for e in xml.iter():
    print(xml.getpath(e))

/RECIPES
/RECIPES/RECIPE
/RECIPES/RECIPE/NAME
/RECIPES/RECIPE/STYLE
/RECIPES/RECIPE/STYLE/STYLE_GUIDE
/RECIPES/RECIPE/STYLE/VERSION
/RECIPES/RECIPE/STYLE/NAME
/RECIPES/RECIPE/STYLE/STYLE_LETTER
/RECIPES/RECIPE/STYLE/CATEGORY_NUMBER
/RECIPES/RECIPE/STYLE/TYPE
/RECIPES/RECIPE/STYLE/OG_MIN
/RECIPES/RECIPE/STYLE/OG_MAX
/RECIPES/RECIPE/STYLE/FG_MIN
/RECIPES/RECIPE/STYLE/FG_MAX
/RECIPES/RECIPE/STYLE/IBU_MIN
/RECIPES/RECIPE/STYLE/IBU_MAX
/RECIPES/RECIPE/STYLE/COLOR_MIN
/RECIPES/RECIPE/STYLE/COLOR_MAX
/RECIPES/RECIPE/STYLE/ABV_MIN
/RECIPES/RECIPE/STYLE/ABV_MAX
/RECIPES/RECIPE/FERMENTABLES
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]/NAME
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]/ORIGIN
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]/TYPE
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]/YIELD
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]/AMOUNT
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]/DISPLAY_AMOUNT
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]/POTENTIAL
/RECI

Original Ken Grossman recipe: https://sierranevada.com/blog/pale-ale-homebrew-recipe/

In [32]:
recipes_folder = "../recipes/recipes_brewersfriend/"
files = glob.glob(recipes_folder + "*")

In [33]:
with pd.HDFStore("../all_recipes.h5") as store:
    core = store.select(key="/core")

In [75]:
clones = core[core.name.str.contains("clone")].name

In [87]:
clone_words = pd.Series([item for sublist in list(clones.str.split()) for item in sublist])

In [92]:
clone_words.value_counts().head(40)

clone       13586
ale          1381
ipa          1200
-             789
clone)        558
stout         544
pale          503
the           410
sierra        379
nevada        361
stone         346
porter        327
pliny         326
red           297
hearted       282
version)      273
two           243
dust          232
elder         230
zombie        224
brown         213
black         206
2             206
old           201
fat           196
blue          190
tire          188
extract       186
moon          182
(clone)       180
bastard       178
head          174
amber         169
bell's        157
white         156
hop           156
milk          150
of            146
1             145
double        142
dtype: int64

In [39]:
bf_snpa = core[(core.origin == "brewersfriend") & (core.name.str.contains("snpa"))]

In [42]:
bf_snpa

Unnamed: 0_level_0,batch_size,boil_size,boil_time,brewer,efficiency,name,origin,recipe_file,src_abv,src_color,src_fg,src_ibu,src_og,style_category,style_guide,style_name,style_version
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
333780,5.8,8.5,60.0,,0.8,snpa,brewersfriend,recipes/brewersfriend/266866.xml,5.47,9.42,1.011,42.39,1.053,10a,bjcp,american pale ale,1.0
344204,41.63953,47.62048,60.0,,0.8,snpa 10 gal,brewersfriend,recipes/brewersfriend/303792.xml,5.65,9.05,1.01,43.06,1.053,10a,bjcp,american pale ale,1.0
346531,22.712471,28.390588,60.0,my adv / ozarks mountain brew,0.76,snpa clone,brewersfriend,recipes/brewersfriend/232281.xml,5.36,7.49,1.009,37.92,1.05,10a,bjcp,american pale ale,1.0
349787,18.927059,28.390588,60.0,aih,0.7,snpa,brewersfriend,recipes/brewersfriend/417766.xml,5.72,8.02,1.012,63.78,1.056,10a,bjcp,american pale ale,1.0
351027,20.0,12.0,60.0,,0.7,snpa clone keg,brewersfriend,recipes/brewersfriend/335632.xml,5.57,13.2,1.008,28.17,1.051,18b,bjcp,american pale ale,1.0
354349,20.819765,28.390588,60.0,,0.75,snpa clone,brewersfriend,recipes/brewersfriend/125052.xml,5.89,7.62,1.015,55.23,1.06,10a,bjcp,american pale ale,1.0
358117,18.0,25.0,60.0,,0.64,snpa,brewersfriend,recipes/brewersfriend/545021.xml,5.4,9.46,1.013,44.24,1.054,18b,bjcp,american pale ale,1.0
381521,22.0,26.5,90.0,,0.75,toms snpa clone (fwh),brewersfriend,recipes/brewersfriend/449635.xml,5.3,8.01,1.012,55.88,1.053,10a,bjcp,american pale ale,1.0
385308,11.356235,13.248941,40.0,,0.8,snpa clone-ish,brewersfriend,recipes/brewersfriend/1012.xml,5.31,6.13,1.013,58.47,1.053,10a,bjcp,american pale ale,1.0
388437,41.63953,51.103059,60.0,,0.8,snpa,brewersfriend,recipes/brewersfriend/354940.xml,5.25,8.22,1.013,40.46,1.053,18b,bjcp,american pale ale,1.0


In [35]:
core.origin.value_counts()

brewtoad         330790
brewersfriend     72367
Name: origin, dtype: int64

In [49]:
bf_snpa_example = "../recipes/recipes_brewersfriend/232281.xml"

In [50]:
with open(bf_snpa_example, "r") as f:
    bf_snpa_beerxml = f.read()

In [52]:
pprint(bf_snpa_beerxml)

('<?xml version="1.0" encoding="UTF-8"?>\n'
 '<RECIPES>\n'
 ' <RECIPE>\n'
 '  <NAME>SNPA Clone</NAME>\n'
 '  <VERSION>1</VERSION>\n'
 '  <TYPE>All Grain</TYPE>\n'
 '  <BREWER>MY ADV / Ozarks Mountain Brew</BREWER>\n'
 '  <DISPLAY_BATCH_SIZE>6 gal</DISPLAY_BATCH_SIZE>\n'
 '  <DISPLAY_BOIL_SIZE>7.5 gal</DISPLAY_BOIL_SIZE>\n'
 '  <BATCH_SIZE>22.71247068</BATCH_SIZE>\n'
 '  <BOIL_SIZE>28.39058835</BOIL_SIZE>\n'
 '  <BOIL_TIME>60</BOIL_TIME>\n'
 '  <EFFICIENCY>76</EFFICIENCY>\n'
 '  <NOTES>Water Needed. 8.71 gallons in to kettle. Single infusion no '
 'sparge.  Filled to 8.75 set boil to med to hard and had 6 gallons into '
 'fermenter with no loss.&#13;\n'
 '&#13;\n'
 'No squeeze bag: Hang and drip. Place drippings back into kettle before boil. '
 '&#13;\n'
 '&#13;\n'
 'Ambient Temp = 68F&#13;\n'
 'Estimated Strike Temp: 158.47F&#13;\n'
 'Real Strike Temp Should be: 155F&#13;\n'
 '60 min rest at 153F&#13;\n'
 '&#13;\n'
 'Hop Addition: 0 Min = Flame out&#13;\n'
 'Dry Hop: After primary ferm

In [100]:
xml = etree.parse(bf_snpa_example)
for e in xml.iter():
    print(xml.getpath(e))

/RECIPES
/RECIPES/RECIPE
/RECIPES/RECIPE/NAME
/RECIPES/RECIPE/VERSION
/RECIPES/RECIPE/TYPE
/RECIPES/RECIPE/BREWER
/RECIPES/RECIPE/DISPLAY_BATCH_SIZE
/RECIPES/RECIPE/DISPLAY_BOIL_SIZE
/RECIPES/RECIPE/BATCH_SIZE
/RECIPES/RECIPE/BOIL_SIZE
/RECIPES/RECIPE/BOIL_TIME
/RECIPES/RECIPE/EFFICIENCY
/RECIPES/RECIPE/NOTES
/RECIPES/RECIPE/PRIMARY_TEMP
/RECIPES/RECIPE/EST_COLOR
/RECIPES/RECIPE/IBU
/RECIPES/RECIPE/IBU_METHOD
/RECIPES/RECIPE/EST_ABV
/RECIPES/RECIPE/EST_OG
/RECIPES/RECIPE/EST_FG
/RECIPES/RECIPE/OG
/RECIPES/RECIPE/FG
/RECIPES/RECIPE/PRIMING_SUGAR_NAME
/RECIPES/RECIPE/CARBONATION_USED
/RECIPES/RECIPE/BF_PRIMING_METHOD
/RECIPES/RECIPE/BF_PRIMING_AMOUNT
/RECIPES/RECIPE/BF_CO2_LEVEL
/RECIPES/RECIPE/BF_CO2_UNIT
/RECIPES/RECIPE/URL
/RECIPES/RECIPE/BATCH_SIZE_MODE
/RECIPES/RECIPE/YEAST_STARTER
/RECIPES/RECIPE/NO_CHILL_EXTRA_MINUTES
/RECIPES/RECIPE/PITCH_RATE
/RECIPES/RECIPE/FERMENTABLES
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]
/RECIPES/RECIPE/FERMENTABLES/FERMENTABLE[1]/NAME
/RECIPES/RECIPE/