# Data-Collection from Beer Maverick Hops Database 

**Source**: https://beermaverick.com/hops/

**Method**: Web-Scraping

------------------------------------------------------------

### Setup & Validation
**Objective**: Import required modules, establish connection & validate scraping capability.

In [1]:
# Import necessary packages
import numpy as np
import os
import pandas as pd
import requests
import unicodedata2
import urllib.robotparser
from bs4 import BeautifulSoup as bsoup
from time import sleep

In [2]:
# Ask the site's robots.txt whether a generic crawler ('*') may fetch the hops landing page
base_url = 'https://beermaverick.com/hops/'
rp = urllib.robotparser.RobotFileParser('https://beermaverick.com/robots.txt')  # URL set via constructor
rp.read()  # download & parse robots.txt
rp.can_fetch(useragent='*', url=base_url)

True

In [3]:
# Establish connection to base-URL with basic exception handling.
# A timeout is set because requests otherwise waits indefinitely on an unresponsive server.
try:
    response = requests.get(base_url, timeout=30)
except requests.exceptions.RequestException as err:
    # Reset so later cells cannot silently reuse a response left over from an earlier run
    response = None
    print('Unable to establish connection at this time: \n', err)
else:
    # Only reached when the request itself completed; the status code is reported as-is
    print('Connection successfully established. \nStatus Code: ', response.status_code)

Connection successfully established. 
Status Code:  200


### Base-URL Scraping
**Objective:** Collect hop names & corresponding endpoint URLs for further validation & scraping. 

In [4]:
# Parse the landing-page HTML with BS4
soup = bsoup(response.text, 'html')

# The second matching box is the hop listing; every <p> inside it is one section of links
hops_container = soup.find_all('div', {'class':'box-inner-p-bigger box-single'})[1].find_all('p')

# Collect {official-hop-name: endpoint} from every section except the trailing one
hop_names = {}
for paragraph in hops_container[:-1]:
    hop_names.update({link.text: link['href'] for link in paragraph.find_all('a')})

print(len(hop_names) == 304)  # Verify correct amount of hops 
# print(hop_names)  # Look at final dict (key=official-hop-name & value=url's endpoint)

True


### Endpoint Scraping
**Objective:** Collect & store detailed data for each hop from its respective endpoint.

In [5]:
def hops_scraper(hops, delay=8, timeout=30):
    """Scrape every hop page and yield one dict of collected values per hop.

    Parameters
    ----------
    hops : dict
        Maps hop name -> endpoint path; the path is appended to
        'https://beermaverick.com' to build the page URL.
    delay : float, optional
        Seconds to sleep after each request attempt (default 8) to avoid
        Error 429 (too many requests).
    timeout : float, optional
        Seconds to wait for the server before giving up on a request
        (default 30).

    Yields
    ------
    dict
        Always contains 'Hop Name' and 'Scraping Status'. When the status is
        True it also holds whatever overview rows, flavor/aroma tags and
        brewing values were found on the page. The status is False when the
        request raised, returned a non-200 code, or the page had no
        'entry-content' container.
    """
    for hop, endpoint in hops.items():

        endpoint_url = 'https://beermaverick.com' + endpoint

        # A failed request must not fall through with an unbound `response`
        # or with the previous hop's response, so reset it explicitly.
        try:
            response = requests.get(endpoint_url, timeout=timeout)
        except requests.exceptions.RequestException as err:
            print('Unable to establish connection at this time: \n', err)
            response = None

        # Force wait-time between requests to prevent Error 429 (too many requests)
        sleep(delay)

        if response is None or response.status_code != 200:
            yield {'Hop Name': hop, 'Scraping Status': False}
            continue

        hop_dict = {'Hop Name': hop, 'Scraping Status': True}  # initiate dict to store hop data

        # Instantiate BS4 & locate umbrella container tag/attribute for all data sections
        soup = bsoup(response.text, 'html')
        info_section = soup.find('div', {'class': 'entry-content'})
        if info_section is None:
            # Page layout not recognised: report as failed instead of aborting the whole run
            hop_dict['Scraping Status'] = False
            yield hop_dict
            continue

        # Retrieve info from initial overview table (purpose, country, code, ownership)
        overview = info_section.find('table')
        if overview is not None:
            for row in overview.find_all('tr'):  # each 'tr' tags each row
                header = row.find('th')  # each 'th' tags index col
                value = row.find('td')  # each 'td' tags value col
                if header is not None and value is not None:
                    hop_dict[header.text] = value.text

        # Retrieve flavor/aroma profile characteristics (displayed as "tags" on webpage)
        aroma_profile = info_section.find('em')
        if aroma_profile is not None:  # section is absent on some pages
            tags = aroma_profile.find_all('a', {'class': 'text-muted'})
            hop_dict['Flavor & Aroma Profile'] = [tag.text for tag in tags]

        # Retrieve data from brewing values table
        brew_table = info_section.find('table', {'class': 'brewvalues'})
        if brew_table is not None:
            for row in brew_table.find_all('tr'):  # each 'tr' tags each row
                header = row.find('th')
                value = row.find('td')
                # Skip rows without a heading or value (e.g. the empty 'Total Oil Breakdown' row)
                if header is None or value is None:
                    continue
                # Subtract the details-subtext from the heading string, when there is one
                subtext = row.find('small')
                key = header.text
                if subtext is not None:
                    key = key.replace(subtext.text, '')
                hop_dict[key] = value.text

        # print(hop_dict)  # debug
        yield hop_dict

### Load Hops Data
**Objective**: Execute the scraper generator function, load each yielded dict as a row of a dataframe & output it as CSV.

In [6]:
# Run the scraper over every hop and report the elapsed time in seconds.
# perf_counter() is a monotonic clock meant for measuring durations; time() follows the
# system wall clock and can jump if the clock is adjusted during this long-running scrape.
from time import perf_counter, time  # `time` stays imported in case other cells rely on it
t1 = perf_counter()
hops_df = pd.DataFrame(hops_scraper(hop_names))  # the generator is consumed row by row here
t2 = perf_counter()
print(t2-t1)

2795.8903152942657


In [13]:
# Write the raw scraped hops data to CSV; create the output folder first so
# to_csv does not fail on a fresh checkout where it does not exist yet.
os.makedirs('./rc_raw_data', exist_ok=True)
hops_df.to_csv('./rc_raw_data/rc_raw_hops_main.csv')
hops_df

Unnamed: 0,Hop Name,Scraping Status,Purpose:,Country:,International Code:,Ownership:,Flavor & Aroma Profile,Alpha Acid % (AA),Beta Acid %,Alpha-Beta Ratio,Co-Humulone as % of Alpha,Total Oils (mL/100g),› Myrcene,› Humulene,› Caryophyllene,› Farnesene,› All Others,Cultivar/Brand ID:,Hop Storage Index (HSI)
0,Astra,True,Dual,Australia (AUS),AST,™ Ellerslie Hop Estate,"[#tropical_fruit, #sweet, #peach, #white_wine,...",7-10%8.5% avg,4-6%5% avg,1:1 - 3:12:1 avg,26-28%27% avg,1-3 mL2mL avg,Unknown,6-8%7% avg,Unknown,Unknown,92-94%,,
1,Eclipse,True,Dual,Australia (AUS),,® HPA,"[#citrus, #pine, #mandarin]",15.7-19%17.4% avg,5.9-9.0%7.5% avg,2:1 - 3:12:1 avg,33-37%35% avg,1.8-2.7 mL2.3mL avg,35-49%42% avg,0-2%1% avg,6-12%9% avg.,0-1%0.5% avg,36-59%,HPA 016,
2,Ella,True,Dual,Australia (AUS),ELL,™ Hop Products Australia (HPA),"[#floral, #spicy, #grapefruit, #tropical_fruit...",13.3-19.2%16.3% avg,4-7.8%5.9% avg,2:1 - 5:13:1 avg,33-40%36.5% avg,2.4-3.4 mL2.9mL avg,34-50%42% avg,15-22%18.5% avg,5-18%11.5% avg.,12-14%13% avg,0-34%,01-220-060,
3,Enigma,True,Aroma,Australia (AUS),ENI,™ Hop Products Australia (HPA),"[#tropical_fruit, #berry, #stone_fruit, #raspb...",13.5-19.4%16.5% avg,4.5-7.1%5.8% avg,2:1 - 4:13:1 avg,37-43%40% avg,1.8-3 mL2.4mL avg,23-30%26.5% avg,12-19%15.5% avg,6-8%7% avg.,9-12%10.5% avg,31-50%,02-016-008,30% (Good) 0.295
4,Feux-Coeur Francais,True,Bittering,Australia (AUS),,,,12-16%14% avg,3.1-6%4.6% avg,2:1 - 5:14:1 avg,Unknown,Unknown,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,Zappa,True,Aroma,United States of America (USA),ZAP,™ CLS Farms,"[#mint, #savory, #fruity, #mango, #passion_fru...",6-9%7.5% avg,8-9%8.5% avg,1:1 - 1:11:1 avg,40-45%42.5% avg,1.8-2.5 mL2.2mL avg,64-65%64.5% avg,4-5%4.5% avg,8-9%8.5% avg.,0-1%0.5% avg,20-24%,FZMR2,
300,Zenia,True,Dual,United States of America (USA),,Mighty Axe Hops,"[#dank, #resin, #orange, #onion, #citrus, #fru...",11-14%12.5% avg,4.4-5.1%4.8% avg,2:1 - 3:13:1 avg,Unknown,0.7-2.5 mL1.6mL avg,,,,,,,
301,Zenith,True,Bittering,United States of America (USA),,,"[#floral, #citrus]",9-11%10% avg,3%3% avg,INF:1 - 4:1INF:1 avg,25%25% avg,1.8 mL1.8mL avg,51-53%52% avg,18-20%19% avg,6-7%6.5% avg.,0-1%0.5% avg,19-25%,,
302,Zeus,True,Bittering,United States of America (USA),ZEU,,"[#black_pepper, #licorice, #curry, #pungent]",13-17.5%15.3% avg,4-6.5%5.3% avg,2:1 - 4:13:1 avg,28-40%34% avg,2.4-4.5 mL3.5mL avg,45-60%52.5% avg,9-18%13.5% avg,6-11%8.5% avg.,0-1%0.5% avg,10-40%,,48% (Poor) 0.48


### Reference-Material Scraping
**Objective**: Scrape reference materials (meta-data / standard info) for further context & analysis.

In [14]:
### SCRAPE HOPS-SUBSTITUTION CHART

substitutions_url = 'https://beermaverick.com/hops/hop-substitutions-chart/'

# Validate connection. `response` is reset first so a failed request cannot fall
# through to a response left over from an earlier cell; the timeout prevents hanging.
response = None
try:
    response = requests.get(substitutions_url, timeout=30)
except requests.exceptions.RequestException as err:  
    print('Unable to establish connection at this time: \n', err)

# One dict per table row; grown as rows are read so no row count is hard-coded
subs_info = []

if response is not None and response.status_code == 200:

    # Instantiate BS4 & locate container tag/attribute for table
    soup = bsoup(response.text, 'html')
    table = soup.find('tbody')
    rows = table.find_all('tr') if table is not None else []  # each 'tr' tags each row

    # Loop through each row of table and load its info as a new dict
    for row in rows:
        name_cell = row.find('th')  # each 'th' tags index col
        subs_cell = row.find('td')  # each 'td' tags value col
        if name_cell is None or subs_cell is None:
            continue  # skip malformed rows instead of aborting
        subs_info.append({'Hop Name': name_cell.text, 'Substitutions': subs_cell.text})

# Load scraped data into local df & output CSV
subs_df = pd.DataFrame(subs_info, columns=['Hop Name', 'Substitutions'])
if subs_info:  # do not overwrite an existing CSV with an empty scrape
    os.makedirs('./rc_raw_data', exist_ok=True)
    subs_df.to_csv('./rc_raw_data/rc_raw_ref_hops_substitutions.csv')

In [15]:
# Display the substitutions dataframe as the cell output
subs_df

Unnamed: 0,Hop Name,Substitutions
0,Adeena,"Hallertau Mittelfruh, Styrian Golding"
1,Admiral,"Target, Northdown, Challenger"
2,African Queen,"Amarillo, Cascade, Simcoe, Citra, Mosaic"
3,Agnus,"Magnum (US), Nugget, Target, Columbus"
4,Ahtanum,"Amarillo, Cascade, Centennial, Willamette"
...,...,...
209,Yeoman,Target
210,Zagrava,"Saaz (US), Tettnanger, Lubelski, Spalter Se..."
211,Zenith,"Yeoman, Northern Brewer (US)"
212,Zeus,"Columbus, Tomahawk, Zeus, CTZ, Centennial,..."


In [16]:
### SCRAPE AROMAS META-DATA

aroma_meta_url = 'https://beermaverick.com/the-science-behind-identifying-hop-aromas/'

# Validate connection. `response` is reset first so a failed request cannot fall
# through to a response left over from an earlier cell; the timeout prevents hanging.
response = None
try:
    response = requests.get(aroma_meta_url, timeout=30)
except requests.exceptions.RequestException as err:  
    print('Unable to establish connection at this time: \n', err)

aromas_info = []  # stays empty when the page could not be fetched

if response is not None and response.status_code == 200:

    # Initiate list of dicts for each aroma type, to be used for data loading
    aroma_type_names = (
        'Floral',
        'Citrus',
        'Tropical/Sweet Fruits',
        'Stone/Green Fruits',
        'Berries & Currant',
        'Cream & Caramel',
        'Woody Aromatic',
        'Menthol',
        'Herbal',
        'Spicy',
        'Grassy',
        'Vegetal',
    )
    aromas_info = [{'Aroma Type': name} for name in aroma_type_names]

    # Instantiate BS4 & locate container tag/attribute aroma info
    soup = bsoup(response.text, 'html')
    aroma_section = soup.find('div', {'class':'entry-content'})
    aroma_section = aroma_section.find_all('li') if aroma_section is not None else []

    # Separate aroma types & compound names information (list items alternate;
    # the last list item is excluded from both slices, as in the original selection)
    aroma_types = aroma_section[0:-1:2]
    compound_names = aroma_section[1:-1:2]

    if min(len(aroma_types), len(compound_names)) < len(aromas_info):
        print('Warning: fewer aroma list items found than aroma types; some rows will be incomplete.')

    # Each list item reads '<label>: <values>'. The column name is the first len(label)
    # characters of the item and the value starts after the 2-character ': ' separator.
    aroma_label_len = len('Aromas')
    compound_label_len = len('Compounds Responsible')

    # Pair each aroma type with its two list items and load info into the corresponding dict
    for info, aroma_item, compound_item in zip(aromas_info, aroma_types, compound_names):
        aroma_text = aroma_item.text
        compound_text = compound_item.text
        info[aroma_text[:aroma_label_len]] = aroma_text[aroma_label_len + 2:]
        info[compound_text[:compound_label_len]] = compound_text[compound_label_len + 2:]

# Load scraped data into local df & output CSV
aromas_df = pd.DataFrame(aromas_info)
if aromas_info:  # do not overwrite an existing CSV with an empty scrape
    os.makedirs('./rc_raw_data', exist_ok=True)
    aromas_df.to_csv('./rc_raw_data/rc_raw_ref_aroma_types.csv')
aromas_df

Unnamed: 0,Aroma Type,Aromas,Compounds Responsible
0,Floral,"elderflower, chamomile blossom, lily of the va...","rose oxide, geraniol, geraniol acetate, citron..."
1,Citrus,"grapefruit, orange, lime, lemon, bergamot, lem...","alpha-terpineol, limonene, linalool, citral, d..."
2,Tropical/Sweet Fruits,"banana, watermelon, honeydew melon, peach, apr...","2-methylpropyl hexanoate, ethyl 2-methylpropan..."
3,Stone/Green Fruits,"pear, apple, quince, gooseberry, white wine gr...","decanal, cis-3-dexenal, d-3-carene, 2-dodecano..."
4,Berries & Currant,"cassis, blueberry, raspberry, blackberry, stra...","beta ionone, 4-mercapto-4-methylpentan-2-one, ..."
5,Cream & Caramel,"butter, chocolate, yogurt, honey, cream, caram...","methyl decanoate, Y-nonalactone, vanillin, phe..."
6,Woody Aromatic,"tobacco, cognac, barrique, leather, woodruff, ...","humulene, alpha-pinene, beta-pinene, farnesene..."
7,Menthol,"mint, lemon balm, sage, camphor, menthol, wine...","carvone, terpinen-4-ol, camphene"
8,Herbal,"marjoram, tarragon, dill, parsley, basil, fenn...","myrcene, humulene, epoxide, p-cymene, cis-b-oc..."
9,Spicy,"lovage, pepper, chili, curry, juniper, aniseed...","beta-caryophyllene, eugenol, 2-isopropyl-3-met..."


In [17]:
### SCRAPE BREWING VALUES META-DATA 

brew_meta_url = 'https://beermaverick.com/hop/newport/'  # using a hops page with all brew values

# Validate connection. `response` is reset first so a failed request cannot fall
# through to a response left over from an earlier cell; the timeout prevents hanging.
response = None
try:
    response = requests.get(brew_meta_url, timeout=30)
except requests.exceptions.RequestException as err:  
    print('Unable to establish connection at this time: \n', err)

meta = []  # one {'Value Type', 'Description'} dict per brewing value; empty on failure

if response is not None and response.status_code == 200:

    # Instantiate BS4 & locate umbrella container tag/attribute for all data sections
    soup = bsoup(response.text, 'html')
    info_section = soup.find('div', {'class':'entry-content'})

    # Retrieve data from brewing values table (tolerating a missing container/table)
    brew_table = None
    if info_section is not None:
        brew_table = info_section.find('table', {'class':'brewvalues'})
    brew_rows = brew_table.find_all('tr') if brew_table is not None else []  # each 'tr' tags each row

    for row in brew_rows:
        header = row.find('th')
        # Skip over the 'Total Oil Breakdown' empty row, which has no heading cell
        if header is None:
            continue
        # Value name = heading text with the details-subtext removed
        row_subtext = row.find('small')
        brew_type = header.text
        if row_subtext is not None:
            brew_type = brew_type.replace(row_subtext.text, '')
        # Description = the subtext nested inside the heading cell (empty if absent)
        header_subtext = header.find('small')
        brew_info = header_subtext.text if header_subtext is not None else ''
        meta.append({'Value Type': brew_type, 'Description': brew_info})

# Load scraped data into local df & output CSV
brew_meta_df = pd.DataFrame(meta, columns=['Value Type', 'Description'])
if meta:  # do not overwrite an existing CSV with an empty scrape
    os.makedirs('./rc_raw_data', exist_ok=True)
    brew_meta_df.to_csv('./rc_raw_data/rc_raw_ref_brew_values.csv')
brew_meta_df

Unnamed: 0,Value Type,Description
0,Alpha Acid % (AA),Alpha acids are the main source of bitterness ...
1,Beta Acid %,Beta acids are a component of hop resins respo...
2,Alpha-Beta Ratio,The ratio of alpha to beta acids dictates the ...
3,Hop Storage Index (HSI),The HSI indicates the percent of alpha and bet...
4,Co-Humulone as % of Alpha,Low cohumulone hops may impart a smoother bitt...
5,Total Oils (mL/100g),"These highly volatile, not very soluble oils a..."
6,› Myrcene,"Flavors: resinous, citrus, fruity (β-myrcene)"
7,› Humulene,"Flavors: woody, noble, spicy (α-caryophyllene)"
8,› Caryophyllene,"Flavors: pepper, woody, herbal (β-caryophyllene)"
9,› Farnesene,"Flavors: fresh, green, floral (β-farnesene)"


### Scratch-Work (!!this section to be deleted later!!)

In [12]:
## PD-TABLE version to scrape each hops page's brewing values (somehow this is less clean)
# # (PD-TABLE VERSION) Retrieve data from basic info table & brewing values table directly into pd df
# basic_df, brewing_df = pd.read_html(endpoint_url, encoding='ascii')
# basic_df = basic_df.set_index(0)
# brewing_df = brewing_df.set_index(0)
# basic_dict = basic_df.to_dict()[1]  # dict we want is nested within '1' column
# brewing_dict = brewing_df.to_dict()[1]  # dict we want is nested within '1' column
# print(basic_dict, brewing_dict)
# hop_dict.update(basic_dict)
# hop_dict.update(brewing_dict)