# Data-Collection from Beer Maverick Hops Database 

**Source**: https://beermaverick.com/hops/

**Method**: Web-Scraping

------------------------------------------------------------

### Setup & Validation
**Objective**: Import required modules and validate scraping capability.

In [40]:
# Import necessary packages
import numpy as np
import os
import pandas as pd
import requests
import unicodedata2
import urllib.robotparser
from bs4 import BeautifulSoup as bsoup
from time import sleep

In [42]:
# Use RobotParser to check validity of scraping on endpoint of interest (checking only for first page)
rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://beermaverick.com/robots.txt')
rp.read()
rp.can_fetch('*', 'https://beermaverick.com/hops/')

True

### Base URL Scraping
**Objective:** Generate a list of all hops (to use for endpoint URLs) for further scraping & data retrieval. 

In [36]:
base_url = 'https://beermaverick.com/hops/'
response = requests.get('https://beermaverick.com/hops/')

if response.status_code == 200:
    
    # Instantiate BS4 for parsing the HTML data (can also do via lxml)
    soup = bsoup(response.text, 'html')
    
    # Find section with hops names
    hopnames_section = soup.find_all('div', {'class':'box-inner-p-bigger box-single'})
    
    # Obtain correct section (since multiple exist with same class tag)
    hopnames_section = hopnames_section[1]
    
    # Retrieve text of all hops names into list (separated by comma delimeter)
    hopnames = hopnames_section.text.split(',')
    

In [37]:
# Use bash commands & pythonic tools to clean hops_list

hops_list = []
# for i in hopnames:
#     !echo "$i" | read word
#     !export word
#     hops_list.append(os.environ['word'])

# print(hopnames)
# print(hops_list)

In [38]:
hops_list = []

# Remove unwanted text (country names & end quotes)
cleaner_dict = {
    '\n\n\nAustralia':'', 
    'Canada':', ', 
    'China':', ', 
    'Czech Replublic':', ', 
    'Czech Republic':', ', 
    'France':', ', 
    'Germany':', ',
    'Japan':', ', 
    'New Zealand':', ', 
    'Poland':', ', 
    'Slovenia':', ', 
    'South Africa':', ', 
    'Ukraine':', ', 
    'United Kingdom':', ',
    'United States of America':', ',
    '\n\n\n\nIf you see an error in our data':'',
    ' please let us know!\n\n\n':'',
    '\n':''
}
for name in hopnames:
    
    if name[0] == ' ':
        name = name[1:]
    if name[-1] == ' ':
        name = name[:-1]
    for text in cleaner_dict:
        if text in name:
            name = name.replace(text, cleaner_dict[text])
    if ' ' in name:
        name = name.replace(' ', '-')
    name = name.replace('(', '')
    name = name.replace(')', '')
    if name[-1] == '-':
        name = name[:-1]
    
    if '&' in name:
        two_hops = name.split('&')
        for hop in two_hops:
            if hop[-1] == '-':
                hop = hop[:-1]
            if hop[0] == '-':
                hop = hop[1:]
            hops_list.append(hop)

    else:
        hops_list.append(name)
        
hopnames = hops_list[:-1]
hop_list = []

for name in hopnames:
    name = str(unicodedata2.normalize('NFD', name).encode('ascii', 'ignore'))
    name = name[2:]
    name = name[:-1]
    if ',' in name:
        two_names = name.split(',')
        for hop in two_names:
            if hop[0] == '-':
                hop = hop[1:]
            if hop[-1] == '-':
                hop = hop[:-1]
            hop_list.append(hop)
    elif name == "Dr.-Rudi":
        name = 'Dr-Rudi'
        hop_list.append(name)
    elif name in ['HBC-360', 'HBC-472', 'HBC-692']:
        name = name[0:3] + name[4:]
        hop_list.append(name)
    elif 'Brewer\'s-Gold' in name:
        name = 'brewers-gold'
        hop_list.append(name)
    elif name == 'Lubelska':
        hop_list.append('lublin')
    elif name == 'Styrian-Cardinal':
        hop_list.append('cardinal')
    elif name == 'XJA2/436':
        hop_list.append('xja2-436')
    elif 'blend' in name.lower() and 'TNT' not in name:
        name = name.replace(' Blend', '')
        name = name.replace('-Blend', '')
        hop_list.append(name)
    elif name == 'Sultana-Denali':
        hop_list.append('denali')
    elif name == 'Hallertau-US':
        hop_list.append('hallertau')
    elif name == 'XJA2/436':
        hop_list.append('XJA2-436')
    elif name == 'HBC360':
        hop_list.append('hbc-360')
    else:
        if name[0] == '-':
            name = name[1:]
        hop_list.append(name)

hop_list[133] = 'XJA2-436'
hop_list[219] = 'HBC-360'
    
print(hop_list)

['Astra', 'Eclipse', 'Ella', 'Enigma', 'Feux-Coeur-Francais', 'Galaxy', 'Helga', 'HPA-016', 'Melba', 'Pride-of-Ringwood', 'Summer', 'Super-Pride', 'Sylva', 'Topaz', 'Vic-Secret', 'Vienna-Gold', 'Canadian-Redvine', 'Lumberjack', 'Sasquatch', 'Wild-Loyalist', 'Marco-Polo', 'Tsingdao-Flower', 'Agnus', 'Amethyst', 'Bohemie', 'Boomerang', 'Bor', 'Gaia', 'Harmonie', 'Kazbek', 'Mimosa', 'Premiant', 'Saaz-CZ', 'Saaz-Late', 'Sladek', 'Vital', 'Aramis', 'Barbe-Rouge', 'Bouclier', 'Elixir', 'Mistral', 'Petit-Blanc', 'Strisselspalt', 'Tardif-de-Bourgogne', 'Triskel', 'Akoya', 'Ariana', 'Aurum', 'brewers-gold', 'Callista', 'Diamant', 'Hallertau-Blanc', 'Hallertau-Gold', 'Hallertau-Mittelfruh', 'Hallertau-Tradition', 'Herkules', 'Hersbrucker', 'Huell-Melon', 'Huller-Bitterer', 'Magnum-GR', 'Mandarina-Bavaria', 'Merkur', 'Monroe', 'Northern-Brewer-GR', 'Opal', 'Orion', 'Perle-GR', 'Polaris', 'Relax', 'Saphir', 'Smaragd', 'Solero', 'Spalt', 'Spalter-Select', 'Tettnanger', 'Wurttemberg', 'Yellow-Sub', 

In [43]:
print(len(hop_list))

304


In [39]:

for hop in hop_list:
    endpoint_url = 'https://beermaverick.com/hop/' + hop
    response = requests.get(endpoint_url)
    
    if response.status_code != 200:
        print(endpoint_url)
    
    sleep(3)
    

    

### Endpoint Scraping (each Hop)
**Goal**: Collect and store data for each hop

In [44]:
response = requests.get("https://beermaverick.com/hop/ultra/")
d = {'Alpha Acid': ''}
if response.status_code == 200:
        
        # Instantiate BS4 for parsing the HTML data (can also do via lxml)
        soup = bsoup(response.text, 'html')
        
        # Section that umbrellas as info/subsections
        hop_section = soup.find('div', {"class":"gridlove-site-content container gridlove-sidebar-right"})
        hop_section = soup.find('div', {"class":"entry-content"})
        
        # Sub-sections
        
        # Grabbing name info (can be done via url ending too)
        hop_name = soup.find('h1', {"class":"entry-title"}).text
        
        # Purpose and Country info
        tab1 = soup.find('figure', {"class":"wp-block-table"}).text
        tab1 = tab1.splitlines()
        hop_purpose = tab1[1].split(':')[1]
        hop_country = tab1[2].split(':')[1]
        
        # Flavor or Aroma tags
        # tab2 = hop_section.find_all('a', {"class":"text-muted"})
        
        # Brewing values
        brew_section = hop_section.find('table', {"class":"brewvalues"})
        for tr in brew_section.find_all('tr'):
            th = tr.find('th')
            td = tr.find('td')
            print(th.text, td.text)
#             print(tr)
            
        for tr in brew_section.find_all('tr', {"class":"brewoils"}):
            th = tr.find('th')
            td = tr.find('td')
#             print(th.text, td.text)
        
        
#         print(hop_aa)

Alpha Acid % (AA)Alpha acids are the main source of bitterness in beer. Longer boil times will result in isomerization of more alpha acids leading to increased bitterness.  Learn more » 3-9.7%6.4% avg
Beta Acid %Beta acids are a component of hop resins responsible for contributing volatile aromatic and flavor properties. Beta acids contribute no bitterness. 3.6-5%4.3% avg
Alpha-Beta RatioThe ratio of alpha to beta acids dictates the degree to which bitterness fades during aging. 1:1 ratios are common in aroma varieties. 1:1 - 3:12:1 avg
Co-Humulone as % of AlphaLow cohumulone hops may impart a smoother bitterness when added to the boil as opposed to higher ones that add a sharper bitterness to the final beer. Learn more » 25-35%30% avg
Total Oils (mL/100g)These highly volatile, not very soluble oils are easily boiled off, but add flavor and aroma to the finished beer when added very late in the boil or during fermentation. Learn more » 0.8-1.5 mL1.2mL avg


AttributeError: 'NoneType' object has no attribute 'text'

In [28]:
hops_list = ['Astra', 'Eclipse', 'Ella']  # test list

for hop in hops_list:
    endpoint_url = 'https://beermaverick.com/hop/' + hop
    response = requests.get(endpoint_url)
    
    if response.status_code == 200:  
        
        # Read in HTML tables into pandas dataframes 
        basic_df, brewing_df = pd.read_html(endpoint_url, encoding='ascii')
        
        d = {"Alpha Acid % (AA)":"Alpha Acid (AA)", "Beta Acid %":"Beta Acid (BA)"}
        l = [e.replace(key, val) for e in l for key, val in d.items() if key in e]
        print(l)
        
        print(basic_df)
        print(brewing_df)
    
    break
        
        

                     0                       1
0             Purpose:                    Dual
1             Country:         Australia (AUS)
2  International Code:                     AST
3           Ownership:  ™ Ellerslie Hop Estate
                                                   0                     1
0  Alpha Acid % (AA)Alpha acids are the main sour...         7-10%8.5% avg
1  Beta Acid %Beta acids are a component of hop r...            4-6%5% avg
2  Alpha-Beta RatioThe ratio of alpha to beta aci...      1:1 - 3:12:1 avg
3  Co-Humulone as % of AlphaLow cohumulone hops m...         26-28%27% avg
4  Total Oils (mL/100g)These highly volatile, not...         1-3 mL2mL avg
5                               Total Oil Breakdown:  Total Oil Breakdown:
6       › MyrceneFlavors: resinous, citrus, fruity (                   NaN


In [44]:
# Import necessary packages
import numpy as np
import os
import pandas as pd
import requests
import unicodedata2
import urllib.robotparser
from bs4 import BeautifulSoup as bsoup
from time import sleep

In [45]:
# Use RobotParser to check validity of scraping on endpoint of interest (checking for base page)
base_url = 'https://beermaverick.com/hops/'
rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://beermaverick.com/robots.txt')
rp.read()
rp.can_fetch('*', base_url)

True

In [46]:
# Establish connection to base-URL & setup a base exception handling
try:
    response = requests.get(base_url)
    print('Connection successfully established. \nStatus Code: ', response.status_code)
except requests.exceptions.RequestException as err:  
    print('Unable to establish connection at this time: \n', err)

Connection successfully established. 
Status Code:  200


In [47]:
# Instantiate BS4 for parsing the HTML data 
soup = bsoup(response.text, 'html')

# Find container tag & specific attribute for hops names
hops_container = soup.find_all('div', {'class':'box-inner-p-bigger box-single'})[1]  # locate specific container
hops_container = hops_container.find_all('p')  # gather each section

# Loop through each subsection of container & store obtained hop name/endpoint
hop_names = dict()
for section in hops_container[:-1]:
    all_hops = section.find_all('a')
    for each_hop in all_hops:
        hop_names[each_hop.text] = each_hop['href']


In [49]:
# Ensure all hop names were retrieved
print(len(hop_names) == 304)

# Validate endpoint URLs
for hop in hop_names:
    endpoint_url = 'https://beermaverick.com' + hop_names[hop]
    try:
        response = requests.get(endpoint_url)
        if response.status_code != 200:
            print(f'Status Code: {response.status_code}. Unsuccessful request to: {endpoint_url}')
            sleep(5)
    except requests.exceptions.RequestException as err:  
        print('Unable to establish connection at this time: \n', err)

True
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/huller-bitterer/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/mandarina-bavaria/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/merkur/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/monroe/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/northern-brewer-gr/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/orion/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/perle-gr/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/wurttemberg/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/nelson-sauvin/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/southern-cross/
Status Code: 429. Unsuccessful request to: https://beermaverick.com/hop/styrian-eagle/
Status Code: 429. Unsuccessful request to: https://beer

KeyboardInterrupt: 

In [None]:
# # Validate endpoint URLs
# for hop in hop_names:
#     endpoint_url = 'https://beermaverick.com' + hop_names[hop]
#     try:
#         response = requests.get(endpoint_url)
#         if response.status_code != 200:
#             print(f'Status Code: {response.status_code}. Unsuccessful request to: {endpoint_url}')
#             sleep(3)
#     except requests.exceptions.RequestException as err:  
#         print('Unable to establish connection at this time: \n', err)

In [None]:
hops_df = pd.DataFrame(
    columns=[
        'Hop Name', 
        'Purpose',
        'Country',
        'International Code',
        'Ownership',
        'Origin Info',
        'Flavor & Aroma Profile',
        'Brewing',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
    ]
)

In [None]:
TAGS = 'tropical_fruit', 'sweet', 'peach', 'white_wine', 'melon', 'grassy', 'grapefruit', 'honey', '', '',  '', '',  '', '',  '', '',  '', '',  '', '',  '', '',  '', '',  

In [None]:
Profile = 'Citrus', 'Tropical Fruit', 'Stone Fruit', 'Berry', 'Floral', 'Grassy', 'Herbal', 'Spice', 'Resin / Pine', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 

In [None]:
Brewing Values = 'Alpha Acid % (AA)', 'Beta Acid %', 'Alpha-Beta Ratio', 'Co-Humulone as % of Alpha', 'Total Oils (mL/100g)', '', '', '', 

In [None]:
Oils = 'Myrcene', 'Humulene', 'Caryophyllene', 'Farnesene', 'All Others', '', '', 