# Data-Collection from Beer Maverick Hops Database 

**Source**: https://beermaverick.com/hops/

**Method**: Web-Scraping

------------------------------------------------------------

### Setup & Validation
**Objective**: Import required modules, establish connection & validate scraping capability.

In [1]:
# Import necessary packages
import numpy as np
import os
import pandas as pd
import requests
import unicodedata2
import urllib.robotparser
from bs4 import BeautifulSoup as bsoup
from time import sleep



In [2]:
# Use RobotParser to check validity of scraping on endpoint of interest (checking for base page)
base_url = 'https://beermaverick.com/hops/'
rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://beermaverick.com/robots.txt')
rp.read()
rp.can_fetch('*', base_url)

True

In [3]:
# Establish connection to base-URL & setup a base exception handling
try:
    response = requests.get(base_url)
    print('Connection successfully established. \nStatus Code: ', response.status_code)
except requests.exceptions.RequestException as err:  
    print('Unable to establish connection at this time: \n', err)

Connection successfully established. 
Status Code:  200


### Base-URL Scraping
**Objective:** Collect hop names & corresponding endpoint URLs for further validation & scraping. 

In [4]:
# Instantiate BS4 for parsing the HTML data 
soup = bsoup(response.text, 'html')

# Find container tag & specific attribute for hops names
hops_container = soup.find_all('div', {'class':'box-inner-p-bigger box-single'})[1]  # locate specific container
hops_container = hops_container.find_all('p')  # gather each section

# Loop through each subsection of container & store obtained hop name/endpoint
hop_names = dict()
for section in hops_container[:-1]:
    all_hops = section.find_all('a')
    for each_hop in all_hops:
        hop_names[each_hop.text] = each_hop['href']

print(len(hop_names) == 304)  # Verify correct amount of hops 
# print(hop_names)  # Look at final dict (key=official-hop-name & value=url's endpoint)

True


### Endpoint Scraping
**Objective:** Collect & store detailed data for each hop from its respective endpoint.

In [5]:
def hops_scraper(hops):
    
    # Loop through each hop page and retrive data ('list(' used for debug purposes; to be normalized later)
    for hop in list(hops.keys())[0:]:

        endpoint_url = 'https://beermaverick.com' + hops[hop]

        # Validate connection & force wait-time to prevent Error 429 (too many requests)
        try:
            response = requests.get(endpoint_url)
            sleep(8)
        except requests.exceptions.RequestException as err:  
            print('Unable to establish connection at this time: \n', err)

        if response.status_code == 200:

            hop_dict = {'Hop Name': hop, 'Scraping Status': True}  # initiate dict to store hop data

            # Instantiate BS4 & locate umbrella container tag/attribute for all data sections
            soup = bsoup(response.text, 'html')
            info_section = soup.find('div', {'class':'entry-content'})

            # Retrieve info from initial overview table (purpose, country, code, ownership)
            table = info_section.find('table')
            table = table.find_all('tr')  # each 'tr' tags each row
            for row in table:
                key = row.find('th').text  # each 'th' tags index col
                val = row.find('td').text  # each 'td' tags value col
                hop_dict[key] = val

            # Retrieve flavor/aroma profile characteristics (displayed as "tags" on webpage)
            aroma_profile = info_section.find('em')
            if aroma_profile != None:  # conditional check if sectino exists
                aroma_profile = aroma_profile.find_all('a', {'class':'text-muted'})
                val = [tag.text for tag in aroma_profile]
                hop_dict['Flavor & Aroma Profile'] = val

            # Retrieve data from brewing values table
            brew_table = info_section.find('table', {'class':'brewvalues'})
            brew_table = brew_table.find_all('tr')  # each 'tr' tags each row
            for row in brew_table:
                # Conditional to skip over 'Total Oil Breadown' empty row without values
                if row.find('th') != None:
                    # Grab key-value pair of the brewing value for that row
                    key = row.find('th').text.replace(row.find('small').text, '')  # subtracting details-subtext from heading str
                    val = row.find('td').text  # each 'td' tags value col
                    hop_dict[key] = val
                   
        else:
            hop_dict = {'Hop Name': hop, 'Scraping Status': False}
        
        # print(hop_dict)  # debug
        yield hop_dict

### Load Hops Data
**Objective**: Execute scraper generator function and append each yield dict as row into dataframe & output CSV.

In [6]:
from time import time
t1 = time()
hops_df = pd.DataFrame(hops_scraper(hop_names))
t2 = time()
print(t2-t1)

2769.429688692093


In [7]:
hops_df.to_csv('./raw_data/raw_hops_main.csv')
hops_df

FileNotFoundError: [Errno 2] No such file or directory: './rc_raw_data/rc_raw_hops_main.csv'

### Reference-Material Scraping
**Objective**: Scrape reference materials (meta-data / standard info) for further context & analysis.

In [None]:
### SCRAPE HOPS-SUBSTITUTION CHART

substitutions_url = 'https://beermaverick.com/hops/hop-substitutions-chart/'

# Validate connection
try:
    response = requests.get(substitutions_url)
except requests.exceptions.RequestException as err:  
    print('Unable to establish connection at this time: \n', err)

if response.status_code == 200:
    
    # Initiate list with empty dict objects for each row of data loading
    subs_info = [{} for i in range(214)]  # for each row on webpage

    # Instantiate BS4 & locate container tag/attribute for table
    soup = bsoup(response.text, 'html')
    table = soup.find('tbody')
    table = table.find_all('tr')  # each 'tr' tags each row
    
    # Loop through each row of table and load info to corresponding dict in list
    row_num = 0
    for row in table:
        subs_info[row_num]['Hop Name'] = row.find('th').text  # each 'th' tags index col
        subs_info[row_num]['Substitutions'] = row.find('td').text  # each 'td' tags value col
        row_num += 1

# Load scraped data into local df & output CSV
subs_df = pd.DataFrame(subs_info)
subs_df.to_csv('./raw_data/raw_ref_hops_substitutions.csv')

In [None]:
subs_df

In [None]:
### SCRAPE AROMAS META-DATA

aroma_meta_url = 'https://beermaverick.com/the-science-behind-identifying-hop-aromas/'

# Validate connection
try:
    response = requests.get(aroma_meta_url)
except requests.exceptions.RequestException as err:  
    print('Unable to establish connection at this time: \n', err)

if response.status_code == 200:
    
    # Initiate lists of dicts for each aroma type, to be used for data loading
    aromas_info = [
        {'Aroma Type': 'Floral'},
        {'Aroma Type': 'Citrus'},
        {'Aroma Type': 'Tropical/Sweet Fruits'},
        {'Aroma Type': 'Stone/Green Fruits'},
        {'Aroma Type': 'Berries & Currant'},
        {'Aroma Type': 'Cream & Caramel'},
        {'Aroma Type': 'Woody Aromatic'},
        {'Aroma Type': 'Menthol'},
        {'Aroma Type': 'Herbal'},
        {'Aroma Type': 'Spicy'},
        {'Aroma Type': 'Grassy'},
        {'Aroma Type': 'Vegetal'}
    ]
    
    # Instantiate BS4 & locate container tag/attribute aroma info
    soup = bsoup(response.text, 'html')
    aroma_section = soup.find('div', {'class':'entry-content'})
    aroma_section = aroma_section.find_all('li')
    
    # Separate aroma types & compound names information
    aroma_types = aroma_section[0:-1:2]
    compound_names = aroma_section[1:-1:2]
    
    # Loop through each aroma type & compound name and load info to corresponding dict in list
    for i in range(len(aromas_info)):
        aromas_info[i][aroma_types[i].text[0:6]] = aroma_types[i].text[8:]
        aromas_info[i][compound_names[i].text[0:21]] = compound_names[i].text[23:]

# Load scraped data into local df & output CSV
aromas_df = pd.DataFrame(aromas_info)
aromas_df.to_csv('./raw_data/raw_ref_aroma_types.csv')
aromas_df

In [None]:
### SCRAPE BREWING VALUES META-DATA 

brew_meta_url = 'https://beermaverick.com/hop/newport/'  # using a hops page with all brew values

# Validate connection
try:
    response = requests.get(brew_meta_url)
except requests.exceptions.RequestException as err:  
    print('Unable to establish connection at this time: \n', err)

if response.status_code == 200:
    
    meta = []
    
    # Instantiate BS4 & locate umbrella container tag/attribute for all data sections
    soup = bsoup(response.text, 'html')
    info_section = soup.find('div', {'class':'entry-content'})
    
    # Retrieve data from brewing values table
    brew_table = info_section.find('table', {'class':'brewvalues'})
    brew_table = brew_table.find_all('tr')  # each 'tr' tags each row
    for row in brew_table:
        # Conditional to skip over 'Total Oil Breadown' empty row without values
        if row.find('th') != None:
            # Grab info from first column for each row of brew values
            brew_type = row.find('th').text.replace(row.find('small').text, '')  # subtracting details-subtext from heading str
            brew_info = row.find('th').find('small').text  
            meta.append({'Value Type': brew_type, 'Description': brew_info})

# Load scraped data into local df & output CSV
brew_meta_df = pd.DataFrame(meta)
brew_meta_df.to_csv('./raw_data/raw_ref_brew_values.csv')
brew_meta_df