In [1]:
# Reload imported modules automatically before each cell runs, so edits to
#  the project's helper .py files take effect without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import itertools
import re
import string
import sys
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from more_itertools import unique_everseen
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Project modules live one directory above this notebook
sys.path.append('..')
from db import db, query, query_list

In [3]:
from scraping_utilities import (get_missing_scrape_targets,
                                create_scraper, 
                                iterate_scraping,
                                make_db_inserter,
                                get_soup_with_requests,
                                get_selenium_resource
                               )

In [4]:
# Table and column passed to get_missing_scrape_targets in Part 2 to work out
#  which bar pages still need scraping.
# NOTE: despite its name, bars_table_name holds 'beers' -- the table the
#  Part 2 inserter writes to. The separate 'bars' table is filled in Part 3.
bars_table_name   = 'beers'
bars_url_col_name = 'bar_url'

In [5]:
# Part 1 results are inserted into the search_pages table. Its
#  search_page_url column holds the source url of each parsed page, which
#  is how urls that were already scraped are recognised
search_page_url_col_name = 'search_page_url'
search_page_table_name   = 'search_pages'

In [6]:
from beer_scraping import *

## Part 1: Get a list of bars on beermenus.com

This uses the site's "places" listing, which shows the bars 20 per page.

In [None]:
# For debugging only: uncomment the next line to drop the search-pages table (destructive)
# query(f'DROP TABLE {search_page_table_name}')

In [None]:
# Get a list of urls to search
def get_search_page_url(page_num):
    '''
        Build the url of one page of the beermenus.com places listing

        The function is wrapped with np.vectorize below, so callers may
        pass a single page number or any iterable of them and get back
        a numpy array of url strings.

        Parameters:
         - page_num: the page number to request

        Returns:
         - the url of that listing page
    '''
    return f'https://www.beermenus.com/places?page={page_num}'

# Declaring the output type up front means numpy does not have to call the
#  function on the first element just to infer it, and an empty sequence of
#  page numbers yields an empty array instead of raising ValueError
get_search_page_url = np.vectorize(get_search_page_url, otypes=[str])

# Pages 1 through 31 of the listing (the range end is exclusive). The page
#  count is hard-coded, so it must be updated if the listing grows or shrinks
target_urls  = get_search_page_url(range(1,32))
# Presumably keeps only the urls not yet present in the search_page_url
#  column of the search_pages table -- confirm in scraping_utilities
missing_urls = get_missing_scrape_targets(target_urls,
                                          search_page_url_col_name,
                                          search_page_table_name)

In [None]:
def parse_search_page(soup, **kwargs):
    '''
        Parse the beautiful soup on a beermenus.com search page
        
        Given the soup, select the search results and get the links
        Return a data frame with columns:
         - search_page_url, the source
         - link, the absolute url of the bar page scraped
         - bar_name, the website text displayed
         
        Anchors without an href are skipped, since there is no page
        to scrape for them.
         
        Parameters:
         - soup: a beautiful soup object from the page
         - **kwargs: url is required, should be the
            source url of the page. All else ignored.
        
        Returns:
         - DataFrame with the parsed data. The three columns are
            always present, even when no links are found.
            
        Raises:
         - KeyError when url is not passed
                                         
    '''
    search_page_url = kwargs['url']
    site_root       = 'https://www.beermenus.com'
    columns         = [search_page_url_col_name, 'link', 'bar_name']
    
    # Use a CSS selector to find the list of bars
    links = soup.select('body div.results ul.pure-list a')
    data  = []
    for link in links:
        href = link.get('href')
        if not href:
            continue
        data.append({search_page_url_col_name  : search_page_url,
                     # urljoin resolves site-relative hrefs against the root
                     #  and leaves an already absolute href untouched
                     'link'                    : urljoin(site_root, href),
                     'bar_name'                : link.getText()})
    # Passing the columns keeps the schema stable when data is empty
    return pd.DataFrame(data, columns=columns)

In [None]:
# Assemble the Part 1 scraper from its three pieces: the parser defined in
#  this notebook, an inserter for the search_pages table and the
#  requests-based page fetcher.
# NOTE(review): how create_scraper combines them is defined in
#  scraping_utilities and not visible here
db_inserter = make_db_inserter(search_page_table_name)
scraper = create_scraper(parse_search_page, db_inserter, get_soup_with_requests)

In [None]:
# Run the scraper over the search pages still missing. No on_fail argument
#  is passed here, so iterate_scraping's default applies. The return value
#  is kept for inspection; its contents are defined in scraping_utilities
error_return = iterate_scraping(scraper, missing_urls)

## Part 2: Scrape the bar pages

To speed up development and analysis, some information about each bar is not collected here. A separate loop in Part 3 collects that information into a 'bars' table. Some bar information is still collected in this part, trading good database practice (normal form) for agility.

The parser is relatively complex because it has to handle a variety of cases; it can be found in `beer_scraping.py`.

In [None]:
# Inserter for the 'beers' table, used by the Part 2 bar-page scraper
beer_inserter = make_db_inserter('beers')

In [None]:
# The bar-page links stored in Part 1: the 'link' column of the search_pages
#  table (presumably returned as a list -- confirm in db.query_list)
target_urls = query_list('link', search_page_table_name)

In [None]:
# Compare the links against the bar_url column of the 'beers' table
#  (bars_table_name holds 'beers'), presumably leaving only the bar pages
#  not scraped yet -- confirm in scraping_utilities
missing_urls = get_missing_scrape_targets(target_urls,
                                          bars_url_col_name,
                                          bars_table_name)

In [None]:
# Beer stores with ~1000 beers or more. They will be excluded from the analysis
#  and take a long time to parse, so their pages are never requested.
# Every entry must be the full url of the place page, exactly as stored in
#  the 'link' column, because matching is done by string equality
exclusions = [
    'https://www.beermenus.com/places/39689-public-wine-beer-and-spirits',
    'https://www.beermenus.com/places/2190-bellmore-beverage',
    'https://www.beermenus.com/places/20239-fast-break',
    'https://www.beermenus.com/places/11448-castle-wine-spirits',
    'https://www.beermenus.com/places/5797-universal-beverage-llc',
    'https://www.beermenus.com/places/14346-super-buy-rite-of-north-plainfield',
    'https://www.beermenus.com/places/20828-beer-town',
    'https://www.beermenus.com/places/51031-boardwalk-liquids',
    'https://www.beermenus.com/places/31794-the-wine-guys',
    'https://www.beermenus.com/places/43312-linwood-wine-liquor-at-hudson-lights',
    'https://www.beermenus.com/places/47185-bloomfield-buyrite-we-deliver',
    'https://www.beermenus.com/places/14981-beverage-plus-2',
    'https://www.beermenus.com/places/17349-other-half-brewing-company',
    'https://www.beermenus.com/places/1868-on-tap-at-whole-foods-market-columbus-circle',
    'https://www.beermenus.com/places/23644-midland-brew-house',
    'https://www.beermenus.com/places/25864-bridge-view-tavern-beer-garden',
    'https://www.beermenus.com/places/41075-icarus-brewing',
    'https://www.beermenus.com/places/50163-huertas',
]

In [None]:
# Remove the excluded places; what remains is an unordered set of urls
missing_urls = set(missing_urls) - set(exclusions)

In [None]:
# Page fetcher for the bar pages. It is handed to create_scraper in the slot
#  where Part 1 passed get_soup_with_requests.
# NOTE(review): presumably this starts a Selenium browser session; check
#  scraping_utilities for how it should be shut down when scraping is done
page_getter = get_selenium_resource()

In [None]:
# Rebinds `scraper` (previously the Part 1 search-page scraper).
# parse_page is not defined in this notebook; per the Part 2 notes the
#  parser lives in beer_scraping.py, which is star-imported at the top
scraper = create_scraper(parse_page, beer_inserter, page_getter)

In [None]:
# on_fail = 'proceed' presumably lets the run continue past a page that
#  fails and reports the failures in the return value -- confirm in
#  scraping_utilities.iterate_scraping
error_data = iterate_scraping(scraper, missing_urls, on_fail = 'proceed')

## Part 3: Get additional bar information

In [11]:
# Bar urls recorded while scraping beers in Part 2: the bar_url column of
#  the 'beers' table (bars_table_name holds 'beers', not 'bars')
target_urls = query_list('bar_url', bars_table_name)

In [12]:
# Inserter for the 'bars' table, which receives the additional bar information
bar_inserter = make_db_inserter('bars')

In [13]:
# Compare against the bar_url column of the 'bars' table, presumably leaving
#  the bars whose extra information has not been collected yet. The call
#  prints how many pages were already scraped and how many remain
missing_urls = get_missing_scrape_targets(target_urls,
                                          'bar_url',
                                          'bars')

590 pages already scraped detected
0 pages needing scraping detected


In [14]:
# Rebinds `scraper` once more, this time fetching with requests rather than
#  the Selenium page getter used in Part 2.
# parse_bar_info is not defined in this notebook; presumably it comes from
#  the star import of beer_scraping -- confirm
scraper = create_scraper(parse_bar_info, bar_inserter, get_soup_with_requests)

In [15]:
# Unlike Part 2, which passes on_fail = 'proceed', this run passes 'abort',
#  presumably stopping at the first failing page -- confirm in
#  scraping_utilities.iterate_scraping. Overwrites error_data from Part 2
error_data = iterate_scraping(scraper, missing_urls, on_fail = 'abort')