In [1]:
import pandas as pd
import re
import requests
import time
from bs4 import BeautifulSoup
from itertools import chain
from pandas.io.json import json_normalize
from recipe_scrapers import scrape_me
from tqdm import tqdm_notebook as tqdm
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

### Create Parser Function for Ingredients

In [2]:
# regex for separating ingredients list
SEPARATOR_RE = re.compile(r'^([\d\s*[\d\.,/]*)\s*(.+)')

# create a normalized string for ingredients
def normalize(st):
    """

    :param st:
    :return:
    """
    return re.sub(r'\s+', ' ', SEPARATOR_RE.sub('\g<1> \g<2>', st)).strip()


def escape_re_string(text):
    """

    :param text:
    :return:
    """
    text = text.replace('.', '\.')
    return re.sub(r'\s+', ' ', text)

# list of common units of measurements for ingredients
UNITS = {"cup": ["cups", "cup", "c.", "c"], "fluid_ounce": ["fl. oz.", "fl oz", "fluid ounce", "fluid ounces"],
         "gallon": ["gal", "gal.", "gallon", "gallons"], "ounce": ["oz", "oz.", "ounce", "ounces"],
         "pint": ["pt", "pt.", "pint", "pints"], "pound": ["lb", "lb.", "pound", "pounds"],
         "quart": ["qt", "qt.", "qts", "qts.", "quart", "quarts"],
         "tablespoon": ["tbsp.", "tbsp", "T", "T.", "tablespoon", "tablespoons", "tbs.", "tbs"],
         "teaspoon": ["tsp.", "tsp", "t", "t.", "teaspoon", "teaspoons"],
         "gram": ["g", "g.", "gr", "gr.", "gram", "grams"], "kilogram": ["kg", "kg.", "kilogram", "kilograms"],
         "liter": ["l", "l.", "liter", "liters"], "milligram": ["mg", "mg.", "milligram", "milligrams"],
         "milliliter": ["ml", "ml.", "milliliter", "milliliters"], "pinch": ["pinch", "pinches"],
         "dash": ["dash", "dashes"], "touch": ["touch", "touches"], "handful": ["handful", "handfuls"],
         "stick": ["stick", "sticks"], "clove": ["cloves", "clove"], "can": ["cans", "can"], "large": ["large"],
         "small": ["small"], "scoop": ["scoop", "scoops"], "filets": ["filet", "filets"], "sprig": ["sprigs", "sprig"]}

# numbers to separate quantities from ingredients
NUMBERS = ['seventeen', 'eighteen', 'thirteen', 'nineteen', 'fourteen', 'sixteen', 'fifteen', 'seventy', 'twelve',
           'eleven', 'eighty', 'thirty', 'ninety', 'twenty', 'seven', 'fifty', 'sixty', 'forty', 'three', 'eight',
           'four', 'zero', 'five', 'nine', 'ten', 'one', 'six', 'two', 'an', 'a', '½', '⅓','¼', '⅛', '¾']

prepositions = ["of"]

a = list(chain.from_iterable(UNITS.values()))
a.sort(key=lambda x: len(x), reverse=True)
a = map(escape_re_string, a)

PARSER_RE = re.compile(
    r'(?P<quantity>(?:[\d\.,][\d\.,\s/]*)?\s*(?:(?:%s)\s*)*)?(\s*(?P<unit>%s)\s+)?(\s*(?:%s)\s+)?(\s*(?P<name>.+))?' % (
        '|'.join(NUMBERS), '|'.join(a), '|'.join(prepositions)))


def parse(st):
    """

    :param st:
    :return:
    """
    st = normalize(st)
    res = PARSER_RE.match(st)
    
    return ((res.group('name') or '').strip())

# uncomment below code to return a tuple with measurement and ingredients

#     return ((res.group('quantity') or '').strip() + ' ' + (res.group('unit') or '').strip(),
#            (res.group('name') or '').strip())


# uncomment below code to return a dictionary with with measurements and ingredients as keys

#     return {
#             'measure': (res.group('quantity') or '').strip() + ' ' + (res.group('unit') or '').strip(),
#             'name': (res.group('name') or '').strip()
#         }


### Scrape Recipes from BudgetBytes with Recipe_Scrapers

In [158]:
# Create an empty list for the scraped links
linklist=[]
for i in tqdm(range(1,2)):
    responses=requests.get('https://www.allrecipes.com/recipes/80/main-dish/?page={}'.format(i))
    soup=BeautifulSoup(responses.content,'html.parser')
    for link in soup.findAll('a', attrs={'href': re.compile("^https://")}):
        linklist.append(link.get('href'))

# Filter through links that correspond with a singular dish and not multiple dishes       
dishes = [k for k in linklist if '/recipe/' in k]

# Remove duplicate links
recipes = [] 
for d in dishes: 
    if d not in recipes: 
        recipes.append(d)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [160]:
len(linklist)

318

In [163]:
revlist=[]

for link in tqdm(linklist):
    responses=requests.get(link)
    soup=BeautifulSoup(responses.content,'html.parser')
    reviews = soup.findAll('a',{'class':'review-detail__link'})
    for r in reviews:
        revlist.append(r.get('href'))

        
# # Filter through links that correspond with a singular dish and not multiple dishes       
# dishes = [k for k in linklist if '/recipe/' in k]

# Remove duplicate links
reviews = [] 
for l in tqdm(revlist): 
    if l not in reviews: 
        reviews.append(l)
        
print(len(revlist), len(reviews))

HBox(children=(IntProgress(value=0, max=318), HTML(value='')))




HBox(children=(IntProgress(value=0, max=668), HTML(value='')))


668 216


In [165]:
revlist

['https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821050/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/2771446/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821050/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/843991/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/199471/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/302997/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/375761/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/2771446/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/1327470/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821739/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/876441/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821050/',
 'https://www.allrecipes.com/recipe/2

In [164]:
reviews

['https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821050/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/2771446/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/843991/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/199471/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/302997/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/375761/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/1327470/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821739/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/876441/',
 'https://www.allrecipes.com/recipe/269592/pork-chops-in-garlic-mushroom-sauce/reviews/6671667/',
 'https://www.allrecipes.com/recipe/269592/pork-chops-in-garlic-mushroom-sauce/reviews/6657017/',
 'https://www.allrecipes.com/recipe/269592/pork-chops-in-garlic-mushroom-sauce/rev

In [22]:
# Remove duplicate links
recipes = [] 
for l in linklist: 
    if l not in recipes: 
        recipes.append(l)

In [23]:
len(recipes)

475

In [None]:
remove_words = ['facebook', 'instagram', 'pinterest', 'youtube']

In [24]:
recipes

['https://www.budgetbytes.com/',
 'https://www.budgetbytes.com/category/recipes/',
 'https://www.budgetbytes.com/category/recipes/beansandgrains/',
 'https://www.budgetbytes.com/category/recipes/breakfast/',
 'https://www.budgetbytes.com/category/recipes/dessert/',
 'https://www.budgetbytes.com/category/recipes/global/',
 'https://www.budgetbytes.com/category/recipes/global/asian/',
 'https://www.budgetbytes.com/category/recipes/global/indian/',
 'https://www.budgetbytes.com/category/recipes/global/italian/',
 'https://www.budgetbytes.com/category/recipes/global/mediterranean/',
 'https://www.budgetbytes.com/category/recipes/global/southwest/',
 'https://www.budgetbytes.com/category/recipes/meat/',
 'https://www.budgetbytes.com/category/recipes/meat/chicken/',
 'https://www.budgetbytes.com/category/recipes/meat/beef/',
 'https://www.budgetbytes.com/category/recipes/meat/turkey/',
 'https://www.budgetbytes.com/category/recipes/meat/pork/',
 'https://www.budgetbytes.com/category/recipes/

In [20]:
len(linklist)

1701

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [3]:
browser = webdriver.Chrome(executable_path = '/Users/stephaniekendall/Library/Application Support/Google/chromedriver')

In [142]:
link1 = 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/'

In [140]:
browser.get('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/')

In [141]:
# create a click action to open at least 50 reviews on the page

def mark_complete(link):
       for i in range(4):
            try:
                mark_complete = browser.find_element_by_xpath("//*[contains(@class, 'more-button')]")
                mark_complete.click()
            except Exception:
                continue

In [145]:
mark_complete(link1)

In [95]:
links = []

for link in browser.find_element_by_xpath("//a[contains(.,'reviews')]"):
    link.append(links)

TypeError: 'WebElement' object is not iterable

In [38]:
def get_review_link(link):
    revy = []
    text = browser.find_elements_by_xpath("//*[contains(@class, 'review-detail__link')]")
    for t in text:
        review = t.get_attribute('href')
        revy.append(review)
        return revy

In [40]:
from lxml import html

In [42]:
responses=requests.get('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/')
tree = html.fromstring(responses.content)
soup=BeautifulSoup(responses.content,'html.parser')
reviews = soup.findAll('a',{'class':'review-detail__link'})
# for r in reviews:
#     linklist.append(r.get('href'))

In [87]:
links = []
responses=requests.get('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/')
# tree = html.fromstring(responses.content)
soup=BeautifulSoup(responses.content,'lxml')

for tag in soup.find_all('a',href=True):
    for link in tag['href']:
        
        links.append(tag['href'])
        print(links)

# reviews = soup.findAll('a',{'class':'review-detail__link'})
# for r in reviews:
#     linklist.append(r.get('href'))

TypeError: list indices must be integers or slices, not str

In [88]:
import lxml.html

def extract(content):
    links = []
    dom = lxml.html.fromstring(content)
    for link in dom.xpath('//a/@href'):
        links.append(link)
    return links

In [91]:
extract('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/')

[]

In [109]:
browser.find_element_by_partial_link_text('review')

<selenium.webdriver.remote.webelement.WebElement (session="d06e379d299ed231cc93518c9ed32c58", element="44201abb-f9b3-47c2-81a3-0585b50fc0ea")>

In [None]:
html = scraperwiki.scrape('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/')

root = lxml.html.fromstring(html)

In [148]:
# this works to get review links

yum1 = []
for url in link1:
    for url in root.xpath('//a/@href'):
        if 'review' in url:
            yum1.append(url)

In [149]:
yum1

['https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821739/?SourcePageName=%2Frecipe%2F23600%2Fworlds-best-lasagna%2Freviews%2F876441%2F&SourceContentType=reviews&SourceContentID=23600&AnalyticsEvent=recipe%20review%20detail%20nav&EventName=recipe%20review%20detail%20-%20prev',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/339170/?SourcePageName=%2Frecipe%2F23600%2Fworlds-best-lasagna%2Freviews%2F876441%2F&SourceContentType=reviews&SourceContentID=23600&AnalyticsEvent=recipe%20review%20detail%20nav&EventName=recipe%20review%20detail%20-%20next',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/339170/?SourcePageName=%2Frecipe%2F23600%2Fworlds-best-lasagna%2Freviews%2F876441%2F&SourceContentType=reviews&SourceContentID=23600&AnalyticsEvent=recipe%20review%20detail%20nav&EventName=recipe%20review%20detail%20-%20next',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821739/?SourcePageName=%2Frecipe%2F2360

In [152]:
# Remove duplicate links
recipes = [] 
for d in yum1: 
    if d not in recipes: 
        recipes.append(d)

In [153]:
recipes

['https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821739/?SourcePageName=%2Frecipe%2F23600%2Fworlds-best-lasagna%2Freviews%2F876441%2F&SourceContentType=reviews&SourceContentID=23600&AnalyticsEvent=recipe%20review%20detail%20nav&EventName=recipe%20review%20detail%20-%20prev',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/339170/?SourcePageName=%2Frecipe%2F23600%2Fworlds-best-lasagna%2Freviews%2F876441%2F&SourceContentType=reviews&SourceContentID=23600&AnalyticsEvent=recipe%20review%20detail%20nav&EventName=recipe%20review%20detail%20-%20next']

In [154]:
# for the rating from review
rating = []
for url in yum:
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    for rate in root1.xpath('//*[@itemprop="ratingValue"]')[0].get('content'):
        rating.append(rate)

In [146]:
rating = []
html1 = scraperwiki.scrape(link1)
root1 = lxml.html.fromstring(html1)
for rate in root1.xpath('//*[@itemprop="ratingValue"]')[0].get('content'):
    rating.append(rate)

In [147]:
rating

['4', '.', '8', '0']

In [134]:
# gets rating


'5'

In [124]:
rating

[]

In [115]:
yum

['https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821050/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/2771446/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821050/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/843991/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/199471/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/302997/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/375761/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/2771446/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/1327470/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821739/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/876441/']

In [None]:
for i in tqdm(range(1,2)):
    responses=requests.get('https://www.allrecipes.com/recipes/80/main-dish/?page={}'.format(i))
    soup=BeautifulSoup(responses.content,'html.parser')
    for link in soup.findAll('a', attrs={'href': re.compile("^https://")}):
        linklist.append(link.get('href'))

In [114]:
len(yum)

11

In [104]:
# review_links = tree.xpath("//*[contains(@class, 'review-detail__link')]/text()")


tree.xpath("//a[text()='review-detail__link']/@href")

[]

In [56]:
pip install scraperwiki

Collecting scraperwiki
  Downloading https://files.pythonhosted.org/packages/30/84/d874847baad89f03e6984fcd87505a37bf924b66519d1e07bf76e2369af0/scraperwiki-0.5.1.tar.gz
Collecting alembic (from scraperwiki)
[?25l  Downloading https://files.pythonhosted.org/packages/dc/6d/3c1411dfdcf089ec89ce5e2222deb2292f39b6b1a5911222e15af9fe5a92/alembic-1.3.2.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 8.0MB/s eta 0:00:01     |███████████████████████████▊    | 921kB 8.0MB/s eta 0:00:01
Collecting Mako (from alembic->scraperwiki)
[?25l  Downloading https://files.pythonhosted.org/packages/b0/3c/8dcd6883d009f7cae0f3157fb53e9afb05a0d3d33b3db1268ec2e6f4a56b/Mako-1.1.0.tar.gz (463kB)
[K     |████████████████████████████████| 471kB 13.5MB/s eta 0:00:01
[?25hCollecting python-editor>=0.3 (from alembic->scraperwiki)
  Downloading https://files.pythonhosted.org/packages/c6/d3/201fc3abe391bbae6606e6f1d598c15d367033332bd54352b12f35513717/python_editor-1.0.4-py3-none-any.whl
Building wheel

In [57]:
import scraperwiki

In [59]:
import lxml.html

In [70]:
html = scraperwiki.scrape('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/')

root = lxml.html.fromstring(html)

# hrefs = root.xpath("//*[contains(@class='review-detail__link']/a")

for e in tree.xpath('//a/@href'):
    print(e.attrib['href'], e.text_content())

# href1 = []
# for href in hrefs:
#     href1.append(href.attrib['href'])

KeyError: 'href'

In [67]:
href1

[]

In [54]:
tree.xpath("//a[contains(@class, 'review-detail__link')]").get('href')

AttributeError: 'list' object has no attribute 'get'

In [48]:
review_links

['\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ',
 '\r\n                ',
 '\r\n                ',
 '\r\n            ']

In [15]:
mark_complete(link1)

In [39]:
get_review_link(link1)

['https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821050/']

In [28]:
# have to change 'getreviews' to '23600/worlds-best-lasagna'

NoneType

In [21]:
len(revvvs)

TypeError: object of type 'NoneType' has no len()

In [7]:
browser.quit()

In [74]:
linklist=[]



responses=requests.get('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/')
soup=BeautifulSoup(responses.content,'html.parser')
reviews = soup.findAll('a',{'class':'review-detail__link'})
for r in reviews:
    linklist.append(r.get('href'))

        
        
# # Filter through links that correspond with a singular dish and not multiple dishes       
# dishes = [k for k in linklist if '/recipe/' in k]

# Remove duplicate links
recipes = [] 
for l in linklist: 
    if l not in recipes: 
        recipes.append(l)
        
print(len(linklist), len(recipes))

KeyboardInterrupt: 