In [1]:
import os
import re
import time
from itertools import chain

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
from recipe_scrapers import scrape_me
from tqdm import tqdm_notebook as tqdm
# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# -1 turns off truncation of long cell values.
# NOTE(review): negative values are deprecated since pandas 1.0 in favour of None;
# confirm the installed pandas version before changing this.
pd.set_option('display.max_colwidth', -1)

### Create Parser Function for Ingredients

In [2]:
# regex for separating ingredients list
# Group 1: a leading run of digits, whitespace and the characters * [ . , /
# Group 2: everything after it (at least one character).
SEPARATOR_RE = re.compile(r'^([\d\s*[\d\.,/]*)\s*(.+)')

# create a normalized string for ingredients
def normalize(st):
    """
    Separate the leading quantity from the rest of the line and tidy whitespace.

    :param st: raw ingredient line, e.g. "1/2cup  sugar"
    :return: the line with one space between the leading quantity characters and
             the remainder, whitespace runs collapsed and both ends stripped,
             e.g. "1/2 cup sugar"
    """
    # The replacement template must be a raw string: in a normal literal '\g'
    # is an invalid escape sequence (warning today, an error in future Pythons).
    spaced = SEPARATOR_RE.sub(r'\g<1> \g<2>', st)
    return re.sub(r'\s+', ' ', spaced).strip()


def escape_re_string(text):
    """
    Make a unit spelling safe to embed in a regular expression.

    Only literal dots are escaped, and runs of whitespace are collapsed to a
    single space.

    :param text: unit spelling such as "fl. oz."
    :return: the spelling with every dot preceded by a backslash
    """
    # Raw string: '\.' in a normal literal is an invalid escape sequence.
    text = text.replace('.', r'\.')
    return re.sub(r'\s+', ' ', text)

# list of common units of measurements for ingredients
UNITS = {"cup": ["cups", "cup", "c.", "c"], "fluid_ounce": ["fl. oz.", "fl oz", "fluid ounce", "fluid ounces"],
         "gallon": ["gal", "gal.", "gallon", "gallons"], "ounce": ["oz", "oz.", "ounce", "ounces"],
         "pint": ["pt", "pt.", "pint", "pints"], "pound": ["lb", "lb.", "pound", "pounds"],
         "quart": ["qt", "qt.", "qts", "qts.", "quart", "quarts"],
         "tablespoon": ["tbsp.", "tbsp", "T", "T.", "tablespoon", "tablespoons", "tbs.", "tbs"],
         "teaspoon": ["tsp.", "tsp", "t", "t.", "teaspoon", "teaspoons"],
         "gram": ["g", "g.", "gr", "gr.", "gram", "grams"], "kilogram": ["kg", "kg.", "kilogram", "kilograms"],
         "liter": ["l", "l.", "liter", "liters"], "milligram": ["mg", "mg.", "milligram", "milligrams"],
         "milliliter": ["ml", "ml.", "milliliter", "milliliters"], "pinch": ["pinch", "pinches"],
         "dash": ["dash", "dashes"], "touch": ["touch", "touches"], "handful": ["handful", "handfuls"],
         "stick": ["stick", "sticks"], "clove": ["cloves", "clove"], "can": ["cans", "can"], "large": ["large"],
         "small": ["small"], "scoop": ["scoop", "scoops"], "filets": ["filet", "filets"], "sprig": ["sprigs", "sprig"]}

# numbers to separate quantities from ingredients
NUMBERS = ['seventeen', 'eighteen', 'thirteen', 'nineteen', 'fourteen', 'sixteen', 'fifteen', 'seventy', 'twelve',
           'eleven', 'eighty', 'thirty', 'ninety', 'twenty', 'seven', 'fifty', 'sixty', 'forty', 'three', 'eight',
           'four', 'zero', 'five', 'nine', 'ten', 'one', 'six', 'two', 'an', 'a', '½', '⅓','¼', '⅛', '¾']

prepositions = ["of"]

# Longest spellings first so that e.g. "cups" is tried before "c".
unit_spellings = sorted(chain.from_iterable(UNITS.values()), key=len, reverse=True)
unit_alternatives = '|'.join(escape_re_string(spelling) for spelling in unit_spellings)

# Number *words* must end at a word boundary; otherwise "a", "an", "ten", ...
# are consumed as a quantity from the front of names such as "apples",
# "anchovies" or "tender greens". Fraction glyphs get no boundary so that
# glued forms like "½cup" keep parsing.
number_alternatives = '|'.join(
    (number + r'\b') if number.isalpha() else number for number in NUMBERS)

PARSER_RE = re.compile(
    r'(?P<quantity>(?:[\d\.,][\d\.,\s/]*)?\s*(?:(?:%s)\s*)*)?(\s*(?P<unit>%s)\s+)?(\s*(?:%s)\s+)?(\s*(?P<name>.+))?' % (
        number_alternatives, unit_alternatives, '|'.join(prepositions)))


def parse(st):
    """
    Pull the ingredient name out of an ingredient line.

    The line is normalized first and then matched against PARSER_RE.

    :param st: ingredient text, e.g. "2 cups of flour"
    :return: the stripped text of the regex's ``name`` group, or '' when that
             group did not capture anything
    """
    match = PARSER_RE.match(normalize(st))
    name = match.group('name')
    if not name:
        return ''
    return name.strip()

# uncomment below code to return a tuple with measurement and ingredients

#     return ((res.group('quantity') or '').strip() + ' ' + (res.group('unit') or '').strip(),
#            (res.group('name') or '').strip())


# uncomment below code to return a dictionary with 'measure' and 'name' keys

#     return {
#             'measure': (res.group('quantity') or '').strip() + ' ' + (res.group('unit') or '').strip(),
#             'name': (res.group('name') or '').strip()
#         }


### Scrape Recipes from BudgetBytes with Recipe_Scrapers

In [34]:
# Fetch the recipe page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of silently parsing an error page.
responses = requests.get('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/', timeout=30)
responses.raise_for_status()
soup = BeautifulSoup(responses.content, 'html.parser')

# Collect the href of every review link found in the page's HTML.
reviews = soup.find_all('a', {'class': 'review-detail__link'})
linklist = [r.get('href') for r in reviews]



# # Filter through links that correspond with a singular dish and not multiple dishes       
# dishes = [k for k in linklist if '/recipe/' in k]

# Remove duplicate links (dict.fromkeys keeps the first-seen order)
recipes = list(dict.fromkeys(linklist))

print(len(linklist), len(recipes))

11 9


In [32]:
# Display the de-duplicated links.
recipes

['https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821050/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/2771446/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/843991/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/199471/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/302997/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/375761/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/1327470/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/821739/',
 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/reviews/876441/']

In [22]:
# Remove duplicate links while keeping the order in which they were first seen.
# dict.fromkeys does this in one pass instead of scanning the result list for every link.
recipes = list(dict.fromkeys(linklist))

In [23]:
# Number of unique links left after de-duplication.
len(recipes)

475

In [None]:
# NOTE(review): presumably social-media names meant for filtering unwanted links
# out of the scraped list; no other visible cell uses this, so confirm or remove.
remove_words = ['facebook', 'instagram', 'pinterest', 'youtube']

In [24]:
# Display the de-duplicated links.
recipes

['https://www.budgetbytes.com/',
 'https://www.budgetbytes.com/category/recipes/',
 'https://www.budgetbytes.com/category/recipes/beansandgrains/',
 'https://www.budgetbytes.com/category/recipes/breakfast/',
 'https://www.budgetbytes.com/category/recipes/dessert/',
 'https://www.budgetbytes.com/category/recipes/global/',
 'https://www.budgetbytes.com/category/recipes/global/asian/',
 'https://www.budgetbytes.com/category/recipes/global/indian/',
 'https://www.budgetbytes.com/category/recipes/global/italian/',
 'https://www.budgetbytes.com/category/recipes/global/mediterranean/',
 'https://www.budgetbytes.com/category/recipes/global/southwest/',
 'https://www.budgetbytes.com/category/recipes/meat/',
 'https://www.budgetbytes.com/category/recipes/meat/chicken/',
 'https://www.budgetbytes.com/category/recipes/meat/beef/',
 'https://www.budgetbytes.com/category/recipes/meat/turkey/',
 'https://www.budgetbytes.com/category/recipes/meat/pork/',
 'https://www.budgetbytes.com/category/recipes/

In [20]:
# Total number of collected links, duplicates included.
len(linklist)

1701

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [3]:
# The chromedriver location is machine-specific. Set the CHROMEDRIVER_PATH
# environment variable to override it; the original path remains the default.
CHROMEDRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/stephaniekendall/Library/Application Support/Google/chromedriver',
)
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)

In [4]:
# Recipe page passed to mark_complete() and get_review_link() below.
link1 = 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/'

In [None]:
# Load the recipe page in the Selenium-driven browser (same URL as link1).
browser.get('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/')

In [None]:
# create a click action to open at least 50 reviews on the page

def mark_complete(link, max_clicks=9):
    """
    Repeatedly click the "more-button" element on the page shown in ``browser``.

    Works on whatever page the module-level ``browser`` currently displays.

    :param link: URL of the recipe page; not used by the function, kept so
                 existing calls keep working
    :param max_clicks: maximum number of click attempts (default 9)
    :return: None
    """
    # Imported here because only the selenium cells of this notebook need it.
    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    for _ in range(max_clicks):
        try:
            more_button = browser.find_element_by_xpath("//*[contains(@class, 'more-button')]")
            more_button.click()
        except NoSuchElementException:
            # No button on the page any more: further attempts would fail the same way.
            break
        except WebDriverException:
            # Best effort: the click itself failed, so try again on the next pass.
            # Non-WebDriver errors (e.g. a dead browser session) now propagate
            # instead of being silently swallowed.
            continue

In [None]:
def get_review_link(link):
    """
    Collect the href of every review link on the page shown in ``browser``.

    The links are returned so that callers can use them; ending a cell with a
    call to this function displays the list.

    :param link: URL of the recipe page; not used by the function, kept so
                 existing calls keep working
    :return: list of href attribute values, in the order the elements were found
    """
    elements = browser.find_elements_by_xpath("//*[contains(@class, 'review-detail__link')]")
    return [element.get_attribute('href') for element in elements]

In [69]:
# Click the "more-button" element on the page currently open in `browser`.
mark_complete(link1)

In [76]:
# Collect the review links from the page currently open in `browser`.
# NOTE(review): the recorded MaxRetryError ("Connection refused") suggests the
# browser session was no longer reachable when this cell ran — confirm.
get_review_link(link1)

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=57719): Max retries exceeded with url: /session/c1fa9dac51ef05b40629671531747986/elements (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x115d1e978>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [17]:
# Close the browser and end the WebDriver session.
browser.quit()

In [74]:
# Fetch the recipe page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of silently parsing an error page.
responses = requests.get('https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/', timeout=30)
responses.raise_for_status()
soup = BeautifulSoup(responses.content, 'html.parser')

# Collect the href of every review link found in the page's HTML.
reviews = soup.find_all('a', {'class': 'review-detail__link'})
linklist = [r.get('href') for r in reviews]



# # Filter through links that correspond with a singular dish and not multiple dishes       
# dishes = [k for k in linklist if '/recipe/' in k]

# Remove duplicate links (dict.fromkeys keeps the first-seen order)
recipes = list(dict.fromkeys(linklist))

print(len(linklist), len(recipes))

KeyboardInterrupt: 