In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, InvalidSelectorException, ElementNotVisibleException, WebDriverException
import pickle
import csv
import tqdm
import time

In [3]:
# Listing URL prefix for the main-dish category; it ends in `?page=` so that a
# page number can be appended to address one listing page.
category_url = 'https://www.allrecipes.com/recipes/80/main-dish/?page='

def search_entire_category(category_url, save_filename, max_num_pages=3):
    '''Collects the recipe links found on the listing pages of one category.

    Args:
        category_url: listing URL prefix; the page number is appended to it.
        save_filename: path of the pickle file the collected list is written
            to. Pass None or '' to skip writing.
        max_num_pages: number of listing pages to visit, starting at page 1.

    Returns:
        List of recipe URLs in the order they were first found; a link that
        shows up on several pages is listed once.
    '''
    #TODO: find a way to set max_num_pages automatically
    recipe_urls = []

    #browse over all the search pages for a given category and get their recipes
    #(the upper bound is inclusive, so exactly max_num_pages pages are visited)
    for page in range(1, max_num_pages + 1):
        recipe_urls.extend(get_recipe_urls(category_url + str(page)))

    # The same recipe can be listed on more than one page; keep first occurrence.
    recipe_urls = list(dict.fromkeys(recipe_urls))

    #TODO: either pickle and load or do all at once
    # Persist before returning so the write is actually reached.
    if save_filename:
        with open(save_filename, 'wb') as handle:
            pickle.dump(recipe_urls, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return recipe_urls

In [10]:
def get_recipe_urls(cat_page_url):
    '''Returns the recipe links found on one listing page of a category.

    Args:
        cat_page_url: full URL of the listing page to download.

    Returns:
        List of unique recipe URLs, in the order they first appear on the page.
    '''
    # A timeout keeps one stalled connection from blocking the whole crawl.
    response = requests.get(cat_page_url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Raw string so the regex escapes reach `re` untouched. A recipe link is
    # the site prefix followed by a non-empty numeric id and a slash.
    recipe_link = re.compile(r'^https://www\.allrecipes\.com/recipe/[0-9]+/')

    #find all recipe links embedded in search page
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=recipe_link))

    # dict.fromkeys drops repeated links while keeping page order, unlike set().
    return list(dict.fromkeys(hrefs))

# cat_page_url = 'https://www.allrecipes.com/recipes/80/main-dish/?page=1'
# print(get_recipe_urls(cat_page_url))

In [11]:
def get_recipe_with_reviews_soup(driver, recipe_url):
    '''Loads a recipe page in the browser, expands its reviews and parses it.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        recipe_url: URL of the recipe page.

    Returns:
        BeautifulSoup object built from the page source after up to
        N_REVIEW_PAGES clicks on the 'More Reviews' link.
    '''
    # Imported here so the block is self-contained; `find_element(By..., ...)`
    # is available in both Selenium 3 and 4, whereas the
    # `find_element_by_*` helpers were removed in Selenium 4.
    from selenium.webdriver.common.by import By

    N_REVIEW_PAGES = 10  # upper bound on 'More Reviews' clicks per recipe

    print(recipe_url)
    driver.get(recipe_url)
    driver.implicitly_wait(10)

    for _ in range(N_REVIEW_PAGES):
        try:
            clicker = driver.find_element(By.LINK_TEXT, 'More Reviews')
            # Reading this property scrolls the link into view before clicking.
            clicker.location_once_scrolled_into_view
            clicker.click()
            time.sleep(1)
        except WebDriverException:
            # WebDriverException is the base class of the timeout / not-found /
            # not-visible errors, so any of them ends the expansion early and
            # the reviews loaded so far are kept.
            break

    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    return soup

def get_recipe_reviews(recipe_soup):
    ''' Returns the distinct reviews found in a parsed recipe page.

    Args:
        recipe_soup: parsed page offering `find_all(name, attrs=...)`.

    Returns:
        List of (author, rating, date) tuples in page order, with exact
        duplicates removed. Author names have their whitespace collapsed;
        rating is the raw `content` attribute string.
    '''
    authors = [" ".join(tag.text.split()) for tag in recipe_soup.find_all('h4', attrs={'itemprop': 'author'})]
    #the first ratingValue is the average rating, not a review, so it is skipped
    ratings = [tag['content'] for tag in recipe_soup.find_all('meta', attrs={'itemprop': 'ratingValue'})][1:]
    dates = [tag.text for tag in recipe_soup.find_all('div', attrs={'itemprop': 'dateCreated'})]

    # zip pairs the three lists positionally and stops at the shortest one.
    # dict.fromkeys removes duplicate triples while keeping page order, so the
    # result is the same on every run (set() ordering is not).
    reviews = list(dict.fromkeys(zip(authors, ratings, dates)))
    return reviews
    
def get_recipe_features(recipe_soup):
    ''' Returns the title, breadcrumbs, cuisine and ingredients of a recipe page.

    Args:
        recipe_soup: parsed page offering `find(name, attrs=...)` and
            `find_all(name, attrs=...)`.

    Returns:
        Tuple (title, breadcrumbs, cuisine, ingredients). `title` is a string
        with collapsed whitespace, or '' when the page has no title heading;
        the other three are lists of strings and may be empty.
    '''
    # find() returns None when the heading is absent; fall back to an empty
    # title rather than failing on `.text`.
    title_tag = recipe_soup.find('h1', attrs={'id': 'recipe-main-content'})
    title = " ".join(title_tag.text.split()) if title_tag is not None else ''

    breadcrumbs = [" ".join(breadcrumb.text.split()) for breadcrumb in recipe_soup.find_all('span', attrs={'itemprop': 'name'})]
    # TODO: do we want categories? there's also cuisines but both might be empty depending on the recipe
    # categories = [" ".join(category.text.split()) for category in recipe_soup.find_all('span', attrs={'itemprop': 'recipeCategory'})]
    ingredients = [ingredient.text for ingredient in recipe_soup.find_all('span', attrs={'itemprop': 'recipeIngredient'})]
    cuisine = [cuisine.text for cuisine in recipe_soup.find_all('span', attrs={'itemprop': 'recipeCuisine'})]

    return title, breadcrumbs, cuisine, ingredients



In [12]:
def generate_db_from_recipe_urls(recipe_urls, db_path):
    '''Scrapes every recipe URL and writes its features and reviews to CSV.

    Two '|'-delimited files with a header row are written to the working
    directory: 'recipe_features.csv' (one row per recipe) and
    'recipe_reviews.csv' (one row per review). List-valued features
    (breadcrumbs, cuisine, ingredients) are joined with '+'.

    Args:
        recipe_urls: iterable of recipe page URLs; the numeric id in each URL
            becomes the recipe_id column.
        db_path: currently unused; the output file names are fixed.
    '''
    # Compiled once; group 2 is the numeric recipe id.
    recipe_id_pattern = re.compile(r'^(https://www\.allrecipes\.com/recipe/)([0-9]*)(/)', re.IGNORECASE)

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome('./chromedriver.exe',options=options)

    try:
        # utf-8 so the files do not depend on the platform's default encoding;
        # newline='' is what the csv module requires to control line endings.
        with open('recipe_reviews.csv', 'w', encoding='utf-8', newline='') as r, \
                open('recipe_features.csv', 'w', encoding='utf-8', newline='') as f:
            review_writer = csv.writer(r, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            feature_writer = csv.writer(f, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            review_writer.writerow(['recipe_id', 'author', 'rating', 'date'])
            feature_writer.writerow(['recipe_id', 'title', 'breadcrumbs', 'cuisine', 'ingredients'])

            for recipe_url in tqdm.tqdm(recipe_urls):
                soup = get_recipe_with_reviews_soup(driver, recipe_url)
                recipe_id = recipe_id_pattern.search(recipe_url).group(2)
                reviews = get_recipe_reviews(soup)
                title, breadcrumbs, cuisine, ingredients = get_recipe_features(soup)

                breadcrumbs = '+'.join(breadcrumbs)
                cuisine = '+'.join(cuisine)
                ingredients = '+'.join(ingredients)

                feature_writer.writerow([recipe_id, title, breadcrumbs, cuisine, ingredients])
                for author, rating, date in reviews:
                    review_writer.writerow([recipe_id, author, rating, date])
    finally:
        # Always shut the browser down, even when a page fails mid-run,
        # so no headless Chrome process is left behind.
        driver.quit()

In [13]:
def main():
    '''Collects the recipe links of the main-dish category, then scrapes them.'''
    listing_url = 'https://www.allrecipes.com/recipes/80/main-dish/?page='
    urls_pickle = 'MAYBE_TODO'  # placeholder file name for the pickled link list
    output_path = 'TODO'  # placeholder output location

    # Step 1: gather every recipe link from the category listing pages.
    found_urls = search_entire_category(listing_url, urls_pickle)
    print('RECIPE URLS DONE')

    # Step 2: visit each recipe and write its data out.
    generate_db_from_recipe_urls(found_urls, output_path)

# Run the full crawl-and-scrape pipeline when this cell is executed.
main()

RECIPE URLS DONE


WebDriverException: Message: 'chromedriver.exe' executable may have wrong permissions. Please see https://sites.google.com/a/chromium.org/chromedriver/home


## DISCARDED CELLS

In [8]:
# Scratch check: download a single recipe page with plain requests and look at
# the raw text of its breadcrumb spans.
url='https://www.allrecipes.com/recipe/166624/tomato-basil-salmon/'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
# The text is taken as-is here, so surrounding whitespace/newlines are kept.
breadcrumbs = [breadcrumb.text for breadcrumb in soup.find_all('span', attrs={'itemprop': 'name'})]
# Bare expression so the notebook displays the list.
breadcrumbs
# ids = [elt['href'] for elt in soup.find_all('a', href=re.compile('^(https://www.allrecipes\.com\/cook\/)[0-9]*(\/)'))]
# ids = list(set(map(lambda x: re.search('^(https://www.allrecipes\.com\/cook\/)([0-9]*)(\/)', x, re.IGNORECASE).group(2), ids)))
# print(ids)
# authors = [" ".join(author.text.split()) for author in soup.find_all('h4', attrs={'itemprop': 'author'})]
# authors

['\r\n                Home\r\n            ',
 '\r\n                    Recipes\r\n                ',
 '\r\n                    World Cuisine\r\n                ',
 '\r\n                    European\r\n                ',
 '\r\n                    Italian\r\n                ']

In [9]:
# The features file starts with a header row, so let pandas use it for the
# column names instead of loading it as a data row (header=None did the latter).
# NOTE(review): the UnicodeDecodeError recorded for this cell suggests the file
# on disk was not written as UTF-8 - confirm the writer's encoding.
df = pd.read_csv('recipe_features.csv', sep='|')
# Show only the first rows rather than dumping the whole frame.
df.head()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xae in position 31: invalid start byte

In [None]:
import tqdm