In [1]:
import numpy as np
import pandas as pd
import requests
import json
import bs4
from bs4 import BeautifulSoup as bs

# Web Scraping from Allrecipes.com

To compile a dataset of recipes for this project, we first scrape www.allrecipes.com and store the results as a list of recipes in a JSON file (`recipes.json`).  We request each page of the site's recipe listing, follow the recipe cards linked on it, and parse every recipe page.

### Setup

In [None]:
# listing pages to scrape, as an inclusive range: (first page, last page)
first_page, last_page = 1, 50

In [None]:
# (re)create recipes.json holding an empty JSON list; any previous contents
# of the file are overwritten
data = []
with open('recipes.json', 'w') as recipe_file:
    recipe_file.write(json.dumps(data, indent=4))

In [None]:
def save_to_json(title, layout, picture, servings, ingredients, method, prep_time, cook_time, additional_time, total_time, path='recipes.json'):
    """Append one recipe to the JSON recipe file unless its title is already stored.

    The file at ``path`` is expected to hold a JSON list of recipe dicts. The
    whole list is read, the new recipe is appended, and the list is written
    back with an indent of 4. Recipes are de-duplicated by exact title match.

    Parameters
    ----------
    title : value used as the recipe's unique key (compared with ``==``).
    layout, picture, servings, ingredients, method, prep_time, cook_time,
    additional_time, total_time : stored as-is under the key of the same
        name; each must be JSON-serialisable.
    path : str, optional
        JSON file to read and update. Defaults to ``'recipes.json'``.

    Returns
    -------
    None. Prints ``"Saved: <title> "`` when the recipe was written, or
    ``"Already Exists: <title> "`` when it was skipped.

    Raises
    ------
    json.JSONDecodeError
        If the file exists but does not contain valid JSON. This is not
        swallowed so that a damaged file is never silently overwritten.
    """
    try:
        with open(path) as in_file:
            data = json.load(in_file)
    except FileNotFoundError:
        # nothing has been saved yet: start a fresh list instead of crashing
        data = []

    # any() stops at the first match; .get() keeps an entry that lacks a
    # 'title' key from aborting the duplicate check
    if any(recipe.get('title') == title for recipe in data):
        print("Already Exists: {} ".format(title))
        return

    # key order here is the key order written to the file
    new_recipe = {
        'title': title,
        'layout': layout,
        'picture': picture,
        'ingredients': ingredients,
        'method': method,
        'servings': servings,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'additional_time': additional_time,
        'total_time': total_time,
    }
    data.append(new_recipe)

    with open(path, 'w') as out_file:
        json.dump(data, out_file, indent=4)

    # reported only after the write has succeeded
    print("Saved: {} ".format(title))

### The Web Scraper

Note that allrecipes has two different HTML layouts for its recipe pages: a regular layout, and a layout that supports shopping for ingredients directly from the recipe page.  The two layouts keep the information we need in different locations, so we have to tell them apart.  We first look for the regular layout's title element; if that lookup returns `None`, we instead look for each element where it would be in the second (shopper) layout.

In [None]:
# Seconds to wait on any single HTTP request. requests applies no timeout by
# default, so without this one stalled connection would hang the whole scrape.
REQUEST_TIMEOUT_SECONDS = 30

# Labels of the time entries in the regular layout (1) -> recipe field.
_REGULAR_TIME_FIELDS = {
    'Prep': 'prep_time',
    'Cook': 'cook_time',
    'Additional': 'additional_time',
    'Ready In': 'total_time',
}

# Headers of the meta items in the shopper layout (2) -> recipe field.
# Servings is read from the same list of meta items in this layout.
_SHOPPER_META_FIELDS = {
    'prep:': 'prep_time',
    'cook:': 'cook_time',
    'additional:': 'additional_time',
    'total:': 'total_time',
    'Servings:': 'servings',
}


def _fetch_page(url):
    """Download url and return it parsed with html.parser, or None on failure.

    A failed request (connection error, timeout, HTTP error status) is
    reported with a printed message instead of raising, so one bad page does
    not abort the rest of the scrape.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print("Skipped (request failed): {} ({})".format(url, error))
        return None
    return bs(response.text, 'html.parser')


def _attribute_or_blank(element, attribute):
    """Return element[attribute], or '' when the element or attribute is missing."""
    if element is None:
        return ''
    return element.get(attribute, '')


def _parse_recipe(recipe_main):
    """Extract one recipe from its parsed page.

    Returns a dict whose keys are the parameter names of save_to_json, or
    None when the page has the title element of neither known layout.
    Fields that cannot be found on the page are left as '' (or an empty list).
    """
    recipe = {
        'title': '',
        'layout': 0,
        'picture': '',
        'servings': '',
        'ingredients': [],
        'method': [],
        'prep_time': '',
        'cook_time': '',
        'additional_time': '',
        'total_time': '',
    }

    title_tag = recipe_main.select_one('.recipe-summary__h1')
    if title_tag is not None:
        # regular layout (1)
        recipe['layout'] = 1
        recipe['picture'] = _attribute_or_blank(recipe_main.select_one('.rec-photo'), 'src')
        recipe['servings'] = _attribute_or_blank(recipe_main.select_one('#metaRecipeServings'), 'content')
        ingredient_tags = recipe_main.select('.recipe-ingred_txt')
        method_tags = recipe_main.select('.recipe-directions__list--item')

        # labels and durations are two parallel lists; pair them up by position
        labels = recipe_main.select('.prepTime__item--type')
        durations = recipe_main.select('.prepTime__item--time')
        for label, duration in zip(labels, durations):
            field = _REGULAR_TIME_FIELDS.get(label.text)
            if field is not None:
                recipe[field] = duration.text
    else:
        # no regular title, so try the shopper layout (2)
        title_tag = recipe_main.select_one('h1.headline.heading-content')
        if title_tag is None:
            # neither layout matched; let the caller decide what to do
            return None
        recipe['layout'] = 2
        recipe['picture'] = _attribute_or_blank(recipe_main.select_one('.inner-container > img'), 'src')
        ingredient_tags = recipe_main.select('span.ingredients-item-name')
        method_tags = recipe_main.select('div.paragraph > p')

        for item in recipe_main.select('div.recipe-meta-item'):
            parts = item.select('div')
            if len(parts) < 2:
                # a meta item needs both a header div and a body div
                continue
            field = _SHOPPER_META_FIELDS.get(parts[0].text.strip())
            if field is not None:
                recipe[field] = parts[1].text.strip()

    recipe['title'] = title_tag.text

    # compile the ingredient list; strip first so that whitespace-only entries
    # and a padded 'Add all ingredients to list' placeholder are dropped too
    for ingredient in ingredient_tags:
        ingredient_text = ingredient.text.strip()
        if ingredient_text and ingredient_text != 'Add all ingredients to list':
            recipe['ingredients'].append(ingredient_text)

    # compile the list of method instructions
    recipe['method'] = [instruction.text.strip() for instruction in method_tags]

    return recipe


for page in range(first_page, last_page + 1):
    print("PARSING PAGE {}".format(page))

    # request the main allrecipes page which lists the top recipes
    doc = _fetch_page("https://www.allrecipes.com/recipes?page=" + str(page))
    if doc is None:
        continue

    # find each recipe linked on the main page, and open their links one by one
    for card in doc.select('a.fixed-recipe-card__title-link'):
        recipe_url = card.get('href')
        if not recipe_url:
            continue

        # open the page for this recipe card and parse its html
        recipe_main = _fetch_page(recipe_url)
        if recipe_main is None:
            continue

        recipe = _parse_recipe(recipe_main)
        if recipe is None:
            print("Skipped (unrecognised layout): {}".format(recipe_url))
            continue

        # append the recipe to recipes.json (skipped there if the title is already stored)
        save_to_json(**recipe)

print("FINISHED PARSING")