In [122]:
import numpy as np
import pandas as pd
import json
import requests
import bs4
from bs4 import BeautifulSoup as bs

# Web Scraping from Allrecipes

To compile a dataset of recipes for this project, we first scrape the web and store the results as JSON. We send requests to www.allrecipes.com, follow the recipe cards listed there, and parse each linked recipe page.

In [126]:
# Listing pages to scrape, by page number: the first and the last one.
# Use the same number for both to scrape a single page.
first_page, last_page = 1, 1

In [136]:
# (Re)create recipes.json so that it holds an empty JSON list; opening in
# 'w' mode overwrites whatever an earlier run left in the file.
data = []
with open('recipes.json', 'w') as out_file:
    out_file.write(json.dumps(data, indent=4))

In [131]:
def save_to_json(title, picture, ingredients, method, path='recipes.json'):
    """Append one recipe to the JSON file at ``path`` unless its title is already stored.

    The file holds a JSON list of recipe dicts with the keys ``title``,
    ``picture``, ``ingredients`` and ``method``. The whole list is read,
    checked for a recipe with the same title, and written back only when a
    new recipe was added.

    Args:
        title: Recipe title; also used as the de-duplication key.
        picture: Value stored under ``picture``.
        ingredients: Value stored under ``ingredients``.
        method: Value stored under ``method``.
        path: JSON file to update. Defaults to ``'recipes.json'``.

    Raises:
        json.JSONDecodeError: If the existing file does not contain valid JSON.
    """
    try:
        with open(path) as in_file:
            recipes = json.load(in_file)
    except FileNotFoundError:
        # no file yet (e.g. the cell that creates it was not run): start empty
        recipes = []

    # if this recipe title already exists in the data, do not add it again
    if any(recipe['title'] == title for recipe in recipes):
        return

    # a new recipe: append it to the list and dump the list back to the file
    recipes.append({
        'title': title,
        'picture': picture,
        'ingredients': ingredients,
        'method': method,
    })

    with open(path, 'w') as out_file:
        json.dump(recipes, out_file, indent=4)

Note that allrecipes uses two different HTML layouts for its recipe pages. If looking up the title element of the first layout returns `None`, the page must use the second layout, so we look for the title, picture, ingredients, and method elements where that layout puts them.

In [135]:
# seconds to wait for a response before giving up on a single request
request_timeout = 10

for page in range(first_page, last_page + 1):
    # request the main allrecipes page which lists the top recipes
    try:
        source = requests.get("https://www.allrecipes.com?page=" + str(page),
                              timeout=request_timeout)
        source.raise_for_status()
    except requests.RequestException as error:
        # one unreachable listing page should not abort the remaining pages
        print("Skipped page {}: {}".format(page, error))
        continue
    print("PARSING PAGE {}".format(page))
    doc = bs(source.text, 'html.parser')

    # find each recipe linked on the main page, and open their links one by one
    for card in doc.select('a.fixed-recipe-card__title-link'):
        recipe_url = card.get('href')
        if not recipe_url:
            # an anchor without a link target cannot be followed
            continue
        try:
            recipe_page_source = requests.get(recipe_url, timeout=request_timeout)
            recipe_page_source.raise_for_status()
        except requests.RequestException as error:
            # also covers malformed or relative URLs, which requests rejects
            print("Skipped {}: {}".format(recipe_url, error))
            continue
        recipe_main = bs(recipe_page_source.text, 'html.parser')

        # search for the title, picture, ingredients, and method elements
        title = recipe_main.select_one('.recipe-summary__h1')
        if title is not None:
            # primary formatting layout
            picture = recipe_main.select_one('.rec-photo')
            ingredients = recipe_main.select('.recipe-ingred_txt')
            method = recipe_main.select('.recipe-directions__list--item')
        else:
            # no primary-layout title, so try the secondary formatting layout
            title = recipe_main.select_one('h1.headline.heading-content')
            picture = recipe_main.select_one('.inner-container > img')
            ingredients = recipe_main.select('span.ingredients-item-name')
            method = recipe_main.select('div.section-body > p')

        if title is None:
            # neither layout matched; without a title there is nothing to key
            # the recipe on, so skip it instead of crashing on `title.text`
            print("Skipped {}: no recipe title found".format(recipe_url))
            continue

        # strip the title like the other fields: it is the de-duplication key
        recipe_title = title.text.strip()
        # a recipe without a picture is still saved, with a null picture
        picture_url = picture.get('src') if picture is not None else None
        # compile the ingredients and the method instructions as plain text
        ingredients_list = [ingredient.text.strip() for ingredient in ingredients]
        method_list = [instruction.text.strip() for instruction in method]

        # save all data for the current recipe to the json file
        save_to_json(recipe_title, picture_url, ingredients_list, method_list)
        print("Saved: {}".format(recipe_title))

PARSING PAGE 1
Saved: Curry Salmon with Mango
Saved: Cake Mix Cinnamon Rolls
Saved: Slow Cooker Creamy Chicken Taco Soup
Saved: Simple Tomato Soup
Saved: Two-Ingredient Pizza Dough
Saved: Best Chocolate Chip Cookies
Saved: Janet's Rich Banana Bread
Saved: Creamed Eggs on Toast
Saved: World's Best Lasagna
Saved: Good Old Fashioned Pancakes
Saved: To-Die-For Chicken Pot Pie
Saved: Pantry Chicken Casserole
Saved: Slow Cooker Barbecue Chicken Breast
Saved: Island Kielbasa in a Slow Cooker
Saved: Best Brownies
Saved: Banana Banana Bread
Saved: Shrimp and Sugar Snap Peas
Saved: Easy Korean Ground Beef Bowl
Saved: Dill Pickle Soup
Saved: Fluffy Pancakes
