In [15]:
import numpy as np
import pandas as pd
import requests
import json
import bs4
from bs4 import BeautifulSoup as bs

# Web Scraping from Allrecipes.com

To compile a dataset of recipes for this project, we first scrape recipe pages from the web and store the results as a list of JSON records in `recipes.json`.  To do this, we request the recipe listing pages on www.allrecipes.com, follow each recipe card to its recipe page, and parse that page.

### Setup

In [16]:
# inclusive range of allrecipes listing pages to scrape (first page, last page)
first_page, last_page = 101, 200

In [17]:
# method called to save a recipe to the json file
def save_to_json(title, layout, picture, servings, ingredients, method, prep_time, cook_time, additional_time, total_time):
    """Append one recipe to 'recipes.json' unless its title is already stored.

    The file holds a JSON list of recipe dictionaries.  If the file does not
    exist yet, or is empty, the list starts out empty, so the first call
    creates the file instead of failing.

    Parameters
    ----------
    title : recipe title; also the key used to detect duplicates.
    layout : identifier of the page layout the recipe was scraped from.
    picture : value stored under 'picture' (the scraper passes an image URL).
    servings : value stored under 'servings'.
    ingredients : list of ingredient strings.
    method : list of instruction strings.
    prep_time, cook_time, additional_time, total_time : timing values, stored as given.

    Returns
    -------
    None.  Prints "Saved: ..." after a successful write, or
    "Already Exists: ..." when the title is already present.

    Raises
    ------
    json.JSONDecodeError
        If 'recipes.json' has content that is not valid JSON.
    """
    # a first run has no file yet; treat a missing or blank file as "no recipes"
    try:
        with open('recipes.json') as in_file:
            raw = in_file.read()
    except FileNotFoundError:
        raw = ''
    data = json.loads(raw) if raw.strip() else []

    # if this recipe title already exists in the data, do not add it again
    if any(recipe['title'] == title for recipe in data):
        print("Already Exists: {} ".format(title))
        return

    # a new recipe: append to the json object and dump back to the file
    data.append({
        'title': title,
        'layout': layout,
        'picture': picture,
        'ingredients': ingredients,
        'method': method,
        'servings': servings,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'additional_time': additional_time,
        'total_time': total_time,
    })

    with open('recipes.json', 'w') as out_file:
        json.dump(data, out_file, indent=4)

    # report success only once the file has actually been written
    print("Saved: {} ".format(title))

### The Web Scraper

Note that allrecipes has two different HTML layouts for its recipe pages: a regular layout, and a layout which supports shopping for ingredients directly from the recipe page.  The two layouts keep the information we need in different locations, so we need to tell them apart.  We first look for the title element of the regular layout; if that lookup returns `None`, we instead look for the elements where they would be in the second (shopper) layout.

In [18]:
# Seconds to wait for a response before giving up on a request; without a
# timeout a single stalled connection would hang the whole scrape.
REQUEST_TIMEOUT = 30

# Labels shown next to the timing values in the regular layout (1),
# mapped to the recipe field each one fills.
REGULAR_LAYOUT_LABELS = {
    'Prep': 'prep_time',
    'Cook': 'cook_time',
    'Additional': 'additional_time',
    'Ready In': 'total_time',
}

# Headers of the meta items in the shopper layout (2), mapped the same way.
SHOPPER_LAYOUT_LABELS = {
    'prep:': 'prep_time',
    'cook:': 'cook_time',
    'additional:': 'additional_time',
    'total:': 'total_time',
    'Servings:': 'servings',
}


def _fetch_html(url):
    """Download `url` and return the parsed document, or None if the request fails.

    Failures (connection errors, timeouts, HTTP error statuses) are printed and
    reported as None so that one bad page does not abort the whole run.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as error:
        print("Request failed for {}: {}".format(url, error))
        return None
    return bs(response.text, 'html.parser')


def _attr_or_blank(element, attribute):
    """Return `element[attribute]`, or '' when the element or the attribute is missing."""
    if element is None:
        return ''
    return element.get(attribute, '')


def _scrape_recipe(recipe_main):
    """Extract the recipe fields from a parsed recipe page.

    Returns a dict with the keys title, layout, picture, servings, ingredients,
    method, prep_time, cook_time, additional_time and total_time, or None when
    the page has the title element of neither known layout.
    """
    # values we scrape for; anything not found on the page stays an empty string
    recipe = {
        'title': '',
        'layout': 0,
        'picture': '',
        'servings': '',
        'prep_time': '',
        'cook_time': '',
        'additional_time': '',
        'total_time': '',
    }

    title = recipe_main.select_one('.recipe-summary__h1')
    if title is not None:
        # ordinary formatting layout (1)
        recipe['layout'] = 1
        recipe['picture'] = _attr_or_blank(recipe_main.select_one('.rec-photo'), 'src')
        recipe['servings'] = _attr_or_blank(recipe_main.select_one('#metaRecipeServings'), 'content')
        ingredients = recipe_main.select('.recipe-ingred_txt')
        method = recipe_main.select('.recipe-directions__list--item')

        # timing labels and values are separate elements, paired up by position
        meta_item_types = recipe_main.select('.prepTime__item--type')
        meta_item_times = recipe_main.select('.prepTime__item--time')
        for label, duration in zip(meta_item_types, meta_item_times):
            field = REGULAR_LAYOUT_LABELS.get(label.text)
            if field is not None:
                recipe[field] = duration.text
    else:
        # no regular title, so try the shopper formatting layout (2)
        title = recipe_main.select_one('h1.headline.heading-content')
        if title is None:
            # neither layout matched; let the caller skip this page
            return None
        recipe['layout'] = 2
        recipe['picture'] = _attr_or_blank(recipe_main.select_one('.inner-container > img'), 'src')
        ingredients = recipe_main.select('span.ingredients-item-name')
        method = recipe_main.select('div.paragraph > p')

        for item in recipe_main.select('div.recipe-meta-item'):
            parts = item.select('div')
            # each meta item is read as a header div followed by a body div
            if len(parts) < 2:
                continue
            field = SHOPPER_LAYOUT_LABELS.get(parts[0].text.strip())
            if field is not None:
                recipe[field] = parts[1].text.strip()

    # the title is kept exactly as found, because it is the duplicate-detection key
    recipe['title'] = title.text

    # compile the ingredient list; strip before filtering so whitespace-only
    # entries and the 'Add all ingredients to list' placeholder are dropped
    stripped_ingredients = (node.text.strip() for node in ingredients)
    recipe['ingredients'] = [
        text for text in stripped_ingredients
        if text and text != 'Add all ingredients to list'
    ]

    # compile the method instructions; blank steps are dropped so they cannot
    # make an otherwise empty recipe look non-empty
    stripped_steps = (node.text.strip() for node in method)
    recipe['method'] = [text for text in stripped_steps if text]

    return recipe


for page in range(first_page, last_page + 1):
    # request the main allrecipes page which lists the top recipes
    print("PARSING PAGE {}".format(page))
    doc = _fetch_html("https://www.allrecipes.com/recipes?page=" + str(page))
    if doc is None:
        continue

    # find each recipe linked on the main page, and open their links one by one
    for card in doc.select('a.fixed-recipe-card__title-link'):
        link = card.get('href')
        if not link:
            continue

        # open the page for each recipe card and parse its html
        recipe_main = _fetch_html(link)
        if recipe_main is None:
            continue

        recipe = _scrape_recipe(recipe_main)
        if recipe is None:
            print("Skipped (unrecognised layout): {}".format(link))
            continue

        # do not save if ingredients and method lists are empty (video recipes)
        if not recipe['ingredients'] and not recipe['method']:
            continue

        # add this to the json string stored in the 'recipes.json' file
        save_to_json(
            recipe['title'],
            recipe['layout'],
            recipe['picture'],
            recipe['servings'],
            recipe['ingredients'],
            recipe['method'],
            recipe['prep_time'],
            recipe['cook_time'],
            recipe['additional_time'],
            recipe['total_time'],
        )

print("FINISHED PARSING")

PARSING PAGE 101
Saved: Easiest Pot Roast Ever 
Saved: German Chocolate Cake Frosting II 
Saved: Taco Salad I 
Saved: Mushroom Slow Cooker Roast Beef 
Saved: Delectable Marinated Chicken 
Saved: Split Pea and Ham Soup I 
Saved: Microwave Corn on the Cob 
Saved: Bev's Chocolate Pie 
Saved: Barbeque Chicken 
Saved: Awesome Grilled Cheese Sandwiches 
Saved: Best Grilled Pork Chops 
Saved: Pork Carnitas 
Saved: The Best Meatballs You'll Ever Have 
Saved: Award Winning Chili 
Saved: Fettuccine Alfredo V 
Saved: Lighter Chicken and Dumplings 
Saved: Sesame Shrimp Stir-Fry 
Saved: Sweet Potato Bread I 
Saved: Chicken Scarpariello 
Saved: Barlow's Blackened Catfish 
PARSING PAGE 102
Already Exists: Chicken Scarpariello 
Saved: Ratatouille Bake 
Saved: Best Ever Popcorn Balls 
Saved: Cream Cheese Jalapeno Hamburgers 
Saved: Sesame Seared Tuna 
Saved: Almond-Crusted Tilapia 
Saved: Chicken Enchilada Soup III 
Saved: Farmer's Market Vegetarian Quesadillas 
Saved: Garlic Roasted Chicken and Potato

Saved: Fish Tacos with Honey-Cumin Cilantro Slaw and Chipotle Mayo 
Saved: Simply Parmesan Chicken 
Saved: Quinoa and Black Bean Chili 
Saved: The Perfect Cinnamon Roll Icing 
Saved: Cinnamon Ice Cream 
Saved: Lemon-Buttermilk Pound Cake with Aunt Evelyn's Lemon Glaze 
Saved: Tex-Mex Burger with Cajun Mayo 
Saved: Homemade Taco Seasoning Mix 
Saved: Low Carb Cauliflower Leek Soup 
Saved: Bacon and Potato Soup 
Saved: Apple Cake IV 
Saved: Maple Apple Crisp 
PARSING PAGE 114
Saved: Strawberry Avocado Salad 
Saved: Quick and Easy Yorkshire Pudding 
Saved: Wonderful Banana Cake 
Saved: Potato, Broccoli and Cheese Soup 
Saved: Chicago-Style Pan Pizza 
Saved: Grecian Pork Tenderloin 
Saved: Slow Cooker Swiss Steak 
Saved: Boiled Bagels 
Saved: Japanese-Style Sesame Green Beans 
Saved: Bev's Orange Chicken 
Saved: Baked Tofu Bites 
Saved: Roasted Asparagus with Parmesan 
Saved: Moroccan-Style Stuffed Acorn Squash 
Saved: Balsamic Glazed Carrots 
Saved: Orange Cream Fruit Salad 
Saved: Brown 

Saved: Maple Baked Pork Loin Roast 
Saved: Hashbrown Casserole 
Saved: BBQ Chicken Pizza 
Saved: Best Moist Chocolate Cake 
Saved: Steakhouse Wheat Bread for the Bread Machine 
Saved: Slow Cooker Apple Crisp 
Saved: Bread Machine Challah I 
Saved: Chicken and Spinach Alfredo Lasagna 
Saved: Arkansas Green Beans 
Saved: Cranberry-Pumpkin Cookies 
Saved: Sherry Braised Beef Short Ribs 
Saved: Lemon Chicken Orzo Soup 
Saved: Taco Dip I 
Saved: Macaroni and Cheese V 
Saved: Lemon Horseradish New Potatoes 
Saved: Chicken Biscuit Pie 
PARSING PAGE 126
Saved: Shrimp Linguine Alfredo 
Saved: Roasted Potatoes with Greens 
Saved: Banana Nut Bread I 
Saved: Emergency Chicken 
Saved: Butter Rich Spritz Butter Cookies 
Saved: Ronaldo's Beef Carnitas 
Saved: Fried Pork Chop 
Saved: Easy Chicken Rice Casserole 
Saved: Easy Chicken Satay 
Saved: Cheesy Chicken Meatballs 
Saved: Real Strawberry Frosting 
Saved: English Royalty Chocolate Chip Scones 
Saved: Dash's Donair 
Saved: Apple-Raisin French Toas

Saved: Brazilian Fish Stew 
Saved: Apple Bread 
Already Exists: Coconut Cream Pound Cake 
Saved: Mushroom Risotto 
Saved: Birthday Bones 
Saved: Authentic French Meringues 
Saved: Lasagna Roll Ups 
Saved: Vegan Red Lentil Soup 
Saved: Tender Pan-Fried Chicken Breasts 
Saved: Chef John's Ricotta Meatballs 
Saved: Egg in a Boat 
Saved: Bow Tie Pasta with Sausage and Sweet Peppers 
Saved: Napa Cabbage Salad 
Saved: Fudge Brownies I 
Saved: Bacon Water Chestnut Rolls 
Saved: Raspberry Oatmeal Bars 
Saved: Camp Cornbread 
PARSING PAGE 138
Saved: Ultimate Frozen Strawberry Margarita 
Already Exists: Raspberry Oatmeal Bars 
Already Exists: Camp Cornbread 
Saved: Baked Macaroni and Cheese III 
Saved: Banana Muffins I 
Saved: Amish Yumazuti 
Saved: Sunshine Toast 
Saved: Chicken Stuffed Baked Avocados 
Saved: Fantastic Lemon Butter Fillet 
Saved: World's Best Potato Soup 
Saved: Louisiana Sweet Potato Pancakes 
Saved: Oh-So-Easy Caramel Sauce 
Saved: Rhubarb Bread I 
Saved: Oven Brown Rice 
Sav

Saved: Freezer Caramel Drizzle Pie 
Saved: The Old Boy's Strawberry Pie 
Saved: Brigadeiro 
Saved: Adobo Chicken with Ginger 
Saved: Greek Pasta Salad 
Saved: Jiffy Cinnamon Rolls 
Saved: Never Fail Chocolate Chip Cookies 
Saved: Butternut Squash Soup with a Kick 
Saved: Restaurant-Style Potato Skins 
Saved: Amaretto Cake 
Saved: Baked Chicken and Zucchini 
Saved: Old-Fashioned Potato Soup 
Saved: Chicken, Sausage, Peppers, and Potatoes 
Saved: Creamy Italian White Bean Soup 
Saved: Stuffed Pepper Soup I 
PARSING PAGE 150
Already Exists: Stuffed Pepper Soup I 
Saved: Turkey Carcass Soup 
Saved: Double Layer Pumpkin Pie 
Saved: Whole Chicken Slow Cooker Recipe 
Saved: Chef John's Shepherd's Pie 
Saved: Chocolate Cake II 
Saved: Buffalo Shrimp 
Saved: Grandma's Doughnuts 
Saved: Strawberry Dream Cake I 
Saved: Chinese Pork Chops 
Saved: Cocktail Meatballs I 
Saved: Skillet Chops with Mushroom Gravy 
Saved: Peanut Butter and Banana Smoothie 
Saved: Emily's Famous Sloppy Joes 
Saved: Moroc

Saved: Grilled Garlic Artichokes 
Saved: Rita's Sweet Holiday Baked Ham 
Saved: Angel Food Cake I 
Saved: Cheddar Chicken 
Saved: S.O.P.P. 
Saved: Hungarian Goulash II 
Saved: Charline's Sweet Potato Casserole 
Saved: Blueberry Buttermilk Coffeecake 
Saved: Absolutely The Best Shrimp Scampi 
Saved: Hot Crab Dip 
Saved: Key Lime Cake III 
Saved: Mom's Pumpkin Pie 
Saved: Roasted Herb Chicken and Potatoes 
Saved: Spinach Lentil Soup 
Saved: Whole Grain Breakfast Cookies 
Saved: Grilled Chicken with Rosemary and Bacon 
Saved: Raspberry Sauce 
PARSING PAGE 162
Already Exists: Roasted Herb Chicken and Potatoes 
Already Exists: Key Lime Cake III 
Saved: Chocolate Turtles® Cheesecake I 
Saved: Hot Spiced Cider 
Saved: Lactation Cookies 
Saved: Apple, Cranberry, and Pear Crisp 
Saved: Fast and Easy Pancakes 
Saved: Barley Bake 
Saved: Alfredo Light 
Saved: Pistachio Cake III 
Saved: Gingerbread Cookies II 
Saved: Spicy Black Bean Cakes 
Saved: Cream of Broccoli Soup I 
Saved: Bread Pudding III

Saved: Spicy Tortilla Roll-Ups 
Saved: Baby Carrots with Dill Butter 
Saved: Blue Cheese, Bacon and Chive Stuffed Pork Chops 
Saved: Shredded Chicken Tacos 
Saved: Gnocchi with Sage-Butter Sauce 
Saved: Mountain Apple Cobbler 
Saved: Butternut Squash Fries 
Saved: Baked Omelet Squares 
Saved: Grandma's Sloppy Joes 
Saved: Abby's Chicken Rollatini 
Saved: Teriyaki and Pineapple Chicken 
Saved: Strawberry Romaine Salad I 
Saved: Savory Tater Tot Casserole 
Saved: Sweet Potato Pound Cake 
Saved: Sweet and Sour Sauce II 
Saved: Blue Cheese Beef Tenderloin 
Saved: Easy Baklava 
Saved: Vegetable Pizza I 
Saved: Chicken Pot Pie VI 
PARSING PAGE 174
Saved: The Original Fantasy Fudge 
Saved: Blueberry Coffee Cake III 
Saved: Erin's Indonesian Chicken 
Saved: Chocolate Sauce 
Saved: Unsloppy Joes 
Saved: Scottish Oat Scones 
Saved: Sausage Stuffed Mushrooms II 
Saved: Sweet and Sour Chicken II 
Already Exists: Chicken Pot Pie VI 
Saved: Glazed Corned Beef 
Saved: Avocado Soup with Chicken and Li

Saved: Chef John's Baked Eggs 
Saved: How to Make Perfect Hard Boiled Eggs 
Saved: Chicken Tetrazzini for a Crowd 
Saved: Applesauce Cake I 
Saved: No Bake Cookies II 
Saved: Baked Apples 
Saved: Chicken Salad with Bacon, Lettuce, and Tomato 
Saved: Spinach Tortellini Soup 
Saved: Bob's Stuffed Banana Peppers 
Saved: Real Sopapillas 
Saved: Pot Stickers Traditional 
Saved: Cauliflower Casserole 
PARSING PAGE 186
Already Exists: Cauliflower Casserole 
Saved: Andy's Spicy Green Chile Pork 
Already Exists: Spinach Tortellini Soup 
Already Exists: Chicken Tetrazzini for a Crowd 
Already Exists: Bob's Stuffed Banana Peppers 
Saved: Scottish Shortbread II 
Saved: Mexican Wedding Cakes II 
Saved: Irish Potato Candy 
Saved: Grilled Marinated Swordfish 
Saved: Best Ever Chocolate Cutout Cookies 
Saved: Mother's Banana Bread 
Saved: Leftover Chicken Croquettes 
Saved: Ultimate Banana Muffins 
Saved: Mom's Cucumbers 
Saved: Sweet Potato Potato Salad 
Saved: Chewy Crispy Coconut Cookies 
Saved: He

Saved: Ciabatta 
Saved: Quick Chicken And Wine 
Saved: Tortellini Soup I 
Saved: Colcannon 
Saved: Slow-Cooked Green Beans 
Saved: Ground Beef and Cabbage 
Saved: Cranberry Chutney I 
Saved: Basic British Scones 
Saved: Turkey in a Bag 
PARSING PAGE 198
Saved: Fresh Ginger Cookies 
Saved: Different Chicken Divan 
Saved: Cheesy Acorn Squash 
Saved: Krystal's Perfect Marinade for BBQ or Grilled Chicken 
Saved: Parmesan Crusted Chicken 
Saved: Hot and Sour Chicken Soup 
Saved: Easy Ice Cream Cake 
Saved: BBQ NY Strip 
Saved: Amazing Pork Chops in Cream Sauce 
Saved: Jerky Lover's Jerky - Sweet, Hot and Spicy! 
Saved: Slow Cooker Tapioca Pudding 
Saved: Dark Chocolate Bacon Cupcakes 
Saved: Chicken, Stuffing and Green Bean Casserole 
Saved: Slow Cooker Kielbasa and Beer 
Saved: Pumpkin Cake II 
Saved: Pear Bread II 
Saved: Tomato, Cucumber and Red Onion Salad with Mint 
Saved: Hot Apple Cider 
Saved: Chocolate Chocolate Chip Cake Cookies 
Saved: Cheese Ravioli with Fresh Tomato and Articho