In [36]:
import numpy as np
import pandas as pd
import json
import requests
import re
import bs4
from bs4 import BeautifulSoup as bs

# Web Scraping from Allrecipes

In order to compile a dataset of recipes for this project, we will first need to use web scraping and build a JSON object.  For this, we will send requests to www.allrecipes.com and parse their recipe cards.

In [71]:
# set how many pages to scrape recipes from
first_page = 1
last_page = 1

In [72]:
# create empty json file to store recipe data
data = []
with open('recipes.json','w') as out_file:
    json.dump(data, out_file, indent=4)

In [73]:
def save_to_json(title, layout, picture, servings, ingredients, method, prep_time, cook_time, additional_time, total_time):
    with open('recipes.json') as in_file:
        data = json.load(in_file)
    
    # if this recipe title already exists in the data, do not add it again
    already_exists = False
    for recipe in data:
        if recipe['title'] == title:
            already_exists = True
    
    # if a new recipe, append to the json object and dump back to the file
    if not already_exists:
        print("Saved: {} ".format(title))
        new_recipe = {}
        new_recipe['title'] = title
        new_recipe['layout'] = layout
        new_recipe['picture'] = picture
        new_recipe['ingredients'] = ingredients
        new_recipe['method'] = method
        new_recipe['servings'] = servings
        new_recipe['prep_time'] = prep_time
        new_recipe['cook_time'] = cook_time
        new_recipe['additional_time'] = additional_time
        new_recipe['total_time'] = total_time

        data.append(new_recipe)
    
        with open('recipes.json', 'w') as out_file:
            json.dump(data, out_file, indent=4)
    else:
        print("Already Exists: {} ".format(title))

Note that allrecipes has two different HTML layouts for their recipe pages, a regular layout and a layout which supports shopping for ingredients directly from the recipe page.  These two layouts have the information we need in different locations, so we need to differentiate them.  If the title element we initially search for is set to 'None', we have to instead look for the elements where they would be in the second (shopper) layout.

In [75]:
builder = JsonBuilder()

In [76]:
for page in range(first_page, last_page + 1):
    # request the main allrecipes page which lists the top recipes
    source = requests.get("https://www.allrecipes.com/recipes?page=" + str(page))
    print("PARSING PAGE {}".format(page))
    doc = bs(source.text,'html.parser')
    
    # find each recipe linked on the main page, and open their links one by one
    recipe_cards = doc.select('a.fixed-recipe-card__title-link')

    for card in recipe_cards:
        # these are the values we will scrape for.  We first declare them as empty strings and lists
        layout = 0
        ingredients_list = []
        method_list = []
        title, picture = '', ''
        prep_time, cook_time, total_time, additional_time, servings, = '','','','',''
        
        #open the page for each recipe card and parse its html
        recipe_page_source = requests.get(card['href'])        
        recipe_main = bs(recipe_page_source.text,'html.parser')
        
        # search for the values we declared above
        title = recipe_main.select_one('.recipe-summary__h1')
        if title is not None:
            #for ordinary formatting layout (1)
            layout = 1
            title = title.text
            picture = recipe_main.select_one('.rec-photo').attrs['src']
            ingredients = recipe_main.select('.recipe-ingred_txt')
            method = recipe_main.select('.recipe-directions__list--item')
            servings = recipe_main.select_one('#metaRecipeServings')['content']
            
            meta_item_types = recipe_main.select('.prepTime__item--type')
            meta_item_times = recipe_main.select('.prepTime__item--time')
            
            for label, time in zip(meta_item_types, meta_item_times):
                if label.text == 'Prep':
                    prep_time = time.text
                elif label.text =='Cook':
                    cook_time = time.text
                elif label.text == 'Additional':
                    additional_time = time.text
                elif label.text == 'Ready In':
                    total_time = time.text                
                
        # if the title is 'None', then the page must be in the shopper formatting layout
        else:
            # for shopper formatting layout (2)
            layout = 2
            title = recipe_main.select_one('h1.headline.heading-content').text
            picture = recipe_main.select_one('.inner-container > img').attrs['src']
            ingredients = recipe_main.select('span.ingredients-item-name')
            method = recipe_main.select('div.paragraph > p')
            meta_items = recipe_main.select('div.recipe-meta-item')

            for item in meta_items:
                parts = item.select('div')
                header = parts[0].text.strip()
                body = parts[1].text.strip()
                    
                if header == 'prep:':
                    prep_time = body
                elif header =='cook:':
                    cook_time = body
                elif header == 'additional:':
                    additional_time = body
                elif header == 'total:':
                    total_time = body
                elif header == 'Servings:':
                    servings = body
        
        # compile a list of ingredients for the current recipe
        for ingredient in ingredients:
            if ingredient.text != 'Add all ingredients to list' and ingredient.text != '':
                ingredients_list.append(ingredient.text.strip())
            
        # compile a list of method instructions for the current recipe
        for instruction in method:
            method_list.append(instruction.text.strip())
        
        # add this to the json string being built by our custom JsonBuilder class
        save_to_json(title, 
                    layout, 
                    picture, 
                    servings, 
                    ingredients_list, 
                    method_list, 
                    prep_time, 
                    cook_time, 
                    additional_time, 
                    total_time
                   )
        
print("FINISHED PARSING")

PARSING PAGE 1
Saved: Greek Chicken Skewers 
Saved: World's Best Lasagna 
Saved: Best Chocolate Chip Cookies 
Saved: Fluffy Pancakes 


KeyboardInterrupt: 

# Data Analysis

Now we have a JSON object which contains recipe information scraped from the web.  We can use this data and perform some analyses on it.

In [19]:
#load json object into dataframe
recipes = pd.read_json(r'recipes.json')

# re-order columns
recipes = recipes[['title', 'servings', 'cook_time', 'prep_time', 'additional_time', 
                   'total_time', 'ingredients', 'method', 'layout', 'picture']]
recipes.head(20)

KeyError: "None of [Index(['title', 'servings', 'cook_time', 'prep_time', 'additional_time',\n       'total_time', 'ingredients', 'method', 'layout', 'picture'],\n      dtype='object')] are in the [columns]"

### Analyzing Ingredients

Let's look at the ingredients that compose each recipe a bit more closely.  First, we want to know what the most common ingredients are among our scraped recipes.

In [140]:
all_ingredients_list = []

for row in recipes['ingredients']:
    for ing in row:
        all_ingredients_list.append(ing)

In [141]:
ingredients = pd.DataFrame(all_ingredients_list, columns=['ingredients'])
ingredients['edited'] = ingredients['ingredients']
ingredients.head()

Unnamed: 0,ingredients,edited
0,"2 tablespoons olive oil, divided","2 tablespoons olive oil, divided"
1,2 cups sliced green onions,2 cups sliced green onions
2,"1/4 teaspoon salt, divided","1/4 teaspoon salt, divided"
3,2 (8.8 ounce) packages UNCLE BEN'S® READY RICE...,2 (8.8 ounce) packages UNCLE BEN'S® READY RICE...
4,2 1/2 cups sliced zucchini,2 1/2 cups sliced zucchini


Now we attempt to clean the ingredients list to get rid of the 'noise' and have a raw list of ingredient names without units, numbers or descriptors.

In [142]:
# lists of common words we want to remove
units = ['gallon','quart','pint','cup','teaspoon','tablespoon','ounce','pound','can','pinch','serving','slice','package','bottle']
descriptors = ['small','medium','large']

# remove common measuring and descriptive words
for word in units + descriptors:
    plural = word+"s"
    ingredients['edited'] = ingredients['edited'].str.replace(' '+plural+' ', ' ')
    ingredients['edited'] = ingredients['edited'].str.replace(' '+word+' ','')

# remove parantheicals
ingredients['edited'] = ingredients['edited'].str.replace(r'\([^()]*\)','')
# remove text after commas and hyphens
ingredients['edited'] = ingredients['edited'].str.partition(',')
ingredients['edited'] = ingredients['edited'].str.partition(',')
# remove non-alphabetical characters
ingredients['edited'] = ingredients['edited'].str.replace('[^a-zA-Z]', ' ')

# edit down extra spaces caused by adjacent removals
ingredients['edited'] = ingredients['edited'].str.strip()

ingredients.head()

Unnamed: 0,ingredients,edited
0,"2 tablespoons olive oil, divided",olive oil
1,2 cups sliced green onions,sliced green onions
2,"1/4 teaspoon salt, divided",salt
3,2 (8.8 ounce) packages UNCLE BEN'S® READY RICE...,UNCLE BEN S READY RICE Whole Grain Brown Rice
4,2 1/2 cups sliced zucchini,sliced zucchini


From here we can investigate which ingredients are the most common.  First, we see which edited rows appear most commonly with a call to .value_counts()

In [143]:
ingredients['edited'].value_counts().head(20)

all purpose flour            18
butter                       16
salt                         16
white sugar                  15
baking soda                   9
vanilla extract               8
eggs                          8
baking powder                 7
egg                           6
water                         5
packed brown sugar            5
milk                          5
chopped walnuts               4
ground cinnamon               4
unsalted butter               3
vegetable oil                 3
semisweet chocolate chips     3
skinless                      2
bananas                       2
cloves garlic                 2
Name: edited, dtype: int64

This shows us the most common row values.  However, this data has a few flaws.  Primarily, the issue is that we need the entire row to match to be counted together.  For instance, 'sugar' and 'white sugar' are counted seperately in this analysis.

Another approach might be to store the list of all ingredient 'words' in its own dataframe, and perform a value_counts on it to see what the most common non-unit, non-descriptive words are in our recipe ingredients.  This loses some specificity (we lose the distinction between 'white sugar' and 'brown sugar'), but is helpful in other contexts.

In [144]:
ingredient_words_list = []
for row in ingredients['edited']:
    for word in row.split():
        ingredient_words_list.append(word)

ingredient_words = pd.DataFrame(ingredient_words_list,columns=['words'])

In [145]:
ingredient_words['words'].value_counts().head(20)

sugar      22
butter     19
flour      18
purpose    18
all        18
salt       17
white      17
baking     16
ground     11
powder     10
vanilla     9
soda        9
eggs        8
extract     8
chopped     8
egg         7
brown       6
onion       6
water       6
packed      5
Name: words, dtype: int64