# Scrape Recipe Details from Epicurious

In [1]:
from bs4 import BeautifulSoup
from splinter import Browser
import time
from time import sleep
import pandas as pd
import json
import requests

In [2]:
# load the recipe list into a data frame
# (the preview below shows source, year, title and a site-relative url per row)
recipe_list_filepath = 'resources/data/recipe_list.csv'
df_recipe_list = pd.read_csv(recipe_list_filepath)

# preview the first rows
df_recipe_list.head()

Unnamed: 0,source,year,title,url
0,editorial,2019,Okonomiyaki (“As You Like It”) Pancakes With B...,/recipes/food/views/okonomiyaki-as-you-like-it...
1,editorial,2019,"Salmon Confit with Lime, Juniper, and Fennel",/recipes/food/views/salmon-confit-with-lime-ju...
2,editorial,2019,Coconut-Braised Chickpeas with Sweet Potatoes ...,/recipes/food/views/coconut-braised-chickpeas-...
3,editorial,2019,Crispy Sheet-Pan Broccoli,/recipes/food/views/crispy-sheet-pan-broccoli
4,editorial,2019,Prawn Moilee,/recipes/food/views/prawn-moilee-south-indian-...


In [3]:
# add two new columns, both initialised to the empty string:
#   is_recipe - whether the page turned out to be a recipe
#   scraped   - whether the recipe details have been scraped
df_recipe_list['is_recipe'] = ''
df_recipe_list['scraped'] = ''
df_recipe_list.head()

Unnamed: 0,source,year,title,url,is_recipe,scraped
0,editorial,2019,Okonomiyaki (“As You Like It”) Pancakes With B...,/recipes/food/views/okonomiyaki-as-you-like-it...,,
1,editorial,2019,"Salmon Confit with Lime, Juniper, and Fennel",/recipes/food/views/salmon-confit-with-lime-ju...,,
2,editorial,2019,Coconut-Braised Chickpeas with Sweet Potatoes ...,/recipes/food/views/coconut-braised-chickpeas-...,,
3,editorial,2019,Crispy Sheet-Pan Broccoli,/recipes/food/views/crispy-sheet-pan-broccoli,,
4,editorial,2019,Prawn Moilee,/recipes/food/views/prawn-moilee-south-indian-...,,


In [4]:
# rows whose is_recipe flag is still the empty-string placeholder
df_recipe_list.loc[df_recipe_list['is_recipe'].eq('')]

Unnamed: 0,source,year,title,url,is_recipe,scraped
0,editorial,2019,Okonomiyaki (“As You Like It”) Pancakes With B...,/recipes/food/views/okonomiyaki-as-you-like-it...,,
1,editorial,2019,"Salmon Confit with Lime, Juniper, and Fennel",/recipes/food/views/salmon-confit-with-lime-ju...,,
2,editorial,2019,Coconut-Braised Chickpeas with Sweet Potatoes ...,/recipes/food/views/coconut-braised-chickpeas-...,,
3,editorial,2019,Crispy Sheet-Pan Broccoli,/recipes/food/views/crispy-sheet-pan-broccoli,,
4,editorial,2019,Prawn Moilee,/recipes/food/views/prawn-moilee-south-indian-...,,
5,editorial,2019,Cardamom-Pistachio Bûche de Noël,/recipes/food/views/cardamom-pistachio-buche-d...,,
6,editorial,2019,Tonnato Eggs,/recipes/food/views/tonatto-eggs,,
7,editorial,2019,Chickpea-Mushroom Burgers,/recipes/food/views/chickpea-mushroom-burgers,,
8,editorial,2019,Kimchi Soup With Tofu and Clams,/recipes/food/views/kimchi-soup-with-tofu-and-...,,
9,editorial,2019,Roasted Cabbage Steaks With Crispy Chickpeas a...,/recipes/food/views/roasted-cabbage-steaks-wit...,,


In [5]:
# Exception types raised when an element, attribute or number is missing or
# malformed in the parsed page. Anything else is a genuine bug and should surface.
_MISSING_FIELD_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)


def _extract(getter):
    '''
    Calls getter() and returns its result, or None when the lookup fails because
    the field is missing or malformed.

    '''
    try:
        return getter()
    except _MISSING_FIELD_ERRORS:
        return None


def scrape_general_info(soup, recipe):
    '''
    Takes the soup object and dictionary of recipe. Parses out general recipe info and
    adds to recipe dictionary. Missing values will have a value of None.

    (not scraped: special equipment, preparation, menus, related content)

    Parameters:
        soup:   parsed recipe page; only its find() method is used.
        recipe: dictionary that is updated in place.

    Keys written to recipe:
        title       - text of the <h1> inside div.title-source, whitespace stripped
        author      - 'title' attribute of the .contributor element, whitespace stripped
        date        - text of the .pub-date element
        month       - first whitespace-separated token of date
        year        - second whitespace-separated token of date, as int
        rating      - 'content' attribute of the <meta> inside .user-interactions, as float
        reviews     - text of the .reviews-count element, as int
        make_again  - text of the <span> inside div.prepare-again-rating
        active_time - text of dd.active-time
        image_url   - 'srcset' attribute of img with class 'photo loaded'

    Returns None.

    '''
    # the title is stripped like the author: the raw <h1> text can carry
    # leading/trailing whitespace
    title = _extract(lambda: soup.find('div', class_='title-source').h1.text.strip())
    author = _extract(lambda: soup.find(class_='contributor')['title'].strip())

    # month and year are both derived from the date text; when date is None the
    # .split() call fails and they fall back to None as well
    date = _extract(lambda: soup.find(class_="pub-date").text)
    month = _extract(lambda: date.split()[0])
    year = _extract(lambda: int(date.split()[1]))

    rating = _extract(lambda: float(soup.find(class_="user-interactions").meta['content']))
    reviews = _extract(lambda: int(soup.find(class_="reviews-count").text))
    make_again = _extract(lambda: soup.find('div', class_="prepare-again-rating").span.text)
    active_time = _extract(lambda: soup.find('dd', class_="active-time").text)
    image_url = _extract(lambda: soup.find('img', class_='photo loaded')['srcset'])

    # add bits to recipe dictionary
    recipe.update({
        'title': title,
        'author': author,
        'date': date,
        'month': month,
        'year': year,
        'rating': rating,
        'reviews': reviews,
        'make_again': make_again,
        'active_time': active_time,
        'image_url': image_url,
    })

In [6]:
def scrape_ingredients(soup, recipe):
    '''
    Takes the soup object and dictionary of recipe. Collects the text of every
    <li class="ingredient"> element, in page order, and stores the resulting list
    under recipe['ingredients']. If the lookup fails, None is stored instead.

    Returns None; recipe is updated in place.

    '''
    try:
        found = [node.text for node in soup.find_all('li', class_="ingredient")]
    except Exception:
        found = None

    recipe['ingredients'] = found

In [7]:
def scrape_nutritional_info(soup, recipe):
    '''
    Takes the soup object and dictionary of recipe. Parses out the nutritional info into a
    dictionary and adds it to recipe under the 'nutrition' key.

    Parameters:
        soup:   parsed recipe page; its find() and find_all() methods are used.
        recipe: dictionary that is updated in place.

    Keys of recipe['nutrition'] (each is None when the value cannot be read):
        cal           - int, from the span with itemprop="calories"
        carb, fat, protein, sat_fat, sodium, fiber
                      - text of the span with the matching itemprop
        polyunsat_fat, monounsat_fat, cholesterol
                      - text of the 7th, 9th and 10th "nutri-data" span
        servings      - text of the .per-serving element

    Returns None.

    '''
    # failures that simply mean "this value is not on the page"
    missing = (AttributeError, IndexError, KeyError, TypeError, ValueError)

    def by_itemprop(itemprop):
        # text of the nutri-data span carrying the given itemprop, or None
        try:
            return soup.find('span', class_="nutri-data", itemprop=itemprop).text
        except missing:
            return None

    # Three values have no itemprop attribute and can only be picked by their
    # position among all nutri-data spans, so query that list once.
    # NOTE(review): positional lookup assumes every page lists the spans in the
    # same order - confirm against pages with partial nutrition data.
    try:
        nutri_spans = soup.find_all('span', class_="nutri-data")
    except missing:
        nutri_spans = []

    def by_position(position):
        # text of the nutri-data span at the given position, or None
        try:
            return nutri_spans[position].text
        except missing:
            return None

    try:
        calories = int(by_itemprop('calories'))
    except missing:
        calories = None

    try:
        servings = soup.find(class_="per-serving").text
    except missing:
        servings = None

    # calories is an int; every other value is the raw text from the page
    recipe['nutrition'] = {
        'cal': calories,
        'carb': by_itemprop('carbohydrateContent'),
        'fat': by_itemprop('fatContent'),
        'protein': by_itemprop('proteinContent'),
        'sat_fat': by_itemprop('saturatedFatContent'),
        'sodium': by_itemprop('sodiumContent'),
        'polyunsat_fat': by_position(6),
        'fiber': by_itemprop('fiberContent'),
        'monounsat_fat': by_position(8),
        'cholesterol': by_position(9),
        'servings': servings,
    }

In [8]:
def scrape_tags(soup, recipe):
    '''
    Takes the soup object and dictionary of recipe. Parses out the tag info into a dictionary
    and adds it to recipe under the 'tags' key.

    Each <a> inside <dl class="tags"> has an href of the form /<category>/<value>/...;
    the result maps every category to the list of its values, in page order.

    Parameters:
        soup:   parsed recipe page; only its find() method is used.
        recipe: dictionary that is updated in place.

    recipe['tags'] is None when the page has no tags container. Links without a usable
    href are skipped, so one malformed link does not discard the other tags.

    Returns None.

    '''
    try:
        tag_links = soup.find('dl', class_='tags').find_all('a')
    except (AttributeError, TypeError):
        # no tags container on the page
        recipe['tags'] = None
        return

    # dictionary of tags
    tags = dict()

    for link in tag_links:
        try:
            # drop the leading forward slash, then keep the first two path segments
            category, value = link['href'][1:].split("/")[:2]
        except (AttributeError, KeyError, TypeError, ValueError):
            # link has no href, or the href has fewer than two segments
            continue

        tags.setdefault(category, []).append(value)

    recipe['tags'] = tags

In [9]:
# define the site root; the site-relative url of each recipe row is appended to it
root_url = "https://www.epicurious.com"

In [10]:
# define list of all recipes; the scraping loop appends one dictionary per scraped recipe
all_recipes = []

In [11]:
# start up the browser: a visible (non-headless) Chrome window driven through
# the chromedriver binary at the path below
# NOTE(review): the scraping loop fetches pages with requests and its
# browser.visit(...) call is commented out - confirm this browser is still needed
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [12]:
# loop through the items in the dataframe and pull the details of every recipe page
max_index = 50        # rows with an index above this are left for a later run
request_timeout = 30  # seconds; without a timeout a stalled connection hangs the loop

for index, row in df_recipe_list.iterrows():
    # stop before announcing a row that will not be processed
    if index > max_index:
        break
    print(f'Processing index {index}')

    # build the absolute recipe url
    url = root_url + row['url']

    try:
        r = requests.get(url, timeout=request_timeout)
    except requests.RequestException as error:
        # network failure or timeout: report it and carry on with the next row
        print(f'Request failed for index {index} and url {url}: {error}')
        continue

    if r.status_code != 200:
        print(f'Response code {r.status_code} for index {index} and url {url}')
        continue

    # parse with beautiful soup
    soup = BeautifulSoup(r.content, 'html.parser')

    # confirm it is a recipe before trying to parse; a page without a <body>
    # or without body classes is treated as "not a recipe" instead of crashing
    body = soup.body
    body_classes = body.get('class', []) if body is not None else []

    if 'recipe-detail' not in body_classes:
        # not a recipe. mark as such and move on
        # (.at addresses the row by its index label, which is what iterrows yields)
        df_recipe_list.at[index, 'is_recipe'] = 0
        continue

    # is a recipe, let's scrape!
    df_recipe_list.at[index, 'is_recipe'] = 1

    # create recipe dictionary
    recipe = dict()
    recipe['url'] = url

    # scrape bits we need
    scrape_general_info(soup, recipe)
    scrape_ingredients(soup, recipe)
    scrape_nutritional_info(soup, recipe)
    scrape_tags(soup, recipe)

    # add completed recipe to list
    all_recipes.append(recipe)
    df_recipe_list.at[index, 'scraped'] = 1

print('out of loop')

Processing index 0
Processing index 1
Processing index 2
Processing index 3
Processing index 4
Processing index 5
Processing index 6
Processing index 7
Processing index 8
Processing index 9
Processing index 10
Processing index 11
Processing index 12
Processing index 13
Processing index 14
Processing index 15
Processing index 16
Processing index 17
Processing index 18
Processing index 19
Processing index 20
Processing index 21
Processing index 22
Processing index 23
Processing index 24
Processing index 25
Processing index 26
Processing index 27
Processing index 28
Processing index 29
Processing index 30
Processing index 31
Processing index 32
Processing index 33
Processing index 34
Processing index 35
Processing index 36
Processing index 37
Processing index 38
Processing index 39
Processing index 40
Processing index 41
Processing index 42
Processing index 43
Processing index 44
Processing index 45
Processing index 46
Processing index 47
Processing index 48
Processing index 49
Processing

In [13]:
# display the recipe dictionaries collected by the scraping loop
all_recipes

[{'url': 'https://www.epicurious.com/recipes/food/views/okonomiyaki-as-you-like-it-pancakes-with-bonito-flakes',
  'title': 'Okonomiyaki (“As You Like It”) Pancakes With Bonito Flakes ',
  'author': 'Sonoko Sakai',
  'date': 'December 2019',
  'month': 'December',
  'year': 2019,
  'rating': 3.5,
  'reviews': 3,
  'make_again': '100%',
  'active_time': None,
  'image_url': 'https://assets.epicurious.com/photos/5dfbeb3a8a7c8f0008b67fe6/6:4/w_274%2Ch_169/Japanese-Home-Cooking-Okonomiyaki-Recipe-121919.jpg',
  'ingredients': ['1 ½ cups (180 g) all-purpose flour',
   '1 teaspoon baking powder',
   '¼ teaspoon sea salt',
   '1 large egg, beaten',
   '1¼ cups (296 ml) whole milk or milk of your choice',
   '8 ounces (230 g) cabbage, thinly sliced',
   '2 scallions, white and light green parts chopped',
   '½ yellow, green, or red bell pepper, thinly sliced',
   '4 tablespoons vegetable oil',
   '8 ounces (230 g) boneless chicken, shrimp, crab, or sukiyaki-style beef or pork, cut into ½-inch 

In [14]:
# display the first 60 rows, including the is_recipe / scraped flags set by the loop
df_recipe_list.head(60)

Unnamed: 0,source,year,title,url,is_recipe,scraped
0,editorial,2019,Okonomiyaki (“As You Like It”) Pancakes With B...,/recipes/food/views/okonomiyaki-as-you-like-it...,1.0,1.0
1,editorial,2019,"Salmon Confit with Lime, Juniper, and Fennel",/recipes/food/views/salmon-confit-with-lime-ju...,1.0,1.0
2,editorial,2019,Coconut-Braised Chickpeas with Sweet Potatoes ...,/recipes/food/views/coconut-braised-chickpeas-...,1.0,1.0
3,editorial,2019,Crispy Sheet-Pan Broccoli,/recipes/food/views/crispy-sheet-pan-broccoli,1.0,1.0
4,editorial,2019,Prawn Moilee,/recipes/food/views/prawn-moilee-south-indian-...,1.0,1.0
5,editorial,2019,Cardamom-Pistachio Bûche de Noël,/recipes/food/views/cardamom-pistachio-buche-d...,1.0,1.0
6,editorial,2019,Tonnato Eggs,/recipes/food/views/tonatto-eggs,1.0,1.0
7,editorial,2019,Chickpea-Mushroom Burgers,/recipes/food/views/chickpea-mushroom-burgers,1.0,1.0
8,editorial,2019,Kimchi Soup With Tofu and Clams,/recipes/food/views/kimchi-soup-with-tofu-and-...,1.0,1.0
9,editorial,2019,Roasted Cabbage Steaks With Crispy Chickpeas a...,/recipes/food/views/roasted-cabbage-steaks-wit...,1.0,1.0


In [None]:
# Write the scraped recipes to disk as JSON.
# The scraping loop already appends every completed recipe to all_recipes, so
# nothing is appended here: re-appending `recipe` would store the last scraped
# recipe twice.
with open('resources/data/all_recipe.json', 'w') as f:
    json.dump(all_recipes, f)

In [None]:
# number of non-null values in each column of the recipe list
df_recipe_list.count()

In [None]:
# scratch check: fetch a single known recipe page directly
url = "https://www.epicurious.com/recipes/food/views/okonomiyaki-as-you-like-it-pancakes-with-bonito-flakes"
# a timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=30)

In [None]:
# True when the last response came back with HTTP status 200
r.status_code == 200

In [None]:
# parse the body of the last response with Python's built-in html.parser
soup = BeautifulSoup(r.content, 'html.parser')

In [None]:
# parse with beautiful soup, this time from the splinter browser's current page
# NOTE(review): this rebinds `soup`, replacing any soup built from a requests
# response; the only browser.visit(...) call in the notebook is commented out,
# so confirm the browser has actually loaded a recipe page before running this
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# collect every <li class="ingredient"> element from the parsed page
ingredient_list = soup.find_all('li', class_="ingredient")

In [None]:
# print the text of each ingredient element, one per line
for ingredient_item in ingredient_list:
    print(ingredient_item.text)