# Epicurious Recipe Webscrape

Guide to Web Scraping with Python Part 1: Requests and BeautifulSoup
https://www.learndatasci.com/tutorials/ultimate-guide-web-scraping-w-python-requests-and-beautifulsoup/

In [25]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
from time import sleep
import requests
import json

# Scrape recipe urls from a list of categories

# Scrape recipe page, save HTML locally, read local HTML

In [24]:
# Fetch one specific recipe page.
# Add time.sleep(1) between requests if the site is loading slowly.

base = "https://epicurious.com/recipes"
url = "/food/views/overnight-porridge-congee-chao-andrea-nguyen-vietnamese-rice-soup/"

# requests applies no timeout by default, so bound the wait explicitly, and
# fail loudly on an HTTP error so an error page is never saved and parsed as
# if it were a recipe.
r = requests.get(base + url, timeout=30)
r.raise_for_status()

# Save HTML locally
def save_html(html, path):
    """Write the raw HTML bytes ``html`` to ``path``, replacing any existing file."""
    with open(path, mode='wb') as out_file:
        out_file.write(html)
        
# Keep a copy of the raw response body on disk as recipe.html.
save_html(r.content, 'recipe.html')

# Read local HTML file
def open_html(path):
    """Return the entire contents of the file at ``path`` as bytes."""
    with open(path, mode='rb') as in_file:
        contents = in_file.read()
    return contents
    
    
# Read the saved page back from disk (open_html returns the file's bytes).
html = open_html('recipe.html')

In [6]:
# Soupify
# Parse the locally saved copy (`html`, read back from recipe.html) instead of
# the live response, so the saved file is actually used and this cell can be
# re-run from disk.
soup = bs(html, 'html.parser')

# Scrape the recipe for general information
(not scraped: special equipment, preparation, menus, related content)

In [None]:
### Do we want to scrape any of the sections listed above? If we save the raw HTML for each recipe we scrape, we can add them later.

In [None]:
# Accumulator for every scraped recipe; intended to be written out as JSON.
all_recipes = []

# Placeholder record for a single recipe. Eventually this runs once per
# recipe inside a loop:
# for recipe in recipes:
recipe = {}
# Field-extraction template carried over from the tutorial, kept for reference:
# d['name'] = row.select_one('.source-title').text.strip()
# d['allsides_page'] = 'https://www.allsides.com' + row.select_one('.source-title a')['href']
# d['bias'] = row.select_one('.views-field-field-bias-image a')['href'].split('/')[-1]
# d['agree'] = int(row.select_one('.agree').text)
# d['disagree'] = int(row.select_one('.disagree').text)
# d['agree_ratio'] = d['agree'] / d['disagree']
# d['agreeance_text'] = get_agreeance_text(d['agree_ratio'])

all_recipes.append(recipe)

In [11]:


# Headline: the recipe title is the <h1> inside the "title-source" div, and
# the author name is stored in the contributor element's `title` attribute.
title_block = soup.find('div', class_='title-source')
title = title_block.h1.text
contributor = soup.find(class_='contributor')
author = contributor['title']

# Publication date text is split on whitespace: the first token is kept as
# the month, the second is converted to an integer year.
date = soup.find(class_="pub-date").text.split()
month = date[0]
year = int(date[1])

# Reader feedback: the rating comes from a <meta content="..."> tag, the
# review count from the element text.
interactions = soup.find(class_="user-interactions")
rating = float(interactions.meta['content'])
reviews_text = soup.find(class_="reviews-count").text
reviews = int(reviews_text)

# These two are kept as the raw text found on the page (no numeric conversion).
prepare_again = soup.find('div', class_="prepare-again-rating")
make_again = prepare_again.span.text
active_time = soup.find('dd', class_="active-time").text

# Ingredients

In [12]:
# Collect the display text of every ingredient entry on the page.
# Each <li class="ingredient"> returned by find_all is already a parsed tag,
# so its text is read directly; re-parsing every item into a throwaway soup
# (as the earlier version did) was unused work.
lis = soup.find_all('li', class_="ingredient")

ingredients = [li.text for li in lis]

In [18]:
# Number of entries in the ingredients list.
ingr_len = len(ingredients)

# Nutritional Information

In [None]:
### TODO: figure out how to parse and organize the nutritional information

In [14]:
# Every nutrition value sits in a <span class="nutri-data">. Fetch the full
# list once for the entries that carry no itemprop attribute and therefore
# have to be picked by position.
nutri_spans = soup.findAll('span', class_="nutri-data")


def _nutri_text(itemprop):
    """Return the text of the nutri-data span with the given itemprop."""
    return soup.find('span', class_="nutri-data", itemprop=itemprop).text


# Calories are the only value converted to a number; the rest stay as text.
cal = int(_nutri_text("calories"))
carb = _nutri_text("carbohydrateContent")
fat = _nutri_text("fatContent")
protein = _nutri_text("proteinContent")
sat_fat = _nutri_text("saturatedFatContent")
sodium = _nutri_text("sodiumContent")
polyunsat_fat = nutri_spans[6].text  # no itemprop attribute; positional
fiber = _nutri_text("fiberContent")
monounsat_fat = nutri_spans[8].text  # no itemprop attribute; positional
# NOTE(review): the recorded output shows an empty cholesterol value, so
# confirm that position 9 is the right span.
cholesterol = nutri_spans[9].text  # no itemprop attribute; positional
servings = soup.find(class_="per-serving").text

In [15]:
# Nutrition facts gathered into one record. Only "cal" was converted to a
# number; every other value is the raw text scraped from the page.
nutrition = {
    "cal": cal,  # int
    "carb": carb,  # str
    "fat": fat,  # str
    "protein": protein,  # str
    "sat_fat": sat_fat,  # str
    "sodium": sodium,  # str
    "polyunsat_fat": polyunsat_fat,  # str
    "fiber": fiber,  # str
    "monounsat_fat": monounsat_fat,  # str
    "cholesterol": cholesterol,  # str
    "servings": servings,  # str
}

nutrition

{'cal': 127,
 'carb': '28 g(9%)',
 'fat': '0 g(1%)',
 'protein': '2 g(5%)',
 'sat_fat': '0 g(0%)',
 'sodium': '342 mg(14%)',
 'polyunsat_fat': '0 g',
 'fiber': '1 g(3%)',
 'monounsat_fat': '0 g',
 'cholesterol': '',
 'servings': 'per serving (4 servings)'}

# Tags

In [None]:
### What is the best way to save this data in a format that preserves the category and tag relationship?

In [16]:
# tags = soup.find('dl', class_='tags').a['href'].split("/")[1:]
# tags = {"category" : category, "tag" : tag}

# Each tag link has an href of the form "/<category>/<tag>"; it is stored as
# the list of its path segments, e.g. ['ingredient', 'rice'].
tags = []

dl_tags = soup.find_all('dl', class_='tags')

for dl_tag in dl_tags:
    # A single <dl class="tags"> holds one <a> per tag, so walk every link.
    # Reading only `dl_tag.a` would keep just the first tag of each list.
    for link in dl_tag.find_all('a'):
        tags.append(link['href'].split("/")[1:])

tag_len = len(tags)

# Exploratory: href of the first link in the last <dl> visited by the loop.
a = dl_tag.find_all('a')[0]['href']
a

dl_tags

[<dl class="tags"><a href="/ingredient/rice"><dt itemprop="recipeCategory">Rice</dt></a><a href="/cuisine/vietnamese"><dt itemprop="recipeCuisine">Vietnamese</dt></a><a href="/type/soup-stew"><dt itemprop="recipeCategory">Soup/Stew</dt></a><a href="/ingredient/ginger"><dt itemprop="recipeCategory">Ginger</dt></a><a href="/ingredient/green-onion-scallion"><dt itemprop="recipeCategory">Green Onion/Scallion</dt></a><a href="/special-consideration/healthy"><dt itemprop="recipeCategory">Healthy</dt></a><a href="/meal/breakfast"><dt itemprop="recipeCategory">Breakfast</dt></a><a href="/meal/lunch"><dt itemprop="recipeCategory">Lunch</dt></a></dl>]

# The whole shebang

In [22]:
# Full record for the scraped recipe, combining the headline metadata, the
# ingredient list, the nutrition record and the tags.
recipe_data = {
    "title": title,  # str
    "author": author,  # str
    "month": month,  # str
    "year": year,  # int
    "rating": rating,  # float
    "reviews": reviews,  # int
    "make_again": make_again,  # str as scraped; convert to a number if needed
    "active_time": active_time,  # str
    "ingr_len": ingr_len,  # int
    "ingredients": ingredients,  # list of str
    "nutrition": nutrition,  # dict
    "tags": tags,  # list of lists of str (href path segments)
}

# Save data as a JSON

In [23]:
# `all_recipes` only ever receives the empty placeholder dict, so the scraped
# `recipe_data` must be added here or it is never written to disk.
if recipe_data not in all_recipes:  # avoid duplicates when the cell is re-run
    all_recipes.append(recipe_data)

# Skip empty placeholder entries so the file contains only real recipes.
with open('recipe.json', 'w') as f:
    json.dump([entry for entry in all_recipes if entry], f)