# Epicurious Recipe Webscrape

Guide to Web Scraping with Python Part 1: Requests and BeautifulSoup
https://www.learndatasci.com/tutorials/ultimate-guide-web-scraping-w-python-requests-and-beautifulsoup/

In [118]:
from bs4 import BeautifulSoup
import time
from time import sleep
import requests
import json
import re

# Scrape recipe URLs from a list of categories

# Scrape recipe page, save HTML locally, read local HTML (DONE)

In [104]:
### Do you want to save all of the scraped HTML files?

In [105]:
# Fetch one specific recipe page. If the site is loading slowly, pause between
# requests with time.sleep(1).

base = "https://epicurious.com/recipes"
url = "/food/views/overnight-porridge-congee-chao-andrea-nguyen-vietnamese-rice-soup/"

# requests applies no timeout by default, so a stalled connection would hang
# the notebook indefinitely; cap the wait explicitly.
r = requests.get(base + url, timeout=30)

# Fail loudly on a 4xx/5xx response so an error page is never saved to disk
# and parsed as if it were a recipe.
r.raise_for_status()

# Save HTML locally
def save_html(html, path):
    """Write the raw bytes in ``html`` to the file at ``path``.

    The file is opened in binary mode and any existing content is replaced.
    """
    with open(path, mode='wb') as handle:
        handle.write(html)
        
# Keep a local copy of the downloaded page body
save_html(html=r.content, path='recipe.html')

# Read local HTML file
def open_html(path):
    """Return the full contents of the file at ``path`` as bytes."""
    with open(path, mode='rb') as handle:
        contents = handle.read()
    return contents
    
    
# Load the saved page back from disk as bytes
html = open_html(path='recipe.html')

In [106]:
# Soupify the locally saved copy (same bytes as r.content), so the parsing
# cells can be re-run without requesting the page again.
soup = BeautifulSoup(html, 'html.parser')

# Scrape the recipe for general information (DONE)
(not scraped: special equipment, preparation, menus, related content)

In [107]:
### Do you want to scrape any of the sections listed above?

In [108]:
# General recipe information, collected into one dict keyed by field name.
recipe = {}
recipe['title'] = soup.find('div', class_='title-source').h1.text                 # string
recipe['author'] = soup.find(class_='contributor')['title']                       # string

# The publication date text is split on whitespace once and reused below,
# e.g. ['November', '2019']; index 0 is the month and index 1 the year.
date = soup.find(class_="pub-date").text.split()
recipe['date'] = date                                                             # list of strings
recipe['month'] = date[0]                                                         # string
recipe['year'] = int(date[1])                                                     # int

recipe['rating'] = float(soup.find(class_="user-interactions").meta['content'])   # float
recipe['reviews'] = int(soup.find(class_="reviews-count").text)                   # int
recipe['make_again'] = soup.find('div', class_="prepare-again-rating").span.text  # string, e.g. '0%'
recipe['active_time'] = soup.find('dd', class_="active-time").text                # string

# Ingredients (DONE)

In [109]:
# Each ingredient is an <li class="ingredient">; its text is the ingredient line.
lis = soup.find_all('li', class_="ingredient")

# The Tag objects returned by find_all already expose .text, so there is no
# need to re-parse each <li> into a new soup.
ingredients = [li.text for li in lis]

recipe['ingredients'] = ingredients  # list of strings

In [110]:
# Number of ingredient lines collected for this recipe
recipe.update(ingr_len=len(ingredients))  # int

# Nutritional Information

In [122]:
# Pull every run of digits out of the carbohydrate text (e.g. ['28', '9']).
# The pattern is a raw string so that \d reaches the regex engine untouched
# instead of being treated as an (invalid) string escape.
carb_text = soup.find('span', class_="nutri-data", itemprop="carbohydrateContent").text
carb = re.findall(r'\d+', carb_text)
carb

['28', '9']

In [116]:
# Nutrition facts, collected into their own dict and attached to the recipe.
nutrition = {}

# Collect every nutrition <span> once. Three of the values below carry no
# itemprop attribute, so they can only be reached by position in this list.
nutri_spans = soup.find_all('span', class_="nutri-data")

nutrition['cal'] = int(soup.find('span', class_="nutri-data", itemprop="calories").text)
nutrition['carb'] = soup.find('span', class_="nutri-data", itemprop="carbohydrateContent").text
nutrition['fat'] = soup.find('span', class_="nutri-data", itemprop="fatContent").text
nutrition['protein'] = soup.find('span', class_="nutri-data", itemprop="proteinContent").text
nutrition['sat_fat'] = soup.find('span', class_="nutri-data", itemprop="saturatedFatContent").text
nutrition['sodium'] = soup.find('span', class_="nutri-data", itemprop="sodiumContent").text
nutrition['polyunsat_fat'] = nutri_spans[6].text  # no itemprop attribute
nutrition['fiber'] = soup.find('span', class_="nutri-data", itemprop="fiberContent").text
nutrition['monounsat_fat'] = nutri_spans[8].text  # no itemprop attribute
nutrition['cholesterol'] = nutri_spans[9].text  # no itemprop attribute
nutrition['servings'] = soup.find(class_="per-serving").text

# 'cal' is an int; every other value is the raw text, e.g. '0 g(1%)'
recipe['nutrition'] = nutrition


# Tags

In [85]:
### What is the best way to save this data in a format that preserves the category and tag relationship?

In [86]:
# tags = soup.find('dl', class_='tags').a['href'].split("/")[1:]
# tags = {"category" : category, "tag" : tag}

# Collect every tag link as a [category, tag] pair.
tags = []

dl_tags = soup.find_all('dl', class_='tags')

for dl_tag in dl_tags:
    # Walk every <a> inside the <dl>; reading only dl_tag.a returns just the
    # first link, which is why a single tag group used to come back.
    for anchor in dl_tag.find_all('a', href=True):
        # An href such as '/cuisine/vietnamese' becomes ['cuisine', 'vietnamese']
        tags.append(anchor['href'].split("/")[1:])

# tag_len = len(tags)

# Spot check: href of the second link in the last <dl> visited by the loop
a = dl_tag.find_all('a')[1]['href']
a

# dl_tags

'/cuisine/vietnamese'

# The whole shebang (DONE)

In [88]:
# Display the assembled recipe dict (the cell output follows below)
recipe

{'title': 'Super-Simple Overnight Porridge ',
 'author': 'Andrea Nguyen',
 'date': ['November', '2019'],
 'month': 'November',
 'year': 2019,
 'rating': 0.0,
 'reviews': 0,
 'make_again': '0%',
 'active_time': '30 minutes, plus overnight soaking',
 'ingr_len': 7,
 'nutrition': {'cal': 127,
  'carb': ['28', 'g(9%)'],
  'fat': '0 g(1%)',
  'protein': '2 g(5%)',
  'sat_fat': '0 g(0%)',
  'sodium': '342 mg(14%)',
  'polyunsat_fat': '0 g',
  'fiber': '1 g(3%)',
  'monounsat_fat': '0 g',
  'cholesterol': '',
  'servings': 'per serving (4 servings)'},
 'ingredients': ['2 cups packed cooked white rice',
  'About 5 cups chicken stock, vegetable stock, or store-bought chicken or vegetable broth',
  '2 cups water, plus more as needed',
  '3 thick slices unpeeled ginger, bruised',
  '2 green onions, white parts kept whole, green parts cut into thin rings',
  'About ½ teaspoon fine sea salt',
  'Recently ground black pepper (optional)']}

# Save recipe as a JSON (DONE)

In [89]:
# Write the single recipe dict to recipe.json. This cell used to dump
# all_recipes, which is not defined until a later cell.
with open('recipe.json', 'w') as f:
    json.dump(recipe, f)

# Save multiple recipes to the category search JSON

In [None]:
### Need to decide how to organize this information

In [87]:
# Start the collection of scraped recipes with the one recipe gathered so far
all_recipes = [recipe]