In [7]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from bs4.element import Tag
import bs4
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import re
from lxml import html
import lxml
import collections.abc as clct

In [8]:
%load_ext autoreload
%autoreload 2

# Bytes Into Baking

## Project Overview
- Food is one of the most common topics on the internet, with content being published on the web by big businesses such as *foodnetwork* and *allrecipes* to home chefs writing their own blogs. 
- Food is trendy--keeping on top of food trends could be valuable to people who work in the food or food publishing industries.
- The purpose of this project is to gather recipe data from a variety of different websites and then use supervised and unsupervised NLP models to discover useful information from the data

## Project Phases
- Create a utility web scraper to capture recipes from a variety of different websites
- Use ML to compare and contrast recipes

## Goals
- Generate a list of target websites for a particular recipe class by pulling urls from Google search
- Develop a utility scraper to identify recipes and then grab pertinent information about the recipe from the recipe section--starting with instructions
- Test the results of the scraper on a supervised model that classifies the vectorized recipes
- Explore the possibiliity of identifying different groupings within a specific recipe class

## Tools and techniques used in this project
- **Tools**
> - Python, Beautiful Soup, Pandas, Numpy, Gensim
- **Visualization**
> - Matplotlib, Plotly
- **Techniques**
> - Web-scraping, K Fold cross validation, Multinomial Naive Bayes Classification, Non-negative Matrix Factoring (NMF)

## In this Notebook (phase one of the project)
- Web recipe scraper
- Uses Google search to gather target website urls
- Uses scraper.py to scrape the urls for recipes


## Step one
### Get links for the target search term
- The websites that feed into the recipe scraper were obtained from a Google search of a specific recipe. This approach was deemed to be superior to relying upon a public index of recipe websites. The purpose of this project is to dive deep into a particular recipe class and hopefully find those hidden gem recipes in that class that may not appear in a public index. For example, one of the best croissant recipe websites can be found on Gourmetier. The recipe on this website is rich with insight written by someone who has made thousands of croissants and has a passion for education. This website does not have the breadth of recipes like *allrecipes* or *foodnetwork*, but the recipes that appear on it are gems. These recipes are more important--as measured by their utility to someone who actually wants to know how to make a great croissant--than those that appear on mass recipe websites. Gourmetier and other 'deep' sites like it absolutely need to be included if they happen to have a recipe in a target class. Google search provided the most likely first step to finding websites and recipes like these.

In [24]:
def get_links_from_google_search(google_url):
    '''
    Given a Google search string, returns links from that Google search page
    Parameters:
    ----------
    google_url (str): a google search string. To get past the first page, replace start=0 with start={bundle}.
    Returns:
    -------
    links (list): a list of links 
    '''
    links_ = []
    driver = webdriver.Firefox()
    time.sleep(3) # slow requests down to keep google from getting upset
    driver.get(google_url)
    soup = BeautifulSoup(driver.page_source,'lxml')
    result_div = soup.find_all('div', attrs={'class': 'g'})
    for r in result_div:
            try:
                link = r.find('a', href=True)
                if link != '':
                    # site exceptions
                    if 'google.' not in link and 'costco.' not in link and 'freshdirect' not in link: 
                        links_.append(link['href'])
                
            except Exception as e:
                print(e)
                continue
    return(links_)


In [40]:
def run_remote_google_search(search_string, start=0, num_search_pages=12):
    '''
    Takes in a preformatted Google search string, a starting number, and num_search_pages which yields 10 urls per search page.
    Returns a list of urls that is num_search_pages * 10 long unless it reaches the end of the Google search first.
    '''
    page_bundles = list(range(start,num_search_pages*10, 10))
    links=[]
    for bundle in page_bundles:
        formatted_string = search_string.replace('start=10', f'start={bundle}')
        links_list = get_links_from_google_search(formatted_string)
        for link in links_list:
            links.append(link)
    return links

In [41]:
#  To change the search term, run a search in Google, go to the second page of the search, and then copy the Google search address. 
#  Assign search string to variable.

us_pie_crust = 'https://www.google.com/search?q=how+to+make+croissants&client=ubuntu&hs=KiU&channel=fs&sxsrf=ALeKk02N149o2b-eULZqvU-6oYKKKGnsqA:1598729557213&ei=Va1KX_PLDIO_0PEP9ue5yAU&start=10&sa=N&ved=2ahUKEwizyuXak8HrAhWDHzQIHfZzDlkQ8NMDegQIDhBA&biw=1637&bih=942'
# us_brioche = f'https://www.google.com/search?q=how+to+make+brioche+dough&client=ubuntu&hs=Do3&channel=fs&sxsrf=ALeKk03eqUHI6tXD0_aUHZKqzJUzujjeZQ:1598546700864&ei=DONHX56eNIXB-wS2mZvwBw&start=10&sa=N&ved=2ahUKEwie1IvC6rvrAhWF4J4KHbbMBn4Q8NMDegQIDhBA&biw=1637&bih=942'
# us_puff_pastry = f'https://www.google.com/search?q=how+to+make+puff+pastry+dough&client=ubuntu&hs=GNM&channel=fs&sxsrf=ALeKk011tOIIja512_JwZ8zfTmHSiLc9HA:1598459179879&ei=K41GX96hNcP7-gTrwZCYAg&start=10&sa=N&ved=2ahUKEwie4-q8pLnrAhXDvZ4KHesgBCMQ8NMDegQIDhBA&biw=1637&bih=942'
# us_ciambellone = f'https://www.google.com/search?q=how+to+make+ciambellone&client=ubuntu&hs=GlP&channel=fs&sxsrf=ALeKk03eb_8FF-pe9XNVMG56gUih10WpYA:1598472199741&ei=B8BGX83oLIvJ0PEPr-2ryAQ&start=10&sa=N&ved=2ahUKEwiNiJj91LnrAhWLJDQIHa_2CkkQ8NMDegQIDBBA&biw=1637&bih=942'
# us_google_url=f"https://www.google.com/search?q=croissant+baking+temp&client=ubuntu&hs=bju&channel=fs&tbas=0&sxsrf=ALeKk01tinXVzgWJgZhSeeUfjrJd4FW4oA:1597161338731&ei=er8yX--YLLi50PEPiYu28A4&start=10&sa=N&ved=2ahUKEwjvvfnRwZPrAhW4HDQIHYmFDe4Q8NMDegQIDhBA&biw=1920&bih=969"
# fr_google_url=f"https://www.google.fr/search?q=temp%C3%A9rature+cuisson+croissant&ei=0PEzX4C2PMXk-gTv_ImwCA&start=10&sa=N&ved=2ahUKEwiA3szk5ZXrAhVFsp4KHW9-AoYQ8NMDegQIDRBC&biw=1848&bih=942"
# uk_google_url=f"https://www.google.co.uk/search?q=croissant+temperature&ei=OPQzX8T0KMj4-gSnn5moBw&start=10&sa=N&ved=2ahUKEwjE8JaK6JXrAhVIvJ4KHadPBnUQ8NMDegQIDhA_&biw=1848&bih=942"

pie_crust_links = run_remote_google_search(us_pie_crust, 0, 2)

In [None]:
# Confirm function ran properly and grabbed links
pie_crust_links

### Write results to file from Google search function

In [46]:
# CAUTION: Be sure to change the input variable list and the output file name prior to running.

pd.DataFrame(pie_crust_links).to_csv('data/us-piecrust-links.txt')

## Step two
### A focus on pastry recipes
- I chose to start with pastry recipes since I come with an indepth knowledge of this space. I've explored hundreds of pastry recipe websites over my career.
- I chose five categories of pastry and baked goods recipes that had what I considered to be important differences, yet enough similarity that they might confuse a model.
> - Brioche
> - Ciambellone
> - Croissants
> - Pie crust
> - Puff pastry

### Numerous ways to format a recipe, but some intriguing similarities make a utility scraper possible
- My initial Google search website 'spider' yielded more than recipe websites--I needed a scraper that would limit the possibility of seeing a recipe when one wasn't there
- More than a third of recipes I encountered follow a schema format promoted by Yoast, a website schema publisher to aid in SEO optimization--useful tags and a json structure with standardized keys for things like recipe ingredients and instructions (Yeah!)
- Another 40-50% placed the body of their recipe in a script tag with one of several commonly used attributes. This wasn't as clean of an approach--I usually ended up with extra text--but I still was able to get some results
- The rest, well, some didn't want to be scraped, and some had unusual structures. I could spend a lot of time chasing the tail with limited utility.
- Results:

| Recipe | First pass links from Google search | Usable recipes obtained |  Yield |
| :---: | :---: |  :---: |  :---: |
| Brioche     | 112   |  55     | 49% |
| Ciambellone | 116   |  55     |  47% |
| Croissant   | 180   |  72     | 40% |
| Puff pastry | 200   |  86     | 43% |
| Pie crust   | 122   |  74     | 60% |

### Read in links file and initiate web scrape

In [47]:
# url_list = pd.read_csv('data/fr-croissant-links.txt',header=None)
# url_list = pd.read_csv('data/uk-croissant-links.txt',header=None)
url_piecrust = pd.read_csv('data/us-piecrust-links.txt',header=None)
url_brioche = pd.read_csv('data/us-brioche-links.txt',header=None)
url_puff = pd.read_csv('data/us-puff-pastry-links.txt',header=None)
url_csnt = pd.read_csv('data/us-croissant-links.txt', header=None)
url_cake = pd.read_csv('data/us-ciambellone-links.txt', header=None)

search_string_1 = r'([A-Z][^.]*(?:reheat|ake|oven)[^.]*\d{3}(?:º|°|F| ºF| °F| F|C| ºC| °C| C)[^.]*(?:[.]|[\s]))' #US regex
search_string_2 = r'(\d{3})'

In [48]:
from src.scraper import *

piecrust_scraper = ScrapeRecipe(page_genus='laminated', page_species='pie_crust', re_pattern_1=search_string_1, re_pattern_2= search_string_2)

req_list = []
for i in range(1,len(url_piecrust[0])):
    req_list.append(piecrust_scraper.process_url(url_piecrust[1][i], verbose=True, extract_lang=True))
    




  soup = BeautifulSoup(req.content, 'html')


 Success with code 200 at URL: https://sallysbakingaddiction.com/homemade-croissants/
 Success with code 200 at URL: https://www.jocooks.com/recipes/homemade-croissants/
 Success with code 200 at URL: https://sallysbakingaddiction.com/homemade-croissants/
 Success with code 200 at URL: https://sallysbakingaddiction.com/homemade-croissants/
 Success with code 200 at URL: https://www.youtube.com/watch?v=beOxJm1_tbk




  soup = BeautifulSoup(call.content)


 Failure with code 200 at URL: https://www.youtube.com/watch?v=e5O3TdgYsE4
 Failure with code 200 at URL: https://www.mybluprint.com/article/croissant-filling-ideas
 Success with code 200 at URL: https://bakingamoment.com/easy-homemade-croissant-recipe/
 Success with code 200 at URL: https://www.halfbakedharvest.com/homemade-croissants-step-step-photos/
 Success with code 200 at URL: https://www.kingarthurbaking.com/recipes/bakers-croissants-recipe
 Success with code 200 at URL: https://www.thekitchn.com/how-to-make-croissants-138921
 Success with code 200 at URL: https://www.finecooking.com/recipe/classic-croissants
 Failure with code 301 at URL: http://www.saveur.com/best-croissants-recipe/
 Success with code 200 at URL: https://www.youtube.com/watch?v=hJxaVD6eAtc
 Failure with code 200 at URL: https://www.weekendbakery.com/posts/classic-french-croissant-recipe/
 Success with code 200 at URL: https://www.foodnetwork.com/recipes/food-network-kitchen/homemade-croissants-5277802
 Succes

In [49]:
req_list

[{'page_genus': 'laminated', 'page_family': '', 'page_species': 'pie_crust', 'url': 'https://sallysbakingaddiction.com/baking-basics-homemade-buttery-flaky-pie-crust/', 'language': 'en', 'recipe_name': 'Homemade Buttery Flaky Pie Crust', 'cook_time': '', 'ingredients': '', 'instructions': 'Mix the flour and salt together in a large bowl. Add\xa0the butter and shortening. Using a pastry cutter\xa0(the one I own) or two forks, cut the butter and shortening into\xa0the mixture until it resembles coarse meal (pea-sized bits with a few larger bits of fat is OK). A\xa0pastry cutter makes this step very easy and quick. Measure 1/2 cup (120ml) of water in a cup. Add ice. Stir it around. From that, measure 1/2 cup (120ml) of water&#8211; since the ice has melted a bit. Drizzle the cold water in, 1 Tablespoon (15ml) at a time, and stir with a rubber\xa0spatula or wooden spoon after every Tablespoon (15ml) added.\xa0Do not add any more water than you need to. Stop adding water when the dough begi

In [59]:
BeautifulSoup(requests.get(req_list[4].url).content)

<!DOCTYPE html>
<!--[if IE 8]>
<html class="ie8" lang="en-US" prefix="og: http://ogp.me/ns#"> <![endif]--><!--[if IE 9]>
<html class="ie9" lang="en-US" prefix="og: http://ogp.me/ns#"> <![endif]--><!--[if !( IE 8 ) | !( IE 9 ) ]><!--><html lang="en-US" prefix="og: http://ogp.me/ns#"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1.0" name="viewport"/>
<link href="//gmpg.org/xfn/11" rel="profile"/>
<link href="https://www.ladybehindthecurtain.com/xmlrpc.php" rel="pingback"/>
<!-- Weaver Xtreme Standard Google Fonts for page-type: single -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:400%2C700%2C700italic%2C400italic%7COpen+Sans+Condensed:300%2C700%7CAlegreya+SC:400%2C400i%2C700%2C700i%7CAlegreya+Sans+SC:400%2C400i%2C700%2C700i%7CAlegreya+Sans:400%2C400i%2C700%2C700i%7CAlegreya:400%2C400i%2C700%2C700i%7CDroid+Sans:400%2C700%7CDroid+Serif:400%2C400italic%2C700%2C700italic%7CExo+2:400%2C700%7CLato:400%2C400italic%2C700%2C7

In [50]:
good_url, bad_url, instr, target = [], [], [], []
hits, misses = 1,1
for page in req_list:
    if page != None:
        if page.instructions:
            if len(page.instructions)>0:
                good_url.append(page.url)
                instr.append(page.instructions)
                target.append(page.page_species)
                hits += 1
                       
            else:
                print(page.url)
                print("NO DATA")
                bad_url.append(page.url)
                misses += 1
print(f': Hits {hits}  Misses {misses}')

df_piecrust = pd.DataFrame([good_url, instr, target]).T
# df = pd.DataFrame([good_url, instr, init_temp]).T

: Hits 75  Misses 1


In [58]:
instr

['Mix the flour and salt together in a large bowl. Add\xa0the butter and shortening. Using a pastry cutter\xa0(the one I own) or two forks, cut the butter and shortening into\xa0the mixture until it resembles coarse meal (pea-sized bits with a few larger bits of fat is OK). A\xa0pastry cutter makes this step very easy and quick. Measure 1/2 cup (120ml) of water in a cup. Add ice. Stir it around. From that, measure 1/2 cup (120ml) of water&#8211; since the ice has melted a bit. Drizzle the cold water in, 1 Tablespoon (15ml) at a time, and stir with a rubber\xa0spatula or wooden spoon after every Tablespoon (15ml) added.\xa0Do not add any more water than you need to. Stop adding water when the dough begins to form large clumps. I always use about\xa01/2 cup (120ml) of water and a little more in dry winter months (up to 3/4 cup). Transfer the pie dough\xa0to\xa0a floured work surface. The dough should come together easily and should not feel overly sticky. Using floured\xa0hands, fold the

In [57]:
df_piecrust.iloc[:, 1]

0     Mix the flour and salt together in a large bow...
1     Add&nbsp;flour, salt to a large mixing bowl.&n...
2     Add&nbsp;flour, salt to a large mixing bowl.&n...
3     Pour cold water into a small dish with a few p...
4     In the bowl of a food processor; add the flour...
                            ...                        
69    1. Place the first 3 ingredients in a food pro...
70     If you are looking for a from scratch pie cru...
71    I put the bowl of the food processor in the fr...
72    In a large bowl with a pastry blender or two k...
73    In a food processor, pulse together the flour,...
Name: 1, Length: 74, dtype: object

### Write search results to file

In [53]:
# Write results to csv

us_piecrust = pd.DataFrame(df_piecrust)
us_piecrust.to_csv('data/us_piecrust.csv')

# us_brioche = pd.DataFrame(df_brioche)
# us_brioche.to_csv('data/us_brioche.csv')

# us_puff = pd.DataFrame(df_puff)
# us_puff.to_csv('data/us_puff.csv')

# us_croissant = pd.DataFrame(df_csnt)
# us_croissant.to_csv('data/us_croissant.csv')

# us_ciambellone = pd.DataFrame(df_cake)
# us_ciambellone.to_csv('data/us_ciambellone.csv')


In [59]:
us_brioche

Unnamed: 0,0,1,2
0,https://www.allrecipes.com/recipe/17486/brioche/,"In a small bowl, dissolve yeast in warm water....",brioche
1,https://www.bonappetit.com/recipe/brioche-dough,Combine 1/4 cup warm water and warm milk in bo...,brioche
2,https://www.delish.com/cooking/recipe-ideas/a2...,Make the sponge: To the bowl of a stand mixer ...,brioche
3,https://www.allrecipes.com/recipe/17486/brioche/,"In a small bowl, dissolve yeast in warm water....",brioche
4,https://www.finecooking.com/recipe/authentic-b...,In a stand mixer fitted with the paddle attach...,brioche
5,https://www.kingarthurbaking.com/recipes/brioc...,In a stand mixer or bread machine (programmed ...,brioche
6,https://www.fifteenspatulas.com/homemade-brioc...,"Start with the sponge. Place the milk, yeast, ...",brioche
7,https://www.epicurious.com/recipes/food/views/...,YieldMakes about 1 1/4 poundsIngredientsFor st...,brioche
8,https://www.thekitchn.com/how-to-make-brioche-...,Let the butter soften: About an hour before yo...,brioche
9,https://www.food.com/recipe/brioche-51546,"Put the milk, yeast, egg and 1 cup of the flou...",brioche
