In [1]:
pip install bs4

You should consider upgrading via the '/Users/elvirabernadet/opt/anaconda3/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
from urllib import request
from urllib.parse import urljoin

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Root of the site to crawl; site-relative paths are appended to it.
URL = "https://www.food.com"

# Download robots.txt; the `with` block closes the HTTP response once it is read.
with request.urlopen(URL + "/robots.txt") as response:
    robots = response.read().decode('utf8')

# Normalise Windows line endings so the blank-line split below still finds the sections.
robots = robots.replace("\r\n", "\n")

# Keep only the second blank-line-separated section of the file.
# NOTE(review): presumably this is the rule group that applies to this crawler --
# confirm against the live robots.txt, whose layout may change.
robots = robots.split("\n\n")[1]

In [4]:
# Turn every "Disallow:" rule of the selected robots.txt section into a regular
# expression that starts at the site root.
disallowed = []
for line in robots.split("\n"):
    m = re.match(r'^Disallow: (.*)$', line.strip())
    if not m:
        continue
    rule = m.group(1)

    # drop a single trailing slash so "/dir/" and "/dir" produce the same pattern
    if rule.endswith("/"):
        rule = rule[:-1]

    # a trailing "$" in robots.txt marks the end of the URL; keep it as a regex anchor
    anchored = rule.endswith("$")
    if anchored:
        rule = rule[:-1]

    # Escape the literal pieces (so "?", ".", "+", ... lose their regex meaning)
    # and let the robots.txt wildcard "*" match any run of characters.
    pattern = ".*".join(re.escape(piece) for piece in rule.split("*"))
    if anchored:
        pattern += "$"

    # site-relative rules are prefixed with the site root
    if not pattern.startswith("http"):
        pattern = URL + pattern
    disallowed.append(pattern)

In [5]:
# Open the main page of Food.com, unless robots.txt closes off the site root.
# Failing here with an explicit message is clearer than leaving `food` undefined
# and hitting a NameError in the cell that parses it.
if (URL in disallowed) or ("/" in disallowed):
    raise RuntimeError("robots.txt disallows crawling " + URL)

# the `with` block closes the HTTP response once the body has been read
with request.urlopen(URL) as response:
    food = response.read().decode('utf8')

In [6]:
# Retrieve the URL of each cuisine's main page from the Food.com front page.

url_set = dict()

cuisine = ('mexican', 'italian', 'indian', 'thai', 'korean', 'french', 'latin-american', 'chinese', 'japanese', 'spanish')

# use Beautiful Soup to find all <a> tags in the page
for atag in BeautifulSoup(food, "html.parser").find_all('a'):
    href = atag.get('href')
    if href is None:
        continue

    # The robots.txt patterns are absolute URLs, so resolve the link against the
    # site root before testing it (this also copes with already-absolute links).
    page_url = urljoin(URL, href)

    # A Disallow rule covers the URL it names and everything underneath it,
    # so accept an optional "/...", "?..." or "#..." continuation after the rule.
    if any(re.match(rule + r'(?:[/?#].*)?$', page_url) for rule in disallowed):
        continue

    for c in cuisine:
        # some cuisine pages use the "<cuisine>-food" pattern, others "<cuisine>-recipe"
        if c + '-food' in href or c + '-recipe' in href:
            url_set[c] = page_url

In [7]:
# Retrieve the content of each cuisine's main page.

link_contents = {}

# The robots.txt check does not depend on the cuisine, so it is done once up front.
if (not URL in disallowed) and (not "/" in disallowed):
    for i in cuisine:
        # the `with` block closes each HTTP response once its body has been read
        with request.urlopen(url_set[i]) as response:
            link_contents[i] = response.read().decode('utf8')

In [8]:
# Retrieve the recipe links found on each cuisine's main page.

links = {}

for i in cuisine:
    recipes = []
    seen = set()
    for atag in BeautifulSoup(link_contents[i], "html.parser").find_all('a'):
        href = atag.get('href')
        if href is None or 'www.food.com/recipe/' not in href:
            continue

        # resolve protocol-relative links ("//www.food.com/...") to a full URL
        # so they can be checked against robots.txt and opened later
        recipe_url = urljoin(URL, href)

        # A Disallow rule covers the URL it names and everything underneath it,
        # so accept an optional "/...", "?..." or "#..." continuation after the rule.
        if any(re.match(rule + r'(?:[/?#].*)?$', recipe_url) for rule in disallowed):
            continue

        # the same recipe may be linked more than once on a page; keep one entry
        # per URL so it is not downloaded and exported twice
        if recipe_url in seen:
            continue
        seen.add(recipe_url)
        recipes.append(recipe_url)

    links[i] = recipes

In [27]:
# Retrieve the full recipes.
# This may take some time to run.

allrecipes = []

def crawl_recipe(c):
    """Download every recipe linked for cuisine `c` and append it to `allrecipes`.

    Each appended row is `[title, ingredients, c]`. Pages that cannot be
    downloaded, or that do not contain the expected title block and ingredient
    marker, are reported and skipped so one bad page does not abort the crawl.
    """
    # markers surrounding the ingredient list inside the page's inline scripts
    start = 'window.mdManager.setParameter("ingredients", "'
    end = '");window.mdManager.setParameter("keywordids",'

    for link in links[c]:
        # read the content of the web page; the `with` block closes the response
        try:
            with request.urlopen(link) as response:
                content = response.read()
        except OSError as err:  # urllib's URLError / HTTPError derive from OSError
            print("skipping", link, "-", err)
            continue
        soup = BeautifulSoup(content, 'lxml')

        # locate the dish title and the script text holding the ingredients
        title_box = soup.find('div', class_='layout__item title svelte-mq22ro')
        scripts = str(soup.find_all('script'))
        if title_box is None or title_box.h1 is None or start not in scripts:
            print("skipping", link, "- unexpected page layout")
            continue

        title = title_box.h1.text
        ingredients = scripts.split(start)[1].split(end)[0]

        # one row per recipe, with the cuisine name as identifier
        allrecipes.append([title, ingredients, c])


# crawl each cuisine
for item in cuisine:
    crawl_recipe(item)

In [28]:
# Export the final result to CSV.

headerList = ['Name', 'Ingredients', 'Cuisine']
# Naming the columns when the frame is built keeps the export working even when
# no recipe was collected; passing the names to to_csv(header=...) instead fails
# on an empty frame because it has no columns to rename.
df = pd.DataFrame(allrecipes, columns=headerList)
df.to_csv('all_recipes.csv', index=False)