# Web scraping

### Imports

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
import json
import re

## Next up: Allrecipes.com

In [2]:
# Download the top-level sitemap for allrecipes.com.
sitemap = requests.get('https://www.allrecipes.com/sitemap.xml')

In [3]:
sitemap

<Response [200]>

In [4]:
# Parse the sitemap body so that its <loc> elements can be selected below.
soup = BeautifulSoup(sitemap.content, "html.parser")



In [5]:
# Collect the text of every <loc> element, i.e. the URLs of the sub-sitemaps.
metapagelist = [loc.get_text() for loc in soup.select("loc")]

In [6]:
len(metapagelist)

4

In [7]:
metapagelist

['https://www.allrecipes.com/sitemap_1.xml',
 'https://www.allrecipes.com/sitemap_2.xml',
 'https://www.allrecipes.com/sitemap_3.xml',
 'https://www.allrecipes.com/sitemap_4.xml']

In [10]:
# Walk every sub-sitemap and gather all the page URLs it lists.
pagelist = []
for page in metapagelist:
    subpage = requests.get(page)
    soup = BeautifulSoup(subpage.content, 'html.parser')
    pagelist.extend(loc.get_text() for loc in soup.select("loc"))


In [11]:
len(pagelist)

62219

In [12]:
pagelist[1]

'https://www.allrecipes.com/recipe/15238/delicious-baked-chicken/'

In [13]:
# Keep only the URLs that match the recipe-page pattern.
newpagelist = [
    url for url in pagelist
    if re.search('https://www.allrecipes.com/recipe', url)
]

In [14]:
len(newpagelist)

53197

In [15]:
def getrecipe(recipeslist, timeout=30):
    """Scrape the name and ingredient list from each Allrecipes recipe page.

    Every page is expected to embed a ``<script type="application/ld+json">``
    block whose JSON payload is a list; the first element must provide the
    ``name`` and ``recipeIngredient`` keys. Pages that cannot be fetched or
    that do not match this layout are skipped.

    Args:
        recipeslist: Iterable of recipe page URLs.
        timeout: Seconds to wait for each HTTP response before giving up on
            that page, so a stalled connection cannot hang the whole run.

    Returns:
        A list of ``[name, url, ingredients]`` entries, one per page that was
        scraped successfully, in input order.
    """
    recipes = []

    # enumerate gives the true position directly; list.index() would rescan
    # the list on every page and report the wrong index for duplicate URLs.
    for index, page in enumerate(recipeslist):
        try:
            req = requests.get(page, timeout=timeout)
            soup = BeautifulSoup(req.content, "html.parser")
            data = json.loads(soup.find('script', type='application/ld+json').text)
            name = data[0]['name']
            ingredientlist = data[0]['recipeIngredient']

        # The try/except structure here has two benefits:
        # - It guards against a patchy internet connection: It just keeps on rolling until it's back.
        # - It will automatically skip pages that do not conform to the same json format (recipeIngredient)
        # `Exception` (not a bare except) keeps Ctrl-C / KeyboardInterrupt working.
        except Exception:
            print("No success, moving on.")
        else:
            recipes.append([name, page, ingredientlist])
            print(f"Done with index # {index}")

        # Polite wait time: a random pause of up to one second between requests.
        delay = randint(1, 4000) / 4000
        print("I will sleep for " + str(delay) + " seconds.")
        sleep(delay)

    return recipes

In [1]:
# Scrape every filtered Allrecipes URL; getrecipe pauses for a random interval after each page.
allrecipes1 = getrecipe(newpagelist)

In [47]:
# Wrap the scraped rows in a DataFrame. No column names are passed here,
# so pandas assigns its default integer column labels.
allrecipes_df = pd.DataFrame(allrecipes1)

In [48]:
# Write the results to disk without the row index.
# NOTE(review): '|' is presumably chosen as separator because ingredient text contains commas - confirm.
allrecipes_df.to_csv('allrecipes.csv', sep = '|', index = False)

## Next: Epicurious

In [16]:
# Download the Epicurious sitemap for editorial recipes.
sitemap = requests.get('https://www.epicurious.com/sitemap.xml/editorial-recipes')

In [17]:
# Parse the sitemap body so that its <loc> elements can be selected below.
soup = BeautifulSoup(sitemap.content, "html.parser")


In [18]:
# Collect the text of every <loc> element found in the parsed sitemap.
metapagelist = [loc.get_text() for loc in soup.select("loc")]

In [2]:
# Fetch each URL listed in the sitemap and gather the <loc> entries it contains.
newpagelist = []
for sitemap_url in metapagelist:
    subpage = requests.get(sitemap_url)
    soup = BeautifulSoup(subpage.content, "html.parser")
    newpagelist.extend(loc.get_text() for loc in soup.select("loc"))

In [20]:
len(newpagelist)

17052

In [21]:
newpagelist[0]

'https://www.epicurious.com/recipes/food/views/burnt-broccoli-and-crushed-olive-salad'

In [10]:
# Fetch the first recipe page as a sample to inspect its structure.
req = requests.get(newpagelist[0])

In [11]:
req

<Response [200]>

In [12]:
# Parse the sample page's HTML.
soup = BeautifulSoup(req.content, "html.parser")

In [14]:
# Decode the first JSON-LD <script> block found on the page.
data = json.loads(soup.find('script', type='application/ld+json').text)

In [17]:
data['recipeIngredient']

['1 large head of broccoli (about 1 lb.)',
 '2 Tbsp. extra-virgin olive oil, divided',
 '½ tsp. Diamond Crystal or ¼ tsp. Morton kosher salt',
 '2 oz. fresh goat cheese',
 '½ cup Castelvetrano olives, crushed, pits removed',
 '1 lemon']

In [22]:
def getrecipe(recipeslist, timeout=30):
    """Scrape the name and ingredient list from each Epicurious recipe page.

    Every page is expected to embed a ``<script type="application/ld+json">``
    block whose JSON payload is an object providing the ``name`` and
    ``recipeIngredient`` keys. Pages that cannot be fetched or that do not
    match this layout are skipped.

    Args:
        recipeslist: Iterable of recipe page URLs.
        timeout: Seconds to wait for each HTTP response before giving up on
            that page, so a stalled connection cannot hang the whole run.

    Returns:
        A list of ``[name, url, ingredients]`` entries, one per page that was
        scraped successfully, in input order.
    """
    recipes = []

    # enumerate gives the true position directly; list.index() would rescan
    # the list on every page and report the wrong index for duplicate URLs.
    for index, page in enumerate(recipeslist):
        try:
            req = requests.get(page, timeout=timeout)
            soup = BeautifulSoup(req.content, "html.parser")
            data = json.loads(soup.find('script', type='application/ld+json').text)
            name = data['name']
            ingredientlist = data['recipeIngredient']

        # The try/except structure here has two benefits:
        # - It guards against a patchy internet connection: It just keeps on rolling until it's back.
        # - It will automatically skip pages that do not conform to the same json format (recipeIngredient)
        # `Exception` (not a bare except) keeps Ctrl-C / KeyboardInterrupt working.
        except Exception:
            print("No success, moving on.")
        else:
            recipes.append([name, page, ingredientlist])
            print(f"Done with index # {index}")

        # Polite wait time: a random pause of up to one second between requests.
        delay = randint(1, 4000) / 4000
        print("I will sleep for " + str(delay) + " seconds.")
        sleep(delay)

    return recipes

In [3]:
# Batch 1: the first 5000 recipe URLs.
# NOTE(review): the call is commented out, presumably to avoid re-running the
# long scrape; `epicurious1` must already exist for the DataFrame cells below.
#epicurious1 = getrecipe(newpagelist[:5000])

In [4]:
# Batch 2: recipe URLs 5000-9999.
# NOTE(review): commented out, presumably to avoid re-running the long scrape.
#epicurious2 = getrecipe(newpagelist[5000:10000])

In [27]:
# Turn the first batch of scraped rows into a DataFrame.
epi1 = pd.DataFrame(epicurious1)


In [28]:
# Turn the second batch of scraped rows into a DataFrame.
epi2 = pd.DataFrame(epicurious2)

In [29]:
# Stack the first two batches row-wise; each frame keeps its own row labels.
epimix = pd.concat([epi1, epi2], axis = 0)

In [31]:
# Name the columns to match the [name, page, ingredientlist] rows built by getrecipe.
epimix.columns = ['name','url','ingredients']

In [33]:
# Intermediate save of the first two batches. The same file name is written
# again further down once all four batches are combined.
epimix.to_csv('epicurious.csv', index = False, sep = '|')

In [5]:
# Batch 3: recipe URLs 10000-14999.
# NOTE(review): commented out, presumably to avoid re-running the long scrape.
#epicurious3 = getrecipe(newpagelist[10000:15000])

In [6]:
# Batch 4: every remaining recipe URL from index 15000 onwards.
# NOTE(review): commented out, presumably to avoid re-running the long scrape.
#epicurious4 = getrecipe(newpagelist[15000:])

In [37]:
# Turn the third and fourth batches of scraped rows into DataFrames.
epi3 = pd.DataFrame(epicurious3)
epi4 = pd.DataFrame(epicurious4)

In [38]:
# Stack all four batches row-wise into the complete Epicurious data set.
epicurious = pd.concat([epi1, epi2, epi3, epi4], axis = 0)

In [40]:
# Name the columns to match the [name, page, ingredientlist] rows built by getrecipe.
epicurious.columns = ['name', 'url', 'ingredients']

In [41]:
# Final save: overwrites the earlier two-batch 'epicurious.csv' with the full data set.
epicurious.to_csv('epicurious.csv', index = False, sep = '|')

In [42]:
len(epicurious)

16799