# Web scraping

### Imports

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
import json
import re

I'll try to scrape recipes from the NYT website. I found the sitemap that contains links to pages with all the recipes:

In [None]:
# Download the sitemap index. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error instead
# of letting an error page be parsed as if it were the sitemap.
sitemap = requests.get('https://www.nytimes.com/sitemaps/new/cooking.xml.gz', timeout=30)
sitemap.raise_for_status()

In [None]:
sitemap

In [None]:
soup = BeautifulSoup(sitemap.content, "html.parser")

In [None]:
# Collect the text of every <loc> element found in the parsed sitemap.
pagelist = [loc_tag.get_text() for loc_tag in soup.select("loc")]

In [None]:
len(pagelist)

That gives me a list of pages, each of which contains links to actual recipes. Now I'll explore one of these pages, in order to find an algorithm for scraping.

In [None]:
pagelist[0]

In [None]:
page = requests.get(pagelist[0])

In [None]:
souppage = BeautifulSoup(page.content, "html.parser")

In [None]:
#souppage

In [None]:
# Collect the text of every <loc> element found on the first sitemap page.
pagelist_recipes = [loc_tag.get_text() for loc_tag in souppage.select("loc")]

In [None]:
#pagelist_recipes

In [None]:
# Visit every sitemap page and gather the text of all its <loc> elements.
recipelist = []
for sitemap_url in pagelist:
    # The timeout keeps one stalled connection from blocking the whole crawl.
    sitemap_page = requests.get(sitemap_url, timeout=30)
    # Dedicated names on purpose: reusing `i` for both loops and rebinding the
    # notebook-level `soup` makes earlier cells behave differently when re-run.
    sitemap_soup = BeautifulSoup(sitemap_page.content, "html.parser")
    recipelist.extend(loc_tag.get_text() for loc_tag in sitemap_soup.select("loc"))
    # Polite wait time: a random pause of up to 2 seconds between requests.
    wait_time = randint(1,4000)
    print("I will sleep for " + str(wait_time/2000) + " seconds.")
    sleep(wait_time/2000)

In [None]:
len(recipelist)

In [None]:
# Save the collected entries to disk, one per line.
with open("nytimespages.txt", "w") as url_file:
    url_file.writelines(url + "\n" for url in recipelist)

Now that I have that list, the million-dollar question is whether the NYT website will give me full access to the recipe pages. When you visit one in a browser, it asks you to sign up.

In [None]:
url = recipelist[0]

In [None]:
harissa = requests.get(url)

In [None]:
souppage = BeautifulSoup(harissa.content, "html.parser")

In [None]:
# Locate the first JSON-LD <script> tag on the page, then decode its text as JSON.
ld_json_tag = souppage.find('script', type='application/ld+json')
data = json.loads(ld_json_tag.text)

In [None]:
data['recipeIngredient']

### Re-open the list of pages from the file (i.e. without scraping it again)

In [2]:
# Read the saved file back; every element of `dummy` still ends with its newline.
with open("nytimespages.txt", "r") as url_file:
    dummy = list(url_file)

In [3]:
# Strip the newline characters left over from reading the file line by line.
recipelist = [raw_line.replace("\n", "") for raw_line in dummy]

In [None]:
recipelist

Since I'm scraping over 20,000 pages, I will divide the list up into batches. I define a function here that scrapes whatever slice of the list I pass to it, so I can run it once per batch.

In [None]:
# Print the position of every entry. Counting positions directly avoids the
# list.index() lookup, which rescans the list on every iteration (quadratic for
# ~20,000 entries) and reports the first occurrence for duplicated URLs.
for position in range(len(recipelist)):
    print(position)

In [4]:
def getrecipe(recipeslist, timeout=30):
    """Scrape the name and ingredient list of every page in ``recipeslist``.

    Each page is downloaded and its first ``application/ld+json`` script tag is
    decoded as JSON; the ``name`` and ``recipeIngredient`` entries are kept.
    Pages that cannot be fetched or that lack this structure are skipped.

    Parameters
    ----------
    recipeslist : list of str
        URLs of the pages to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up on a page, so a single
        stalled connection cannot block a whole batch. Defaults to 30.

    Returns
    -------
    list
        One ``[name, url, ingredients]`` entry per successfully scraped page,
        in the order the URLs were given.
    """
    recipes = []

    # enumerate() gives the true position of each page; list.index() would rescan
    # the list on every iteration and report the first match for duplicate URLs.
    for position, page in enumerate(recipeslist):
        try:
            req = requests.get(page, timeout=timeout)
            soup = BeautifulSoup(req.content, "html.parser")
            data = json.loads(soup.find('script', type='application/ld+json').text)
            # Read both fields before appending so a page missing either one
            # contributes nothing to the result.
            name = data['name']
            ingredientlist = data['recipeIngredient']
            recipes.append([name, page, ingredientlist])
            print(f"Done with index # {position}")

        # The try/except structure here has two benefits:
        # - It guards against a patchy internet connection: failed pages are skipped.
        # - It skips pages that do not conform to the same json format (recipeIngredient).
        # Only Exception is caught, so interrupting the kernel (KeyboardInterrupt)
        # still stops the batch instead of being swallowed.
        except Exception:
            print("No success, moving on.")

        # Polite wait time: a random pause of up to 2 seconds between requests.
        wait_time = randint(1,4000)
        print("I will sleep for " + str(wait_time/2000) + " seconds.")
        sleep(wait_time/2000)

    return recipes

In [None]:
batch1 = getrecipe(recipelist[:1000])

In [None]:
batch1

In [None]:
batch2 = getrecipe(recipelist[1000:2000])

In [None]:
batch3 = getrecipe(recipelist[2000:3000])

In [None]:
batch4 = getrecipe(recipelist[3000:4000])

In [None]:
# Turn each scraped batch into its own DataFrame.
df1, df2, df3, df4 = map(pd.DataFrame, (batch1, batch2, batch3, batch4))

In [None]:
# Stack the four batch frames row-wise. Every batch must appear exactly once:
# listing df3 twice would duplicate batch 3 and drop batch 4 from the saved file.
firstbunch = pd.concat([df1, df2, df3, df4], axis = 0)

In [None]:
# Save the first group of batches to disk; index = False leaves the row labels out of the file.
# NOTE(review): "|" is presumably used as separator because the ingredient text contains commas - confirm.
firstbunch.to_csv("firstbunch.csv", sep = "|", index = False)

### Next round

In [None]:
batch5 = getrecipe(recipelist[4000:7500])

In [None]:
%%time
batch6 = getrecipe(recipelist[7500:10000])

In [None]:
# Turn each scraped batch into its own DataFrame.
df5, df6 = map(pd.DataFrame, (batch5, batch6))

In [None]:
df5

In [None]:
# Stack batches 5 and 6 row-wise and save them as the second file.
# index = False leaves the row labels out of the file.
secondbunch = pd.concat([df5, df6], axis = 0)
secondbunch.to_csv("secondbunch.csv", sep = "|", index = False)

In [None]:
secondbunch

In [None]:
%%time
batch7 = getrecipe(recipelist[10000:12500])

In [None]:
batch8 = getrecipe(recipelist[12500:15000])

In [None]:
# Turn each scraped batch into its own DataFrame.
df7, df8 = map(pd.DataFrame, (batch7, batch8))

In [None]:
thirdbunch = pd.concat([df7, df8], axis = 0)

In [None]:
thirdbunch.to_csv("thirdbunch.csv", sep = "|", index = False)

In [None]:
batch9 = getrecipe(recipelist[15000:17500])

In [None]:
fourthbunch = pd.DataFrame(batch9)

In [None]:
fourthbunch.to_csv("fourthbunch.csv", sep = "|", index = False)

In [None]:
recipelist

In [1]:
# Scrape everything from position 17500 to the end of the list, then save it
# as the fifth file (row labels are left out of the file).
batch10 = getrecipe(recipelist[17500:])
fifthbunch = pd.DataFrame(batch10)
fifthbunch.to_csv("fifthbunch.csv", sep = "|", index = False)

In [3]:
# Reload the five saved files. They were written with "|" as separator,
# so the same separator has to be given when reading them back.
first = pd.read_csv("firstbunch.csv", sep = '|')
second = pd.read_csv("secondbunch.csv", sep = '|')
third = pd.read_csv("thirdbunch.csv", sep = '|')
fourth = pd.read_csv("fourthbunch.csv", sep = '|')
fifth = pd.read_csv("fifthbunch.csv", sep = '|')

In [10]:
nyt = pd.concat([first, second, third, fourth, fifth], axis = 0)

In [12]:
nyt.isna().sum()

0    0
1    0
2    0
dtype: int64

In [13]:
nyt = nyt.reset_index(drop = True)

In [19]:
# Look at the ingredients entry of the first row (row label 0, column '2').
nyt.loc[0, '2']

"['3 medium sweet potatoes, washed and trimmed, (about 1 1/2 pounds)', '3 medium red onions, peeled and trimmed (about 1 pound)', '2 tablespoons extra-virgin olive oil', '2 tablespoons harissa paste, plus more for serving', '1 teaspoon ground cumin', 'Kosher salt and black pepper', '1/2 lemon', '3 ounces feta, crumbled (optional)', 'Handful of cilantro or mint leaves']"

In [20]:
# Replace the positional column labels ('0', '1', '2') with descriptive names.
# The order matches the [name, page, ingredientlist] entries built by getrecipe.
nyt.columns = ['name', 'url', 'ingredients']

In [21]:
nyt

Unnamed: 0,name,url,ingredients
0,Harissa-Roasted Sweet Potatoes and Red Onion,https://cooking.nytimes.com/recipes/1023541-ha...,"['3 medium sweet potatoes, washed and trimmed,..."
1,Tofu and Mushroom Jorim (Soy-Braised Tofu),https://cooking.nytimes.com/recipes/1023476-to...,"['1/3 cup low-sodium soy sauce', '5 garlic clo..."
2,Roasted Chicken With Crispy Mushrooms,https://cooking.nytimes.com/recipes/1023551-ro...,"['2 to 2 1/4 pounds boneless, skinless chicken..."
3,Chocolate Pumpkin Swirl Muffins,https://cooking.nytimes.com/recipes/1023565-ch...,"['2 cups/256 grams all-purpose flour', '1 tabl..."
4,Pasta e Patate (Pasta and Potato Soup),https://cooking.nytimes.com/recipes/1023564-pa...,"['1/3 cup extra-virgin olive oil', '1 large ye..."
...,...,...,...
22070,Mushrooms in Marsala Wine (Funghi Alla Marsala),https://cooking.nytimes.com/recipes/31-mushroo...,"['1 ounce dried mushrooms, preferably imported..."
22071,Veal Scaloppine With Mushrooms Bordelaise,https://cooking.nytimes.com/recipes/30-veal-sc...,"['12 slices veal scaloppine, about 1 1/4 pound..."
22072,Mushroom and Meat Loaf,https://cooking.nytimes.com/recipes/28-mushroo...,"['1/2 pound mushrooms', '1 tablespoon butter',..."
22073,Mushroom and Pepper Salad,https://cooking.nytimes.com/recipes/29-mushroo...,"['1 large sweet red pepper, about 1/2 pound', ..."


In [22]:
# Save the combined table; index = False leaves the row labels out of the file.
# Note: the ingredients column holds the string form of each list, not a list object.
nyt.to_csv('nyt.csv', sep = '|', index = False)