In [None]:
#TESTING WEB SCRAPING

#For single recipe, taking nutrient info

In [1]:
import requests
from bs4 import BeautifulSoup
import time

import pandas as pd

In [2]:
# FUNCTION FOR WEB SCRAPING

import time
def get_recipe_data(url, nutrient_df, ingredient_df):
    """
    Scrape allrecipes.com for information on the nutrient content and ingredients of each recipe
    Based on the url, ingredients and nutrient content are extracted separately from the website.
    
    IN: url: URL of the specific recipe as a string
    OUT: nutrient and ingredient dataframes with the information for the given recipe
    """
    
    #nutrition information
    url_nutrition = url + 'fullrecipenutrition'
    time.sleep(1)
    r1 = requests.get(url_nutrition)
    soup1 = BeautifulSoup(r1.text, 'html.parser')
    
    #this is the div containing all the nutrition information
    nutrition_info = soup1.find_all('div', class_='nutrition-row')
    
    nutrient_list = []
    
    recipe = soup1.find('h2')
    if recipe is not None:
        recipe = recipe.text
        for n in nutrition_info:
            name = n.find(class_ = 'nutrient-name').text
            amount = n.find(class_ = 'nutrient-value').text
            name = name[:name.index(':')]
            nutrient_list.append({'nutrient': name, 'amount': amount, 'recipe': recipe, 'URL': url})

    nutrients = pd.DataFrame(nutrient_list)
    nutrient_df = nutrient_df.append(nutrients, sort=True)
    
    #ingredients
    r2 = requests.get(url)
    soup2 = BeautifulSoup(r2.text, 'html.parser')
    ingredient_info = soup2.find_all(class_='recipe-container-outer')
    
    ingredient_list = []
    for i in ingredient_info:
        ingredient = i.find_all('span', {'itemprop':'recipeIngredient'}, class_ = 'recipe-ingred_txt added')
        for x in ingredient:
            ingredient_list.append({'ingredient': x.text, 'URL':url})

    ingredients = pd.DataFrame(ingredient_list)
    ingredient_df = ingredient_df.append(ingredients, sort=True)
    
    return nutrient_df, ingredient_df

In [None]:
#test the function
url = 'https://www.allrecipes.com/recipe/18057/sweet-potato-casserole-ii/'
nutrient_columns = ['URL', 'amount', 'nutrient', 'recipe']
ingredient_columns = ['URL', 'ingredient']
nutrient_df = pd.DataFrame(columns=nutrient_columns)
ingredient_df = pd.DataFrame(columns=ingredient_columns)

nutrient_df, ingredient_df = get_recipe_data(url, nutrient_df, ingredient_df)

url2 = 'https://www.allrecipes.com/recipe/7589/allspice-cream-cheese-frosting/'
nutrient_df, ingredient_df = get_recipe_data(url2, nutrient_df, ingredient_df)

display(nutrient_df.head())
display(nutrient_df.tail())

## Import dinner recipes

In [None]:
dinner_url_df = pd.read_csv('data/recipeLists/dinner_url.csv')
dinner_url_df.columns = ["originGridUrl", "recipeName", "recipeURL"]
#pd.options.display.max_colwidth = 200
#display(dinner_url_df['recipeURL'].head())

In [None]:
#dinner_url_df['recipeURL'] = dinner_url_df['recipeURL'].split("?")[0]
dinner_url_df['recipeURL'] = dinner_url_df['recipeURL'].astype(str).apply(lambda x: x.split("?")[0])
#dinner_url_df['recipeURLPrime'] = str(dinner_url_df['recipeURL']).split("?")[0]
pd.set_option('display.max_rows', None,'display.max_columns', None)
#display(dinner_url_df.head())

In [None]:
dinnerURLList = dinner_url_df.recipeURL.tolist()

In [None]:
dinnerURLList[:5]

In [None]:
nutrient_columns = ['URL', 'amount', 'nutrient', 'recipe']
ingredient_columns = ['URL', 'ingredient']
nutrient_df = pd.DataFrame(columns=nutrient_columns)
ingredient_df = pd.DataFrame(columns=ingredient_columns)

for url in dinnerURLList:
    nutrient_df, ingredient_df = get_recipe_data(url, nutrient_df, ingredient_df)
    time.sleep(1)
    

In [None]:
nutrient_df.to_csv("dinner_nutrient.csv")
ingredient_df.to_csv("dinner_ingredient.csv")

In [None]:
nutrient_df.shape

In [None]:
dinnerURLList

## Brunch recipes import

In [3]:
brunch_url_df = pd.read_csv('data/recipeLists/breakfast_brunch_url.csv')
#The last row is "Nan" so we decided to drop it
brunch_url_df = brunch_url_df.dropna()

display(brunch_url_df.head())
display(brunch_url_df.tail())

Unnamed: 0,url_list_link,url_list_selection1_name,url_list_selection1_url
1,https://www.allrecipes.com/recipes/78/breakfas...,French Toast I,https://www.allrecipes.com/recipe/7016/french-...
2,https://www.allrecipes.com/recipes/78/breakfas...,French Toast Casserole,https://www.allrecipes.com/recipe/22389/french...
3,https://www.allrecipes.com/recipes/78/breakfas...,Banana Bread,https://www.allrecipes.com/recipe/6687/banana-...
4,https://www.allrecipes.com/recipes/78/breakfas...,Quick Quiche,https://www.allrecipes.com/recipe/21551/quick-...
5,https://www.allrecipes.com/recipes/78/breakfas...,Easy Broccoli Quiche,https://www.allrecipes.com/recipe/24148/easy-b...


Unnamed: 0,url_list_link,url_list_selection1_name,url_list_selection1_url
3464,https://www.allrecipes.com/recipes/78/breakfas...,Whole Grain French Toast with Blackberry Compote,https://www.allrecipes.com/recipe/217958/whole...
3465,https://www.allrecipes.com/recipes/78/breakfas...,Rebecca's Matzo Brei,https://www.allrecipes.com/recipe/200690/rebec...
3466,https://www.allrecipes.com/recipes/78/breakfas...,Amazing Matza Brei,https://www.allrecipes.com/recipe/194372/amazi...
3467,https://www.allrecipes.com/recipes/78/breakfas...,Passover Pancakes,https://www.allrecipes.com/recipe/165775/passo...
3468,https://www.allrecipes.com/recipes/78/breakfas...,Ejja,https://www.allrecipes.com/recipe/162211/ejja/...


In [4]:
brunch_url_df.columns = ["originGridUrl", "recipeName", "recipeURL"]
#pd.options.display.max_colwidth = 200
#display(dinner_url_df['recipeURL'].head())

#dinner_url_df['recipeURL'] = dinner_url_df['recipeURL'].split("?")[0]
brunch_url_df['recipeURL'] = brunch_url_df['recipeURL'].astype(str).apply(lambda x: x.split("?")[0])
#dinner_url_df['recipeURLPrime'] = str(dinner_url_df['recipeURL']).split("?")[0]
pd.set_option('display.max_rows', None,'display.max_columns', None)
#display(dinner_url_df.head())

brunchURLList = brunch_url_df.recipeURL.tolist()

brunchURLList[:5]

['https://www.allrecipes.com/recipe/7016/french-toast-i/',
 'https://www.allrecipes.com/recipe/22389/french-toast-casserole/',
 'https://www.allrecipes.com/recipe/6687/banana-bread/',
 'https://www.allrecipes.com/recipe/21551/quick-quiche/',
 'https://www.allrecipes.com/recipe/24148/easy-broccoli-quiche/']

In [5]:
nutrient_columns = ['URL', 'amount', 'nutrient', 'recipe']
ingredient_columns = ['URL', 'ingredient']
brunch_nutrient_df = pd.DataFrame(columns=nutrient_columns)
brunch_ingredient_df = pd.DataFrame(columns=ingredient_columns)

brunch_length=str(len(brunchURLList))
for url in range(len(brunchURLList)):
    brunch_nutrient_df, brunch_ingredient_df = get_recipe_data(brunchURLList[url], brunch_nutrient_df, brunch_ingredient_df)
    print(str(url+1) + "/" + brunch_length)
    time.sleep(1)



1/3468
2/3468
3/3468
4/3468
5/3468
6/3468
7/3468
8/3468
9/3468
10/3468
11/3468
12/3468
13/3468
14/3468
15/3468
16/3468
17/3468
18/3468
19/3468
20/3468
21/3468
22/3468
23/3468
24/3468
25/3468
26/3468
27/3468
28/3468
29/3468
30/3468
31/3468
32/3468
33/3468
34/3468
35/3468
36/3468
37/3468
38/3468
39/3468
40/3468
41/3468
42/3468
43/3468
44/3468
45/3468
46/3468
47/3468
48/3468
49/3468
50/3468
51/3468
52/3468
53/3468
54/3468
55/3468
56/3468
57/3468
58/3468
59/3468
60/3468
61/3468
62/3468
63/3468
64/3468
65/3468
66/3468
67/3468
68/3468
69/3468
70/3468
71/3468
72/3468
73/3468
74/3468
75/3468
76/3468
77/3468
78/3468
79/3468
80/3468
81/3468
82/3468
83/3468
84/3468
85/3468
86/3468
87/3468
88/3468
89/3468
90/3468
91/3468
92/3468
93/3468
94/3468
95/3468
96/3468
97/3468
98/3468
99/3468
100/3468
101/3468
102/3468
103/3468
104/3468
105/3468
106/3468
107/3468
108/3468
109/3468
110/3468
111/3468
112/3468
113/3468
114/3468
115/3468
116/3468
117/3468
118/3468
119/3468
120/3468
121/3468


ConnectionError: ('Connection aborted.', OSError("(104, 'ECONNRESET')",))

In [6]:
brunch_nutrient_df.to_csv("brunch_nutrient_1-120.csv")
brunch_ingredient_df.to_csv("brunch_ingredient_1-120.csv")

In [8]:
brunchURLList[120]

'https://www.allrecipes.com/recipe/20746/country-quiche/'