## Scrape Data

Following tutorial from: https://p-mckenzie.github.io/2018/08/06/Allrecipes-categories-scraper/

First, install Selenium for Python by running `pip install selenium`. Then, since this notebook uses Chrome as the browser, download ChromeDriver (https://sites.google.com/a/chromium.org/chromedriver/downloads) and add it to your system's PATH environment variable: `export PATH="$PATH:/path/to/chromedriver"` (following the instructions in the Selenium documentation: https://www.selenium.dev/documentation/en/). Note that the setup cell below also passes the ChromeDriver location explicitly via `executable_path`, so update that path to match your installation.

In [1]:
# import packages used to scrape recipes
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

# other packages
import pandas as pd
import time

In [2]:
# Chrome settings: run headless (no visible window) and keep the log output quiet
options = Options()
for flag in ('--headless', '--log-level=3'):
    options.add_argument(flag)

# location of the ChromeDriver binary on this machine
executable_path = '/opt/WebDriver/bin/chromedriver'

# start the Chrome session that every later cell drives through `driver`
driver = Chrome(executable_path, options=options)

In [3]:
# open the Allrecipes 'recipes' index page in the browser session
recipes_index_url = 'https://www.allrecipes.com/recipes/'
driver.get(recipes_index_url)

In [4]:
# get all subcategories under 'Meal Type'
# (sections whose heading is not in this set are ignored)
major_cats = {'Meal Type'}

# one (category, subcategory, url) record per subcategory link
cat_records = []

# iterate over sections
for section in driver.find_elements_by_xpath("//div[@class='all-categories-col']//section"):
    headings = section.find_elements_by_xpath(".//h3[@class='heading__h3']")
    # a section without a heading cannot be matched against `major_cats`;
    # skip it rather than raising IndexError on `[0]`
    if not headings:
        continue
    section_name = headings[0].text
    # skip some categories
    if section_name not in major_cats:
        continue
    # retain the subcategory name and link for the remaining categories
    cat_records.extend((section_name, a.text, a.get_attribute('href'))
                       for a in section.find_elements_by_xpath(".//ul//li//a"))

# build the frame once, after the loop: avoids repeated pd.concat calls and
# gives every row a unique index label even when several categories match
cat_df = pd.DataFrame(cat_records, columns=['category', 'subcategory', 'url'])

In [36]:
# check cat_df: one row per subcategory link, with columns category / subcategory / url
cat_df

Unnamed: 0,category,subcategory,url
0,Meal Type,Breakfast and Brunch,https://www.allrecipes.com/recipes/78/breakfas...
1,Meal Type,Desserts,https://www.allrecipes.com/recipes/79/desserts/
2,Meal Type,Dinners,https://www.allrecipes.com/recipes/17562/dinner/
3,Meal Type,Lunch,https://www.allrecipes.com/recipes/17561/lunch/


In [37]:
from selenium.common.exceptions import ElementNotVisibleException as NotVisible
from selenium.common.exceptions import ElementNotInteractableException, NoSuchElementException

# restrict number of recipes to retrieve from each subcategory
# use this for testing; delete later!
# NOTE: this is a lower bound that stops the scrolling, not a cap -- every
# link present on the page when scrolling stops is kept.
num_per_category = 20

# XPath of the recipe-card links on a subcategory page
recipe_link_xpath = "//article//div//h3[@class='fixed-recipe-card__h3']//a"

# one (meal_type, url) record per recipe link, across all subcategories
recipe_records = []

# iterate through subcategory pages to find recipes links for each subcategory
for subcategory, url in zip(cat_df['subcategory'], cat_df['url']):
    # go to subcategory page and scrape urls
    driver.get(url)

    # keep scrolling down until enough recipes appear
    urls = driver.find_elements_by_xpath(recipe_link_xpath)
    last_size = len(urls)
    while last_size < num_per_category:
        # check if "More" results button has appeared; click if necessary.
        # The button may be absent, hidden, or not yet interactable; none of
        # those should abort the scrape, so fall through to scrolling.
        try:
            driver.find_element_by_id("btnMoreResults").click()
        except (NotVisible, ElementNotInteractableException, NoSuchElementException):
            pass

        # scroll down and wait for recipes to load
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # for politeness and loading time
        urls = driver.find_elements_by_xpath(recipe_link_xpath)

        # exit if scrolling down doesn't load more recipes
        if len(urls) == last_size:
            break
        last_size = len(urls)

    # remember every link found on this page, tagged with its subcategory
    recipe_records.extend((subcategory, card.get_attribute('href')) for card in urls)

    # print status update
    print("Finished scraping '{}' subcategory with {} recipe urls.".format(subcategory, last_size))

# build the frame once from all collected records
data = pd.DataFrame(recipe_records, columns=['meal_type', 'url'])

Finished scraping 'Breakfast and Brunch' subcategory with 23 recipe urls.
Finished scraping 'Desserts' subcategory with 29 recipe urls.
Finished scraping 'Dinners' subcategory with 28 recipe urls.
Finished scraping 'Lunch' subcategory with 28 recipe urls.


In [38]:
# check data: one row per collected recipe link, with columns meal_type / url
data

Unnamed: 0,meal_type,url
0,Breakfast and Brunch,https://www.allrecipes.com/recipe/7001/poppy-s...
1,Breakfast and Brunch,https://www.allrecipes.com/recipe/246512/puff-...
2,Breakfast and Brunch,https://www.allrecipes.com/recipe/259473/maple...
3,Breakfast and Brunch,https://www.allrecipes.com/recipe/162760/fluff...
4,Breakfast and Brunch,https://www.allrecipes.com/recipe/21014/good-o...
...,...,...
103,Lunch,https://www.allrecipes.com/recipe/8932/fruity-...
104,Lunch,https://www.allrecipes.com/recipe/78052/beaker...
105,Lunch,https://www.allrecipes.com/recipe/8565/beckys-...
106,Lunch,https://www.allrecipes.com/recipe/83844/slow-c...


In [39]:
# remove internalSource extras in string links
data['url'] = data['url'].apply(lambda link: link.split("?internalSource")[0])
# groupby link and aggregate multiple rows of the same link into a single row.
# A recipe listed under several subcategories keeps each distinct name once,
# joined by ', ' (summing the strings would glue them into e.g. "LunchDinners").
data = (data.groupby('url')['meal_type']
            .agg(lambda names: ', '.join(sorted(set(names))))
            .reset_index())
# reindex to add columns for each recipe
data_to_scrape = ['title', 'ratings', 'madeit', 'reviews', 'photos', 'submitter_description',
                  'ingredients', 'readyin', 'servings', 'directions',
                  'calories', 'fat', 'carbohydrate', 'protein']
# every new column starts as '' so that "not scraped yet" can be detected later
data = data.reindex(columns=data.columns.tolist()+data_to_scrape, fill_value='')

In [40]:
# check data again: now one row per unique url, plus the empty ('') columns that scraping will fill
data

Unnamed: 0,url,meal_type,title,ratings,madeit,reviews,photos,submitter_description,ingredients,readyin,servings,directions,calories,fat,carbohydrate,protein
0,https://www.allrecipes.com/recipe/10033/iced-p...,Desserts,,,,,,,,,,,,,,
1,https://www.allrecipes.com/recipe/10402/the-be...,Desserts,,,,,,,,,,,,,,
2,https://www.allrecipes.com/recipe/104850/black...,Lunch,,,,,,,,,,,,,,
3,https://www.allrecipes.com/recipe/10497/beths-...,Desserts,,,,,,,,,,,,,,
4,https://www.allrecipes.com/recipe/10549/best-b...,Desserts,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,https://www.allrecipes.com/recipe/9247/christm...,Breakfast and Brunch,,,,,,,,,,,,,,
104,https://www.allrecipes.com/recipe/92761/debs-g...,Dinners,,,,,,,,,,,,,,
105,https://www.allrecipes.com/recipe/9471/peanut-...,Desserts,,,,,,,,,,,,,,
106,https://www.allrecipes.com/recipe/98579/barbie...,Lunch,,,,,,,,,,,,,,


In [None]:
# Disabled experiment, kept for reference (part I): a `scroll` helper meant to reach the bottom of the infinite-scroll page so that all recipes load


# from selenium.common.exceptions import ElementNotVisibleException as NotVisible

# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

# # `scroll` function source code: https://dev.to/hellomrspaceman/python-selenium-infinite-scrolling-3o12
# # function to scroll to bottom of infinite scrolling style 'Desserts' recipes page
# def scroll(driver, pause_time):
#     # get scroll height
#     last_height = driver.execute_script("return document.body.scrollHeight")
    
#     while True:
#         # check if "More" results button has appeared and click if necessary
#         try:
#             driver.find_element_by_id("btnMoreResults").click()
#         except NotVisible:
#             pass
# #         try:
# #             WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//a[contains(@class, 'btnMoreResults')]")))
# #             browser.find_element_by_xpath("//a[contains(@class, 'btnMoreResults')]").click()
# #             wait.until(EC.visibility_of_element_located((By.XPATH,"//div[@class='o-listing__btnMoreResults']")))
# #         except Exception as e:
# #             print(e)
# #             break
        
#         # scroll down to bottom
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
#         # wait to load page
#         time.sleep(pause_time)
        
#         # calculate new scroll height and compare with last scroll height
#         new_height = driver.execute_script("return document.body.scrollHeight")
#         if new_height == last_height:
#             # if heights are the same, exit the function
#             break
#         last_height = new_height

In [None]:
# Disabled experiment, kept for reference (part II): call the `scroll` helper, then collect every recipe link on the page

# scroll(driver, 4)

# # scrape recipe links
# links = driver.find_elements_by_xpath("//article//div//h3[@class='fixed-recipe-card__h3']//a")

# print("Finished scraping {} recipe links.".format(len(links)))

In [41]:
# list the column names of `data`
data.columns

Index(['url', 'meal_type', 'title', 'ratings', 'madeit', 'reviews', 'photos',
       'submitter_description', 'ingredients', 'readyin', 'servings',
       'directions', 'calories', 'fat', 'carbohydrate', 'protein'],
      dtype='object')

In [42]:
# scrape pages

def _first_text(elements):
    """Return the text of the first matched element (IndexError if nothing matched)."""
    return elements[0].text


def _first_word(elements):
    """Return the first whitespace-separated token of the first element's text."""
    return elements[0].text.split()[0]


def _rating_value(elements):
    """Return the 'data-ratingstars' attribute of the first matched element."""
    return elements[0].get_attribute('data-ratingstars')


def _all_texts(elements):
    """Return the text of every matched element as a list (empty if nothing matched)."""
    return [element.text for element in elements]


# (column to fill, XPath to look up, how to turn the matched elements into a value),
# listed in the order the fields are filled
_RECIPE_FIELDS = (
    ('title', "//h1[@class='recipe-summary__h1']", _first_text),
    ('ratings', "//div[@class='rating-stars']", _rating_value),
    ('madeit', "//span[@class='made-it-count ng-binding']", _first_text),
    ('reviews', "//span[@class='review-count']", _first_word),
    ('photos', "//span[@class='picture-count-link']", _first_word),
    ('submitter_description', "//div[@class='submitter__description']", _first_text),
    ('ingredients', "//span[@class='recipe-ingred_txt added']", _all_texts),
    ('readyin', "//span[@class='ready-in-time']", _first_text),
    ('servings', "//span[@class='servings-count']//span", _first_text),
    ('directions', "//div[@class='recipe-directions__list--item']", _first_text),
    ('calories', "//span[@class='calorie-count']//span", _first_text),
    # the nutrition columns are spelled out explicitly; deriving them from the
    # itemprop names needed the `re` module, which this notebook never imports
    ('fat', "//span[@itemprop='fatContent']", _first_text),
    ('carbohydrate', "//span[@itemprop='carbohydrateContent']", _first_text),
    ('protein', "//span[@itemprop='proteinContent']", _first_text),
)


def scrape_recipe(entry, driver):
    """Load one recipe page and fill in the fields that can be found on it.

    Parameters
    ----------
    entry : pandas.Series
        A row holding at least 'url' plus the columns named in `_RECIPE_FIELDS`.
        It is modified in place and also returned.
    driver : selenium WebDriver
        Browser session used to open `entry['url']` and run the XPath lookups.

    Returns
    -------
    pandas.Series
        `entry`, with every field that could be extracted overwritten. A field
        whose lookup or assignment fails keeps its previous value.
    """
    import logging  # local import: the notebook's import cell does not provide it
    log = logging.getLogger(__name__)

    driver.get(entry['url'])

    for column, xpath, extract in _RECIPE_FIELDS:
        try:
            entry.loc[column] = extract(driver.find_elements_by_xpath(xpath))
        except Exception as exc:
            # Best effort: one missing field must not lose the others.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
            log.debug("could not scrape %r from %s: %r", column, entry['url'], exc)
    return entry

In [45]:
# rows whose title is still '' have not been filled in yet; scrape only those
unscraped = data[data['title'] == '']
for row_label, recipe_row in unscraped.iterrows():
    data.loc[row_label] = scrape_recipe(recipe_row, driver)

In [54]:
# show url and title side by side; a blank title means scrape_recipe did not set one for that row
data[['url', 'title']]

Unnamed: 0,url,title
0,https://www.allrecipes.com/recipe/10033/iced-p...,
1,https://www.allrecipes.com/recipe/10402/the-be...,
2,https://www.allrecipes.com/recipe/104850/black...,Black Bean and Corn Quesadillas
3,https://www.allrecipes.com/recipe/10497/beths-...,
4,https://www.allrecipes.com/recipe/10549/best-b...,
...,...,...
103,https://www.allrecipes.com/recipe/9247/christm...,
104,https://www.allrecipes.com/recipe/92761/debs-g...,
105,https://www.allrecipes.com/recipe/9471/peanut-...,
106,https://www.allrecipes.com/recipe/98579/barbie...,Barbie's Tuna Salad
