# Epicurious Yearly Recipe List

Get the list of recipe titles and their links from the Epicurious sitemap, one year at a time.

In [1]:
from bs4 import BeautifulSoup
from splinter import Browser
import time
from time import sleep
import pandas as pd

## Create Recipe URL List

Only editorial recipes are used, because member-submitted recipes don't seem to include nutritional information.

This pulls every recipe title and URL by year, puts them into a dataframe, and saves it as a CSV so that we can later loop through all the recipes to pull their details.

In [2]:
# four independent, initially empty lists (source, year, url, name);
# they will be zipped up together into a dataframe at the end
sourcelist = []
yearlist = []
urllist = []
namelist = []

In [3]:
# site root, sitemap path, recipe sources and years to search
root_url = "https://www.epicurious.com"
sitemap_recipes_url = "/services/sitemap/recipes"
# editorials only, since the member recipes don't seem to have nutritional info
source_urls = ["/editorial/"]
# every year from 2019 back to 1998, newest first
years = list(range(2019, 1997, -1))

In [4]:
# function that parses the links and adds them to the lists
def parse_recipe_links(source, year, links):
    """Record the recipe links of one sitemap page and find the next page.

    A link whose ``href`` contains ``sitemap_recipes_url`` is a pagination
    link rather than a recipe; of those, only the one whose text is "Next"
    is used.  Every other link is treated as a recipe and recorded in the
    module-level lists ``sourcelist``, ``yearlist``, ``urllist`` and
    ``namelist``.

    Args:
        source: Value appended to ``sourcelist`` for each recipe found.
        year: Value appended to ``yearlist`` for each recipe found.
        links: Iterable of anchor elements supporting ``link['href']`` and
            ``link.text``.

    Returns:
        ``root_url`` joined with the href of the "Next" link, or an empty
        string when the page has no such link.
    """
    next_page_url = ''
    for counter, link in enumerate(links, start=1):
        try:
            href = link['href']
            if sitemap_recipes_url in href:
                # the sitemap part of the url is in the link, so this is
                # not a recipe; only a "Next" link is of interest
                if link.text == "Next":
                    next_page_url = root_url + href
                continue

            # Work out both values before touching the lists, so a failure
            # here cannot leave the four parallel lists with unequal lengths.
            recipe_url = href.strip()
            recipe_name = link.text.strip()
        except (KeyError, AttributeError, TypeError):
            # Best effort: report the malformed link and carry on.  Anything
            # else (e.g. KeyboardInterrupt, NameError) is allowed to surface.
            print(f'Issue parsing recipe {counter}')
            continue

        sourcelist.append(source)
        yearlist.append(year)
        urllist.append(recipe_url)
        namelist.append(recipe_name)

    return next_page_url

In [5]:
# launch a visible (non-headless) Chrome session using the chromedriver binary
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Walk every source and year, following the sitemap's "Next" links page by
# page and handing each page's anchors to parse_recipe_links.
for source_url in source_urls:
    source = source_url.replace('/', '')
    print(f'Beginning source: {source}')
    for year in years:
        print(f'Beginning year {year}')
        # first sitemap page for this source and year
        url = root_url + sitemap_recipes_url + source_url + str(year)

        # an empty url means there is no further page for this year
        while url != '':
            print(url)
            browser.visit(url)
            sleep(2)  # NOTE(review): presumably lets the page load / throttles requests

            # parse the rendered page with beautiful soup
            soup = BeautifulSoup(browser.html, 'html.parser')

            # The recipe links live inside the element with id "sitemapItems".
            # Check for it explicitly instead of using a bare except, so that
            # unrelated errors and Ctrl-C are no longer silently swallowed.
            container = soup.find(id='sitemapItems')
            if container is None:
                print(f'Error finding links for page {url}')
                break

            # records the recipes; returns the next page url or an empty string
            url = parse_recipe_links(source, year, container.find_all('a'))

print('Done!')

In [None]:
# map each output column name to the list holding that column's values
dict_recipe_list = dict(
    source=sourcelist,
    year=yearlist,
    title=namelist,
    url=urllist,
)

In [None]:
# turn the dictionary of columns into a dataframe and preview the first rows
df = pd.DataFrame(data=dict_recipe_list)
df.head()

In [None]:
# export data to csv
from pathlib import Path

output_path = Path('resources/data/recipe_list.csv')
# Create the target folder first: to_csv does not create missing directories,
# and failing here would throw away the results of the whole scrape.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)