# Flavor's Network Data Collection #
## Using scrapy spiders to collect spices, herbs, condiments, and recipes ##
### Sara Evans ###

In [1]:
#imports
import requests
from bs4 import BeautifulSoup
from scrapy import Selector
from scrapy.crawler import CrawlerProcess
from scrapy.http import Response
import scrapy
import scrapy.crawler as crawler
from multiprocessing import Process, Queue
from twisted.internet import reactor
import pandas as pd
import numpy as np
import csv

recipes site='https://www.seriouseats.com/recipes/topics/cuisine
spice_sites = 'https://www.thespicehouse.com/collections/letter-a', ''https://spicesinc.com/t-list-of-spices.aspx'

since scrapy spiders can't be rerun without restarting the kernel, this function found from [stack overflow](https://stackoverflow.com/questions/41495052/scrapy-reactor-not-restartable) 

In [2]:
#for making tweaks to spider
def run_spider(spider):
    def f(q):
        try:
            runner = crawler.CrawlerRunner()
            deferred = runner.crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result

Here are the spiders that will crawl the spice and recipe sites

In [3]:
class SpiceSpider(scrapy.Spider):
    name = 'spice_spider'
    def start_requests(self):
        url = 'https://www.thespicehouse.com/collections/letter-q'
        yield scrapy.Request(url = url, callback = self.parse_links)
    def parse_links(self, response):
        #find all the pages with spices listed on them
        link_path = 'header.section__head > div.container li > a::attr(href)'
        links = ['https://www.thespicehouse.com' + i.extract() for i in response.css(link_path)]
        for link in links:
            yield response.follow(url = link, callback = self.parse_page)
    def parse_page(self, response):
        #scrape all spices on page
        spices = response.css('h3.product__title > a::text').extract()
        #make all lowe case, clean up, and only retain the name of the spice not information after comma
        sub_lst = [i.lower().replace('\n','').strip().split(',')[0] for i in spices]
        global spice_list
        [spice_list.append(spice) for spice in sub_lst]



In [4]:
class SpiceHerb(scrapy.Spider):
    name = 'spice_herb'
    def start_requests(self):
        url = 'https://spicesinc.com/t-list-of-spices.aspx'
        yield scrapy.Request(url = url, callback = self.parse)
    def parse(self, response):
        #get them spices + herbs
        spice_herb = response.xpath('/html/body/div[3]/div/div/div[2]/div/article/div/section/p/strong/text()')
        [spice_list.append(i.extract().lower()[:-2]) for i in spice_herb]

In [5]:
class recipeSpider(scrapy.Spider):
    name = 'recipe_spider'
    cuisines = []
    
    def start_requests(self):
        url = 'https://www.seriouseats.com/recipes/topics/cuisine'
        yield scrapy.Request(url = url, callback = self.parse_first_pg)
        
    def parse_first_pg(self, response):
        #find the cuisine names the site uses and put them into list for cuisines
        main_cuis_path = '//*[@id="expanded-nav-Narrow by type"]/div[2]/ul/li/a/text()'
        sub_cuis_path = '//*[@id="expanded-nav-Narrow by type"]/div[2]/ul/li/ul/li/a/text()'
        
        main_cuis = response.xpath(main_cuis_path).extract()
        sub_cuis = response.xpath(sub_cuis_path).extract()
        total_cuis = main_cuis+sub_cuis
        
        global cuisines
        cuisines = total_cuis
        
        #find all recipes links on page
        recipe_links = response.xpath('/html/body/div[3]/section[1]/section/article/a/@href').extract()
        
        #find the total number of recipe pages 
        last_page_num = int(response.xpath('/html/body/div[3]/section[1]/div/div/a[3]/text()').extract_first())
        
        #input page number into standard url for the next pages
        std_url = 'https://www.seriouseats.com/recipes/topics/cuisine?page={}#recipes'
        nxt_pg_urls = [std_url.format(pg) for pg in range(2,last_page_num+1)]
        
        #send next pages to be parsed for recipe urls, and first page recipe urls are sent to be parsed for info
        for nxt in nxt_pg_urls:
            yield response.follow(url = nxt, callback = self.parse_next)
        for link in recipe_links:
            yield response.follow(url = link, callback = self.parse_recipes)

    def parse_next(self, response):
        #find recipe links on each page
        recipe_links = response.xpath('/html/body/div[3]/section[1]/section/article/a/@href').extract()
        
        for link in recipe_links:
            yield response.follow(url = link, callback = self.parse_recipes)
    
    def parse_recipes(self, response):
        #extract title, cuisine, ingreds, recipe, rating, url
        title = response.css('h1.recipe-title::text').extract_first()
        
        #cuisine placement unpredictable, find all info in the area then filter for cuisines found in parse_first_pg
        hidden_cuisine_path = '//div[@class = "breadcrumbs__more"]/ul/li/a/strong/text()'
        hidden_cuisine = [i.strip() for i in response.xpath(hidden_cuisine_path).extract()]

        cuisine = [c for c in cuisines if c in ' '.join(hidden_cuisine)]
        
        ingredients_path = '//*[@id="recipe-wrapper"]/div[2]/ul/li//text()'
        ingredients = ' '.join(response.xpath(ingredients_path).extract())
        
        directions_path = '//*[@id="recipe-wrapper"]/div[3]/ol//text()'
        directions = ' '.join(response.xpath(directions_path).extract()).strip()
        
        #some recipes don't have ratings return NaN if no rating
        try:
            rating_path = '//*[@id="recipe-wrapper"]/ul/li[4]/span[2]/span/text()'
            rating = float(response.xpath(rating_path).extract_first())
        except:
            rating = np.nan
        
        url = response.url
        
        #add new record to recipes df
        record = pd.DataFrame({'title':title, 
                               'cuisine':cuisine, 
                               'ingredients':ingredients, 
                               'directions':directions,
                               'rating':rating,
                               'url':url
                              })
        global recipes
        recipes = pd.concat([recipes,record])

In [6]:
#initiate empty receptors of spider info
spice_list = []
recipes = pd.DataFrame(columns = ['title','cuisine','ingredients','directions','rating','url'])


Process = CrawlerProcess()
Process.crawl(SpiceSpider)
Process.crawl(SpiceHerb)
Process.crawl(recipeSpider)
Process.start()        


2020-06-18 13:23:12 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2020-06-18 13:23:12 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 20.3.0, Python 3.7.4 (default, Aug 13 2019, 15:17:50) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.7, Platform Darwin-19.4.0-x86_64-i386-64bit
2020-06-18 13:23:12 [scrapy.crawler] INFO: Overridden settings: {}
2020-06-18 13:23:12 [scrapy.extensions.telnet] INFO: Telnet Password: 77392db9ce14e30e
2020-06-18 13:23:12 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-06-18 13:23:12 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.Dow

In [7]:
#add japanese spices bc those recipes had the fewest hits for spices
html = requests.get('https://livejapan.com/en/article-a0001822/').content
sel = Selector(text = html)
japanese = [i[3:].strip().lower() for i in sel.xpath('/html/body/main/div[1]/div[1]/div[2]/dl/dd/ol/li/a/text()').extract()]
[spice_list.append(i) for i in japanese]

#add common condiments to spices (and herbs) list
chtml = requests.get('https://www.cnn.com/travel/article/best-condiments/index.html').content
selc = Selector(text = chtml)

condiments = [i.lower() for i in selc.css('div > span > h3::text').extract()]
condiments[condiments.index('vegemite/marmite')] = 'vegimite'
condiments.append('marmite')

[spice_list.append(i) for i in condiments]

#make sure only one entry for each spice
spices = list(set(spice_list))

#filter out scraping mistakes
not_spices = ['savory','sweeteners','physical gift card','crushgrind gift bundle',
              'kitchen essentials','paprik','water','sesame seed','cilantro leaves',
              'corn','mushrooms','stock', 'bell peppers','cumin seeds','curry leaves',
              'extract', 'vanilla extract','fenugreek leaves','dried fenugreek leaves',
              'fennel pollen', 'cocoa powder']
for i in not_spices:
    try:
        spices.remove(i)
    except:
        print(i+' is not in spices')

2020-06-18 13:24:26 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): livejapan.com:443
2020-06-18 13:24:28 [urllib3.connectionpool] DEBUG: https://livejapan.com:443 "GET /en/article-a0001822/ HTTP/1.1" 200 None
2020-06-18 13:24:28 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): www.cnn.com:443
2020-06-18 13:24:28 [urllib3.connectionpool] DEBUG: https://www.cnn.com:443 "GET /travel/article/best-condiments/index.html HTTP/1.1" 200 53268


In [8]:
#clean df a lil
recipes.reset_index(inplace=True, drop = True)
recipes['directions'] = recipes.directions.str.replace('\n','')

In [9]:
#find the spices in each ingredient string
def find_spice(ingredients):
    spice_herb = []
    for i in ingredients:
        spice_herb.append([s for s in spices if s in i])
    for s in range(len(spice_herb)):
        if len(spice_herb[s]) == 0:
            spice_herb[s] = np.nan
    return spice_herb

In [10]:
recipes['spice_herb'] = find_spice(recipes.ingredients)

In [11]:
#remove list of spices from recipe if there is only one spice
def more_than_one(col):
    col_copy = col.copy()
    for i in range(len(col_copy)):
        if isinstance(col_copy[i],list):
            if len(col_copy[i]) < 2:
                col_copy[i] = np.nan
    return col_copy

recipes['more_than_one'] = more_than_one(recipes.spice_herb)

In [12]:
#find the cuisines and the fraction of recipes without spices identified
gb_cuisines = recipes.groupby('cuisine').count()
gb_cuisines['missing_spice'] = gb_cuisines.title - gb_cuisines.spice_herb
gb_cuisines['only_one_spice'] = gb_cuisines.title - gb_cuisines.more_than_one
gb_cuisines['frac_missing'] = gb_cuisines.missing_spice/gb_cuisines.title
gb_cuisines['frac_only_one'] = gb_cuisines.only_one_spice/gb_cuisines.title
gb_cuisines.sort_values('frac_only_one', ascending = False)

2020-06-18 13:24:32 [numexpr.utils] INFO: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-06-18 13:24:32 [numexpr.utils] INFO: NumExpr defaulting to 8 threads.


Unnamed: 0_level_0,title,ingredients,directions,rating,url,spice_herb,more_than_one,missing_spice,only_one_spice,frac_missing,frac_only_one
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
British,78,78,78,32,78,74,58,4,20,0.051282,0.25641
Tex-Mex,40,40,40,22,40,38,30,2,10,0.05,0.25
Asian,35,35,35,6,35,31,28,4,7,0.114286,0.2
Mexican,385,385,385,150,385,361,311,24,74,0.062338,0.192208
Kosher,73,73,73,13,73,69,59,4,14,0.054795,0.191781
Japanese,168,168,168,79,168,163,140,5,28,0.029762,0.166667
Latin American,84,84,84,36,84,82,70,2,14,0.02381,0.166667
American,169,169,169,99,169,165,144,4,25,0.023669,0.147929
French,304,304,304,91,304,292,265,12,39,0.039474,0.128289
Thai,177,177,177,75,177,171,155,6,22,0.033898,0.124294


In [13]:
recipes

Unnamed: 0,title,cuisine,ingredients,directions,rating,url,spice_herb,more_than_one
0,Grilled Skirt Steak Fajitas Recipe,Mexican,For the Steak Fajita Marinade: 1/2 cup (120ml)...,1. ...,4.684211,https://www.seriouseats.com/recipes/2013/06/gr...,"[cloves, chili powder, salsa, pepper, cumin, s...","[cloves, chili powder, salsa, pepper, cumin, s..."
1,Grilled Skirt Steak Fajitas Recipe,Tex-Mex,For the Steak Fajita Marinade: 1/2 cup (120ml)...,1. ...,4.684211,https://www.seriouseats.com/recipes/2013/06/gr...,"[cloves, chili powder, salsa, pepper, cumin, s...","[cloves, chili powder, salsa, pepper, cumin, s..."
2,Homemade Ramen Noodles Recipe,Japanese,8g baked baking soda 4g Diamond Crystal koshe...,1. ...,5.000000,https://www.seriouseats.com/recipes/2018/11/ho...,[salt],
3,Cheese Sauce for Cheese Fries and Nachos Recipe,Mexican,8 ounces extra sharp cheddar cheese (or a mix ...,1. ...,4.171429,https://www.seriouseats.com/recipes/2010/09/ch...,[pepper],
4,"Stir-Fried Lo Mein With Charred Cabbage, Shiit...",Chinese,Kosher salt 1 pound fresh lo mein noodles 1/4...,1. ...,4.142857,https://www.seriouseats.com/recipes/2014/06/lo...,"[cloves, sesame, scallions, chives, pepper, so...","[cloves, sesame, scallions, chives, pepper, so..."
...,...,...,...,...,...,...,...,...
2725,Quick Curtido (Mexican Cabbage Slaw) Recipe,Mexican,"1/2 small (2-pound) green cabbage, cored and v...",1. ...,4.500000,https://www.seriouseats.com/recipes/2014/05/qu...,"[salt, onion]","[salt, onion]"
2726,Grilled Skirt Steak With Mojo Marinade Recipe,Caribbean,"For the Steak: 2 pounds skirt steak, trimmed o...",1. ...,,https://www.seriouseats.com/recipes/2014/04/gr...,"[cloves, cilantro, pepper, cumin, garlic, salt]","[cloves, cilantro, pepper, cumin, garlic, salt]"
2727,Pasta,Italian,"8 ounces (225g) guanciale (cured pork jowl),...",1. ...,5.000000,https://www.seriouseats.com/recipes/2018/11/pa...,"[pepper, salt]","[pepper, salt]"
2728,Creamy Vegan Saag Paneer (With Tofu) Recipe,Indian,12 ounces (350g) extra-firm tofu 1 tablespoon...,1. ...,4.000000,https://www.seriouseats.com/recipes/2017/03/cr...,"[cloves, ginger, turmeric, mustard, miso, card...","[cloves, ginger, turmeric, mustard, miso, card..."


### Save Output ###

In [14]:
recipes.to_csv('recipes.csv')
with open('spices_herbs.txt', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(spices)