Code source: https://github.com/DerekHodgson/Scraping-AllRecipes/blob/master/scrape.py

In [1]:
# from urllib.request import urlopen
import requests
from io import BytesIO
import pandas as pd
from bs4 import BeautifulSoup
import re
import pickle
import json
import gc

In [2]:
# import the scraped URLs
urlList = [line.strip() for line in open('urlList.txt')]

In [3]:
# number of recipe urls
len(urlList)

69737

In [4]:
# remove any duplicates
urlList = list(set(urlList))

In [5]:
# check number of urls after removing duplicates
# (only one url removed due to duplication)
len(urlList)

69736

In [6]:
def get_photo_urls(url):
    photo_urls_list = []
    
    try:
#         page = urlopen(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'lxml')

        htmlPhotoIDs = re.search('(?<=photoCollection:)(.*}})(?=,)', str(soup)).group(0)
        htmlPhotoIDs = json.loads(htmlPhotoIDs)

        photo_url_root = 'https://images.media-allrecipes.com/userphotos/'
        photo_ext = '.jpg'

        for photo_id in htmlPhotoIDs:
            photo_urls_list.append(photo_url_root + str(photo_id) + photo_ext)
    except:
        pass
        
    return photo_urls_list

In [7]:
def get_json_info(json_text, soup, recipe_url):
    # get 'title'
    try:
        title = json_text['name'].strip()
    except:
        title = ''
    
    # get 'ratings'
    try:
        ratings = json_text['aggregateRating']['ratingCount']
    except:
        ratings = ''
    
    # get 'avg_rating'
    try:
        avg_rating = json_text['aggregateRating']['ratingValue']
    except:
        avg_rating = ''
    
    # get 'madeit'
    madeit = ''  # 'Made It' not available in this html style recipe
    
    # get 'reviews'
    try:
        reviews = soup.find_all('a', {'class': 'ugc-ratings-link ugc-reviews-link'})[0].text # not in json!
    except:
        reviews = ''
    
    # get 'photos'
    try:
        photos = soup.find_all('a', {'class': 'ugc-ratings-link ugc-photos-link'})[0].text # not in json!
    except:
        photos = ''
    
    # get 'photo_urls'
    photo_urls = []
    
    if photos != '': 
        if int(photos.split()[0]) > 0:
            try:
                main_photo_info = soup.find_all('div', {'class': 'image-container'})[0]
                main_photo_url = re.search('(?<=data-src=")(.*.jpg)(?=")',
                                           str(main_photo_info)).group(1)
                photo_urls.append(main_photo_url)

                photo_slide = soup.find_all(lambda tag: tag.name == 'a' and 
                                            tag.get('class') == ['ugc-photos-link'])
                for i in photo_slide:
                    photo_url = re.search('(?<=data-src=")(.*.jpg)(?=")',
                                          str(i)).group(1)
                    photo_urls.append(photo_url)
            except:
                pass
    
    # get 'description'
    try:
        description = json_text['description'].strip()
    except:
        description = ''
    
    # get 'ingredients_list'
    try:
        ingredients_list = json_text['recipeIngredient']
        ingredients_list = [i.strip() for i in ingredients_list]
    except:
        ingredients_list = []
    
    # get 'readyin' and 'servings'
    try: 
        htmlReady_Servings = soup.find_all('div', {'class': 'recipe-meta-item'})
        ready_servings_list = [' '.join(i.text.lower().split()) for i in htmlReady_Servings]
        ready_servings_dict = {j.split(': ')[0]: j.split(': ')[1] for j in ready_servings_list}
    except:
        ready_servings_dict = {}
        
    try:
        readyin = ready_servings_dict['total']  # not in json!
    except:
        readyin = ''
        
    try:
        servings = ready_servings_dict['servings']  # not in json!
    except:
        servings = ''
    
    # get 'directions_list'
    try:
        htmlDirections = json_text['recipeInstructions']
        directions_list = [d['text'].strip() for d in htmlDirections]
    except:
        directions_list = []
    
    # get 'calories'
    try:
        calories = json_text['nutrition']['calories']
    except:
        calories = ''
    
    # get 'fat'
    try:
        fat = json_text['nutrition']['fatContent']
    except:
        fat = ''
    
    # get 'carbohydrates'
    try:
        carbohydrates = json_text['nutrition']['carbohydrateContent']
    except:
        carbohydrates = ''
    
    # get 'protein'
    try:
        protein = json_text['nutrition']['proteinContent']
    except:
        protein = ''

    # compile recipe info into dictionary and return
    recipe_dict = {'title': title,
                   'recipe_url': recipe_url,
                   'ratings': ratings,  
                   'avg_rating': avg_rating,
                   'madeit': madeit,  # this style html doesn't have 'Made It', but has number of 'Ratings'
                   'reviews': reviews,
                   'photos': photos,
                   'photo_urls': photo_urls,
                   'description': description,
                   'ingredients': ingredients_list,
                   'readyin': readyin,
                   'servings': servings,
                   'directions': directions_list,
                   'calories': calories,
                   'fat': fat,
                   'carbohydrates': carbohydrates,
                   'protein': protein}
    
    return recipe_dict

In [8]:
def scrape_html_info(soup, recipe_url):
    # get 'title'
    try:
        title = re.search('(?<=<title>)(.*)(?=Recipe - Allrecipes.com)', 
                          str(soup.title)).group(1).strip()
    except:
        title = ''
    
    # get 'ratings'
    ratings = ''  # 'Ratings' not available in this html style recipe
    
    # get rating info ('avg_rating', 'reviews')
    htmlRating = soup.find_all('span', {'itemprop': 'aggregateRating'})
    
    # get 'avg_rating'
    try:
        avg_rating = re.search('(?<=content=")(.*)(?=" itemprop="ratingValue")',
                               str(htmlRating)).group(1)
    except:
        avg_rating = ''
    
    # get 'madeit'
    try:
        madeit = re.search('(?<="made-it-count"></span><span>)(.*)(?=made it)',
                           str(soup)).group(1).split()[0]
    except:
        madeit = ''
    
    # get 'reviews'
    try:
        reviews = re.search('(?<=content=")(.*)(?=" itemprop="reviewCount")',
                            str(htmlRating)).group(1)
    except:
        reviews = ''
    
    # get 'photos'
    try:
        photos = soup.find_all('span', {'class': 'picture-count-link'})[0].text.split()[0]
    except:
        photos = ''
    
    # get 'photo_urls'
    photo_urls = []
    
    if photos != '':
        if int(photos) > 0:
            photos_page_url = soup.find('a', {'class': 'icon-photoPage-link'})['href']
            if photos_page_url is not None:
                photo_urls = get_photo_urls(photos_page_url)
    
    # get 'description'
    try:
        description = soup.find_all('div', {'class': 'submitter__description'})[0].text
    except:
        description = ''
    
    # get 'ingredients'
    htmlIngredients = soup.find_all('span', {'class': "recipe-ingred_txt added"})
    ingredients_list = []
    if htmlIngredients is not None:
        for ingredient in htmlIngredients:
            ingredients_list.append(ingredient.text.strip())
        
    # get 'readyin'
    try:
        readyin = soup.find_all('span', {'class': 'ready-in-time'})[0].text
    except:
        readyin = ''
    
    # get 'servings'
    try:
        servings = soup.find_all('meta', {'itemprop': 'recipeYield'})[0]['content']
    except:
        servings = ''
    
    # get 'directions'
    htmlDirections = soup.find_all('span', {'class': "recipe-directions__list--item"})
    directions_list = []
    if htmlDirections is not None:
        for direction in htmlDirections:
            # sometimes there is an empty direction following the last direction
            if direction.text != '':
                directions_list.append(direction.text.strip())
    
    # get 'calories'
    try:
        calories = soup.find_all('span', {'itemprop': 'calories'})[0].text.split()[0]
    except:
        calories = ''
    
    # get 'fat'
    try:
        fat = soup.find_all('span', {'itemprop': 'fatContent'})[0].text
    except:
        fat = ''
    
    # get 'carbohydrates'
    try:
        carbohydrates = soup.find_all('span', {'itemprop': 'carbohydrateContent'})[0].text
    except:
        carbohydrates = ''
    
    # get 'protein'
    try:
        protein = soup.find_all('span', {'itemprop': 'proteinContent'})[0].text
    except:
        protein = ''
    
    # compile recipe info into dictionary and return
    recipe_dict = {'title': title,
                   'recipe_url': recipe_url,
                   'ratings': ratings,  # this style html doesn't have number of ratings, but has number 'Made It'
                   'avg_rating': avg_rating,
                   'madeit': madeit,
                   'reviews': reviews,
                   'photos': photos,
                   'photo_urls': photo_urls,
                   'description': description,
                   'ingredients': ingredients_list,
                   'readyin': readyin,
                   'servings': servings,
                   'directions': directions_list,
                   'calories': calories,
                   'fat': fat,
                   'carbohydrates': carbohydrates,
                   'protein': protein}
    
    return recipe_dict

In [9]:
from PIL import Image
import numpy as np

# number of recipes to scrape
n_recipes = len(urlList)

# create empty list to hold recipes and images/labels
recipe_list = []
recipe_image_arrays_with_labels = []

with open("recipe_data.csv", "a") as recipe_file, open("recipe_image_data.csv", "a") as image_file:
    df = pd.DataFrame(columns=['title',
                               'recipe_url',
                               'ratings',
                               'avg_rating', 
                               'madeit', 
                               'reviews', 
                               'photos', 
                               'photo_urls', 
                               'description', 
                               'ingredients', 
                               'readyin', 
                               'servings', 
                               'directions', 
                               'calories', 
                               'fat', 
                               'carbohydrates', 
                               'protein'])
    df.to_csv(recipe_file, index=False)
    
    df_img = pd.DataFrame(columns=['image_array', 
                                   'label'])
    df_img.to_csv(image_file, index=False)
    
    # scrape recipe characteristics from urls
    for i in range(n_recipes):
        print(i)
        print(urlList[i])

        # create empty recipe dictionary
        recipe_dict = {}

        # get website html
#         page = urlopen(urlList[i])
        page = requests.get(urlList[i])
        soup = BeautifulSoup(page.text, "lxml")

        # if source has: <script type="application/ld+json">, 
        # then scrape most info from this json script
        has_json = soup.find('script', type='application/ld+json')
        if has_json is not None:
            json_text = json.loads(has_json.text)[1]
            recipe_dict = get_json_info(json_text, soup, urlList[i])
        else:
            recipe_dict = scrape_html_info(soup, urlList[i])

        # save all images as 64x64x3 arrays to `recipe_image_arrays_with_labels` 
        # list with label (i.e. recipe title)
        label = recipe_dict['title'].lower().strip().replace(' ', '_')

        if recipe_dict['photo_urls'] != []:
            for i in recipe_dict['photo_urls']:
                try:
#                     img = Image.open(urlopen(i))
                    img_page = requests.get(i)
                    img_page_bytes = BytesIO(img_page.content)
                    img = Image.open(img_page_bytes)
                    resized_img = img.resize((64, 64))
                    img_array = np.asarray(resized_img)
                    recipe_image_arrays_with_labels.append([img_array, 
                                                            label])
                    df_img = pd.DataFrame([{'img_array': img_array, 
                                        'label': label}])
                    df_img.to_csv(image_file, header=False, index=False)
                except:
                    pass

        # add recipe dictionary to list of recipes
        recipe_list.append(recipe_dict)

        # add recipe to .csv file output (convert lists to strings delimited by '%>%')
        recipe_dict['photo_urls'] = '%>%'.join(recipe_dict['photo_urls'])
        recipe_dict['ingredients'] = '%>%'.join(recipe_dict['ingredients'])
        recipe_dict['directions'] = '%>%'.join(recipe_dict['directions'])

        df = pd.DataFrame.from_dict([recipe_dict])
        df.to_csv(recipe_file, header=False, index=False)

0
https://www.allrecipes.com/recipe/221230/margarita-punch/
1
https://www.allrecipes.com/recipe/19047/chocolate-wafer-roll/
2
https://www.allrecipes.com/recipe/254997/acorn-squash-risotto/
3
https://www.allrecipes.com/recipe/277338/christmas-cobbler/
4
https://www.allrecipes.com/recipe/218329/cool-and-refreshing-cantaloupe-drink/
5
https://www.allrecipes.com/recipe/85256/easy-honey-mustard-mozzarella-chicken/
6
https://www.allrecipes.com/recipe/86938/orzo-with-chicken-and-artichokes/
7
https://www.allrecipes.com/recipe/15451/southern-butternut-squash/
8
https://www.allrecipes.com/recipe/218537/restaurant-style-sausage-gravy-and-biscuits/
9
https://www.allrecipes.com/recipe/213024/cbs-black-eyed-peas/
10
https://www.allrecipes.com/recipe/14679/sesame-beef/
11
https://www.allrecipes.com/recipe/81939/cheesy-chicken-broccoli-bake/
12
https://www.allrecipes.com/recipe/278216/easy-cabbage-onion-and-kale-stir-fry/
13
https://www.allrecipes.com/recipe/269004/one-pot-spaghetti-with-meat-sauce/


114
https://www.allrecipes.com/recipe/15587/apple-crumb-pie/
115
https://www.allrecipes.com/recipe/11467/crunchy-date-rounds/
116
https://www.allrecipes.com/recipe/236200/amazing-buffalo-dip/
117
https://www.allrecipes.com/recipe/70798/easy-grape-salad/
118
https://www.allrecipes.com/recipe/9175/pear-conserve-with-cherries-and-hazelnuts/
119
https://www.allrecipes.com/recipe/231506/strawberry-cream-pie-to-die-for/
120
https://www.allrecipes.com/recipe/228479/perfect-caprese-salad/
121
https://www.allrecipes.com/recipe/273805/charred-corn-and-quinoa/
122
https://www.allrecipes.com/recipe/263094/grilled-swordfish-tacos-with-tomatillo-mango-salsa/
123
https://www.allrecipes.com/recipe/261722/vegan-black-eyed-pea-salad-with-cilantro/
124
https://www.allrecipes.com/recipe/20437/kiwi-lime-refresher/
125
https://www.allrecipes.com/recipe/260520/apple-frog-for-kids/
126
https://www.allrecipes.com/recipe/240817/breakfast-power-smoothie/
127
https://www.allrecipes.com/recipe/267409/easy-air-frye

225
https://www.allrecipes.com/recipe/212695/deep-dish-persimmon-pie/
226
https://www.allrecipes.com/recipe/276599/healthier-mediterranean-tuna-salad/
227
https://www.allrecipes.com/recipe/218121/pops-dill-pickles/
228
https://www.allrecipes.com/recipe/45596/savory-tomato-bread-pudding/
229
https://www.allrecipes.com/recipe/9709/chocolate-crisps/
230
https://www.allrecipes.com/recipe/10003/cookie-press-shortbread/
231
https://www.allrecipes.com/recipe/261874/easy-caramel-apple-cookie-bars/
232
https://www.allrecipes.com/recipe/222294/sauteed-rice-with-kale/
233
https://www.allrecipes.com/recipe/217869/sour-cream-lemon-pound-cake-with-cherry-compote/
234
https://www.allrecipes.com/recipe/223520/john-gs-caesar-salad-dressing/
235
https://www.allrecipes.com/recipe/23875/spicy-chicken-thai-noodle-soup/
236
https://www.allrecipes.com/recipe/239022/sweet-n-creamy-peanut-butter-apple-sandwich/
237
https://www.allrecipes.com/recipe/221144/trieste-tequila-cooler/
238
https://www.allrecipes.com/

339
https://www.allrecipes.com/recipe/234671/spicy-vegan-lentil-quinoa-soup/
340
https://www.allrecipes.com/recipe/218362/4th-of-july-star-cupcakes/
341
https://www.allrecipes.com/recipe/74814/spinach-potatoes/
342
https://www.allrecipes.com/recipe/202840/almond-flour-pancakes/
343
https://www.allrecipes.com/recipe/255504/apple-smiles/
344
https://www.allrecipes.com/recipe/260916/apple-caramel-cheesecake/
345
https://www.allrecipes.com/recipe/71548/sauteed-green-beans/
346
https://www.allrecipes.com/recipe/277226/red-wine-hot-chocolate/
347
https://www.allrecipes.com/recipe/220238/oyster-stew-christmas-eve-recipe/
348
https://www.allrecipes.com/recipe/25647/apricot-balls/
349
https://www.allrecipes.com/recipe/17426/firecracker-burgers/
350
https://www.allrecipes.com/recipe/230138/framboise-fizz/
351
https://www.allrecipes.com/recipe/275617/balsamic-glazed-stuffed-chicken-breasts-with-pesto-and-parmesan/
352
https://www.allrecipes.com/recipe/86810/curried-shrimp-bisque/
353
https://www.

453
https://www.allrecipes.com/recipe/237345/smokey-vegetarian-cuban-black-bean-soup/
454
https://www.allrecipes.com/recipe/239935/japanese-dressing/
455
https://www.allrecipes.com/recipe/177619/lemon-herb-grilled-chicken/
456
https://www.allrecipes.com/recipe/50761/pancetta-wrapped-shrimp-with-chipotle-vinaigrette-and-cilantro-oil/
457
https://www.allrecipes.com/recipe/257653/sweet-and-spicy-twice-baked-sweet-potatoes/
458
https://www.allrecipes.com/recipe/87211/chicken-pesto-paninis/
459
https://www.allrecipes.com/recipe/128967/baked-delicata-squash-with-lime-butter/
460
https://www.allrecipes.com/recipe/231750/spinach-potatoes-and-bacon-au-gratin/
461
https://www.allrecipes.com/recipe/17589/albondigas-soup-iii/
462
https://www.allrecipes.com/recipe/237956/easy-strawberry-cupcakes/
463
https://www.allrecipes.com/recipe/21463/carrot-souffle/
464
https://www.allrecipes.com/recipe/270718/leeks-with-chardonnay-and-creme-fraiche/
465
https://www.allrecipes.com/recipe/11758/baked-ziti-i/
4

568
https://www.allrecipes.com/recipe/220251/asparagus-ham-and-lemon/
569
https://www.allrecipes.com/recipe/13792/pumpkin-pie-squares/
570
https://www.allrecipes.com/recipe/246883/awake-peanut-butter-snack-bites/
571
https://www.allrecipes.com/recipe/99058/cauliflower-and-kale-with-mustard-currant-dressing/
572
https://www.allrecipes.com/recipe/20743/claires-yummy-crepes/
573
https://www.allrecipes.com/recipe/76989/mixed-grill-of-sausage-chicken-and-lamb-with-tandoori-flavorings/
574
https://www.allrecipes.com/recipe/7503/sock-it-to-me-cake-i/
575
https://www.allrecipes.com/recipe/9534/old-fashioned-homemade-hard-candy/
576
https://www.allrecipes.com/recipe/261575/broccoli-slaw-with-spicy-dressing/
577
https://www.allrecipes.com/recipe/223014/passover-pumpkin-muffins/
578
https://www.allrecipes.com/recipe/13816/easy-english-trifle/
579
https://www.allrecipes.com/recipe/24241/spicy-hot-crab-spread/
580
https://www.allrecipes.com/recipe/86411/barbecued-korean-ribs/
581
https://www.allrec

682
https://www.allrecipes.com/recipe/160603/bonds-vesper/
683
https://www.allrecipes.com/recipe/20906/doctor-bird-cake/
684
https://www.allrecipes.com/recipe/106584/campfire-foil-packs/
685
https://www.allrecipes.com/recipe/232428/habanero-cream-sauce/
686
https://www.allrecipes.com/recipe/267637/simple-peach-salsa/
687
https://www.allrecipes.com/recipe/26021/creme-de-menthe-cake-ii/
688
https://www.allrecipes.com/recipe/17536/italian-cassata-cake/
689
https://www.allrecipes.com/recipe/258166/the-best-vanilla-bean-cheesecake/
690
https://www.allrecipes.com/recipe/149302/tomato-and-bread-soup/
691
https://www.allrecipes.com/recipe/239113/billy-boys-butter-tart-slice/
692
https://www.allrecipes.com/recipe/217109/make-ahead-vegetarian-moroccan-stew/
693
https://www.allrecipes.com/recipe/233888/mudslide-mousse-shots/
694
https://www.allrecipes.com/recipe/129065/easy-peasy-rice-bake/
695
https://www.allrecipes.com/recipe/261202/pumpkin-almond-date-balls/
696
https://www.allrecipes.com/reci

797
https://www.allrecipes.com/recipe/259755/delicious-sweet-potato-pie-recipe/
798
https://www.allrecipes.com/recipe/258051/chicken-brown-rice-sloppy-joes/
799
https://www.allrecipes.com/recipe/223420/dads-working-mans-macaroni-salad/
800
https://www.allrecipes.com/recipe/240641/salt-dough/
801
https://www.allrecipes.com/recipe/220158/turkey-cranberry-wreath-bake/


KeyboardInterrupt: 