In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import itertools
from tqdm import tqdm
import re

In [16]:
def clean_numbers(txt):
    """
    Reduce a token such as '10mg.' to its numeric part.

    Inputs: string (ex. '10mg.')
    Outputs: string with every character other than digits and '.' removed,
             and one trailing '.' dropped (ex. '10'). Returns '' when the
             input contains no digits or '.' at all.
    """
    #Remove everything that is not a digit or '.' (raw string: no invalid escapes)
    txt = re.sub(r'[^0-9.]', '', txt)
    #Remove '.' at end; endswith is safe on an empty string, whereas txt[-1]
    #raised IndexError when nothing was left after stripping
    if txt.endswith('.'):
        txt = txt[:-1]
    return txt

In [17]:
def get_nutrients(URL, timeout=30):
    """
    Inputs: string of url (Allrecipes url for food recipe);
            timeout in seconds for the HTTP request (default 30)
    Outputs: one-row dataframe of nutritional values plus a 'url' column;
             an empty dataframe if the request fails, the response is not OK,
             or the nutrition section is not on the page
    What it Does: Gets html of page, extracts nutritional info, returns in dataframe
    """
    #A failed or timed-out request is treated like a page without nutrition data,
    #so one bad url does not abort a long crawl
    try:
        page = requests.get(URL, timeout=timeout)
    except requests.RequestException:
        return pd.DataFrame()
    if not page.ok:
        return pd.DataFrame()

    soup = BeautifulSoup(page.content, 'html.parser')
    #find() returns None when a class is missing, so the chained lookup raises
    #AttributeError; that means the information needed is not on the page
    try:
        txt = soup.find(class_='partial recipe-nutrition-section').find(class_="section-body").text
    except AttributeError:
        return pd.DataFrame()

    #Keep only the first line of the section; entries are separated by ';'
    txt = txt.strip().split('\n')[0]

    #Pair nutrient names with numbers inside each ';' segment
    #(ex. '27.3g fat' -> {'fat': '27.3'}). Pairing per segment keeps a malformed
    #segment from shifting every later value onto the wrong nutrient.
    d = {}
    for val in txt.split(';'):
        v = val.split()
        #Tokens containing a digit are values - strip labels (mg) and trailing '.'
        num = [clean_numbers(x) for x in v if any(char.isdigit() for char in x)]
        #Tokens without any digit are nutrient names
        nut = [x.lower() for x in v if not any(char.isdigit() for char in x)]
        d.update(zip(nut, num))

    #save nutrient data to dataframe for concatenation
    df = pd.DataFrame(data=[d])
    df['url'] = str(URL)
    return df

In [18]:
def get_recipe_urls(n_pages=20):
    """
    Inputs: number of listing pages to scan (default 20)
    Outputs: list of unique links to slow cooker recipes from Allrecipes
             (order is not defined because duplicates are removed with a set)
    What it Does: finds all slow cooker recipes from first n_pages pages of allrecipes
    """
    url = 'https://www.allrecipes.com/recipes/253/everyday-cooking/slow-cooker/?page={}'
    #Match the value of a "url": "..." entry that points at a recipe page.
    #[^"]* stops at the closing quote; the previous greedy .* could run on to the
    #last quote of the line and capture far more than the link.
    recipe_link = re.compile(r'(?<="url": ")https://www\.allrecipes\.com/recipe/[^"]*(?=")')
    all_url = set()
    #listing pages are numbered from 1
    for page_number in range(1, n_pages + 1):
        #timeout so a stalled connection raises instead of hanging forever
        page = requests.get(url.format(page_number), timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        #Turn soup output into string and regex find all recipe links
        all_url.update(recipe_link.findall(str(soup)))
    #outputs list of urls with slow cooker recipes
    return list(all_url)

In [19]:
def add_recipe_names(url):
    """
    Inputs: iterable of urls (ex. the 'url' column of the dataframe)
    Outputs: list of recipe names extracted from the urls, one per input url
             (ex. '.../recipe/236128/slow-cooker-salsa-chicken/' -> 'Slow Cooker Salsa Chicken')
    """
    recipes = []
    for u in url:
        text = str(u)
        #Ignore a single trailing '/' so the name is the final path segment
        #whether or not the url ends with '/'
        if text.endswith('/'):
            text = text[:-1]
        #rsplit never fails: a string without '/' is used whole, where indexing
        #the two last '/' positions raised IndexError
        name = text.rsplit('/', 1)[-1]
        #Clean name by removing '-' and title case
        recipes.append(name.replace('-', ' ').title())
    return recipes

In [20]:
#Start from an empty frame: one column per nutrient plus the recipe link
df = pd.DataFrame(
    columns=[
        'calories',
        'carbohydrates',
        'cholesterol',
        'fat',
        'protein',
        'sodium',
        'url',
    ]
)

In [21]:
#Collect the unique recipe URLs from the slow cooker listing pages (network call)
url_list = get_recipe_urls()
len(url_list) #485 recipes at the time of this run

485

In [22]:
#Fetch the nutrient frame for every unique url (one HTTP request per url)
nutrient_frames = [get_nutrients(u) for u in tqdm(url_list)]
#Merge everything into df with a single concat; concatenating inside the loop
#re-copies the growing frame on every iteration
df = pd.concat([df] + nutrient_frames, ignore_index=True, sort=True)

100%|██████████| 485/485 [05:53<00:00,  1.37it/s]


In [23]:
#Rename nutrient columns by name to include units. Assigning df.columns
#positionally fails with a length mismatch as soon as the scrape yields an
#extra column or this cell is run a second time.
df = df.rename(columns={
    'carbohydrates': 'carbohydrates_g',
    'cholesterol': 'cholesterol_mg',
    'fat': 'fat_g',
    'protein': 'protein_g',
    'sodium': 'sodium_mg',
})
#Derive a readable recipe name from each url
df['recipe'] = add_recipe_names(df.url)
#Reorder columns (any column not listed here is dropped)
df = df[['recipe', 'calories', 'carbohydrates_g', 'cholesterol_mg', 'fat_g', 'protein_g', 'sodium_mg', 'url']]
df.head()

Unnamed: 0,recipe,calories,carbohydrates_g,cholesterol_mg,fat_g,protein_g,sodium_mg,url
0,Slow Cooker Creamed Corn Just Like Rudys Bbq,354,26.1,86.8,27.3,5.9,243.4,https://www.allrecipes.com/recipe/234375/slow-...
1,Slow Cooker Salsa Chicken,148,7.5,58.5,2.4,23.1,539.7,https://www.allrecipes.com/recipe/236128/slow-...
2,Randys Slow Cooker Ravioli Lasagna,544,52.9,91.3,23.6,29.4,1333.8,https://www.allrecipes.com/recipe/234397/randy...
3,Warm Mexican Corn Dip,138,6.2,35.0,12.3,2.1,272.9,https://www.allrecipes.com/recipe/107512/warm-...
4,Alisons Slow Cooker Vegetable Beef Soup,228,21.2,37.7,9.9,15.4,1716.8,https://www.allrecipes.com/recipe/26354/alison...


In [24]:
#Save the nutrient table to 'recipe_nutrients.csv' (relative path), without the index column
df.to_csv('recipe_nutrients.csv', index=False)
#Display (rows, columns) of the saved frame
df.shape

(484, 8)