### Import Necessary Packages 

In [1]:
import pandas as pd
import re
import requests
import time
from bs4 import BeautifulSoup
from itertools import chain
from pandas.io.json import json_normalize
from recipe_scrapers import scrape_me
from tqdm import tqdm_notebook as tqdm
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

### Create Parser Function for Ingredients

In [2]:
# regex for separating ingredients list
SEPARATOR_RE = re.compile(r'^([\d\s*[\d\.,/]*)\s*(.+)')

# create a normalized string for ingredients
def normalize(st):
    """

    :param st:
    :return:
    """
    return re.sub(r'\s+', ' ', SEPARATOR_RE.sub('\g<1> \g<2>', st)).strip()


def escape_re_string(text):
    """

    :param text:
    :return:
    """
    text = text.replace('.', '\.')
    return re.sub(r'\s+', ' ', text)

# list of common units of measurements for ingredients
UNITS = {"cup": ["cups", "cup", "c.", "c"], "fluid_ounce": ["fl. oz.", "fl oz", "fluid ounce", "fluid ounces"],
         "gallon": ["gal", "gal.", "gallon", "gallons"], "ounce": ["oz", "oz.", "ounce", "ounces"],
         "pint": ["pt", "pt.", "pint", "pints"], "pound": ["lb", "lb.", "pound", "pounds"],
         "quart": ["qt", "qt.", "qts", "qts.", "quart", "quarts"],
         "tablespoon": ["tbsp.", "tbsp", "T", "T.", "tablespoon", "tablespoons", "tbs.", "tbs"],
         "teaspoon": ["tsp.", "tsp", "t", "t.", "teaspoon", "teaspoons"],
         "gram": ["g", "g.", "gr", "gr.", "gram", "grams"], "kilogram": ["kg", "kg.", "kilogram", "kilograms"],
         "liter": ["l", "l.", "liter", "liters"], "milligram": ["mg", "mg.", "milligram", "milligrams"],
         "milliliter": ["ml", "ml.", "milliliter", "milliliters"], "pinch": ["pinch", "pinches"],
         "dash": ["dash", "dashes"], "touch": ["touch", "touches"], "handful": ["handful", "handfuls"],
         "stick": ["stick", "sticks"], "clove": ["cloves", "clove"], "can": ["cans", "can"], "large": ["large"],
         "small": ["small"], "scoop": ["scoop", "scoops"], "filets": ["filet", "filets"], "sprig": ["sprigs", "sprig"]}

# numbers to separate quantities from ingredients
NUMBERS = ['seventeen', 'eighteen', 'thirteen', 'nineteen', 'fourteen', 'sixteen', 'fifteen', 'seventy', 'twelve',
           'eleven', 'eighty', 'thirty', 'ninety', 'twenty', 'seven', 'fifty', 'sixty', 'forty', 'three', 'eight',
           'four', 'zero', 'five', 'nine', 'ten', 'one', 'six', 'two', 'an', 'a', '½', '⅓','¼', '⅛', '¾']

prepositions = ["of"]

a = list(chain.from_iterable(UNITS.values()))
a.sort(key=lambda x: len(x), reverse=True)
a = map(escape_re_string, a)

PARSER_RE = re.compile(
    r'(?P<quantity>(?:[\d\.,][\d\.,\s/]*)?\s*(?:(?:%s)\s*)*)?(\s*(?P<unit>%s)\s+)?(\s*(?:%s)\s+)?(\s*(?P<name>.+))?' % (
        '|'.join(NUMBERS), '|'.join(a), '|'.join(prepositions)))


def parse(st):
    """

    :param st:
    :return:
    """
    st = normalize(st)
    res = PARSER_RE.match(st)
    
    return ((res.group('name') or '').strip())

# uncomment below code to return a tuple with measurement and ingredients

#     return ((res.group('quantity') or '').strip() + ' ' + (res.group('unit') or '').strip(),
#            (res.group('name') or '').strip())


# uncomment below code to return a dictionary with with measurements and ingredients as keys

#     return {
#             'measure': (res.group('quantity') or '').strip() + ' ' + (res.group('unit') or '').strip(),
#             'name': (res.group('name') or '').strip()
#         }


### Scrape Recipes from AllRecipes with Recipe_Scrapers

In [4]:
# Time the seconds required to complete site scraping and filtering
start_time = time.time()

# Create an empty list for the scraped links
linklist=[]
for i in tqdm(range(1,994)):
    responses=requests.get('https://www.allrecipes.com/recipes/80/main-dish/?page={}'.format(i))
    soup=BeautifulSoup(responses.content,'html.parser')
    for link in soup.findAll('a', attrs={'href': re.compile("^https://")}):
        linklist.append(link.get('href'))

# Filter through links that correspond with a singular dish and not multiple dishes       
dishes = [k for k in linklist if '/recipe/' in k]

# Remove duplicate links
recipes = [] 
for d in dishes: 
    if d not in recipes: 
        recipes.append(d)

# Print time surpassed
print('-------- %s seconds --------' % (time.time() - start_time))

HBox(children=(IntProgress(value=0, max=993), HTML(value='')))

-------- 963.9331917762756 seconds --------


In [5]:
# Count the number of recipes present after filtering
len(recipes)

16379

In [None]:
start_time = time.time()

# Use Scrape_Me package on links
data = []

# Instantiate a progress bar within for loop 
for links in tqdm(recipes):
    data.append(scrape_me(links))
    

# Create lists for name, ingredients, and total time of recipes
names = []
ings = []
times = []
for dat in tqdm(data):
    try:
        names.append(dat.title())
        ings.append(dat.ingredients())
        times.append(dat.total_time())
    except Exception as ex:
        print(str(ex))
        

# Print time surpassed
print('-------- %s seconds --------' % (time.time() - start_time))

HBox(children=(IntProgress(value=0, max=16379), HTML(value='')))

In [22]:
# Check that all of the recipes were scraped
len(data)

5667

### Parse Ingredients of Each Recipe

In [17]:
new_list = []
for sub_list in ings:
    new_list.append([])
    for item in sub_list:
            new_list[-1].append(parse(item))
ings = new_list

### Create and Save the DataFrame with Cleaned Data

In [12]:
# Create a dictionary to make a DataFrame from
alls = {'name':names, 'ingredients':ings1, 'total time (min)': times}
df = pd.DataFrame(alls)

# Preview DataFrame
df.head()

HBox(children=(IntProgress(value=0, max=5667), HTML(value='')))

Unnamed: 0,name,ingredients
0,World's Best Lasagna,"[1 pound sweet Italian sausage, 3/4 pound lean ground beef, 1/2 cup minced onion, 2 cloves garlic, crushed, 1 (28 ounce) can crushed tomatoes, 2 (6 ounce) cans tomato paste, 2 (6.5 ounce) cans canned tomato sauce, 1/2 cup water, 2 tablespoons white sugar, 1 1/2 teaspoons dried basil leaves, 1/2 teaspoon fennel seeds, 1 teaspoon Italian seasoning, 1 1/2 teaspoons salt, divided, or to taste, 1/4 teaspoon ground black pepper, 4 tablespoons chopped fresh parsley, 12 lasagna noodles, 16 ounces ricotta cheese, 1 egg, 3/4 pound mozzarella cheese, sliced, 3/4 cup grated Parmesan cheese]"
1,Pork Chops in Garlic Mushroom Sauce,"[2 pounds boneless pork chops, 1/2 teaspoon paprika, 1 pinch kosher salt and ground black pepper to taste, 1/4 cup butter, divided, 1 (8 ounce) package sliced fresh mushrooms, 4 cloves garlic, minced, 1 teaspoon Dijon mustard, 2 tablespoons all-purpose flour, 2 cups beef broth]"
2,Super Duper Slow Cooker Beef Stroganoff,"[1 1/2 pounds cubed beef stew meat, salt and ground black pepper to taste, 1 onion, chopped, 1 (10.75 ounce) can condensed cream of mushroom soup, 1/4 cup water, 1 tablespoon dried chives, 2 cloves garlic, minced, 1 tablespoon Worcestershire sauce, 1 cube beef bouillon, 1/2 cup red wine, 1 tablespoon cornstarch, 1 tablespoon all-purpose flour, 1 (8 ounce) container sour cream, 1 (8 ounce) package sliced fresh mushrooms, 4 ounces cream cheese, 1/2 cup chopped fresh parsley]"
3,Chef John's Perfect Prime Rib,"[4 pounds prime rib roast, 1/4 cup unsalted butter, softened, 1 tablespoon freshly ground black pepper, 1 teaspoon herbes de Provence, kosher salt]"
4,Beef Stroganoff for Instant Pot®,"[2 tablespoons canola oil, 1/2 onion, diced, 2 teaspoons salt, divided, 2 pounds beef stew meat, cut into 1-inch cubes, 1 teaspoon freshly ground black pepper, 3 cloves garlic, minced, 1/2 teaspoon dried thyme, 2 tablespoons soy sauce, 3 cups chopped mushrooms, 2 tablespoons all-purpose flour, 3 cups chicken broth, 1 (16 ounce) package wide egg noodles, 3/4 cup sour cream, or to taste]"


In [None]:
# Save the DataFrame
df.to_csv(r'/Users/stephaniekendall/Desktop/Errthang/Flatiron/projects/FP_Practice/hopeful.csv')

In [26]:
# Scrape the ratings for the recipes

# Determine the best way to get rating data, 1 average score or multiple

# ratings = []

# for recipe in tqdm(recipes):
#     responses = requests.get(recipe)
#     soup = BeautifulSoup(responses.content, 'html.parser')
#     rating = soup.find('meta', property='og:rating')
#     rating1 = soup.find('span', {'class':'review-star-text'}) 
#     ratings.append(rating['content'] if rating else rating1.text)

HBox(children=(IntProgress(value=0, max=5667), HTML(value='')))

KeyboardInterrupt: 

In [21]:
df['ingredients'].apply(pd.Series)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,5 teaspoons olive oil,"4 shallots, diced","1 large onion, cut into thin strips","1 pound bacon, cut into strips","1 clove garlic, chopped",1 (16 ounce) package fettuccini pasta,3 egg yolks,1/2 cup heavy cream,3/4 cup shredded Parmesan cheese,salt and pepper to taste,,,,,,,,,,,
1,1/2 pound ground turkey,1 egg,1/4 cup salsa,1/8 cup chopped red bell pepper,1/8 cup chopped yellow bell pepper,1/4 cup chopped onion,1/4 cup dry bread crumbs,lemon pepper to taste,,,,,,,,,,,,,
2,1 1/2 cups uncooked white rice,3 cups water,4 tablespoons vegetable oil,1 cup fresh bean sprouts,1/2 cup chopped onion,"1 1/2 cups cooked medium shrimp, peeled and deveined without tail",1/4 cup chopped green onion,"2 eggs, beaten",1 teaspoon salt,1/4 teaspoon ground black pepper,4 tablespoons soy sauce,1/4 teaspoon sesame oil,,,,,,,,,
3,3 pounds beef roast,6 potatoes,1 1/2 cups baby carrots,1 yellow onion,2 stalks celery,3 cubes beef bouillon,1/2 cup water,,,,,,,,,,,,,,
4,1 egg,1/4 cup milk,1 pound ground beef,1/4 cup dry cream of wheat cereal,1/4 cup minced onion,1 (10.75 ounce) can condensed cream of chicken soup,1 (10.75 ounce) can condensed cream of mushroom soup,1 (12 fluid ounce) can evaporated milk,1 tablespoon chopped fresh parsley,,,,,,,,,,,,
5,8 ounces farfalle (bow tie) pasta,1/4 cup butter,"1 clove garlic, minced",1/4 cup all-purpose flour,1/2 teaspoon salt,1/8 teaspoon ground black pepper,2 cups milk,1/2 teaspoon prepared mustard,2 1/2 cups shredded Colby cheese,"4 ounces cooked ham, julienned",1/4 cup grated Parmesan cheese,,,,,,,,,,
6,1 tablespoon olive oil,"1 (3 pound) roasting chicken, deboned and cut into bite size pieces","2 cloves garlic, crushed","1 onion, chopped","1 large potato, diced",1 teaspoon ground cumin,1 teaspoon ground coriander seed,1 teaspoon ground black pepper,1 teaspoon crushed red pepper flakes,1 teaspoon salt,1 cup water,¾ cup unsalted natural-style peanut butter,"1 (15 ounce) can garbanzo beans, drained and rinsed",,,,,,,,
7,"1 ¼ pounds skinless, boneless chicken breast halves",3 tablespoons all-purpose flour,2 tablespoons olive oil,2 teaspoons butter,"2 tablespoons shallots, minced","2 cloves garlic, minced",1 cup water,½ cup white wine,1 cube chicken bouillon,"½ teaspoon dried rosemary, crushed",¼ teaspoon salt,1 pinch ground black pepper,,,,,,,,,
8,10 ounces fettuccini pasta,1/2 cup butter,"5 cloves garlic, chopped",1 cup heavy cream,1 egg yolk,2 cups freshly grated Parmesan cheese,2 tablespoons dried parsley,,,,,,,,,,,,,,
9,1 tablespoon butter,1/2 cup minced white onion,"3 cloves garlic, minced",1 tablespoon chopped fresh thyme,3/4 cup low-sodium beef broth,1/2 cup port wine,1 tablespoon vegetable oil,4 filet mignon steaks (1 1/2 inch thick),3/4 cup crumbled blue cheese,1/4 cup panko bread crumbs,,,,,,,,,,,
