## The purpose of this notebook is to collect cooking information on banana bread recipes off of allrecipes.com. The data being collected includes ratings, how many people made the recipe, reviews, the ingredient list, ect. 


In [6]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import requests
import requests_cache
import numpy as np
import pandas as pd

## Cache data

In [7]:
requests_cache.install_cache("/Users/shermanpeng/Documents/Python/Webscrape")

In [8]:
domain = 'http://allrecipes.com'

In [9]:
the_pages = ['http://allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/?internalSource=hubcard&referringContentType=search%20results&clickId=cardslot%201&page=1#1',
 'http://allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/?internalSource=hubcard&referringContentType=search%20results&clickId=cardslot%201&page=2#2',
 'http://allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/?internalSource=hubcard&referringContentType=search%20results&clickId=cardslot%201&page=3#3',
 'http://allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/?internalSource=hubcard&referringContentType=search%20results&clickId=cardslot%201&page=4#4',
 'http://allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/?internalSource=hubcard&referringContentType=search%20results&clickId=cardslot%201&page=5#5',
 'http://allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/?internalSource=hubcard&referringContentType=search%20results&clickId=cardslot%201&page=6#6',
 'http://allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/?internalSource=hubcard&referringContentType=search%20results&clickId=cardslot%201&page=7#7',
 'http://allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/?internalSource=hubcard&referringContentType=search%20results&clickId=cardslot%201&page=8#8',
 'http://allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/?internalSource=hubcard&referringContentType=search%20results&clickId=cardslot%201&page=9#9']

In [10]:
#FUNCTION TO GET ALL RECIPE LINKS ON PAGE
def get_links(pages):
    each_link = []
    html = requests.get(pages).text
    soup = BeautifulSoup(html, "html.parser")
    
    recipe_pages = soup.findAll("article", {"class": "fixed-recipe-card"})
    for p in recipe_pages:
        try:
            if '/recipe' in p.find("a")["href"]:
                each_link.append(p.find("a")["href"])
        except:
            continue

    return (each_link)
    

In [11]:
all_link = [get_links(x) for x in the_pages]

In [12]:
all_link

[['/recipe/22967/nannas-banana-bread/',
  '/recipe/6765/4h-banana-bread/',
  '/recipe/218541/caramel-macchiato-banana-bread/',
  '/recipe/219967/chef-johns-banana-bread/',
  '/recipe/7154/creamy-banana-bread/',
  '/recipe/17252/chocolate-banana-bread/',
  '/recipe/228442/raspberry-banana-bread/',
  '/recipe/16999/grandmas-homemade-banana-bread/',
  '/recipe/141933/banana-zucchini-bread/',
  '/recipe/17652/banana-crumb-muffins/',
  '/recipe/20144/banana-banana-bread/',
  '/recipe/17066/janets-rich-banana-bread/',
  '/recipe/6984/banana-sour-cream-bread/',
  '/recipe/67937/extreme-banana-nut-bread-ebnb/',
  '/recipe/15747/best-ever-banana-bread/',
  '/recipe/16952/the-best-banana-bread/',
  '/recipe/6687/banana-bread/',
  '/recipe/6993/cathys-banana-bread/',
  '/recipe/6990/sarahs-banana-bread-muffins/',
  '/recipe/23426/almost-no-fat-banana-bread/',
  '/recipe/20153/chocolate-chip-banana-bread-ii/',
  '/recipe/6713/easy-banana-bread/',
  '/recipe/17754/janines-best-banana-bread/',
  '/r

In [13]:
#ADD DOMAIN TO TO LINKS TO BE URLS
dlink = [domain + link for item in all_link for link in item ]


In [14]:
dlink

['http://allrecipes.com/recipe/22967/nannas-banana-bread/',
 'http://allrecipes.com/recipe/6765/4h-banana-bread/',
 'http://allrecipes.com/recipe/218541/caramel-macchiato-banana-bread/',
 'http://allrecipes.com/recipe/219967/chef-johns-banana-bread/',
 'http://allrecipes.com/recipe/7154/creamy-banana-bread/',
 'http://allrecipes.com/recipe/17252/chocolate-banana-bread/',
 'http://allrecipes.com/recipe/228442/raspberry-banana-bread/',
 'http://allrecipes.com/recipe/16999/grandmas-homemade-banana-bread/',
 'http://allrecipes.com/recipe/141933/banana-zucchini-bread/',
 'http://allrecipes.com/recipe/17652/banana-crumb-muffins/',
 'http://allrecipes.com/recipe/20144/banana-banana-bread/',
 'http://allrecipes.com/recipe/17066/janets-rich-banana-bread/',
 'http://allrecipes.com/recipe/6984/banana-sour-cream-bread/',
 'http://allrecipes.com/recipe/67937/extreme-banana-nut-bread-ebnb/',
 'http://allrecipes.com/recipe/15747/best-ever-banana-bread/',
 'http://allrecipes.com/recipe/16952/the-best-

In [15]:
len(dlink)

178

In [16]:
#FUNCTION TO TEST INGREDIENT LIST EXTRACTION
def get_ingredients(link):
    html = urlopen(link)
    soup = BeautifulSoup(html, 'lxml')
    test = []
    ingredients = soup.findAll('span', {'class': 'recipe-ingred_txt added'})
    for i in ingredients:
        test.append(i.get_text())
    return test

In [17]:
ingredient_test = get_ingredients('http://allrecipes.com/recipe/23082/janets-famous-banana-nut-bread/')

In [18]:
ingredient_test

['3 cups white sugar',
 '3/4 cup butter, softened',
 '3 eggs',
 '2 cups mashed ripe banana',
 '1/2 cup sour cream',
 '3 1/2 cups all-purpose flour',
 '1 teaspoon baking soda',
 '1/2 teaspoon baking powder',
 '1/2 teaspoon ground cinnamon',
 '1 1/2 cups chopped pecans']

# Function To Convert Times

In [19]:
def convertTime(timeString):
    if "h" in timeString:
        if len(timeString) == 3:
            return int(timeString.split(" ")[0])*60
        else:    
            hour_to_minutes = int(timeString.split("h")[0].split(" ")[0]) * 60
            minutes = int(timeString.split(" ")[2])
            return hour_to_minutes + minutes
    else:
        return int(timeString.split(" ")[0]) 

In [20]:
#FUNCTION TO RETURN RECIPE LINKS, RECIPE TITLE, MADE-IT COUNT, REVIEWS, RATING VALUE, 
#REVIEW COUNT, INGREDIENTS, CALORIES, RATINGS, PREP TIME, COOKING TIME, TOTAL TIME

def read_page(page):
    
    html = requests.get(page).text
    soup = BeautifulSoup(html, "html.parser")
    
    #GRAB RECIPE TITLE
    try:
        title = soup.find('h1', {'class':'recipe-summary__h1'}).get_text()
    except:
        title = np.NaN
        
    #GRAB MADE IT COUNT
    try:
        made_it = soup.find('div', {'class': 'total-made-it'}).attrs['data-ng-init'].split(',')[0].split('(')[1]
    except:
        made_it = np.NaN
        
    #GRAB REVIEW COUNT
    try:
        review_count = soup.find('span', {'class': 'review-count'}).get_text()
    except:
        review_count = np.NaN
        
    #GRAB LONG RATING VALUE
    try:
        long_rating = soup.find('div', {'class': 'rating-stars'}).attrs['data-ratingstars']
 
    except:
        long_rating = np.NaN
        
    #GRAB SHORT RATING VALUE
    try:
        #rating = soup.find('span', {'itemprop': 'aggregateRating'}).find('meta').attrs['content']
        rating_value = soup.find('meta', {'itemprop': 'ratingValue'}).attrs['content']
    except:
        rating_value = np.NaN
    
    #GRAB REVIEW COUNT
    try:
        review_count = soup.find('meta', {'itemprop': 'reviewCount'}).attrs['content']
    except:
        review_count = np.NaN
    
    #GRAB RATING
    try:
        rating = soup.find('h4', {'class': 'helpful-header'}).get_text().split(' ')[0]
    except:
        rating = np.NaN
    #GRAB INGREDIENTS
    try:
        ingredient_list = []
        ingredients = soup.findAll('span', {'class': 'recipe-ingred_txt added'})
        #ingredient_list = [item.get_text() for item in ingredients]
        for item in ingredients:
            #each ingredient separted by a "$" for easer processsing later
            ingredient_list.append(item.get_text())
            ingredient_list_new = "$".join(ingredient_list)
        
    except:
        ingredients_list = np.NaN
        
    #GRAB CALORIES
    try:
        calories = soup.find('span', {'class': 'calorie-count'}).get_text()
        calories = calories.split(" ")[0]
    except:
        calories = np.NaN
    
    #GRAB PREP TIME, COOKING TIME, AND TOTAL TIME
    try:
        all_prep_time = soup.find("ul", {"class": "prepTime"})
        prep_time_str = all_prep_time.findAll("li")[1].find("time").get_text()
        cook_time_str = all_prep_time.findAll("li")[2].find("time").get_text()
        total_time_str = all_prep_time.findAll("li")[3].find("time").get_text()
                
        prep_time = convertTime(prep_time_str)
        cook_time = convertTime(cook_time_str)
        total_time = convertTime(total_time_str)
    except:
        prep_time = np.NaN
        cook_time = np.NaN
        total_time = np.NaN
    
    
        
    #GRAB FAHRENHEIGHT TEMP
    try:
        fahrenheight = soup.find("ol", {"class": "list-numbers recipe-directions__list"}).findAll("li")[0].find(
        "span", {"class": "recipe-directions__list--item"}).get_text().split("degrees")[0].split(" ")[-2]
    except:
        fahrenheight = np.NaN
    #GRAB CELCIUS TEMP
    try:
        celcius = soup.find("ol", {"class": "list-numbers recipe-directions__list"}).findAll("li")[0].find(
        "span", {"class": "recipe-directions__list--item"}).get_text().split("(")[1].split(" ")[0]
    except: 
        celcius = np.NaN
    return ([page, title, made_it, review_count, long_rating, rating_value, 
             review_count, ingredient_list_new, calories, rating, prep_time, cook_time, 
             total_time, fahrenheight, celcius])
    

In [21]:
read_one_page = read_page("http://allrecipes.com/recipe/22967/nannas-banana-bread/?internalSource=rotd&referringId=343&referringContentType=recipe%20hub")

In [22]:
read_one_page

['http://allrecipes.com/recipe/22967/nannas-banana-bread/?internalSource=rotd&referringId=343&referringContentType=recipe%20hub',
 "Nanna's Banana Bread",
 '322',
 '215',
 '4.65476179122925',
 '4.65',
 '215',
 '1/2 cup buttermilk$2 eggs$1/2 cup vegetable oil$1 cup white sugar$2 small very ripe bananas, sliced$1 teaspoon vanilla extract$2 cups all-purpose flour$1 teaspoon ground cinnamon$1 teaspoon baking powder$1 teaspoon baking soda$1/8 teaspoon salt$1/2 cup chopped walnuts',
 '286',
 '252',
 15,
 60,
 75,
 '350',
 '175']

# Read all pages

In [40]:
read_all_pages = [read_page(link) for link in dlink]

In [41]:
read_all_pages

[['http://allrecipes.com/recipe/22967/nannas-banana-bread/',
  "Nanna's Banana Bread",
  '322',
  '215',
  '4.65476179122925',
  '4.65',
  '215',
  '1/2 cup buttermilk$2 eggs$1/2 cup vegetable oil$1 cup white sugar$2 small very ripe bananas, sliced$1 teaspoon vanilla extract$2 cups all-purpose flour$1 teaspoon ground cinnamon$1 teaspoon baking powder$1 teaspoon baking soda$1/8 teaspoon salt$1/2 cup chopped walnuts',
  '286',
  '252',
  15,
  60,
  75,
  '350',
  '175'],
 ['http://allrecipes.com/recipe/6765/4h-banana-bread/',
  '4H Banana Bread',
  '365',
  '883',
  '4.67162799835205',
  '4.67',
  '883',
  '2 cups all-purpose flour$1/2 teaspoon baking soda$1 cup white sugar$1 egg$5 tablespoons milk$1 teaspoon baking powder$1/2 teaspoon salt$1/2 cup margarine$1 cup mashed bananas$1/2 cup chopped walnuts (optional)',
  '265',
  '1075',
  10,
  60,
  70,
  'and',
  nan],
 ['http://allrecipes.com/recipe/218541/caramel-macchiato-banana-bread/',
  'Caramel Macchiato Banana Bread',
  '198',
  

In [45]:
table = pd.DataFrame([read_page(link) for link in dlink])

In [47]:
table.columns = ["url", "title", "made-it", "reviews", "longRating", "ratingValue", "reviewCount", "recipe", "calories", "ratings", "prep", "cook", "total", "fahrenheit", "celsius"]

In [48]:
table

Unnamed: 0,url,title,made-it,reviews,longRating,ratingValue,reviewCount,recipe,calories,ratings,prep,cook,total,fahrenheit,celsius
0,http://allrecipes.com/recipe/22967/nannas-bana...,Nanna's Banana Bread,322,215,4.65476179122925,4.65,215,1/2 cup buttermilk$2 eggs$1/2 cup vegetable oi...,286,252,15.0,60.0,75.0,350,175
1,http://allrecipes.com/recipe/6765/4h-banana-br...,4H Banana Bread,365,883,4.67162799835205,4.67,883,2 cups all-purpose flour$1/2 teaspoon baking s...,265,1075,10.0,60.0,70.0,and,
2,http://allrecipes.com/recipe/218541/caramel-ma...,Caramel Macchiato Banana Bread,198,58,4.17647075653076,4.18,58,cooking spray$2 cups all-purpose flour$1 teasp...,334,68,15.0,65.0,120.0,350,175
3,http://allrecipes.com/recipe/219967/chef-johns...,Chef John's Banana Bread,1009,383,4.6972861289978,4.70,383,cooking spray$2 cups all-purpose flour$1 teasp...,334,479,15.0,70.0,105.0,325,165
4,http://allrecipes.com/recipe/7154/creamy-banan...,Creamy Banana Bread,353,873,4.66091966629028,4.66,873,"1/2 cup margarine, softened$1 (8 ounce) packag...",289,1044,30.0,45.0,75.0,350,175
5,http://allrecipes.com/recipe/17252/chocolate-b...,Chocolate Banana Bread,1463,928,4.67461681365967,4.67,928,"1 cup margarine, softened$2 cups white sugar$4...",278,1174,10.0,60.0,70.0,350,175
6,http://allrecipes.com/recipe/228442/raspberry-...,Raspberry Banana Bread,97,32,4.68292665481567,4.68,32,"3/4 cup frozen unsweetened raspberries, thawed...",273,41,25.0,60.0,115.0,325,165
7,http://allrecipes.com/recipe/16999/grandmas-ho...,Grandma's Homemade Banana Bread,591,313,4.6962366104126,4.70,313,"1 1/2 cups white sugar$1/2 cup butter, softene...",226,372,10.0,60.0,70.0,350,175
8,http://allrecipes.com/recipe/141933/banana-zuc...,Banana-Zucchini Bread,525,263,4.51702785491943,4.52,263,3 eggs$3/4 cup vegetable oil$2/3 cup packed br...,272,323,15.0,50.0,65.0,325,165
9,http://allrecipes.com/recipe/17652/banana-crum...,Banana Crumb Muffins,14588,9663,4.80329847335815,4.80,9663,1 1/2 cups all-purpose flour$1 teaspoon baking...,263,13157,15.0,20.0,35.0,375,190


# Convert all data to CSV

In [51]:
import csv
with open("/Users/shermanpeng/Documents/Python/Webscrape/all_data.csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(read_all_pages)