In [70]:
import urllib.request as req
from bs4 import BeautifulSoup as soup
import pandas as pd
import sqlite3

In [71]:
#using browser to scrape
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,} 

In [72]:
#url for main site
main_url = "https://cookieandkate.com/"

#getting main site page info
request_main = req.Request(main_url, None, headers) #The assembled request
response_main = req.urlopen(request_main)
data_main = response_main.read()
response_main.close()

page_main = soup(data_main,"html.parser")

In [73]:
#retrieving categories of recipes
categories = page_main.find("nav",{"id":"genesis-nav-primary"}).ul.findAll("li")[2].findAll("li")
category_links = []
for category in categories:
    category_links.append(category.a["href"])

In [74]:
#a list for all recipe links
recipe_links = []
#a list that holds image links for each recipe
img_links = []

#looping through categories
for category_url in category_links:
    
    #getting first page (of recipe thumbnails) for current category
    category_request = req.Request(category_url, None, headers)
    category_response = req.urlopen(category_request)
    category_data = category_response.read()
    category_response.close()

    category_page = soup(category_data,"html.parser")
    
    while True:

        #next page marker
        next_marker = category_page.find("div", {"class":"pagination-next alignright"})

        #if there is a next page
        if next_marker:

            #get url for page
            next_url = next_marker.a['href']

            #read info from page
            next_request = req.Request(next_url, None, headers)
            next_response = req.urlopen(next_request)
            next_data = next_response.read()
            next_response.close()

            #set category_page to new page
            category_page = soup(next_data)

            #collect all recipes from new page
            recipes = category_page.findAll("div",{"class":"lcp_catlist_item"})

            #save the recipe links and image links
            for recipe in recipes:
                recipe_links.append(recipe.a["href"])
                img_links.append(recipe.find('img')['data-lazy-src'])

        #if there are no new pages
        else:
            break

In [81]:
recipe_info = []

# looping through recipe links
for new_url in recipe_links:

    #getting corresponding image url
    img_url = img_links[recipe_links.index(new_url)]
    #dict for saving recipe info
    recipe_dict = {}

    #reading info from recipe page
    new_request = req.Request(new_url,None,headers) #The assembled request
    new_response = req.urlopen(new_request)
    new_data = new_response.read()
    new_response.close()

    recipe_page = soup(new_data,"html.parser")

    #getting objects that contain details, ratings, ingredients
    recipe_details = recipe_page.find("div",{"class":"tasty-recipes-details"})
    recipe_ratings = recipe_page.find("div",{"class":"tasty-recipes-ratings"})
    recipe_ingredients = recipe_page.find("div", {"class":"tasty-recipe-ingredients"})

    #if details, ratings and ingredients are available
    if recipe_details and recipe_ratings and recipe_ingredients:
        #recipe name
        recipe_dict["title"] = recipe_page.h1.text
        #recipe ingredients
        recipe_dict["ingredients"] = [x.text for x in recipe_ingredients.find('ul').findAll('li')]                 
        #recipe ratings
        recipe_dict["rating"] = recipe_ratings.find("span",{"class":"average"}).text

        #recipe details - time, cuisine, category
        try:
            recipe_dict["time"] = recipe_details.find("span",{"class":"tasty-recipes-total-time"}).text
        except:
            recipe_dict["time"] = None
        
        try:
            recipe_dict["cuisine"] = recipe_details.find("span",{"class":"tasty-recipes-cuisine"}).text
        except:
            recipe_dict["cuisine"] = None

        try:
            recipe_dict["category"] = recipe_details.find("span",{"class":"tasty-recipes-category"}).text
        except:
            recipe_dict["category"] = None       
        
        #saving image and link
        recipe_dict["url"] = new_url
        recipe_dict["img"] = img_url

        #adding dictionary of recipe info to list
        recipe_info.append(recipe_dict)

In [82]:
recipes_df = pd.DataFrame(recipe_info)

In [95]:
recipes_df['ingredients'] = recipes_df['ingredients'].apply(lambda x: ' '.join(x))

In [96]:
recipes_df["rating"] = recipes_df["rating"].astype("float")
recipes_df.drop_duplicates(subset="title",inplace=True)

Recipes for a Mexican main dish with a rating of 4.5 stars or higher 

In [97]:
recipes_df[(recipes_df["cuisine"]=="Mexican") & (recipes_df["rating"]>=4.5) & (recipes_df["category"]=="Main")].sort_values("rating",ascending=False)[:3]


Unnamed: 0,title,ingredients,rating,time,cuisine,category,url,img
455,Fresh Arugula and Black Bean Tacos with Pickle...,3 tablespoons olive oil 3 tablespoons fresh li...,5.0,30 minutes,Mexican,Main,https://cookieandkate.com/fresh-arugula-and-bl...,https://cookieandkate.com/images/2014/06/fresh...
526,Chipotle-Glazed Delicata Squash,"3 delicata squash, sliced in half lengthwise, ...",5.0,1 hour 15 minutes,Mexican,Main,https://cookieandkate.com/chipotle-glazed-squash/,https://cookieandkate.com/images/2012/02/chipo...
57,"Crispy Mushroom, Spinach and Avocado Quesadillas","1 tablespoon olive oil ½ red onion, chopped sa...",4.9,20 minutes,Mexican,Main,https://cookieandkate.com/crispy-mushroom-spin...,https://cookieandkate.com/images/2013/01/crisp...


In [98]:
recipes_df.to_csv("./cookieandkaterecipes.csv")