# Category Scraper

Jose Nico Currea, Jenna Ferguson, Jennifer Gonzalez, Evan Hadd, Muhammad Ibrahim, Ramzi Kattan, Hadley Krummel

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# URL for the A-Z recipes page
url = "https://www.allrecipes.com/recipes-a-z-6735880/"
# Send an HTTP request to fetch the page content
response = requests.get(url)

In [4]:
# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # List to store category names and URLs
    categories = []
    # Loop through each letter A-Z
    for letter in 'ABCDEFGHIJKLMNOPQRSTUVWYZ':
        # Find the heading for the letter
        heading = soup.find('h3', id=f'alphabetical-list-{letter.lower()}')
        
        if heading:
            # Find the next sibling <ul> which contains the categories
            category_list = heading.find_next_sibling('ul', class_="loc mntl-link-list")
            
            # Extract the category names and URLs from the list
            if category_list:
                list_items = category_list.find_all('li')
                for item in list_items:
                    category_name = item.find('a').text.strip()
                    category_url = item.find('a')['href']
                    
                    # Append the data to the list
                    categories.append({
                        'Category Name': category_name,
                        'Category URL': category_url
                    })
    # Convert the list to a pandas DataFrame
    df = pd.DataFrame(categories)
else:
    print("Failed to retrieve the webpage.")

In [5]:
df

Unnamed: 0,Category Name,Category URL
0,Air Fryer Recipes,https://www.allrecipes.com/recipes/23070/every...
1,Allrecipes Allstar Recipes,https://www.allrecipes.com/recipes/16492/every...
2,Angel Food Cakes,https://www.allrecipes.com/recipes/385/dessert...
3,Antipasti,https://www.allrecipes.com/recipes/102/appetiz...
4,Appetizers and Snacks,https://www.allrecipes.com/recipes/76/appetize...
...,...,...
373,Winter Squash,https://www.allrecipes.com/recipes/1097/fruits...
374,Yams,https://www.allrecipes.com/recipes/2452/fruits...
375,Yeast Breads,https://www.allrecipes.com/recipes/339/bread/y...
376,Ziti,https://www.allrecipes.com/recipes/550/pasta-a...


## Part B - Iterate over every category to get all category url to get all recipies and their URL

In [6]:
# List to store all the recipe information
recipes = []
# Function to scrape recipes from a category page
def scrape_category_recipes(category_name, category_url):
    response = requests.get(category_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Locate the section with the recipe cards
        recipe_section = soup.find('section', class_='mntl-three-post--taxsc')
        
        if recipe_section:
            # Find all <a> tags that contain recipes
            recipe_cards = recipe_section.find_all('a', class_='mntl-card-list-items')
            
            for recipe in recipe_cards:
                recipe_name = recipe.find('span', class_='card__title-text').text.strip()
                recipe_url = recipe['href']
                
                # Add the recipe data to the list
                recipes.append({
                    'Category Name': category_name,
                    'Recipe Name': recipe_name,
                    'Recipe URL': recipe_url
                })

In [7]:
# Iterate over each category and scrape its recipes
for index, row in df.iterrows():
    category_name = row['Category Name']
    category_url = row['Category URL']
    
    print(f"Scraping recipes for category: {category_name}")
    scrape_category_recipes(category_name, category_url)

# Convert the recipes list into a DataFrame
recipe_df = pd.DataFrame(recipes)

Scraping recipes for category: Air Fryer Recipes
Scraping recipes for category: Allrecipes Allstar Recipes
Scraping recipes for category: Angel Food Cakes
Scraping recipes for category: Antipasti
Scraping recipes for category: Appetizers and Snacks
Scraping recipes for category: Apple Pie
Scraping recipes for category: Applesauce
Scraping recipes for category: Artichoke Dips
Scraping recipes for category: Bagels
Scraping recipes for category: Baked Beans
Scraping recipes for category: Banana Breads
Scraping recipes for category: Bar Cookies
Scraping recipes for category: Beef Recipes
Scraping recipes for category: Beef Stews
Scraping recipes for category: Beef Stroganoff
Scraping recipes for category: Beef Tenderloin
Scraping recipes for category: Biscotti
Scraping recipes for category: Biscuits
Scraping recipes for category: Blintzes
Scraping recipes for category: Blondies
Scraping recipes for category: Bloody Marys
Scraping recipes for category: Blueberry Pie
Scraping recipes for cat

In [8]:
recipe_df

Unnamed: 0,Category Name,Recipe Name,Recipe URL
0,Air Fryer Recipes,Air Fryer Grilled Pimento Cheese,https://www.allrecipes.com/air-fryer-grilled-p...
1,Air Fryer Recipes,Air Fryer Chicken Parmesan,https://www.allrecipes.com/air-fryer-chicken-p...
2,Air Fryer Recipes,Air Fryer Eggplant,https://www.allrecipes.com/air-fryer-eggplant-...
3,Allrecipes Allstar Recipes,Easy Black Bean Soup for Two,https://www.allrecipes.com/easy-black-bean-sou...
4,Allrecipes Allstar Recipes,Chicken Adobo Fried Rice,https://www.allrecipes.com/chicken-adobo-fried...
...,...,...,...
1129,Ziti,Sausage and Spinach Baked Ziti,https://www.allrecipes.com/sausage-and-spinach...
1130,Ziti,Baked Ziti,https://www.allrecipes.com/recipe/11758/baked-...
1131,Zucchini Breads,Best Zucchini Bread,https://www.allrecipes.com/recipe/6915/zucchin...
1132,Zucchini Breads,Snickerdoodle Zucchini Bread,https://www.allrecipes.com/snickerdoodle-zucch...


In [9]:
recipe_df.to_csv('recipes.csv', index=False)

## Part C-For each recipe scrape Ingredients, time and Rating

In [19]:
full_recipes = []

In [20]:
# Function to scrape full recipe details (ingredients, total time, and rating)
def scrape_recipe_details(recipe_name, recipe_url, category_name):
    response = requests.get(recipe_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extracting ingredients
        ingredients = []
        ingredients_section = soup.find('ul', class_='mm-recipes-structured-ingredients__list')
        if ingredients_section:
            ingredient_items = ingredients_section.find_all('li')
            for item in ingredient_items:
                ingredient_text = item.get_text(separator=' ', strip=True)
                ingredients.append(ingredient_text)
        
        # Extracting total time
        total_time = None
        time_items = soup.find_all('div', class_='mm-recipes-details__item')
        for item in time_items:
            label = item.find('div', class_='mm-recipes-details__label')
            if label and 'Total Time' in label.text:
                total_time_value = item.find('div', class_='mm-recipes-details__value')
                if total_time_value:
                    total_time = total_time_value.text.strip()
        
        # Extracting rating
        rating = 0  # Default to 0 if no rating is found
        rating_section = soup.find('div', class_='mm-recipes-review-bar__rating')
        if rating_section:
            rating = rating_section.text.strip()

        # Append the recipe data to the list
        full_recipes.append({
            'Category Name': category_name,
            'Recipe Name': recipe_name,
            'Recipe URL': recipe_url,
            'Ingredients': ingredients,
            'Total Time': total_time,
            'Rating': rating
        })

In [21]:
recipe_df = pd.read_csv('recipes.csv')

In [22]:
# Iterate over each row in the recipe_df to scrape details
for index, row in recipe_df.iterrows():
    category_name = row['Category Name']
    recipe_name = row['Recipe Name']
    recipe_url = row['Recipe URL']
    
    print(f"Scraping details for recipe: {recipe_name}")
    scrape_recipe_details(recipe_name, recipe_url, category_name)

# Convert the list to a pandas DataFrame
full_recipe_df = pd.DataFrame(full_recipes)

Scraping details for recipe: Air Fryer Grilled Pimento Cheese
Scraping details for recipe: Air Fryer Chicken Parmesan
Scraping details for recipe: Air Fryer Eggplant
Scraping details for recipe: Easy Black Bean Soup for Two
Scraping details for recipe: Chicken Adobo Fried Rice
Scraping details for recipe: Caramelized Onion and Roasted Garlic Pasta
Scraping details for recipe: 20 Desserts With Store-Bought Angel Food Cake
Scraping details for recipe: Chocolate Angel Food Cake I
Scraping details for recipe: Angel Food Cake
Scraping details for recipe: Sicilian Eggplant Caponata
Scraping details for recipe: Best Caprese Skewers
Scraping details for recipe: Air Fryer Arancini
Scraping details for recipe: Peanut Butter Bugle Bites
Scraping details for recipe: Mango Shrimp Ceviche
Scraping details for recipe: Copycat Chuy's Creamy Jalapeno Ranch Dip
Scraping details for recipe: Easy Caramel Apple Crisp Pie
Scraping details for recipe: Copycat McDonald's Apple Pies
Scraping details for recipe

In [26]:
full_df = full_recipe_df[full_recipe_df['Rating']!=0]

In [32]:
full_df

Unnamed: 0,Category Name,Recipe Name,Recipe URL,Ingredients,Total Time,Rating
1,Air Fryer Recipes,Air Fryer Chicken Parmesan,https://www.allrecipes.com/air-fryer-chicken-p...,[2 (8-ounce) boneless skinless chicken breasts...,40 mins,4.5
7,Angel Food Cakes,Chocolate Angel Food Cake I,https://www.allrecipes.com/recipe/8252/chocola...,"[2 cups egg whites, ¼ teaspoon salt, 1 ¼ teasp...",1 hr 30 mins,4.7
8,Angel Food Cakes,Angel Food Cake,https://www.allrecipes.com/recipe/15432/angel-...,"[1 cup cake flour, 1 ½ cups white sugar, divid...",1 hr 10 mins,4.6
9,Antipasti,Sicilian Eggplant Caponata,https://www.allrecipes.com/recipe/247268/eggpl...,"[1 eggplant, peeled and cut into ½-inch cubes,...",1 hr 30 mins,4.9
10,Antipasti,Best Caprese Skewers,https://www.allrecipes.com/recipe/212896/capre...,"[20 grape tomatoes, 10 ounces mozzarella chees...",15 mins,4.7
...,...,...,...,...,...,...
1129,Ziti,Sausage and Spinach Baked Ziti,https://www.allrecipes.com/sausage-and-spinach...,"[1 pound ziti pasta, 1 tablespoon olive oil, 1...",1 hr 15 mins,4.3
1130,Ziti,Baked Ziti,https://www.allrecipes.com/recipe/11758/baked-...,"[1 pound dry ziti pasta, 1 onion, chopped, 1 p...",1 hr,4.7
1131,Zucchini Breads,Best Zucchini Bread,https://www.allrecipes.com/recipe/6915/zucchin...,"[3 large eggs, 2 cups white sugar, 1 cup veget...",1 hr 15 mins,4.7
1132,Zucchini Breads,Snickerdoodle Zucchini Bread,https://www.allrecipes.com/snickerdoodle-zucch...,"[cooking spray, 3/4 cup plus 2 tablespoons whi...",1 hr 20 mins,5.0


In [34]:
full_df.to_csv('rated_recipes.csv', index=False)

# Part D - Get reviewe for the recipies

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
rated_df = pd.read_csv('rated_recipes.csv')

In [3]:
# Remove duplicates based on URL
rated_df = rated_df.drop_duplicates(subset = 'Recipe URL')

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [5]:
# Setup Selenium WebDriver (adjust the path to your webdriver)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode
#service = Service('/path_to_your_chromedriver')  # Adjust this path to where your ChromeDriver is located
driver = webdriver.Chrome(options=chrome_options)

In [6]:
# Function to scrape comments for a recipe
def scrape_comments(recipe_url, max_clicks=12, max_comments=100):
    # Navigate to the recipe page
    driver.get(recipe_url)
    
    comments = []
    click_count = 0

    # Click the "Load More Reviews" button up to `max_clicks` times
    while click_count < max_clicks:
        try:
            # Click the "Load More Reviews" button
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'feedback-list__load-more-button'))
            )
            load_more_button.click()
            time.sleep(2)  # Wait for the reviews to load
            click_count += 1
        except:
            break  # Break if "Load More Reviews" button is not available anymore

    # After loading all the comments, scrape them using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Extract the feedback list items (comments)
    comment_divs = soup.find_all('div', class_='feedback__text', limit=max_comments)
    for comment_div in comment_divs:
        comment_text = comment_div.text.strip()
        comments.append(comment_text)
    
    return comments

In [7]:
test_df = rated_df.head(2)

In [8]:
# Create a new DataFrame to store the recipe info along with individual comments
comments_df = pd.DataFrame(columns=['Category Name', 'Recipe Name', 'Recipe URL', 'Ingredients', 'Total Time', 'Rating', 'Comment'])

# Iterate over each row in the rated_df to scrape comments
for index, row in rated_df.iterrows():
    recipe_url = row['Recipe URL']  # Pass the URL of the recipe
    category_name = row['Category Name']
    recipe_name = row['Recipe Name']
    ingredients = row['Ingredients']
    total_time = row['Total Time']
    rating = row['Rating']
    
    # Print the index of the recipe and recipe name
    print(f"Scraping comments for recipe {index + 1}: {recipe_name}")
    
    # Scrape comments for the current recipe by passing the recipe_url (string)
    comments = scrape_comments(recipe_url)  # Correctly pass the recipe URL here
    
    # Create a new row for each comment and add it to the comments_df
    for comment in comments:
        new_row = pd.DataFrame({
            'Category Name': [category_name],
            'Recipe Name': [recipe_name],
            'Recipe URL': [recipe_url],
            'Ingredients': [ingredients],
            'Total Time': [total_time],
            'Rating': [rating],
            'Comment': [comment]
        })
        comments_df = pd.concat([comments_df, new_row], ignore_index=True)

Scraping comments for recipe 1: Air Fryer Chicken Parmesan


  comments_df = pd.concat([comments_df, new_row], ignore_index=True)


Scraping comments for recipe 2: Chocolate Angel Food Cake I
Scraping comments for recipe 3: Angel Food Cake
Scraping comments for recipe 4: Sicilian Eggplant Caponata
Scraping comments for recipe 5: Best Caprese Skewers
Scraping comments for recipe 6: Air Fryer Arancini
Scraping comments for recipe 7: Easy Caramel Apple Crisp Pie
Scraping comments for recipe 8: Copycat McDonald's Apple Pies
Scraping comments for recipe 9: Caramel Apple Pie Cookies
Scraping comments for recipe 10: Sarah's Homemade Applesauce
Scraping comments for recipe 11: Slow Cooker Applesauce
Scraping comments for recipe 12: Applesauce
Scraping comments for recipe 13: Hot Spinach Artichoke Dip
Scraping comments for recipe 14: Simple Artichoke Dip
Scraping comments for recipe 15: Spinach and Artichoke Dip
Scraping comments for recipe 16: California BLT Chopped Bagel
Scraping comments for recipe 17: Party Beans
Scraping comments for recipe 18: BBQ'D Beans
Scraping comments for recipe 19: Southern Baked Beans
Scraping 

In [10]:
comments_df.to_csv('recipe_reviews.csv')

## Part E - Pandas and EDA

In [11]:
final_df = pd.read_csv('recipe_reviews.csv')

In [12]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,Category Name,Recipe Name,Recipe URL,Ingredients,Total Time,Rating,Comment
0,0,Air Fryer Recipes,Air Fryer Chicken Parmesan,https://www.allrecipes.com/air-fryer-chicken-p...,['2 (8-ounce) boneless skinless chicken breast...,40 mins,4.5,I recently tried this *Air Fryer Chicken Parme...
1,1,Air Fryer Recipes,Air Fryer Chicken Parmesan,https://www.allrecipes.com/air-fryer-chicken-p...,['2 (8-ounce) boneless skinless chicken breast...,40 mins,4.5,I recently tried this *Air Fryer Chicken Parme...
2,2,Angel Food Cakes,Chocolate Angel Food Cake I,https://www.allrecipes.com/recipe/8252/chocola...,"['2 cups egg whites', '¼ teaspoon salt', '1 ¼ ...",1 hr 30 mins,4.7,Excellent angel food cake with a satisfying ch...
3,3,Angel Food Cakes,Chocolate Angel Food Cake I,https://www.allrecipes.com/recipe/8252/chocola...,"['2 cups egg whites', '¼ teaspoon salt', '1 ¼ ...",1 hr 30 mins,4.7,Yummy! I've made it twice and both times a hug...
4,4,Angel Food Cakes,Chocolate Angel Food Cake I,https://www.allrecipes.com/recipe/8252/chocola...,"['2 cups egg whites', '¼ teaspoon salt', '1 ¼ ...",1 hr 30 mins,4.7,Super easy recipe to follow!! We simply separa...


In [15]:
import re
import numpy as np

def convert_time_to_minutes(time_str):
    # Check if the value is a string
    if isinstance(time_str, str):
        # Initialize minutes
        total_minutes = 0
        
        # Find hours (if present)
        hours_match = re.search(r'(\d+)\s*hr', time_str)
        if hours_match:
            hours = int(hours_match.group(1))
            total_minutes += hours * 60
        
        # Find minutes (if present)
        minutes_match = re.search(r'(\d+)\s*min', time_str)
        if minutes_match:
            minutes = int(minutes_match.group(1))
            total_minutes += minutes
        
        return total_minutes
    else:
        # Return NaN if the value is not a string
        return np.nan

In [16]:
# Apply the function to the "Total Time" column
final_df['total_mins'] = final_df['Total Time'].apply(convert_time_to_minutes)

In [18]:
# Create a new column that stores the number of ingredients by calculating the length of each list
final_df['num_ingredients'] = final_df['Ingredients'].apply(len)

In [22]:
import ast

# Define a function to safely convert the string back into a list
def convert_to_list(ingredient_string):
    try:
        # Use ast.literal_eval to convert the string to a list
        return ast.literal_eval(ingredient_string)
    except (ValueError, SyntaxError):
        # In case the conversion fails, return an empty list
        return []

In [23]:
# Apply the conversion function to the 'Ingredients' column
final_df['Ingredients_List'] = final_df['Ingredients'].apply(convert_to_list)

# Now count the number of ingredients in each list
final_df['num_ingredients'] = final_df['Ingredients_List'].apply(len)

In [24]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,Category Name,Recipe Name,Recipe URL,Ingredients,Total Time,Rating,Comment,total_mins,num_ingredients,Ingredients_List
0,0,Air Fryer Recipes,Air Fryer Chicken Parmesan,https://www.allrecipes.com/air-fryer-chicken-p...,['2 (8-ounce) boneless skinless chicken breast...,40 mins,4.5,I recently tried this *Air Fryer Chicken Parme...,40.0,12,[2 (8-ounce) boneless skinless chicken breasts...
1,1,Air Fryer Recipes,Air Fryer Chicken Parmesan,https://www.allrecipes.com/air-fryer-chicken-p...,['2 (8-ounce) boneless skinless chicken breast...,40 mins,4.5,I recently tried this *Air Fryer Chicken Parme...,40.0,12,[2 (8-ounce) boneless skinless chicken breasts...
2,2,Angel Food Cakes,Chocolate Angel Food Cake I,https://www.allrecipes.com/recipe/8252/chocola...,"['2 cups egg whites', '¼ teaspoon salt', '1 ¼ ...",1 hr 30 mins,4.7,Excellent angel food cake with a satisfying ch...,90.0,8,"[2 cups egg whites, ¼ teaspoon salt, 1 ¼ teasp..."
3,3,Angel Food Cakes,Chocolate Angel Food Cake I,https://www.allrecipes.com/recipe/8252/chocola...,"['2 cups egg whites', '¼ teaspoon salt', '1 ¼ ...",1 hr 30 mins,4.7,Yummy! I've made it twice and both times a hug...,90.0,8,"[2 cups egg whites, ¼ teaspoon salt, 1 ¼ teasp..."
4,4,Angel Food Cakes,Chocolate Angel Food Cake I,https://www.allrecipes.com/recipe/8252/chocola...,"['2 cups egg whites', '¼ teaspoon salt', '1 ¼ ...",1 hr 30 mins,4.7,Super easy recipe to follow!! We simply separa...,90.0,8,"[2 cups egg whites, ¼ teaspoon salt, 1 ¼ teasp..."


In [25]:
# Drop the 'Ingredients' column
final_df = final_df.drop(columns=['Ingredients'])

# Save the DataFrame as a CSV file
final_df.to_csv('final_df.csv', index=False)