# Scraping recipes from https://www.allrecipes.com
#### I want to scrape 
1. title of recipe
2. recipe introduction
3. prep time
4. cook time
5. total time
6. servings
7. ingredients
8. nutrients

In [2]:
# Imports 
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

## Scraping from Soups, Chili, Stews
https://www.allrecipes.com/recipes/94/soups-stews-and-chili/

In [2]:
# Saving each recipes url on the page: soups, chili, and stew
category_urls = [
    'https://www.allrecipes.com/recipes/94/soups-stews-and-chili/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/soups_chili_stews.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'soups_chili_stews.txt'")

Scraping: https://www.allrecipes.com/recipes/94/soups-stews-and-chili/
Saved 64 recipe URLs to 'soups_chili_stews.txt'


In [3]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/soups_chili_stews.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/soups_chili_stews.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/creamy-chicken-and-mushroom-soup-recipe-11710626
Scraping recipe: https://www.allrecipes.com/high-protein-broccoli-cheddar-soup-recipe-11706191
Scraping recipe: https://www.allrecipes.com/easy-chicken-rice-and-noodle-soup-recipe-11702264
Scraping recipe: https://www.allrecipes.com/chicken-biscuit-stew-recipe-11700236
Scraping recipe: https://www.allrecipes.com/polish-cabbage-roll-soup-recipe-11699185
Scraping recipe: https://www.allrecipes.com/guinness-chili-for-two-recipe-11692381
Scraping recipe: https://www.allrecipes.com/cottage-cheese-tomato-soup-recipe-11682446
Scraping recipe: https://www.allrecipes.com/mushroom-ramen-recipe-11681053
Scraping recipe: https://www.allrecipes.com/lazy-french-onion-soup-8773811
Scraping recipe: https://www.allrecipes.com/harissa-butternut-squash-soup-recipe-8781939
Scraping recipe: https://www.allrecipes.com/creamy-tomato-bean-soup-recipe-8781238
Scraping recipe: https://www.allrecipes.com/nicole-s-cajun-c

## Scraping from seafood main dishes
https://www.allrecipes.com/recipes/13290/main-dish/seafood/

In [4]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/13290/main-dish/seafood/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/main_seafood.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'main_seafood.txt'")

Scraping: https://www.allrecipes.com/recipes/13290/main-dish/seafood/
Saved 64 recipe URLs to 'main_seafood.txt'


In [5]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/main_seafood.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/main_seafood.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/one-pot-tuscan-shrimp-and-orzo-recipe-8763377
Scraping recipe: https://www.allrecipes.com/shrimp-fajita-rice-casserole-recipe-8788221
Scraping recipe: https://www.allrecipes.com/lobster-stuffed-pasta-shells-for-two-recipe-8789917
Scraping recipe: https://www.allrecipes.com/recipe/16023/shrimp-scampi-cheesecake-appetizer/
Scraping recipe: https://www.allrecipes.com/one-pot-mediterranean-shrimp-and-vegetables-recipe-8780639
Scraping recipe: https://www.allrecipes.com/copycat-chili-crisp-shrimp-recipe-8732545
Scraping recipe: https://www.allrecipes.com/halibut-en-papillote-recipe-8775088
Scraping recipe: https://www.allrecipes.com/mediterranean-salmon-baked-in-parchment-recipe-8768509
Scraping recipe: https://www.allrecipes.com/mediterranean-salmon-baked-in-foil-recipe-8768562
Scraping recipe: https://www.allrecipes.com/hot-honey-salmon-bowls-recipe-8765623
Scraping recipe: https://www.allrecipes.com/nicole-s-salmon-wellington-recipe-8764491
Scr

## Chicken Main dishes
https://www.allrecipes.com/recipes/16954/main-dish/chicken/

In [6]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/16954/main-dish/chicken/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/main_chicken.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'main_chicken.txt'")

Scraping: https://www.allrecipes.com/recipes/16954/main-dish/chicken/
Saved 64 recipe URLs to 'main_chicken.txt'


In [7]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/main_chicken.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/main_chicken.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/rotisserie-chicken-pasta-bake-recipe-11706563
Scraping recipe: https://www.allrecipes.com/hot-honey-feta-chicken-skewers-recipe-11706551
Scraping recipe: https://www.allrecipes.com/easy-rotisserie-chicken-enchiladas-recipe-11706233
Scraping recipe: https://www.allrecipes.com/slow-cooker-chicken-and-gravy-recipe-11693381
Scraping recipe: https://www.allrecipes.com/chicken-alfredo-garlic-bread-recipe-11703384
Scraping recipe: https://www.allrecipes.com/chicken-biscuit-stew-recipe-11700236
Scraping recipe: https://www.allrecipes.com/neiman-marcus-chicken-casserole-recipe-11693255
Scraping recipe: https://www.allrecipes.com/creamy-curry-spiced-chicken-pot-pie-recipe-11694077
Scraping recipe: https://www.allrecipes.com/crockpot-teriyaki-chicken-and-rice-recipe-11692257
Scraping recipe: https://www.allrecipes.com/air-fryer-pesto-chicken-quinoa-bowl-recipe-11691482
Scraping recipe: https://www.allrecipes.com/chicken-cordon-bleu-casserole-for-one-rec

## Pasta main dishes
https://www.allrecipes.com/recipes/17245/main-dish/pasta/

In [8]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17245/main-dish/pasta/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/pasta.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'pasta.txt'")

Scraping: https://www.allrecipes.com/recipes/17245/main-dish/pasta/
Saved 64 recipe URLs to 'pasta.txt'


In [9]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/pasta.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('pasta.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/cheesy-lasagna-sheet-pasta-recipe-11703403
Scraping recipe: https://www.allrecipes.com/creamy-beef-and-bow-tie-pasta-recipe-11698660
Scraping recipe: https://www.allrecipes.com/recipe/271120/reuben-mac/
Scraping recipe: https://www.allrecipes.com/dump-and-bake-meatball-casserole-recipe-11695447
Scraping recipe: https://www.allrecipes.com/soy-butter-pasta-with-chicken-recipe-11694681
Scraping recipe: https://www.allrecipes.com/cheese-agnolotti-recipe-11694882
Scraping recipe: https://www.allrecipes.com/best-pasta-recipes-8737255
Scraping recipe: https://www.allrecipes.com/high-protein-cottage-cheese-mac-and-cheese-recipe-11687838
Scraping recipe: https://www.allrecipes.com/creamy-sun-dried-tomato-and-spinach-pasta-recipe-11688336
Scraping recipe: https://www.allrecipes.com/elote-ramen-noodles-recipe-8780286
Scraping recipe: https://www.allrecipes.com/easy-slow-cooker-boursin-chicken-and-pasta-recipe-11681173
Scraping recipe: https://www.allrec

## Tacos 
https://www.allrecipes.com/recipes/17874/main-dish/tacos/

In [10]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17874/main-dish/tacos/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/tacos.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'tacos.txt'")

Scraping: https://www.allrecipes.com/recipes/17874/main-dish/tacos/
Saved 64 recipe URLs to 'tacos.txt'


In [11]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/tacos.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/tacos.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/our-most-saved-taco-recipe-of-all-time-8639608
Scraping recipe: https://www.allrecipes.com/smash-burger-taco-recipe-7485747
Scraping recipe: https://www.allrecipes.com/recipe/269743/poblano-and-ground-pork-tacos/
Scraping recipe: https://www.allrecipes.com/recipe/280916/shrimp-tacos-with-cilantro-lime-crema/
Scraping recipe: https://www.allrecipes.com/recipe/8497570/barbacoa-tacos/
Scraping recipe: https://www.allrecipes.com/recipe/257988/traditional-mexican-street-tacos/
Scraping recipe: https://www.allrecipes.com/recipe/8463920/birria-de-pollo-chicken-birria-for-tacos/
Scraping recipe: https://www.allrecipes.com/recipe/255545/ground-turkey-taco-meat/
Scraping recipe: https://www.allrecipes.com/recipe/220326/sarahs-easy-shredded-chicken-taco-filling/
Scraping recipe: https://www.allrecipes.com/recipe/257538/authentic-tacos-al-pastor/
Scraping recipe: https://www.allrecipes.com/recipe/53729/fish-tacos/
Scraping recipe: https://www.allrecipes.

## Bowls
https://www.allrecipes.com/recipes/17881/main-dish/bowls/

In [12]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17881/main-dish/bowls/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/bowls.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'bowls.txt'")

Scraping: https://www.allrecipes.com/recipes/17881/main-dish/bowls/
Saved 64 recipe URLs to 'bowls.txt'


In [13]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/bowls.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/bowls.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/teriyaki-salmon-bowl-recipe-8624279
Scraping recipe: https://www.allrecipes.com/meatless-grain-bowl-recipe-8584633
Scraping recipe: https://www.allrecipes.com/bourbon-teriyaki-salmon-rice-bowl-recipe-7968978
Scraping recipe: https://www.allrecipes.com/garlic-ginger-chicken-meatball-bowls-recipe-7570245
Scraping recipe: https://www.allrecipes.com/watermelon-tuna-poke-bowl-recipe-7568001
Scraping recipe: https://www.allrecipes.com/salmon-couscous-salad-recipe-7561932
Scraping recipe: https://www.allrecipes.com/spicy-canned-salmon-salad-rice-bowl-recipe-7558696
Scraping recipe: https://www.allrecipes.com/chicken-teriyaki-rice-bowls-recipe-7372857
Scraping recipe: https://www.allrecipes.com/recipe/8537903/spinach-and-artichoke-farro-bowl-with-chicken-and-mushrooms/
Scraping recipe: https://www.allrecipes.com/recipe/268091/easy-korean-ground-beef-bowl/
Scraping recipe: https://www.allrecipes.com/recipe/278876/spicy-tuna-rice-bowl/
Scraping recipe:

## Vegetable main dishes
https://www.allrecipes.com/recipes/22472/main-dish/vegetable-main-dishes/

In [14]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/22472/main-dish/vegetable-main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/vegetable.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'vegetable.txt'")

Scraping: https://www.allrecipes.com/recipes/22472/main-dish/vegetable-main-dishes/
Saved 64 recipe URLs to 'vegetable.txt'


In [15]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/vegetable.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/vegetable.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/mushroom-steak-diane-recipe-8722391
Scraping recipe: https://www.allrecipes.com/caramelized-onion-and-roasted-garlic-pasta-recipe-8724725
Scraping recipe: https://www.allrecipes.com/million-dollar-potato-casserole-recipe-8710394
Scraping recipe: https://www.allrecipes.com/recipe/200877/ratatouille-bake/
Scraping recipe: https://www.allrecipes.com/broccoli-fritters-recipe-8646002
Scraping recipe: https://www.allrecipes.com/chef-john-s-tortilla-de-patatas-recipe-8640102
Scraping recipe: https://www.allrecipes.com/recipe/231457/pesto-spaghetti-squash/
Scraping recipe: https://www.allrecipes.com/recipe/222000/spaghetti-aglio-e-olio/
Scraping recipe: https://www.allrecipes.com/recipe/18417/spanakopita-greek-spinach-pie/
Scraping recipe: https://www.allrecipes.com/recipe/276505/grandmas-hash-brown-casserole/
Scraping recipe: https://www.allrecipes.com/recipe/15559/black-beans-and-rice/
Scraping recipe: https://www.allrecipes.com/recipe/13941/zucchi

## Burgers
https://www.allrecipes.com/recipes/248/main-dish/burgers/

In [16]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/248/main-dish/burgers/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/burgers.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'burgers.txt'")

Scraping: https://www.allrecipes.com/recipes/248/main-dish/burgers/
Saved 64 recipe URLs to 'burgers.txt'


In [17]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/burgers.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/burgers.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/hamburgers-with-grilled-apple-and-chili-crisp-recipe-8637636
Scraping recipe: https://www.allrecipes.com/copycat-mcdonalds-filet-o-fish-sandwich-recipe-8580443
Scraping recipe: https://www.allrecipes.com/copycat-onion-wrapped-flying-dutchman-recipe-8553215
Scraping recipe: https://www.allrecipes.com/spicy-chicken-burgers-with-mango-salsa-recipe-7971039
Scraping recipe: https://www.allrecipes.com/pork-dumpling-hamburger-sliders-recipe-7971775
Scraping recipe: https://www.allrecipes.com/chipotle-chicken-burgers-recipe-7971118
Scraping recipe: https://www.allrecipes.com/turkey-burgers-with-rice-krispies-recipe-7969063
Scraping recipe: https://www.allrecipes.com/mediterranean-ground-turkey-burgers-with-olive-and-feta-topping-recipe-7570594
Scraping recipe: https://www.allrecipes.com/chimichurri-chicken-burgers-recipe-7553187
Scraping recipe: https://www.allrecipes.com/seven-layer-sliders-recipe-7500680
Scraping recipe: https://www.allrecipes.com/

## Pizza
https://www.allrecipes.com/recipes/250/main-dish/pizza/

In [18]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/250/main-dish/pizza/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/pizza.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'pizza.txt'")

Scraping: https://www.allrecipes.com/recipes/250/main-dish/pizza/
Saved 64 recipe URLs to 'pizza.txt'


In [19]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/pizza.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/pizza.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/roman-style-pizza-recipe-8771243
Scraping recipe: https://www.allrecipes.com/sweet-potato-and-bacon-pizza-recipe-8747988
Scraping recipe: https://www.allrecipes.com/easy-tiktok-pizza-appetizer-recipe-8660871
Scraping recipe: https://www.allrecipes.com/pizza-poppers-recipe-11681552
Scraping recipe: https://www.allrecipes.com/new-england-beach-pizza-recipe-8693117
Scraping recipe: https://www.allrecipes.com/folded-pizza-sandwich-recipe-8666883
Scraping recipe: https://www.allrecipes.com/recipe/234535/chef-johns-sausage-and-egg-pizza/
Scraping recipe: https://www.allrecipes.com/perfect-pan-pizza-recipe-8656583
Scraping recipe: https://www.allrecipes.com/recipe/239433/fried-peach-and-pancetta-pizza/
Scraping recipe: https://www.allrecipes.com/buffalo-caesar-chicken-crust-pizza-recipe-8648525
Scraping recipe: https://www.allrecipes.com/pretzel-crust-pizza-recipe-8635843
Scraping recipe: https://www.allrecipes.com/copycat-crazy-puffs-recipe-8621973

## Sandwhiches 
https://www.allrecipes.com/recipes/251/main-dish/sandwiches/

In [20]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/251/main-dish/sandwiches/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/sandwhiches.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'sandwhiches.txt'")

Scraping: https://www.allrecipes.com/recipes/251/main-dish/sandwiches/
Saved 64 recipe URLs to 'sandwhiches.txt'


In [21]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/sandwhiches.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/sandwhiches.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/pickle-grilled-cheese-recipe-11711130
Scraping recipe: https://www.allrecipes.com/cuban-crunchwrap-recipe-11707035
Scraping recipe: https://www.allrecipes.com/easy-cottage-cheese-wrap-recipe-11697300
Scraping recipe: https://www.allrecipes.com/cream-cheese-and-peanut-butter-strawberry-sandwich-recipe-11696003
Scraping recipe: https://www.allrecipes.com/air-fryer-english-muffin-tuna-melt-recipe-11695943
Scraping recipe: https://www.allrecipes.com/everything-but-the-bagel-cottage-cheese-wrap-recipe-11694939
Scraping recipe: https://www.allrecipes.com/french-onion-hotdogs-recipe-11685564
Scraping recipe: https://www.allrecipes.com/sloppy-joe-sliders-recipe-8789641
Scraping recipe: https://www.allrecipes.com/mississippi-pot-roast-cheesesteak-recipe-8785850
Scraping recipe: https://www.allrecipes.com/famous-mexican-hotdogs-recipe-8780999
Scraping recipe: https://www.allrecipes.com/smashed-chicken-parmesan-texas-toast-recipe-8778198
Scraping recipe

## Stir fry
https://www.allrecipes.com/recipes/259/main-dish/stir-fry/

In [22]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/259/main-dish/stir-fry/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/stir_fry.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'stir_fry.txt'")

Scraping: https://www.allrecipes.com/recipes/259/main-dish/stir-fry/
Saved 64 recipe URLs to 'stir_fry.txt'


In [23]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/stir_fry.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/stir_fry.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/recipe/280800/quick-tofu-stir-fry/
Scraping recipe: https://www.allrecipes.com/turmeric-black-pepper-chicken-with-broccoli-recipe-8750707
Scraping recipe: https://www.allrecipes.com/shrimp-stir-fry-with-snap-peas-and-baby-corn-recipe-8753997
Scraping recipe: https://www.allrecipes.com/recipe/34668/chinese-fried-noodles/
Scraping recipe: https://www.allrecipes.com/caramel-apple-tofu-and-noodle-stir-fry-recipe-8745592
Scraping recipe: https://www.allrecipes.com/shiitake-mushroom-stir-fry-recipe-8722745
Scraping recipe: https://www.allrecipes.com/ramen-vegetable-stir-fry-recipe-8735776
Scraping recipe: https://www.allrecipes.com/ground-turkey-and-vegetable-stir-fry-recipe-8708979
Scraping recipe: https://www.allrecipes.com/shrimp-ramen-stir-fry-recipe-8733280
Scraping recipe: https://www.allrecipes.com/ground-turkey-teriyaki-stir-fry-recipe-8705245
Scraping recipe: https://www.allrecipes.com/ground-beef-and-broccoli-stir-fry-recipe-8679070
Scrap

## Main dish salads
https://www.allrecipes.com/recipes/260/main-dish/salads/

In [24]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/260/main-dish/salads/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/main_salads.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'main_salads.txt'")

Scraping: https://www.allrecipes.com/recipes/260/main-dish/salads/
Saved 64 recipe URLs to 'main_salads.txt'


In [25]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/main_salads.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/main_salads.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/recipe/246138/carries-pad-thai-salad/
Scraping recipe: https://www.allrecipes.com/recipe/14415/cobb-salad/
Scraping recipe: https://www.allrecipes.com/recipe/14186/grilled-chicken-salad-with-seasonal-fruit/
Scraping recipe: https://www.allrecipes.com/recipe/260452/greek-inspired-chicken-salad/
Scraping recipe: https://www.allrecipes.com/recipe/242415/grilled-chicken-peach-and-arugula-salad/
Scraping recipe: https://www.allrecipes.com/recipe/240327/spinach-salad-with-chicken-avocado-and-goat-cheese/
Scraping recipe: https://www.allrecipes.com/recipe/246243/chicken-paillard/
Scraping recipe: https://www.allrecipes.com/recipe/231132/easy-chinese-chicken-salad/
Scraping recipe: https://www.allrecipes.com/recipe/272849/healthy-chicken-salad/
Scraping recipe: https://www.allrecipes.com/recipe/257254/black-bean-corn-and-quinoa-salad/
Scraping recipe: https://www.allrecipes.com/recipe/230050/kale-quinoa-and-avocado-salad-with-lemon-dijon-vinaigrette/

## Beef main dishes 
https://www.allrecipes.com/recipes/446/main-dish/beef/

In [26]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/446/main-dish/beef/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/beef.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'beef.txt'")

Scraping: https://www.allrecipes.com/recipes/446/main-dish/beef/
Saved 64 recipe URLs to 'beef.txt'


In [27]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/beef.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/beef.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/guinness-chili-for-two-recipe-11692381
Scraping recipe: https://www.allrecipes.com/corned-beef-stroganoff-recipe-11692154
Scraping recipe: https://www.allrecipes.com/recipe/264005/kimchi-corned-beef/
Scraping recipe: https://www.allrecipes.com/recipe/222236/coconut-milk-corned-beef-and-cabbage/
Scraping recipe: https://www.allrecipes.com/best-beef-dinners-for-two-8745407
Scraping recipe: https://www.allrecipes.com/bistro-style-steak-recipe-8787026
Scraping recipe: https://www.allrecipes.com/mississippi-pot-roast-cheesesteak-recipe-8785850
Scraping recipe: https://www.allrecipes.com/slow-cooker-ground-beef-stroganoff-recipe-8778867
Scraping recipe: https://www.allrecipes.com/ground-beef-and-rice-skillet-recipe-8767178
Scraping recipe: https://www.allrecipes.com/cowboy-steak-recipe-8758222
Scraping recipe: https://www.allrecipes.com/pepper-parm-prime-rib-recipe-8757842
Scraping recipe: https://www.allrecipes.com/easy-roasted-beef-tenderloin-rec

## Dumplings
https://www.allrecipes.com/recipes/535/main-dish/dumplings/

In [28]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/535/main-dish/dumplings/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/dumplings.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'dumplings.txt'")

Scraping: https://www.allrecipes.com/recipes/535/main-dish/dumplings/
Saved 64 recipe URLs to 'dumplings.txt'


In [29]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/dumplings.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/dumplings.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/rustic-chicken-and-dumplins-recipe-8707675
Scraping recipe: https://www.allrecipes.com/recipe/18465/gnocchi-i/
Scraping recipe: https://www.allrecipes.com/recipe/6900/dumplings/
Scraping recipe: https://www.allrecipes.com/one-pan-dumpling-dinner-8774252
Scraping recipe: https://www.allrecipes.com/apple-sausage-gravy-dumplings-recipe-8714162
Scraping recipe: https://www.allrecipes.com/best-potsticker-gyoza-and-dumpling-recipes-8650336
Scraping recipe: https://www.allrecipes.com/sweet-potato-dumplings-with-peach-thai-chili-sauce-recipe-8347235
Scraping recipe: https://www.allrecipes.com/beef-and-mushroom-dumplings-in-broth-recipe-7369568
Scraping recipe: https://www.allrecipes.com/collard-green-wontons-recipe-7254880
Scraping recipe: https://www.allrecipes.com/recipe/8491621/manti-armenian-dumplings/
Scraping recipe: https://www.allrecipes.com/recipe/8486212/farmer-cheese-dumplings/
Scraping recipe: https://www.allrecipes.com/recipe/261153/perf

## Pork Main 
https://www.allrecipes.com/recipes/673/main-dish/pork/

In [30]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/673/main-dish/pork/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/pork.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'pork.txt'")

Scraping: https://www.allrecipes.com/recipes/673/main-dish/pork/
Saved 64 recipe URLs to 'pork.txt'


In [31]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/pork.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/pork.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/southwestern-pulled-pork-sliders-recipe-8786436
Scraping recipe: https://www.allrecipes.com/slow-cooker-honey-mustard-pulled-pork-recipe-8782549
Scraping recipe: https://www.allrecipes.com/apple-cider-pork-tenderloin-with-sweet-potatoes-recipe-8774666
Scraping recipe: https://www.allrecipes.com/boursin-pork-chops-recipe-8763443
Scraping recipe: https://www.allrecipes.com/ham-and-gruyere-scones-recipe-8747672
Scraping recipe: https://www.allrecipes.com/pork-tenderloins-with-cranberries-walnuts-and-dried-fruit-recipe-8746835
Scraping recipe: https://www.allrecipes.com/honey-mustard-pretzel-pork-chops-recipe-8742371
Scraping recipe: https://www.allrecipes.com/creamy-sage-and-garlic-pork-chops-recipe-8738719
Scraping recipe: https://www.allrecipes.com/smoked-pork-chops-with-sauerkraut-recipe-8732958
Scraping recipe: https://www.allrecipes.com/pulled-pork-casserole-recipe-8719329
Scraping recipe: https://www.allrecipes.com/country-ham-and-biscuits

## Turkey Main
https://www.allrecipes.com/recipes/687/main-dish/turkey/

In [32]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/687/main-dish/turkey/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/turkey.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'turkey.txt'")

Scraping: https://www.allrecipes.com/recipes/687/main-dish/turkey/
Saved 64 recipe URLs to 'turkey.txt'


In [33]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/turkey.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/turkey.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/ground-turkey-teriyaki-stir-fry-recipe-8705245
Scraping recipe: https://www.allrecipes.com/ground-turkey-stir-fry-recipe-8612814
Scraping recipe: https://www.allrecipes.com/recipe/8509206/jerky-roast-turkey/
Scraping recipe: https://www.allrecipes.com/recipe/16984/turkey-in-a-smoker/
Scraping recipe: https://www.allrecipes.com/recipe/166160/juicy-thanksgiving-turkey/
Scraping recipe: https://www.allrecipes.com/recipe/229658/oven-roasted-turkey-breast/
Scraping recipe: https://www.allrecipes.com/recipe/56348/a-simply-perfect-roast-turkey/
Scraping recipe: https://www.allrecipes.com/recipe/245534/roast-spatchcock-turkey/
Scraping recipe: https://www.allrecipes.com/recipe/215412/simple-deep-fried-turkey/
Scraping recipe: https://www.allrecipes.com/recipe/222332/how-to-cook-a-turkey/
Scraping recipe: https://www.allrecipes.com/recipe/23037/easy-beginners-turkey-with-stuffing/
Scraping recipe: https://www.allrecipes.com/recipe/23157/turkey-in-a-ba

## Vegetarian Main
https://www.allrecipes.com/recipes/265/everyday-cooking/vegetarian/main-dishes/

In [34]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/265/everyday-cooking/vegetarian/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/vegetarian.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'vegetarian.txt'")

Scraping: https://www.allrecipes.com/recipes/265/everyday-cooking/vegetarian/main-dishes/
Saved 64 recipe URLs to 'vegetarian.txt'


In [35]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/vegetarian.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/vegetarian.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/white-bean-and-tomato-pasta-recipe-11703008
Scraping recipe: https://www.allrecipes.com/marry-me-chickpeas-recipe-11688105
Scraping recipe: https://www.allrecipes.com/creamy-tuscan-white-bean-skillet-recipe-8769362
Scraping recipe: https://www.allrecipes.com/easy-zucchini-mushroom-skillet-recipe-8753947
Scraping recipe: https://www.allrecipes.com/ramen-vegetable-stir-fry-recipe-8735776
Scraping recipe: https://www.allrecipes.com/mushroom-steak-diane-recipe-8722391
Scraping recipe: https://www.allrecipes.com/caramelized-onion-and-roasted-garlic-pasta-recipe-8724725
Scraping recipe: https://www.allrecipes.com/creamy-caramelized-onion-pasta-recipe-8710600
Scraping recipe: https://www.allrecipes.com/butter-beans-alla-vodka-recipe-8660103
Scraping recipe: https://www.allrecipes.com/recipe/31818/summer-nights-eggplants/
Scraping recipe: https://www.allrecipes.com/zucchini-rollatini-recipe-8680237
Scraping recipe: https://www.allrecipes.com/oyster-m

## Southern Main
https://www.allrecipes.com/recipes/15877/us-recipes/southern/main-dishes/

In [36]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/15877/us-recipes/southern/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/southern.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'southern.txt'")

Scraping: https://www.allrecipes.com/recipes/15877/us-recipes/southern/main-dishes/
Saved 64 recipe URLs to 'southern.txt'


In [37]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/southern.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/southern.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/ultimate-lowcountry-shrimp-and-grits-recipe-8660975
Scraping recipe: https://www.allrecipes.com/crawfish-etouffee-recipe-7371763
Scraping recipe: https://www.allrecipes.com/recipe/58211/authentic-louisiana-red-beans-and-rice/
Scraping recipe: https://www.allrecipes.com/recipe/220128/chef-johns-buttermilk-fried-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/140930/southern-fried-catfish/
Scraping recipe: https://www.allrecipes.com/recipe/150306/the-best-chicken-fried-steak/
Scraping recipe: https://www.allrecipes.com/recipe/44033/daves-low-country-boil/
Scraping recipe: https://www.allrecipes.com/recipe/13824/deep-fried-turkey/
Scraping recipe: https://www.allrecipes.com/recipe/220895/old-charleston-style-shrimp-and-grits/
Scraping recipe: https://www.allrecipes.com/recipe/200036/southern-fried-chicken-livers/
Scraping recipe: https://www.allrecipes.com/recipe/254804/chef-johns-nashville-hot-chicken/
Scraping recipe: https://www.a

## Chinese Main
https://www.allrecipes.com/recipes/17135/world-cuisine/asian/chinese/main-dishes/

In [38]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17135/world-cuisine/asian/chinese/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/chinese.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'chinese.txt'")

Scraping: https://www.allrecipes.com/recipes/17135/world-cuisine/asian/chinese/main-dishes/
Saved 64 recipe URLs to 'chinese.txt'


In [39]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/chinese.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/chinese.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/4-ingredient-orange-chicken-recipe-8713472
Scraping recipe: https://www.allrecipes.com/baked-sweet-and-sour-chicken-recipe-8654805
Scraping recipe: https://www.allrecipes.com/copycat-panda-express-orange-chicken-recipe-8687732
Scraping recipe: https://www.allrecipes.com/ginger-chicken-recipe-8639346
Scraping recipe: https://www.allrecipes.com/black-pepper-chicken-recipe-8382732
Scraping recipe: https://www.allrecipes.com/sesame-chicken-with-broccoli-recipe-8426412
Scraping recipe: https://www.allrecipes.com/copycat-panda-express-chow-mein-recipe-7556785
Scraping recipe: https://www.allrecipes.com/recipe/228823/quick-beef-stir-fry/
Scraping recipe: https://www.allrecipes.com/recipe/61024/asian-orange-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/9027/kung-pao-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/93234/honey-walnut-shrimp/
Scraping recipe: https://www.allrecipes.com/recipe/223156/shrimp-with-broccoli-in-garl

## Filipino Main
https://www.allrecipes.com/recipes/17494/world-cuisine/asian/filipino/main-dishes/

In [40]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17494/world-cuisine/asian/filipino/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/filipino.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'filipino.txt'")

Scraping: https://www.allrecipes.com/recipes/17494/world-cuisine/asian/filipino/main-dishes/
Saved 64 recipe URLs to 'filipino.txt'


In [41]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/filipino.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/filipino.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/recipe/47015/quick-and-easy-pancit/
Scraping recipe: https://www.allrecipes.com/recipe/212929/chicken-tinola/
Scraping recipe: https://www.allrecipes.com/recipe/128699/famous-chicken-adobo/
Scraping recipe: https://www.allrecipes.com/recipe/212994/pinakbet/
Scraping recipe: https://www.allrecipes.com/recipe/204958/pork-sinigang/
Scraping recipe: https://www.allrecipes.com/recipe/270708/lumpia-in-the-air-fryer/
Scraping recipe: https://www.allrecipes.com/recipe/212911/filipino-beef-steak/
Scraping recipe: https://www.allrecipes.com/recipe/212923/adobong-pusit-squid-adobo/
Scraping recipe: https://www.allrecipes.com/recipe/204821/ginataang-manok-chicken-cooked-in-coconut-milk/
Scraping recipe: https://www.allrecipes.com/recipe/237899/filippino-lechon-kawali/
Scraping recipe: https://www.allrecipes.com/recipe/265040/filipino-beef-giniling-afritada-style/
Scraping recipe: https://www.allrecipes.com/recipe/32010/filipino-lumpia/
Scraping recipe: h

## Indian Main
https://www.allrecipes.com/recipes/17136/world-cuisine/asian/indian/main-dishes/

In [42]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17136/world-cuisine/asian/indian/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/indian.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'indian.txt'")

Scraping: https://www.allrecipes.com/recipes/17136/world-cuisine/asian/indian/main-dishes/
Saved 64 recipe URLs to 'indian.txt'


In [43]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/indian.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/indian.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/chickpea-tikka-masala-recipe-7497387
Scraping recipe: https://www.allrecipes.com/chicken-bhuna-recipe-7485475
Scraping recipe: https://www.allrecipes.com/lamb-rogan-josh-recipe-7485811
Scraping recipe: https://www.allrecipes.com/recipe/50347/indian-tandoori-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/231026/keema-aloo-ground-beef-and-potatoes/
Scraping recipe: https://www.allrecipes.com/recipe/212721/indian-chicken-curry-murgh-kari/
Scraping recipe: https://www.allrecipes.com/recipe/141169/easy-indian-butter-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/46822/indian-chicken-curry-ii/
Scraping recipe: https://www.allrecipes.com/recipe/174543/slow-cooker-butter-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/45957/chicken-makhani-indian-butter-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/212487/lamb-gosht-biryani/
Scraping recipe: https://www.allrecipes.com/recipe/45736/chicken-tikka-masa

## Japanese Main
https://www.allrecipes.com/recipes/17491/world-cuisine/asian/japanese/main-dishes/

In [44]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17491/world-cuisine/asian/japanese/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/japanese.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'japanese.txt'")

Scraping: https://www.allrecipes.com/recipes/17491/world-cuisine/asian/japanese/main-dishes/
Saved 64 recipe URLs to 'japanese.txt'


In [46]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/japanese.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/japanese.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/recipe/8539106/yaki-udon/
Scraping recipe: https://www.allrecipes.com/recipe/8536667/homemade-sushi-rolls/
Scraping recipe: https://www.allrecipes.com/recipe/72068/chicken-katsu/
Scraping recipe: https://www.allrecipes.com/recipe/71698/sesame-seared-tuna/
Scraping recipe: https://www.allrecipes.com/recipe/127500/japanese-style-deep-fried-shrimp/
Scraping recipe: https://www.allrecipes.com/recipe/278708/air-fryer-chicken-katsu-with-homemade-katsu-sauce/
Scraping recipe: https://www.allrecipes.com/recipe/190094/miso-and-soy-chilean-sea-bass/
Scraping recipe: https://www.allrecipes.com/recipe/128589/oyakodon-japanese-chicken-and-egg-rice-bowl/
Scraping recipe: https://www.allrecipes.com/recipe/228953/california-roll/
Scraping recipe: https://www.allrecipes.com/recipe/24228/sushi-roll/
Scraping recipe: https://www.allrecipes.com/article/how-to-make-sushi-rolls/
Scraping recipe: https://www.allrecipes.com/recipe/169856/cream-cheese-and-crab-sushi-

## Thai Main
https://www.allrecipes.com/recipes/17137/world-cuisine/asian/thai/main-dishes/

In [47]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17137/world-cuisine/asian/thai/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/thai.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'thai.txt'")

Scraping: https://www.allrecipes.com/recipes/17137/world-cuisine/asian/thai/main-dishes/
Saved 64 recipe URLs to 'thai.txt'


In [48]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/thai.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/thai.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/thai-pasta-salad-recipe-8646006
Scraping recipe: https://www.allrecipes.com/thai-inspired-potato-salad-recipe-8655649
Scraping recipe: https://www.allrecipes.com/larb-gai-6604488
Scraping recipe: https://www.allrecipes.com/recipe/8465373/slow-cooker-beef-massaman-curry/
Scraping recipe: https://www.allrecipes.com/recipe/142055/chicken-massaman-curry/
Scraping recipe: https://www.allrecipes.com/recipe/42968/pad-thai/
Scraping recipe: https://www.allrecipes.com/recipe/222350/authentic-pad-thai/
Scraping recipe: https://www.allrecipes.com/recipe/141833/thai-green-curry-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/213947/panang-curry-with-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/68532/curried-coconut-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/86005/thai-pineapple-chicken-curry/
Scraping recipe: https://www.allrecipes.com/recipe/145572/thai-spicy-basil-chicken-fried-rice/
Scraping recipe: https://

## French Main
https://www.allrecipes.com/recipes/17138/world-cuisine/european/french/main-dishes/

In [49]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17138/world-cuisine/european/french/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/french.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'french.txt'")

Scraping: https://www.allrecipes.com/recipes/17138/world-cuisine/european/french/main-dishes/
Saved 64 recipe URLs to 'french.txt'


In [51]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/french.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/french.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/recipe/236699/chef-johns-orange-duck/
Scraping recipe: https://www.allrecipes.com/recipe/76296/meat-pie/
Scraping recipe: https://www.allrecipes.com/recipe/8465314/trout-meuniere/
Scraping recipe: https://www.allrecipes.com/recipe/8524473/beans-and-greens-tartine/
Scraping recipe: https://www.allrecipes.com/recipe/282251/croque-madame-sandwich/
Scraping recipe: https://www.allrecipes.com/recipe/8495/chicken-cordon-bleu-i/
Scraping recipe: https://www.allrecipes.com/recipe/87386/lobster-thermidor/
Scraping recipe: https://www.allrecipes.com/recipe/17515/quiche-lorraine-i/
Scraping recipe: https://www.allrecipes.com/recipe/85107/omelet-in-a-bag/
Scraping recipe: https://www.allrecipes.com/recipe/239230/chef-johns-coq-au-vin/
Scraping recipe: https://www.allrecipes.com/recipe/213487/chicken-french-rochester-ny-style/
Scraping recipe: https://www.allrecipes.com/recipe/8669/chicken-cordon-bleu-ii/
Scraping recipe: https://www.allrecipes.com/recipe

## Greek Main
https://www.allrecipes.com/recipes/17152/world-cuisine/european/greek/main-dishes/

In [53]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17152/world-cuisine/european/greek/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/greek.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'greek.txt'")

Scraping: https://www.allrecipes.com/recipes/17152/world-cuisine/european/greek/main-dishes/
Saved 64 recipe URLs to 'greek.txt'


In [54]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/greek.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/greek.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/greek-style-grilled-lamb-chops-recipe-8680071
Scraping recipe: https://www.allrecipes.com/greek-turkey-meatballs-recipe-8654633
Scraping recipe: https://www.allrecipes.com/recipe/14713/souvlaki/
Scraping recipe: https://www.allrecipes.com/greek-feta-and-spinach-potato-casserole-recipe-7372803
Scraping recipe: https://www.allrecipes.com/recipe/173420/traditional-gyro-meat/
Scraping recipe: https://www.allrecipes.com/recipe/19644/moussaka/
Scraping recipe: https://www.allrecipes.com/recipe/231644/chicken-souvlaki-with-tzatziki-sauce/
Scraping recipe: https://www.allrecipes.com/recipe/240559/traditional-gyros/
Scraping recipe: https://www.allrecipes.com/recipe/14527/greek-chicken/
Scraping recipe: https://www.allrecipes.com/recipe/25311/vegetarian-moussaka/
Scraping recipe: https://www.allrecipes.com/recipe/236932/branzino-mediterranean/
Scraping recipe: https://www.allrecipes.com/recipe/54202/greek-style-garlic-chicken-breast/
Scraping recipe: 

## Italian Main
https://www.allrecipes.com/recipes/16767/world-cuisine/european/italian/main-dishes/

In [55]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/16767/world-cuisine/european/italian/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/italian.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'italian.txt'")

Scraping: https://www.allrecipes.com/recipes/16767/world-cuisine/european/italian/main-dishes/
Saved 64 recipe URLs to 'italian.txt'


In [56]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/italian.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/italian.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/creamy-tuscan-white-bean-skillet-recipe-8769362
Scraping recipe: https://www.allrecipes.com/roman-style-pizza-recipe-8771243
Scraping recipe: https://www.allrecipes.com/cod-piccata-recipe-8763624
Scraping recipe: https://www.allrecipes.com/unstuffed-shells-pasta-bake-recipe-8747308
Scraping recipe: https://www.allrecipes.com/pappardelle-with-jammy-onion-ragu-recipe-8741109
Scraping recipe: https://www.allrecipes.com/tuscan-chicken-casserole-recipe-8735197
Scraping recipe: https://www.allrecipes.com/lemon-garlic-butter-chicken-spiedini-recipe-8727930
Scraping recipe: https://www.allrecipes.com/italian-steak-pizzaiola-recipe-8732132
Scraping recipe: https://www.allrecipes.com/slow-cooker-tuscan-chicken-meatballs-with-gnocchi-recipe-8718448
Scraping recipe: https://www.allrecipes.com/short-ribs-pizzaiola-recipe-8720220
Scraping recipe: https://www.allrecipes.com/pork-scallopini-recipe-8686741
Scraping recipe: https://www.allrecipes.com/air-fryer

## Mexican Main
https://www.allrecipes.com/recipes/17504/world-cuisine/latin-american/mexican/main-dishes/

In [57]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17504/world-cuisine/latin-american/mexican/main-dishes/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/mexican.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'mexican.txt'")

Scraping: https://www.allrecipes.com/recipes/17504/world-cuisine/latin-american/mexican/main-dishes/
Saved 64 recipe URLs to 'mexican.txt'


In [58]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/mexican.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/mexican.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/easy-rotisserie-chicken-enchiladas-recipe-11706233
Scraping recipe: https://www.allrecipes.com/easiest-sheet-pan-tacos-recipe-11700898
Scraping recipe: https://www.allrecipes.com/easy-creamy-crockpot-chicken-enchilada-casserole-recipe-11681159
Scraping recipe: https://www.allrecipes.com/shrimp-fajita-rice-casserole-recipe-8788221
Scraping recipe: https://www.allrecipes.com/slow-cooker-chicken-enchilada-casserole-recipe-8788263
Scraping recipe: https://www.allrecipes.com/air-fried-tossed-taquitos-recipe-8784154
Scraping recipe: https://www.allrecipes.com/chicken-adobo-tacos-recipe-8736734
Scraping recipe: https://www.allrecipes.com/roscoe-s-chilaquiles-recipe-8739658
Scraping recipe: https://www.allrecipes.com/taquito-casserole-recipe-8673259
Scraping recipe: https://www.allrecipes.com/chicken-fajita-skillet-casserole-recipe-8701199
Scraping recipe: https://www.allrecipes.com/chicken-chalupas-recipe-8681727
Scraping recipe: https://www.allreci

## Breakfast and Brunch 
https://www.allrecipes.com/recipes/78/breakfast-and-brunch/

In [59]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/78/breakfast-and-brunch/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/breakfast_brunch.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'breakfast_brunch.txt'")

Scraping: https://www.allrecipes.com/recipes/78/breakfast-and-brunch/
Saved 64 recipe URLs to 'breakfast_brunch.txt'


In [60]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/breakfast_brunch.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/breakfast_brunch.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/breakfast-enchilada-casserole-recipe-11706588
Scraping recipe: https://www.allrecipes.com/recipe/9482/scotch-eggs/
Scraping recipe: https://www.allrecipes.com/chia-seed-overnight-oats-recipe-11699507
Scraping recipe: https://www.allrecipes.com/recipe/135063/minute-breakfast-burrito/
Scraping recipe: https://www.allrecipes.com/banana-baked-oatmeal-cups-recipe-11699825
Scraping recipe: https://www.allrecipes.com/homemade-ham-and-cheese-hot-pockets-recipe-11698211
Scraping recipe: https://www.allrecipes.com/sausage-egg-and-cheese-biscuit-casserole-recipe-11695493
Scraping recipe: https://www.allrecipes.com/reese-s-cup-overnight-oats-recipe-11695967
Scraping recipe: https://www.allrecipes.com/reddit-allrecipes-best-blueberry-muffins-11696720
Scraping recipe: https://www.allrecipes.com/high-protein-scrambled-pancakes-recipe-11694315
Scraping recipe: https://www.allrecipes.com/easy-bacon-pancake-sticks-recipe-11694154
Scraping recipe: https://www.a

## Breakfast Eggs
https://www.allrecipes.com/recipes/148/breakfast-and-brunch/eggs/ 

In [61]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/148/breakfast-and-brunch/eggs/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/breakfast_eggs.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'breakfast_eggs.txt'")

Scraping: https://www.allrecipes.com/recipes/148/breakfast-and-brunch/eggs/
Saved 64 recipe URLs to 'breakfast_eggs.txt'


In [3]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/breakfast_eggs.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/breakfast_eggs.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/homemade-ham-and-cheese-hot-pockets-recipe-11698211
Scraping recipe: https://www.allrecipes.com/sausage-egg-and-cheese-biscuit-casserole-recipe-11695493
Scraping recipe: https://www.allrecipes.com/recipe/263116/sherrys-perfect-sous-vide-eggs/
Scraping recipe: https://www.allrecipes.com/crescent-sausage-egg-roll-ups-recipe-11691116
Scraping recipe: https://www.allrecipes.com/recipe/23773/quiche-supreme/
Scraping recipe: https://www.allrecipes.com/recipe/18477/quiche-lorraine-ii/
Scraping recipe: https://www.allrecipes.com/cafe-tropical-burrito-recipe-8777851
Scraping recipe: https://www.allrecipes.com/menemen-turkish-scrambled-eggs-recipe-8774545
Scraping recipe: https://www.allrecipes.com/boursin-omelet-recipe-8750293
Scraping recipe: https://www.allrecipes.com/one-pan-breakfast-skillet-mac-and-cheese-recipe-8758487
Scraping recipe: https://www.allrecipes.com/recipe/9482/scotch-eggs/
Scraping recipe: https://www.allrecipes.com/leftover-mashed

## Breakfast meat and seafood
https://www.allrecipes.com/recipes/150/breakfast-and-brunch/meat-and-seafood/

In [4]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/150/breakfast-and-brunch/meat-and-seafood/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/breakfast_meat_seafood.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'breakfast_meat_seafood.txt'")

Scraping: https://www.allrecipes.com/recipes/150/breakfast-and-brunch/meat-and-seafood/
Saved 64 recipe URLs to 'breakfast_meat_seafood.txt'


In [5]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/breakfast_meat_seafood.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/breakfast_meat_seafood.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/crescent-sausage-egg-roll-ups-recipe-11691116
Scraping recipe: https://www.allrecipes.com/one-pan-breakfast-skillet-mac-and-cheese-recipe-8758487
Scraping recipe: https://www.allrecipes.com/million-dollar-bacon-recipe-8749105
Scraping recipe: https://www.allrecipes.com/full-english-breakfast-recipe-8739702
Scraping recipe: https://www.allrecipes.com/nyc-deli-bacon-egg-and-cheese-sandwich-recipe-8739440
Scraping recipe: https://www.allrecipes.com/turkey-sausage-potato-brussels-sprouts-and-apple-hash-recipe-8690240
Scraping recipe: https://www.allrecipes.com/mini-sausage-pancake-muffins-recipe-8403975
Scraping recipe: https://www.allrecipes.com/breakfast-pork-cutlets-recipe-7198099
Scraping recipe: https://www.allrecipes.com/recipe/216391/easy-sausage-gravy-and-biscuits/
Scraping recipe: https://www.allrecipes.com/recipe/21649/sausage-balls/
Scraping recipe: https://www.allrecipes.com/recipe/220961/leftover-ham-n-potato-casserole/
Scraping reci

## Smoothies 
https://www.allrecipes.com/recipes/138/drinks/smoothies/

In [6]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/138/drinks/smoothies/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/smoothies.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'smoothies.txt'")

Scraping: https://www.allrecipes.com/recipes/138/drinks/smoothies/
Saved 64 recipe URLs to 'smoothies.txt'


In [7]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/smoothies.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/smoothies.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/korean-banana-milk-recipe-7966165
Scraping recipe: https://www.allrecipes.com/recipe/265537/zucchini-smoothie/
Scraping recipe: https://www.allrecipes.com/recipe/264158/sweet-beet-smoothie/
Scraping recipe: https://www.allrecipes.com/monkey-c-juice-recipe-7370055
Scraping recipe: https://www.allrecipes.com/recipe/279625/pb-j-smoothie/
Scraping recipe: https://www.allrecipes.com/recipe/221261/peanut-butter-banana-smoothie/
Scraping recipe: https://www.allrecipes.com/recipe/215184/blueberry-smoothie/
Scraping recipe: https://www.allrecipes.com/recipe/244959/chocolate-banana-peanut-butter-protein-shake/
Scraping recipe: https://www.allrecipes.com/recipe/215189/fruit-and-yogurt-smoothie/
Scraping recipe: https://www.allrecipes.com/recipe/23539/strawberry-oatmeal-breakfast-smoothie/
Scraping recipe: https://www.allrecipes.com/recipe/241019/spinach-and-banana-power-smoothie/
Scraping recipe: https://www.allrecipes.com/recipe/23553/basic-fruit-smoot

## Bowls 
https://www.allrecipes.com/recipes/17881/main-dish/bowls/

In [8]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17881/main-dish/bowls/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/bowls.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'bowls.txt'")

Scraping: https://www.allrecipes.com/recipes/17881/main-dish/bowls/
Saved 64 recipe URLs to 'bowls.txt'


In [9]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/bowls.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/bowls.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/teriyaki-salmon-bowl-recipe-8624279
Scraping recipe: https://www.allrecipes.com/meatless-grain-bowl-recipe-8584633
Scraping recipe: https://www.allrecipes.com/bourbon-teriyaki-salmon-rice-bowl-recipe-7968978
Scraping recipe: https://www.allrecipes.com/garlic-ginger-chicken-meatball-bowls-recipe-7570245
Scraping recipe: https://www.allrecipes.com/watermelon-tuna-poke-bowl-recipe-7568001
Scraping recipe: https://www.allrecipes.com/salmon-couscous-salad-recipe-7561932
Scraping recipe: https://www.allrecipes.com/spicy-canned-salmon-salad-rice-bowl-recipe-7558696
Scraping recipe: https://www.allrecipes.com/chicken-teriyaki-rice-bowls-recipe-7372857
Scraping recipe: https://www.allrecipes.com/recipe/8537903/spinach-and-artichoke-farro-bowl-with-chicken-and-mushrooms/
Scraping recipe: https://www.allrecipes.com/recipe/268091/easy-korean-ground-beef-bowl/
Scraping recipe: https://www.allrecipes.com/recipe/278876/spicy-tuna-rice-bowl/
Scraping recipe:

## Sandwhich 
https://www.allrecipes.com/recipes/251/main-dish/sandwiches/

In [10]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/251/main-dish/sandwiches/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/sandwhich.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'sandwhich.txt'")

Scraping: https://www.allrecipes.com/recipes/251/main-dish/sandwiches/
Saved 64 recipe URLs to 'sandwhich.txt'


In [11]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/sandwhich.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/sandwhich.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/pickle-grilled-cheese-recipe-11711130
Scraping recipe: https://www.allrecipes.com/cuban-crunchwrap-recipe-11707035
Scraping recipe: https://www.allrecipes.com/easy-cottage-cheese-wrap-recipe-11697300
Scraping recipe: https://www.allrecipes.com/cream-cheese-and-peanut-butter-strawberry-sandwich-recipe-11696003
Scraping recipe: https://www.allrecipes.com/air-fryer-english-muffin-tuna-melt-recipe-11695943
Scraping recipe: https://www.allrecipes.com/everything-but-the-bagel-cottage-cheese-wrap-recipe-11694939
Scraping recipe: https://www.allrecipes.com/french-onion-hotdogs-recipe-11685564
Scraping recipe: https://www.allrecipes.com/sloppy-joe-sliders-recipe-8789641
Scraping recipe: https://www.allrecipes.com/mississippi-pot-roast-cheesesteak-recipe-8785850
Scraping recipe: https://www.allrecipes.com/famous-mexican-hotdogs-recipe-8780999
Scraping recipe: https://www.allrecipes.com/smashed-chicken-parmesan-texas-toast-recipe-8778198
Scraping recipe

In [3]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/76/appetizers-and-snacks/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/apps_and_snacks.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'apps_and_snacks.txt'")

Scraping: https://www.allrecipes.com/recipes/76/appetizers-and-snacks/
Saved 64 recipe URLs to 'apps_and_snacks.txt'


In [4]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/apps_and_snacks.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/apps_and_snacks.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/cheesy-mashed-potato-puffs-recipe-11718758
Scraping recipe: https://www.allrecipes.com/boursin-stuffed-peppadew-peppers-recipe-11717965
Scraping recipe: https://www.allrecipes.com/boursin-stuffed-dates-recipe-11716755
Scraping recipe: https://www.allrecipes.com/pickle-marinated-cheese-recipe-11718027
Scraping recipe: https://www.allrecipes.com/copycat-pickle-de-gallo-recipe-11709316
Scraping recipe: https://www.allrecipes.com/cheesy-pickle-pigs-in-a-blanket-recipe-11714642
Scraping recipe: https://www.allrecipes.com/4-ingredient-pepper-pizza-bites-recipe-11716499
Scraping recipe: https://www.allrecipes.com/marinated-mozzarella-balls-recipe-11715215
Scraping recipe: https://www.allrecipes.com/copycat-wingstop-ranch-recipe-11714770
Scraping recipe: https://www.allrecipes.com/recipe/234065/classic-pickled-eggs/
Scraping recipe: https://www.allrecipes.com/pickle-rollup-dip-recipe-11703326
Scraping recipe: https://www.allrecipes.com/cheesy-pickle-

TooManyRedirects: Exceeded 30 redirects.

## Desserts
https://www.allrecipes.com/recipes/79/desserts/

In [5]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/79/desserts/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/desserts.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'desserts.txt'")

Scraping: https://www.allrecipes.com/recipes/79/desserts/
Saved 64 recipe URLs to 'desserts.txt'


In [6]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/desserts.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/desserts.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/strawberry-sando-recipe-11717106
Scraping recipe: https://www.allrecipes.com/best-banana-pudding-recipes-8699958
Scraping recipe: https://www.allrecipes.com/recipe/222844/old-fashioned-shortcake/
Scraping recipe: https://www.allrecipes.com/5-ingredient-chocolate-pretzel-bites-recipe-11714991
Scraping recipe: https://www.allrecipes.com/no-bake-carrot-cheesecake-recipe-11709646
Scraping recipe: https://www.allrecipes.com/ooey-gooey-carrot-butter-cake-recipe-11709616
Scraping recipe: https://www.allrecipes.com/banoffee-cheesecake-bars-recipe-11707744
Scraping recipe: https://www.allrecipes.com/no-bake-lemon-ricotta-cheesecake-bars-recipe-11706597
Scraping recipe: https://www.allrecipes.com/recipe/221855/french-yogurt-cake/
Scraping recipe: https://www.allrecipes.com/no-bake-millionaire-s-shortbread-recipe-11703318
Scraping recipe: https://www.allrecipes.com/lemon-olive-oil-cake-recipe-11706006
Scraping recipe: https://www.allrecipes.com/easy-lem

## Chocolate Desserts
https://www.allrecipes.com/recipes/1557/desserts/chocolate/

In [7]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/1557/desserts/chocolate/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/chocolate_desserts.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'chocolate_desserts.txt'")

Scraping: https://www.allrecipes.com/recipes/1557/desserts/chocolate/
Saved 64 recipe URLs to 'chocolate_desserts.txt'


In [8]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/chocolate_desserts.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/chocolate_desserts.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/french-silk-pie-bars-recipe-8785326
Scraping recipe: https://www.allrecipes.com/best-brownie-recipes-8744179
Scraping recipe: https://www.allrecipes.com/no-bake-espresso-martini-cheesecakes-recipe-8777811
Scraping recipe: https://www.allrecipes.com/blackout-cake-recipe-8762039
Scraping recipe: https://www.allrecipes.com/sleeping-gingerbread-treats-recipe-8758467
Scraping recipe: https://www.allrecipes.com/little-debbie-brownie-tree-dip-recipe-8758415
Scraping recipe: https://www.allrecipes.com/mint-chocolate-recipes-8732918
Scraping recipe: https://www.allrecipes.com/gingerbread-fudge-recipe-8749111
Scraping recipe: https://www.allrecipes.com/cottage-cheese-chocolate-chip-cookie-dough-recipe-8742638
Scraping recipe: https://www.allrecipes.com/air-fryer-s-mores-recipe-8736955
Scraping recipe: https://www.allrecipes.com/copycat-trader-joe-s-gone-bananas-recipe-8733325
Scraping recipe: https://www.allrecipes.com/blackberry-cheesecake-brownies-re

## Fruit Desserts
https://www.allrecipes.com/recipes/17140/desserts/fruit-desserts/

In [9]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/17140/desserts/fruit-desserts/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/fruit_desserts.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'fruit_desserts.txt'")

Scraping: https://www.allrecipes.com/recipes/17140/desserts/fruit-desserts/
Saved 64 recipe URLs to 'fruit_desserts.txt'


In [10]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/fruit_desserts.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/fruit_desserts.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/easy-strawberry-brownies-recipe-11696682
Scraping recipe: https://www.allrecipes.com/creamsicle-sheet-cake-recipe-8783024
Scraping recipe: https://www.allrecipes.com/sticky-rice-with-passion-fruit-sauce-recipe-11686748
Scraping recipe: https://www.allrecipes.com/rhubarb-buttermilk-clafoutis-recipe-11686831
Scraping recipe: https://www.allrecipes.com/bananas-foster-crispy-rice-recipe-11685503
Scraping recipe: https://www.allrecipes.com/fudgy-black-forest-bars-recipe-8781403
Scraping recipe: https://www.allrecipes.com/cherry-torte-with-cherry-prosecco-syrup-recipe-8786525
Scraping recipe: https://www.allrecipes.com/lemon-crumb-bars-recipe-8781116
Scraping recipe: https://www.allrecipes.com/whole-orange-blender-cake-recipe-8769291
Scraping recipe: https://www.allrecipes.com/no-bake-nutter-butter-banana-pudding-cheesecake-recipe-8733272
Scraping recipe: https://www.allrecipes.com/grate-apple-crisp-recipe-8727014
Scraping recipe: https://www.allre

## Frozen Desserts
https://www.allrecipes.com/recipes/364/desserts/frozen-desserts/

In [11]:
# Saving each recipes url 
category_urls = [
    'https://www.allrecipes.com/recipes/364/desserts/frozen-desserts/'
]

headers = {
    'User-Agent': 'Mozilla/5.0'
}

all_recipe_urls = []

for url in category_urls:
    print(f"Scraping: {url}")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    recipe_cards = soup.find_all('a', class_ = 'comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')
    recipe_urls = [card['href'] for card in recipe_cards if card.has_attr('href')]

    all_recipe_urls.extend(recipe_urls)

    # Delay for 1 seconds
    time.sleep(1)  

# Save url to a text file
with open('data/scraped_urls/frozen_dessert.txt', 'w') as file:
    for link in all_recipe_urls:
        file.write(f"{link}\n")

print(f"Saved {len(all_recipe_urls)} recipe URLs to 'frozen_dessert.txt'")

Scraping: https://www.allrecipes.com/recipes/364/desserts/frozen-desserts/
Saved 64 recipe URLs to 'frozen_dessert.txt'


In [12]:
def scrape_recipe(recipe_url):
    print(f"Scraping recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Scraping the recipe information
    try:
        title = soup.find('h1', class_='article-heading text-headline-400').text.strip()
    except AttributeError:
        title = None

    # Scraping the intro
    try:
        intro = soup.find('p', class_='article-subheading text-utility-300').text.strip()
    except AttributeError:
        intro = None

    # Scraping prep time, cook time, total time, and servings
    times = soup.find_all('div', class_='mm-recipes-details__item')
    
    # Time variables
    prep_time = cook_time = total_time = servings = None

    for time in times:
        label = time.find('div', class_='mm-recipes-details__label').text.strip()
        value = time.find('div', class_='mm-recipes-details__value').text.strip()

        if label == 'Prep Time:':
            prep_time = value
        elif label == 'Cook Time:':
            cook_time = value
        elif label == 'Total Time:':
            total_time = value
        elif label == 'Servings:':
            servings = value

    # Ingredients
    ingredients = []
    try:
        ingredient_elements = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        for ingredient in ingredient_elements:
            ingredients.append(ingredient.get_text().strip())
    except AttributeError:
        ingredients = None

    # Directions
    directions = []
    try:
        direction_elements = soup.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')
        for direction in direction_elements:
            directions.append(direction.get_text().strip())
    except AttributeError:
        directions = None

    # Nutrition facts 
    nutrition = {}
    try:
        nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
        rows = nutrition_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                nutrition[cols[1].text.strip()] = cols[0].text.strip()
    except AttributeError:
        nutrition = None

    return {
        'title': title,
        'intro': intro,
        'prep_time': prep_time,
        'cook_time': cook_time,
        'total_time': total_time,
        'servings': servings,
        'ingredients': ingredients,
        'directions': directions,
        'nutrition': nutrition,
        'recipe_url': recipe_url
    }

# Read the saved URLs from the file
with open('data/scraped_urls/frozen_dessert.txt', 'r') as file:
    recipe_urls = file.readlines()

# Clean up any extra whitespace or newlines
recipe_urls = [url.strip() for url in recipe_urls]

# Set headers for requests
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Create a list to hold all the recipe data
all_recipes_data = []

# Loop through the URLs and scrape the information
for recipe_url in recipe_urls:
    recipe_data = scrape_recipe(recipe_url)
    
    # Append the data to the list
    all_recipes_data.append(recipe_data)
    
    # Polite delay to prevent being blocked
    time.sleep(1)  # delay for 1 second

# Convert the list of recipes to a pandas DataFrame
df = pd.DataFrame(all_recipes_data)

# Save the DataFrame to a CSV file
df.to_csv('data/scraped_data/frozen_dessert.csv', index=False)

# Optionally, display the table
print(df.head())

Scraping recipe: https://www.allrecipes.com/frozen-tajin-grapes-recipe-8709027
Scraping recipe: https://www.allrecipes.com/3-ingredient-ube-ice-cream-recipe-8681790
Scraping recipe: https://www.allrecipes.com/dulce-de-leche-ice-cream-recipe-8680054
Scraping recipe: https://www.allrecipes.com/copycat-choco-tacos-recipe-8660298
Scraping recipe: https://www.allrecipes.com/best-chef-john-ice-cream-recipes-8672733
Scraping recipe: https://www.allrecipes.com/bastani-saffron-and-rose-ice-cream-recipe-8661887
Scraping recipe: https://www.allrecipes.com/peanut-butter-cottage-cheese-ice-cream-recipe-8644150
Scraping recipe: https://www.allrecipes.com/easy-no-churn-ice-cream-recipe-8581998
Scraping recipe: https://www.allrecipes.com/donut-ice-cream-sandwiches-recipe-8423867
Scraping recipe: https://www.allrecipes.com/nutella-ice-cream-recipe-7508716
Scraping recipe: https://www.allrecipes.com/no-churn-blackberry-ice-cream-recipe-7852324
Scraping recipe: https://www.allrecipes.com/summer-fruit-ice

# Join all tables

In [13]:
import glob

In [14]:
csv_files = glob.glob('data/scraped_data/*.csv')

In [15]:
# Read and combine
dfs = [pd.read_csv(file) for file in csv_files]
all_recipes = pd.concat(dfs, ignore_index=True)

In [16]:
all_recipes.to_csv("data/all_recipes_combined.csv", index=False)