In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3

## Scraping Japanese Recipes
'www.justonecookbook.com'

In [2]:
recipes = []

# URL for the recipe index
base_url = "https://www.justonecookbook.com/recipes/page/{}/"

# Loop through all pages
for page in range(1, 20):  # Iterate through all pages
    url = base_url.format(page)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract recipe cards
    recipe_cards = soup.find_all("article", class_="post-filter post-sm post-abbr")
    # recipe_cards = soup.find_all('h3', class_='article-title')
    for card in recipe_cards:
        information = card.find("h3", class_="article-title")
        title = information.text.strip()
        link_tag = information.find("a")
        link = link_tag["href"]

        # Extract image URL
        image_tag = card.find("img")  # Assuming <img> tag exists in the card
        if image_tag and "src" in image_tag.attrs:
            image_url = image_tag["src"]
            # Download the image content
            image_response = requests.get(image_url)
            if image_response.status_code == 200:
                image_data = image_response.content  # Binary image data
            else:
                print(f"Failed to fetch image for {title}")
                image_data = None
        else:
            image_url = None
            image_data = None

        # Append to recipes list
        recipes.append(
            {
                "title": title,
                "link": link,
                "image_url": image_url,
                "image_data": image_data,
            }
        )

    print(f"Page {page} scraped successfully!")

Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!
Page 7 scraped successfully!
Page 8 scraped successfully!
Page 9 scraped successfully!
Page 10 scraped successfully!
Page 11 scraped successfully!
Page 12 scraped successfully!
Page 13 scraped successfully!
Page 14 scraped successfully!
Page 15 scraped successfully!
Page 16 scraped successfully!
Page 17 scraped successfully!
Page 18 scraped successfully!
Page 19 scraped successfully!


In [3]:
# Create an empty DataFrame to store all recipes from japanese recipes
japanese_recipes = []

for i in range(len(recipes)):  # len(recipes)
    current_recipe = recipes[i]
    url = current_recipe["link"]

    print(f"\rProgress: {i}, url:{url}", end="")

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract recipe cards, 2 checks, one checks the hyperlink breakdcrumsb and the other checks if there is a jump to recipe button
    recipe_cards = soup.find("span", class_="jump-text")
    recipe_exists = soup.find("div", class_="breadcrumbs").text.strip()
    recipe_exists2 = recipe_cards.text.strip()

    # recipe_exists == 'Jump to Recipe' and
    if (
        ("How-Tos" not in recipe_exists)
        and ("Recipe Collection" not in recipe_exists)
        and recipe_exists2 == "Jump to Recipe"
    ):
        # if recipe_exists2 == 'Jump to Recipe':

        # Get Professional title
        current_recipe["title"] = soup.find(
            "h2", class_="wprm-recipe-name wprm-block-text-bold"
        ).text.strip()

        # Get recipe description
        current_recipe["description"] = soup.find(
            "div", class_="wprm-recipe-summary wprm-block-text-normal"
        ).text.strip()

        # Get times
        time_block = soup.find("div", class_="wprm-recipe-total-time-container")
        if time_block:
            time_list = time_block.text.strip().split()
            label = " ".join(time_list[0:2])
            value = " ".join(time_list[2:4])
            current_recipe[label] = value

        # Get type of food
        type_food = soup.find(
            "div",
            class_="wprm-recipe-meta-container wprm-recipe-tags-container wprm-recipe-details-container wprm-recipe-details-container-inline wprm-block-text-normal",
        )
        if type_food:
            type_cards = type_food.find_all("div", class_="wprm-recipe-tag-container")
            for card in type_cards:
                label_element = card.find("span", class_="wprm-recipe-tag-label")
                value_element = card.find("span", class_="wprm-block-text-normal")
                if label_element and value_element:
                    current_recipe[label_element.text.strip()] = (
                        value_element.text.strip()
                    )

        # Extract ingredients
        ingredients = []
        ingredient_cards = soup.find_all("li", class_="wprm-recipe-ingredient")
        for card in ingredient_cards:
            ingredient_block = card.find("span", class_="wprm-recipe-ingredient-name")
            if ingredient_block:
                name = ingredient_block.text.strip()
                ingredients.append(name)
        current_recipe["ingredients"] = ingredients

        # Get nutrition values
        nutrition_cards = soup.find_all(
            "span", class_="wprm-nutrition-label-text-nutrition-container"
        )
        for card in nutrition_cards:
            label_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-label"
            )
            value_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-value"
            )
            unit_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-unit"
            )
            if label_element and value_element and unit_element:
                current_recipe[label_element.text.strip()] = " ".join(
                    [value_element.text.strip(), unit_element.text.strip()]
                )

        # Get instructions
        instructions_block = soup.find(
            "div", class_="wprm-recipe-instructions-container"
        ).text.strip()
        current_recipe["instructions"] = instructions_block

        # Add to list
        japanese_recipes.append(current_recipe)

df_japanese_recipes = pd.DataFrame(japanese_recipes)

Progress: 1124, url:https://www.justonecookbook.com/how-to-make-shiraga-negi/ki-with-yuzu-kosho/and-stocks/inelli/wok-with-ray/

In [4]:
# Convert lists to strings
if "ingredients" in df_japanese_recipes.columns:
    df_japanese_recipes["ingredients"] = df_japanese_recipes["ingredients"].apply(
        lambda x: "; ".join(x) if isinstance(x, list) else x
    )

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect("japanese_recipes.db")

# Save DataFrame to SQL database
df_japanese_recipes.to_sql("recipes", conn, if_exists="replace", index=False)

# Close the connection
conn.close()

print("Data saved to database!")

df_japanese_recipes

Data saved to database!


Unnamed: 0,title,link,image_url,image_data,description,Total Time:,Course:,Cuisine:,Keyword:,ingredients,...,Trans Fat:,Cholesterol:,Sodium:,Potassium:,Fiber:,Sugar:,Vitamin A:,Vitamin C:,Calcium:,Iron:
0,Yuzu Cha (Citron Tea),https://www.justonecookbook.com/yuzu-cha/,https://www.justonecookbook.com/wp-content/upl...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Try my easy homemade recipe for Yuzu Cha (Citr...,1 hour,How to,Japanese,yuzu,yuzu; white rock sugar; shochu,...,,,,,,,,,,
1,Japanese Milk Bread (Shokupan),https://www.justonecookbook.com/japanese-milk-...,https://www.justonecookbook.com/wp-content/upl...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Japanese Milk Bread is possibly the best versi...,3 hours,Breakfast,Japanese,japanese bread,warm water; sugar; Diamond Crystal kosher salt...,...,1 g,58 mg,1355 mg,786 mg,10 g,40 g,1068 IU,1 mg,348 mg,3 mg
2,Chicken Chashu,https://www.justonecookbook.com/chicken-chashu/,https://www.justonecookbook.com/wp-content/upl...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Juicy and tender Chicken Chashu is a lighter v...,1 hour,Main Course,Japanese,chicken,"boneless, skin-on chicken thighs; green onions...",...,,,,,,,,,,
3,Gyudon (Japanese Beef Rice Bowl),https://www.justonecookbook.com/gyudon/,https://www.justonecookbook.com/wp-content/upl...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,With thinly sliced beef and tender onions simm...,20 minutes,Main Course,Japanese,"beef, donburi, rice bowl",onion; green onion/scallion; thinly sliced bee...,...,,69 mg,65 mg,468 mg,1 g,10 g,23 IU,4 mg,25 mg,6 mg
4,Japanese Beef Curry,https://www.justonecookbook.com/japanese-beef-...,https://www.justonecookbook.com/wp-content/upl...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"With tender chunks of beef, potatoes, carrots,...",3 hours,Main Course,Japanese,"beef, curry",onions; unsalted butter; neutral oil; russet p...,...,1 g,73 mg,938 mg,873 mg,4 g,10 g,3992 IU,13 mg,66 mg,3 mg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,Taiwanese Hot Pot with Homemade Meatballs,https://www.justonecookbook.com/taiwanese-hot-...,https://www.justonecookbook.com/wp-content/upl...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Delicious Taiwanese Hot Pot and Homemade Meatb...,30 minutes,Main Course,Taiwanese,"hot pot, nabe",ground pork; green onion/scallion; Tokyo negi ...,...,1 g,233 mg,812 mg,1735 mg,8 g,26 g,5975 IU,48 mg,348 mg,7 mg
944,Oyakodon (Chicken and Egg Rice Bowl),https://www.justonecookbook.com/oyakodon/,https://www.justonecookbook.com/wp-content/upl...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Oyakodon is a classic comfort food of Japanese...,25 minutes,Main Course,Japanese,"donburi, over rice","onion; boneless, skinless chicken thighs; sake...",...,0.1 g,414 mg,1197 mg,586 mg,1 g,11 g,442 IU,4 mg,71 mg,4 mg
945,Hamachi (Yellowtail) Teriyaki with Yuzu Kosho,https://www.justonecookbook.com/hamachi-yellow...,https://www.justonecookbook.com/wp-content/upl...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"Growing up, I had enjoyed eating Hamachi Teriy...",20 minutes,Main Course,Japanese,"hamachi, teriyaki, yellowtail, yuzu kosho","yellowtail (hamachi, buri); freshly ground bla...",...,1 g,70 mg,591 mg,725 mg,1 g,4 g,273 IU,3 mg,61 mg,1 mg
946,Teriyaki Pork Balls,https://www.justonecookbook.com/teriyaki-pork-...,https://www.justonecookbook.com/wp-content/upl...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,My Teriyaki Pork Balls recipe uses thin slices...,30 minutes,Main Course,Japanese,pork,sake; potato starch or cornstarch; thinly slic...,...,,107 mg,769 mg,874 mg,1 g,7 g,75 IU,3 mg,35 mg,1 mg


In [5]:
# Find null values in each column
null_counts = df_japanese_recipes.isnull().sum()

print("Null values in each column:")
print(null_counts)

Null values in each column:
title                     0
link                      0
image_url                 0
image_data                0
description               0
Total Time:              18
Course:                   8
Cuisine:                  8
Keyword:                  8
ingredients               0
instructions              0
Serving:                838
Calories:                22
Carbohydrates:           28
Protein:                 27
Fat:                     56
Saturated Fat:           62
Polyunsaturated Fat:    260
Monounsaturated Fat:    271
Trans Fat:              463
Cholesterol:            298
Sodium:                  18
Potassium:               28
Fiber:                   70
Sugar:                   41
Vitamin A:               80
Vitamin C:              142
Calcium:                 22
Iron:                    30
dtype: int64


In [6]:
# Check for duplicate rows
duplicates = df_japanese_recipes.duplicated()

print("Duplicate rows:")
print(df_japanese_recipes[duplicates])

Duplicate rows:
Empty DataFrame
Columns: [title, link, image_url, image_data, description, Total Time:, Course:, Cuisine:, Keyword:, ingredients, instructions, Serving:, Calories:, Carbohydrates:, Protein:, Fat:, Saturated Fat:, Polyunsaturated Fat:, Monounsaturated Fat:, Trans Fat:, Cholesterol:, Sodium:, Potassium:, Fiber:, Sugar:, Vitamin A:, Vitamin C:, Calcium:, Iron:]
Index: []

[0 rows x 29 columns]


---- 

## Chinese Recipes

https://omnivorescookbook.com/recipe-filter/

In [7]:
recipes = []

# URL for the recipe index
base_url = "https://omnivorescookbook.com/recipe-filter/page/{}/"

# Loop through all pages
for page in range(1, 37):  # Iterate through all pages
    url = base_url.format(page)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract recipe cards
    recipe_cards = soup.find_all("article", class_="post-sm post-abbr")
    for card in recipe_cards:
        # print(card)
        information = card.find("h3", class_="entry-title")
        # print(information)
        title = information.text.strip()
        link_tag = information.find("a")
        link = link_tag["href"]

        # Extract image URL
        image_tag = card.find("img")  # Assuming <img> tag exists in the card
        if image_tag and "src" in image_tag.attrs:
            image_url = image_tag["src"]
            # Download the image content
            image_response = requests.get(image_url)
            if image_response.status_code == 200:
                image_data = image_response.content  # Binary image data
            else:
                print(f"Failed to fetch image for {title}")
                image_data = None
        else:
            image_url = None
            image_data = None

        # Append to recipes list
        recipes.append(
            {
                "title": title,
                "link": link,
                "image_url": image_url,
                "image_data": image_data,
            }
        )

    print(f"Page {page} scraped successfully!")

Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!
Page 7 scraped successfully!
Page 8 scraped successfully!
Page 9 scraped successfully!
Page 10 scraped successfully!
Page 11 scraped successfully!
Page 12 scraped successfully!
Page 13 scraped successfully!
Page 14 scraped successfully!
Page 15 scraped successfully!
Page 16 scraped successfully!
Page 17 scraped successfully!
Page 18 scraped successfully!
Page 19 scraped successfully!
Page 20 scraped successfully!
Page 21 scraped successfully!
Page 22 scraped successfully!
Page 23 scraped successfully!
Page 24 scraped successfully!
Page 25 scraped successfully!
Page 26 scraped successfully!
Page 27 scraped successfully!
Page 28 scraped successfully!
Page 29 scraped successfully!
Page 30 scraped successfully!
Page 31 scraped successfully!
Page 32 scraped successfully!
Page 33 scraped successfully!
Page 34 scraped suc

In [8]:
# Create an empty DataFrame to store all recipes from japanese recipes
chinese_recipes = []

for i in range(len(recipes)):  # len(recipes)
    current_recipe = recipes[i]
    url = current_recipe["link"]

    print(f"\rProgress: {i}, url:{url}", end="")

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract recipe cards, 2 checks, one checks the hyperlink breakdcrumsb and the other checks if there is a jump to recipe button
    recipe_cards = soup.find("a", class_="recipe-jump")
    # recipe_exists = recipe_cards.text.strip()

    if recipe_cards:

        # Get Professional title
        current_recipe["title"] = soup.find(
            "h2", class_="wprm-recipe-name wprm-block-text-bold"
        ).text.strip()

        # Get recipe description
        description_block = soup.find(
            "div", class_="wprm-recipe-summary wprm-block-text-normal"
        )
        if description_block:
            current_recipe["description"] = description_block.text.strip()

        # Get times
        time_block = soup.find("div", class_="wprm-recipe-total-time-container")
        if time_block:
            time_list = time_block.text.strip().split()
            label = " ".join(time_list[0:2])
            value = " ".join(time_list[2:4])
            current_recipe[label] = value

        # Get type of food
        type_food = soup.find(
            "div",
            class_="wprm-recipe-meta-container wprm-recipe-tags-container wprm-recipe-details-container wprm-recipe-details-container-inline wprm-block-text-normal",
        )
        if type_food:
            type_cards = type_food.find_all("div", class_="wprm-recipe-tag-container")
            for card in type_cards:
                label_element = card.find("span", class_="wprm-recipe-tag-label")
                value_element = card.find("span", class_="wprm-block-text-normal")
                if label_element and value_element:
                    current_recipe[label_element.text.strip()] = (
                        value_element.text.strip()
                    )

        # Extract ingredients
        ingredients = []
        ingredient_cards = soup.find_all("li", class_="wprm-recipe-ingredient")
        for card in ingredient_cards:
            ingredient_block = card.find("span", class_="wprm-recipe-ingredient-name")
            if ingredient_block:
                name = ingredient_block.text.strip()
                ingredients.append(name)
        current_recipe["ingredients"] = ingredients

        # Get nutrition values
        nutrition_cards = soup.find_all(
            "span", class_="wprm-nutrition-label-text-nutrition-container"
        )
        for card in nutrition_cards:
            label_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-label"
            )
            value_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-value"
            )
            unit_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-unit"
            )
            if label_element and value_element and unit_element:
                current_recipe[label_element.text.strip()] = " ".join(
                    [value_element.text.strip(), unit_element.text.strip()]
                )

        # Get instructions
        instructions_block = soup.find(
            "div", class_="wprm-recipe-instructions-container"
        ).text.strip()
        current_recipe["instructions"] = instructions_block

        # Add to list
        chinese_recipes.append(current_recipe)
        # print(current_recipe)

df_chinese_recipes = pd.DataFrame(chinese_recipes)

Progress: 928, url:https://omnivorescookbook.com/chinese-eggplant-with-garlic-sauceiata-sauce/-and-rice/ke-mushroom

In [9]:
# Convert lists to strings
if "ingredients" in df_chinese_recipes.columns:
    df_chinese_recipes["ingredients"] = df_chinese_recipes["ingredients"].apply(
        lambda x: "; ".join(x) if isinstance(x, list) else x
    )

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect("chinese_recipes.db")

# Save DataFrame to SQL database
df_chinese_recipes.to_sql("recipes", conn, if_exists="replace", index=False)

# Close the connection
conn.close()

print("Data saved to database!")

df_chinese_recipes

Data saved to database!


Unnamed: 0,title,link,image_url,image_data,description,Total Time:,Course:,Cuisine:,Keyword:,ingredients,...,Potassium:,Fiber:,Sugar:,Calcium:,Iron:,instructions,Cholesterol:,Vitamin C:,Vitamin A:,Trans Fat:
0,Easy Oyster Mushroom Stir Fry,https://omnivorescookbook.com/easy-oyster-mush...,https://omnivorescookbook.com/wp-content/uploa...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,A super quick and easy oyster mushroom stir fr...,15 minutes,Side Dish,Chinese,homestyle,oyster mushrooms; peanut oil; garlic; sugar; s...,...,609 mg,1.9 g,3.5 g,4 mg,5 mg,InstructionsHeat the oil in a large skillet ov...,,,,
1,Honey Glazed Salmon,https://omnivorescookbook.com/honey-soy-sauce-...,https://omnivorescookbook.com/wp-content/uploa...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,A simple yet rich tasting honey glazed salmon ...,55 minutes,Main,Chinese Fusion,weekday dinner,salmon filets; salt; sugar; honey; Shaoxing wi...,...,743 mg,0.6 g,37.3 g,71 mg,2 mg,InstructionsTo cure the salmon (Optional): Mix...,78 mg,,,
2,Shrimp Toast,https://omnivorescookbook.com/shrimp-toast/,https://omnivorescookbook.com/wp-content/uploa...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,Make these crispy savory shrimp toasts as an a...,40 minutes,Appetizer,Chinese,restaurant-style,shrimp; egg white; ginger; garlic; light soy s...,...,138 mg,1 g,1.7 g,101 mg,2 mg,InstructionsTo prepare the shrimp spread(Optio...,90 mg,,,
3,Garlic Fried Rice,https://omnivorescookbook.com/garlic-fried-rice/,https://omnivorescookbook.com/wp-content/uploa...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,A Chinese style garlic fried rice featuring cr...,25 minutes,Side,Chinese,takeout,of leftover cooked jasmine rice; soy sauce; oy...,...,143 mg,1.7 g,1.4 g,50 mg,2 mg,InstructionsFluff day old rice in a bowl with ...,186 mg,,,
4,Chicken with Garlic Sauce,https://omnivorescookbook.com/chicken-with-gar...,https://omnivorescookbook.com/wp-content/uploa...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,Chicken with garlic sauce is a super easy take...,30 minutes,Main,Chinese,takeout,chicken breasts or thighs; Shaoxing wine; salt...,...,306 mg,0.9 g,7.3 g,30 mg,2 mg,"InstructionsCombine chicken, Shaoxing wine, sa...",62 mg,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,Barter-Worthy Spam Musubi,https://omnivorescookbook.com/spam-musubi/,https://omnivorescookbook.com/wp-content/uploa...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,"The spam is grilled until crispy, perfectly ca...",45 minutes,"Appetizer, Main",hawaii,restaurant-style,vegetable oil; low-sodium Spam (12 oz. / 340 g...,...,271 mg,0.5 g,3.1 g,10 mg,2 mg,InstructionsStart making steamed rice (or sush...,30 mg,,,
901,Authentic Mapo Tofu (麻婆豆腐),https://omnivorescookbook.com/authentic-mapo-t...,https://omnivorescookbook.com/wp-content/uploa...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,An easy mapo tofu recipe that creates the auth...,25 minutes,Main,Chinese,"homestyle, restaurant-style",ground pork; Shaoxing wine; light soy sauce; m...,...,173 mg,1 g,2.6 g,206 mg,2 mg,"InstructionsCombine ground meat, cooking wine,...",19 mg,,,
902,Chinese Scallion Pancakes (葱油饼),https://omnivorescookbook.com/chinese-scallion...,https://omnivorescookbook.com/wp-content/uploa...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Super crispy and flaky on the outside and slig...,1 hour,Appetizer,Chinese,restaurant-style,all-purpose flour; salt; boiling water; cool w...,...,58 mg,1.1 g,0.3 g,11 mg,2 mg,InstructionsMake the doughOPTION 1 – USING YOU...,,,,
903,Chicken and Broccoli (Chinese Takeout Style),https://omnivorescookbook.com/chicken-and-broc...,https://omnivorescookbook.com/wp-content/uploa...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,An easy chicken and broccoli stir fry recipe t...,25 minutes,Main,Chinese,takeout,boneless skinless chicken breast; Shaoxing win...,...,512 mg,0.8 g,3.5 g,25 mg,1 mg,InstructionsSlice the chicken against the grai...,73 mg,,,


----

## Scraping Thai Recipes

https://hungryinthailand.com/

In [10]:
# Gather all urls from thai recipes website

categories = [
    "thai-appetizers/",
    "thai-salads/",
    "thai-side-dish-recipes/",
    "thai-dinner/",
    "thai-desserts/",
    "thai-soups/",
]
pages = [2, 1, 1, 4, 1, 1]

all_urls = []

for i in range(len(categories)):
    page_count = pages[i]
    for j in range(page_count):
        url = f"https://hungryinthailand.com/category/{categories[i]}/page/{j+1}/"
        all_urls.append(url)

In [11]:
recipes = []

# Loop through all pages
for url in all_urls:  # Iterate through all pages
    print(url)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract recipe cards
    recipe_cards = soup.find_all("article", class_="status-publish")
    for card in recipe_cards:
        information = card.find("h2", class_="entry-title")
        # print(information)
        title = information.text.strip()
        link_tag = information.find("a")
        link = link_tag["href"]

        # Extract image URL
        image_tag = card.find(
            "div", class_="post-thumbnail-inner"
        )  # Assuming <img> tag exists in the card
        image_tag = image_tag.find("img")
        if image_tag and "src" in image_tag.attrs:
            image_url = image_tag["data-lzl-src"]
            # Download the image content
            image_response = requests.get(image_url)
            if image_response.status_code == 200:
                image_data = image_response.content  # Binary image data
            else:
                print(f"Failed to fetch image for {title}")
                image_data = None
        else:
            image_url = None
            image_data = None

        # Append to recipes list
        recipes.append(
            {
                "title": title,
                "link": link,
                "image_url": image_url,
                "image_data": image_data,
            }
        )

print(f"Pages scraped successfully!")

https://hungryinthailand.com/category/thai-appetizers//page/1/
https://hungryinthailand.com/category/thai-appetizers//page/2/
https://hungryinthailand.com/category/thai-salads//page/1/
https://hungryinthailand.com/category/thai-side-dish-recipes//page/1/
https://hungryinthailand.com/category/thai-dinner//page/1/
https://hungryinthailand.com/category/thai-dinner//page/2/
https://hungryinthailand.com/category/thai-dinner//page/3/
https://hungryinthailand.com/category/thai-dinner//page/4/
https://hungryinthailand.com/category/thai-desserts//page/1/
https://hungryinthailand.com/category/thai-soups//page/1/
Pages scraped successfully!


In [12]:
# Create an empty DataFrame to store all recipes from japanese recipes
thai_recipes = []

for i in range(len(recipes)):  # len(recipes)
    current_recipe = recipes[i]
    url = current_recipe["link"]

    print(f"\rProgress: {i}, url:{url}", end="")

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Checks if recipe exists on page
    recipe_cards = soup.find("a", class_="wprm-recipe-jump")

    if recipe_cards:

        # Get Professional title
        current_recipe["title"] = soup.find(
            "h2", class_="wprm-recipe-name wprm-block-text-bold"
        ).text.strip()

        # Get recipe description
        description_block = soup.find(
            "div", class_="wprm-recipe-summary wprm-block-text-normal"
        )
        if description_block:
            current_recipe["description"] = description_block.text.strip()

        # Get times
        time_block = soup.find("div", class_="wprm-recipe-total-time-container")
        if time_block:
            time_list = time_block.text.strip().split()
            label = " ".join(time_list[0:2])
            value = " ".join(time_list[2:4])
            current_recipe[label] = value

        # Get type of food
        type_food = soup.find("div", class_="wprm-recipe-custom-container")
        if type_food:
            type_cards = type_food.find_all("div", class_="wprm-recipe-tag-container")
            for card in type_cards:
                label_element = card.find("span", class_="wprm-recipe-tag-label")
                value_element = card.find("span", class_="wprm-block-text-normal")
                if label_element and value_element:
                    current_recipe[label_element.text.strip()] = (
                        value_element.text.strip()
                    )

        # Extract ingredients
        ingredients = []
        ingredient_cards = soup.find_all("li", class_="wprm-recipe-ingredient")
        for card in ingredient_cards:
            ingredient_block = card.find("span", class_="wprm-recipe-ingredient-name")
            if ingredient_block:
                name = ingredient_block.text.strip()
                ingredients.append(name)
        current_recipe["ingredients"] = ingredients

        # Get nutrition values
        nutrition_cards = soup.find_all(
            "span", class_="wprm-nutrition-label-text-nutrition-container"
        )
        for card in nutrition_cards:
            label_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-label"
            )
            value_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-value"
            )
            unit_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-unit"
            )
            if label_element and value_element and unit_element:
                current_recipe[label_element.text.strip()] = " ".join(
                    [value_element.text.strip(), unit_element.text.strip()]
                )

        # Get instructions
        instructions_block = soup.find(
            "div", class_="wprm-recipe-instructions-container"
        ).text.strip()
        current_recipe["instructions"] = instructions_block

        # Add to list
        thai_recipes.append(current_recipe)
        # print(current_recipe)

df_thai_recipes = pd.DataFrame(thai_recipes)

Progress: 308, url:https://hungryinthailand.com/tom-yum-pla/-glass-noodle-soup/od-soup//sauce/

In [13]:
# Convert lists to strings
if "ingredients" in df_thai_recipes.columns:
    df_thai_recipes["ingredients"] = df_thai_recipes["ingredients"].apply(
        lambda x: "; ".join(x) if isinstance(x, list) else x
    )

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect("thai_recipes.db")

# Save DataFrame to SQL database
df_thai_recipes.to_sql("recipes", conn, if_exists="replace", index=False)

# Close the connection
conn.close()

print("Data saved to database!")

df_thai_recipes

Data saved to database!


Unnamed: 0,title,link,image_url,image_data,description,Total Time,Cuisine,Course,ingredients,Calories:,...,Fiber:,Sugar:,Vitamin A:,Vitamin C:,Calcium:,Iron:,instructions,Trans Fat:,Cholesterol:,Serving:
0,Authentic Thai Beef Satay Recipe With Peanut S...,https://hungryinthailand.com/thai-beef-satay-r...,https://hungryinthailand.com/wp-content/upload...,"b'RIFFd""\x02\x00WEBPVP8X\n\x00\x00\x00 \x00\x0...",Enjoy my family’s authentic Thai beef satay wi...,4 hours,Thai,"Appetizer, Main Course, Snack",beef; of garlic; lemongrass; coriander seeds; ...,83 kcal,...,0.2 g,4 g,3 IU,1 mg,15 mg,2 mg,InstructionsMarinate the beefIn a pan over low...,,,
1,Easy Thai Fish Sauce Wings Recipe,https://hungryinthailand.com/fish-sauce-wings/,https://hungryinthailand.com/wp-content/upload...,b'RIFF\xca\xbb\x01\x00WEBPVP8X\n\x00\x00\x00 \...,Enjoy perfectly crispy chicken every time with...,35 minutes,"Asian, Thai","Appetizer, Side Dish, Snack",chicken wings; fish sauce; rosdee seasoning po...,670 kcal,...,,0.5 g,319 IU,2 mg,32 mg,2 mg,"InstructionsCombine fish sauce, white pepper, ...",0.5 g,166 mg,
2,Sweet Thai Chili Wings Recipe,https://hungryinthailand.com/sweet-thai-chili-...,https://hungryinthailand.com/wp-content/upload...,b'RIFF\xc6\x91\x02\x00WEBPVP8X\n\x00\x00\x00 \...,"Sweet Thai chili wings recipe with a sticky, s...",40 minutes,Thai,"Appetizer, Snack",tempura flour; rosdee seasoning powder; ice-co...,239 kcal,...,0.05 g,8 g,135 IU,1 mg,18 mg,1 mg,"InstructionsIn a food processor, blend the gar...",0.2 g,71 mg,
3,Shrimp Satay Recipe With Thai Peanut Sauce,https://hungryinthailand.com/shrimp-satay-with...,https://hungryinthailand.com/wp-content/upload...,b'RIFFL!\x01\x00WEBPVP8X\n\x00\x00\x00 \x00\x0...,Enjoy this easy shrimp satay recipe with Thai ...,50 minutes,Thai,"Appetizer, Snack",shrimp; coconut milk; yellow curry powder; Ros...,392 kcal,...,2 g,8 g,2 IU,2 mg,94 mg,3 mg,"InstructionsClean the shrimp, remove veins, an...",0.04 g,160 mg,
4,Pork Gyoza Recipe (Pork Dumplings),https://hungryinthailand.com/pork-gyoza-recipe/,https://hungryinthailand.com/wp-content/upload...,b'RIFF\xf2Z\x01\x00WEBPVP8X\n\x00\x00\x00 \x00...,"Make this pork gyoza recipe for easy, homemade...",1 hour,Thai,"Appetizer, Snack",ground pork; white pepper; sesame oil; shoyu s...,126 kcal,...,0.1 g,0.01 g,116 IU,2 mg,20 mg,1 mg,InstructionsPREPARE THE FILLINGKnead ground po...,,31 mg,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,Thai Vegetable Soup (Tom Jued),https://hungryinthailand.com/thai-vegetable-so...,https://hungryinthailand.com/wp-content/upload...,b'RIFF\xa4\x83\x01\x00WEBPVP8X\n\x00\x00\x00 \...,Tom jued is a healthy Thai vegetable soup reci...,40 minutes,Thai,"Main Course, Soup",garlic; coriander root; black peppercorns; bou...,63 kcal,...,2 g,2 g,5695 IU,4 mg,43 mg,1 mg,InstructionsSoak glass noodles (and dried shii...,,0.1 mg,
305,Tom Kha Gai Recipe (Thai Coconut Chicken Soup),https://hungryinthailand.com/tom-kha-gai/,https://hungryinthailand.com/wp-content/upload...,b'RIFF\xfa\x9f\x01\x00WEBPVP8X\n\x00\x00\x00 \...,Try this tom kha gai recipe—a healthy and quic...,20 minutes,Thai,"Main Course, Soup",water; coconut milk; lemongrass; galangal; kaf...,451 kcal,...,3 g,6 g,769 IU,10 mg,82 mg,6 mg,"InstructionsIn a wok or pot, heat water and co...",0.1 g,64 mg,
306,Tom Yum Kung Recipe (Spicy Thai Shrimp Soup),https://hungryinthailand.com/tom-yum-kung/,https://hungryinthailand.com/wp-content/upload...,b'RIFF$\x1f\x02\x00WEBPVP8X\n\x00\x00\x00 \x00...,This tom yum kung recipe is an authentic Thai ...,25 minutes,Thai,"Main Course, Soup",shrimp; Asian mushrooms; lemongrass; shallots;...,198 kcal,...,1 g,14 g,164 IU,11 mg,199 mg,2 mg,"InstructionsPeel and devein the shrimp, set th...",,95 mg,
307,Thai Chicken Glass Noodle Soup Recipe,https://hungryinthailand.com/thai-chicken-glas...,https://hungryinthailand.com/wp-content/upload...,b'RIFF\xd8+\x02\x00WEBPVP8X\n\x00\x00\x00 \x00...,Enjoy a quick and easy Thai chicken glass nood...,30 minutes,Thai,Soup,glass noodles; dried chilies; chilies; shallot...,644 kcal,...,2 g,5 g,1349 IU,71 mg,72 mg,3 mg,InstructionsSoak the glass noodles in water as...,0.2 g,194 mg,


---

## Korean Recipes Scraping

https://kimchimari.com/recipe-index/

In [14]:
# Gather all urls from thai recipes website

categories = [
    "soups-guk-and-stews-jjigae/",
    "appetizer-2/",
    "salads/",
    "main-dishes/",
    "side-dishes/",
    "desserts/",
]
pages = [3, 2, 2, 5, 5, 3]

all_urls = []

for i in range(len(categories)):
    page_count = pages[i]
    for j in range(page_count):
        url = f"https://kimchimari.com/category/{categories[i]}/page/{j+1}/"
        all_urls.append(url)

In [15]:
recipes = []

# Loop through all pages
for url in all_urls:  # Iterate through all pages
    print(url)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract recipe cards
    recipe_cards = soup.find_all("article", class_="status-publish")
    for card in recipe_cards:
        information = card.find("h2", class_="entry-title")
        title = information.text.strip()
        link_tag = information.find("a")
        link = link_tag["href"]

        # Extract image URL
        image_tag = card.find("img")  # Assuming <img> tag exists in the card
        if image_tag and "data-lazy-src" in image_tag.attrs:
            image_url = image_tag["data-lazy-src"]
            # Download the image content
            image_response = requests.get(image_url)
            if image_response.status_code == 200:
                image_data = image_response.content  # Binary image data
            else:
                print(f"Failed to fetch image for {title}")
                image_data = None
        elif image_tag and "src" in image_tag.attrs:
            image_url = image_tag["src"]
            # Download the image content
            image_response = requests.get(image_url)
            if image_response.status_code == 200:
                image_data = image_response.content  # Binary image data
            else:
                print(f"Failed to fetch image for {title}")
                image_data = None
        else:
            image_url = None
            image_data = None

        # Append to recipes list
        recipes.append(
            {
                "title": title,
                "link": link,
                "image_url": image_url,
                "image_data": image_data,
            }
        )

print(f"Pages scraped successfully!")

https://kimchimari.com/category/soups-guk-and-stews-jjigae//page/1/
https://kimchimari.com/category/soups-guk-and-stews-jjigae//page/2/
https://kimchimari.com/category/soups-guk-and-stews-jjigae//page/3/
https://kimchimari.com/category/appetizer-2//page/1/
https://kimchimari.com/category/appetizer-2//page/2/
https://kimchimari.com/category/salads//page/1/
https://kimchimari.com/category/salads//page/2/
https://kimchimari.com/category/main-dishes//page/1/
https://kimchimari.com/category/main-dishes//page/2/
https://kimchimari.com/category/main-dishes//page/3/
https://kimchimari.com/category/main-dishes//page/4/
https://kimchimari.com/category/main-dishes//page/5/
https://kimchimari.com/category/side-dishes//page/1/
https://kimchimari.com/category/side-dishes//page/2/
https://kimchimari.com/category/side-dishes//page/3/
https://kimchimari.com/category/side-dishes//page/4/
https://kimchimari.com/category/side-dishes//page/5/
https://kimchimari.com/category/desserts//page/1/
https://kimchi

In [16]:
# Create an empty DataFrame to store all recipes from japanese recipes
korean_recipes = []

for i in range(len(recipes)):  # len(recipes)
    current_recipe = recipes[i]
    url = current_recipe["link"]

    print(f"\rProgress: {i}, url:{url}", end="")

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Checks if recipe exists on page
    recipe_cards = soup.find("a", class_="wprm-recipe-jump")

    if recipe_cards:

        # Get Professional title
        current_recipe["title"] = soup.find(
            "h2", class_="wprm-recipe-name"
        ).text.strip()

        # Get recipe description
        description_block = soup.find(
            "div", class_="wprm-recipe-summary wprm-block-text-normal"
        )
        if description_block:
            current_recipe["description"] = description_block.text.strip()

        # Get times
        time_block = soup.find("div", class_="wprm-recipe-total-time-container")
        if time_block:
            time_list = time_block.text.strip().split()
            label = " ".join(time_list[0:2])
            value = " ".join(time_list[2:4])
            current_recipe[label] = value

        # Get type of food
        type_food = soup.find("div", class_="wprm-recipe-tags-container")
        if type_food:
            type_cards = type_food.find_all("div", class_="wprm-recipe-tag-container")
            for card in type_cards:
                string = card.text.strip()
                str_list = string.split(":")
                if str_list:
                    current_recipe[str_list[0]] = str_list[1]

        # Extract ingredients
        ingredients = []
        ingredient_cards = soup.find_all("li", class_="wprm-recipe-ingredient")
        for card in ingredient_cards:
            ingredient_block = card.find("span", class_="wprm-recipe-ingredient-name")
            if ingredient_block:
                name = ingredient_block.text.strip()
                ingredients.append(name)
        current_recipe["ingredients"] = ingredients

        # Get nutrition values
        nutrition_cards = soup.find_all(
            "span", class_="wprm-nutrition-label-text-nutrition-container"
        )
        for card in nutrition_cards:
            label_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-label"
            )
            value_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-value"
            )
            unit_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-unit"
            )
            if label_element and value_element and unit_element:
                current_recipe[label_element.text.strip()] = " ".join(
                    [value_element.text.strip(), unit_element.text.strip()]
                )

        # Get instructions
        instructions_block = soup.find(
            "ul", class_="wprm-recipe-instructions"
        ).text.strip()
        current_recipe["instructions"] = instructions_block

        # Add to list
        korean_recipes.append(current_recipe)
        # print(current_recipe)

df_korean_recipes = pd.DataFrame(korean_recipes)

Progress: 307, url:https://kimchimari.com/korean-sweets-yakwa-yaksik/ssert/d-artisan/ns-%eb%b6%95%ec%96%b4%eb%b9%b5-bungeoppang/ewoo-bokeum/

In [17]:
# Convert lists to strings
if "ingredients" in df_korean_recipes.columns:
    df_korean_recipes["ingredients"] = df_korean_recipes["ingredients"].apply(
        lambda x: "; ".join(x) if isinstance(x, list) else x
    )

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect("korean_recipes.db")

# Save DataFrame to SQL database
df_korean_recipes.to_sql("recipes", conn, if_exists="replace", index=False)

# Close the connection
conn.close()

print("Data saved to database!")

df_korean_recipes

Data saved to database!


Unnamed: 0,title,link,image_url,image_data,description,Total Time:,Course,Cuisine,KoreanCategory,ingredients,...,Fiber:,Sugar:,Vitamin A:,Vitamin C:,Calcium:,Iron:,instructions,Keyword,Trans Fat:,Serving:
0,Seolleongtang (Korean Beef Bone Broth),https://kimchimari.com/seolleongtang-korean-be...,https://kimchimari.com/wp-content/uploads/2024...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Seolleontang is a bone broth made from boiling...,10 hours,Soup,Korean,Tang(탕),beef bones; water; green onions; beef brisket;...,...,0.2 g,0.3 g,36 IU,1 mg,76 mg,1 mg,Soak beef bones in cold water for 30 min or so...,,,
1,Sujebi (Hand-Pulled Noodle Soup),https://kimchimari.com/sujebi-hand-pulled-nood...,https://kimchimari.com/wp-content/uploads/2023...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Sujebi is a fun hand-pulled or hand-torn noodl...,40 minutes,"Lunch, noodles",Korean,Myeon(면),all purpose flour; sea salt; water; water; dri...,...,4 g,1 g,392 IU,9 mg,63 mg,6 mg,"Mix flour, salt and water. Form a ball and the...","anchovy broth, flour noodles, noodle soup",,
2,Instant Pot Gamjatang,https://kimchimari.com/instant-pot-gamjatang-k...,https://kimchimari.com/wp-content/uploads/2018...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Instant Pot Gamjatang recipe was a perfect rec...,50 minutes,"Main Course, Pork",Korean,Tang(탕),pork neck bones; cooking sake; potatoes; fresh...,...,2 g,7 g,640 IU,4.5 mg,50 mg,1.8 mg,Soak pork neck bones in cold water for 30 min....,"fall stew, instant pot, one pot meal, perilla ...",,
3,Instant Pot Tteok Guk (Rice Cake Soup),https://kimchimari.com/instant-pot-tteokguk-ri...,https://kimchimari.com/wp-content/uploads/2018...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Anchovy Broth Tteokguk is a very elegant versi...,28 minutes,Soup,Korean,Guk (국),tteokguk tteok; beef stew meat; water; sesame ...,...,,1 g,180 IU,5.5 mg,46 mg,1.6 mg,"If using frozen rice cakes, soak in water for ...","beef soup, ddukguk, lunar new year, new year’s...",,
4,Tteok guk (떡국) – Korean rice cake soup,https://kimchimari.com/rice-cake-soup-tteokguk...,https://kimchimari.com/wp-content/uploads/2011...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"Every New Year’s day, Koreans make Dduk Guk/T...",45 minutes,"rice, Soup",Korean,Guk (국),rice cake slices/ovalettes for soup; anchovy s...,...,,6 g,2725 IU,4.3 mg,42 mg,1.8 mg,Prepare the anchovy stock – see My Tips page (...,"chinese new year, gluten free, korean new year...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,Gluten Free Sweet Red Bean Rice Cake (Tteok Pp...,https://kimchimari.com/sweet-red-bean-rice-cake/,https://kimchimari.com/wp-content/uploads/2014...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,This deliciously addictive Sweet Red Bean Rice...,1 hour,"Dessert, Snack","Korean, Koreanfusion",Tteok (떡),sweet rice flour; sugar; milk; salted butter; ...,...,1 g,3 g,410 IU,0.2 mg,100 mg,0.7 mg,Preheat oven to 350°F or 180°C.In a mixing bow...,"anko, gluten free, sweets, tteok ppang",,
258,Sweet Rice Punch (Sikhye 식혜),https://kimchimari.com/sweet-rice-punch/,https://kimchimari.com/wp-content/uploads/2013...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,Sikhye is a classic Korean dessert drink that ...,8 hours,"Dessert, Drinks",Korean,Eumryo (음료),yeotkireum; water; short grain rice; sugar; pi...,...,,14 g,,,8 mg,0.8 mg,Soak the crushed milled malt barley (yeotkireu...,"classic, cold, icy, malted barley",,
259,Crispy Zucchini Pancakes (Hobak buchimgae),https://kimchimari.com/hobak-boochimgae/,https://kimchimari.com/wp-content/uploads/2012...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,Easy Korean Zucchini fritters that is great as...,20 minutes,"Appetizer, Side Dish",Korean,Jeon (전),large Korean Zucchini; onion; Green Chili Pepp...,...,1 g,1 g,125 IU,11.1 mg,11 mg,0.7 mg,Cut Zucchini into 1/4 in slices and then into ...,"crispy, flour batter, squash, summer",,
260,Korean Sweet Rice Dessert (Yaksik),https://kimchimari.com/yaksik-korean-sweet-ric...,https://kimchimari.com/wp-content/uploads/2011...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,A dessert filled with health-promoting ingredi...,2 hours,Dessert,Korean,Hankwa (한과),sweet rice; water; sugar; honey; soy sauce; se...,...,2 g,1 g,,0.4 mg,10 mg,0.6 mg,You need: a pressure cooker (pressure rice coo...,"gluten free, healthy, korean sweet snack, swee...",,


----

## Scraping Indian Recipes

https://ministryofcurry.com/recipe-search/

In [18]:
recipes = []

# URL for the recipe index
base_url = "https://ministryofcurry.com/recipe-search/?_paged={}"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0"
}


# Loop through all pages
for page in range(1, 21):  # Iterate through all pages
    url = base_url.format(page)
    # print(url)
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract recipe cards
    recipe_cards = soup.find_all("div", class_="fwpl-result")
    for card in recipe_cards:
        information = card.find("div", class_="fwpl-item el-cjl7ci")
        title = information.text.strip()
        link_tag = information.find("a")
        link = link_tag["href"]

        # Extract image URL
        image_tag = card.find("img")  # Assuming <img> tag exists in the card
        if image_tag and "data-lazy-src" in image_tag.attrs:
            image_url = image_tag["data-lazy-src"]
            # Download the image content
            image_response = requests.get(image_url, headers=headers)
            if image_response.status_code == 200:
                image_data = image_response.content  # Binary image data
            else:
                print(f"Failed to fetch image for {title}")
                image_data = None
        elif image_tag and "src" in image_tag.attrs:
            image_url = image_tag["src"]
            # Download the image content
            image_response = requests.get(image_url, headers=headers)
            if image_response.status_code == 200:
                image_data = image_response.content  # Binary image data
            else:
                print(f"Failed to fetch image for {title}")
                image_data = None
        else:
            image_url = None
            image_data = None

        # Append to recipes list
        recipes.append(
            {
                "title": title,
                "link": link,
                "image_url": image_url,
                "image_data": image_data,
            }
        )

    print(f"Page {page} scraped successfully!")

print(f"Pages scraped successfully!")

Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!
Page 7 scraped successfully!
Page 8 scraped successfully!
Page 9 scraped successfully!
Page 10 scraped successfully!
Page 11 scraped successfully!
Page 12 scraped successfully!
Page 13 scraped successfully!
Page 14 scraped successfully!
Page 15 scraped successfully!
Page 16 scraped successfully!
Page 17 scraped successfully!
Page 18 scraped successfully!
Page 19 scraped successfully!
Page 20 scraped successfully!
Pages scraped successfully!


In [19]:
# Create an empty DataFrame to store all recipes from japanese recipes
indian_recipes = []

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0"
}

for i in range(len(recipes)):  # len(recipes)
    current_recipe = recipes[i]
    url = current_recipe["link"]

    print(f"\rProgress: {i}, url:{url}", end="")

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    # Parse HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Checks if recipe exists on page
    recipe_cards = soup.find("a", class_="wprm-recipe-jump")

    if recipe_cards:

        # Get Professional title
        current_recipe["title"] = soup.find(
            "h2", class_="wprm-recipe-name"
        ).text.strip()

        # Get recipe description
        description_block = soup.find(
            "div", class_="wprm-recipe-summary wprm-block-text-normal"
        )
        if description_block:
            current_recipe["description"] = description_block.text.strip()

        # Get times
        time_block = soup.find("div", class_="wprm-recipe-total-time-container")
        if time_block:
            time_card = time_block.find(
                "span", class_="wprm-recipe-total_time"
            ).text.strip()
            current_recipe["Total Time"] = time_card
            # time_list = time_block.text.strip()
            # print(time_list)
            # label = ' '.join(time_list[0:2])
            # value = ' '.join(time_list[2:4])
            # current_recipe[label] = value

        # Get type of food
        type_food = soup.find("div", class_="wprm-recipe-tags-container")
        if type_food:
            type_cards = type_food.find_all("div", class_="wprm-recipe-tag-container")
            for card in type_cards:
                string = card.text.strip()
                str_list = string.split(":")
                if str_list:
                    current_recipe[str_list[0]] = str_list[1]

        # Extract ingredients
        ingredients = []
        ingredient_cards = soup.find_all("li", class_="wprm-recipe-ingredient")
        for card in ingredient_cards:
            ingredient_block = card.find("span", class_="wprm-recipe-ingredient-name")
            if ingredient_block:
                name = ingredient_block.text.strip()
                ingredients.append(name)
        current_recipe["ingredients"] = ingredients

        # Get nutrition values
        nutrition_cards = soup.find_all(
            "span", class_="wprm-nutrition-label-text-nutrition-container"
        )
        for card in nutrition_cards:
            label_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-label"
            )
            value_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-value"
            )
            unit_element = card.find(
                "span", class_="wprm-nutrition-label-text-nutrition-unit"
            )
            if label_element and value_element and unit_element:
                current_recipe[label_element.text.strip()] = " ".join(
                    [value_element.text.strip(), unit_element.text.strip()]
                )

        # Get instructions
        instructions_block = soup.find(
            "ul", class_="wprm-recipe-instructions"
        ).text.strip()
        current_recipe["instructions"] = instructions_block

        # Add to list
        indian_recipes.append(current_recipe)
        # print(current_recipe)

df_indian_recipes = pd.DataFrame(indian_recipes)

Progress: 465, url:https://ministryofcurry.com/carrot-sheera/ots/ant/rrot-soup/ds/e/d-in-lush-yogurt/flavored-syrup/

In [20]:
# Convert lists to strings
if "ingredients" in df_indian_recipes.columns:
    df_indian_recipes["ingredients"] = df_indian_recipes["ingredients"].apply(
        lambda x: "; ".join(x) if isinstance(x, list) else x
    )

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect("indian_recipes.db")

# Save DataFrame to SQL database
df_indian_recipes.to_sql("recipes", conn, if_exists="replace", index=False)

# Close the connection
conn.close()

print("Data saved to database!")

df_indian_recipes

Data saved to database!


Unnamed: 0,title,link,image_url,image_data,description,Total Time,Course,Cuisine,Diet,ingredients,...,Potassium:,Fiber:,Sugar:,Vitamin A:,Vitamin C:,Calcium:,Iron:,instructions,Cholesterol:,Serving:
0,Kala Chana Curry {Gujarati Rasawala Kala Chana...,https://ministryofcurry.com/kala-chana-curry/,https://ministryofcurry.com/wp-content/uploads...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Hearty Kala Chana Curry made with black chickp...,9 hours,dinner,Indian,Vegetarian,dry black chana; water; oil; mustard seeds; hi...,...,411 mg,2 g,2 g,358 IU,2 mg,100 mg,2 mg,Rinse the kala chana thoroughly a couple of ti...,,
1,"Chilli Tofu {Bold Flavors, Light Twist}",https://ministryofcurry.com/chilli-tofu/,https://ministryofcurry.com/wp-content/uploads...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,A light spin on Chilli Paneer by using tofu an...,30 minutes,"dinner, Lunch",Indo-Chinese,"Vegan, Vegetarian",extra firm tofu; Kashmiri red chili powder; ko...,...,475 mg,4 g,8 g,1641 IU,87 mg,76 mg,2 mg,Wrap the tofu in paper towels and place a weig...,,
2,Quick & Easy Khichdi: Perfect for Cozy Evening...,https://ministryofcurry.com/khichdi/,https://ministryofcurry.com/wp-content/uploads...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,This simple khichdi recipe makes for a nourish...,30 minutes,dinner,Indian,Gluten Free,short-grain rice; moong dal; water; kosher sal...,...,42 mg,4 g,1 g,50 IU,0.04 mg,20 mg,3 mg,Rinse the rice and moong dal thoroughly until ...,,
3,Pomegranate Mojito Recipe,https://ministryofcurry.com/pomegranate-mojito/,https://ministryofcurry.com/wp-content/uploads...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"Twist to the classic mojito, this Pomegranate ...",,Beverage,American,,pomegrante juice; club soda; ice cubes; fresh ...,...,438 mg,3 g,20 g,495 IU,17 mg,65 mg,1 mg,"Add 2 to 3 lime slices, mint leaves, pomegrana...",,
4,Easy Malai Laddo,https://ministryofcurry.com/malai-laddu/,https://ministryofcurry.com/wp-content/uploads...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"Easy 5-ingredient Malai Laddu for a quick, del...",35 minutes,Dessert,Indian,,ricotta cheese; heavy cream; powdered sugar; c...,...,50 mg,0.02 g,6 g,244 IU,0.1 mg,95 mg,0.2 mg,"In a non-stick pan, add ricotta cheese and coo...",20 mg,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,Stuffed Baby Eggplant Curry,https://ministryofcurry.com/stuffed-baby-eggpl...,https://ministryofcurry.com/wp-content/uploads...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Baby eggplants stuffed with finely diced onion...,30 minutes,Main Course,Indian,,small baby eggplants; yellow onion; ginger; ga...,...,49 mg,1 g,2 g,150 IU,2.3 mg,6 mg,0.2 mg,Wash and trim extra stem from the baby eggplan...,,
426,Palak Paneer,https://ministryofcurry.com/palak-paneer/,https://ministryofcurry.com/wp-content/uploads...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,A delicious north Indian dish with pureed spin...,25 minutes,Entree,Indian,,Spinach; cubed paneer; green chilies; yellow o...,...,348 mg,2 g,1 g,5315 IU,19.2 mg,352 mg,1.7 mg,Boil 3 cups of water in a medium sized pot. Ad...,41 mg,3 g
427,Vermicelli Sheera,https://ministryofcurry.com/vermicelli-sheera/,https://ministryofcurry.com/wp-content/uploads...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Dessert made with thin wheat vermcelli noodles...,15 minutes,Dessert,Indian,,vermicelli; sugar; ghee; almonds; pistachios; ...,...,69 mg,1 g,37 g,,,13 mg,0.6 mg,Add ghee and vermicelli in a medium pan. Roast...,9 mg,
428,How to Cook Beets,https://ministryofcurry.com/roasted-beetroots/,https://ministryofcurry.com/wp-content/uploads...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,Cooking Beets 2 ways - roast in the oven or st...,1 hour,Misc.,American,,medium Beets; Olive oil,...,266 mg,2 g,5 g,25 IU,4 mg,13 mg,0.6 mg,Preheat oven to 350 degrees. Wash them thoroug...,,
