# Get cuisine ingredients

Description: This file retrieves information about ingredients from each cuisine and exports a file with all cuisine ingredient data.

In [104]:
# Set up
import urllib.request
import os
import string
import statistics
import csv

from pathlib import Path
from bs4 import BeautifulSoup as bs
from nltk.tokenize import word_tokenize
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from collections import defaultdict

url = "https://www.allrecipes.com/cuisine-a-z-6740455"
fhand = urllib.request.urlopen(url)
html = fhand.read()
soup = bs(html, "html.parser")

current_directory = Path.cwd()

In [77]:
# Create all food items data

# Food database
# food1.csv - https://fdc.nal.usda.gov/
# food2.csv - https://foodb.ca/downloads
# food3.csv - https://data.world/alexandra/generic-food-database/workspace/file?filename=generic-food.csv
# food4.csv - https://github.com/carolinehermans/feed-me/blob/master/ingredients.csv
# food5.csv - https://cosylab.iiitd.edu.in/culinarydb/

def get_all_foods():
    foods = ["food1.csv", "food2.csv", "food3.csv", "food4.csv", "food5.csv", "vegetables.csv"]
    unique_food_items = set()
    
    for food in foods:
        food_file_path = current_directory / "food_data" / food
        
        with open(food_file_path, "r") as csvfile:
            csv_reader = csv.reader(csvfile)
            
            for row in csv_reader:
                if row:
                    food_item = row[0]
                    cleaned_food_item = food_item.lower().split(' (', 1)[0]
                    unique_food_items.add(cleaned_food_item)
    
    return unique_food_items

In [74]:
# Get cuisine URLs
cuisine_blocks = soup.find_all("div", {"class": "alphabetical-list__group"})
cuisine_urls = set()

for block in cuisine_blocks:
    cuisine_in_block = block.find_all(class_ = "comp link-list__item")
    
    # Loop through each cuisine in the block
    for cuisine in cuisine_in_block:
        cuisine_url = cuisine.find("a")["href"]
        cuisine_urls.add(cuisine_url)

cuisine_urls.add("https://www.allrecipes.com/recipes/728/world-cuisine/latin-american/mexican/")

print(len(cuisine_urls))

# Retrieve dish URLs
def get_dish_url(cuisine_soup):
    dish_urls = []
    a_elem = cuisine_soup.find_all("a", class_ = "comp mntl-card-list-items mntl-document-card mntl-card card card--no-image")
        
    for elem in a_elem:
        dish_url = elem.get("href")
        dish_urls.append(dish_url)
    
    return dish_urls

# Retrieve cuisine names
def get_cuisine_name(cuisine_soup):
    cuisine_name = cuisine_soup.find("h1", {"class": "comp mntl-taxonomysc-heading mntl-text-block"}).text.split()
    return cuisine_name[0]

50


In [75]:
# Get ingredients from each dish
def get_ingred(dish_soup):
    ingred_elements = dish_soup.find_all("span", {"data-ingredient-name": "true"})
    
    ingredients = set()
    
    for ingred_elem in ingred_elements:
        ingred_name = ingred_elem.text if ingred_elem else None
        cleaned_ingred_name = ingred_name.split(',')[0].strip().lower()
        
        all_unique_foods = get_all_foods()
        for food_item in all_unique_foods:
            if cleaned_ingred_name == food_item:
                ingredients.add(cleaned_ingred_name)
    
    return ingredients

In [78]:
# Retrieve all ingredients data
def all_cuisine_ingred():
    # Initiate cuisine ingredients data
    all_cuisine_ingred_data = dict()
    
    for cuisine_url in cuisine_urls:
        # Open cuisine URL
        cuisine_fhand = urllib.request.urlopen(cuisine_url)
        cuisine_html = cuisine_fhand.read()
        cuisine_soup = bs(cuisine_html, "html.parser")
        
        # Get cuisine name
        cuisine_name = get_cuisine_name(cuisine_soup)
        
        print(f"Getting ingredients for {cuisine_name}...")
        
        # Get dish URLs
        dish_urls = get_dish_url(cuisine_soup)
        
        # Initialize cuisine_ingred
        cuisine_ingred = dict()
        
        for dish_url in dish_urls:
            # Open dish URL
            dish_fhand = urllib.request.urlopen(dish_url)
            dish_html = dish_fhand.read()
            dish_soup = bs(dish_html, "html.parser")
            
            # Get list of ingredients for a dish
            dish_ingredients = get_ingred(dish_soup)
            
            for ingredient in dish_ingredients:
                if ingredient in cuisine_ingred:
                    cuisine_ingred[ingredient] += 1
                else:
                    cuisine_ingred[ingredient] = 1
            
        print(cuisine_ingred)
        
        # Update all cuisine ingredients
        all_cuisine_ingred_data[cuisine_name] = cuisine_ingred
    
    return all_cuisine_ingred_data

all_cuisine_ingred_data = all_cuisine_ingred()


Getting ingredients for Southern...
{'eggs': 9, 'milk': 11, 'butter': 19, 'sweet potato': 1, 'brown sugar': 2, 'baking powder': 4, 'salt': 30, 'all-purpose flour': 24, 'cornmeal': 4, 'worcestershire sauce': 6, 'garlic': 11, 'peanut oil': 1, 'onion': 10, 'celery': 4, 'green bell pepper': 4, 'cayenne pepper': 8, 'olive oil': 4, 'bay leaves': 3, 'water': 13, 'mayonnaise': 2, 'jalapeno pepper': 1, 'cream cheese': 3, 'cabbage': 3, 'buttermilk': 7, 'lemon juice': 1, 'bacon': 5, 'pepper': 2, 'rum': 1, 'vegetable oil': 4, 'white onion': 3, 'cocoa': 1, 'vanilla': 1, 'white pepper': 2, 'chicken': 1, 'flour': 1, 'black pepper': 4, 'paprika': 4, 'egg': 5, 'lemon': 1, 'yellow bell pepper': 1, 'red bell pepper': 1, 'okra': 1, 'lard': 1, 'evaporated milk': 1, 'shortening': 1, 'mushrooms': 1, 'bay leaf': 1, 'vinegar': 2, 'orange': 1, 'orange juice': 1, 'sour cream': 1, 'chicken stock': 1, 'shrimp': 1, 'ketchup': 1, 'honey': 1, 'molasses': 1, 'red onion': 1, 'turnip greens': 1, 'dijon mustard': 1, 'mar

Getting ingredients for Filipino...
{'lemon': 2, 'olive oil': 10, 'onion': 25, 'garlic': 26, 'green onion': 1, 'carrot': 5, 'cabbage': 5, 'vegetable oil': 20, 'soy sauce': 16, 'cooking oil': 2, 'bok choy': 3, 'chicken': 1, 'black pepper': 3, 'bay leaf': 1, 'zucchini': 1, 'tomato': 4, 'bitter melon': 1, 'eggplant': 1, 'plum tomatoes': 1, 'salt': 15, 'water': 25, 'coconut milk': 6, 'evaporated milk': 4, 'eggs': 2, 'red bell pepper': 4, 'paprika': 1, 'egg yolk': 1, 'butter': 5, 'all-purpose flour': 5, 'squid': 1, 'canola oil': 2, 'pepper': 1, 'cayenne pepper': 1, 'potato': 1, 'vinegar': 3, 'beef stock': 1, 'bay leaves': 6, 'oil': 1, 'orange bell pepper': 1, 'baking powder': 2, 'milk': 2, 'okra': 1, 'tomatoes': 5, 'cocoa powder': 1, 'celery': 1, 'potatoes': 4, 'brown sugar': 6, 'ketchup': 1, 'sesame oil': 2, 'worcestershire sauce': 1, 'lemon juice': 1, 'honey': 2, 'sugar': 1, 'green bell pepper': 1, 'orange juice': 1, 'ginger': 2, 'chicken stock': 1, 'tomato paste': 1, 'apple cider vinegar

Getting ingredients for Australian...
{'lard': 1, 'salt': 24, 'all-purpose flour': 24, 'onion': 1, 'milk': 9, 'kiwifruit': 1, 'vinegar': 2, 'golden syrup': 6, 'margarine': 3, 'cooking spray': 2, 'orange juice': 1, 'kiwi': 1, 'lemon juice': 5, 'water': 7, 'butter': 18, 'baking powder': 12, 'eggs': 18, 'cocoa powder': 1, 'vegetable oil': 5, 'evaporated milk': 3, 'brown sugar': 4, 'sesame oil': 1, 'sherry': 1, 'soy sauce': 1, 'ketchup': 1, 'black pepper': 1, 'curry powder': 1, 'garlic': 2, 'beef stock': 1, 'coconut milk': 6, 'dates': 1, 'white bread': 1, 'cream cheese': 2, 'pie crust': 1, 'maple syrup': 1, 'sugar': 2, 'honey': 1, 'molasses': 1, 'egg': 8, 'cornmeal': 1, 'vanilla': 1, 'almond milk': 1, 'canola oil': 1, 'lemon': 1}
Getting ingredients for Brazilian...
{'butter': 9, 'lime': 4, 'bay leaves': 3, 'garlic': 15, 'soy sauce': 2, 'olive oil': 15, 'baking powder': 10, 'salt': 15, 'all-purpose flour': 12, 'eggs': 17, 'coconut milk': 7, 'tomatoes': 3, 'onion': 13, 'vegetable oil': 9, '

Getting ingredients for Malaysian...
{'shallots': 3, 'garlic': 16, 'vegetable oil': 10, 'coconut milk': 8, 'water': 9, 'lemon grass': 3, 'tomatoes': 1, 'ketchup': 2, 'red onion': 1, 'salt': 5, 'galangal': 2, 'lemongrass': 3, 'tomato': 2, 'onion': 4, 'garam masala': 2, 'coconut oil': 1, 'curry powder': 9, 'cayenne pepper': 2, 'shallot': 1, 'soy sauce': 3, 'carrot': 2, 'green bell pepper': 2, 'red bell pepper': 3, 'tomato paste': 1, 'egg': 2, 'black pepper': 1, 'cream cheese': 1, 'eggs': 1, 'sour cream': 1, 'grapeseed oil': 1, 'chicken stock': 1, 'brown sugar': 4, 'mango': 1, 'olive oil': 4, 'peanut butter': 1, 'peanut oil': 2, 'potatoes': 1, 'milk': 1, 'yogurt': 1, 'vinegar': 1, 'yellow bell pepper': 1, 'ginger': 1, 'orange bell pepper': 1}
Getting ingredients for Norwegian...
{'salt': 14, 'butter': 7, 'all-purpose flour': 16, 'sugar': 1, 'eggs': 6, 'cooking spray': 1, 'milk': 2, 'baking powder': 7, 'potatoes': 2, 'cayenne pepper': 2, 'bacon': 3, 'lemon juice': 1, 'egg': 3, 'water': 3, 

Getting ingredients for Argentinian...
{'red wine vinegar': 9, 'garlic': 8, 'onion': 5, 'salt': 9, 'baking powder': 2, 'dulce de leche': 2, 'egg': 6, 'all-purpose flour': 4, 'olive oil': 7, 'cayenne pepper': 2, 'green bell pepper': 1, 'butter': 5, 'shortening': 1, 'margarine': 1, 'salted butter': 1, 'vegetable oil': 2, 'potatoes': 1, 'water': 6, 'paprika': 4, 'milk': 3, 'cooking spray': 1, 'red wine': 1, 'mayonnaise': 1, 'eggs': 1, 'sweet corn': 1, 'shallot': 2, 'carrot': 1, 'tomatoes': 1, 'tomato': 1, 'white wine vinegar': 1, 'red bell pepper': 1, 'sunflower oil': 1}
Getting ingredients for Italian...
{'salt': 20, 'all-purpose flour': 4, 'olive oil': 27, 'egg': 5, 'ricotta cheese': 6, 'butter': 10, 'tomato paste': 8, 'mozzarella cheese': 5, 'water': 12, 'garlic': 29, 'shallots': 3, 'baking powder': 1, 'flour': 1, 'eggs': 14, 'pasta': 2, 'red wine vinegar': 3, 'oregano': 1, 'red onion': 4, 'lettuce': 1, 'provolone cheese': 2, 'mozzarella': 1, 'prosciutto': 2, 'zucchini': 1, 'feta chees

In [108]:
# Top 10 most popular ingredients in all cuisines

def get_top10_ingred(ingred_dict):
    sorted_ingredients = sorted(ingred_dict.items(), key=lambda x: x[1], reverse=True)
    top10_ingredients = dict(sorted_ingredients[:10])
    
    return top10_ingredients

def top10_all(regions):
    total_ingredient_counts = defaultdict(int)
    ingred_data = {}
    
    if regions == "all":
        ingred_data = all_cuisine_ingred_data
    else:
        for region in regions:
            ingred_data[region] = all_cuisine_ingred_data[region]
    
    for cuisine, ingredients in ingred_data.items():
        for ingredient, count in ingredients.items():
            total_ingredient_counts[ingredient] += count

    total_ingredient_counts = dict(total_ingredient_counts)
    top10_ingreds = get_top10_ingred(total_ingredient_counts)
    
    return top10_ingreds


top10_ingreds = top10_all("all")

asian_cuisines = ["Chinese", "Korean", "Japanese",
                  "Vietnamese", "Indonesian", "Thai", "Malaysian", "Filipino",
                  "Indian", "Bangladeshi"]

western_cuisines = ["French", "Dutch", "Swedish", "Greek",
                    "Italian", "Danish", "Norwegian", "Spanish",
                    "Russian", "Portuguese", "Austrian", "German",
                    "Swiss", "Finnish", "Belgian"]


top10_asian = top10_all(asian_cuisines)
top10_western = top10_all(western_cuisines)
print(top10_asian)
print(top10_western)

{'garlic': 200, 'water': 164, 'soy sauce': 144, 'vegetable oil': 141, 'salt': 127, 'onion': 94, 'sesame oil': 67, 'brown sugar': 50, 'coconut milk': 49, 'olive oil': 43}
{'salt': 233, 'all-purpose flour': 203, 'butter': 175, 'eggs': 137, 'garlic': 125, 'olive oil': 119, 'water': 118, 'onion': 91, 'milk': 85, 'egg': 64}


In [102]:
# Sort each cuisine by ingredient frequency
def sorted_cuisine_ingred(all_cuisine_ingred_data):
    sorted_cuisine_ingred = {}
    
    for cuisine_name, ingred_dict in all_cuisine_ingred_data.items():
        print(f"Analyzing {cuisine_name}...")
        
        top10_ingreds = get_top10_ingred(ingred_dict)
        sorted_cuisine_ingred[cuisine_name] = top10_ingreds
        
        print(f"{top10_ingreds}")
    
    return sorted_cuisine_ingred

sorted_cuisine_ingred = sorted_cuisine_ingred(all_cuisine_ingred_data)

Analyzing Southern...
{'salt': 30, 'all-purpose flour': 24, 'butter': 19, 'water': 13, 'milk': 11, 'garlic': 11, 'onion': 10, 'eggs': 9, 'cayenne pepper': 8, 'buttermilk': 7}
Analyzing Puerto...
{'salt': 25, 'garlic': 19, 'olive oil': 18, 'water': 17, 'onion': 8, 'coconut milk': 7, 'evaporated milk': 7, 'vegetable oil': 7, 'green bell pepper': 7, 'butter': 6}
Analyzing Colombian...
{'salt': 5, 'onion': 2, 'garlic': 2, 'eggs': 2, 'plum tomato': 1, 'lime': 1, 'olive oil': 1, 'apple cider vinegar': 1, 'jalapeno pepper': 1, 'water': 1}
Analyzing Dutch...
{'all-purpose flour': 15, 'butter': 13, 'salt': 8, 'egg': 7, 'baking powder': 6, 'water': 6, 'eggs': 6, 'milk': 4, 'shortening': 2, 'paprika': 2}
Analyzing Tex-Mex...
{'salt': 11, 'sour cream': 11, 'olive oil': 11, 'garlic': 11, 'water': 10, 'onion': 10, 'salsa': 9, 'cooking spray': 7, 'red bell pepper': 7, 'jalapeno pepper': 7}
Analyzing Vietnamese...
{'garlic': 29, 'water': 21, 'salt': 13, 'vegetable oil': 13, 'lime': 9, 'soy sauce': 9, 

In [103]:
# Write to csv

csv_file_name = 'sorted_cuisine_ingred.csv'

with open(csv_file_name, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)

    # Write the header row
    writer.writerow(['Cuisine', 'Ingredient', 'Count'])

    # Write the data rows
    for cuisine, ingredients in sorted_cuisine_ingred.items():
        for ingredient, count in ingredients.items():
            writer.writerow([cuisine, ingredient, count])


26

18

31

20

19

18

20

19

17

27

23

16

18

21

17

16

23

26

24

28

17

18

19

20

18

25

18

23

33

29

19

28

17

14

13

23

15

14

14

20

17

17

23

22

19

18

18

17

25

27

27

22

21

20

29

19

24

20

21

24

25

16

22

15

23

19

25

15

14

21

21

18

20

31

19

18

16

17

18

24

19

16

21

17

18

16

17

27

29

25

25

17

20

15

16

16

28

21

23

14

22

20

19

19

27

23

18

23

25

23

24

30

17

19

17

17

18

15

32

17

22

20

17

15

16

22

16

17

28

16

14

17

18

19

22

26

23

17

16

19

27

17

18

19

26

22

18

18

24

19

29

22

26

18

19

23

17

16

17

29

23

17

30

23

19

18

18

26

21

17

16

18

16

29

21

18

16

16

15

22

24

21

22

18

25

17

23

16

15

22

23

20

33

22

20

29

19

18

20

27

27

19

20

21

24

19

19

20

32

28

20

35

22

24

22

21

22

23

30

25

37

18

23

20

19

25

19

30

26

18

26

19

22

18

17

29

17

27

17

19

16

17

30

19

17

16

25

17

21

20

15

19

20

28

19

20

26

22

28

22

23

14

15

27

15

16

24

15

13

18

19

15

28

15

15

24

24

17

16

21

15

31

18

18

19

26

16

17

18

16

26

29

18

16

16

17

17

24

15

14

16

21

28

19

26

26

18

19

25

23

22

32

19

20

27

18

19

17

19

21

24

22

21

25

21

20

24

22

33

20

19

21

30

29

21

20

24

22

28

27

25

18

20

25

27

19

23

23

30

18

26

18

16

29

17

17

22

16

14

18

21

16

18

24

25

26

17

17

24

18

23

16

16

25

29

17

18

14

16

20

17

16

18

29

17

17

16

18

25

16

25

16

25

29

17

18

25

15

16

18

17

18

17

22

19

29

17

18

19

24

25

23

19

18

17

19

30

23

26

33

22

32

20

22

25

19

21

21

22

33

23

19

22

17

17

18

19

24

26

15

29

20

27

19

22

22

18

17

23

20

28

14

27

14

16

23

22

18

23

16

15

15

15

14

14

16

22

14

20

23

16

16

29

18

16

15

16

21

24

14

16

22

29

20

23

26

20

19

24

26

25

15

28

17

25

17

16

16

28

18

30