In [None]:
!pip install requests beautifulsoup4 pandas selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

In [3]:
import csv
def write_recipes_to_csv(recipes, page_number):
    filename = f"recipes_page_{page_number}.csv"
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(
            file,
            fieldnames=[
                "title",
                "url",
                "image_url",
                "author",
                "rating_percent",
                "cook_time",
            ],
        )
        writer.writeheader()
        writer.writerows(recipes)
    print(f"Data written to {filename}")

In [4]:
import requests
def fetch_recipe_data(page_number, timeout=60):
    url = f"https://api.food.com/services/mobile/fdc/search/sectionfront?pn={page_number}&recordType=Recipe&sortBy=trending&collectionId=17"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses (4xx and 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        if "API rate limit" in str(e):  # Check if it's an API rate limit error
            raise
        return {}

In [1]:
import random
def scrape_recipes(category_urls, target_unique_titles=82212):
    unique_titles = set()  # Use a set to track unique titles
    recipes = []
    last_page_number = 17891

    for category_url in category_urls:
        page_number = 17891
        while len(unique_titles) < target_unique_titles:
            print(f"Fetching URL: {category_url}?page={page_number}")
            try:
                data = fetch_recipe_data(page_number)
            except requests.exceptions.ReadTimeout as e:
                print(f"Read timeout error: {e}")
                # Save state and return if read timeout error
                return recipes, last_page_number
            except Exception as e:
                print(f"Error fetching data: {e}")
                if "API rate limit" in str(e):
                    # Save state and return if rate limit error
                    print("API rate limit reached. Saving progress and exiting.")
                    return recipes, last_page_number

                # Random sleep to avoid hitting API rate limits
                time.sleep(random.uniform(1, 5))
                continue

            if not data.get("response") or not data.get("response").get("results"):
                print("No results found or no response, ending.")
                break

            for recipe in data.get("response").get("results", []):
                title = recipe.get("main_title", "").strip()
                if title in unique_titles:
                    continue  # Skip duplicate titles

                recipe_url = recipe.get("record_url")
                image_url = recipe.get("recipe_photo_url")
                author = recipe.get("main_username")
                rating_percent = recipe.get("main_rating")
                cook_time = recipe.get("recipe_totaltime")

                recipes.append(
                    {
                        "title": title,
                        "url": recipe_url,
                        "image_url": image_url,
                        "author": author,
                        "rating_percent": rating_percent,
                        "cook_time": cook_time,
                    }
                )

                unique_titles.add(title)  # Add title to the set

            print(f"Current count of unique recipes: {len(unique_titles)}")

            if len(unique_titles) >= target_unique_titles:
                break  # Stop if the target number of unique titles is reached

            # Write data to CSV every 100 pages
            if page_number % 100 == 0:
                write_recipes_to_csv(recipes, page_number)
                recipes = []  # Clear recipes list after saving to CSV

            print("Moving to next page")
            page_number += 1  # Move to the next page
            last_page_number = page_number

    # Write remaining data to CSV if needed
    if recipes:
        write_recipes_to_csv(recipes, last_page_number)

    return recipes, last_page_number

In [6]:
category_urls = [
    "https://www.food.com/recipe/all/trending",
    "https://www.food.com/recipe/all/quick-easy",
    "https://www.food.com/recipe/all/healthy",
    "https://www.food.com/recipe/all/editor-pick",
    "https://www.food.com/recipe/all/newest",
]

In [21]:
recipes = scrape_recipes(category_urls)

# Print or process the list of recipes
for recipe in recipes:
    print(recipe)

Fetching URL: https://www.food.com/recipe/all/trending?page=17891
Current count of unique recipes: 10
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17892
Current count of unique recipes: 20
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17893
Current count of unique recipes: 30
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17894
Current count of unique recipes: 40
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17895
Current count of unique recipes: 50
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17896
Current count of unique recipes: 60
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17897
Current count of unique recipes: 70
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17898
Current count of unique recipes: 80
Moving to next page
Fetching URL: https://ww

In [10]:
len(recipes)

2

In [92]:
# Extract titles
titles = [recipe["title"] for recipe in recipes]

# Find unique titles
unique_titles = set(titles)

# Number of unique titles
num_unique_titles = len(unique_titles)

print(f"Number of unique titles: {num_unique_titles}")

Number of unique titles: 2504


In [None]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Counter for errors
error_count = 0


def extract_recipe_details(recipe_url, cook_time, recipe_rating):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # headless mode

    service = Service("/opt/homebrew/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(recipe_url)

        # Wait for directions and ingredients to be present
        wait = WebDriverWait(driver, 3)
        directions_list = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".direction-list li"))
        )
        ingredient_elements = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, ".ingredient-list li")
            )
        )

        # Extract directions
        directions = "\n".join([li.text.strip() for li in directions_list])

        # Extract ingredients
        ingredients_dict = {}
        for element in ingredient_elements:
            quantity = element.find_element(
                By.CSS_SELECTOR, ".ingredient-quantity"
            ).text.strip()
            text = element.find_element(
                By.CSS_SELECTOR, ".ingredient-text"
            ).text.strip()
            ingredients_dict[quantity] = text

        return {
            "directions": directions,
            "ingredients": ingredients_dict,
            "cook_time": cook_time,
            "recipe_rating": recipe_rating,
        }

    except Exception as e:
        global error_count
        error_count += 1
        return {
            "directions": "N/A",
            "ingredients": {},
            "cook_time": cook_time,
            "recipe_rating": recipe_rating,
        }

    finally:
        driver.quit()


def process_csv_file(input_file, output_file):
    global error_count
    print(f"Processing file: {input_file}")

    # Read the CSV file
    recipes = pd.read_csv(input_file)

    # Prepare to collect results
    batch_size = 10
    all_recipes = []

    # Open the output file in append mode
    with open(output_file, "a") as f:
        # Write the header if the file is empty
        if os.stat(output_file).st_size == 0:
            pd.DataFrame(
                columns=[
                    "title",
                    "url",
                    "image_url",
                    "author",
                    "rating_percent",
                    "cook_time",
                    "directions",
                    "ingredients",
                ]
            ).to_csv(f, index=False)

    # Process recipes in batches
    for i, (_, recipe) in enumerate(recipes.iterrows()):
        try:
            details = extract_recipe_details(
                recipe["url"], recipe["cook_time"], recipe["rating_percent"]
            )
            all_recipes.append(
                {
                    "title": recipe["title"],
                    "url": recipe["url"],
                    "image_url": recipe["image_url"],
                    "author": recipe["author"],
                    "rating_percent": recipe["rating_percent"],
                    "cook_time": recipe["cook_time"],
                    "directions": details["directions"],
                    "ingredients": details["ingredients"],
                }
            )

            # Write to CSV every 10 recipes
            if (i + 1) % batch_size == 0:
                df = pd.DataFrame(all_recipes)
                df.to_csv(output_file, mode="a", header=False, index=False)
                all_recipes = []  # Reset list for the next batch
                print(f"Processed and saved batch of {batch_size} recipes.")

        except Exception:
            # Log general error message
            print(f"Failed to extract details for {recipe['url']}")

    # Process any remaining recipes that didn't fill a complete batch
    if all_recipes:
        df = pd.DataFrame(all_recipes)
        df.to_csv(output_file, mode="a", header=False, index=False)
        print(f"Processed and saved final batch of {len(all_recipes)} recipes.")

    # Print the total error count
    print(f"Total errors encountered: {error_count}")

    # Delete the input file once done
    os.remove(input_file)


input_directory = "/Users/snehsuresh/Desktop/Projects/recipe-recommender-system-pipeline/notebooks/data"
output_directory = "/Users/snehsuresh/Desktop/Projects/recipe-recommender-system-pipeline/notebooks/data/output"

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Get the list of CSV files and sort them
csv_files = [f for f in os.listdir(input_directory) if f.endswith(".csv")]
csv_files.sort()  # Sort filenames in ascending order

# Process each CSV file in the sorted order
for filename in csv_files:
    input_file = os.path.join(input_directory, filename)
    output_file = os.path.join(output_directory, f"processed_{filename}")
    process_csv_file(input_file, output_file)

In [1]:
import pandas as pd
from pathlib import Path

# Assuming all CSV files are in a directory named 'data'
csv_files = Path(
    "/Users/snehsuresh/Desktop/Projects/recipe-recommender-system-pipeline/notebooks/data/output"
).glob("*.csv")

In [2]:
df_list = [pd.read_csv(file, encoding="utf-8", delimiter=",") for file in csv_files]
recipes_df = pd.concat(df_list,  ignore_index=True)

In [3]:
recipes_df

Unnamed: 0,title,url,image_url,author,rating_percent,cook_time,directions,ingredients
0,Rosemary Chicken Oreganata,https://www.food.com/recipe/rosemary-chicken-o...,https://img.sndimg.com/food/image/upload/v1/im...,JelsMom,5.0,225,,{}
1,Bittersweet Amarula Drumsticks,https://www.food.com/recipe/bittersweet-amarul...,https://img.sndimg.com/food/image/upload/v1/im...,Mommy Diva,5.0,70,,{}
2,Clam Fritters,https://www.food.com/recipe/clam-fritters-220462,https://img.sndimg.com/food/image/upload/v1/im...,Julie Bs Hive,5.0,40,,{}
3,Swiss Steak With Vegetables,https://www.food.com/recipe/swiss-steak-with-v...,https://img.sndimg.com/food/image/upload/v1/im...,Julie Bs Hive,5.0,75,,{}
4,Roast Pork Loin With Cider Glaze,https://www.food.com/recipe/roast-pork-loin-wi...,https://img.sndimg.com/food/image/upload/v1/im...,Mommy Diva,5.0,105,,{}
...,...,...,...,...,...,...,...,...
75798,Farfalle With Tomato Herb Sauce,https://www.food.com/recipe/farfalle-with-toma...,https://img.sndimg.com/food/image/upload/v1/im...,OwlMonkey,5.0,45,"Heat the olive oil in a pan.\nAdd garlic, pepe...","{'1': 'tablespoon fresh parsley, chopped', '4'..."
75799,Van Nuys (Aka Dirty Margarita),https://www.food.com/recipe/van-nuys-aka-dirty...,https://img.sndimg.com/food/image/upload/v1/im...,Boomette,5.0,2,Shake and strain into a thoroughly chilled mar...,"{'1 1⁄2': 'ounces tequila', '3⁄4': 'ounce Kahl..."
75800,Crabmeat Hoezel,https://www.food.com/recipe/crabmeat-hoezel-38...,https://img.sndimg.com/food/image/upload/v1/im...,Laury,5.0,1,In a small bowl or jar combine tarragon vinega...,"{'1⁄4': 'cup extra virgin olive oil', '1⁄2': '..."
75801,"Asparagus, Mushroom and Cheese Omelet With Herbs",https://www.food.com/recipe/asparagus-mushroom...,https://img.sndimg.com/food/image/upload/v1/im...,BecR2400,5.0,15,Heat medium skillet over medium-high heat for ...,"{'2 -3': 'large eggs', '1': 'sprig fresh thyme..."


In [4]:
# Count non-NaN values in the 'directions' column
non_nan_directions_count = recipes_df["directions"].notna().sum()

# Print the count of non-NaN directions
print(f"Number of non-NaN values in 'directions': {non_nan_directions_count}")

Number of non-NaN values in 'directions': 64264


In [5]:
# Create a copy of the original DataFrame
df_copy = recipes_df.copy()

In [6]:
# Remove rows where 'directions' or 'ingredients' are NaN
df_copy = df_copy.dropna(subset=["directions", "ingredients"])

In [7]:
# Remove all rows with any NaN values
df_copy = df_copy.dropna()

In [23]:
df_copy

Unnamed: 0,title,url,image_url,author,rating_percent,cook_time,directions,ingredients,combined_text
58,Sparkling Honey Lemonade in Citrus-Salt Rimmed...,https://www.food.com/recipe/sparkling-honey-le...,https://img.sndimg.com/food/image/upload/v1/im...,SusieQusie,5.0,25,Combine lemon juice and honey in a saucepan.\n...,"{'1': 'lemon, zest of', '1⁄2': 'cup honey', '6...",Combine lemon juice and honey in a saucepan.\n...
60,Skillet Pizza Potatoes,https://www.food.com/recipe/skillet-pizza-pota...,https://img.sndimg.com/food/image/upload/v1/im...,ShaniRage,5.0,35,Cook sausage over medium high heat.\nStir in p...,"{'1': 'cup shredded Italian cheese blend', '1⁄...",Cook sausage over medium high heat.\nStir in p...
61,Black Forest Ice Cream Shake,https://www.food.com/recipe/black-forest-ice-c...,https://img.sndimg.com/food/image/upload/v1/im...,PanNan,5.0,15,Place the pint of chocolate ice cream on the c...,"{'1': 'cup milk', '1⁄2': 'cup chocolate chips'}",Place the pint of chocolate ice cream on the c...
62,Tomato Chokha,https://www.food.com/recipe/tomato-chokha-218896,https://img.sndimg.com/food/image/upload/v1/im...,WizzyTheStick,5.0,20,Preheat grill or broiler. Grill or broil tomat...,"{'4': 'large beefsteak tomatoes', '1': 'tables...",Preheat grill or broiler. Grill or broil tomat...
64,Linda's Mushroom and Squash Medley,https://www.food.com/recipe/lindas-mushroom-an...,https://img.sndimg.com/food/image/upload/v1/im...,Lindas Busy Kitchen,5.0,25,"Cut mushrooms, not too thin, about 4 slices to...","{'1': '(8 inch) summer squash, sliced', '1⁄2':...","Cut mushrooms, not too thin, about 4 slices to..."
...,...,...,...,...,...,...,...,...,...
64936,Big Berry Smoothie,https://www.food.com/recipe/big-berry-smoothie...,https://img.sndimg.com/food/image/upload/v1/im...,scarley,5.0,5,Add all the ingredients to a blender and pulse...,"{'2': 'cups ice', '1': 'cup plain yogurt', '1⁄...",Add all the ingredients to a blender and pulse...
64937,Farfalle With Tomato Herb Sauce,https://www.food.com/recipe/farfalle-with-toma...,https://img.sndimg.com/food/image/upload/v1/im...,OwlMonkey,5.0,45,"Heat the olive oil in a pan.\nAdd garlic, pepe...","{'1': 'tablespoon fresh parsley, chopped', '4'...","Heat the olive oil in a pan.\nAdd garlic, pepe..."
64938,Van Nuys (Aka Dirty Margarita),https://www.food.com/recipe/van-nuys-aka-dirty...,https://img.sndimg.com/food/image/upload/v1/im...,Boomette,5.0,2,Shake and strain into a thoroughly chilled mar...,"{'1 1⁄2': 'ounces tequila', '3⁄4': 'ounce Kahl...",Shake and strain into a thoroughly chilled mar...
64939,Crabmeat Hoezel,https://www.food.com/recipe/crabmeat-hoezel-38...,https://img.sndimg.com/food/image/upload/v1/im...,Laury,5.0,1,In a small bowl or jar combine tarragon vinega...,"{'1⁄4': 'cup extra virgin olive oil', '1⁄2': '...",In a small bowl or jar combine tarragon vinega...


In [8]:
# Combine directions and ingredients into a single text field
df_copy["combined_text"] = df_copy.apply(
    lambda row: f"{row['directions']} {' '.join(row['ingredients'].values()) if isinstance(row['ingredients'], dict) else ''}",
    axis=1,
)

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

In [10]:
# Download necessary NLTK data
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/snehsuresh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/snehsuresh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [12]:
def preprocess_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    tokens = [
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    ]
    return " ".join(tokens)

In [13]:
# Convert ingredients dictionary to a string
def ingredients_to_string(ingredients_dict):
    return " ".join([f"{value}" for value in ingredients_dict.values()])

In [14]:
# Combine ingredients and directions into a single text field
df_copy["ingredients_str"] = df_copy["ingredients"].apply(
    lambda x: ingredients_to_string(eval(x)) if pd.notna(x) else ""
)

In [15]:
df_copy["ingredients_str"]

58       lemon, zest of cup honey cups sparkling water ...
60             cup shredded Italian cheese blend cup water
61                            cup milk cup chocolate chips
62       large beefsteak tomatoes tablespoon oil teaspo...
64       (8 inch) summer squash, sliced cup water table...
                               ...                        
75797     cups ice cup plain yogurt banana cup raspberries
75798    tablespoon fresh parsley, chopped tablespoons ...
75799                          ounces tequila ounce Kahlua
75800    cup extra virgin olive oil teaspoon salt teasp...
75801    large eggs sprig fresh thyme, for garnish (opt...
Name: ingredients_str, Length: 61244, dtype: object

In [16]:
df_copy["text"] = df_copy["ingredients_str"] + " " + df_copy["directions"]

In [17]:
df_copy["cleaned_text"] = df_copy["text"].apply(preprocess_text)

In [18]:
df_copy["cleaned_text"]

58       lemon zest cup honey cup sparkling water cup s...
60       cup shredded italian cheese blend cup water co...
61       cup milk cup chocolate chip place pint chocola...
62       large beefsteak tomato tablespoon oil teaspoon...
64       inch summer squash sliced cup water tablespoon...
                               ...                        
75797    cup ice cup plain yogurt banana cup raspberry ...
75798    tablespoon fresh parsley chopped tablespoon gr...
75799    ounce tequila ounce kahlua shake strain thorou...
75800    cup extra virgin olive oil teaspoon salt teasp...
75801    large egg sprig fresh thyme garnish optional t...
Name: cleaned_text, Length: 61244, dtype: object

In [19]:
# Create a document-term matrix for ingredients and directions combined
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")


In [36]:
vectorizer

In [20]:
dtm = vectorizer.fit_transform(df_copy["cleaned_text"])

In [35]:
# Fit LDA model
lda = LatentDirichletAllocation(
    n_components=100, random_state=42
)  # Adjust number of topics as needed
lda.fit(dtm)

In [22]:
# Transform the original documents into topic distributions
topic_features = lda.transform(dtm)

In [68]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(
            " ".join(
                [feature_names[i] for i in topic.argsort()[: -no_top_words - 1 : -1]]
            )
        )



# Display top 10 words in each topic
display_topics(lda, vectorizer.get_feature_names_out(), 10)

Topic 0:
chicken breast broth add cup cooked boneless skinless piece cook
Topic 1:
pie crust filling bake minute shell oven cup inch pour
Topic 2:
chicken oven place skin minute breast pan wing thigh piece
Topic 3:
beef patty ground onion chili tomato seasoning taco mix bun
Topic 4:
pepper bell onion tomato add red cut green remove seed
Topic 5:
water cup day temperature room use cool cold salt let
Topic 6:
cheese cup shredded cheddar cream tortilla ounce minute mixture jack
Topic 7:
cup water mint sugar stir add bowl gelatin boiling pour
Topic 8:
steak roast meat minute lamb place oven garlic hour remove
Topic 9:
lemon juice zest fresh tablespoon add cup rind teaspoon grated
Topic 10:
oven baking sheet pumpkin minute preheat bake layer teaspoon toss
Topic 11:
yogurt banana plain point greek frozen fold vegan organic corner
Topic 12:
spinach tofu baby wilted leaf horseradish ounce piece squeeze miso
Topic 13:
nut cup couscous chopped pine tarragon water fennel caper olive
Topic 14:
sau

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# Preprocess user input
user_input = (
    "fish"
)
user_input_processed = preprocess_text(user_input)
user_input_dtm = vectorizer.transform([user_input_processed])
user_input_topic_features = lda.transform(user_input_dtm)

In [29]:
# Compute similarity scores
similarities = cosine_similarity(user_input_topic_features, topic_features)

In [67]:
# Get recommendations
top_indices = similarities[0].argsort()[-5:][::-1]  # Get top 10 recommended recipes
recommended_recipes = df_copy.iloc[top_indices]
print(recommended_recipes[["title", "directions"]])

                                 title  \
38943          Ultimate Scrambled Eggs   
8122                        Fried Eggs   
46984             Fried Plantain Chips   
39956     Steamed Asparagus With Lemon   
17414  Home-Style Chicken and Ham Soup   

                                              directions  
38943                           Scramble till just done.  
8122        wash pan.\nfry eggs.\nsalt.\npepper.\nserve.  
46984  Peel plantain and slice thinly.\nHeat oil and ...  
39956  Steam asparagus gently until tender, certainly...  
17414  In large saucepan cook bacon until crisp; remo...  


In [30]:
from sklearn.decomposition import NMF

# Fit NMF model
nmf = NMF(n_components=10, random_state=42)
nmf.fit(dtm)

# Transform the document-term matrix into topic distributions
nmf_topic_features = nmf.transform(dtm)

In [38]:
import tensorflow as tf

print(tf.__version__)

2.17.0


In [41]:
texts = df_copy["cleaned_text"].tolist()

In [42]:
from bertopic import BERTopic

# Initialize BERTopic
topic_model = BERTopic()

# Fit the model on your text data
topics, probs = topic_model.fit_transform(texts)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got f

In [43]:
topics_info = topic_model.get_topic_info()
print(topics_info)

     Topic  Count                                      Name  \
0       -1  32453                    -1_add_cup_minute_heat   
1        0   1717        0_chicken_breast_skinless_boneless   
2        1   1266           1_tortilla_salsa_taco_enchilada   
3        2   1127               2_potato_mash_tender_russet   
4        3   1108                3_mushroom_cap_sliced_stem   
..     ...    ...                                       ...   
331    330     10             330_can_basil_undrained_pasta   
332    331     10            331_crostini_pizza_feta_tomato   
333    332     10        332_capsicum_marjoram_quinoa_sieve   
334    333     10           333_advocaat_bourbon_speed_yolk   
335    334     10  334_rhubarb_apricotpineapple_tapioca_oat   

                                        Representation  \
0    [add, cup, minute, heat, sugar, pepper, stir, ...   
1    [chicken, breast, skinless, boneless, wing, ma...   
2    [tortilla, salsa, taco, enchilada, shredded, c...   
3    [potat

In [44]:
topic_model.visualize_topics()

In [49]:
all_topics, all_probs = topic_model.transform(df_copy["cleaned_text"].tolist())

In [50]:
df_copy["predicted_topic"] = all_topics

In [45]:
user_input_processed

'fish'

In [46]:
user_topics, user_probs = topic_model.transform(user_input_processed)

In [52]:
predicted_topic = user_topics[0]
predicted_topic

5

In [None]:
all_topics, all_probs = topic_model.transform(df_copy["cleaned_text"].tolist())



In [58]:
print(type(all_probs))
print(all_probs[:5])

<class 'numpy.ndarray'>
[0.        0.        0.7531983 0.        1.       ]


In [63]:
# Assuming all_probs is a list of single probabilities
df_copy["predicted_topic"] = all_topics
df_copy["topic_probabilities"] = all_probs

In [64]:

filtered_recipes = df_copy[df_copy['predicted_topic'] == predicted_topic]

In [65]:
filtered_recipes["probability"] = filtered_recipes[
    "topic_probabilities"
]  # No need to index

# Sort recipes by probability
sorted_recipes = filtered_recipes.sort_values(by="probability", ascending=False)

# Get the top 5 recipes
top_recipes = sorted_recipes.head(5)

In [66]:
# Display the top 5 recipes
print(top_recipes[["title", "url", "cleaned_text"]])

                              title  \
318        Lemon Baked Fish Fillets   
43266       French Broiled Flounder   
43124    Cod, Chickpea & Olive Stew   
43119  Pat's Southern-Fried Panfish   
42854        Red Snapper With Herbs   

                                                     url  \
318    https://www.food.com/recipe/lemon-baked-fish-f...   
43266  https://www.food.com/recipe/french-broiled-flo...   
43124  https://www.food.com/recipe/cod-chickpea-olive...   
43119  https://www.food.com/recipe/pats-southern-frie...   
42854  https://www.food.com/recipe/red-snapper-with-h...   

                                            cleaned_text  
318    lb firm sole fillet medium lemon thinly sliced...  
43266  ounce flounder cleaned scaled ready cook per p...  
43124  cod fish fillet skinned boned cut chunk onion ...  
43119  ounce bass fillet ounce teaspoon table salt ma...  
42854  lb red snapper fillet cut diagonal inch inch f...  


In [27]:
import numpy as np
# Combine topic features from both models
combined_topic_features = np.hstack((topic_features, nmf_topic_features))

In [31]:
# Transform user input using both LDA and NMF
user_input_lda_features = lda.transform(user_input_dtm)
user_input_nmf_features = nmf.transform(user_input_dtm)

# Combine topic features from both models
user_input_combined_features = np.hstack(
    (user_input_lda_features, user_input_nmf_features)
)

In [32]:
similarities = cosine_similarity(user_input_combined_features, combined_topic_features)
# Get recommendations

In [33]:
top_indices = similarities[0].argsort()[-10:][::-1]  # Get top 10 recommended recipes
recommended_recipes = df_copy.iloc[top_indices]
print(recommended_recipes[["title", "directions"]])

                                   title  \
38943            Ultimate Scrambled Eggs   
8122                          Fried Eggs   
46984               Fried Plantain Chips   
39956       Steamed Asparagus With Lemon   
17414    Home-Style Chicken and Ham Soup   
55987              Chargrilled Asparagus   
1974                    Sauteed Radishes   
75487  Egyptian Fried Eggs With Pastrami   
63395                 Buttered Snow Peas   
59082              Green Beans Ala Katie   

                                              directions  
38943                           Scramble till just done.  
8122        wash pan.\nfry eggs.\nsalt.\npepper.\nserve.  
46984  Peel plantain and slice thinly.\nHeat oil and ...  
39956  Steam asparagus gently until tender, certainly...  
17414  In large saucepan cook bacon until crisp; remo...  
55987  Remove woody ends of asparagus.\nToss asparagu...  
1974   Heat butter in fry pan.\nAdd radishes and salt...  
75487  Fry pastrami in the skillet with the