In [1]:
import pandas as pd
import numpy as np
import math
from collections import Counter

# Number of recipes (CSV rows) to load; also used as the corpus size in the ingredient weight formula.
n = 160

In [2]:

# Read the first n rows of the RecipeNLG dataset
# (https://www.kaggle.com/datasets/saldenisov/recipenlg?resource=download)
recipes = pd.read_csv("full_dataset.csv", nrows=n)

# Ingredients the user has on hand
user_ingredients = [
    'bite size shredded rice biscuits',
    'brown sugar',
    'milk',
    'vanilla',
    'nuts',
    'butter',
    'chicken',
    'garlic',
    'onion',
    'pepper',
    'salt',
    'tomato',
    'lemon',
]

# Row index -> raw 'NER' ingredient text, and row index -> recipe title
recipe_ing = recipes['NER'].to_dict()
recipe_dict = recipes['title'].to_dict()

In [3]:
# Standardize the format of each ingredient
def clean_ingredient(ingredient):
    """Return the lower-cased text between the first and last double quote.

    Parameters
    ----------
    ingredient : str
        One comma-separated fragment of a raw ingredient list, e.g. ' "Brown Sugar"]'.

    Returns
    -------
    str
        The lower-cased text enclosed by the outermost double quotes. A fragment
        with exactly one quote yields ''. A fragment with no quote at all
        (e.g. the '[]' of an empty list) is returned lower-cased with surrounding
        whitespace and square brackets removed, which may also be ''. Callers
        should be prepared to skip empty results.
    """
    ingredient = ingredient.lower()
    first = ingredient.find('"')
    if first == -1:
        # No quoted name: trimming character by character would run off the end
        # of the string, so fall back to stripping list punctuation instead.
        return ingredient.strip(" \t\r\n[]")
    last = ingredient.rfind('"')
    # When first == last (a single quote) the slice is empty.
    return ingredient[first + 1:last]

In [None]:
# Get the unique ingredients and the number of recipes each ingredient appears in

ingredient_freq = Counter()

# i counts the recipes seen while no '+' ingredient has turned up yet.
# NOTE(review): `add` is never set back to True, so i stops growing at the first
# recipe that contains '+' -- confirm whether a per-recipe reset was intended.
i = 0
add = True

for r in recipe_ing.values():
    # A set, so an ingredient repeated within one recipe is counted once: the
    # weight formula divides the number of recipes by this count.
    names = {clean_ingredient(ing) for ing in r.split(',')}
    # An empty name would match every recipe string in create_vector's `in` test.
    names.discard('')
    ingredient_freq.update(names)
    if '+' in names:
        add = False
    if add:
        i += 1

# Alphabetically sorted vocabulary (the Counter keys are already unique)
all_ing = sorted(ingredient_freq)
print(len(all_ing), len(ingredient_freq))
# Rebound to a name-sorted list of (ingredient, count) pairs, in the same order as all_ing
ingredient_freq = sorted(ingredient_freq.items(), key=lambda x: x[0])

# Show the raw ingredient text of the last loaded recipe, without assuming
# how many rows were actually read
if recipe_ing:
    print(list(recipe_ing.values())[-1])

# Print the recipe counter described above
print(i)

In [None]:
# Calculate a weight for each ingredient: log(n / (count + 1)), so ingredients
# with a smaller count receive a larger weight.
# NOTE(review): the weight turns negative once count + 1 exceeds n.

ingredient_weights = {
    name: math.log(n / (count + 1))
    for name, count in ingredient_freq
}
print(ingredient_weights)

In [6]:
def create_vector(recipe):
    """Encode a recipe as a vector of ingredient weights over the vocabulary.

    Parameters
    ----------
    recipe : str or iterable of str
        Either the raw comma-separated ingredient text of a recipe (each name
        wrapped in double quotes, as accepted by clean_ingredient), or an
        iterable of plain ingredient names.

    Returns
    -------
    list
        One entry per name in all_ing, in that order: the ingredient's weight
        from ingredient_weights when the recipe contains it, otherwise 0.
    """
    if isinstance(recipe, str):
        # Tokenise the raw text the same way the vocabulary was built. A plain
        # `name in text` test would be a case-sensitive substring match, so
        # 'salt' would hit "unsalted butter" and capitalised names would miss.
        present = {clean_ingredient(part) for part in recipe.split(',')}
    else:
        # Vocabulary names are lower-case, so normalise the given names to match.
        present = {str(item).strip().lower() for item in recipe}
    return [ingredient_weights[name] if name in present else 0 for name in all_ing]

In [7]:
# Encode every recipe, and the user's ingredient list, as weight vectors
recipe_vectors = {}
for idx, ingredients in recipe_ing.items():
    # Look the title up with the recipe's own key rather than a separate counter,
    # so titles stay aligned even if the keys are not exactly 0..n-1.
    title = recipe_dict[idx]
    # Titles are not guaranteed to be unique; tag repeats with the row key so a
    # later recipe does not silently overwrite an earlier one.
    key = title if title not in recipe_vectors else f"{title} (#{idx})"
    recipe_vectors[key] = create_vector(ingredients)
user_vector = create_vector(user_ingredients)

In [9]:
def weighted_cos_sim(a, b, weights=None):
    """Weighted cosine similarity between two equal-length numeric vectors.

    Computes sum(w*a*b) / (sqrt(sum(w*a^2)) * sqrt(sum(w*b^2))).

    Parameters
    ----------
    a, b : iterable of numbers
        The vectors to compare, position by position.
    weights : iterable of numbers, optional
        One weight per position. Defaults to the values of ingredient_weights
        in their dictionary order.

    Returns
    -------
    float
        The weighted cosine similarity, or 0.0 when either weighted squared
        magnitude is not positive (an all-zero vector, or negative weights
        outweighing the positive ones), since the ratio is undefined there.
    """
    if weights is None:
        weights = ingredient_weights.values()

    a = [float(ai) for ai in a]
    b = [float(bi) for bi in b]
    weights = [float(w) for w in weights]

    # Weighted dot product
    dot_product = sum(w * ai * bi for ai, bi, w in zip(a, b, weights))

    # Weighted squared magnitudes; kept un-rooted so the sign can be checked
    # before calling math.sqrt, which raises ValueError on a negative value.
    squared_a = sum(w * ai ** 2 for ai, w in zip(a, weights))
    squared_b = sum(w * bi ** 2 for bi, w in zip(b, weights))

    if squared_a <= 0 or squared_b <= 0:
        return 0.0

    return dot_product / (math.sqrt(squared_a) * math.sqrt(squared_b))

In [None]:
# Score every recipe vector against the user's vector
similarities = {
    title: weighted_cos_sim(user_vector, vec)
    for title, vec in recipe_vectors.items()
}

# Order the (recipe, score) pairs from most to least similar
recipe_scores = sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)

print("Recommendations:")
for recipe, score in recipe_scores:
    print(f"{recipe}: {score:.4f}")