Note: This recipe recommendation system applies NLP to 125,000 recipes, which requires a large amount of computing resources. We therefore recommend running the notebook in Databricks; with limited computing resources it may run slowly.

#### Read in Recipe data

In [0]:
# DBFS folder that holds the three raw recipe JSON files
RECIPE_DIR = "dbfs:/FileStore/shared_uploads/hui.yun@sjsu.edu"

file1 = f"{RECIPE_DIR}/recipes_raw_nosource_ar.json"
file2 = f"{RECIPE_DIR}/recipes_raw_nosource_epi.json"
file3 = f"{RECIPE_DIR}/recipes_raw_nosource_fn.json"

In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import json
from pyspark.sql.functions import lit
from pyspark.sql.functions import concat_ws

#sc = SparkContext()
#spark = SparkSession.builder.appName("Convert RDD to DataFrame").getOrCreate()

In [0]:
# Read in recipe data
def read_json(f):
    """Load one recipe JSON file into a Spark DataFrame.

    The file at path ``f`` is read whole and parsed as a single JSON object;
    every value of that object becomes one row of the returned DataFrame.
    """
    whole_file = sc.wholeTextFiles(f)
    # Element 1 of each record is what gets parsed as JSON
    parsed = whole_file.map(lambda record: json.loads(record[1]))
    recipes = parsed.flatMap(lambda document: document.values())
    return recipes.toDF()


f1, f2, f3 = [read_json(path) for path in (file1, file2, file3)]

In [0]:
# Stack the three sources into a single DataFrame
df = f1.union(f2)
df = df.union(f3)

Recipe DataFrame Output

In [0]:
# Print a preview of the combined recipe DataFrame
df.show()

+--------------------+--------------------+--------------------+--------------------+
|         ingredients|        instructions|        picture_link|               title|
+--------------------+--------------------+--------------------+--------------------+
|[4 skinless, bone...|Place the chicken...|55lznCYBbs2mT8BTx...|Slow Cooker Chick...|
|[2 (10.75 ounce) ...|In a slow cooker,...|QyrvGdGNMBA2lDdci...|Awesome Slow Cook...|
|[1/2 cup packed b...|Preheat oven to 3...|LVW1DI0vtlCrpAhNS...|Brown Sugar Meatloaf|
|[1 cup butter, so...|Preheat oven to 3...|0SO5kdWOV94j6EfAV...|Best Chocolate Ch...|
|[8 ounces whole w...|Preheat oven to 3...|YCnbhplMgiraW4rUX...|Homemade Mac and ...|
|[2 cups all-purpo...|Preheat oven to 3...|jRnWGDXDdyOg3rta4...| Banana Banana Bread|
|[For potato crust...|Bring a large sau...|aUca10AaD8T2yYvcL...|Chef John's Fishe...|
|[3 cups all-purpo...|Grease and flour ...|YdgEVyLVffZgh9NZP...|Mom's Zucchini Bread|
|[1 1/2 cups butte...|In a large bowl, ...|UrgvDGu4roL

Original Ingredients Format

In [0]:
# Peek at a handful of raw ingredient lists. Only a few rows are fetched: collecting the
# whole column would pull every recipe onto the driver just to print it.
for row in df.select('ingredients').take(10):
    print(row)

Row(ingredients=['4 skinless, boneless chicken breast halves ADVERTISEMENT', '2 tablespoons butter ADVERTISEMENT', '2 (10.75 ounce) cans condensed cream of chicken soup ADVERTISEMENT', '1 onion, finely diced ADVERTISEMENT', '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ADVERTISEMENT', 'ADVERTISEMENT'])
Row(ingredients=['2 (10.75 ounce) cans condensed cream of mushroom soup ADVERTISEMENT', '1 (1 ounce) package dry onion soup mix ADVERTISEMENT', '1 1/4 cups water ADVERTISEMENT', '5 1/2 pounds pot roast ADVERTISEMENT', 'ADVERTISEMENT'])
Row(ingredients=['1/2 cup packed brown sugar ADVERTISEMENT', '1/2 cup ketchup ADVERTISEMENT', '1 1/2 pounds lean ground beef ADVERTISEMENT', '3/4 cup milk ADVERTISEMENT', '2 eggs ADVERTISEMENT', '1 1/2 teaspoons salt ADVERTISEMENT', '1/4 teaspoon ground black pepper ADVERTISEMENT', '1 small onion, chopped ADVERTISEMENT', '1/4 teaspoon ground ginger ADVERTISEMENT', '3/4 cup finely crushed saltine cracker crumbs ADVERTISEMENT', 'ADVERTI

#### Preprocessing and Parsing of Ingredients

In [0]:
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.functions import trim
from pyspark.sql.functions import lower

# Build the cleaned ingredient text as one column expression, then write it back once
cleaned = concat_ws(", ", "ingredients")                # list of strings -> one string
cleaned = regexp_replace(cleaned, "[^a-zA-Z, ]", "")    # keep only letters, commas and spaces
cleaned = regexp_replace(cleaned, r',\s+', ',')         # drop whitespace that follows a comma
cleaned = lower(trim(cleaned))                          # strip both ends, then lower-case

df = df.withColumn("ingredients", cleaned)

In [0]:
# Read in stop words: split every line on commas, trim each entry, drop the empty ones
stopword_rdd = sc.textFile("dbfs:/FileStore/shared_uploads/hui.yun@sjsu.edu/stopwords.text")
sw = [
    entry
    for entry in (
        stopword_rdd
        .flatMap(lambda line: line.split(','))
        .map(lambda entry: entry.strip())
        .collect()
    )
    if entry != ''
]

In [0]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import lower

import re

# Remove stopwords in a single pass. One alternation pattern replaces the per-word
# withColumn loop, which added a projection to the query plan for every stopword, and
# re.escape keeps any regex metacharacter inside a stopword from being interpreted.
if sw:
    stopword_pattern = r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in sw))
    df = df.withColumn('ingredients', regexp_replace('ingredients', stopword_pattern, ''))

In [0]:
from pyspark.sql.functions import split, trim, array_remove, transform, col

# Break the comma-separated text into an array, trim each piece, and discard empty pieces
pieces = transform(split("ingredients", ","), lambda piece: trim(piece))
ingredient_array = array_remove(pieces, '')

df = df.select("title", "instructions", ingredient_array.alias("ingredients"))


In [0]:
from pyspark.sql.functions import col, trim, transform
def _squeeze_spaces(item):
    """Trim one ingredient and collapse each run of whitespace in it to a single space."""
    return regexp_replace(trim(item), r"\s+", " ")


df = df.withColumn("ingredients", transform(col("ingredients"), _squeeze_spaces))

Ingredients Format After Cleaning

In [0]:
# Peek at a handful of cleaned ingredient lists. Only a few rows are fetched: collecting
# the whole column would pull every recipe onto the driver just to print it.
for row in df.select('ingredients').take(10):
    print(row)

Row(ingredients=['chicken breast', 'butter', 'cream chicken soup', 'onion', 'biscuit dough'])
Row(ingredients=['cream mushroom soup', 'onion soup'])
Row(ingredients=['ketchup', 'lean beef', 'milk', 'eggs', 'onion', 'ginger', 'saltine cracker crumbs'])
Row(ingredients=['butter', 'eggs', 'vanilla extract', 'flour', 'chocolate chips', 'walnuts'])
Row(ingredients=['wheat rotini pasta', 'broccoli', 'onion', 'garlic', 'butter', 'flour', 'milk', 'cheddar cheese', 'cream cheese', 'bread crumbs'])
Row(ingredients=['flour', 'butter', 'eggs', 'overripe bananas'])
Row(ingredients=['potato crust', 'russet potatoes', 'chunks', 'butter', 'nutmeg', 'cayenne', 'milk', 'spinach', 'olive', 'spinach leaves', 'sauce', 'butter', 'flour', 'garlic', 'milk', 'lemon zest', 'rest', 'butter', 'cayenne', 'cod fillets', 'lemon', 'chives garnish'])
Row(ingredients=['flour', 'cinnamon', 'eggs', 'vegetable', 'vanilla extract', 'zucchini', 'walnuts'])
Row(ingredients=['butter', 'eggs', 'vanilla extract', 'flour'])
Row(

#### Recipe Recommendation

User input ingredients

In [0]:
def parse_user_ingredients(raw):
    """Turn a comma-separated ingredient string into a clean list of names.

    Each entry is lower-cased and its whitespace is trimmed and collapsed, mirroring the
    lower-casing, trimming and whitespace collapsing applied to the recipe ingredients,
    so exact-match lookups are not defeated by input such as "Eggs" or "eggs,butter".
    Empty entries are dropped.

    Args:
        raw: ingredient names separated by commas.

    Returns:
        List of normalised ingredient names, in input order.
    """
    names = (" ".join(piece.split()).lower() for piece in raw.split(","))
    return [name for name in names if name]


user_input = "eggs, butter, flour, milk"
input_ingredient = parse_user_ingredients(user_input)

1. Word Embeddings Using Word2Vec

In [0]:
from pyspark.sql.functions import array_contains, col
# Keep a recipe when its ingredient list is null or contains at least one requested ingredient
condition = col("ingredients").isNull()
for wanted in input_ingredient:
    condition |= array_contains("ingredients", wanted)

df = df.filter(condition)

In [0]:
from pyspark.ml.feature import Word2Vec, IDF
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import explode, collect_list, udf
from pyspark.sql.types import FloatType

# Fit Word2Vec on the ingredient lists and add the resulting "ingredient_vec" column.
# The fixed seed makes the otherwise randomly initialised training repeatable across runs.
word2Vec = Word2Vec(vectorSize=100, minCount=5, seed=42, inputCol="ingredients", outputCol="ingredient_vec")
model = word2Vec.fit(df)
df = model.transform(df)

2. Using TF-IDF to calculate weights of ingredients

In [0]:
# Fit an IDF model on "ingredient_vec" and append its output as "ingredient_idf".
# NOTE(review): IDF is normally fitted on term-frequency vectors; here its input is the
# Word2Vec output — confirm that this weighting is intended.
idf = IDF().setInputCol("ingredient_vec").setOutputCol("ingredient_idf")
idfModel = idf.fit(df)
df = idfModel.transform(df)

In [0]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

# Define a UDF that combines ingredient vectors with their IDF counterparts
@udf(returnType=VectorUDT())
def weighted_avg_udf(ingredient_vec, ingredient_idf):
    """Combine two parallel lists into a single dense vector.

    Entry ``k`` of ``ingredient_vec`` is multiplied by entry ``k`` of ``ingredient_idf``
    and the products are summed. Despite the name, the sum is not divided by the
    weights, so the result is a weighted sum rather than a normalised average.

    Args:
        ingredient_vec: list of vectors (call sites pass ``collect_list("ingredient_vec")``).
        ingredient_idf: list of weights with at least as many entries as
            ``ingredient_vec`` (call sites pass ``collect_list("ingredient_idf")``).

    Returns:
        ``Vectors.dense`` of the summed products.
    """
    products = [
        ingredient_vec[k] * ingredient_idf[k]
        for k in range(len(ingredient_vec))
    ]
    return Vectors.dense(sum(products))


In [0]:
# Collapse rows that share a title into one recipe: gather their ingredient lists and
# instructions, and fold their vectors into a single "recipe_vec" via weighted_avg_udf
recipe_vec_expr = weighted_avg_udf(
    collect_list("ingredient_vec"),
    collect_list("ingredient_idf"),
)
df = df.groupBy("title").agg(
    collect_list("ingredients").alias("ingredients"),
    collect_list("instructions").alias("instruction"),
    recipe_vec_expr.alias("recipe_vec"),
)

In [0]:
from pyspark.ml.linalg import Vectors, VectorUDT

# Define a UDF to calculate the cosine similarity between two vectors
@udf(returnType=FloatType())
def cosine_similarity_udf(v1, v2):
    """Return the cosine similarity of two equal-length numeric sequences.

    Args:
        v1, v2: objects supporting ``len()`` and integer indexing.

    Returns:
        dot(v1, v2) / (|v1| * |v2|) as a float; 0.0 when either input has zero norm;
        None when either input is null.

    Raises:
        ValueError: if the two inputs have different lengths.
    """
    import math  # local import keeps the function self-contained wherever it runs

    if v1 is None or v2 is None:
        return None

    size = len(v1)
    if size != len(v2):
        raise ValueError("dimension mismatch: {} vs {}".format(size, len(v2)))

    dot = 0.0
    norm1_sq = 0.0
    norm2_sq = 0.0
    for i in range(size):
        a = float(v1[i])
        b = float(v2[i])
        dot += a * b
        norm1_sq += a * a
        norm2_sq += b * b

    denominator = math.sqrt(norm1_sq) * math.sqrt(norm2_sq)
    if denominator == 0.0:
        # A zero vector has no direction. Dividing here would yield NaN (or fail), and
        # Spark orders NaN above every other number, so such rows would rise to the
        # top of a descending similarity ranking.
        return 0.0
    return dot / denominator
  


In [0]:
# Put the user's ingredient list in a one-row DataFrame and run it through the same
# fitted Word2Vec and IDF models that were applied to the recipes
input_df = idfModel.transform(
    model.transform(
        spark.createDataFrame([(input_ingredient,)], ["ingredients"])
    )
)

In [0]:
# Reduce the input row to a single vector with the same UDF that builds the recipe vectors
input_vec_expr = weighted_avg_udf(
    collect_list("ingredient_vec"),
    collect_list("ingredient_idf"),
)
input_vec = input_df.select(input_vec_expr.alias("input_vec")).first()["input_vec"]

3. Calculate the cosine similarity between the input vector and recipe vectors

In [0]:
# Calculate the cosine similarity between the input vector and recipe vectors

from pyspark.sql.functions import lit
# Turn the input vector into a literal column and score every recipe vector against it
input_list = input_vec.tolist()
input_col = lit(input_list)

df = df.withColumn(
    "similarity",
    cosine_similarity_udf(col("recipe_vec"), input_col),
)



4. Top 5 recipes based on similarity score

In [0]:
# Sort by descending similarity score and keep the five best matches
ranked = df.orderBy("similarity", ascending=False)
top_recs = ranked.select("title", "ingredients", "instruction").limit(5)

In [0]:
# Show the top recommendations; truncate=False is passed so long cell values are not shortened
top_recs.show(truncate=False)

+--------------------------------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------