In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext,Row 
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import concat, udf
from pyspark.ml.linalg import Vectors, VectorUDT

from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from ast import literal_eval
import torch
from transformers import BertModel, BertTokenizer

import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
sc._conf.getAll()

Out[2]: [('spark.app.id', 'app-20230310220301-0000'),
 ('spark.databricks.preemption.enabled', 'true'),
 ('spark.sql.hive.metastore.jars', '/databricks/databricks-hive/*'),
 ('spark.driver.tempDirectory', '/local_disk0/tmp'),
 ('spark.sql.warehouse.dir', 'dbfs:/user/hive/warehouse'),
 ('spark.databricks.managedCatalog.clientClassName',
  'com.databricks.managedcatalog.ManagedCatalogClientImpl'),
 ('spark.databricks.credential.scope.fs.gs.auth.access.tokenProviderClassName',
  'com.databricks.backend.daemon.driver.credentials.CredentialScopeGCPTokenProvider'),
 ('spark.hadoop.fs.fcfs-s3.impl.disable.cache', 'true'),
 ('spark.sql.streaming.checkpointFileManagerClass',
  'com.databricks.spark.sql.streaming.DatabricksCheckpointFileManager'),
 ('spark.databricks.clusterUsageTags.clusterAvailability',
  'SPOT_WITH_FALLBACK'),
 ('spark.databricks.service.dbutils.repl.backend',
  'com.databricks.dbconnect.ReplDBUtils'),
 ('spark.hadoop.databricks.s3.verifyBucketExists.enabled', 'false'),
 ('sp

In [None]:
sc._conf.set('spark.executor.memory','16g')

Out[3]: <pyspark.conf.SparkConf at 0x7f32805f61c0>

In [None]:
sc._conf.get('spark.executor.memory')

Out[4]: '16g'

In [None]:
sc._conf.set('spark.driver.memory','16g')

Out[5]: <pyspark.conf.SparkConf at 0x7f32805f61c0>

In [None]:
sc._conf.get('spark.driver.memory')

Out[6]: '16g'

In [None]:
spark = SparkSession.builder.getOrCreate()

In [None]:
ingredient_df = spark.read.format("mongo").option('uri',f'mongodb+srv://{cluster_detail}/project_db.recipes').load()

In [None]:
ingredient_df.show()

+-------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+------------+--------------------+--------------------+------+----------+----------+--------------------+--------------------+----+--------------------+
|    _id|actual_review_cnt|             details|          directions|           hierarchy|         ingredients|               intro|                name|           nutrition|picture_cnt|publish_date|      publisher_name|       publisher_url|rating|rating_cnt|review_cnt|             reviews|     similar_recipes|text|                 url|
+-------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+------------+--------------------+--------------------+------+----------+----------+--------------------+--------------------+----+-----

Recipe Recommendation System

In [None]:
## Since the ingredient column is made of lists within list, it will be very difficult to use them, therefore creating a user defined function to clean them and merge them

def clean_ingredients(ingredients):
    ingredient = []
    for i in ingredients:
        ingredient.append("".join(i))
    return ",".join(ingredient)

transformation_udf = udf(clean_ingredients, StringType())

ingredient_df = ingredient_df.withColumn("ingredients_clean", transformation_udf("ingredients")).drop("ingredients")

In [None]:
ingredient_df = ingredient_df.withColumn("directions_clean", transformation_udf("directions")).drop("directions")

In [None]:
# Assuming your dataframe is named "df"
ingredient_df = ingredient_df.withColumn("details", col("directions_clean").cast("string"))

In [None]:
#creating dataframe with only the required columns
df_cols = ingredient_df.select('_id','name','details','ingredients_clean')

In [None]:
df_cols = df_cols.filter(df_cols.details.isNotNull())
df_cols = df_cols.filter(df_cols.ingredients_clean.isNotNull())

df_cols.show(5)
                                          

+-------+--------------------+--------------------+--------------------+
|    _id|                name|             details|   ingredients_clean|
+-------+--------------------+--------------------+--------------------+
|6561259|Irish Black Russi...|Half-fill a highb...|1(1.5 fluid ounce...|
|6561268|Sweet Butternut S...|Preheat oven to 3...|1unbaked 9-inch p...|
|6561269|Cheesy Tortilla S...|Melt butter in me...|3tablespoonsbutte...|
|6561273|Bahama Mama I Recipe|In a mixing glass...|¼fluid ouncecoffe...|
|6561278|The Cheesecake Fa...|Combine chicken b...|4skinless, bonele...|
+-------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
df_cols = df_cols.withColumn('main_corpus', concat("details", "ingredients_clean"))

In [None]:
df_cols.show(5)

+-------+--------------------+--------------------+--------------------+--------------------+
|    _id|                name|             details|   ingredients_clean|         main_corpus|
+-------+--------------------+--------------------+--------------------+--------------------+
|6561259|Irish Black Russi...|Half-fill a highb...|1(1.5 fluid ounce...|Half-fill a highb...|
|6561268|Sweet Butternut S...|Preheat oven to 3...|1unbaked 9-inch p...|Preheat oven to 3...|
|6561269|Cheesy Tortilla S...|Melt butter in me...|3tablespoonsbutte...|Melt butter in me...|
|6561273|Bahama Mama I Recipe|In a mixing glass...|¼fluid ouncecoffe...|In a mixing glass...|
|6561278|The Cheesecake Fa...|Combine chicken b...|4skinless, bonele...|Combine chicken b...|
+-------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



Using LABSE sentence transformer

In [None]:
# Used LABSE because this will be useful for multi lingual textual data too
emb_model2 = SentenceTransformer('LABSE')

In [None]:
# udf for getting embeddings
def get_embeddings(str):
    return emb_model2.encode(str, show_progress_bar=True).tolist()

# get the embeddings for the ingredients_str column
get_embeddings_udf = udf(get_embeddings, ArrayType(FloatType()))

In [None]:
df_cols = df_cols.withColumn("main_corpus_emb", get_embeddings_udf("main_corpus"))

In [None]:
df_cols.show(2)

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    _id|                name|             details|   ingredients_clean|         main_corpus|     main_corpus_emb|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+
|6561259|Irish Black Russi...|Half-fill a highb...|1(1.5 fluid ounce...|Half-fill a highb...|[-0.012809866, -0...|
|6561268|Sweet Butternut S...|Preheat oven to 3...|1unbaked 9-inch p...|Preheat oven to 3...|[0.006374121, -0....|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [None]:
df_cols.alias("a").crossJoin(df_cols.alias('h')).printSchema()

root
 |-- _id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- details: string (nullable = true)
 |-- ingredients_clean: string (nullable = true)
 |-- main_corpus: string (nullable = true)
 |-- main_corpus_emb: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- _id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- details: string (nullable = true)
 |-- ingredients_clean: string (nullable = true)
 |-- main_corpus: string (nullable = true)
 |-- main_corpus_emb: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [None]:
df_cols2 = df_cols.alias("a").join(df_cols.alias('h'), col('a._id') == col("h._id") , 'left')

In [None]:
df_cols2 = df_cols.alias("a").crossJoin(df_cols.alias('h')).filter('a._id != h._id')

In [None]:
df_cols2.printSchema()

root
 |-- _id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- details: string (nullable = true)
 |-- ingredients_clean: string (nullable = true)
 |-- main_corpus: string (nullable = true)
 |-- main_corpus_emb: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- _id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- details: string (nullable = true)
 |-- ingredients_clean: string (nullable = true)
 |-- main_corpus: string (nullable = true)
 |-- main_corpus_emb: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [None]:
def get_similarity(a,b, c, d):
    #return cosine_similarity(np.array(x).reshape(1,-1), np.array(y).reshape(1,-1) )
    if c!=d:
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    else:
        return 0.0

  
similarity_udf = udf(get_similarity, FloatType())

In [None]:
#Taking 10 recipes only to check the recommendations given out for them and assess the system's performance
df_cols_tmp = df_cols2.limit(10)

In [None]:
#Self joining the table to get the candidate recipes beside the primary recipes
df_cols_tmp.select('a._id','a.name',col('h.name').alias('candidate_recipe_name'),col('h._id').alias('candidate_recipe_id'),similarity_udf('a.main_corpus_emb', 'h.main_corpus_emb', 'a._id', 'h._id'))\
            .withColumnRenamed('get_similarity(main_corpus_emb, main_corpus_emb, _id, _id)','sim_score')\
            .orderBy('a._id',col("sim_score").desc())\
            .createOrReplaceTempView('tmp_table')
            #.write\
             #.saveAsTable("recos1")

In [None]:

df_cols_tmp.select('a._id','a.name',col('h.name').alias('candidate_recipe_name'),col('h._id').alias('candidate_recipe_id'),similarity_udf('a.main_corpus_emb', 'h.main_corpus_emb', 'a._id', 'h._id'))\
            .withColumnRenamed('get_similarity(main_corpus_emb, main_corpus_emb, _id, _id)','sim_score')\
            .orderBy('a._id',col("sim_score").desc())\
            .printSchema()

root
 |-- _id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- candidate_recipe_name: string (nullable = true)
 |-- candidate_recipe_id: string (nullable = true)
 |-- sim_score: float (nullable = true)



In [None]:
res = spark.sql("SELECT * FROM tmp_table")

In [None]:
res.show(500)

+-------+--------------------+---------------------+-------------------+----------+
|    _id|                name|candidate_recipe_name|candidate_recipe_id| sim_score|
+-------+--------------------+---------------------+-------------------+----------+
|6561259|Irish Black Russi...| Sea Breeze Cockta...|            6561288|0.73596966|
|6561259|Irish Black Russi...| Cold Brewed Coffe...|            6561283| 0.7039512|
|6561259|Irish Black Russi...| Oreo® Cookie Mart...|            6561298|0.69753426|
|6561259|Irish Black Russi...| Bahama Mama I Recipe|            6561273| 0.6591013|
|6561259|Irish Black Russi...| Whole30® Cinnamon...|            6561287| 0.5799921|
|6561259|Irish Black Russi...| Sweet Butternut S...|            6561268|0.53387403|
|6561259|Irish Black Russi...| The Cheesecake Fa...|            6561278| 0.5317645|
|6561259|Irish Black Russi...| Cheesy Tortilla S...|            6561269|0.49933115|
|6561259|Irish Black Russi...| Killer Cranberry ...|            6561291|0.48

With the above example we can see that the primary recipe is a Vodka based beverage with some fruit mix; the recommendations to this kind of recipe are also vodka based and has fruit mixed in it inline with a basic recommendation system like this. 

This recommendation system can be fine tuned by pre-processing the data, including more features and tuning the model however this serves as the base V1 model.

In [None]:
sc.stop()