<h1>Cooking Time Prediction from Cooking Directions</h1>

In [None]:
%pip install mlflow

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext,Row 
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressionModel

In [None]:
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

In [None]:
# Show the executor and driver memory values currently recorded on the SparkContext configuration.
# NOTE(review): `sc` is not created in any visible cell; presumably the kernel/runtime injects it —
# confirm, otherwise this cell raises NameError on a fresh kernel.
sc._conf.get('spark.executor.memory'),sc._conf.get('spark.driver.memory')

In [None]:
# Record 64g for both executor and driver memory on the SparkContext's (private) configuration object.
# NOTE(review): this edits `_conf` of an already-running context; memory settings are normally read when
# the JVM/context starts, so this probably does not resize the running application — confirm, and
# consider passing these values via SparkSession.builder.config(...) or spark-submit instead.
sc._conf.set('spark.executor.memory','64g'),sc._conf.set('spark.driver.memory','64g')

In [None]:
# Reuse the active SparkSession if there is one, otherwise build a new one with default settings.
spark = SparkSession.builder.getOrCreate()
spark  # last expression: render the session summary as the cell output

<h4>Reading the data from MongoDB Atlas</h4>

In [None]:
# Load the `recipes` collection of `project_db` from MongoDB Atlas through the "mongo" Spark data source.
# NOTE(review): `cluster_detail` is not defined in any visible cell, so this fails on a fresh kernel.
# It must be supplied before this cell runs — preferably from an environment variable or a secrets
# store rather than hard-coded, since the URI normally embeds credentials.
df = spark.read.format("mongo").option('uri',f'mongodb+srv://{cluster_detail}/project_db.recipes').load()
df.count()  # number of documents loaded, shown as the cell output

In [None]:
df.printSchema() # Print the schema inferred for the loaded collection

In [None]:
df.select('directions').show(1,False) # Untruncated sample of `directions`; this is the text we have to embed

<h4>Explode the directions array into one row per step, then use a regex to remove punctuation while keeping fractions such as 1/2 and 1/4</h4>

In [None]:
# Numbers and fractions are kept because they can indicate how long a recipe takes.
# explode() yields one output row per element of the `directions` array.
df1 = df.withColumn('directions', explode('directions'))

# Raw string: the regex escapes (\w, \s, \d, \/) are passed to Spark exactly as written instead of
# relying on Python leaving unknown escape sequences intact (which emits a warning on recent versions).
# The pattern deletes
#   * every character that is not a word character, whitespace or '/', and
#   * every '/' that has no digit directly before it and no digit directly after it,
# so fractions such as 1/2 survive while a slash between words is dropped.
df_time = df1.select(
    'name',
    'details.Total Time',
    regexp_replace("directions", r"[^\w\s/]|(?<!\d)\/(?!\d)", "").alias('cooking_steps'),
)
df_time.select('cooking_steps').show(3,False)

<h4>Using the MiniLM-based Sentence Transformer model (all-MiniLM-L6-v2) to embed each direction string</h4>

In [None]:
# Sentence-embedding model; it is referenced from inside the UDF below.
emb_model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embeddings(str):
    """Return the sentence embedding of one text value as a plain Python list.

    Args:
        str: Text to embed, or None for a null cell. (The parameter name shadows
            the built-in `str`; it is kept unchanged so existing callers still work.)

    Returns:
        The embedding converted with `.tolist()`, or None when the input is None
        so that null cells stay null instead of failing the Spark task.
    """
    if str is None:
        return None
    # The progress bar is disabled: the UDF runs once per row, so a bar per call only floods the logs.
    return emb_model.encode(str, show_progress_bar=False).tolist()

# UDF wrapper used to embed the cooking_steps column; each result is an array of floats.
get_embeddings_udf = udf(get_embeddings, ArrayType(FloatType()))

In [None]:
# Rename the selected `details.Total Time` field to a snake_case name and add the embedding column.
# The source name is spelled exactly as it was selected ('Total Time'), so the rename does not
# depend on Spark's case-insensitive column resolution; withColumnRenamed is a silent no-op when
# the name does not resolve.
df_time = df_time.withColumnRenamed('Total Time','total_time').withColumn('cooking_steps_emb',get_embeddings_udf("cooking_steps"))
df_time.show(1) # The embedded directions appear in the last column

<h4>We convert each embedding array into a dense Spark ML vector, so every row carries a vector of the same dimension with all of its values stored explicitly</h4>

In [None]:
# Turn the embedding array into a dense Spark ML vector (every value stored explicitly).
def get_dense_vector(embeddings):
    """Wrap a sequence of embedding values in a dense Spark ML vector."""
    dense = Vectors.dense(embeddings)
    return dense

# UDF form of the conversion; the result column has the Spark ML vector type.
get_dense_vector_udf = udf(f=get_dense_vector, returnType=VectorUDT())

In [None]:
# Keep only the rows that have a label: drop every row whose total_time is null.
df_time = df_time.where(df_time['total_time'].isNotNull())

<h4>We convert the label column, total_time, from days, hours and minutes to minutes only</h4>

In [None]:
def label_to_minutes(label):
    """Convert a duration label such as "1 day 2 hrs 30 mins" into whole minutes.

    The label is read as whitespace-separated "<integer> <unit>" pairs. Units are
    matched case-insensitively by prefix: "min…" (minutes), "hr…"/"hour…" (hours)
    and "day…" (days). Pairs with any other unit contribute nothing to the total.

    Args:
        label: The duration text, or None for a null cell.

    Returns:
        Total minutes as an int (0 for an empty string), or None when `label` is
        None so that null cells stay null when this runs as a Spark UDF.

    Raises:
        ValueError: If the label does not consist of complete value/unit pairs,
            or a value is not an integer.
    """
    if label is None:
        return None

    parts = label.split()
    if len(parts) % 2:
        # A dangling token (e.g. "30" or "1 hr 30") previously surfaced as an opaque IndexError.
        raise ValueError(f"duration label must be '<number> <unit>' pairs, got {label!r}")

    total_minutes = 0
    for raw_value, raw_unit in zip(parts[0::2], parts[1::2]):
        value = int(raw_value)  # raises ValueError for non-integer values
        unit = raw_unit.lower()
        if unit.startswith(("hr", "hour")):
            total_minutes += value * 60
        elif unit.startswith("day"):
            total_minutes += value * 24 * 60
        elif unit.startswith("min"):
            total_minutes += value
        # Any other unit is ignored.
    return total_minutes

# Declare the return type explicitly: without it udf() defaults to a string result, and the
# integer minutes would be stringified before being cast back to int downstream.
label_to_minutes_udf = udf(label_to_minutes, IntegerType())

In [None]:
# Step 1: add the dense-vector version of the embedding column.
with_dense = df_time.withColumn('cooking_steps_dense', get_dense_vector_udf('cooking_steps_emb'))
# Step 2: derive the label (minutes) from total_time.
with_label = with_dense.withColumn('label', label_to_minutes_udf('total_time'))
# Step 3: make sure the label column is an integer column.
df_time_new = with_label.withColumn("label", col("label").cast("int"))
df_time_new.printSchema()

<h4>We use VectorAssembler to get our features ready and fit in RandomForestRegressor.</h4>
We used 50 trees with a maximum depth of 15 to fit the model (these values were found using cross-validation, which was removed from the notebook due to limited processing power).

In [None]:
# Assemble the dense embedding into the `features` column and keep only what the model needs.
assembler = VectorAssembler(inputCols=["cooking_steps_dense"], outputCol="features")
df_vector = assembler.transform(df_time_new).select("features", "label")

# 80/20 train/validation split with a fixed seed (1).
featureSet = df_vector.randomSplit([0.8, 0.2], 1)
feature_train = featureSet[0].repartition(50)
feature_valid = featureSet[1].repartition(50)

# Cache BOTH splits. Previously feature_train.cache() was called twice and the validation split
# was never cached.
feature_train.cache()
feature_valid.cache()

In [None]:
# Check the training split schema: it should contain only the `features` and `label` columns selected above.
feature_train.printSchema()

In [None]:
# #NULL Check
# feature_train.filter(feature_train.features.isNull()).count(),feature_train.filter(feature_train.label.isNull()).count(),feature_valid.filter(feature_valid.features.isNull()).count(),feature_valid.filter(feature_valid.label.isNull()).count()

In [None]:
# Random forest regressor: 50 trees, maximum depth 15, fixed seed; the column names are spelled out
# (they equal the estimator defaults).
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="label",
    numTrees=50,
    maxDepth=15,
    seed=42,
)
# Train on the training split.
rfmodel = rf.fit(feature_train)

In [None]:
# Score the validation split with the fitted model.
predictions = rfmodel.transform(feature_valid)
predictions.cache() # Mark the predictions for caching so later cells can reuse them

In [None]:
# Inspect the columns of the scored validation data.
predictions.printSchema()

In [None]:
# Evaluator that compares `prediction` against `label` using the R2 metric.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="r2",
)

# Compute R2 on the validation predictions and report it.
r2 = evaluator.evaluate(predictions)
print("R-squared (R2) on validation data = %g" % r2)

In [None]:
# Stop the SparkSession now that the analysis is finished.
spark.stop()