## Configure spark session

##### Building a classification model to predict recipe type based on directions
- Parse the directions from the recipe documents, preprocess them, and build a model that predicts the recipe type

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session shared by the rest of the notebook.
# NOTE(review): `cluster_detail` is not defined in the visible cells; it must already
# exist in the kernel before this cell runs -- confirm where it is set.
spark = (
    SparkSession.builder
    .appName("myApp")
    # Maven coordinates of the MongoDB Spark connector package.
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.0")
    # Input and output collections for the connector.
    .config("spark.mongodb.input.uri", f"mongodb+srv://{cluster_detail}/project_db.recipes")
    .config("spark.mongodb.output.uri", f"mongodb+srv://{cluster_detail}/project_db.filtered_recipes")
    .config("spark.network.timeout", "3600s")
    .getOrCreate()
)

In [None]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

## Read from the collection

In [None]:
# Load the `recipes` collection of `project_db` into a DataFrame through the
# MongoDB connector, passing the collection URI explicitly.
df = (
    spark.read
    .format("mongo")
    .option('uri', f'mongodb+srv://{cluster_detail}/project_db.recipes')
    .load()
)

In [None]:
# Print the schema of the loaded recipes to check the available columns and their types.
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- actual_review_cnt: integer (nullable = true)
 |-- details: struct (nullable = true)
 |    |-- Additional Time: string (nullable = true)
 |    |-- Bake Time: string (nullable = true)
 |    |-- Cook Time: string (nullable = true)
 |    |-- Cool Time: string (nullable = true)
 |    |-- Grill Time: string (nullable = true)
 |    |-- Marinate Time: string (nullable = true)
 |    |-- Prep Time: string (nullable = true)
 |    |-- Refrigerate Time: string (nullable = true)
 |    |-- Rest Time: string (nullable = true)
 |    |-- Servings: string (nullable = true)
 |    |-- Stand Time: string (nullable = true)
 |    |-- Total Time: string (nullable = true)
 |    |-- Yield: string (nullable = true)
 |-- directions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hierarchy: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: array (containsNull 

In [None]:
# Preview the loaded rows (show() prints the first 20 rows by default and truncates long values).
df.show()

+-------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+------------+--------------------+--------------------+------+----------+----------+--------------------+--------------------+----+--------------------+
|    _id|actual_review_cnt|             details|          directions|           hierarchy|         ingredients|               intro|                name|           nutrition|picture_cnt|publish_date|      publisher_name|       publisher_url|rating|rating_cnt|review_cnt|             reviews|     similar_recipes|text|                 url|
+-------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+------------+--------------------+--------------------+------+----------+----------+--------------------+--------------------+----+-----

## Filter the data frame and write to a new collection

In [None]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [None]:
# Derive the two modelling columns with built-in Spark SQL expressions instead of a
# Python RDD round-trip:
#   recipe_type - the hierarchy levels joined with "/", leaving out any level equal
#                 to "recipes" (case-insensitive)
#   directions  - all direction steps joined into one space-separated string
# The schema reports both arrays (and their elements) as nullable. Rows missing either
# array are dropped up front because they provide no label or no text; null elements
# inside an array are skipped by filter/concat_ws rather than raising.
# Requires Spark 2.4+ (array_join and the higher-order `filter` SQL function).
df_data = (
    df
    .where("hierarchy IS NOT NULL AND directions IS NOT NULL")
    .selectExpr(
        "array_join(filter(hierarchy, h -> lower(h) != 'recipes'), '/') AS recipe_type",
        "concat_ws(' ', directions) AS directions",
    )
)
df_data.show()

+--------------------+--------------------+
|         recipe_type|          directions|
+--------------------+--------------------+
|Drinks Recipes/Co...|Half-fill a highb...|
|Vegetables/Squash...|Preheat oven to 3...|
|    Everyday Cooking|Melt butter in me...|
|Drinks Recipes/Co...|In a mixing glass...|
|               Salad|Combine chicken b...|
|Drinks Recipes/Co...|Pour coffee groun...|
|Drinks Recipes/Co...|Mix ground coffee...|
|Drinks Recipes/Co...|Combine vodka, cr...|
|Side Dish/Sauces ...|Heat oil in a sau...|
|Drinks Recipes/Co...|Crush one cookie ...|
|Side Dish/Sauces ...|Whisk brown sugar...|
|Drinks Recipes/Co...|Muddle cucumber-f...|
|Salad/Coleslaw Re...|Combine cabbage, ...|
|Drinks Recipes/Ho...|Stir milk powder,...|
|Side Dish/Sauces ...|In a small saucep...|
|Desserts/Frosting...|Place the marshma...|
|Drinks Recipes/Co...|Fill a tall glass...|
|Side Dish/Sauces ...|Combine cranberri...|
|Drinks Recipes/Co...|Combine the cider...|
|Drinks Recipes/Co...|Combine li

In [None]:
# Check that the derived label and text columns are both plain strings.
df_data.printSchema()

root
 |-- recipe_type: string (nullable = true)
 |-- directions: string (nullable = true)



In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer, Tokenizer
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression


In [None]:
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="directions", outputCol="words", pattern="\\W")
# bag of words count
countVectors = CountVectorizer(inputCol="words", outputCol="features", vocabSize=10000, minDF=5)

In [None]:
# Encode each distinct recipe_type string as a numeric class label.
label_stringIdx = StringIndexer(inputCol="recipe_type", outputCol="label")

# Chain tokenisation, bag-of-words counting and label indexing into a single pipeline.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        countVectors,
        label_stringIdx,
    ]
)

# NOTE(review): the pipeline is fitted on the full data set, i.e. before the
# train/test split, so the vocabulary and label index are also learned from rows
# that later end up in the test set -- confirm this is intended.
pipelineFit = pipeline.fit(df_data)
dataset = pipelineFit.transform(df_data)
dataset.show(5)

+--------------------+--------------------+--------------------+--------------------+-----+
|         recipe_type|          directions|               words|            features|label|
+--------------------+--------------------+--------------------+--------------------+-----+
|Drinks Recipes/Co...|Half-fill a highb...|[half, fill, a, h...|(3880,[0,1,2,3,4,...| 14.0|
|Vegetables/Squash...|Preheat oven to 3...|[preheat, oven, t...|(3880,[0,1,2,3,4,...|577.0|
|    Everyday Cooking|Melt butter in me...|[melt, butter, in...|(3880,[0,3,4,5,7,...|  2.0|
|Drinks Recipes/Co...|In a mixing glass...|[in, a, mixing, g...|(3880,[0,2,3,7,8,...| 29.0|
|               Salad|Combine chicken b...|[combine, chicken...|(3880,[0,1,2,3,4,...| 43.0|
+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [None]:
#dataset1 = dataset.select('features','label')
#dataset1.show()

In [None]:
# set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))


Training Dataset Count: 15352
Test Dataset Count: 6533


In [None]:
# Logistic regression with elasticNetParam=0 (pure L2 penalty) at strength 0.3,
# limited to 20 iterations; it reads the default "features" and "label" columns.
lr = LogisticRegression(
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)
lrModel = lr.fit(trainingData)


In [None]:
# Score the held-out test split with the fitted model.
predictions = lrModel.transform(testData)

In [None]:
# Inspect which columns the model's transform step added to the test data.
predictions.printSchema()

root
 |-- recipe_type: string (nullable = true)
 |-- directions: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [None]:
# Compare predicted class indices with the true labels and report plain accuracy
# on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print("Test accuracy:", accuracy)



Test accuracy: 0.14847696311036276
