## Setup

In [1]:
import os
# Mount Google Drive at /content/drive (grant the requested permission when prompted)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Home directory for the project: a folder that you create in 'My Drive'.
# The path below assumes the folder is named FoodRecSys; change it if yours differs.
# Data is read from and saved to this folder for the rest of the project.

project_dir = "/content/drive/MyDrive/FoodRecSys/"
os.chdir(project_dir)
os.getcwd()

'/content/drive/MyDrive/FoodRecSys'

In [3]:
try:
  import pyspark
except:
  !pip install pyspark==3.1.2
  import pyspark

Collecting pyspark==3.1.2
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.4/212.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9 (from pyspark==3.1.2)
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880745 sha256=8dc0b2a114565f2183d8f341d0da62ba3cffd40def98fc081248b838e05db197
  Stored in directory: /root/.cache/pip/wheels/ef/70/50/7882e1bcb5693225f7cc86698f10953201b48b3f36317c2d18
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10

In [4]:
try:
  import lenskit
except:
  %pip install lenskit

Collecting lenskit
  Downloading lenskit-0.14.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.0/74.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting binpickle>=0.3.2 (from lenskit)
  Downloading binpickle-0.3.4-py3-none-any.whl (13 kB)
Collecting seedbank>=0.1.0 (from lenskit)
  Downloading seedbank-0.1.2-py3-none-any.whl (7.9 kB)
Collecting csr>=0.3.1 (from lenskit)
  Downloading csr-0.5.0-py3-none-any.whl (25 kB)
Collecting anyconfig (from seedbank>=0.1.0->lenskit)
  Downloading anyconfig-0.13.0-py2.py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.8/87.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: binpickle, anyconfig, seedbank, csr, lenskit
Successfully installed anyconfig-0.13.0 binpickle-0.3.4 csr-0.5.0 lenskit-0.14.2 seedbank-0.1.2


In [3]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

In [4]:
# Start (or reuse) a local Spark session with the Spark UI port set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [5]:
spark

## Imports

In [6]:
# import necessary libraries
import pandas as pd
import numpy as np

In [7]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType,StringType, ArrayType

In [50]:
# Import the required functions for ALS and estimating
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import *

In [9]:
from lenskit import batch, topn, util
from lenskit.algorithms import Recommender, als, item_knn as knn

## Read the data

To use the data files in Google Colab, download them and then upload them to your home folder in Google Drive.

In [10]:
# Parquet files store their own schema, so the CSV-only `header` and
# `inferSchema` options are not passed here.
train_ratings_df = spark.read.parquet('train_interaction_level_df.parquet') # Replace the given path with the path for your file

In [11]:
# Parquet files store their own schema, so the CSV-only `header` and
# `inferSchema` options are not passed here.
test_ratings_all_df = spark.read.parquet('test_interaction_level_df.parquet') # Replace the given path with the path for your file

In [13]:
# Read the recipes CSV: the first row holds the column names and the
# column types are inferred from the data.
raw_recipes_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("RAW_recipes_cleaned.csv") # Replace the given path with the path for your file
)

In [15]:
# Sparsity: the percentage of the user x recipe matrix that has no rating.

# Total number of ratings in the training data
numerator = train_ratings_df.select("rating").count()

# Number of unique users and of unique recipes in the training data (both integers)
num_users = train_ratings_df.select("user_id").distinct().count()
num_recipes = train_ratings_df.select("recipe_id").distinct().count()

# Number of cells in the full matrix: users multiplied by recipes
denominator = num_users * num_recipes

# Share of cells without a rating, expressed as a percentage
sparsity = 100 * (1.0 - float(numerator) / denominator)
print("The training dataframe is ", "%.7f" % sparsity + "% empty.")

The training dataframe is  99.9829264% empty.


## Functions

In [16]:
def manual_recommendation_check(user_id):
  '''
  Display the names of the recipes recommended to a single user.

  Call this function only after the `all_recs_als` pandas dataframe (with its
  `user` and `item` columns) has been calculated; it is looked up at call time.

  Args:
    user_id: ID of the user, as an integer.

  Displays the names (from `raw_recipes_df`) of the recipes recommended to this user.
  Returns nothing.
  '''
  # Bracket indexing is used because attribute access such as `.item` can
  # clash with a pandas attribute of the same name. `tolist()` converts the
  # values to plain Python scalars before they are handed to Spark's `isin`.
  is_user = all_recs_als["user"] == user_id
  recipe_ids = all_recs_als.loc[is_user, "item"].tolist()
  display((raw_recipes_df.filter(F.col('id').isin(recipe_ids))
               .select("name")
               .collect()
               ))

## Model

#### Task 02 - Collaborative Filtering Model

Add the argument details in the algorithm initialization below to build the ALS model.

In [18]:
# Build the (not yet fitted) ALS estimator.
# Note: this assignment rebinds the name `als`, which the lenskit import
# earlier in the notebook also uses.
als = ALS(
    userCol='user_id',      # name of the column for users
    itemCol='recipe_id',    # name of the column for recipes
    ratingCol='rating',     # name of the column for ratings
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",
)

In [30]:
# Check the object itself rather than the string form of its type.
assert isinstance(als, ALS)

#### Training

In [31]:
# Fit the ALS model to the training ratings
model = als.fit(train_ratings_df)

## Prediction

In [32]:
# Use the fitted model to create predictions for the test data
test_predictions_unseen =  model.transform(test_ratings_all_df)

In [33]:
# The transformed test data must contain the `prediction` column.
# (Asserting on a DataFrame object itself is always truthy, so check the column name.)
assert "prediction" in test_predictions_unseen.columns

In [34]:
# Preview the actual rating next to the predicted rating for five test rows.
(test_predictions_unseen
    .select("user_id", "recipe_id", "rating", "prediction")
    .show(5))

+-------+---------+------+----------+
|user_id|recipe_id|rating|prediction|
+-------+---------+------+----------+
| 199020|    55265|     5|  4.395199|
| 369284|    76143|     5|  3.154817|
| 224235|    89385|     5| 4.6746197|
| 385423|    95476|     4| 4.2403584|
| 538098|    95476|     5| 4.5045733|
+-------+---------+------+----------+
only showing top 5 rows



In [35]:
# Each user in the test set must have 10 predictions.
# Use the ALS model to get these predictions via the recommendForAllUsers() method,
# which is asked here for 10 recommendations per user.

recommendations = model.recommendForAllUsers(10)

In [36]:
# Check that the first user received exactly 10 recommendations.
# `first()` fetches a single row instead of collecting the whole dataframe.
assert len(recommendations.select("recommendations").first()[0]) == 10

In [37]:
# Transform the results by exploding the `recommendations` column into one
# row per recommended recipe, keeping its position in the list.
# The guard makes the cell safe to re-run: after the first run the dataframe
# no longer has a `recommendations` column to explode.

if "recommendations" in recommendations.columns:
  recommendations = (recommendations.select(F.col("user_id"),
                                            F.posexplode(F.col("recommendations")).alias("pos", "item"))
                                    .select(F.col("user_id"),
                                            F.col("pos"),
                                            F.col("item.recipe_id").alias("recomended_recipe_id"),
                                            F.col("item.rating").alias("predicted_rating")))

## Evaluation

### Task 04 - Model Evaluation

#### 1. RMSE

Add the argument details to the evaluator function below to calculate the RMSE score of the ALS model.

In [39]:
# RMSE evaluator that compares the actual ratings with the predicted ratings
evaluator_seen = RegressionEvaluator(
    labelCol="rating",           # name of the column that has the ratings
    predictionCol="prediction",  # name of the column that has the predicted ratings
    metricName="rmse",
)

In [41]:
# Evaluate the test-set predictions and print the resulting RMSE value
RMSE = evaluator_seen.evaluate(test_predictions_unseen) # Use the evaluator to find the RMSE on the test set.
print(RMSE)

1.4239452194482718


#### Rank based Metrics

We will use the lenskit library to calculate the ranking-based metrics. The lenskit library works with pandas data frames only, so we need to convert the PySpark data frames to pandas data frames.

In [42]:
# Convert the exploded recommendations from a PySpark dataframe to a pandas dataframe
all_recs_als = recommendations.toPandas()

In [44]:
# Rename the columns to ensure that they match the column names shown in the
# cell below: user, rank, item and score.

column_names = dict(
    user_id="user",
    pos="rank",
    recomended_recipe_id="item",
    predicted_rating="score",
)
all_recs_als = all_recs_als.rename(columns=column_names)

In [45]:
all_recs_als

Unnamed: 0,user,rank,item,score
0,28170,0,252213,5.819049
1,28170,1,237031,5.808922
2,28170,2,114710,5.791225
3,28170,3,108417,5.789297
4,28170,4,227785,5.762231
...,...,...,...,...
233615,1904821,5,9410,0.000000
233616,1904821,6,9970,0.000000
233617,1904821,7,10150,0.000000
233618,1904821,8,11440,0.000000


In [46]:
# Add a column that marks these recommendations as coming from the ALS algorithm.

all_recs_als = all_recs_als.assign(algorithm="ALS")

In [63]:
# Convert the test dataset to pandas and give it the column names shown in the cell below.
# Note that the test data has only three columns; your test data must look identical.
column_names = {"user_id": "user", "recipe_id": "item", "rating": "rating"}
test_data = (
    test_ratings_all_df
    .select(*column_names)
    .toPandas()
    .rename(columns=column_names)
)

In [64]:
test_data

Unnamed: 0,user,item,rating
0,1802380878,35912,0
1,2001602879,40335,5
2,2758877,50348,0
3,199020,55265,5
4,369284,76143,5
...,...,...,...
18655,855082,438292,5
18656,1553277,447699,0
18657,2000072578,447699,5
18658,2775141,469503,5


In [65]:
# Calculate the rank-based metrics with the lenskit library:
# reciprocal rank, nDCG and DCG of the recommendations against the test data.

rla = topn.RecListAnalysis()
for metric in (topn.recip_rank, topn.ndcg, topn.dcg):
  rla.add_metric(metric)
results = rla.compute(all_recs_als, test_data)
results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALS,28170,10,0.0,,
ALS,56680,10,0.0,0.0,0.0
ALS,108460,10,0.0,0.0,0.0
ALS,139830,10,0.0,,
ALS,198430,10,0.0,,


## Manual Prediction Checking

1. Why are ndcg and dcg nulls? EG: user 28170

In [66]:
# Test-set rows for user 28170
test_data.query("user == 28170")

Unnamed: 0,user,item,rating


In [67]:
# Recommendations generated for user 28170
all_recs_als.query("user == 28170")

Unnamed: 0,user,rank,item,score,algorithm
0,28170,0,252213,5.819049,ALS
1,28170,1,237031,5.808922,ALS
2,28170,2,114710,5.791225,ALS
3,28170,3,108417,5.789297,ALS
4,28170,4,227785,5.762231,ALS
5,28170,5,21499,5.693985,ALS
6,28170,6,225884,5.693835,ALS
7,28170,7,94864,5.660673,ALS
8,28170,8,200763,5.65672,ALS
9,28170,9,156550,5.648351,ALS


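User ```28170``` does not appear in the test set, so there are no relevant items to compare the recommendations against. Hence, ndcg and dcg cannot be computed for this user and are null.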

2. Why are all metrics 0 for specific users?

In [68]:
# Test-set rows for user 56680
test_data.query("user == 56680")

Unnamed: 0,user,item,rating
446,56680,229831,5


In [69]:
# Recommendations generated for user 56680
all_recs_als.query("user == 56680")

Unnamed: 0,user,rank,item,score,algorithm
10,56680,0,390157,6.457039,ALS
11,56680,1,227566,6.413482,ALS
12,56680,2,279575,6.310296,ALS
13,56680,3,196603,6.304212,ALS
14,56680,4,395403,6.221063,ALS
15,56680,5,202252,6.221063,ALS
16,56680,6,278733,6.221063,ALS
17,56680,7,294131,6.219316,ALS
18,56680,8,216290,6.199569,ALS
19,56680,9,147459,6.193484,ALS


The user's test-set recipe (229831) does not appear among the 10 recipes recommended to that user, so all metrics are 0.

3. Are there any non-zero metrics?

In [70]:
# Users whose nDCG is greater than zero
results.query("ndcg > 0")

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [71]:
# Users whose DCG is greater than zero
results.query("dcg > 0")

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [72]:
# Users whose reciprocal rank is greater than zero
results.query("recip_rank > 0")

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


The three filters above return no rows: none of the recommended user-recipe combinations has a corresponding entry in the test set. Hence, there are no non-zero values of the evaluation metrics.


Manually, check one prediction.

In [99]:
# Look up the names of the recipes recommended to user 653438 by joining the
# recommendations with the recipe names.
# `F.col` is used, as in the rest of the notebook, so that the cell does not
# depend on the wildcard import of pyspark.sql.functions.
recommendations.filter(F.col("user_id") == 653438).select("recomended_recipe_id")\
              .join(raw_recipes_df.select("id","name").withColumnRenamed("id","recomended_recipe_id"),['recomended_recipe_id'],'inner')\
              .select("name").collect()

[Row(name='wonderful chili'),
 Row(name='bisquick cinnamon raisin biscuits'),
 Row(name='kk  azeri frittata'),
 Row(name='brisket with apricot nectar'),
 Row(name='sweettreats ultimate chocolate chip cookies'),
 Row(name='ground turkey lettuce wraps'),
 Row(name='cheesy corn dog bake'),
 Row(name='triple layer brownie cake'),
 Row(name='buttery baked lamb chops'),
 Row(name='green chilies queso')]

Of the recipes that have been recommended, some appear similar to each other, and some do not.

## Saving the models and predictions

In [102]:
# Save the ALS recommendations as a CSV file, without the pandas index column
all_recs_als.to_csv("recommendation_als.csv", # modify the path
                    index=False)

In [103]:
# Overwrite any previously saved model so that re-running the notebook does
# not fail because the path already exists.
model.write().overwrite().save('ALS_model.model') # modify the path