In [1]:
import pandas as pd
import numpy as np
import gzip
from collections import defaultdict
import math
from scipy import sparse, optimize
from sklearn import svm
import string
import random
import string
from sklearn import linear_model, model_selection, metrics
import os
import matplotlib.pyplot as plt

In [2]:
# load the data
def readFiles(file):
    file_path = os.path.join('data', file)
    return pd.read_csv(file_path, compression='gzip')

recipes = readFiles('RAW_recipes.csv.gz')
interactions = readFiles('RAW_interactions.csv.gz')

In [3]:
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [4]:
interactions.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


# Splitting the Dataset

In [5]:
train, temp = model_selection.train_test_split(interactions, test_size=0.3, random_state=42)
validation, test = model_selection.train_test_split(temp, test_size=0.5, random_state=42)

# Baseline Model

In [6]:
global_avg_rating = train['rating'].mean()

In [7]:
validation['baselinePrediction'] = global_avg_rating
test['baselinePrediction'] = global_avg_rating

In [8]:
baselineRMSE = np.sqrt(metrics.mean_squared_error(validation['rating'], validation['baselinePrediction']))
print(f"Baseline RMSE: {baselineRMSE}")

Baseline RMSE: 1.267727910857182


# Model 1 : Jaccard Similarity for Users

Formula for Jaccard Similarity between users:
$$J(u, v) = \frac{|I_u \cap I_v|}{|I_u \cup I_v|}$$
Here the predicted rating $\hat{r}_{u, i}$ for user $u$ and recipe $i$ is defined such that:

$$ \hat{r}_{u, i}=   \left\{
\begin{array}{ll}
      \frac{\sum_{v \in S(u, \Gamma)} r_{v, i}}{|S(u, \Gamma)|} & \text{if } |S(u, \Gamma)| > 0 \\
      r_{global\_average} & \text{otherwise}\\
\end{array} 
\right.  $$
where \
$S(u, \Gamma) = \{v ∣ J(u,v) > \Gamma\}$ : Set of users similar to user $u$ with Jaccard similarity greater than the threshold $\Gamma$, \
$r_{v, i}$ : Rating given by user $v$ to recipe $i$, \
$r_{global\_average}$ : Global average rating, used as a fallback when no similar users are found.

In [9]:
def jaccardSimilarityUsers(data):
    # Create user-recipe interaction matrix
    interactionMatrix = sparse.csr_matrix(
        (np.ones(len(data)), (data['user_id'], data['recipe_id']))
    )
    
    intersection = interactionMatrix.dot(interactionMatrix.T)
    rowSums = interactionMatrix.sum(axis=1).A1 
    union = rowSums[:, None] + rowSums - intersection
    
    union[union == 0] = 1  
    
    jaccard_sim = intersection / union
    return jaccard_sim, np.unique(data['user_id'])

In [10]:
def predictWithJaccardU(data, jaccard_sim, users, threshold):
    predictions = []
    for _, row in data.iterrows():
        user = row['user_id']
        recipe = row['recipe_id']
        
        # Find similar users
        user_idx = np.where(users == user)[0][0]
        similarUsers = np.where(jaccard_sim[user_idx] > threshold)[0]
        
        # Aggregate ratings from similar users
        similarRatings = train.loc[(train['user_id'].isin(users[similarUsers])) & (train['recipe_id'] == recipe), 'rating']
        
        # Predict average rating if similar users exist, otherwise global average
        if len(similarRatings) > 0:
            predictions.append(similarRatings.mean())
        else:
            predictions.append(global_avg_rating)
    return predictions

In [11]:
# # similarity functions time out for really big datasets

# thresholds = np.linspace(0, 1, 20)
# bestThreshold = None
# bestRMSE = float('inf')

# jaccard_sim, users = jaccardSimilarityUsers(train)
# for threshold in thresholds:
#     validation['model1'] = predictWithJaccardU(validation, jaccard_sim, users, threshold)
#     rmse = np.sqrt(metrics.mean_squared_error(validation['rating'], validation['model1']))
#     if rmse < bestRMSE:
#         bestRMSE = rmse
#         bestThreshold = threshold


In [12]:
# print(f"Best Threshold: {best_threshold}")
# print(f"Validation RMSE with Jaccard Similarity: {best_rmse}")

# Model 2 : User Average Rating

The **User Average Rating Model** predicts a user's rating for a recipe based on their historical average ratings. If a user has no historical ratings, the model falls back to the global average rating.

Here the predicted rating $\hat{r}_{u, i}$ for user $u$ and recipe $i$ is defined such that:
$$ \hat{r}_{u, i}=   \left\{
\begin{array}{ll}
      \frac{\sum_{j \in I_u} r_{u, j}}{|I_u|} & \text{if } |I_u| > 0 \\
      r_{global\_average} & \text{otherwise}\\
\end{array} 
\right.  $$
where \
$I_u$ : Set of recipes rated by user $u$, \
$r_{u, j}$ : Rating given by user $u$ to recipe $j$, \
$r_{global\_average}$ : Global average rating, used as a fallback when no similar users are found.

In [13]:
user_avg_ratings = train.groupby('user_id')['rating'].mean().to_dict()
def model2(data, user_avg_ratings, global_avg_rating):
    predictions = []
    for _, row in data.iterrows():
        user = row['user_id']
        if user in user_avg_ratings:
            predictions.append(user_avg_ratings[user])
        else:
            predictions.append(global_avg_rating)  # Fallback to global average
    return predictions

In [14]:
validation['model2'] = model2(validation, user_avg_ratings, global_avg_rating)

# Calculate RMSE
model2RMSE = np.sqrt(metrics.mean_squared_error(validation['rating'], validation['model2']))
print(f"Validation RMSE with User Average Rating Model: {model2RMSE}")


Validation RMSE with User Average Rating Model: 1.2621231116198637


# Model 3 : Item Average Rating

The **Item Average Rating Model** predicts a user's rating for a recipe based on the recipe's historical average ratings. If a user has no historical ratings, the model falls back to the global average rating.

Here the predicted rating $\hat{r}_{u, i}$ for user $u$ and recipe $i$ is defined such that:
$$ \hat{r}_{u, i}=   \left\{
\begin{array}{ll}
      \frac{\sum_{v \in U_i} r_{v, i}}{|U_i|} & \text{if } |U_i| > 0 \\
      r_{global\_average} & \text{otherwise}\\
\end{array} 
\right.  $$
where \
$U_i$ : Set of users who rated recipe $i$, \
$r_{v, i}$ : Rating given by user $v$ to recipe $i$, \
$r_{global\_average}$ : Global average rating, used as a fallback when no similar users are found.

In [15]:
item_avg_ratings = train.groupby('recipe_id')['rating'].mean().to_dict()
def model3(data, item_avg_ratings, global_avg_rating):
    predictions = []
    for _, row in data.iterrows():
        recipe = row['recipe_id']
        if recipe in item_avg_ratings:
            predictions.append(item_avg_ratings[recipe])
        else:
            predictions.append(global_avg_rating)  # Fallback to global average
    return predictions

In [16]:
validation['model3'] = model3(validation, item_avg_ratings, global_avg_rating)
model3RMSE = np.sqrt(metrics.mean_squared_error(validation['rating'], validation['model3']))
print(f"Validation RMSE with Item Average Rating Model: {model3RMSE}")

Validation RMSE with Item Average Rating Model: 1.3465669342075821


# Model 4 : Stochastic Gradient Descent
We can use SGD to build a simple linear regression model for rating prediction. SGD is particularly effective for large datasets as it processes data in small batches, making it computationally efficient.

The predicted rating is given by:
$$\hat{r}_{u, i} = w_0 + w_u + w_i$$
Where \
$\hat{r}_{u, i}$ : the predicted rating for recipe $i$ given by user $u$, \
$w_0$ : Global bias term, \
$w_u$ : User-specific bias, and \
$w_i$ : Recipe-specific bias.

The objective is to minimize the error:
$$\mathcal{L} = \frac{1}{N} \sum_{(u, i)} (r_{u, i} - \hat{r}_{u, i})^2 + \lambda (w_u^2 + w_i^2)$$
Where \
${r}_{u, i}$ : the true rating
$\lambda$ : regularization term to prevent overfitting.


In [23]:
trainData = train[['user_id', 'recipe_id', 'rating']].values
validationData = validation[['user_id', 'recipe_id', 'rating']].values

learning_rate = 0.01
n_epochs = 20
regularization = 0.01

globalBias = train['rating'].mean()
userBiases = np.zeros(train['user_id'].nunique())
itemBiases = np.zeros(train['recipe_id'].nunique())

# Mapping user_id and recipe_id to indices
userMap = {user_id: idx for idx, user_id in enumerate(train['user_id'].unique())}
itemMap = {recipe_id: idx for idx, recipe_id in enumerate(train['recipe_id'].unique())}

# SGD training loop
for epoch in range(n_epochs):
    np.random.shuffle(trainData)
    for user_id, recipe_id, rating in trainData:
        # Map user and recipe to indices
        user_idx = userMap[user_id]
        recipe_idx = itemMap[recipe_id]

        # Predicted rating
        prediction = globalBias + userBiases[user_idx] + itemBiases[recipe_idx]

        # Error
        error = rating - prediction

        # Update parameters
        globalBias += learning_rate * (error - regularization * globalBias)
        userBiases[user_idx] += learning_rate * (error - regularization * userBiases[user_idx])
        itemBiases[recipe_idx] += learning_rate * (error - regularization * itemBiases[recipe_idx])

    # Calculate training RMSE for monitoring
    train_predictions = [
        globalBias + userBiases[userMap[user_id]] + itemBiases[itemMap[recipe_id]]
        for user_id, recipe_id, rating in trainData
    ]
    trainRMSE = np.sqrt(metrics.mean_squared_error(train['rating'], train_predictions))
    print(f"Epoch {epoch + 1}/{n_epochs} - Training RMSE: {trainRMSE}")

# Validation predictions
validation_predictions = []
for user_id, recipe_id, rating in validationData:
    if user_id in userMap and recipe_id in itemMap:
        prediction = (
            globalBias + userBiases[userMap[user_id]] + itemBiases[itemMap[recipe_id]]
        )
    else:
        prediction = globalBias  # Fallback to global bias for unseen users/recipes
    validation_predictions.append(prediction)

# Validation RMSE
validationRMSE = np.sqrt(metrics.mean_squared_error(validation['rating'], validation_predictions))
print(f"Validation RMSE with SGD: {validationRMSE}")


Epoch 1/20 - Training RMSE: 1.295423845077975
Epoch 2/20 - Training RMSE: 1.3087202980150054
Epoch 3/20 - Training RMSE: 1.3155656316008355
Epoch 4/20 - Training RMSE: 1.3226067062762659
Epoch 5/20 - Training RMSE: 1.3379196055788496
Epoch 6/20 - Training RMSE: 1.3335094855249956
Epoch 7/20 - Training RMSE: 1.3403084158082146
Epoch 8/20 - Training RMSE: 1.3554528745590628
Epoch 9/20 - Training RMSE: 1.3494252222451268
Epoch 10/20 - Training RMSE: 1.353053305066383
Epoch 11/20 - Training RMSE: 1.3611110910797022
Epoch 12/20 - Training RMSE: 1.3655891909609972
Epoch 13/20 - Training RMSE: 1.3654027684731802
Epoch 14/20 - Training RMSE: 1.370371692710648
Epoch 15/20 - Training RMSE: 1.3757079702661907
Epoch 16/20 - Training RMSE: 1.3792976131752819
Epoch 17/20 - Training RMSE: 1.380110986558947
Epoch 18/20 - Training RMSE: 1.3833787848834618
Epoch 19/20 - Training RMSE: 1.3856052972078126
Epoch 20/20 - Training RMSE: 1.39180369154773
Validation RMSE with SGD: 1.2263176800258797


In [26]:
# SGD with early stopping
# Parameters for early stopping
best_validation_rmse = float('inf')
early_stopping_patience = 3  # Number of epochs to wait for improvement
patience_counter = 0

# SGD training loop with early stopping
for epoch in range(n_epochs):
    np.random.shuffle(trainData)
    for user_id, recipe_id, rating in trainData:
        # Map user and recipe to indices
        user_idx = userMap[user_id]
        recipe_idx = itemMap[recipe_id]

        # Predicted rating
        prediction = globalBias + userBiases[user_idx] + itemBiases[recipe_idx]

        # Error
        error = rating - prediction

        # Update parameters
        globalBias += learning_rate * (error - regularization * globalBias)
        userBiases[user_idx] += learning_rate * (error - regularization * userBiases[user_idx])
        itemBiases[recipe_idx] += learning_rate * (error - regularization * itemBiases[recipe_idx])

    # Calculate training RMSE for monitoring
    train_predictions = [
        globalBias + userBiases[userMap[user_id]] + itemBiases[itemMap[recipe_id]]
        for user_id, recipe_id, rating in trainData
    ]
    trainRMSE = np.sqrt(metrics.mean_squared_error(train['rating'], train_predictions))
    print(f"Epoch {epoch + 1}/{n_epochs} - Training RMSE: {trainRMSE}")

    # Validation predictions
    validation_predictions = []
    for user_id, recipe_id, rating in validationData:
        if user_id in userMap and recipe_id in itemMap:
            prediction = (
                globalBias + userBiases[userMap[user_id]] + itemBiases[itemMap[recipe_id]]
            )
        else:
            prediction = globalBias  # Fallback to global bias for unseen users/recipes
        validation_predictions.append(prediction)

    # Calculate validation RMSE
    validationRMSE = np.sqrt(metrics.mean_squared_error(validation['rating'], validation_predictions))
    print(f"Epoch {epoch + 1}/{n_epochs} - Validation RMSE: {validationRMSE}")

    # Early stopping logic
    if validationRMSE < best_validation_rmse:
        best_validation_rmse = validationRMSE
        patience_counter = 0  # Reset counter if improvement is observed
    else:
        patience_counter += 1  # Increment counter if no improvement
        if patience_counter >= early_stopping_patience:
            print(f"Early stopping triggered at epoch {epoch + 1}.")
            break


Epoch 1/20 - Training RMSE: 1.395713026588628
Epoch 1/20 - Validation RMSE: 1.230340252351327
Epoch 2/20 - Training RMSE: 1.401765824876346
Epoch 2/20 - Validation RMSE: 1.2283458185497216
Epoch 3/20 - Training RMSE: 1.4020039494075458
Epoch 3/20 - Validation RMSE: 1.2363307803743135
Epoch 4/20 - Training RMSE: 1.4039563172573775
Epoch 4/20 - Validation RMSE: 1.2309014054306118
Epoch 5/20 - Training RMSE: 1.4085370712374854
Epoch 5/20 - Validation RMSE: 1.238415955097816
Early stopping triggered at epoch 5.


In [28]:
testData = test[['user_id', 'recipe_id', 'rating']].values
test_predictions = []
for user_id, recipe_id, rating in testData:
        if user_id in userMap and recipe_id in itemMap:
            prediction = (
                globalBias + userBiases[userMap[user_id]] + itemBiases[itemMap[recipe_id]]
            )
        else:
            prediction = globalBias  # Fallback to global bias for unseen users/recipes
        test_predictions.append(prediction)


testRMSE = np.sqrt(metrics.mean_squared_error(test['rating'], test_predictions))
print(f"Test RMSE with SGD: {testRMSE}")


Test RMSE with SGD: 1.2395443751582742


# Model 5 : Latent Factor Model

Matrix factorization decomposes the user-item interaction matrix $R$ into two lower-dimensional matrices:
> - $P$ : User latent factor matrix $(m \times k)$
> - $Q$ : Item latent factor matrix $(n \times k)$

The predicted rating is given by:
$$\hat{r}_{u, i} = \mu + b_u + b_i + p_u^Tq_i$$
Where \
$\mu$ : the global average rating, \
$b_u$ : User-specific bias, \
$b_i$ : Recipe-specific bias, \
$p_u$ : Latent factors for $u$, and \
$q_i$ : : Latent factors for $i$

The objective is to minimize the error:
$$\mathcal{L} = \frac{1}{N} \sum_{(u, i)} (r_{u, i} - \hat{r}_{u, i})^2 + \lambda (b_u^2 + b_i^2 + ||p_u||^2 + ||q_i||^2)$$
Where \
${r}_{u, i}$ : the true rating
$\lambda$ : regularization term to prevent overfitting.

In [31]:
# Hyperparameters
n_factors = 1  # Number of latent factors
learning_rate = 0.01
regularization = 0.1
n_epochs = 20

# Initialize parameters
global_avg_rating = train['rating'].mean()
userBiases = np.zeros(train['user_id'].nunique())
itemBiases = np.zeros(train['recipe_id'].nunique())
userFactors = np.random.normal(scale=0.1, size=(train['user_id'].nunique(), n_factors))
itemFactors = np.random.normal(scale=0.1, size=(train['recipe_id'].nunique(), n_factors))

# Mapping user_id and recipe_id to indices
userMap = {user_id: idx for idx, user_id in enumerate(train['user_id'].unique())}
itemMap = {recipe_id: idx for idx, recipe_id in enumerate(train['recipe_id'].unique())}

# Training loop
for epoch in range(n_epochs):
    np.random.shuffle(trainData)
    for user_id, recipe_id, rating in trainData:
        user_idx = userMap[user_id]
        item_idx = itemMap[recipe_id]

        # Predicted rating
        prediction = (
            global_avg_rating
            + userBiases[user_idx]
            + itemBiases[item_idx]
            + np.dot(userFactors[user_idx], itemFactors[item_idx])
        )
        
        # Error
        error = rating - prediction

        # Update parameters
        userBiases[user_idx] += learning_rate * (error - regularization * userBiases[user_idx])
        itemBiases[item_idx] += learning_rate * (error - regularization * itemBiases[item_idx])
        userFactors[user_idx] += learning_rate * (
            error * itemFactors[item_idx] - regularization * userFactors[user_idx]
        )
        itemFactors[item_idx] += learning_rate * (
            error * userFactors[user_idx] - regularization * itemFactors[item_idx]
        )

    # Calculate training RMSE
    train_predictions = [
        global_avg_rating
        + userBiases[userMap[user_id]]
        + itemBiases[itemMap[recipe_id]]
        + np.dot(userFactors[userMap[user_id]], itemFactors[itemMap[recipe_id]])
        for user_id, recipe_id, rating in trainData
    ]
    trainRMSE = np.sqrt(metrics.mean_squared_error(train['rating'], train_predictions))
    print(f"Epoch {epoch + 1}/{n_epochs} - Training RMSE: {trainRMSE}")


Epoch 1/20 - Training RMSE: 1.285052559461166
Epoch 2/20 - Training RMSE: 1.2920047686402045
Epoch 3/20 - Training RMSE: 1.2975353449190212
Epoch 4/20 - Training RMSE: 1.3028104937714868
Epoch 5/20 - Training RMSE: 1.3068558260745509
Epoch 6/20 - Training RMSE: 1.3118444410181027
Epoch 7/20 - Training RMSE: 1.3155176624252511
Epoch 8/20 - Training RMSE: 1.3189610986459341
Epoch 9/20 - Training RMSE: 1.3222005483861614
Epoch 10/20 - Training RMSE: 1.326668150461448
Epoch 11/20 - Training RMSE: 1.3299610419552697
Epoch 12/20 - Training RMSE: 1.3338105219924654
Epoch 13/20 - Training RMSE: 1.3376298012595893
Epoch 14/20 - Training RMSE: 1.3399074675784877
Epoch 15/20 - Training RMSE: 1.3429359181606206
Epoch 16/20 - Training RMSE: 1.3475337977602897
Epoch 17/20 - Training RMSE: 1.3507731828452854
Epoch 18/20 - Training RMSE: 1.3536588850047284
Epoch 19/20 - Training RMSE: 1.3572326646896595
Epoch 20/20 - Training RMSE: 1.3598321844807797


In [32]:
# Validation predictions
validation_predictions = []
for user_id, recipe_id, rating in validationData:
    if user_id in userMap and recipe_id in itemMap:
        prediction = (
            global_avg_rating
            + userBiases[userMap[user_id]]
            + itemBiases[itemMap[recipe_id]]
            + np.dot(userFactors[userMap[user_id]], itemFactors[itemMap[recipe_id]])
        )
    else:
        prediction = global_avg_rating  # Fallback for unseen users/items
    validation_predictions.append(prediction)

# Validation RMSE
validationRMSE = np.sqrt(metrics.mean_squared_error(validation['rating'], validation_predictions))
print(f"Validation RMSE with LFM: {validationRMSE}")


Validation RMSE with LFM: 1.2388656775368099


In [35]:
test_predictions = []
for user_id, recipe_id, rating in testData:
    if user_id in userMap and recipe_id in itemMap:
        prediction = (
            global_avg_rating
            + userBiases[userMap[user_id]]
            + itemBiases[itemMap[recipe_id]]
            + np.dot(userFactors[userMap[user_id]], itemFactors[itemMap[recipe_id]])
        )
    else:
        prediction = global_avg_rating  # Fallback for unseen users/items
    test_predictions.append(prediction)

# Validation RMSE
testRMSE = np.sqrt(metrics.mean_squared_error(test['rating'], test_predictions))
print(f"Test RMSE with LFM: {testRMSE}")


Validation RMSE with LFM: 1.2392408927195446
