In [3]:
import os
print(os.getcwd())
os.chdir(r'D:\UKW_work\code\recipe_recommender_system')
print(os.getcwd())

import pandas as pd
import numpy as np
np.random.seed(42)

D:\UKW_work\code\recipe_recommender_system
D:\UKW_work\code\recipe_recommender_system


In [2]:
from baseline.config import DATA_DIR
from baseline.config import RAW_recepies_path, RAW_interactions_path
from baseline.config import PP_users_path, PP_recipes_path
from baseline.config import  token_interactions_train_path
from baseline.config import  token_interactions_val_path, token_interactions_test_path


In [4]:
train_df = pd.read_csv(token_interactions_train_path)
val_df = pd.read_csv(token_interactions_val_path)
test_df = pd.read_csv(token_interactions_test_path)



In [5]:
print(train_df['user_id'].nunique(), train_df['recipe_id'].nunique(), train_df.shape)
train_df.columns

25076 160901 (698901, 6)


Index(['user_id', 'recipe_id', 'date', 'rating', 'u', 'i'], dtype='object')

In [6]:

# ---------------------------
# 1. Determine number of users and items
# ---------------------------
# We assume that the mapping in 'u' and 'i' is consistent across all datasets.
num_users = train_df['u'].max() + 1  # Contiguous integer mapping starting at 0
num_items = train_df['i'].max() + 1

print('num_users:',  num_users)
print('num_items:',  num_items)

num_users: 25076
num_items: 178263


In [None]:
# ---------------------------
# 2. Set Hyperparameters and Initialize Latent Factors
# ---------------------------
latent_dim = 3

# Randomly initialize latent factors for users and items.
user_factors = np.random.normal(scale=0.1, size=(num_users, latent_dim))
item_factors = np.random.normal(scale=0.1, size=(num_items, latent_dim))



In [10]:
# ---------------------------
# 3. Define Prediction and RMSE Functions
# ---------------------------
def predict(u_idx, i_idx):
    """
    Predict the rating for a given user and item by taking the dot product 
    of the corresponding latent factors.
    """
    return np.dot(user_factors[u_idx], item_factors[i_idx])

def compute_rmse(df):
    """
    Compute RMSE over a DataFrame containing (u, i, rating) columns.
    """
    errors = []
    for _, row in df.iterrows():
        u_idx = row['u']
        i_idx = row['i']
        rating = row['rating']
        pred = predict(u_idx, i_idx)
        errors.append((rating - pred)**2)
    mse = np.mean(errors)
    return np.sqrt(mse)

In [20]:

# ---------------------------
# 3. Define Prediction and RMSE Functions
# ---------------------------
# def predict(user, item):
#     """Predict the rating by taking the dot product of user and item latent factors."""
#     return np.dot(user_factors[user], item_factors[item])


def predict(user, item):
    """Predict the rating by taking the dot product of user and item latent factors."""
    if user >= user_factors.shape[0] or item >= item_factors.shape[0]:
        return np.mean(user_factors)  # Return average rating if index out of bounds
    return np.dot(user_factors[user], item_factors[item])  # Ensure valid indexing

# def compute_rmse(data):
#     """Compute RMSE for a given dataset (list of tuples: user, item, rating)."""
#     squared_errors = [(rating - predict(user, item))**2 for user, item, rating in data]
#     mse = np.mean(squared_errors)
#     return np.sqrt(mse)


def compute_rmse(df):
    """
    Compute RMSE over a DataFrame with extra columns.
    Uses the mapped user ('u') and item ('i') indices along with the 'rating'.
    """
    errors = []
    for _, row in df.iterrows():
        u_idx = row['u']
        i_idx = row['i']
        rating = row['rating']
        pred = predict(u_idx, i_idx)
        errors.append((rating - pred) ** 2)
    mse = np.mean(errors)
    return np.sqrt(mse)



In [21]:
# ---------------------------
# 4. Training Loop with Validation
# ---------------------------

train_data = train_df.copy()
val_data = val_df.copy()
test_data = test_df.copy()

learning_rate = 0.005
num_epochs = 1

for epoch in range(num_epochs):
    # Shuffle training data at the start of each epoch
    train_data_shuffled = train_data.sample(frac=1).reset_index(drop=True)
    
    # Process each training example
    for _, row in train_data_shuffled.iterrows():
        u_idx = row['u']
        i_idx = row['i']
        rating = row['rating']
        
        # Compute prediction and error
        pred = predict(u_idx, i_idx)
        error = rating - pred
        
        # Compute gradients (derivative of squared error)
        grad_u = -2 * error * item_factors[i_idx]
        grad_i = -2 * error * user_factors[u_idx]
        
        # Update latent factors using gradient descent
        user_factors[u_idx] -= learning_rate * grad_u
        item_factors[i_idx] -= learning_rate * grad_i
    
    # Compute RMSE for training, validation, and test datasets
    train_rmse = compute_rmse(train_data)
    val_rmse = compute_rmse(val_data)
    test_rmse = compute_rmse(test_data)
    
    # Print intermediate results for this epoch
    print(f"Epoch {epoch+1}/{num_epochs} - Train RMSE: {train_rmse:.4f} | Val RMSE: {val_rmse:.4f} | Test RMSE: {test_rmse:.4f}")

Epoch 1/1 - Train RMSE: 1.8648 | Val RMSE: 4.4320 | Test RMSE: 4.4220


In [23]:
user_id = 8937
item_id  = 44551

predicted_rating = np.dot(user_factors[user_id], item_factors[item_id])
predicted_rating

np.float64(4.015899425069802)

# BAYESIAN AVERAGE Rating

In [None]:
def add_bayesian_average(df, m=5):
    """
    Adds a 'bayesian_avg' column to the DataFrame.
    
    Parameters:
      df (pd.DataFrame): DataFrame with columns ['user_id', 'recipe_id', 'date', 'rating', 'u', 'i']
      m (float): Hyperparameter controlling the weight of the prior.
      
    Returns:
      pd.DataFrame: The original DataFrame with an added 'bayesian_avg' column.
    """
    # Compute the overall average rating (C)
    C = df['rating'].mean()
    
    # Group by recipe_id to get the number of ratings (v) and average rating (R) per recipe
    grouped = df.groupby('recipe_id').agg(
        count=('rating', 'count'),
        mean_rating=('rating', 'mean')
    ).reset_index()
    
    # Compute Bayesian average rating for each recipe
    grouped['bayesian_avg'] = (grouped['count'] * grouped['mean_rating'] + m * C) / (grouped['count'] + m)
    
    # Merge the Bayesian average back to the original DataFrame
    df = df.merge(grouped[['recipe_id', 'bayesian_avg']], on='recipe_id', how='left')
    return df

In [24]:
train_data.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [26]:
# Add the Bayesian average rating as a new column
train_data = add_bayesian_average(train_data, m=5)
train_data.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i,bayesian_avg
0,2046,4684,2000-02-25,5.0,22095,44367,4.899279
1,2046,517,2000-02-25,5.0,22095,87844,4.695778
2,1773,7435,2000-03-13,5.0,24732,138181,4.462348
3,1773,278,2000-03-13,4.0,24732,93054,4.462968
4,2046,3431,2000-04-07,5.0,22095,101723,4.759265


In [29]:
# Add the Bayesian average rating as a new column
val_data = add_bayesian_average(val_data, m=5)
test_data = add_bayesian_average(test_data, m=5)
test_data.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i,bayesian_avg
0,8937,44551,2005-12-23,4.0,2,173538,4.177573
1,56680,126118,2006-10-07,4.0,16,177847,4.177573
2,349752,219596,2008-04-12,0.0,26,89896,3.510906
3,628951,82783,2007-11-13,2.0,45,172637,3.844239
4,92816,435013,2013-07-31,3.0,52,177935,4.010906


In [None]:
# ---------------------------
# 2. Set Hyperparameters and Initialize Latent Factors
# ---------------------------
latent_dim = 3

# Randomly initialize latent factors for users and items.
user_factors = np.random.normal(scale=0.1, size=(num_users, latent_dim))
item_factors = np.random.normal(scale=0.1, size=(num_items, latent_dim))

In [27]:

# ---------------------------
# 3. Define Prediction and RMSE Functions
# ---------------------------

def predict(user, item):
    """Predict the rating by taking the dot product of user and item latent factors."""
    if user >= user_factors.shape[0] or item >= item_factors.shape[0]:
        return np.mean(user_factors)  # Return average rating if index out of bounds
    return np.dot(user_factors[user], item_factors[item])  # Ensure valid indexing



def compute_rmse(df):
    """
    Compute RMSE over a DataFrame with extra columns.
    Uses the mapped user ('u') and item ('i') indices along with the 'rating'.
    """
    errors = []
    for _, row in df.iterrows():
        u_idx = row['u']
        i_idx = row['i']
        rating = row['bayesian_avg']
        pred = predict(u_idx, i_idx)
        errors.append((rating - pred) ** 2)
    mse = np.mean(errors)
    return np.sqrt(mse)



In [30]:
# ---------------------------
# 4. Training Loop with Validation
# ---------------------------

learning_rate = 0.005
num_epochs = 1

for epoch in range(num_epochs):
    # Shuffle training data at the start of each epoch
    train_data_shuffled = train_data.sample(frac=1).reset_index(drop=True)
    
    # Process each training example
    for _, row in train_data_shuffled.iterrows():
        u_idx = row['u']
        i_idx = row['i']
        rating = row['bayesian_avg']
        
        # Compute prediction and error
        pred = predict(u_idx, i_idx)
        error = rating - pred
        
        # Compute gradients (derivative of squared error)
        grad_u = -2 * error * item_factors[i_idx]
        grad_i = -2 * error * user_factors[u_idx]
        
        # Update latent factors using gradient descent
        user_factors[u_idx] -= learning_rate * grad_u
        item_factors[i_idx] -= learning_rate * grad_i
    
    # Compute RMSE for training, validation, and test datasets
    train_rmse = compute_rmse(train_data)
    val_rmse = compute_rmse(val_data)
    test_rmse = compute_rmse(test_data)
    
    # Print intermediate results for this epoch
    print(f"Epoch {epoch+1}/{num_epochs} - Train RMSE: {train_rmse:.4f} | Val RMSE: {val_rmse:.4f} | Test RMSE: {test_rmse:.4f}")

Epoch 1/1 - Train RMSE: 1.2859 | Val RMSE: 4.2448 | Test RMSE: 4.2237


In [31]:
user_id = 8937
item_id  = 44551

predicted_rating = np.dot(user_factors[user_id], item_factors[item_id])
predicted_rating

np.float64(5.252547005635863)

In [32]:
train_data['bayesian_avg'].max(), train_data['bayesian_avg'].min()

(np.float64(4.965089335455729), np.float64(2.3225374552332876))