# GPT subword tokenizer with start-of-step

In [7]:
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
import hvplot.pandas  # Provides hvplot accessor to Pandas DataFrames
import warnings
warnings.filterwarnings(action='once')

DATA_DIR = Path(r"D:\UKW_work\code\recipe_recommender_system\data\food_com_GeniusKitchen")

In [16]:
import pandas as pd
import numpy as np
import ast
import torch
from scipy.sparse import csr_matrix
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel

In [2]:
recepies_token = pd.read_csv( DATA_DIR / 'PP_recipes.csv')
recepies_token_subset = recepies_token.head(10)  
recepies_token_subset.to_csv(r'D:\UKW_work\code\recipe_recommender_system\temp\recepies_token_subset.csv',index= False)

In [3]:
users_token = pd.read_csv( DATA_DIR / 'PP_users.csv')
users_token_subset = users_token.head(10)  
users_token_subset.to_csv(r'D:\UKW_work\code\recipe_recommender_system\temp\users_token_subset.csv',index= False)

In [9]:
#########################################
# 1. Load and Explore the Data Files
#########################################
# Load PP_users.csv and PP_recipes.csv (adjust file paths if necessary)
users_df = pd.read_csv( DATA_DIR / 'PP_users.csv')
recipes_df = pd.read_csv( DATA_DIR / 'PP_recipes.csv')

In [10]:
# Display column names and shapes
print("PP_users.csv columns:", users_df.columns.tolist())
print("PP_users.csv shape:", users_df.shape)
print("PP_recipes.csv columns:", recipes_df.columns.tolist())
print("PP_recipes.csv shape:", recipes_df.shape)

PP_users.csv columns: ['u', 'techniques', 'items', 'n_items', 'ratings', 'n_ratings']
PP_users.csv shape: (25076, 6)
PP_recipes.csv columns: ['id', 'i', 'name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'calorie_level', 'ingredient_ids']
PP_recipes.csv shape: (178265, 8)


In [11]:
#########################################
# 2. Process PP_users.csv into Interaction Data
#########################################
# Expected columns in PP_users.csv:
#   u               : User ID (mapped to contiguous integers)
#   items           : Recipes interacted with (aggregated; stored as a list in string form)
#   ratings         : Ratings given to each recipe (list in string form)
#   techniques, n_items, n_ratings are also available but not used directly for recommendations.
# Parse the 'items' and 'ratings' columns from string to Python lists.
users_df['items'] = users_df['items'].apply(ast.literal_eval)
users_df['ratings'] = users_df['ratings'].apply(ast.literal_eval)


In [18]:
# Build an "exploded" interactions DataFrame: one row per (user, recipe, rating)
interaction_list = []
for _, row in users_df.iterrows():
    user_id = row['u']
    for recipe_id, rating in zip(row['items'], row['ratings']):
        interaction_list.append({'user_id': user_id, 'recipe_id': recipe_id, 'rating': rating})
interactions_df = pd.DataFrame(interaction_list)
print("\nSample interactions:")
print(interactions_df.head())
print("Total interactions:", interactions_df.shape[0])


Sample interactions:
   user_id  recipe_id  rating
0        0       1118     5.0
1        0      27680     5.0
2        0      32541     5.0
3        0     137353     5.0
4        0      16428     5.0
Total interactions: 698901


In [19]:
#########################################
# 3. Build a Sparse User–Item Rating Matrix
#########################################
# Determine the number of users and recipes.
n_users = interactions_df['user_id'].max() + 1
n_recipes = interactions_df['recipe_id'].max() + 1
print(f"\nNumber of users: {n_users}, Number of recipes: {n_recipes}")

# Create arrays for constructing a sparse matrix.
row_indices = interactions_df['user_id'].values
col_indices = interactions_df['recipe_id'].values
ratings = interactions_df['rating'].values

# Build a CSR (Compressed Sparse Row) matrix.
user_item_sparse = csr_matrix((ratings, (row_indices, col_indices)), shape=(n_users, n_recipes))
print("User-Item Sparse Matrix shape:", user_item_sparse.shape)


Number of users: 25076, Number of recipes: 178263
User-Item Sparse Matrix shape: (25076, 178263)


In [20]:
#########################################
# 4. Collaborative Filtering: On-The-Fly Similarity
#########################################
def recommend_collaborative(user_id, top_k=5):
    """
    Recommend recipes for a given user using collaborative filtering.
    Computes cosine similarity on the fly between the target user's vector and all user vectors.
    Then, aggregates ratings from similar users (weighted by similarity) to score recipes.
    Excludes recipes already rated by the target user.
    """
    # Get target user's vector (1 x n_recipes)
    user_vec = user_item_sparse.getrow(user_id)
    # Compute cosine similarity between this user and all users.
    # This returns a dense array of shape (1, n_users)
    sim_scores = cosine_similarity(user_vec, user_item_sparse).flatten()
    
    # Compute a weighted score for each recipe.
    # user_item_sparse.T has shape (n_recipes x n_users); dot with sim_scores gives (n_recipes,)
    weighted_scores = user_item_sparse.T.dot(sim_scores)
    # Normalize by the sum of similarity scores.
    weighted_scores = weighted_scores / (sim_scores.sum() + 1e-8)
    
    # Exclude recipes already rated by the user.
    user_rated = set(interactions_df[interactions_df['user_id'] == user_id]['recipe_id'])
    candidate_indices = [i for i in range(n_recipes) if i not in user_rated]
    
    # Get scores for candidates.
    candidate_scores = {recipe: weighted_scores[recipe] for recipe in candidate_indices}
    # Sort recipes by score (descending) and return top_k.
    recommended = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return recommended

In [21]:
#########################################
# 5. Process PP_recipes.csv for Content-Based Filtering
#########################################
# Expected columns in PP_recipes.csv:
#  - id               : Recipe ID
#  - name_tokens      : BPE-tokenized recipe name (string representation of list)
#  - ingredient_tokens: BPE-tokenized ingredients (string representation of a list of lists)
#  - steps_tokens     : BPE-tokenized steps (string representation of list)

# Parse the tokenized fields.
recipes_df['name_tokens'] = recipes_df['name_tokens'].apply(ast.literal_eval)
recipes_df['ingredient_tokens'] = recipes_df['ingredient_tokens'].apply(ast.literal_eval)
recipes_df['steps_tokens'] = recipes_df['steps_tokens'].apply(ast.literal_eval)

In [23]:
# Combine the tokens into one text for each recipe.
def combine_recipe_text(row):
    # Convert each token in name_tokens to a string.
    name_text = " ".join([str(token) for token in row['name_tokens']])
    
    # For ingredient_tokens, each element is a list; convert inner tokens to string.
    ingredient_text = " ".join([" ".join([str(tok) for tok in lst]) for lst in row['ingredient_tokens']])
    
    # Convert each token in steps_tokens to a string.
    steps_text = " ".join([str(token) for token in row['steps_tokens']])
    
    return name_text + " " + ingredient_text + " " + steps_text

recipes_df['combined_text'] = recipes_df.apply(combine_recipe_text, axis=1)
print("\nSample combined recipe text:")
print(recipes_df[['id', 'combined_text']].head())



Sample combined recipe text:
       id                                      combined_text
0  424415  40480 37229 2911 1019 249 6878 6878 2839 1781 ...
1  146223  40480 18376 7056 246 1531 2032 40481 17918 259...
2  312329  40480 21044 16954 8294 556 10837 40481 5867 24...
3   74301  40480 10025 31156 40481 1270 1645 28447 21601 ...
4   76272  40480 17841 252 782 2373 1641 2373 252 40481 1...


In [24]:
#########################################
# 6. Compute Recipe Embeddings Using OpenAIGPT
#########################################
# Initialize the GPT tokenizer and model.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTModel.from_pretrained('openai-gpt')
model.eval()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


OpenAIGPTModel(
  (tokens_embed): Embedding(40478, 768)
  (positions_embed): Embedding(512, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x Block(
      (attn): Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
  )
)

In [25]:
def get_embedding(text, max_length=512):
    """
    Compute an embedding for the given text using the OpenAIGPT model.
    Uses mean pooling over the last hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1)
    return embedding.squeeze()

In [26]:
# Compute embeddings for recipes and store in a dictionary (key: recipe id)
recipe_embeddings = {}
for idx, row in recipes_df.iterrows():
    rid = row['id']
    text = row['combined_text']
    try:
        emb = get_embedding(text)
        recipe_embeddings[rid] = emb
    except Exception as e:
        print(f"Error processing recipe {rid}: {e}")
print("\nComputed embeddings for", len(recipe_embeddings), "recipes.")


KeyboardInterrupt: 

In [None]:


import pickle

# Open a file and use dump() 
with open(r'D:\UKW_work\code\recipe_recommender_system\temp\recipe_embeddings.pkl', 'wb') as file:       
    # A new file will be created 
    pickle.dump(recipe_embeddings, file)

In [None]:
#########################################
# 7. Build User Profiles for Content-Based Filtering
#########################################
# Assume a user "likes" a recipe if the rating is high (e.g., >= 4).
user_profiles = {}
rating_threshold = 4
for user in range(n_users):
    # Get recipes rated highly by this user.
    user_data = interactions_df[interactions_df['user_id'] == user]
    liked_recipes = user_data[user_data['rating'] >= rating_threshold]['recipe_id'].tolist()
    # Collect embeddings if available.
    embeddings = [recipe_embeddings[rid] for rid in liked_recipes if rid in recipe_embeddings]
    if embeddings:
        stacked = torch.stack(embeddings)
        user_profiles[user] = stacked.mean(dim=0)

In [None]:
def recommend_content_based(user_id, top_k=5):
    """
    Recommend recipes using content-based filtering.
    Computes cosine similarity between the user's profile and each recipe embedding.
    Excludes recipes the user has already interacted with.
    """
    if user_id not in user_profiles:
        return []
    profile = user_profiles[user_id]
    scores = {}
    already_rated = set(interactions_df[interactions_df['user_id'] == user_id]['recipe_id'])
    for rid, emb in recipe_embeddings.items():
        if rid in already_rated:
            continue
        sim = F.cosine_similarity(profile, emb, dim=0)
        scores[rid] = sim.item()
    recommended = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return recommended

#########################################
# 8. Hybrid Recommendation: Combine Both Approaches
#########################################
def recommend_hybrid(user_id, top_k=5, alpha=0.5):
    """
    Hybrid recommender that combines collaborative and content-based scores.
    alpha: weight for the collaborative score, (1 - alpha) for the content-based score.
    """
    collab_recs = recommend_collaborative(user_id, top_k=top_k*2)
    content_recs = recommend_content_based(user_id, top_k=top_k*2)
    
    # Convert recommendations to dictionaries.
    collab_dict = dict(collab_recs)
    content_dict = dict(content_recs)
    
    combined_scores = {}
    candidate_recipes = set(collab_dict.keys()).union(set(content_dict.keys()))
    for rid in candidate_recipes:
        c_score = collab_dict.get(rid, 0)
        ct_score = content_dict.get(rid, 0)
        combined_scores[rid] = alpha * c_score + (1 - alpha) * ct_score
    hybrid_recs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return hybrid_recs


In [None]:
#########################################
# 9. Example: Generate Recommendations for a Sample User
#########################################
sample_user = 0  # For example, user 0
print("\n--- Collaborative Filtering Recommendations for User", sample_user, "---")
print(recommend_collaborative(sample_user, top_k=5))

print("\n--- Content-Based Recommendations for User", sample_user, "---")
print(recommend_content_based(sample_user, top_k=5))

print("\n--- Hybrid Recommendations for User", sample_user, "---")
print(recommend_hybrid(sample_user, top_k=5, alpha=0.5))

# 'all-MiniLM-L6-v2'

In [1]:
from pathlib import Path
import hvplot.pandas  # Provides hvplot accessor to Pandas DataFrames
import warnings
warnings.filterwarnings(action='once')

DATA_DIR = Path(r"D:\UKW_work\code\recipe_recommender_system\data\food_com_GeniusKitchen")

In [2]:
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
# Instead of using OpenAIGPT, we import SentenceTransformer:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#########################################
# 1. Load and Explore the Data Files
#########################################
# Load the CSV files (adjust file paths as needed)
users_df = pd.read_csv(DATA_DIR / 'PP_users.csv')
recipes_df = pd.read_csv(DATA_DIR /  'PP_recipes.csv')

print("PP_users.csv columns:", users_df.columns.tolist())
print("PP_users.csv shape:", users_df.shape)
print("PP_recipes.csv columns:", recipes_df.columns.tolist())
print("PP_recipes.csv shape:", recipes_df.shape)

PP_users.csv columns: ['u', 'techniques', 'items', 'n_items', 'ratings', 'n_ratings']
PP_users.csv shape: (25076, 6)
PP_recipes.csv columns: ['id', 'i', 'name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'calorie_level', 'ingredient_ids']
PP_recipes.csv shape: (178265, 8)


In [4]:
#########################################
# 2. Process PP_users.csv into Interaction Data
#########################################
# Expected columns in PP_users.csv:
#  - u         : user id (contiguous integers)
#  - items     : recipes interacted with (string representation of a list)
#  - ratings   : ratings given (string representation of a list)

# Convert the string representations into actual lists.
users_df['items'] = users_df['items'].apply(ast.literal_eval)
users_df['ratings'] = users_df['ratings'].apply(ast.literal_eval)

# Build an "exploded" interactions DataFrame: one row per (user, recipe, rating)
interaction_list = []
for _, row in users_df.iterrows():
    user_id = row['u']
    for recipe_id, rating in zip(row['items'], row['ratings']):
        interaction_list.append({'user_id': user_id, 'recipe_id': recipe_id, 'rating': rating})
interactions_df = pd.DataFrame(interaction_list)
print("\nSample interactions:")
print(interactions_df.head())
print("Total interactions:", interactions_df.shape[0])


Sample interactions:
   user_id  recipe_id  rating
0        0       1118     5.0
1        0      27680     5.0
2        0      32541     5.0
3        0     137353     5.0
4        0      16428     5.0
Total interactions: 698901


In [5]:
#########################################
# 3. Build a Sparse User–Item Rating Matrix
#########################################
# Determine the number of users and recipes.
n_users = interactions_df['user_id'].max() + 1
n_recipes = interactions_df['recipe_id'].max() + 1
print(f"\nNumber of users: {n_users}, Number of recipes: {n_recipes}")

# Create arrays for constructing a sparse matrix.
row_indices = interactions_df['user_id'].values
col_indices = interactions_df['recipe_id'].values
ratings = interactions_df['rating'].values

# Build a CSR (Compressed Sparse Row) matrix.
user_item_sparse = csr_matrix((ratings, (row_indices, col_indices)), shape=(n_users, n_recipes))
print("User-Item Sparse Matrix shape:", user_item_sparse.shape)


Number of users: 25076, Number of recipes: 178263
User-Item Sparse Matrix shape: (25076, 178263)


In [6]:
#########################################
# 4. Collaborative Filtering: On-The-Fly Similarity
#########################################
def recommend_collaborative(user_id, top_k=5):
    """
    Recommend recipes for a given user using collaborative filtering.
    Computes cosine similarity on the fly between the target user's vector and all user vectors.
    Then, aggregates ratings from similar users (weighted by similarity) to score recipes.
    Excludes recipes already rated by the target user.
    """
    # Get target user's vector (1 x n_recipes)
    user_vec = user_item_sparse.getrow(user_id)
    # Compute cosine similarity between this user and all users.
    sim_scores = cosine_similarity(user_vec, user_item_sparse).flatten()
    
    # Compute a weighted score for each recipe.
    weighted_scores = user_item_sparse.T.dot(sim_scores)
    # Normalize by the sum of similarity scores.
    weighted_scores = weighted_scores / (sim_scores.sum() + 1e-8)
    
    # Exclude recipes already rated by the user.
    user_rated = set(interactions_df[interactions_df['user_id'] == user_id]['recipe_id'])
    candidate_indices = [i for i in range(n_recipes) if i not in user_rated]
    
    candidate_scores = {recipe: weighted_scores[recipe] for recipe in candidate_indices}
    recommended = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return recommended

In [7]:
#########################################
# 5. Process PP_recipes.csv for Content-Based Filtering
#########################################
# Expected columns in PP_recipes.csv:
#  - id               : Recipe ID
#  - name_tokens      : BPE-tokenized recipe name (string representation of list)
#  - ingredient_tokens: BPE-tokenized ingredients (string representation of a list of lists)
#  - steps_tokens     : BPE-tokenized steps (string representation of list)

# Parse the tokenized fields.
recipes_df['name_tokens'] = recipes_df['name_tokens'].apply(ast.literal_eval)
recipes_df['ingredient_tokens'] = recipes_df['ingredient_tokens'].apply(ast.literal_eval)
recipes_df['steps_tokens'] = recipes_df['steps_tokens'].apply(ast.literal_eval)

# Combine the tokens into one text for each recipe.
def combine_recipe_text(row):
    # Convert tokens to strings in case some tokens are non-string (e.g., integers)
    name_text = " ".join([str(token) for token in row['name_tokens']])
    ingredient_text = " ".join([" ".join([str(tok) for tok in lst]) for lst in row['ingredient_tokens']])
    steps_text = " ".join([str(token) for token in row['steps_tokens']])
    return name_text + " " + ingredient_text + " " + steps_text

recipes_df['combined_text'] = recipes_df.apply(combine_recipe_text, axis=1)
print("\nSample combined recipe text:")
print(recipes_df[['id', 'combined_text']].head())


Sample combined recipe text:
       id                                      combined_text
0  424415  40480 37229 2911 1019 249 6878 6878 2839 1781 ...
1  146223  40480 18376 7056 246 1531 2032 40481 17918 259...
2  312329  40480 21044 16954 8294 556 10837 40481 5867 24...
3   74301  40480 10025 31156 40481 1270 1645 28447 21601 ...
4   76272  40480 17841 252 782 2373 1641 2373 252 40481 1...


In [8]:
#########################################
# 6. Compute Recipe Embeddings Using SentenceTransformer
#########################################
# Initialize the SentenceTransformer model (all-MiniLM-L6-v2) for fast, CPU-friendly embeddings.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# We now compute embeddings in a batched manner.
# Create lists for recipe IDs and combined texts.
recipe_ids = recipes_df['id'].tolist()
recipe_texts = recipes_df['combined_text'].tolist()

In [9]:
# Compute embeddings with the SentenceTransformer model.
# Set convert_to_tensor=True to get PyTorch tensors.
embeddings = embedding_model.encode(recipe_texts, batch_size=64, show_progress_bar=True, convert_to_tensor=True)

# Store embeddings in a dictionary (key: recipe id)
recipe_embeddings = {rid: emb for rid, emb in zip(recipe_ids, embeddings)}
print("\nComputed embeddings for", len(recipe_embeddings), "recipes.")


Batches: 100%|██████████| 2786/2786 [3:13:14<00:00,  4.16s/it]  



Computed embeddings for 178265 recipes.


In [None]:
import torch

TEMP_DIR = Path(r'D:\UKW_work\code\recipe_recommender_system\temp')
RECIPE_EMBD_DIR = TEMP_DIR / 'recipes_embeddings'
USER_PROFILE_EMBD_DIR = TEMP_DIR / 'user_profile_embeddings'


embd_save_path = RECIPE_EMBD_DIR / 'recipe_combined_text_embeddings_all-MiniLM-L6-v2.pt'
# Save the dictionary containing recipe embeddings
torch.save(recipe_embeddings, embd_save_path)
print(f"Embeddings saved to {embd_save_path}")


Embeddings saved to D:\UKW_work\code\recipe_recommender_system\temp\recipes_embeddings\recipe_embeddings_all-MiniLM-L6-v2.pt


In [21]:
from tqdm import tqdm

#########################################
# 7. Build User Profiles for Content-Based Filtering
#########################################
# Assume a user "likes" a recipe if the rating is high (e.g., >= 4).
user_profiles = {}
rating_threshold = 4
for user in tqdm(range(n_users)):
    # Get recipes rated highly by this user.
    user_data = interactions_df[interactions_df['user_id'] == user]
    liked_recipes = user_data[user_data['rating'] >= rating_threshold]['recipe_id'].tolist()
    # Collect embeddings if available.
    embeddings_list = [recipe_embeddings[rid] for rid in liked_recipes if rid in recipe_embeddings]
    if embeddings_list:
        stacked = torch.stack(embeddings_list)
        user_profiles[user] = stacked.mean(dim=0)

  0%|          | 0/25076 [00:00<?, ?it/s]

100%|██████████| 25076/25076 [01:08<00:00, 364.66it/s]


In [24]:
user_profiles_save_path = USER_PROFILE_EMBD_DIR / 'user_profiles_embeddings_all-MiniLM-L6-v2.pt'
# Save the dictionary containing recipe embeddings
torch.save(user_profiles, user_profiles_save_path)
print(f"User Profile saved to {user_profiles_save_path}")

User Profile saved to D:\UKW_work\code\recipe_recommender_system\temp\user_profile_embeddings\user_profiles_embeddings_all-MiniLM-L6-v2.pt


In [26]:

def recommend_content_based(user_id, top_k=5):
    """
    Recommend recipes using content-based filtering.
    Computes cosine similarity between the user's profile and each recipe embedding.
    Excludes recipes the user has already interacted with.
    """
    if user_id not in user_profiles:
        return []
    profile = user_profiles[user_id]
    scores = {}
    already_rated = set(interactions_df[interactions_df['user_id'] == user_id]['recipe_id'])
    for rid, emb in recipe_embeddings.items():
        if rid in already_rated:
            continue
        sim = F.cosine_similarity(profile, emb, dim=0)
        scores[rid] = sim.item()
    recommended = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return recommended

#########################################
# 8. Hybrid Recommendation: Combine Both Approaches
#########################################
def recommend_hybrid(user_id, top_k=5, alpha=0.5):
    """
    Hybrid recommender that combines collaborative and content-based scores.
    alpha: weight for the collaborative score, (1 - alpha) for the content-based score.
    """
    collab_recs = recommend_collaborative(user_id, top_k=top_k*2)
    content_recs = recommend_content_based(user_id, top_k=top_k*2)
    
    # Convert recommendations to dictionaries.
    collab_dict = dict(collab_recs)
    content_dict = dict(content_recs)
    
    combined_scores = {}
    candidate_recipes = set(collab_dict.keys()).union(set(content_dict.keys()))
    for rid in candidate_recipes:
        c_score = collab_dict.get(rid, 0)
        ct_score = content_dict.get(rid, 0)
        combined_scores[rid] = alpha * c_score + (1 - alpha) * ct_score
    hybrid_recs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return hybrid_recs

In [27]:
#########################################
# 9. Example: Generate Recommendations for a Sample User
#########################################
sample_user = 0  # For example, user 0
print("\n--- Collaborative Filtering Recommendations for User", sample_user, "---")
print(recommend_collaborative(sample_user, top_k=5))

print("\n--- Content-Based Recommendations for User", sample_user, "---")
print(recommend_content_based(sample_user, top_k=5))

print("\n--- Hybrid Recommendations for User", sample_user, "---")
print(recommend_hybrid(sample_user, top_k=5, alpha=0.5))


--- Collaborative Filtering Recommendations for User 0 ---
[(135961, np.float64(0.4032051223097654)), (134610, np.float64(0.3209369091004125)), (99787, np.float64(0.31785938317405305)), (127080, np.float64(0.28640487761433614)), (52334, np.float64(0.2702166199654352))]

--- Content-Based Recommendations for User 0 ---
[(281152, 0.9963469505310059), (294720, 0.9961875081062317), (312189, 0.9961854815483093), (64667, 0.9961628317832947), (91009, 0.9961289167404175)]

--- Hybrid Recommendations for User 0 ---
[(281152, 0.49817347526550293), (294720, 0.49809375405311584), (312189, 0.49809274077415466), (64667, 0.49808141589164734), (91009, 0.49806445837020874)]


# Test set data: Get Test predictions

In [7]:
from pathlib import Path
import pandas as pd


DATA_DIR = Path(r"D:\UKW_work\code\recipe_recommender_system\data\food_com_GeniusKitchen")

token_interactions_train_path = DATA_DIR / 'interactions_train.csv'
token_interactions_val_path = DATA_DIR / 'interactions_validation.csv'
token_interactions_test_path = DATA_DIR / 'interactions_test.csv'

User ID,

recipe_id
Recipe ID

date
Date of interaction

rating
Rating given

u
User ID, mapped to contiguous integers from 0

i
Recipe ID, mapped to contiguous integers from

In [None]:
#=====================================================
#  Split     # Users    # Recipes # Actions Sparsity
#=====================================================
# Train     25,076      160,901     698,901     99.983%
# Dev       7,023       6,621       7,023       –
# Test      12,455      11,695      12,455       –
# =====================================================

In [9]:
train_df = pd.read_csv(token_interactions_train_path)
train_df

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723
...,...,...,...,...,...,...
698896,926904,457971,2018-12-18,5.0,13681,141067
698897,2002312797,27208,2018-12-18,5.0,14897,99787
698898,1290903,131607,2018-12-18,5.0,11605,76163
698899,226867,363072,2018-12-18,5.0,3604,29101


In [10]:
test_df = pd.read_csv(token_interactions_test_path)
test_df

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,8937,44551,2005-12-23,4.0,2,173538
1,56680,126118,2006-10-07,4.0,16,177847
2,349752,219596,2008-04-12,0.0,26,89896
3,628951,82783,2007-11-13,2.0,45,172637
4,92816,435013,2013-07-31,3.0,52,177935
...,...,...,...,...,...,...
12450,101053,179011,2009-01-03,5.0,25054,130258
12451,252205,81398,2005-12-26,2.0,25055,152255
12452,624305,142984,2011-01-15,1.0,25057,139864
12453,173575,104842,2004-12-18,3.0,25059,140646


In [13]:
test_df[['user_id', 'date', 'recipe_id', 'rating']]

Unnamed: 0,user_id,date,recipe_id,rating
0,8937,2005-12-23,44551,4.0
1,56680,2006-10-07,126118,4.0
2,349752,2008-04-12,219596,0.0
3,628951,2007-11-13,82783,2.0
4,92816,2013-07-31,435013,3.0
...,...,...,...,...
12450,101053,2009-01-03,179011,5.0
12451,252205,2005-12-26,81398,2.0
12452,624305,2011-01-15,142984,1.0
12453,173575,2004-12-18,104842,3.0


In [12]:
val_df = pd.read_csv(token_interactions_val_path)
val_df

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,76535,33627,2005-02-15,4.0,5,177317
1,160497,75307,2005-10-24,4.0,23,170785
2,930021,100961,2008-11-30,4.0,31,165555
3,58439,154105,2007-03-24,4.0,44,177453
4,628951,14525,2008-02-16,5.0,45,142367
...,...,...,...,...,...,...
7018,557416,247915,2007-10-25,5.0,25006,164841
7019,218411,116676,2005-09-21,3.0,25008,117202
7020,587445,206493,2008-07-16,5.0,25009,117301
7021,1724643,65883,2011-11-08,5.0,25047,131974


In [29]:
common_ids = set(test_df['user_id']) & set(train_df['user_id'])
print("Number of common user IDs:", len(common_ids))

Number of common user IDs: 12455


In [38]:
common_ids = set(test_df['recipe_id']) & set(train_df['recipe_id'])
print("Number of common recipe IDs:", len(common_ids))

Number of common recipe IDs: 0


In [35]:
### COLUMNS ####
# 1. Recipe name
# 2. id: Recipe ID
# 3. minutes:  Minutes to prepare recipe
# 4. contributor_id: User ID who submitted this recipe
# 5. submitted: Date recipe was submitted
# 6. tags: Food.com tags for recipe
# 7. nutrition:  Nutrition information (calories (#), total fat (PDV), 
#   sugar (PDV) , sodium (PDV) , protein (PDV) , saturated fat (PDV) , and carbohydrates (PDV))
# 8. n_steps:  Number of steps in recipe
# 9. steps: Text for recipe steps, in order
# 10. description: User-provided description



raw_df = pd.read_csv(DATA_DIR / 'RAW_recipes.csv')
raw_df['id'].nunique(), raw_df.shape

(231637, (231637, 12))

In [37]:
raw_usr_df = pd.read_csv(DATA_DIR / 'RAW_interactions.csv')
raw_usr_df['recipe_id'].nunique(), raw_usr_df.shape

(231637, (1132367, 5))

In [20]:
raw_df['id'].nunique(), raw_df.shape

(231637, (231637, 12))

In [None]:
#=====================================================
#  Split     # Users    # Recipes # Actions Sparsity
#=====================================================
# Train     25,076      160,901     698,901     99.983%
# Dev       7,023       6,621       7,023       –
# Test      12,455      11,695      12,455       –
# =====================================================

In [27]:
train_recipe_ids = train_df['recipe_id'].to_list()
print(len(train_recipe_ids),  train_df['recipe_id'].nunique(),train_df.shape)

val_recipe_ids = val_df['recipe_id'].to_list()
print(len(val_recipe_ids), val_df['recipe_id'].nunique(), val_df.shape)

test_recipe_ids = test_df['recipe_id'].to_list()
print(len(test_recipe_ids),test_df['recipe_id'].nunique(), test_df.shape)

698901 160901 (698901, 6)
7023 6621 (7023, 6)
12455 11695 (12455, 6)


In [28]:
160901 +6621  + 11695 

179217

In [None]:
698901  + 7023 + 12455 #total user reviews

718379

In [30]:
# Create RAW_recipe_train_df with interactions_train.csv using list of unique recipe_ids

# Suppose recipe_ids is your list of recipe IDs to filter by.
filtered_df = raw_df[raw_df['id'].isin(train_recipe_ids)]
# print(filtered_df)
filtered_df.shape

(160901, 12)

In [None]:
filtered_test_df = raw_df[raw_df['id'].isin(test_recipe_ids)]
# print(filtered_df)
filtered_test_df.shape

In [33]:
import pickle


objects = []
with (open(r"D:\UKW_work\code\recipe_recommender_system\data\food_com_GeniusKitchen\ingr_map.pkl", "rb")) as openfile:
    while True:
        try:
            objects.append(pickle.load(openfile))
        except EOFError:
            break

ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'

In [34]:
map_rec_usr = pd.read_pickle(r"D:\UKW_work\code\recipe_recommender_system\data\food_com_GeniusKitchen\ingr_map.pkl")
map_rec_usr

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308
...,...,...,...,...,...,...,...
11654,soybeans,1,soybean,7,soybean,31,6702
11655,goose,1,goose,5,goose,8,3318
11656,ajwain,1,ajwain,6,ajwain,13,47
11657,brinjals,1,brinjal,7,brinjal,2,750
