In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from src.recommendation_engine.content_based import ContentBasedRecommender
from src.recommendation_engine.collaborative import CollaborativeRecommender
from src.recommendation_engine.hybrid import HybridRecommender

In [2]:
# Paths to your data files
posts_csv_path = "data/processed/all_posts_with_features.csv"
interactions_csv_path = "data/processed/interaction_df.csv"

# Initialize the Content-Based and Collaborative recommenders
content_recommender = ContentBasedRecommender(posts_csv_path)
collaborative_recommender = CollaborativeRecommender(interactions_csv_path)

# Instantiate the HybridRecommender using the previously created content-based and collaborative models
hybrid_recommender = HybridRecommender(content_model=content_recommender, collaborative_model=collaborative_recommender, weight_content=0.3, weight_collaborative=0.7)

# Select a valid user_id (you can choose any valid user from your dataset)
user_id = 1

top_n=400

# Generate hybrid recommendations for the user
recommendations_df = hybrid_recommender.recommend_hybrid(user_id, top_n=top_n)

# Get the true ratings for the recommended posts
interaction_df = pd.read_csv(interactions_csv_path)  # Load the interactions data

# Extract the true ratings (ground truth) for the recommended posts
true_ratings = interaction_df[interaction_df['post_id'].isin(recommendations_df['post_id'])]

# Ensure the true ratings correspond to the recommended posts
true_ratings = true_ratings.set_index('post_id').loc[recommendations_df['post_id']].reset_index()

# Handle missing ratings (if any post in recommendations does not have a rating)
missing_posts = recommendations_df[~recommendations_df['post_id'].isin(true_ratings['post_id'])]
if not missing_posts.empty:
    print(f"Missing ratings for the following posts: {missing_posts['post_id'].values}")
    # You can choose to assign a default rating (e.g., 0 or the average rating)
    true_ratings = pd.concat([true_ratings, missing_posts.assign(rating_percent=0)], ignore_index=True)

# Now, limit the true_ratings to match the top N recommended posts (10 in this case)
true_ratings = true_ratings.head(top_n)

# Ensure the lengths of true_ratings and recommendations_df match
# No need for assertion now, as we are slicing the data
assert len(true_ratings) == len(recommendations_df), f"Mismatch between true ratings ({len(true_ratings)}) and predicted ratings ({len(recommendations_df)})"

# Now, get the predicted ratings (the weighted score from the hybrid model)
predicted_ratings = recommendations_df['weighted_score'].values
true_ratings_values = true_ratings['rating_percent'].values

# Check for NaN values in the true ratings and predicted ratings
if np.any(np.isnan(true_ratings_values)) or np.any(np.isnan(predicted_ratings)):
    print("NaN values detected. Replacing NaN values with 0.")
    
    # Replace NaN values with 0 or any other default value
    true_ratings_values = np.nan_to_num(true_ratings_values, nan=0)
    predicted_ratings = np.nan_to_num(predicted_ratings, nan=0)

Loading interactions data from data/processed/interaction_df.csv...
Interactions DataFrame Loaded: (9781, 9) rows, columns: ['id', 'post_id', 'user_id', 'viewed_at', 'interaction_type', 'rating_percent', 'liked_at', 'inspired_at', 'rated_at']
     id  post_id  user_id            viewed_at interaction_type  \
0  9447     1256        1  2024-09-24 13:33:57           viewed   
1  9487     1253        1  2024-09-25 07:34:56           viewed   
2  9488     1257        1  2024-09-25 07:36:46           viewed   
3  9489     1258        1  2024-09-25 07:36:47           viewed   
4  9502     1252        1  2024-09-26 15:09:11           viewed   

   rating_percent liked_at inspired_at rated_at  
0             NaN      NaN         NaN      NaN  
1             NaN      NaN         NaN      NaN  
2             NaN      NaN         NaN      NaN  
3             NaN      NaN         NaN      NaN  
4             NaN      NaN         NaN      NaN  
Creating User-Post Interaction Matrix...
User-Post Mat

  combined_df = pd.concat([content_df, collaborative_df], ignore_index=True)


In [3]:
# Compute MAE (Mean Absolute Error)
mae = mean_absolute_error(true_ratings_values, predicted_ratings)
print(f"Hybrid Model - Mean Absolute Error (MAE): {mae}")

# Compute RMSE (Root Mean Square Error)
rmse = np.sqrt(mean_squared_error(true_ratings_values, predicted_ratings))
print(f"Hybrid Model - Root Mean Square Error (RMSE): {rmse}")

Hybrid Model - Mean Absolute Error (MAE): 21.256244208555152
Hybrid Model - Root Mean Square Error (RMSE): 28.302076514884284
