In [10]:
# -----------------------------
# CONTENT-BASED RECOMMENDATION SYSTEM (Fixed)
# -----------------------------

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# -----------------------------
# 1Ô∏è‚É£ Load datasets
# -----------------------------
products = pd.read_csv("product_metadata_5000.csv")
users = pd.read_csv("user_metadata_5000.csv")
interactions = pd.read_csv("user_interactions_5000.csv")
reviews = pd.read_csv("reviews_5000.csv")

print(f"Products: {products.shape}")
print(f"Users: {users.shape}")
print(f"Interactions: {interactions.shape}")
print(f"Reviews: {reviews.shape}")

# -----------------------------
# 2Ô∏è‚É£ Preprocess product content
# -----------------------------
products['content'] = (
    products['description'].fillna('') + ' ' +
    products['tags'].fillna('') + ' ' +
    products['category'].fillna('') + ' ' +
    products['brand'].fillna('')
)

# -----------------------------
# 3Ô∏è‚É£ TF-IDF Vectorization
# -----------------------------
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(products['content'])
print("‚úÖ TF-IDF Matrix shape:", tfidf_matrix.shape)

# -----------------------------
# 4Ô∏è‚É£ Build user profiles
# -----------------------------
user_profiles = {}

for user_id, group in interactions.groupby('user_id'):
    # Get valid products that exist in products dataset
    interacted_products = group[group['interaction_value'] > 0.0]['product_id']
    valid_products = products[products['product_id'].isin(interacted_products)]
    
    if valid_products.empty:
        continue
    
    # Get indices of valid products
    idx = valid_products.index.tolist()
    
    # Compute mean vector safely
    user_vector = tfidf_matrix[idx].mean(axis=0)
    
    # Convert to csr_matrix for consistency
    if isinstance(user_vector, np.matrix):
        user_vector = csr_matrix(user_vector)
    
    user_profiles[user_id] = user_vector

print(f"‚úÖ Built user profiles for {len(user_profiles)} users.")

# -----------------------------
# 5Ô∏è‚É£ Recommendation function
# -----------------------------
def recommend_products(user_id, top_n=5):
    if user_id not in user_profiles:
        return []
    
    user_vec = user_profiles[user_id]
    sims = cosine_similarity(user_vec, tfidf_matrix).flatten()
    
    products_scores = pd.DataFrame({
        'product_id': products['product_id'],
        'score': sims
    })
    
    # Exclude already interacted products
    interacted = interactions[interactions['user_id'] == user_id]['product_id'].tolist()
    products_scores = products_scores[~products_scores['product_id'].isin(interacted)]
    
    return products_scores.sort_values('score', ascending=False).head(top_n)

# -----------------------------
# 6Ô∏è‚É£ Sample recommendations
# -----------------------------
sample_users = np.random.choice(list(user_profiles.keys()), min(5, len(user_profiles)), replace=False)

for u in sample_users:
    print(f"\nüéØ Recommendations for {u}:")
    recs = recommend_products(u, top_n=5)
    print(recs)


Products: (5000, 7)
Users: (5000, 5)
Interactions: (5000, 5)
Reviews: (5000, 6)
‚úÖ TF-IDF Matrix shape: (5000, 36)
‚úÖ Built user profiles for 0 users.
