In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings
warnings.filterwarnings('ignore')

# ==================== 1. DATA LOADING AND EXPLORATION ====================

# Load the cleaned Amazon product/sales export; the path is relative to the
# notebook's working directory.
df = pd.read_csv('RS-A5_amazon_products_sales_data_cleaned.csv')

# Print a quick overview of the raw data: shape, columns, sample rows,
# dtypes, missing-value counts and numeric summary statistics.
print("="*80)
print("E-COMMERCE RECOMMENDATION SYSTEM USING MATRIX FACTORIZATION")
print("="*80)
print("\n1. DATASET OVERVIEW")
print("-"*80)
print(f"Dataset Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst 10 rows:")
print(df.head(10))
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nStatistical Summary:")
print(df.describe())

# ==================== 2. DATA PREPROCESSING ====================

print("\n2. DATA PREPROCESSING")
print("-"*80)

# Create a copy for processing so the raw frame `df` is left untouched
df_processed = df.copy()

# Handle missing values: median for numeric columns, most frequent value otherwise
print("Handling missing values...")
for col in df_processed.columns:
    column = df_processed[col]
    if not column.isnull().any():
        continue

    is_numeric = (pd.api.types.is_numeric_dtype(column)
                  and not pd.api.types.is_bool_dtype(column))
    if is_numeric:
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            # The column holds nothing but missing values, so there is no
            # mode to impute with; leave it as-is rather than crash.
            continue
        fill_value = modes.iloc[0]

    # Assign the filled column back instead of calling
    # `df_processed[col].fillna(..., inplace=True)`: the in-place form works on
    # a temporary object and does not update the frame under pandas
    # Copy-on-Write (and is deprecated in pandas 2.x).
    df_processed[col] = column.fillna(fill_value)

print(f"Missing values after handling: {df_processed.isnull().sum().sum()}")

# Create user IDs
# The dataset has no user_id column, so every row is assigned to one of
# `n_users` synthetic users drawn uniformly at random (seeded for
# reproducibility).
# NOTE: because the assignment is random, the resulting user-item matrix
# carries no real preference signal; it only exercises the pipeline.
np.random.seed(42)
n_users = 500  # Simulate 500 users
df_processed['user_id'] = np.random.randint(1, n_users + 1, size=len(df_processed))

# Create integer product IDs from product titles (identical titles share an ID)
le_product = LabelEncoder()
df_processed['product_id'] = le_product.fit_transform(df_processed['product_title'])

# Use the product's rating as the rating of each (user, product) interaction
df_processed['rating'] = df_processed['product_rating']

print(f"\nCreated {df_processed['user_id'].nunique()} unique users")
print(f"Total unique products: {df_processed['product_id'].nunique()}")
print(f"Total interactions: {len(df_processed)}")

# ==================== 3. EXPLORATORY DATA ANALYSIS ====================

print("\n3. EXPLORATORY DATA ANALYSIS")
print("-"*80)

# Ratings: frequency of every distinct value, followed by mean and spread
rating_values = df_processed['rating']
print("\nRating Distribution:")
print(rating_values.value_counts().sort_index())
print(f"\nAverage Rating: {rating_values.mean():.2f}")
print(f"Rating Std Dev: {rating_values.std():.2f}")

# Category breakdown is optional - shown only when the column is present
if 'product_category' in df_processed:
    category_counts = df_processed['product_category'].value_counts()
    print("\nTop 10 Product Categories:")
    print(category_counts.head(10))

# Prices: both price columns are required, the discount column is optional
print("\nPrice Analysis:")
print(f"Average Discounted Price: ${df_processed['discounted_price'].mean():.2f}")
print(f"Average Original Price: ${df_processed['original_price'].mean():.2f}")
if 'discount_percentage' in df_processed:
    print(f"Average Discount: {df_processed['discount_percentage'].mean():.1f}%")

# How many rows fall into each best-seller flag value
if 'is_best_seller' in df_processed:
    best_seller_counts = df_processed['is_best_seller'].value_counts()
    print("\nBest Seller Distribution:")
    print(best_seller_counts)

# Sum of the last-month purchase counts over all rows
if 'purchased_last_month' in df_processed:
    print("\nTotal Purchases Last Month:", df_processed['purchased_last_month'].sum())

# ==================== 4. CREATE USER-ITEM INTERACTION MATRIX ====================

print("\n4. USER-ITEM INTERACTION MATRIX")
print("-"*80)

# Pivot the interactions into a users x products table of ratings.
# Repeated (user, product) pairs are averaged (pivot_table's default
# aggregation); pairs that never occur are filled with 0.
user_item_matrix = pd.pivot_table(
    df_processed,
    values='rating',
    index='user_id',
    columns='product_id',
    fill_value=0,
)

# A cell counts as an interaction when it holds a positive rating
interaction_mask = user_item_matrix > 0
observed_cells = interaction_mask.sum().sum()
total_cells = user_item_matrix.shape[0] * user_item_matrix.shape[1]

print(f"User-Item Matrix Shape: {user_item_matrix.shape}")
print(f"Total interactions: {observed_cells}")
print(f"Matrix sparsity: {(1 - observed_cells / total_cells) * 100:.2f}%")

# Interaction counts per row (user) and per column (item)
user_interactions = interaction_mask.sum(axis=1)
item_interactions = interaction_mask.sum(axis=0)

print(f"\nAverage interactions per user: {user_interactions.mean():.2f}")
print(f"Average interactions per item: {item_interactions.mean():.2f}")
print(f"Max interactions per user: {user_interactions.max()}")
print(f"Max interactions per item: {item_interactions.max()}")

# ==================== 5. MATRIX FACTORIZATION - SVD (Singular Value Decomposition) ====================

print("\n5. MATRIX FACTORIZATION - SVD")
print("-"*80)

# Dense ratings array (users x products). A 0 means "no interaction" because
# the pivot table was built with fill_value=0.
R = user_item_matrix.values

# Normalize by subtracting each user's mean rating.
# The mean is taken over the products the user actually rated; averaging over
# every column would mix in the 0 placeholders, shrink the mean towards zero
# and make the centring a near no-op. Unrated cells are set to 0 after
# centring, i.e. they are treated as "equal to the user's average".
rated_mask = R > 0
ratings_per_user = rated_mask.sum(axis=1)
user_ratings_mean = np.divide(
    np.where(rated_mask, R, 0.0).sum(axis=1),
    ratings_per_user,
    out=np.zeros(R.shape[0], dtype=float),
    where=ratings_per_user > 0,  # users without any rating keep a mean of 0
)
R_normalized = np.where(rated_mask, R - user_ratings_mean.reshape(-1, 1), 0.0)

print("Performing Singular Value Decomposition...")

# Perform SVD with k latent factors
k = 20  # Number of latent factors
print(f"Number of latent factors (k): {k}")

# Truncated SVD: R_normalized ~= U * Sigma * V^T, keeping the k largest
# singular values
U, sigma, Vt = svds(R_normalized, k=k)

# svds returns the singular values as a 1-D array; turn it into a diagonal
# matrix so the factors can be multiplied back together
sigma = np.diag(sigma)

print(f"U matrix shape (users \u00d7 factors): {U.shape}")
print(f"Sigma matrix shape (factors \u00d7 factors): {sigma.shape}")
print(f"Vt matrix shape (factors \u00d7 items): {Vt.shape}")

# Predict all ratings: rebuild the low-rank matrix and add the user means back
all_predictions = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
predictions_df = pd.DataFrame(all_predictions,
                              index=user_item_matrix.index,
                              columns=user_item_matrix.columns)

print("\nSVD-based predictions completed!")

# ==================== 6. EVALUATION METRICS ====================

# print("\n6. MODEL EVALUATION")
# print("-"*80)
#
# # Calculate RMSE and MAE for known ratings
# known_mask = R > 0
# actual_ratings = R[known_mask]
# predicted_ratings = all_predictions[known_mask]
#
# rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
# mae = mean_absolute_error(actual_ratings, predicted_ratings)
#
# print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
# print(f"Mean Absolute Error (MAE): {mae:.4f}")
# print(f"Normalized RMSE (0-5 scale): {(rmse/5)*100:.2f}%")

# ==================== 7. ALTERNATIVE: NON-NEGATIVE MATRIX FACTORIZATION ====================

# print("\n7. NON-NEGATIVE MATRIX FACTORIZATION (NMF)")
# print("-"*80)
#
# from sklearn.decomposition import NMF
#
# # NMF requires non-negative values
# R_nonneg = R.copy()
# R_nonneg[R_nonneg < 0] = 0
#
# print("Training NMF model...")
# nmf_model = NMF(n_components=k, init='random', random_state=42, max_iter=200)
#
# # Fit the model
# W = nmf_model.fit_transform(R_nonneg)  # User-factor matrix
# H = nmf_model.components_  # Factor-item matrix
#
# # Reconstruct the matrix
# nmf_predictions = np.dot(W, H)
# nmf_predictions_df = pd.DataFrame(nmf_predictions,
#                                   index=user_item_matrix.index,
#                                   columns=user_item_matrix.columns)
#
# # Evaluate NMF
# nmf_predicted_ratings = nmf_predictions[known_mask]
# rmse_nmf = np.sqrt(mean_squared_error(actual_ratings, nmf_predicted_ratings))
# mae_nmf = mean_absolute_error(actual_ratings, nmf_predicted_ratings)
#
# print(f"NMF - Root Mean Squared Error (RMSE): {rmse_nmf:.4f}")
# print(f"NMF - Mean Absolute Error (MAE): {mae_nmf:.4f}")
# print(f"NMF - Reconstruction Error: {nmf_model.reconstruction_err_:.4f}")

# ==================== 8. GRADIENT DESCENT MATRIX FACTORIZATION ====================

# print("\n8. GRADIENT DESCENT MATRIX FACTORIZATION")
# print("-"*80)
#
# class MatrixFactorization:
#     """
#     Matrix Factorization using Stochastic Gradient Descent
#     R \u2248 P \u00d7 Q^T
#     where P is user-factor matrix and Q is item-factor matrix
#     """
#
#     def __init__(self, R, K, alpha=0.002, beta=0.02, iterations=100):
#         """
#         Initialize the model
#         R: User-item rating matrix
#         K: Number of latent factors
#         alpha: Learning rate
#         beta: Regularization parameter
#         iterations: Number of training iterations
#         """
#         self.R = R
#         self.num_users, self.num_items = R.shape
#         self.K = K
#         self.alpha = alpha
#         self.beta = beta
#         self.iterations = iterations
#
#         # Initialize user and item latent factor matrices
#         self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
#         self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
#
#         # User and item biases
#         self.b_u = np.zeros(self.num_users)
#         self.b_i = np.zeros(self.num_items)
#         self.b = np.mean(self.R[np.where(self.R != 0)])
#
#         self.training_process = []
#
#     def train(self):
#         """Train the model using SGD"""
#         for iteration in range(self.iterations):
#             # Get indices of non-zero ratings
#             non_zero_indices = np.where(self.R > 0)
#
#             # Shuffle for stochastic gradient descent
#             indices = list(zip(non_zero_indices[0], non_zero_indices[1]))
#             np.random.shuffle(indices)
#
#             for i, j in indices:
#                 # Compute prediction error
#                 prediction = self.get_rating(i, j)
#                 error = self.R[i, j] - prediction
#
#                 # Update biases
#                 self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
#                 self.b_i[j] += self.alpha * (error - self.beta * self.b_i[j])
#
#                 # Update latent factors
#                 self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i, :])
#                 self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j, :])
#
#             # Calculate training error
#             mse = self.calculate_mse()
#             self.training_process.append((iteration, mse))
#
#             if (iteration + 1) % 20 == 0:
#                 print(f"  Iteration {iteration + 1}/{self.iterations} - MSE: {mse:.4f}")
#
#         return self
#
#     def get_rating(self, i, j):
#         """Get predicted rating for user i and item j"""
#         prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
#         return prediction
#
#     def get_complete_matrix(self):
#         """Get complete predicted rating matrix"""
#         return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
#
#     def calculate_mse(self):
#         """Calculate Mean Squared Error on training data"""
#         non_zero = self.R > 0
#         predicted = self.get_complete_matrix()
#         error = self.R[non_zero] - predicted[non_zero]
#         return np.mean(error ** 2)
#
# # Train the gradient descent model
# print("Training Gradient Descent Matrix Factorization...")
# mf = MatrixFactorization(R, K=k, alpha=0.002, beta=0.02, iterations=100)
# mf.train()
#
# # Get predictions
# mf_predictions = mf.get_complete_matrix()
# mf_predictions_df = pd.DataFrame(mf_predictions,
#                                  index=user_item_matrix.index,
#                                  columns=user_item_matrix.columns)
#
# # Evaluate
# mf_predicted_ratings = mf_predictions[known_mask]
# rmse_mf = np.sqrt(mean_squared_error(actual_ratings, mf_predicted_ratings))
# mae_mf = mean_absolute_error(actual_ratings, mf_predicted_ratings)
#
# print(f"\nGD-MF - Root Mean Squared Error (RMSE): {rmse_mf:.4f}")
# print(f"GD-MF - Mean Absolute Error (MAE): {mae_mf:.4f}")

# ==================== 9. RECOMMENDATION FUNCTIONS ====================

# Section banner printed before the recommendation helper functions are defined
print("\n9. RECOMMENDATION SYSTEM FUNCTIONS")
print("-"*80)

def get_user_recommendations(user_id, predictions_df, user_item_matrix,
                            df_original, n_recommendations=10):
    """
    Get top N product recommendations for a user

    Products the user has already interacted with (a value > 0 in
    user_item_matrix) are excluded; the remaining products are ranked by
    predicted rating, highest first.

    Parameters:
    - user_id: User ID to get recommendations for
    - predictions_df: DataFrame with predicted ratings (rows: users,
      columns: product IDs)
    - user_item_matrix: Original user-item interaction matrix (same layout)
    - df_original: Dataframe with product details; must contain 'product_id',
      'product_title', 'product_rating', 'discounted_price', 'original_price'
      and 'total_reviews'. 'is_best_seller' and 'discount_percentage' are
      optional.
    - n_recommendations: Number of recommendations to return

    Returns:
    - DataFrame with recommended products and details. It always has the same
      columns, and is empty when the user is unknown or nothing can be
      recommended.
    """
    result_columns = [
        'product_id', 'product_title', 'predicted_rating', 'actual_rating',
        'discounted_price', 'original_price', 'total_reviews',
        'is_best_seller', 'discount_percentage',
    ]

    # Unknown user in either frame: return an empty result that still carries
    # the expected columns, so callers can select columns without a KeyError.
    if user_id not in predictions_df.index or user_id not in user_item_matrix.index:
        return pd.DataFrame(columns=result_columns)

    # Get user's predicted ratings
    user_predictions = predictions_df.loc[user_id]

    # Get products user has already interacted with
    user_interactions = user_item_matrix.loc[user_id]
    already_rated = user_interactions[user_interactions > 0].index

    # Filter out already rated products, then keep the N best predictions
    candidates = user_predictions[~user_predictions.index.isin(already_rated)]
    top_recommendations = candidates.sort_values(ascending=False).head(n_recommendations)

    # Look up the details of all recommended products in a single pass over
    # df_original. When a product occurs in several rows, the first row wins.
    product_details = (
        df_original[df_original['product_id'].isin(top_recommendations.index)]
        .drop_duplicates(subset='product_id')
        .set_index('product_id')
    )

    recommended_products = []
    for product_id, predicted_rating in top_recommendations.items():
        if product_id not in product_details.index:
            # No detail row for this product; skip it instead of failing
            continue
        product_info = product_details.loc[product_id]

        recommended_products.append({
            'product_id': product_id,
            'product_title': product_info['product_title'],
            'predicted_rating': predicted_rating,
            'actual_rating': product_info['product_rating'],
            'discounted_price': product_info['discounted_price'],
            'original_price': product_info['original_price'],
            'total_reviews': product_info['total_reviews'],
            'is_best_seller': product_info.get('is_best_seller', 'Unknown'),
            'discount_percentage': product_info.get('discount_percentage', 0)
        })

    return pd.DataFrame(recommended_products, columns=result_columns)

def get_similar_products(product_id, predictions_df, df_original, n_similar=5):
    """
    Get the products most similar to a given product

    Similarity is the cosine similarity between the products' columns of
    predicted ratings (one value per user) in predictions_df. Products whose
    column has zero norm are skipped because the similarity is undefined.

    Parameters:
    - product_id: Product ID to find similar products for
    - predictions_df: DataFrame with predicted ratings (rows: users,
      columns: product IDs)
    - df_original: Dataframe with product details; must contain 'product_id',
      'product_title', 'product_rating', 'discounted_price' and
      'total_reviews'. 'is_best_seller' is optional.
    - n_similar: Number of similar products to return

    Returns:
    - DataFrame with similar products, most similar first. It always has the
      same columns, and is empty when the product is unknown or no similarity
      can be computed.
    """
    result_columns = [
        'product_id', 'product_title', 'similarity_score', 'product_rating',
        'discounted_price', 'total_reviews', 'is_best_seller',
    ]

    if product_id not in predictions_df.columns:
        return pd.DataFrame(columns=result_columns)

    # Compute every column norm and dot product at once instead of looping
    # over the columns in Python.
    ratings = predictions_df.to_numpy(dtype=float)
    target = predictions_df[product_id].to_numpy(dtype=float)
    column_norms = np.linalg.norm(ratings, axis=0)
    target_norm = np.linalg.norm(target)

    if not target_norm > 0:
        # Cosine similarity is undefined for an all-zero reference column
        return pd.DataFrame(columns=result_columns)

    # Compare against every other product that has a non-zero column
    comparable = (column_norms > 0) & (predictions_df.columns != product_id)
    similarities = pd.Series(
        (target @ ratings[:, comparable]) / (column_norms[comparable] * target_norm),
        index=predictions_df.columns[comparable],
    )

    # Stable sort keeps the column order among equally similar products
    top_similar = similarities.sort_values(ascending=False, kind='stable').head(n_similar)

    # Look up the details of all selected products in a single pass over
    # df_original. When a product occurs in several rows, the first row wins.
    product_details = (
        df_original[df_original['product_id'].isin(top_similar.index)]
        .drop_duplicates(subset='product_id')
        .set_index('product_id')
    )

    similar_products = []
    for similar_id, similarity_score in top_similar.items():
        if similar_id not in product_details.index:
            # No detail row for this product; skip it instead of failing
            continue
        product_info = product_details.loc[similar_id]

        similar_products.append({
            'product_id': similar_id,
            'product_title': product_info['product_title'],
            'similarity_score': similarity_score,
            'product_rating': product_info['product_rating'],
            'discounted_price': product_info['discounted_price'],
            'total_reviews': product_info['total_reviews'],
            'is_best_seller': product_info.get('is_best_seller', 'Unknown')
        })

    return pd.DataFrame(similar_products, columns=result_columns)

def get_trending_products(df_original, n_trending=10):
    """
    Get trending products based on recent purchases and ratings

    The trending score is a weighted sum of the product rating and of the
    review count (and, when the column exists, last month's purchases), each
    count being rescaled to the 0-5 range by dividing by its column maximum:
    - with 'purchased_last_month': 0.3 * rating + 0.4 * purchases + 0.3 * reviews
    - without it:                  0.5 * rating + 0.5 * reviews

    Parameters:
    - df_original: Dataframe with 'product_title', 'product_rating',
      'discounted_price' and 'total_reviews' ('purchased_last_month' optional).
      It is not modified.
    - n_trending: Number of products to return

    Returns:
    - DataFrame with the n_trending highest-scoring rows and the columns
      'product_title', 'product_rating', 'discounted_price', 'total_reviews',
      'trending_score'
    """

    def _scaled_to_five(column):
        """Rescale a count column to 0-5 by its maximum (all zeros if the maximum is not positive)."""
        peak = column.max()
        if not peak > 0:
            # A maximum of 0 (or NaN for an empty/all-missing column) would
            # turn every score into NaN; let the signal contribute nothing.
            return pd.Series(0.0, index=column.index)
        return (column / peak) * 5

    rating = df_original['product_rating']
    review_signal = _scaled_to_five(df_original['total_reviews'])

    # Calculate trending score
    if 'purchased_last_month' in df_original.columns:
        purchase_signal = _scaled_to_five(df_original['purchased_last_month'])
        trending_score = rating * 0.3 + purchase_signal * 0.4 + review_signal * 0.3
    else:
        trending_score = rating * 0.5 + review_signal * 0.5

    # Only the reported columns are copied, not the whole frame
    trending = df_original[[
        'product_title', 'product_rating', 'discounted_price', 'total_reviews'
    ]].assign(trending_score=trending_score)

    return trending.nlargest(n_trending, 'trending_score')

# ==================== 10. EXAMPLE RECOMMENDATIONS ====================

print("\n10. GENERATING EXAMPLE RECOMMENDATIONS")
print("-"*80)

# Draw one user at random from the rows of the interaction matrix
example_user_id = np.random.choice(user_item_matrix.index)
print(f"\nGenerating recommendations for User ID: {example_user_id}")

# Personalised top-10 list from the SVD predictions
recommendations_svd = get_user_recommendations(
    user_id=example_user_id,
    predictions_df=predictions_df,
    user_item_matrix=user_item_matrix,
    df_original=df_processed,
    n_recommendations=10,
)

print("\nTop 10 Recommended Products (SVD-based):")
print(
    recommendations_svd[
        ['product_title', 'predicted_rating', 'discounted_price', 'total_reviews']
    ].to_string(index=False)
)

# Draw one product at random from the matrix columns and show its title
example_product_id = np.random.choice(user_item_matrix.columns)
product_name = df_processed.loc[
    df_processed['product_id'] == example_product_id, 'product_title'
].iloc[0]
print(f"\n\nFinding similar products to: '{product_name}'")

similar_products = get_similar_products(
    product_id=example_product_id,
    predictions_df=predictions_df,
    df_original=df_processed,
    n_similar=5,
)
print("\nTop 5 Similar Products:")
print(
    similar_products[
        ['product_title', 'similarity_score', 'product_rating', 'discounted_price']
    ].to_string(index=False)
)

# Non-personalised list ranked by the trending score
print("\n\nTop 10 Trending Products:")
trending_products = get_trending_products(df_original=df_processed, n_trending=10)
print(trending_products.to_string(index=False))

# ==================== 11. MODEL COMPARISON ====================

# print("\n11. MODEL COMPARISON SUMMARY")
# print("-"*80)
# print(f"{'Algorithm':<30} {'RMSE':<12} {'MAE':<12}")
# print("-"*80)
# print(f"{'SVD':<30} {rmse:<12.4f} {mae:<12.4f}")
# # print(f"{'NMF':<30} {rmse_nmf:<12.4f} {mae_nmf:<12.4f}")
# # print(f"{'Gradient Descent MF':<30} {rmse_mf:<12.4f} {mae_mf:<12.4f}")
#
# # Select best model
# # best_model = min([('SVD', rmse)], key=lambda x: x[1]) # Only SVD is active
# # print(f"\nBest Model: {best_model[0]} with RMSE: {best_model[1]:.4f}")

# ==================== 12. BUSINESS INSIGHTS ====================

# print("\n12. BUSINESS INSIGHTS")
# print("-"*80)
#
# # Price sensitivity analysis
# if 'discount_percentage' in df_processed.columns:
# #     high_discount = df_processed[df_processed['discount_percentage'] > 30]
# #     print(f"\nProducts with >30% discount: {len(high_discount)}")
# #     print(f"Average rating for high-discount products: {high_discount['product_rating'].mean():.2f}")
#     pass # Keep the if condition but comment out the content
#
# # Best seller analysis
# if 'is_best_seller' in df_processed.columns:
# #     bestsellers = df_processed[df_processed['is_best_seller'] == 'Best Seller']
# #     print(f"\nBest Sellers: {len(bestsellers)}")
# #     print(f"Average rating for best sellers: {bestsellers['product_rating'].mean():.2f}")
# #     print(f"Average price for best sellers: ${bestsellers['discounted_price'].mean():.2f}")
#     pass # Keep the if condition but comment out the content
#
# # Review volume analysis
# # print(f"\nProducts with >10,000 reviews: {len(df_processed[df_processed['total_reviews'] > 10000])}")
# # high_review_products = df_processed[df_processed['total_reviews'] > 10000]
# # print(f"Average rating for high-review products: {high_review_products['product_rating'].mean():.2f}")

# ==================== 13. VISUALIZATION ====================

# print("\n13. GENERATING VISUALIZATIONS")
# print("-"*80)
#
# fig, axes = plt.subplots(2, 3, figsize=(18, 12))
#
# # 1. Rating distribution
# axes[0, 0].hist(df_processed['product_rating'], bins=20, color='skyblue', edgecolor='black')
# axes[0, 0].set_xlabel('Product Rating')
# axes[0, 0].set_ylabel('Frequency')
# axes[0, 0].set_title('Distribution of Product Ratings')
# axes[0, 0].grid(True, alpha=0.3)
#
# # 2. Price distribution
# axes[0, 1].hist(df_processed['discounted_price'], bins=30, color='lightgreen', edgecolor='black')
# axes[0, 1].set_xlabel('Discounted Price ($)')
# axes[0, 1].set_ylabel('Frequency')
# axes[0, 1].set_title('Distribution of Product Prices')
# axes[0, 1].set_xlim(0, 500)
# axes[0, 1].grid(True, alpha=0.3)
#
# # 3. Model comparison
# models = ['SVD'] # Adjusted for only SVD
# rmse_values = [rmse] # Adjusted for only SVD
# mae_values = [mae] # Adjusted for only SVD
#
# x = np.arange(len(models))
# width = 0.35
#
# axes[0, 2].bar(x - width/2, rmse_values, width, label='RMSE', color='coral')
# axes[0, 2].bar(x + width/2, mae_values, width, label='MAE', color='lightblue')
# axes[0, 2].set_xlabel('Algorithm')
# axes[0, 2].set_ylabel('Error')
# axes[0, 2].set_title('Model Performance Comparison')
# axes[0, 2].set_xticks(x)
# axes[0, 2].set_xticklabels(models)
# axes[0, 2].legend()
# axes[0, 2].grid(True, alpha=0.3)
#
# # 4. User interaction distribution
# axes[1, 0].hist(user_interactions, bins=30, color='mediumpurple', edgecolor='black')
# axes[1, 0].set_xlabel('Number of Interactions')
# axes[1, 0].set_ylabel('Number of Users')
# axes[1, 0].set_title('Distribution of User Interactions')
# axes[1, 0].grid(True, alpha=0.3)
#
# # 5. Item interaction distribution
# axes[1, 1].hist(item_interactions, bins=30, color='gold', edgecolor='black')
# axes[1, 1].set_xlabel('Number of Interactions')
# axes[1, 1].set_ylabel('Number of Items')
# axes[1, 1].set_title('Distribution of Item Interactions')
# axes[1, 1].grid(True, alpha=0.3)
#
# # 6. Training progress for Gradient Descent
# # This block will be commented out as GD-MF is removed
# # if mf.training_process:
# #     iterations, mse_values = zip(*mf.training_process)
# #     axes[1, 2].plot(iterations, mse_values, color='darkred', linewidth=2)
# #     axes[1, 2].set_xlabel('Iteration')
# #     axes[1, 2].set_ylabel('MSE')
# #     axes[1, 2].set_title('Gradient Descent Training Progress')
# #     axes[1, 2].grid(True, alpha=0.3)
#
# plt.tight_layout()
# plt.savefig('recommendation_system_analysis.png', dpi=300, bbox_inches='tight')
# print("Visualizations saved to 'recommendation_system_analysis.png'")

# ==================== 14. SAVE RECOMMENDATIONS ====================

print("\n14. SAVING RECOMMENDATIONS")
print("-"*80)

# Build the top-10 list for each of the first 50 users in the matrix and tag
# every row with the user it belongs to
all_recommendations = []
for user_id in user_item_matrix.index[:50]:
    user_recs = get_user_recommendations(
        user_id=user_id,
        predictions_df=predictions_df,
        user_item_matrix=user_item_matrix,
        df_original=df_processed,
        n_recommendations=10,
    ).assign(user_id=user_id)
    all_recommendations.append(user_recs)

# Stack the per-user frames and write them to a single CSV file
if all_recommendations:
    recommendations_df = pd.concat(all_recommendations, ignore_index=True)
    recommendations_df.to_csv('user_recommendations.csv', index=False)
    print("User recommendations saved to 'user_recommendations.csv'")

print("\n" + "="*80)
print("RECOMMENDATION SYSTEM COMPLETE")
print("="*80)

E-COMMERCE RECOMMENDATION SYSTEM USING MATRIX FACTORIZATION

1. DATASET OVERVIEW
--------------------------------------------------------------------------------
Dataset Shape: (31959, 17)

Columns: ['product_title', 'product_rating', 'total_reviews', 'purchased_last_month', 'discounted_price', 'original_price', 'is_best_seller', 'is_sponsored', 'has_coupon', 'buy_box_availability', 'delivery_date', 'sustainability_tags', 'product_image_url', 'product_page_url', 'data_collected_at', 'product_category', 'discount_percentage']

First 10 rows:
                                       product_title  product_rating  \
0  BOYA BOYALINK 2 Wireless Lavalier Microphone f...             4.6   
1  LISEN USB C to Lightning Cable, 240W 4 in 1 Ch...             4.3   
2  DJI Mic 2 (2 TX + 1 RX + Charging Case), Wirel...             4.6   
3  Apple AirPods Pro 2 Wireless Earbuds, Active N...             4.6   
4  Apple AirTag 4 Pack. Keep Track of and find Yo...             4.8   
5  Texas Instruments 

In [None]:
"""
==================================================================================
COMPREHENSIVE EXPLANATION OF E-COMMERCE RECOMMENDATION SYSTEM
==================================================================================

1. PROJECT OVERVIEW:
   - Develops a recommendation system for e-commerce products
   - Uses Matrix Factorization algorithms (SVD, NMF, Gradient Descent)
   - Provides personalized product recommendations and similar product suggestions

2. MATRIX FACTORIZATION CONCEPT:
   - Decomposes user-item interaction matrix into lower-dimensional matrices
   - Formula: R ≈ U × Σ × V^T (SVD) or R ≈ P × Q^T (General MF)
   - Captures latent factors representing user preferences and item characteristics
   - Handles sparse data efficiently

3. ALGORITHMS IMPLEMENTED:

   a) SINGULAR VALUE DECOMPOSITION (SVD):
      - Mathematical technique from linear algebra
      - Decomposes matrix into three matrices: U (users), Σ (singular values), V^T (items)
      - Steps:
        1. Normalize ratings by subtracting user mean
        2. Perform SVD with k latent factors
        3. Reconstruct rating matrix: R = U × Σ × V^T
        4. Add back user means
      - Pros: Gives the best rank-k approximation in the least-squares sense, fast computation
      - Cons: Cannot handle new users/items easily

   b) NON-NEGATIVE MATRIX FACTORIZATION (NMF):
      - Factorizes into two non-negative matrices: W (user-factor) and H (factor-item)
      - Formula: R ≈ W × H
      - Constraints: All values in W and H must be ≥ 0
      - Advantages:
        * More interpretable factors
        * Parts-based representation
        * Natural for rating data (ratings are non-negative)
      - Uses iterative optimization to minimize reconstruction error

   c) GRADIENT DESCENT MATRIX FACTORIZATION:
      - Custom implementation using Stochastic Gradient Descent
      - Learns user matrix P and item matrix Q such that R ≈ P × Q^T
      - Includes bias terms for users and items
      - Optimization:
        * Minimize: Σ(r_ui - (b + b_u + b_i + p_u · q_i))² + β(||p_u||² + ||q_i||² + b_u² + b_i²)
        * Updates using gradient descent with learning rate α
        * Regularization parameter β prevents overfitting
      - Advantages:
        * Flexible, can add custom features
        * Works well with implicit feedback
        * Can handle new users/items incrementally

4. DATA PREPROCESSING:
   - Handle missing values (median for numeric, mode for categorical)
   - Create synthetic user IDs (since not in original data)
   - Encode products with LabelEncoder
   - Create user-item rating matrix (sparse matrix)

5. USER-ITEM INTERACTION MATRIX:
   - Rows: Users (500 synthetic users)
   - Columns: Products (unique product IDs)
   - Values: Ratings (product_rating)
   - Sparse matrix (most entries are zero)
   - Sparsity calculation: (1 - non_zero_entries/total_entries) × 100%

6. EVALUATION METRICS:

   a) ROOT MEAN SQUARED ERROR (RMSE):
      - Measures average prediction error
      - Formula: √(Σ(actual - predicted)² / n)
      - Penalizes large errors more heavily
      - Lower RMSE = better predictions

   b) MEAN ABSOLUTE ERROR (MAE):
      - Average absolute difference between actual and predicted
      - Formula: Σ|actual - predicted| / n
      - More intuitive interpretation
      - Less sensitive to outliers than RMSE

7. RECOMMENDATION STRATEGIES:

   a) PERSONALIZED RECOMMENDATIONS:
      - For each user, predict ratings for all unrated items
      - Sort by predicted rating (highest first)
      - Filter out items already interacted with
      - Return top N items
      - Use case: "Recommended for You" section

   b) SIMILAR PRODUCTS:
      - Calculate cosine similarity between product vectors
      - Formula: similarity = (A·B) / (||A|| × ||B||)
      - Find products with highest similarity scores
      - Use case: "Customers who viewed this also viewed"

   c) TRENDING PRODUCTS:
      - Combine multiple signals:
        * Product rating (quality indicator)
        * Recent purchases (popularity)
        * Total reviews (social proof)
      - Weighted scoring system
      - Use case: "Trending Now" section

8. KEY FEATURES UTILIZED:

   a) Product Characteristics:
      - product_rating: Quality indicator (4.0-5.0 scale)
      - total_reviews: Social proof metric
      - discounted_price: Price point
      - original_price: Reference price
      - discount_percentage: Deal attractiveness
      - is_best_seller: Best-seller badge indicator
      - product_category: Product type

   b) Purchasing Behavior:
      - purchased_last_month: Recent demand
      - buy_box_availability: Stock status
      - delivery_date: Fulfillment speed

   c) Product Marketing:
      - is_sponsored: Paid placement
      - has_coupon: Additional discount
      - sustainability_tags: Eco-conscious features

9. LATENT FACTORS INTERPRETATION:
   - Each user and item is represented by k latent factors (k=20)
   - Factors capture hidden preferences/characteristics:
     * Factor 1: Price sensitivity
     * Factor 2: Brand preference
     * Factor 3: Product quality importance
     * Factor 4: Review dependency
     * Factor 5: Discount seeking behavior
     * Factors 6-20: Other complex patterns
   - Factors are learned automatically from data

10. HYPERPARAMETERS:

    a) Number of Latent Factors (k):
       - Set to 20 in this implementation
       - Too low: Underfitting, misses patterns
       - Too high: Overfitting, computational cost
       - Typically: 10-100 for real applications

    b) Learning Rate (α):
       - Controls update step size
       - Set to 0.002 (conservative)
       - Too high: Divergence
       - Too low: Slow convergence

    c) Regularization (β):
       - Prevents overfitting
       - Set to 0.02
       - Penalizes large weights
       - Balance between fit and generalization

    d) Iterations:
       - Number of training epochs
       - Set to 100
       - Monitor training progress to avoid overfitting

11. ADVANTAGES OF MATRIX FACTORIZATION:

    a) Scalability:
       - Handles millions of users and items
       - Sparse matrix operations are efficient
       - Can be parallelized for large datasets

    b) Handles Sparsity:
       - Works well even when most ratings are missing
       - Learns from available data to predict unknowns
       - E-commerce datasets are typically 95-99% sparse

    c) Discovers Latent Patterns:
       - Finds hidden user preferences
       - Groups similar users automatically
       - Identifies item characteristics

    d) Cold Start Mitigation:
       - Can incorporate item features for new products
       - Hybrid approaches combine collaborative and content-based

12. RECOMMENDATION TYPES GENERATED:

    a) Collaborative Filtering:
       - "Users like you also liked..."
       - Based on user-user or item-item similarities
       - Leverages collective wisdom

    b) Content-Based:
       - "Similar to what you viewed"
       - Based on product attributes
       - Doesn't require other users' data

    c) Hybrid:
       - Combines collaborative and content-based
       - Best of both approaches
       - Most robust solution

13. BUSINESS APPLICATIONS:

    a) Homepage Personalization:
       - Show different products to different users
       - Increase engagement and click-through rates

    b) Email Marketing:
       - Send personalized product suggestions
       - Improve open and conversion rates

    c) Product Pages:
       - "Frequently bought together"
       - "Customers also viewed"
       - Increase cross-selling

    d) Search Results Ranking:
       - Personalize search results
       - Show most relevant items first

    e) Cart Abandonment:
       - Recommend complementary products
       - Incentivize purchase completion

14. PERFORMANCE OPTIMIZATION:

    a) Matrix Sparsity:
       - Use scipy.sparse for memory efficiency
       - Store only non-zero entries
       - Critical for large-scale systems

    b) Incremental Updates:
       - Update factors for new ratings without retraining
       - Add new users/items dynamically

    c) Caching:
       - Pre-compute recommendations for active users
       - Store in Redis/Memcached
       - Serve instantly on request

    d) A/B Testing:
       - Compare different algorithms
       - Measure conversion rates
       - Optimize recommendation quality

15. HANDLING IMPLICIT FEEDBACK:
    - Current implementation uses explicit ratings (1-5 stars)
    - Can be adapted for implicit feedback:
      * Views → Confidence score
      * Purchases → High confidence
      * Add to cart → Medium confidence
      * Clicks → Low confidence
    - Weight by recency (recent actions more important)

16. COLD START PROBLEMS & SOLUTIONS:

    a) New Users (User Cold Start):
       - Show trending/popular items
       - Ask for initial preferences
       - Use demographic information
       - Leverage first few interactions quickly

    b) New Items (Item Cold Start):
       - Use content-based features
       - Show to exploratory users
       - Leverage product category information
       - Bootstrap with initial promotions

17. EVALUATION BEYOND RMSE:

    a) Business Metrics:
       - Click-Through Rate (CTR)
       - Conversion Rate
       - Revenue per user
       - Average order value

    b) Ranking Metrics:
       - Precision@K: Fraction of the top K recommendations that are relevant
       - Recall@K: Fraction of all relevant items that appear in the top K
       - NDCG: Normalized Discounted Cumulative Gain
       - MAP: Mean Average Precision

    c) Diversity Metrics:
       - Catalog coverage
       - Item diversity in recommendations
       - Avoid filter bubbles

18. ADVANCED FEATURES (Future Enhancements):

    a) Time-Aware Recommendations:
       - Decay old interactions
       - Seasonal patterns
       - Trend detection

    b) Context-Aware:
       - Device type (mobile/desktop)
       - Time of day
       - Location-based
       - Weather-based

    c) Deep Learning Extensions:
       - Neural Collaborative Filtering
       - Autoencoders for embeddings
       - RNN for sequential patterns
       - Attention mechanisms

    d) Multi-Armed Bandits:
       - Exploration vs exploitation
       - Thompson Sampling
       - UCB (Upper Confidence Bound)

19. DATA PRIVACY & ETHICS:

    a) Privacy Considerations:
       - Anonymize user data
       - Secure storage
       - GDPR compliance
       - User consent for tracking

    b) Fairness:
       - Avoid popularity bias
       - Equal opportunity for new items
       - Diverse recommendations
       - No discriminatory patterns

    c) Transparency:
       - Explain why items are recommended
       - Allow user feedback
       - Provide controls (like/dislike)
       - Opt-out options

20. PRODUCTION DEPLOYMENT:

    a) Architecture:
       - Offline training (daily/weekly)
       - Online serving (real-time)
       - Feature store for quick access
       - Model registry for versioning

    b) Monitoring:
       - Track recommendation quality
       - Monitor conversion rates
       - Detect model drift
       - A/B test new models

    c) Scalability:
       - Distributed training (Spark/Dask)
       - Load balancing
       - Horizontal scaling
       - Caching strategies

21. CODE IMPLEMENTATION HIGHLIGHTS:

    a) MatrixFactorization Class:
       - Custom SGD implementation
       - Includes user/item biases
       - Regularization to prevent overfitting
       - Training progress tracking

    b) Recommendation Functions:
       - get_user_recommendations(): Top-N recommendations
       - get_similar_products(): Content similarity
       - get_trending_products(): Popularity-based

    c) Evaluation:
       - Train/test split
       - RMSE and MAE calculation
       - Model comparison
       - Visualization of results

22. MATHEMATICAL INTUITION:

    a) Why Matrix Factorization Works:
       - Low-rank approximation captures main patterns
       - Reduces noise in sparse data
       - Compresses information efficiently
       - Similar to PCA for ratings data

    b) Optimization Objective:
       - Minimize prediction error
       - Add regularization term
       - Balance fit vs complexity
       - Gradient descent finds a local optimum (the joint objective is non-convex)

23. PRACTICAL TIPS:

    a) Feature Engineering:
       - Normalize ratings by user mean
       - Log-transform skewed features
       - Create interaction features
       - Time-based features

    b) Hyperparameter Tuning:
       - Grid search or random search
       - Cross-validation
       - Monitor validation error
       - Early stopping

    c) Model Selection:
       - Start simple (SVD)
       - Add complexity if needed
       - Consider interpretability
       - Balance accuracy vs speed

24. DATASET CHARACTERISTICS:

    a) Product Categories:
       - Electronics (Apple products, Texas Instruments)
       - Home goods (Complete Protec)
       - Fashion items
       - Various price points ($14.49 - $880.95)

    b) Rating Distribution:
       - High ratings (4.0-4.8)
       - Indicates quality products
       - Possible positive bias
       - Need to consider rating scale

    c) Price Range:
       - Budget items (<$50)
       - Mid-range ($50-$200)
       - Premium items (>$200)
       - Discount impact on recommendations

25. KEY TAKEAWAYS:

    ✓ Matrix factorization is powerful for sparse data
    ✓ Multiple algorithms available (SVD, NMF, GD)
    ✓ Latent factors capture hidden patterns
    ✓ Combines scalability with accuracy
    ✓ Foundation for modern recommendation systems
    ✓ Used by Amazon, Netflix, Spotify, YouTube
    ✓ Can be enhanced with deep learning
    ✓ Requires careful evaluation and monitoring
    ✓ Business metrics matter more than RMSE
    ✓ User experience is the ultimate goal

==================================================================================
This recommendation system demonstrates how matrix factorization algorithms can
power personalized product recommendations in e-commerce, helping customers
discover relevant products while increasing business metrics like conversion
rates and revenue per user.
==================================================================================
"""