In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

# Load the dataset
data = pd.read_csv(r"Womens Clothing E-Commerce Reviews.csv")

# Group by 'Age' and 'Clothing ID', and calculate the mean rating
data = data.groupby(["Age", "Clothing ID"], as_index=False).agg({"Rating": "mean"})

# Create the user-item matrix
user_item_matrix = data.pivot(index="Age", columns="Clothing ID", values="Rating").fillna(0)

# Convert the user-item matrix to a sparse matrix
user_item_sparse_matrix = csr_matrix(user_item_matrix.values)

# Matrix factorization using Singular Value Decomposition (SVD)
# Choosing k (number of latent features) as 2 for simplicity
U, sigma, Vt = svds(user_item_sparse_matrix, k=2)
sigma = np.diag(sigma)

# Reconstruct the predicted ratings matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings_df = pd.DataFrame(
    predicted_ratings, columns=user_item_matrix.columns, index=user_item_matrix.index
)

# Define a recommendation function
def recommend_items(user_age, num_recommendations=3):
    if user_age not in predicted_ratings_df.index:
        raise ValueError("User age not found in the dataset.")

    # Get the predicted ratings for the given user age
    user_predictions = predicted_ratings_df.loc[user_age].sort_values(ascending=False)

    # Select the top recommended items
    recommended_items = user_predictions.head(num_recommendations).index.tolist()
    return recommended_items

# Attempt to get recommendations for a user aged 34
try:
    recommendations = recommend_items(user_age=34)
    print("Recommendations for user aged 34:", recommendations)
except ValueError as e:
    print(e)

# Evaluate the model using Root Mean Squared Error (RMSE)

# Split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Create user-item matrices for train and test sets
train_matrix = train_data.pivot(index="Age", columns="Clothing ID", values="Rating").fillna(0)
test_matrix = test_data.pivot(index="Age", columns="Clothing ID", values="Rating").fillna(0)

# Ensure the test matrix aligns with the predicted ratings dimensions
common_indices = test_matrix.index.intersection(predicted_ratings_df.index)
common_columns = test_matrix.columns.intersection(predicted_ratings_df.columns)
test_matrix = test_matrix.loc[common_indices, common_columns]
predicted_matrix = predicted_ratings_df.loc[common_indices, common_columns]

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_matrix.values, predicted_matrix.values))
print("RMSE of the model:", rmse)


Recommendations for user aged 34: [984, 1083, 834]
RMSE of the model: 1.3985110616255156


Dataset Preparation:

The dataset contains user reviews, including Age, Clothing ID, and Rating.
The ratings are grouped by Age and Clothing ID, and their mean values are calculated to create a compact representation.
User-Item Matrix:

A user-item matrix is constructed, where rows represent users (ages), columns represent items (clothing IDs), and the values are ratings. Missing values are filled with 0.
Matrix Factorization (SVD):

Singular Value Decomposition (SVD) is applied to the sparse matrix. It breaks down the matrix into three components:
U (user features),
sigma (importance of features),
Vt (item features).
These are used to approximate the original matrix and predict missing ratings.
Recommendation:

Predicted ratings are stored in a new matrix.
For a given user (age), the top n items with the highest predicted ratings are recommended.
Evaluation (RMSE):

The dataset is split into training and testing sets.
Predicted ratings are compared with actual ratings in the test set.
Root Mean Squared Error (RMSE) is calculated to measure prediction accuracy.