In [1]:
# Code for loading data, creating user-item matrix, applying NMF,
# reconstructing the matrix, predicting ratings, and recommending products.

import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the data from CSV (only the first 100,000 rows are read).
file_path = 'extract_amazon/ratings_Beauty.csv'
df = pd.read_csv(file_path, nrows=100000)

print("Data loaded successfully:")
print(df.head())

# Create the user-item interaction matrix: one row per UserId, one column per
# ProductId, cell value = Rating, with 0 standing in for "no rating".
#
# DataFrame.pivot raises ValueError if the same (UserId, ProductId) pair occurs
# more than once, so repeated pairs are collapsed first (the last occurrence in
# file order wins). `df` itself is left untouched. For data without repeated
# pairs the resulting matrix is identical to a plain pivot.
user_item_matrix = (
    df.drop_duplicates(subset=['UserId', 'ProductId'], keep='last')
      .pivot(index='UserId', columns='ProductId', values='Rating')
      .fillna(0)
)

print("User-Item Interaction Matrix:")
print(user_item_matrix.head())

# Convert the matrix to a numpy array (row/column order matches
# user_item_matrix.index / user_item_matrix.columns).
user_item_matrix_np = user_item_matrix.values

# Split the data into training and test sets.
# NOTE: the split is over rows, i.e. over users. train_test_split shuffles by
# default, so row i of train_data / test_data is NOT row i of user_item_matrix.
train_data, test_data = train_test_split(user_item_matrix_np, test_size=0.2, random_state=42)

print("Training data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

# Apply NMF
model = NMF(n_components=10, init='random', random_state=42)  # n_components is the number of latent features
W_train = model.fit_transform(train_data)  # one row of latent factors per training row
H_train = model.components_                # one column of latent factors per product

# Print the decomposed matrices
print("User Feature Matrix (W):")
print(W_train)

print("Item Feature Matrix (H):")
print(H_train)

# Reconstruct the user-item matrix from the training data.
# Rows follow the (shuffled) order of train_data, not user_item_matrix.index.
reconstructed_matrix_train = np.dot(W_train, H_train)

print("Reconstructed User-Item Matrix (Train):")
print(reconstructed_matrix_train)

# Predict ratings for the test data: project the held-out rows onto the learned
# item factors, then reconstruct them.
W_test = model.transform(test_data)
reconstructed_matrix_test = np.dot(W_test, H_train)

# Calculate RMSE for the test data.
# Only cells that hold an actual rating are scored. The matrix was zero-filled
# for missing ratings, so including those cells would let the (overwhelmingly
# many) unrated entries dominate the error and hide how well real ratings are
# reproduced.
# NOTE(review): W_test is derived from the same test rows it is scored against,
# so this measures reconstruction error on unseen users rather than error on
# ratings that were hidden from the model.
observed_mask = test_data > 0
if observed_mask.any():
    rmse = np.sqrt(
        mean_squared_error(test_data[observed_mask], reconstructed_matrix_test[observed_mask])
    )
else:
    # No rated cells in the test split: the metric is undefined.
    rmse = float('nan')
print(f"RMSE on test data: {rmse}")

# Predict the rating for a specific user and product
def predict_rating(user_id, product_id):
    """Predict the rating of ``product_id`` by ``user_id`` using the fitted NMF model.

    The user's row of ``user_item_matrix`` is projected onto the learned item
    factors (``model.transform``) and multiplied by the factor column of the
    requested product.

    ``reconstructed_matrix_train`` is deliberately not used here: it only
    contains the shuffled training subset of users, so a row position taken
    from ``user_item_matrix.index`` would address a different user or fall
    outside the array.

    Args:
        user_id: Label present in ``user_item_matrix.index``.
        product_id: Label present in ``user_item_matrix.columns``.

    Returns:
        The reconstructed rating as a numpy float.

    Raises:
        KeyError: If ``user_id`` or ``product_id`` is not in the matrix.
    """
    # Double brackets keep the selection 2-D (shape (1, n_products)), which is
    # what model.transform expects; .values avoids feature-name warnings since
    # the model was fitted on a plain array.
    user_row = user_item_matrix.loc[[user_id]].values
    product_index = user_item_matrix.columns.get_loc(product_id)
    user_factors = model.transform(user_row)[0]
    return np.dot(user_factors, model.components_[:, product_index])

# Function to recommend top N products for a user
def recommend_top_n(user_id, n=5):
    """Return the ``n`` products with the highest predicted rating for a user.

    The user's row of ``user_item_matrix`` is projected onto the learned item
    factors (``model.transform``) and reconstructed, giving one score per
    product; the best-scoring product ids are returned, highest first.

    ``reconstructed_matrix_train`` is deliberately not used here: it only
    contains the shuffled training subset of users, so a row position taken
    from ``user_item_matrix.index`` would address a different user or fall
    outside the array.

    Products the user has already rated are not filtered out.

    Args:
        user_id: Label present in ``user_item_matrix.index``.
        n: Number of products to return. Values <= 0 yield an empty list.

    Returns:
        A list of at most ``n`` product ids ordered by descending score.

    Raises:
        KeyError: If ``user_id`` is not in the matrix.
    """
    # Look the user up first so an unknown id raises KeyError regardless of n.
    user_row = user_item_matrix.loc[[user_id]].values
    if n <= 0:
        # A slice of [-0:] would select every product, so handle this explicitly.
        return []
    user_scores = np.dot(model.transform(user_row), model.components_)[0]
    # argsort is ascending: take the last n positions and reverse them.
    top_n_product_indices = np.argsort(user_scores)[-n:][::-1]
    return user_item_matrix.columns[top_n_product_indices].tolist()

# Demo 1: score the user/product pair found in the first row of the ratings frame.
example_user_id = df['UserId'].iat[0]
example_product_id = df['ProductId'].iat[0]
predicted_rating = predict_rating(user_id=example_user_id, product_id=example_product_id)
print(f"Predicted rating for user {example_user_id} on product {example_product_id} is {predicted_rating:.2f}")

# Demo 2: list the five best-scoring products for that same user.
top_products = recommend_top_n(user_id=example_user_id, n=5)
print(f"Top recommended products for user {example_user_id}: {top_products}")


Data loaded successfully:
           UserId   ProductId  Rating   Timestamp
0  A39HTATAQ9V7YF  0205616461     5.0  1369699200
1  A3JM6GV9MNOF9X  0558925278     3.0  1355443200
2  A1Z513UWSAAO0F  0558925278     5.0  1404691200
3  A1WMRR494NWEWV  0733001998     4.0  1382572800
4  A3IAAVS479H7M7  0737104473     1.0  1274227200
User-Item Interaction Matrix:
ProductId              0205616461  0558925278  0733001998  0737104473  \
UserId                                                                  
A00205921JHJK5X9LNP42         0.0         0.0         0.0         0.0   
A00473363TJ8YSZ3YAGG9         0.0         0.0         0.0         0.0   
A00700212KB3K0MVESPIY         0.0         0.0         0.0         0.0   
A0081289HG0BXFQJQUWW          0.0         0.0         0.0         0.0   
A01247753D6GFZD87MUV8         0.0         0.0         0.0         0.0   

ProductId              0762451459  1304139212  1304139220  130414089X  \
UserId                                                     