In [26]:
import pandas as pd
from surprise import Dataset, Reader, NMF
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict

In [27]:
ratings_df = pd.read_csv("../data/processed/ratings_explicit_only.csv")
books_df = pd.read_csv("../data/processed/books_final.csv")

ratings_df['ISBN'] = ratings_df['ISBN'].astype(str).str.strip()
books_df['ISBN'] = books_df['ISBN'].astype(str).str.strip()

In [28]:
user_counts = ratings_df['User-ID'].value_counts()
book_counts = ratings_df['ISBN'].value_counts()

active_users = user_counts[user_counts >= 10].index
popular_books = book_counts[book_counts >= 10].index

ratings_df = ratings_df[
    (ratings_df['User-ID'].isin(active_users)) &
    (ratings_df['ISBN'].isin(popular_books))
]

books_df = books_df[books_df['ISBN'].isin(ratings_df['ISBN'])]

In [29]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings_df[["User-ID", "ISBN", "Rating"]], reader)

In [30]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

model = NMF(n_factors=20, random_state=42)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x217b360bc50>

In [31]:
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)

RMSE: 2.1190


In [32]:
def precision_at_k(predictions, k=10, threshold=7.0):
    from collections import defaultdict
    user_est_true = defaultdict(list)
    
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions = []
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]
        
        hits = sum((true_r >= threshold) for (_, true_r) in top_k)
        precisions.append(hits / k)
    
    return sum(precisions) / len(precisions)

In [33]:
p_at_10 = precision_at_k(predictions, k=10, threshold=7.0)
print(f"Precision@10: {p_at_10:.4f}")

Precision@10: 0.2533


In [34]:
def get_unrated_books(user_id, ratings_df, all_books):
    rated = ratings_df[ratings_df['User-ID'] == user_id]['ISBN'].tolist()
    return [isbn for isbn in all_books if isbn not in rated]

def recommend_top_n_for_user(user_id, model, ratings_df, books_df, n=10):
    all_books = books_df['ISBN'].tolist()
    unrated_books = get_unrated_books(user_id, ratings_df, all_books)

    predictions = [model.predict(user_id, isbn) for isbn in unrated_books]
    predictions.sort(key=lambda x: x.est, reverse=True)

    return [(user_id, pred.iid, pred.est) for pred in predictions[:n]]

In [36]:
def recommend_all_users(model, ratings_df, books_df, n=10):
    user_ids = ratings_df['User-ID'].unique()
    all_recommendations = []

    for uid in user_ids:
        user_recs = recommend_top_n_for_user(uid, model, ratings_df, books_df, n)
        all_recommendations.extend(user_recs)

    return pd.DataFrame(all_recommendations, columns=["User-ID", "ISBN", "PredictedRating"])

In [37]:
recommendations_df = recommend_all_users(model, ratings_df, books_df, n=10)
recommendations_df.to_csv("../outputs/nmf_recommendations.csv", index=False)

In [42]:
book_title_map = dict(zip(books_df['ISBN'], books_df['Title']))

target_user = ratings_df['User-ID'].sample(4).iloc[0]
recs = recommend_top_n_for_user(target_user, model, ratings_df, books_df)

print(f"Top 10 recommendations for User {target_user}:\n")
for uid, isbn, est_rating in recs:
    title = book_title_map.get(isbn, "Unknown Title")
    print(f"{title} (Predicted Rating: {round(est_rating, 2)})")

Top 10 recommendations for User 177862:

Redeeming Love (Predicted Rating: 9.77)
Once a Princess (Predicted Rating: 9.57)
Watchers (Predicted Rating: 9.53)
Chicken Soup for the Kid's Soul : 101 Stories of Courage, Hope and Laughter (Chicken Soup for the Soul (Paperback Health Communications)) (Predicted Rating: 9.39)
The Little Prince (Wordsworth Collection) (Predicted Rating: 9.36)
The Secret Garden (Predicted Rating: 9.08)
Talk Before Sleep (Predicted Rating: 9.06)
Wolves of the Calla (The Dark Tower, Book 5) (Predicted Rating: 8.94)
Ophelia Speaks : Adolescent Girls Write About Their Search for Self (Predicted Rating: 8.89)
Dracula (Bantam Classics) (Predicted Rating: 8.8)


## Collaborative Filtering with NMF – Final Implementation Summary

In this notebook, we implemented and optimized an explicit collaborative filtering system using Non-negative Matrix Factorization (NMF). The approach was designed in accordance with the strategy defined during the initial data inspection phase.

### Problem with Initial Implementation:
- Due to the highly sparse nature of the dataset (0.0038% density), many users received fewer than 10 recommendations.
- The standard `train_test_split` based evaluation in Surprise was limited in coverage and did not ensure consistent Top-N recommendations.

### Improvements Applied:
- Filtered the dataset to include only:
  - Users with at least 10 ratings
  - Books with at least 10 ratings
- Normalized all ISBN values to avoid mismatches
- Trained an NMF model using Surprise with 20 latent factors
- Implemented a custom Top-N generation function that:
  - Retrieves all books a user has not rated
  - Predicts ratings using the trained model
  - Returns the top N items based on predicted score

### Evaluation:
- RMSE was calculated on the test set
- A custom Precision@10 function was applied to assess recommendation ranking performance
- Results showed significantly improved recommendation quality and consistency

### Output:
- Final user-item recommendations were saved to a CSV file (`nmf_recommendations.csv`)
- An example output was displayed for a randomly selected user, including actual book titles and predicted ratings

This notebook concludes the collaborative filtering phase of the project. The next step will focus on implementing a content-based filtering system to handle cold-start cases and further enhance the recommendation pipeline.
