In [1]:
from sklearn.datasets import load_svmlight_file
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd


In [2]:
# Get the file path of the uploaded file
file_path = '/workspaces/book-recommendation-system/user_book_ratings.libsvm'

In [3]:
# Load the Sparse Matrix
print("Loading the sparse matrix...")
ratings_matrix, _ = load_svmlight_file(file_path)
ratings_matrix = csr_matrix(ratings_matrix) # Ensure CSR format
print("Sparse matrix loaded successfully.")

Loading the sparse matrix...
Sparse matrix loaded successfully.


In [4]:
# Efficient Cosine Similarity for Sparse Matrices
def cosine_similarity_sparse(user_vector, matrix):
    """Compute cosine similarities between a user vector and all rows in a sparse matrix."""
    dot_product = matrix.dot(user_vector.T).toarray().flatten()
    user_norm = np.sqrt(user_vector.power(2).sum())
    norms = np.sqrt(matrix.multiply(matrix).sum(axis=1)).A.flatten()
    similarities = dot_product / (user_norm * norms + 1e-10) # Add small constant to avoid division by zero
    return similarities

In [9]:
def recommend_books_sparse(user_index, k=10):
    """Recommend books for a specific user using sparse cosine similarity."""
    
    # Step 1: Compute sparse cosine similarity
    user_vector = ratings_matrix[user_index]
    similarity_vector = cosine_similarity_sparse(user_vector, ratings_matrix)

    # Step 2: Find K most similar users (excluding the current user)
    similarity_vector[user_index] = 0  # Set self-similarity to 0
    similar_users = np.argsort(similarity_vector)[-k:][::-1]

    # Step 3: Collect books rated by similar users
    similar_users_books = set()
    for sim_user in similar_users:
        similar_users_books.update(ratings_matrix[sim_user].indices)

    # Step 4: Exclude books already rated by the current user
    user_books = set(user_vector.indices)
    candidate_books = similar_users_books - user_books

    # Step 5: Calculate estimated ratings for candidate books
    estimated_ratings = {}
    for book in candidate_books:
        numerator = 0
        denominator = 0
        for sim_user in similar_users:
            sim = similarity_vector[sim_user]
            rating = ratings_matrix[sim_user, book]
            if rating > 0:  # Only consider books with ratings
                numerator += sim * rating
                denominator += sim
        if denominator > 0:
            estimated_ratings[book] = numerator / denominator

    # Step 6: Return top N recommended books
    top_books = sorted(estimated_ratings.items(), key=lambda x: x[1], reverse=True)[:5]
    return top_books


In [10]:
def generate_recommendations_sparse(ratings_matrix, k=10, user_limit=500):
    """Generate recommendations for a subset of users."""
    recommendations = []
    num_users = min(user_limit, ratings_matrix.shape[0])  # Limit to specified number of users

    for user_index in range(num_users):
        print(f"Processing user {user_index + 1}/{num_users}...")
        top_books = recommend_books_sparse(user_index, k)

        if top_books:
            book_ids = ",".join(str(book + 1) for book, _ in top_books)  # Convert to 1-based indexing
            scores = ",".join(f"{score:.0f}" for _, score in top_books)

            recommendations.append({
                'User_ID': user_index + 1,  # Convert to 1-based indexing
                'Book_IDs': book_ids,
                'Recommendation_Scores': scores
            })

    return recommendations


In [None]:
# Generate Recommendations for Subset of Users
print("Generating recommendations for a subset of users...")

user_limit = 105283  # Change this number to control the subset size

all_recommendations = generate_recommendations_sparse(
    ratings_matrix,
    k=10,
    user_limit=user_limit
)

# Save Recommendations to a CSV File
recommendations_df = pd.DataFrame(all_recommendations)
recommendations_csv_path = f'recommendations_{user_limit}_users_new.csv'

recommendations_df.to_csv(recommendations_csv_path, index=False, sep=';')

print(f"Recommendations saved to '{recommendations_csv_path}'.")


Generating recommendations for a subset of users...
Processing user 1/105283...
Processing user 2/105283...
Processing user 3/105283...
Processing user 4/105283...
Processing user 5/105283...
Processing user 6/105283...
Processing user 7/105283...
Processing user 8/105283...
Processing user 9/105283...
Processing user 10/105283...
Processing user 11/105283...
Processing user 12/105283...
Processing user 13/105283...
Processing user 14/105283...
Processing user 15/105283...
Processing user 16/105283...
Processing user 17/105283...
Processing user 18/105283...
Processing user 19/105283...
Processing user 20/105283...
Processing user 21/105283...
Processing user 22/105283...
Processing user 23/105283...
Processing user 24/105283...
Processing user 25/105283...
Processing user 26/105283...
Processing user 27/105283...
Processing user 28/105283...
Processing user 29/105283...
Processing user 30/105283...
Processing user 31/105283...
Processing user 32/105283...
Processing user 33/105283...
