In [None]:
# Collaborative Filtering for Book Recommendations
# This notebook implements collaborative filtering to recommend books based on user ratings.
# It uses item-based collaborative filtering to find books similar to a target book,
# looks up users with similar rating patterns, and evaluates a global-mean baseline with RMSE.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split

# Load book ratings data.
# The Book-Crossing ratings file is commonly distributed semicolon-delimited and
# Latin-1 encoded; with the default comma/UTF-8 settings it would be read as a
# single column and the rename below would fail. sep=None makes the python
# engine sniff the delimiter, so a comma-separated re-export loads as well.
ratings_raw = pd.read_csv(
    "BX-Book-Ratings.csv",
    sep=None,
    engine="python",
    encoding="latin-1",
)

# Fail early with a readable message if the file does not have the three
# columns (user id, ISBN, rating) that the rest of the notebook relies on.
if ratings_raw.shape[1] != 3:
    raise ValueError(
        "Expected 3 columns (user id, ISBN, rating) in BX-Book-Ratings.csv, "
        f"found {ratings_raw.shape[1]}: {list(ratings_raw.columns)}"
    )

# Rename columns to match expected format
ratings_raw.columns = ['user_id', 'item_id', 'rating']

# Since no book titles are available in the dataset, ISBN doubles as the title.
ratings_titled = ratings_raw.assign(title=ratings_raw['item_id'])

# Filter out implicit ratings (0 ratings are implicit in this dataset)
ratings_explicit = ratings_titled[ratings_titled['rating'] > 0]

# Filter users and items with few ratings.
# Both counts are taken from the explicit ratings before either filter is
# applied, so after the item filter a kept user may have fewer than
# MIN_RATINGS_PER_USER rows left.
MIN_RATINGS_PER_USER = 5
MIN_RATINGS_PER_ITEM = 10
user_counts = ratings_explicit['user_id'].value_counts()
item_counts = ratings_explicit['item_id'].value_counts()
active_users = user_counts[user_counts >= MIN_RATINGS_PER_USER].index
popular_items = item_counts[item_counts >= MIN_RATINGS_PER_ITEM].index
book_ratings = ratings_explicit[
    ratings_explicit['user_id'].isin(active_users)
    & ratings_explicit['item_id'].isin(popular_items)
]

print(f"Filtered dataset shape: {book_ratings.shape}")
book_ratings.head(2)

In [None]:
# Explore the filtered ratings: distribution plot plus per-book summary stats

# Bar chart of how often each rating value occurs
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='rating', data=book_ratings, ax=ax)
ax.set_title('Distribution of Book Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
fig.savefig('rating_distribution.png')
plt.show()

# Per-book mean rating and number of ratings, indexed by title
ratings_by_title = book_ratings.groupby('title')['rating']
mean_ratings = ratings_by_title.mean().to_frame()
mean_ratings['num_of_ratings'] = ratings_by_title.count()

print("Top 5 highest rated books:")
mean_ratings.sort_values('rating', ascending=False).head(5)

In [None]:
# Build the user x book rating matrix used by the collaborative-filtering cells.
# Rows are users, columns are books (keyed by the 'title' column); user/book
# pairs without a rating are NaN.
pivot = pd.pivot_table(
    book_ratings,
    values='rating',
    index='user_id',
    columns='title',
)

print(f"Pivot table shape: {pivot.shape}")
pivot.head()

In [None]:
# Item-based collaborative filtering
# Select a target book (using ISBN as example)
target_book = pivot.columns[0]  # First book in the dataset
print(f"Target book: {target_book}")

# Minimum number of ratings a candidate book needs to be listed
MIN_RATINGS_FOR_SIMILARITY = 50

# Correlate every book's rating column with the target book's column
# (computed over the users who rated both books).
selected_book_ratings = pivot[target_book]
similar_books = pivot.corrwith(selected_book_ratings)

similar_df = (
    similar_books.to_frame(name='Correlation')
    .dropna()
    # The target always correlates perfectly with itself; drop it so it does
    # not show up as its own top recommendation.
    .drop(index=target_book, errors='ignore')
    .join(mean_ratings['num_of_ratings'])
)

# Filter books with sufficient ratings
similar_df = similar_df[similar_df['num_of_ratings'] >= MIN_RATINGS_FOR_SIMILARITY]
similar_df.sort_values('Correlation', ascending=False).head(10)

In [None]:
# User-based collaborative filtering
def get_similar_users(user_id, n_similar=5, ratings=None):
    """Return the users whose ratings correlate most with ``user_id``'s.

    Parameters
    ----------
    user_id : hashable
        Row label of the user to compare against.
    n_similar : int, default 5
        Maximum number of similar users to return.
    ratings : pandas.DataFrame, optional
        User x item rating matrix (rows are users, columns are items, NaN for
        missing ratings). Defaults to the notebook-level ``pivot``.

    Returns
    -------
    pandas.Series
        Pearson correlations indexed by user, sorted descending. The query
        user and users with an undefined correlation are omitted.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row of the rating matrix.
    """
    matrix = pivot if ratings is None else ratings
    if user_id not in matrix.index:
        raise KeyError(f"user_id {user_id!r} is not present in the rating matrix")

    user_ratings = matrix.loc[user_id].dropna()
    if user_ratings.empty:
        return pd.Series(dtype=float)

    # Only the items the query user rated can contribute to a correlation, so
    # restrict the matrix to those columns before comparing.
    candidates = matrix.loc[:, user_ratings.index]

    # axis=1 compares each *row* (user) with user_ratings, aligned on item
    # labels. The default axis=0 would compare item columns (indexed by user)
    # against a Series indexed by item, which never aligns and yields only NaN.
    similarity = candidates.corrwith(user_ratings, axis=1)

    similarity = (
        similarity.drop(labels=user_id, errors='ignore')  # a user is not their own neighbour
        .dropna()
        .sort_values(ascending=False)
    )
    return similarity.head(n_similar)

# Example: get similar users for the first user in the rating matrix.
# A hard-coded id such as 0 is not guaranteed to be a row of pivot after the
# filtering steps, in which case pivot.loc would raise KeyError.
example_user_id = pivot.index[0]
similar_users = get_similar_users(example_user_id)
print("Similar users:")
similar_users

In [None]:
# Evaluation using RMSE
# Split data into train and test sets
train_data, test_data = train_test_split(book_ratings, test_size=0.2, random_state=42)

# Simple baseline: predict the mean training rating for every test row.
avg_rating = train_data['rating'].mean()
# assign() builds a new frame instead of writing a column into the slice
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning.
test_data = test_data.assign(predicted_rating=avg_rating)

# Calculate RMSE
rmse = sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))
print(f"Baseline RMSE: {rmse:.4f}")

# Plot actual vs predicted (the baseline predicts one constant value, so the
# points fall on a single horizontal line).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(test_data['rating'], test_data['predicted_rating'], alpha=0.5)
ax.set_xlabel('Actual Rating')
ax.set_ylabel('Predicted Rating')
ax.set_title('Actual vs Predicted Ratings (Baseline)')
fig.savefig('actual_vs_predicted.png')
plt.show()

In [None]:
# Visualization: highest average ratings among books with at least 100 ratings
frequently_rated = mean_ratings[mean_ratings['num_of_ratings'] >= 100]
top_books = frequently_rated.sort_values('rating', ascending=False).head(10)

# Horizontal bars: one per book, bar length is the average rating
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=top_books['rating'], y=top_books.index, ax=ax)
ax.set_title('Top 10 Highest Rated Books (with >= 100 ratings)')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Book ISBN')
fig.savefig('top_rated_books.png')
plt.show()