# 📚 Book Recommendation System — Extended

This notebook builds multiple recommenders and **evaluates** them using RMSE, Precision@K, Recall@K, Coverage, and Novelty.

**Models included:**
- Popularity-based
- Item-based Collaborative Filtering
- Simple User-based Collaborative Filtering (neighborhood aggregation)
- Matrix Factorization (Surprise SVD & NMF)
- Content-based (TF-IDF + linear_kernel)
- Hybrid (weighted combination)

**Evaluation & plots** included.

> NOTE: This notebook expects `books.csv`, `users.csv`, `ratings.csv` to be present in the working directory. Adjust thresholds to fit your environment.


In [1]:
# Imports
import pandas as pd, numpy as np, pickle, os
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.model_selection import train_test_split
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Surprise (matrix factorization)
try:
    from surprise import Dataset, Reader, SVD, NMF, accuracy, dump
    from surprise.model_selection import train_test_split as surprise_train_test_split, GridSearchCV as SurpriseGridSearchCV
except Exception as e:
    print('Surprise not installed or import failed:', e)
    print('Install with: pip install scikit-surprise or pip install surprise==0.1.6')


Surprise not installed or import failed: No module named 'surprise'
Install with: pip install scikit-surprise or pip install surprise==0.1.6


In [2]:
!pip install scikit-surprise


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


  error: subprocess-exited-with-error
  
  Getting requirements to build wheel did not run successfully.
  exit code: 1
  
  [45 lines of output]
  
  Error compiling Cython file:
  ------------------------------------------------------------
  ...
          self.avg_cltr_i = avg_cltr_i
          self.avg_cocltr = avg_cocltr
  
          return self
  
      def compute_averages(self, np.ndarray[np.int_t] cltr_u,
                                               ^
  ------------------------------------------------------------
  
  surprise\prediction_algorithms\co_clustering.pyx:157:45: Invalid type.
  Compiling surprise/similarities.pyx because it changed.
  Compiling surprise/prediction_algorithms/matrix_factorization.pyx because it changed.
  Compiling surprise/prediction_algorithms/optimize_baselines.pyx because it changed.
  Compiling surprise/prediction_algorithms/slope_one.pyx because it changed.
  Compiling surprise/prediction_algorithms/co_clustering.pyx because it changed.
  [1/

In [None]:
# Load the three raw tables from the working directory (adjust paths if needed)
books, users, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('books.csv', 'users.csv', 'ratings.csv')
)

# Quick sanity check: (rows, columns) of every table
print('books', books.shape, 'users', users.shape, 'ratings', ratings.shape)

In [None]:
# Basic cleaning & merge

# Publication year: anything that does not parse as a number becomes NaN,
# and years outside the plausible 1500-2025 window are blanked out as well.
books['Year-Of-Publication'] = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
implausible_year = (books['Year-Of-Publication'] < 1500) | (books['Year-Of-Publication'] > 2025)
books.loc[implausible_year, 'Year-Of-Publication'] = np.nan

# Missing author / publisher -> 'Unknown'.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form is chained assignment, which raises a warning on
# recent pandas and silently leaves `books` unchanged under Copy-on-Write.
books['Book-Author'] = books['Book-Author'].fillna('Unknown')
books['Publisher'] = books['Publisher'].fillna('Unknown')

# Inner join: only ratings whose ISBN exists in `books` are kept
ratings_with_name = ratings.merge(books, on='ISBN')
print('merged shape:', ratings_with_name.shape)


In [None]:
# Popularity-based (Top N)
# Per-title rating count and mean rating, each as a two-column frame
rating_by_title = ratings_with_name.groupby('Book-Title')['Book-Rating']
num_rating_df = rating_by_title.count().rename('num_ratings').reset_index()
avg_rating_df = rating_by_title.mean().rename('avg_ratings').reset_index()

# Keep titles with more than 250 ratings, then take the 50 best by mean rating
title_stats = num_rating_df.merge(avg_rating_df, on='Book-Title')
frequently_rated = title_stats.loc[title_stats['num_ratings'] > 250]
top_rated = frequently_rated.sort_values('avg_ratings', ascending=False).head(50)

# Attach book metadata; a title may map to several ISBNs, so keep one row per title
display_columns = ['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_ratings']
popular_df = top_rated.merge(books, on='Book-Title').drop_duplicates('Book-Title')[display_columns]
popular_df.head()

In [None]:
# Collaborative filtering preparation (use thresholds to reduce sparsity)
# Step 1: keep users with more than 200 ratings in the merged table
x = ratings_with_name.groupby('User-ID')['Book-Rating'].count() > 200
book_readers = x.loc[x].index
is_heavy_reader = ratings_with_name['User-ID'].isin(book_readers)
filtered_rating = ratings_with_name.loc[is_heavy_reader]

# Step 2: among those users, keep titles with more than 50 ratings
y = filtered_rating.groupby('Book-Title')['Book-Rating'].count() > 50
famous_books = y.loc[y].index
is_famous_title = filtered_rating['Book-Title'].isin(famous_books)
final_ratings = filtered_rating.loc[is_famous_title]

# pivot: books x users; (title, user) pairs without a rating become 0
pt = final_ratings.pivot_table(
    index='Book-Title',
    columns='User-ID',
    values='Book-Rating',
).fillna(0)
print('pt shape:', pt.shape)

In [None]:
# Item-based similarity: cosine between the rating rows of pt (one row per book)
item_similarity = cosine_similarity(pt)
print(f'item_similarity shape: {item_similarity.shape}')
# Row/column i of item_similarity corresponds to book_list[i]
book_list = pt.index.tolist()


In [None]:
# User-based similarity: cosine between user rating vectors
user_item = pt.transpose()  # users x books
user_similarity = cosine_similarity(user_item)
print(f'user_similarity shape: {user_similarity.shape}')
# Row/column i of user_similarity corresponds to user_list[i]
user_list = user_item.index.tolist()


In [None]:
# Content-based: TF-IDF (title + author) + linear kernel
# One catalogue row per distinct title, re-indexed 0..n-1 so that the row
# position doubles as the index into the similarity matrix.
books_unique = books.drop_duplicates('Book-Title').reset_index(drop=True)
title_text = books_unique['Book-Title'].astype(str)
author_text = books_unique['Book-Author'].astype(str)
books_unique['features'] = title_text + ' ' + author_text

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_unique['features'])
# NOTE(review): this materialises an n x n matrix for n unique titles;
# confirm it fits in memory for the full catalogue.
content_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

# title -> row position in books_unique / content_similarity
title_to_idx = dict(zip(books_unique['Book-Title'].values, range(len(books_unique))))
print(f'books_unique: {books_unique.shape} content_similarity shape: {content_similarity.shape}')


In [None]:
# Recommendation functions

def recommend_item_cf(book_name, topk=5):
    """Recommend the books whose rating rows in `pt` are most similar to `book_name`.

    Parameters
    ----------
    book_name : title to look up in `pt.index`.
    topk : maximum number of recommendations to return.

    Returns
    -------
    list of (title, author, image_url) tuples, most similar first.
    Returns [] when `book_name` is not a row of `pt`.
    """
    if book_name not in pt.index:
        return []
    idx = pt.index.get_loc(book_name)
    sims = np.asarray(item_similarity[idx], dtype=float)
    # Stable sort on the negated scores: descending similarity, ties kept in pt order.
    ranked = np.argsort(-sims, kind='stable')
    # Remove the query by position rather than assuming it sorts first: with an
    # exact tie, or a row whose similarities are all 0, the query is not
    # guaranteed to be ranked at position 0.
    ranked = ranked[ranked != idx][:max(topk, 0)]
    recs = []
    for i in ranked:
        title = pt.index[i]
        # First catalogue row for this title (a title can appear under several ISBNs)
        tmp = books.loc[books['Book-Title'] == title].iloc[0]
        recs.append((tmp['Book-Title'], tmp['Book-Author'], tmp.get('Image-URL-M', '')))
    return recs

def recommend_user_cf(book_name, topk=5):
    """Recommend books via the neighbourhoods of the users who rated `book_name`.

    For every user who rated `book_name` (according to `final_ratings`), each
    other user with a positive similarity contributes their positive ratings,
    weighted by that similarity. A book's score is the similarity-weighted
    mean of those ratings. `book_name` itself is never recommended.

    This is the vectorised form of the nested per-user / per-neighbour /
    per-book loops: the same sums are computed with two matrix-vector
    products, so results can differ from a sequential accumulation only by
    floating-point rounding. Ties are ordered by column order of `user_item`.

    Parameters
    ----------
    book_name : title to look up in `final_ratings['Book-Title']`.
    topk : maximum number of recommendations to return.

    Returns
    -------
    list of (title, author, image_url) tuples, highest score first.
    Returns [] when nobody in `final_ratings` rated `book_name`.
    """
    users_who_rated = final_ratings.loc[
        final_ratings['Book-Title'] == book_name, 'User-ID'
    ].unique()
    if len(users_who_rated) == 0:
        return []

    # Row positions of the raters inside user_item / user_similarity (-1 = unknown user)
    rater_pos = user_item.index.get_indexer(users_who_rated)
    rater_pos = rater_pos[rater_pos >= 0]
    if rater_pos.size == 0:
        return []

    # Total positive similarity each user receives from the raters;
    # non-positive similarities are ignored.
    sim_rows = np.asarray(user_similarity, dtype=float)[rater_pos]
    neighbour_weight = np.where(sim_rows > 0, sim_rows, 0.0).sum(axis=0)

    # Only strictly positive ratings count as "rated"
    ratings_matrix = user_item.to_numpy(dtype=float)
    has_rating = ratings_matrix > 0
    agg_scores = neighbour_weight @ np.where(has_rating, ratings_matrix, 0.0)
    agg_weights = neighbour_weight @ has_rating.astype(float)

    candidates = np.flatnonzero((agg_weights > 0) & (user_item.columns != book_name))
    predicted = agg_scores[candidates] / agg_weights[candidates]
    best = candidates[np.argsort(-predicted, kind='stable')][:topk]

    recs = []
    for title in user_item.columns[best]:
        # First catalogue row for this title (a title can appear under several ISBNs)
        tmp = books.loc[books['Book-Title'] == title].iloc[0]
        recs.append((tmp['Book-Title'], tmp['Book-Author'], tmp.get('Image-URL-M', '')))
    return recs

def recommend_content(book_name, topk=5):
    """Recommend the books whose TF-IDF features are most similar to `book_name`.

    Parameters
    ----------
    book_name : title to look up in `title_to_idx`.
    topk : maximum number of recommendations to return.

    Returns
    -------
    list of (title, author, image_url) tuples, most similar first.
    Returns [] when `book_name` is not a key of `title_to_idx`.
    """
    if book_name not in title_to_idx:
        return []
    idx = title_to_idx[book_name]
    sims = np.asarray(content_similarity[idx], dtype=float).ravel()
    # Stable sort on the negated scores: descending similarity, ties kept in
    # books_unique order. argsort avoids a Python-level sort of the whole row.
    ranked = np.argsort(-sims, kind='stable')
    # Remove the query by position rather than assuming it sorts first: another
    # title with an equal score (or an all-zero feature vector) would otherwise
    # let the query appear in its own recommendations.
    ranked = ranked[ranked != idx][:max(topk, 0)]
    recs = []
    for i in ranked:
        tmp = books_unique.iloc[i]
        recs.append((tmp['Book-Title'], tmp['Book-Author'], tmp.get('Image-URL-M', '')))
    return recs

def recommend_hybrid(book_name, topk=5, alpha=0.6):
    """Blend item-CF and content similarity into one ranking.

    final score = alpha * item-CF similarity + (1 - alpha) * content similarity,
    computed over the rows of `books_unique`. Titles absent from `pt` get an
    item-CF similarity of 0.

    Parameters
    ----------
    book_name : query title.
    topk : maximum number of recommendations to return.
    alpha : weight of the item-CF component (the content component gets 1 - alpha).

    Returns
    -------
    list of (title, author, image_url) tuples, highest blended score first.
    Returns [] when `book_name` is known to neither `pt` nor `title_to_idx`,
    instead of ranking an all-zero score vector.
    """
    in_cf = book_name in pt.index
    in_content = book_name in title_to_idx
    if not (in_cf or in_content):
        return []

    n = len(books_unique)
    item_scores = np.zeros(n)
    content_scores = np.zeros(n)
    if in_cf:
        sims = item_similarity[pt.index.get_loc(book_name)]
        # Re-index the CF similarities from pt positions to books_unique positions
        for title, score in zip(pt.index, sims):
            pos = title_to_idx.get(title)
            if pos is not None:
                item_scores[pos] = score
    if in_content:
        content_scores = np.asarray(content_similarity[title_to_idx[book_name]], dtype=float)

    final = alpha * item_scores + (1 - alpha) * content_scores
    # Stable sort on the negated scores: descending, ties kept in books_unique order
    ranked = np.argsort(-final, kind='stable')
    if in_content:
        # Drop the query by position rather than assuming it is ranked first
        ranked = ranked[ranked != title_to_idx[book_name]]
    top_idx = ranked[:max(topk, 0)]

    recs = []
    for idx in top_idx:
        tmp = books_unique.iloc[idx]
        recs.append((tmp['Book-Title'], tmp['Book-Author'], tmp.get('Image-URL-M', '')))
    return recs


In [None]:
# Test recommendations (example)
# Prefer a title present in the CF matrix; fall back to the first catalogue title
if len(pt.index) > 0:
    sample = pt.index[0]
else:
    sample = books_unique['Book-Title'].iloc[0]
print('sample:', sample)

# Run the same query through every recommender
for label, recommender in (
    ('Item-CF:', recommend_item_cf),
    ('User-CF:', recommend_user_cf),
    ('Content:', recommend_content),
    ('Hybrid:', recommend_hybrid),
):
    print(label, recommender(sample))


In [None]:
# Surprise: train SVD and NMF on final_ratings (user-item interactions)
from surprise import Dataset, Reader, SVD, NMF, accuracy
from surprise.model_selection import train_test_split as s_train_test_split

# Surprise expects exactly three columns, in the order: user, item, rating
ratings_surprise = final_ratings[['User-ID','Book-Title','Book-Rating']].copy()
# NOTE(review): rating_scale=(1,10) assumes no rating below 1; verify that
# final_ratings contains no 0 ratings, or filter them out before this cell.
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(ratings_surprise, reader)
# Seed the split as well as the models so the reported RMSE is reproducible
trainset, testset = s_train_test_split(data, test_size=0.2, random_state=42)

# Train SVD (default small factors)
svd = SVD(n_factors=50, random_state=42)
svd.fit(trainset)
pred_svd = svd.test(testset)
# verbose=False: accuracy.rmse would otherwise print the value a second time
print('SVD RMSE:', accuracy.rmse(pred_svd, verbose=False))

# Train NMF
nmf = NMF(n_factors=15, random_state=42)
nmf.fit(trainset)
pred_nmf = nmf.test(testset)
print('NMF RMSE:', accuracy.rmse(pred_nmf, verbose=False))


In [None]:
# Evaluation helpers: Precision@K, Recall@K, Coverage, Novelty
def get_user_ground_truth(test_df, user):
    """Return the set of 'Book-Title' values on the rows of `test_df` whose 'User-ID' equals `user`."""
    held_out_titles = test_df.loc[test_df['User-ID'] == user, 'Book-Title']
    return set(held_out_titles.to_numpy())

def recommend_for_user_itemcf(user_id, train_df, k=5):
    """Top-k item-CF recommendations for one user.

    Every book the user rated in `train_df` (and that is a row of `pt`) votes
    for all other books with weight = item similarity. A candidate's score is
    the similarity-weighted mean of the user's ratings. Books the user already
    rated in `train_df` are never recommended.

    NOTE(review): `pt` and `item_similarity` are module-level; in this notebook
    they are built from all of `final_ratings`, before the train/test split,
    so held-out ratings influence the similarities used here.

    Parameters
    ----------
    user_id : value to match against train_df['User-ID'].
    train_df : frame with 'User-ID', 'Book-Title' and 'Book-Rating' columns.
    k : maximum number of titles to return.

    Returns
    -------
    list of titles, highest predicted score first (ties in `pt.index` order).
    Returns [] when the user has no rows in `train_df`.
    """
    user_ratings = train_df.loc[train_df['User-ID'] == user_id, ['Book-Title', 'Book-Rating']]
    if user_ratings.empty:
        return []

    titles = pt.index
    scores = np.zeros(len(titles))
    weights = np.zeros(len(titles))
    # One vectorised update per rated book. Accumulating row by row keeps the
    # same per-candidate summation order as an element-wise loop would.
    for rated_title, rating in zip(user_ratings['Book-Title'], user_ratings['Book-Rating']):
        if rated_title not in titles:
            continue
        sims = np.asarray(item_similarity[titles.get_loc(rated_title)], dtype=float)
        scores += sims * rating
        weights += np.abs(sims)

    # Exclude already-rated titles once, instead of scanning the user's titles
    # for every (rated book, candidate) pair.
    already_rated = titles.isin(user_ratings['Book-Title'])
    candidates = np.flatnonzero((weights > 0) & ~already_rated)
    predicted = scores[candidates] / weights[candidates]
    best = candidates[np.argsort(-predicted, kind='stable')][:k]
    return [titles[i] for i in best]


In [None]:
def precision_recall_at_k(model_func, train_df, test_df, k=5, n_users=200):
    """Evaluate a per-user recommender on the first `n_users` users of `test_df`.

    `model_func(user, train_df, k)` must return a list of titles. Users with no
    held-out titles are skipped.

    Returns
    -------
    (precision, recall, coverage, novelty) where
      precision : mean over users of hits / k
      recall    : mean over users of hits / number of held-out titles
      coverage  : distinct recommended titles / number of rows in `pt`
      novelty   : -log1p(mean rating count, in `final_ratings`, of the distinct
                  recommended titles); less popular items give a higher value
    Precision and recall are 0 when no user was evaluated.
    """
    per_user_precision = []
    per_user_recall = []
    recommended_items = set()

    for uid in test_df['User-ID'].unique()[:n_users]:
        relevant = get_user_ground_truth(test_df, uid)
        if not relevant:
            continue
        suggested = model_func(uid, train_df, k)
        recommended_items.update(suggested)
        n_hits = len(relevant.intersection(suggested))
        per_user_precision.append(n_hits / k)
        per_user_recall.append(n_hits / len(relevant))

    if per_user_precision:
        precision = np.mean(per_user_precision)
    else:
        precision = 0
    if per_user_recall:
        recall = np.mean(per_user_recall)
    else:
        recall = 0

    coverage = len(recommended_items) / len(pt.index)

    # Popularity = number of rows per title in final_ratings (0 for unseen titles)
    pop_count = final_ratings['Book-Title'].value_counts().to_dict()
    if recommended_items:
        avg_pop = np.mean([pop_count.get(title, 0) for title in recommended_items])
    else:
        avg_pop = 0
    novelty = -np.log1p(avg_pop)

    return precision, recall, coverage, novelty


In [None]:
# Train/test split on final_ratings for evaluation (seeded, 80/20)
train_df, test_df = train_test_split(final_ratings, test_size=0.2, random_state=42)

print('Evaluating Item-CF (top-5)...')
item_cf_metrics = precision_recall_at_k(recommend_for_user_itemcf, train_df, test_df, k=5, n_users=200)
prec_item, rec_item, cov_item, nov_item = item_cf_metrics

# Print every metric rounded to 4 decimals, each preceded by its label
item_cf_report = []
for label, value in zip(('Item-CF -> Precision@5:', 'Recall@5:', 'Coverage:', 'Novelty:'), item_cf_metrics):
    item_cf_report += [label, round(value, 4)]
print(*item_cf_report)


In [None]:
# Adapter so the content-based recommender can be scored per user
def model_content_wrapper(user, train_df, k=5):
    """Return up to `k` content-based titles for `user`.

    The first title on the user's rows in `train_df` is used as the single
    query for `recommend_content`. Returns [] when the user has no rows.
    """
    titles = train_df.loc[train_df['User-ID'] == user, 'Book-Title']
    if titles.empty:
        return []
    seed_title = titles.iloc[0]
    return [rec[0] for rec in recommend_content(seed_title, topk=k)]

print('Evaluating Content-Based (top-5)...')
content_metrics = precision_recall_at_k(model_content_wrapper, train_df, test_df, k=5, n_users=200)
prec_cb, rec_cb, cov_cb, nov_cb = content_metrics

# Print every metric rounded to 4 decimals, each preceded by its label
content_report = []
for label, value in zip(('Content -> Precision@5:', 'Recall@5:', 'Coverage:', 'Novelty:'), content_metrics):
    content_report += [label, round(value, 4)]
print(*content_report)


In [None]:
# Adapter so the hybrid recommender can be scored per user
def model_hybrid_wrapper(user, train_df, k=5):
    """Return up to `k` hybrid-recommender titles for `user`.

    The first title on the user's rows in `train_df` is used as the single
    query for `recommend_hybrid` (alpha fixed at 0.6). Returns [] when the
    user has no rows.
    """
    titles = train_df.loc[train_df['User-ID'] == user, 'Book-Title']
    if titles.empty:
        return []
    seed_title = titles.iloc[0]
    return [rec[0] for rec in recommend_hybrid(seed_title, topk=k, alpha=0.6)]

print('Evaluating Hybrid (top-5)...')
hybrid_metrics = precision_recall_at_k(model_hybrid_wrapper, train_df, test_df, k=5, n_users=200)
prec_h, rec_h, cov_h, nov_h = hybrid_metrics

# Print every metric rounded to 4 decimals, each preceded by its label
hybrid_report = []
for label, value in zip(('Hybrid -> Precision@5:', 'Recall@5:', 'Coverage:', 'Novelty:'), hybrid_metrics):
    hybrid_report += [label, round(value, 4)]
print(*hybrid_report)


In [None]:
# Plot Precision@5 and Recall@5 comparison (grouped bars, one group per method)
methods = ['Item-CF', 'Content', 'Hybrid']
precisions = [prec_item, prec_cb, prec_h]
recalls = [rec_item, rec_cb, rec_h]

x = np.arange(len(methods))  # group centres on the x axis
width = 0.35                 # width of a single bar
half_width = width / 2

fig, ax = plt.subplots(figsize=(8, 5))
rects1 = ax.bar(x - half_width, precisions, width, label='Precision@5')
rects2 = ax.bar(x + half_width, recalls, width, label='Recall@5')

ax.set_ylabel('Score')
ax.set_title('Comparison of Precision@5 and Recall@5')
ax.set_xticks(x)
ax.set_xticklabels(methods)
ax.legend()

# Write each bar's value (3 decimals) just above its top edge
for bar in (*rects1, *rects2):
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f'{bar_top:.3f}',
        xy=(bar_centre, bar_top),
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )
plt.show()

In [None]:
# Save key artifacts for Flask
# Each file is written inside a `with` block so the handle is flushed and
# closed deterministically; a bare open() passed straight to pickle.dump
# is never explicitly closed.
artifacts = {
    'pt.pkl': pt,
    'books.pkl': books,
    'item_similarity.pkl': item_similarity,
    'content_similarity.pkl': content_similarity,
    'tfidf.pkl': tfidf,
    'books_unique.pkl': books_unique,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)
print('Saved pickles: pt.pkl, books.pkl, item_similarity.pkl, content_similarity.pkl, tfidf.pkl, books_unique.pkl')

In [None]:
# Summary
# Precision@5 per method, rounded to 4 decimals, printed on one line
precision_by_method = (('Item-CF', prec_item), ('Content', prec_cb), ('Hybrid', prec_h))
precision_line = []
for method_name, method_precision in precision_by_method:
    precision_line += [method_name, round(method_precision, 4)]
print('Precision@5:', *precision_line)

for closing_note in (
    'RMSE: see SVD/NMF outputs earlier',
    'Choose Hybrid if it balances precision and coverage; choose SVD/NMF for rating prediction accuracy (RMSE).',
):
    print(closing_note)