# Book Recommender Project

In [1]:
#Dataset Source: https://www.kaggle.com/datasets/somnambwl/bookcrossing-dataset

In [2]:
import pandas as pd

### 1. Read and clean data

In [3]:
# Read Users.csv with 'User-ID' as string
users = pd.read_csv(
    'data/Users.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
    dtype={'User-ID': str},
    low_memory=False)

In [4]:
users.head()

Unnamed: 0,User-ID,Age
0,1,
1,2,18.0
2,3,
3,4,17.0
4,5,


In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278859 entries, 0 to 278858
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   User-ID  278859 non-null  object
 1   Age      168627 non-null  object
dtypes: object(2)
memory usage: 4.3+ MB


In [6]:
# Attempt to convert to integer; invalid ones will be turned into NaN
users['User-ID'] = pd.to_numeric(users['User-ID'], errors='coerce')

# Drop invalid rows and create a clean copy
users = users.dropna(subset=['User-ID']).copy()

# Now it's safe to convert the column type
users['User-ID'] = users['User-ID'].astype(int)

In [7]:
users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 278858 entries, 0 to 278858
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   User-ID  278858 non-null  int64 
 1   Age      168627 non-null  object
dtypes: int64(1), object(1)
memory usage: 6.4+ MB


In [8]:
# Load Ratings.csv
ratings = pd.read_csv(
    'data/Ratings.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip')

In [9]:
ratings

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [10]:
# Rename columns
ratings.columns = ['User-ID', 'ISBN', 'Book-Rating']

In [11]:
# Remove 0 ratings (considered implicit/no rating)
ratings_cleaned = ratings[ratings['Book-Rating'] > 0].copy()

In [12]:
ratings_cleaned

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6
...,...,...,...
1149773,276704,0806917695,5
1149775,276704,1563526298,9
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [13]:
# Load Books.csv
books = pd.read_csv(
    'data/Books.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip')

In [14]:
books

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company
...,...,...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271375,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271377,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [15]:
# Rename columns
books.columns = ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']

In [16]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271379 non-null  object
 1   Book-Title           271379 non-null  object
 2   Book-Author          271377 non-null  object
 3   Year-Of-Publication  271379 non-null  int64 
 4   Publisher            271377 non-null  object
dtypes: int64(1), object(4)
memory usage: 10.4+ MB


In [17]:
# Check publication year distribution
print("📈 Year distribution:")
print(books['Year-Of-Publication'].value_counts().sort_index().tail(10))

📈 Year distribution:
Year-Of-Publication
2011    2
2012    1
2020    3
2021    1
2024    1
2026    1
2030    7
2037    1
2038    1
2050    2
Name: count, dtype: int64


In [18]:
# Remove invalid publication years
books = books[books['Year-Of-Publication'].apply(lambda x: str(x).isdigit())]
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(int)
books = books[(books['Year-Of-Publication'] > 1900) & (books['Year-Of-Publication'] <= 2025)]

In [19]:
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company
...,...,...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271375,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271377,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


### 2. Content based filtering

In [20]:
# Sample a subset of books to avoid memory issues
books_sample = books.sample(n=30000, random_state=22).reset_index(drop=True)

# Create text_features as before
books_sample['text_features'] = (
    books_sample['Book-Title'].fillna('') + ' ' +
    books_sample['Book-Author'].fillna('') + ' ' +
    books_sample['Publisher'].fillna(''))

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel  

In [22]:
# TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_sample['text_features'])

# Cosine similarity
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Reverse index
book_indices = pd.Series(books_sample.index, index=books_sample['Book-Title'].str.lower()).drop_duplicates()

In [23]:
# Recommendation function
def get_recommendations(title, top_n=5):
    title = title.lower()
    if title not in book_indices:
        print("❌ Book title not found.")
        return []
    
    idx = book_indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    book_indices_list = [i[0] for i in sim_scores]
    
    return books_sample.iloc[book_indices_list][['Book-Title', 'Book-Author', 'Publisher']]

In [24]:
# Get Recommendation by Book Title
get_recommendations("Carrie")

Unnamed: 0,Book-Title,Book-Author,Publisher
21057,Shining,Stephen King,Signet Book
16621,Cujo,Stephen King,Signet Book
16639,Thinner,Stephen King,Signet Book
11421,Night Shift,Stephen King,Signet Book
10022,Stephen King's Danse Macabre,Stephen King,Berkley Publishing Group


### 3. Item-Item Collaborative Filtering (Cosine Similarity)

In [25]:
# Keep only users who rated at least 10 books
active_users = ratings_cleaned['User-ID'].value_counts()
active_users = active_users[active_users >= 10].index
ratings_filtered = ratings_cleaned[ratings_cleaned['User-ID'].isin(active_users)]

# Keep only books that were rated at least 10 times
popular_books = ratings_filtered['ISBN'].value_counts()
popular_books = popular_books[popular_books >= 10].index
ratings_filtered = ratings_filtered[ratings_filtered['ISBN'].isin(popular_books)]

In [26]:
from scipy.sparse import csr_matrix

# Pivot table (User x Book) and use csr_matrix for less memory usage
rating_matrix = ratings_filtered.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')
rating_matrix = rating_matrix.fillna(0)
sparse_rating_matrix = csr_matrix(rating_matrix.values)

print(f"✅ Rating matrix shape: {rating_matrix.shape}")

✅ Rating matrix shape: (6570, 3411)


In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Transpose: books as rows, users as columns
book_user_matrix = rating_matrix.T

# Cosine similarity between books
book_similarity = cosine_similarity(book_user_matrix, dense_output=False)

# Create DataFrame for easier lookup
book_similarity_df = pd.DataFrame(book_similarity, index=rating_matrix.columns, columns=rating_matrix.columns)

In [29]:
book_similarity_df.head()

ISBN,000649840X,0006514855,0007110928,0007154615,0020198906,0020199600,0020427859,0020442009,0020442203,0020442602,...,3551551685,3596259924,3746614007,8408043641,8432206407,8445071416,8472236552,8478886451,8495501090,8495501198
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000649840X,1.0,0.062177,0.055386,0.059027,0.0,0.0,0.0,0.0,0.044748,0.059395,...,0.0,0.076711,0.0,0.037828,0.0,0.0,0.056752,0.0,0.0,0.0
0006514855,0.062177,1.0,0.227269,0.079653,0.0,0.0,0.116592,0.0,0.0,0.0,...,0.0,0.103517,0.0,0.051046,0.0,0.0,0.076583,0.0,0.0,0.0
0007110928,0.055386,0.227269,1.0,0.070953,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.09221,0.0,0.045471,0.0,0.0,0.068218,0.0,0.0,0.0
0007154615,0.059027,0.079653,0.070953,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.098272,0.0,0.04846,0.0,0.0,0.072703,0.0,0.0,0.0
0020198906,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Map ISBN → book title
isbn_to_title = books.set_index('ISBN')['Book-Title'].to_dict()

In [31]:
def get_similar_books(isbn, top_n=5):
    isbn = str(isbn).strip()
    
    if isbn not in book_similarity_df.columns:
        return {"Query Title": f"❌ ISBN not found: {isbn}", "Recommendations": []}
    
    similar_scores = book_similarity_df[isbn].sort_values(ascending=False)[1:top_n+1]
    result = [(isbn_to_title.get(i, "Unknown Title"), score) for i, score in similar_scores.items()]
    
    return {"Query Title": isbn_to_title.get(isbn, "Unknown Title"), "Recommendations": result}

In [32]:
# Get recommendations by ISBN
res = get_similar_books('0385504209')  # The Da Vinci Code

In [33]:
print(f"\n📚 Similar books to '{res['Query Title']}'\n")
for title, score in res["Recommendations"]:
    print(f"• {title}")


📚 Similar books to 'The Da Vinci Code'

• Angels & Demons
• The Lovely Bones: A Novel
• Middlesex: A Novel
• Street Dreams
• The Face


### 4. User-User Collaborative Filtering

In [34]:
user_similarity_sparse = cosine_similarity(sparse_rating_matrix, dense_output=False)

# Convert to DataFrame for easy access
user_similarity_df = pd.DataFrame.sparse.from_spmatrix(
    user_similarity_sparse,
    index=rating_matrix.index,
    columns=rating_matrix.index
)

print(f"✅ User-user similarity matrix shape: {user_similarity_df.shape}")

✅ User-user similarity matrix shape: (6570, 6570)


In [35]:
def recommend_books_for_user(user_id, top_similar_users=5, top_n_books=5):
    if user_id not in user_similarity_df.index:
        print(f"❌ User-ID {user_id} not found.")
        return []

    # Get most similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:top_similar_users+1].index

    # Ratings from similar users
    similar_users_ratings = rating_matrix.loc[similar_users]

    # Average their ratings
    mean_ratings = similar_users_ratings.mean(axis=0)

    # Remove books already rated by the current user
    books_already_rated = rating_matrix.loc[user_id]
    books_already_rated = books_already_rated[books_already_rated > 0].index
    mean_ratings = mean_ratings.drop(index=books_already_rated, errors='ignore')

    # Top recommended books
    top_books = mean_ratings.sort_values(ascending=False).head(top_n_books).index

    # Return titles
    return books[books['ISBN'].isin(top_books)][['Book-Title', 'Book-Author', 'Publisher']]

In [36]:
# look for a sample
rating_matrix.index[:10]

Index([242, 243, 254, 388, 446, 503, 505, 507, 638, 643], dtype='int64', name='User-ID')

In [37]:
# Get recommendation by User ID
recommend_books_for_user(user_id=242) 

Unnamed: 0,Book-Title,Book-Author,Publisher
2230,Fahrenheit 451,RAY BRADBURY,Del Rey
3354,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,Del Rey
5233,Brave New World,Aldous Huxley,Perennial
10327,A Wind in the Door,Madeleine L'Engle,Yearling Books


### 📘 5. Recommender Systems Project – Book-Crossing Dataset

## 📌 Objective
The goal of this project was to implement different types of recommender systems using the Book-Crossing dataset, which includes user ratings and metadata for books.

---

## 📂 Dataset Overview
- **Users.csv**: Contains `User-ID`, location, and `Age`
- **Books.csv**: Contains book `ISBN`, `Title`, `Author`, `Year`, and `Publisher`
- **Ratings.csv**: Contains `User-ID`, `ISBN`, and `Book-Rating`

Only explicit ratings (1–10) were kept for collaborative filtering. Implicit ratings (value = 0) were removed.

---

## 🧹 Preprocessing Highlights
- Cleaned invalid or missing values
- Converted `User-ID` safely to integer
- Filtered to include only:
  - Users with at least 10 ratings
  - Books with at least 10 ratings

---

## 🎭 Content-Based Filtering

- Combined book metadata (`Title`, `Author`, `Publisher`) into a single text feature
- Applied **TF-IDF Vectorization**
- Used **Cosine Similarity** to find similar books
- Created a function `get_recommendations(title)` to return top 5 similar books

**Note**: Due to memory limitations, this was done on a random 30,000 book sample.

---

## 🤝 Collaborative Filtering (Item-Item)

- Built a **pivot table** with `User-ID` × `ISBN`
- Used **Cosine Similarity** on the transposed matrix (books as rows)
- Created `get_similar_books(isbn)` to return similar books based on other users’ ratings

---

## 🧑‍🤝‍🧑 Collaborative Filtering (User-User)

- Calculated similarity between users using **Cosine Similarity**
- For a given user, found top similar users
- Aggregated ratings from similar users to recommend new books
- Created `recommend_books_for_user(user_id)` function

Recommendations exclude books already rated by the user.

---

## ✅ Summary
| Method                    | Description                            | Strengths                             |
|--------------------------|----------------------------------------|----------------------------------------|
| Content-Based Filtering  | Similar books by metadata              | Works without ratings, fast            |
| Item-Item CF             | Similar books based on user ratings    | Accurate when item similarities matter |
| User-User CF             | Recommends based on similar users      | Good personalization                   |