<a href="https://colab.research.google.com/github/suhailamohammed/book-recommender-backend/blob/main/Book_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Getting started on Recommendation System

In [None]:
import pandas as pd

In [None]:
file_path = 'my_books.csv'
df = pd.read_csv(file_path)

print(df.head())

                 title          author                genres  \
0           The Hobbit  J.R.R. Tolkien    Fantasy, Adventure   
1  Pride and Prejudice     Jane Austen      Romance, Classic   
2                 Dune   Frank Herbert     Sci-Fi, Political   
3          The Martian       Andy Weir      Sci-Fi, Survival   
4                 1984   George Orwell  Dystopian, Political   

                         keywords  \
0         dragons, quest, journey   
1        marriage, society, witty   
2        desert, prophecy, empire   
3         Mars, stranded, science   
4  surveillance, control, freedom   

                                         description  rating  
0  A hobbit embarks on a dangerous adventure with...       5  
1  A love story in 19th-century England exploring...       4  
2  A noble family gets caught in a galactic battl...       5  
3  An astronaut is stranded on Mars and uses scie...       4  
4  A bleak future where a totalitarian regime mon...       3  


In [None]:
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   title                        10 non-null     object
 1   author                       10 non-null     object
 2   genres                       10 non-null     object
 3   keywords                     10 non-null     object
 4   description                  10 non-null     object
 5   rating                       10 non-null     int64 
 6   combined_features            10 non-null     object
 7   combined_features_weighted1  10 non-null     object
 8   combined_features_weighted2  10 non-null     object
dtypes: int64(1), object(8)
memory usage: 852.0+ bytes
None
          rating
count  10.000000
mean    4.100000
std     0.737865
min     3.000000
25%     4.000000
50%     4.000000
75%     4.750000
max     5.000000


In [None]:
df.isnull().sum()
# df_cleaned = df.dropna()

# print("\nDataFrame after removing null values:")
# print(df_cleaned.head())
# print(df_cleaned.info())
# print(df_cleaned.describe())

Unnamed: 0,0
title,0
author,0
genres,0
keywords,0
description,0
rating,0


In [None]:
df['combined_features'] = (
    df['genres'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))+ ' ' +
    df['keywords'].apply(lambda x: ' '.join(x) if isinstance(x,list) else str(x)) + '' +
    df['description'].fillna(''))
print(df['combined_features'].head())

0    Fantasy, Adventure dragons, quest, journeyA ho...
1    Romance, Classic marriage, society, wittyA lov...
2    Sci-Fi, Political desert, prophecy, empireA no...
3    Sci-Fi, Survival Mars, stranded, scienceAn ast...
4    Dystopian, Political surveillance, control, fr...
Name: combined_features, dtype: object


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_features'])


tfidf_matrix.shape
vectorizer.get_feature_names_out()


array(['19th', 'adventure', 'age', 'american', 'astronaut', 'attends',
       'battle', 'bleak', 'boy', 'caught', 'century', 'citizen', 'class',
       'classic', 'coming', 'control', 'dangerous', 'desert', 'destiny',
       'discover', 'discovers', 'dragons', 'dreams', 'dwarves',
       'dynamics', 'dystopian', 'embarks', 'empirea', 'england',
       'exploring', 'faces', 'family', 'fantasy', 'fi', 'fights',
       'freedoma', 'friendshipa', 'future', 'galactic', 'gamea', 'gets',
       'girl', 'goes', 'hobbit', 'injustice', 'innocencea', 'journey',
       'journeya', 'justice', 'legend', 'love', 'magic', 'magical',
       'marriage', 'mars', 'monitors', 'murder', 'mystery', 'noble',
       'personal', 'philosophy', 'planet', 'political', 'powerful',
       'prophecy', 'psychological', 'quest', 'racial', 'racism',
       'rebellion', 'reclaim', 'refuses', 'regime', 'romance', 'school',
       'sci', 'science', 'sciencean', 'shepherd', 'silence', 'social',
       'society', 'south', 's

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        0.05832334, 0.        , 0.19098788, 0.        , 0.        ],
       [0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.05469193, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        , 0.09395813, 0.05609189,
        0.        , 0.        , 0.        , 0.        , 0.05281596],
       [0.        , 0.        , 0.09395813, 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.10912123],
       [0.        , 0.        , 0.05609189, 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.13028817],
       [0.05832334, 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.05968055],
       [0.        , 0.05469193, 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ],
       [0.19098788, 0.        , 0.       

In [None]:
def get_book_recommendations(title, top_n):
  idx = df[df['title'] == title].index.item()
  sim_scores = list(enumerate(cosine_sim[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[1:top_n+1]
  book_indices = [i[0] for i in sim_scores]

  return df['title'].iloc[book_indices]

print(get_book_recommendations('Dune',5))

3            The Martian
4                   1984
9       The Hunger Games
0             The Hobbit
1    Pride and Prejudice
Name: title, dtype: object


In [None]:
from sklearn.metrics.pairwise import linear_kernel

def get_book_recommendations_genre(genre, top_n):
  #first we need to convert the genre into a vector
  genre_vector = vectorizer.transform([genre])

  #we will then find the similarity between the genre vector and book vectors.
  #we will be using linear kernel which find the do product between GV and BV
  # we could had used cosine sim but linear kernel is faster and lighter
  genre_sim_scores = linear_kernel(genre_vector, tfidf_matrix).flatten()

  top_n_recom_index = genre_sim_scores.argsort()[-top_n:][::-1]

  return df['title'].iloc[top_n_recom_index]

genre_vector = get_book_recommendations_genre('Sci fi',5)
print(genre_vector)

3           The Martian
2                  Dune
8    The Silent Patient
9      The Hunger Games
7         The Alchemist
Name: title, dtype: object


In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack, csr_matrix

scaler = MinMaxScaler()
ratings_normalised = scaler.fit_transform(df[['rating']])

ratings_weighted = ratings_normalised * 100

# Combine tfidf matrix with ratings as an extra feature
tfidf_with_ratings = hstack([tfidf_matrix, csr_matrix(ratings_weighted)])


In [None]:
def get_book_recommendations_with_genre_ratings(genre, top_n):
    genre_vec = vectorizer.transform([genre])
    # Append zero rating for the genre query vector
    # Create sparse matrix for rating feature of the query vector (0 as neutral)
    rating_vec = csr_matrix(np.array([[0]]))  # shape (1,1)

    # Append rating feature to query vector
    genre_vec_with_rating = hstack([genre_vec, rating_vec])

    similarities = linear_kernel(genre_vec_with_rating, tfidf_with_ratings).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    return df.iloc[top_indices][['title', 'rating']]

print(get_book_recommendations_with_genre_ratings('Sci fi',5))

                title  rating
3         The Martian       4
2                Dune       5
8  The Silent Patient       3
9    The Hunger Games       4
7       The Alchemist       4


## Building Real Recommendation System

In [None]:
import numpy as np
import pandas as pd

### Fetch books from Goodreads

In [None]:
goodreads_file_path = 'goodreads_library_export.csv'

df_goodreads = pd.read_csv(goodreads_file_path)

# print(df_goodreads.head())
print(df_goodreads.columns)
# print(df_goodreads.info())

Index(['Book Id', 'Title', 'Author', 'Author l-f', 'Additional Authors',
       'ISBN', 'ISBN13', 'My Rating', 'Average Rating', 'Publisher', 'Binding',
       'Number of Pages', 'Year Published', 'Original Publication Year',
       'Date Read', 'Date Added', 'Bookshelves', 'Bookshelves with positions',
       'Exclusive Shelf', 'My Review', 'Spoiler', 'Private Notes',
       'Read Count', 'Owned Copies'],
      dtype='object')


In [None]:
df_goodreads.isnull().sum()

Unnamed: 0,0
Book Id,0
Title,0
Author,0
Author l-f,0
Additional Authors,584
ISBN,0
ISBN13,0
My Rating,0
Average Rating,0
Publisher,26


### Fetch books from Google API

In [None]:
import requests
import time

def fetch_many_books(query='novel', total=500):
    books = []
    max_per_request = 40

    for start in range(0, total, max_per_request):
        params = {
            'q': query,
            'startIndex': start,
            'maxResults': max_per_request,
            'orderBy': 'relevance',
            'printType': 'books'
        }
        response = requests.get("https://www.googleapis.com/books/v1/volumes", params=params)

        if response.status_code != 200:
            print(f"Failed at index {start}: {response.status_code}")
            break

        data = response.json()
        items = data.get('items', [])
        if not items:
            break

        for item in items:
            info = item.get('volumeInfo', {})
            books.append({
                'title': info.get('title', 'Unknown'),
                'authors': ', '.join(info.get('authors', [])),
                'description': info.get('description', ''),
                'average_rating': info.get('averageRating', 0),
                'categories': ', '.join(info.get('categories', [])) if 'categories' in info else 'Unknown'
            })

        time.sleep(0.2)

    return books

# Example: Fetch top 120 books
df_books = pd.DataFrame(fetch_many_books(query='novel'))
print(f"Fetched {len(df_books)} books.")

Fetched 320 books.


In [None]:
import requests

def search_books_openlibrary(query, limit):
    url = "https://openlibrary.org/search.json"
    params = {
        'q': query,
        'limit': limit
    }
    response = requests.get(url, params=params)
    data = response.json()

    books = []
    for doc in data.get('docs', []):
        # Title
        title = doc.get('title', 'Unknown')

        # Authors - a list of author names
        authors_list = doc.get('author_name', [])
        authors = ', '.join(authors_list) if authors_list else 'Unknown'

        # Description - Open Library Search API does NOT provide description here, need another call per work or edition
        description = ''  # Placeholder, could fetch separately if needed

        # Average rating - not available in Open Library API (no ratings)
        average_rating = 0  # Placeholder or None

        # Categories/subjects - subjects field can be used here
        categories_list = doc.get('subject', [])
        categories = ', '.join(categories_list) if categories_list else 'Unknown'

        books.append({
            'title': title,
            'authors': authors,
            'description': description,
            'average_rating': average_rating,
            'categories': categories
        })

    return books

# Example usage:
df_books = pd.DataFrame(search_books_openlibrary("mystery", limit=500))

print(len(df_books), df_books.head())

500                              title                                  authors  \
0              A Caribbean Mystery                Agatha Christie, Christie   
1  The Mysterious Affair at Styles  Agatha Christie, Aric Cushing, Bookstar   
2    The Mystery of the Blue Train                          Agatha Christie   
3            Murder in Mesopotamia                          Agatha Christie   
4                Death on the Nile                          Agatha Christie   

  description  average_rating categories  
0                           0    Unknown  
1                           0    Unknown  
2                           0    Unknown  
3                           0    Unknown  
4                           0    Unknown  


### Compare the results

In [None]:
print('goodreads column: ', df_goodreads.columns)
print('google api books:', df_books.columns)

goodreads column:  Index(['Book Id', 'title', 'Author', 'Author l-f', 'Additional Authors',
       'ISBN', 'ISBN13', 'my_rating', 'Average Rating', 'Publisher', 'Binding',
       'Number of Pages', 'Year Published', 'Original Publication Year',
       'Date Read', 'Date Added', 'Bookshelves', 'Bookshelves with positions',
       'Exclusive Shelf', 'My Review', 'Spoiler', 'Private Notes',
       'Read Count', 'Owned Copies'],
      dtype='object')
google api books: Index(['title', 'authors', 'description', 'average_rating', 'categories'], dtype='object')


### Preprocess datasets

In [None]:
df_goodreads = df_goodreads.rename(columns={'Title': 'title', 'My Rating': 'my_rating'})

df_goodreads['title'] = df_goodreads['title'].str.lower().str.strip()
df_books['title'] = df_books['title'].str.lower().str.strip()

In [None]:
import string
import re

def clean_title(title):
    if isinstance(title, str):
        # Remove parentheses and their content
        title = re.sub(r'\s*\([^)]*\)', '', title)
        # Lowercase and strip whitespace
        title = title.lower().strip()
        # Remove punctuation
        title = title.translate(str.maketrans('', '', string.punctuation))
        return title
    return title

df_goodreads['title'] = df_goodreads['title'].apply(clean_title)
df_books['title'] = df_books['title'].apply(clean_title)


In [None]:
print(df_goodreads.columns)
print(df_books.columns)

Index(['Book Id', 'title', 'Author', 'Author l-f', 'Additional Authors',
       'ISBN', 'ISBN13', 'my_rating', 'Average Rating', 'Publisher', 'Binding',
       'Number of Pages', 'Year Published', 'Original Publication Year',
       'Date Read', 'Date Added', 'Bookshelves', 'Bookshelves with positions',
       'Exclusive Shelf', 'My Review', 'Spoiler', 'Private Notes',
       'Read Count', 'Owned Copies', 'matches'],
      dtype='object')
Index(['title', 'authors', 'description', 'average_rating', 'categories'], dtype='object')


In [None]:
import pandas as pd
from fuzzywuzzy import process, fuzz

# --- Sample setup (you can skip this if df_goodreads and df_books already exist) ---
# Just a test example to prove it works
# df_goodreads = pd.DataFrame({'title': ['The Hobbit', 'Harry Potter', 'Pride and Prejudice']})
# df_books = pd.DataFrame({'title': ['harry potter and the philosopher\'s stone', 'the hobbit', 'pride & prejudice']})

# --- Make sure both dataframes have the 'title' column ---
if 'title' not in df_goodreads.columns or 'title' not in df_books.columns:
    raise ValueError("Both df_goodreads and df_books must have a 'title' column.")

# --- Lowercase and clean titles ---
df_goodreads = df_goodreads.copy()
df_books = df_books.copy()

df_goodreads['title'] = df_goodreads['title'].astype(str).str.lower().str.strip()
df_books['title'] = df_books['title'].astype(str).str.lower().str.strip()

# --- Apply fuzzy matching ---
def get_best_match(title, choices, threshold=75):
    match = process.extractOne(title, choices, scorer=fuzz.ratio)
    if match and match[1] >= threshold:
        return match[0]  # return matched title from df_books
    return None

# Get all book titles from df_books
book_titles = df_books['title'].dropna().unique().tolist()

# Match each goodreads title to best matching book title
df_goodreads['matched_title'] = df_goodreads['title'].apply(lambda x: get_best_match(x, book_titles))

# --- Merge matched results ---
df_global = pd.merge(
    df_goodreads,
    df_books,
    left_on='matched_title',
    right_on='title',
    how='outer',
    suffixes=('_goodreads', '_books')
)

# --- Prefer Goodreads title where available ---
df_global['title'] = df_global['title_goodreads'].combine_first(df_global['title_books'])

# --- Drop intermediate columns ---
df_global.drop(columns=['title_goodreads', 'title_books', 'matched_title'], inplace=True)

# --- Optional: drop duplicates ---
df_global.drop_duplicates(subset=['title'], inplace=True)

# --- Result ---
print(df_global.head())
print(df_global.info())


   Book Id Author Author l-f Additional Authors ISBN ISBN13  my_rating  \
0      NaN    NaN        NaN                NaN  NaN    NaN        NaN   
1      NaN    NaN        NaN                NaN  NaN    NaN        NaN   
2      NaN    NaN        NaN                NaN  NaN    NaN        NaN   
3      NaN    NaN        NaN                NaN  NaN    NaN        NaN   
4      NaN    NaN        NaN                NaN  NaN    NaN        NaN   

   Average Rating Publisher Binding  ...  Private Notes  Read Count  \
0             NaN       NaN     NaN  ...            NaN         NaN   
1             NaN       NaN     NaN  ...            NaN         NaN   
2             NaN       NaN     NaN  ...            NaN         NaN   
3             NaN       NaN     NaN  ...            NaN         NaN   
4             NaN       NaN     NaN  ...            NaN         NaN   

   Owned Copies matches match_score                        authors  \
0           NaN     NaN         NaN                       

In [None]:
# df_global = pd.merge(df_goodreads, df_books, on='title', how='outer')
print(df_global.columns)

Index(['Book Id', 'Author', 'Author l-f', 'Additional Authors', 'ISBN',
       'ISBN13', 'my_rating', 'Average Rating', 'Publisher', 'Binding',
       'Number of Pages', 'Year Published', 'Original Publication Year',
       'Date Read', 'Date Added', 'Bookshelves', 'Bookshelves with positions',
       'Exclusive Shelf', 'My Review', 'Spoiler', 'Private Notes',
       'Read Count', 'Owned Copies', 'matches', 'match_score', 'authors',
       'description', 'average_rating', 'categories', 'title'],
      dtype='object')


In [None]:
# Combine texts
df_global['combined_text'] = (
    df_global['title'].fillna('') + ' ' +
    df_global['authors'].fillna('') + ' ' +
    df_global['description'].fillna('') + ' ' +
    df_global['categories'].fillna('') + ' ' +
    df_global['My Review'].fillna('') + ' ' +
    df_global['Bookshelves'].fillna('') + ' ' +
    df_global['Publisher'].fillna('') + ' ' +
    df_global['Year Published'].fillna('').astype(str)
)

print(df_global.head())

   Book Id Author Author l-f Additional Authors ISBN ISBN13  my_rating  \
0      NaN    NaN        NaN                NaN  NaN    NaN        NaN   
1      NaN    NaN        NaN                NaN  NaN    NaN        NaN   
2      NaN    NaN        NaN                NaN  NaN    NaN        NaN   
3      NaN    NaN        NaN                NaN  NaN    NaN        NaN   
4      NaN    NaN        NaN                NaN  NaN    NaN        NaN   

   Average Rating Publisher Binding  ...  Read Count  Owned Copies  matches  \
0             NaN       NaN     NaN  ...         NaN           NaN      NaN   
1             NaN       NaN     NaN  ...         NaN           NaN      NaN   
2             NaN       NaN     NaN  ...         NaN           NaN      NaN   
3             NaN       NaN     NaN  ...         NaN           NaN      NaN   
4             NaN       NaN     NaN  ...         NaN           NaN      NaN   

  match_score                        authors description average_rating  \
0    

In [None]:
# Scale the numerical ratings

from sklearn.preprocessing import MinMaxScaler

df_global[['my_rating', 'Average Rating', 'Number of Pages', 'Read Count']] = \
    df_global[['my_rating', 'Average Rating', 'Number of Pages', 'Read Count']].fillna(0)

# Normalize numeric features between 0 and 1
scaler = MinMaxScaler()
numeric_features = df_global[['my_rating', 'Average Rating', 'Number of Pages', 'Read Count']]
numeric_features_scaled = scaler.fit_transform(numeric_features)

### Vectorize textual Data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df_global['combined_text'])

### Combine text and numerical data

In [None]:
from scipy.sparse import hstack

combined_features = hstack([tfidf_matrix, numeric_features_scaled]).tocsr()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(combined_features, combined_features)

In [None]:
print(cosine_sim)

[[1.         0.02775842 0.02162331 ... 0.         0.         0.        ]
 [0.02775842 1.         0.24266653 ... 0.         0.         0.        ]
 [0.02162331 0.24266653 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.43542354 0.42187474]
 [0.         0.         0.         ... 0.43542354 1.         0.43410072]
 [0.         0.         0.         ... 0.42187474 0.43410072 1.        ]]


In [None]:
# Assume df_global has a column 'title' and 'Exclusive Shelf'

# Step 1: Get all books you have on any shelf: 'read', 'to-read', 'currently-reading'
my_shelves = ['read', 'to-read', 'currently-reading', 'best-of-best']
my_books_df = df_global[df_global['Exclusive Shelf'].isin(my_shelves)]


# Step 2: Reset df_global index so it aligns with combined_features matrix
df_global = df_global.reset_index(drop=True)
my_books = my_books.reset_index(drop=True)

# Step 3: Compute similarity as before
my_indices = my_books_df.index
user_sim = cosine_similarity(combined_features[my_indices], combined_features)
mean_sim = user_sim.mean(axis=0)

# Sort indices by similarity (highest first)
sorted_indices = np.argsort(mean_sim)[::-1]

# Exclude already interacted books
filtered_indices = [idx for idx in sorted_indices if df_global.at[idx, 'title'].lower() not in my_titles_set]

# Shuffle the top similar ones to get variation
np.random.seed()  # ensures different shuffle each time
shuffled_indices = np.random.permutation(filtered_indices[:150])  # buffer top 150 to randomize

# Select top N from shuffled
final_recommendations = shuffled_indices[:50]


# Step 5: Display recommended books
recommended_books = df_global.loc[final_recommendations][['title', 'authors', 'Average Rating', 'categories', 'Exclusive Shelf']]
print(recommended_books)

                                                 title  \
408                                  the pelican brief   
98                                       endless night   
455             the virgin suicides bloomsbury classic   
272                        tales of terror and mystery   
106                                   five little pigs   
17                                a pocket full of rye   
324                                    the dead secret   
350                the house with a clock in its walls   
212                            nightmares  dreamscapes   
120                                          gone girl   
349                        the house on the borderland   
76                                          dead souls   
397                      the mystery of the blue train   
24                                         alias grace   
211                                        night shift   
142                       house of a thousand lanterns   
245           

In [None]:
np.save('combined_features.npy', combined_features.toarray())  # Save dense matrix (small datasets)

In [None]:
# Save df_global as CSV file
df_global.to_csv('global_book_dataset.csv', index=False)