In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**libraries**

In [2]:
!pip install fireducks

Collecting fireducks
  Downloading fireducks-1.1.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (961 bytes)
Collecting firefw==1.1.5 (from fireducks)
  Downloading firefw-1.1.5-py3-none-any.whl.metadata (818 bytes)
Collecting pyarrow<18.2,>=18.1 (from fireducks)
  Downloading pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading fireducks-1.1.5-cp310-cp310-manylinux_2_28_x86_64.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading firefw-1.1.5-py3-none-any.whl (12 kB)
Downloading pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, firefw, fireducks
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 17.0.0
    Uninstalling pyarrow-17.0.0:
      Successfully uninstalled 

In [3]:
#data stuff
#import fireducks.pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#ml models and similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

#save models for deployment
import pickle

**loading and inspecting data**

In [4]:
books = pd.read_csv('/content/drive/MyDrive/ML datasets/bookrec/Books.csv')
ratings = pd.read_csv('/content/drive/MyDrive/ML datasets/bookrec/Ratings.csv')
users = pd.read_csv('/content/drive/MyDrive/ML datasets/bookrec/Users.csv')

#first 5 rows
print("\n\nbooks:\n",books.head())
print("\n\nratings:\n",ratings.head())
print("\n\nusers:\n",users.head())

  books = pd.read_csv('/content/drive/MyDrive/ML datasets/bookrec/Books.csv')




books:
          ISBN                                         Book-Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

            Book-Author Year-Of-Publication                   Publisher  \
0    Mark P. O. Morford                2002     Oxford University Press   
1  Richard Bruce Wright                2001       HarperFlamingo Canada   
2          Carlo D'Este                1991             HarperPerennial   
3      Gina Bari Kolata                1999        Farrar Straus Giroux   
4       E. J. W. Barber                1999  W. W. Norton &amp; Company   

                                         Image-URL-S  \
0  http://images.amazon.com/images/P/0195153448.0...   
1  http://images.a

In [5]:
#shape of datasets
print("shapes:")
print("books: ", books.shape)
print("Ratings: ", ratings.shape)
print("Users: ", users.shape)

shapes:
books:  (271360, 8)
Ratings:  (1149780, 3)
Users:  (278858, 3)


**data cleaning**

In [6]:
#checking for missing values
print("missing values: ")
print(books.isnull().sum())
print(ratings.isnull().sum())
print(users.isnull().sum())

missing values: 
ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64
User-ID          0
Location         0
Age         110762
dtype: int64


In [7]:
#fill missing values in 'Age' with the mean age
users['Age'].fillna(users['Age'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  users['Age'].fillna(users['Age'].mean(), inplace=True)


In [8]:
#removing duplicates if any
books.drop_duplicates(inplace=True)
ratings.drop_duplicates(inplace=True)
users.drop_duplicates(inplace=True)

In [9]:
#verifying cleaning
print("\nCleaned Data:")
print(books.isnull().sum())
print(users.isnull().sum())
print(ratings.isnull().sum())


Cleaned Data:
ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64
User-ID     0
Location    0
Age         0
dtype: int64
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64


**merging datasets**

In [10]:
ratings_with_books = ratings.merge(books, on='ISBN', how='inner')
df = ratings_with_books.merge(users, on='User-ID', how='inner')
print(df.head())

   User-ID        ISBN  Book-Rating  \
0   276725  034545104X            0   
1   276726  0155061224            5   
2   276727  0446520802            0   
3   276729  052165615X            3   
4   276729  0521795028            6   

                                          Book-Title      Book-Author  \
0                               Flesh Tones: A Novel       M. J. Rose   
1                                   Rites of Passage       Judith Rae   
2                                       The Notebook  Nicholas Sparks   
3                                     Help!: Level 1    Philip Prowse   
4  The Amsterdam Connection : Level 4 (Cambridge ...      Sue Leather   

  Year-Of-Publication                   Publisher  \
0                2002            Ballantine Books   
1                2001                      Heinle   
2                1996                Warner Books   
3                1999  Cambridge University Press   
4                2001  Cambridge University Press   

       

**popularity-based recommendation**

In [11]:
#counting number of ratings and calculate average rating
num_rating_df = df.groupby('Book-Title')['Book-Rating'].count().reset_index()
num_rating_df.rename(columns={'Book-Rating': 'num_ratings'}, inplace = True)

avg_rating_df = df.groupby('Book-Title')['Book-Rating'].mean().reset_index()
avg_rating_df.rename(columns={'Book-Rating': 'avg_rating'}, inplace=True)

#merge counts and averages
popular_df = num_rating_df.merge(avg_rating_df, on='Book-Title')

#filter books with at least 250 ratings and sort by average rating
popular_df = popular_df[popular_df['num_ratings'] >= 250].sort_values('avg_rating', ascending=False)

#add author and image details
popular_df = popular_df.merge(books, on='Book-Title')[['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_rating']]

#display top 10 popular books
print(popular_df.head(10))

                                          Book-Title    Book-Author  \
0  Harry Potter and the Prisoner of Azkaban (Book 3)  J. K. Rowling   
1  Harry Potter and the Prisoner of Azkaban (Book 3)  J. K. Rowling   
2  Harry Potter and the Prisoner of Azkaban (Book 3)  J. K. Rowling   
3       Harry Potter and the Goblet of Fire (Book 4)  J. K. Rowling   
4       Harry Potter and the Goblet of Fire (Book 4)  J. K. Rowling   
5     Harry Potter and the Sorcerer's Stone (Book 1)  J. K. Rowling   
6     Harry Potter and the Sorcerer's Stone (Book 1)  J. K. Rowling   
7     Harry Potter and the Sorcerer's Stone (Book 1)  J. K. Rowling   
8     Harry Potter and the Sorcerer's Stone (Book 1)  J. K. Rowling   
9  Harry Potter and the Order of the Phoenix (Boo...  J. K. Rowling   

                                         Image-URL-M  num_ratings  avg_rating  
0  http://images.amazon.com/images/P/0439136350.0...          428    5.852804  
1  http://images.amazon.com/images/P/0439136369.0...      

**collaborative filtering**

In [12]:
'''
#creating a pivot table for collaborative filtering
pt = df.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0)

#calculate cosine similarity
similarity_scores = cosine_similarity(pt)

#function to recomment similar books
def recommend(book_name):
  #fetch index of book
  index = np.where(pt.index == book_name)[0][0]

  #calculate similarity scores
  scores = list(enumerate(similarity_scores[index]))
  scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:6]#top 5 recommendations

  #return recommended books
  recommendations = []
  for i in scores:
    recommendations.append(pt.index[i[0]])
  return recommendations

#eg: get recommendations for '1984'
print(recommend('1984'))
'''

"\n#creating a pivot table for collaborative filtering\npt = df.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0)\n\n#calculate cosine similarity\nsimilarity_scores = cosine_similarity(pt)\n\n#function to recomment similar books\ndef recommend(book_name):\n  #fetch index of book\n  index = np.where(pt.index == book_name)[0][0]\n\n  #calculate similarity scores\n  scores = list(enumerate(similarity_scores[index]))\n  scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:6]#top 5 recommendations\n\n  #return recommended books\n  recommendations = []\n  for i in scores:\n    recommendations.append(pt.index[i[0]])\n  return recommendations\n\n#eg: get recommendations for '1984'\nprint(recommend('1984'))\n"

the above creates a pivot table where rows are books and columns are users, with ratings as values.

computes similarity between books using cosine similarity

finds the top 5 most similar books for a given title

the above didnt work due to RAM limitations

**optimized collaborative filtering:**

1. filtering active users and popular books to reduce matrix size
2. using sparse matrices for cosine similarity instead of dense matrices

In [13]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from fuzzywuzzy import process
import numpy as np

#(1)filter active Users and popular Books
#filter active users who have rated more than 200 books
active_users = ratings.groupby('User-ID')['Book-Rating'].count()
active_users = active_users[active_users > 200].index

#filter books with more than 50 ratings
popular_books = ratings.groupby('ISBN')['Book-Rating'].count()
popular_books = popular_books[popular_books > 50].index

#filter data
filtered_ratings = ratings[(ratings['User-ID'].isin(active_users)) &
                            (ratings['ISBN'].isin(popular_books))]

print(f"Filtered Ratings Shape: {filtered_ratings.shape}")

#(2)merge Ratings with Books to Map Titles to ISBNs
#merge books with ratings to get titles
filtered_ratings = filtered_ratings.merge(books, on='ISBN')
print(f"Filtered Ratings with Titles: {filtered_ratings.shape}")

#(3)create Pivot Table and Convert to Sparse Matrix
#pivot table for collaborative filtering
pt = filtered_ratings.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0)

print(f"Pivot Table Shape: {pt.shape}")

#convert to sparse matrix to save memory
sparse_pt = csr_matrix(pt)

#compute cosine similarity
similarity_scores = cosine_similarity(sparse_pt)
print(f"Similarity Scores Shape: {similarity_scores.shape}")

#(4)fuzzy Matching for Title Lookup
def find_closest_match(book_name):
    """Find the closest matching title in the pivot table."""
    matches = process.extractOne(book_name, pt.index)
    return matches[0] if matches else None

#(5)collaborative filtering recommendation function
def recommend_by_colab(book_name):
    """Recommend books based on collaborative filtering."""
    #handle fuzzy matching to find the closest match
    closest_match = find_closest_match(book_name)
    if closest_match is None:
        return ["Book not found in collaborative filtering dataset."]

    #fetch index of the matched book
    index = np.where(pt.index == closest_match)[0][0]

    #get similarity scores
    scores = list(enumerate(similarity_scores[index]))

    #sort by similarity and fetch top 5 recommendations
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:6]
    recommendations = [pt.index[i[0]] for i in scores]

    return recommendations

#(6)test recommendations
print(recommend_by_colab('The Hobbit'))  #replace with an actual book title



Filtered Ratings Shape: (90529, 3)
Filtered Ratings with Titles: (89929, 10)
Pivot Table Shape: (1913, 891)
Similarity Scores Shape: (1913, 1913)
['Bag of Bones', 'The Regulators', 'Pet Sematary', 'Skeleton Crew', 'The Bachman Books: Rage, the Long Walk, Roadwork, the Running Man']


In [15]:
#print(find_closest_match('The Hobbit'))

In [16]:
#print(filtered_ratings['Book-Title'].unique())

In [17]:
#print(similarity_scores[index])

this works because:

•	Filtered Data: Reduces rows and columns based on active users and popular books.

•	Sparse Matrices: Stores only non-zero values, drastically cutting down memory usage.

•	Cosine Similarity on Sparse Matrices: Efficiently computes similarity without inflating memory usage.

**genre-based filtering**

In [18]:
'''
#use TF-IDF to vectorize genres or descriptions
books['Book-Author'] = books['Book-Author'].fillna('')  #fill missing values
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['Book-Author'])

#compute cosine similarity
genre_similarity = cosine_similarity(tfidf_matrix)

#recommend books based on author similarity
def recommend_by_genre(book_title):
    idx = books[books['Book-Title'] == book_title].index[0]
    sim_scores = list(enumerate(genre_similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]
    book_indices = [i[0] for i in sim_scores]
    return books.iloc[book_indices][['Book-Title', 'Book-Author', 'Image-URL-M']]

#example: Genre-based recommendations
print(recommend_by_genre('1984'))
'''

"\n#use TF-IDF to vectorize genres or descriptions\nbooks['Book-Author'] = books['Book-Author'].fillna('')  #fill missing values\ntfidf = TfidfVectorizer(stop_words='english')\ntfidf_matrix = tfidf.fit_transform(books['Book-Author'])\n\n#compute cosine similarity\ngenre_similarity = cosine_similarity(tfidf_matrix)\n\n#recommend books based on author similarity\ndef recommend_by_genre(book_title):\n    idx = books[books['Book-Title'] == book_title].index[0]\n    sim_scores = list(enumerate(genre_similarity[idx]))\n    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]\n    book_indices = [i[0] for i in sim_scores]\n    return books.iloc[book_indices][['Book-Title', 'Book-Author', 'Image-URL-M']]\n\n#example: Genre-based recommendations\nprint(recommend_by_genre('1984'))\n"

**optimized genre based filtering**

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process  #for fuzzy matching
import numpy as np

#(1)filling in missing values in book-author column
books['Book-Author'] = books['Book-Author'].fillna('')

#(2)limiting to book filtered earlier for ratings
filtered_books = books[books['ISBN'].isin(filtered_ratings['ISBN'].unique())]
filtered_books = filtered_books.reset_index(drop=True)
print(f"Filtered Books Shape: {filtered_books.shape}")

#(3)tf-idf vectorization (converting the text into numerical features)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(filtered_books['Book-Author'])
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")

#(4)compute similarity in batches to save memory
batch_size = 500
similarity_batches = []

for i in range(0, tfidf_matrix.shape[0], batch_size):
    #compute cosine similarity batch-wise
    batch = tfidf_matrix[i:i + batch_size]
    sim = cosine_similarity(batch, tfidf_matrix)
    similarity_batches.append(sim)

#combine all batches into a single matrix
genre_similarity = np.vstack(similarity_batches)
print(f"Genre Similarity Shape: {genre_similarity.shape}")

#(5)fuzzy matching to Handle Title Mismatches
def find_closest_match(book_title):
    """Find the closest matching book title using fuzzy matching."""
    matches = process.extractOne(book_title, filtered_books['Book-Title'].values)
    return matches[0] #return the closest match

#(6)Genre-Based Recommendation Function
def recommend_by_genre(book_title):
    """Recommend books based on genre similarity."""
    #find the closest match for the given book title
    closest_title = find_closest_match(book_title)
    print(f"Using closest match: {closest_title}")

    #get index of the closest title
    idx = filtered_books[filtered_books['Book-Title'] == closest_title].index[0]

    #compute similarity scores
    scores = list(enumerate(genre_similarity[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:6]  # Top 5 similar books

    #extract book indices
    book_indices = [i[0] for i in scores]

    #return recommended books with their details
    return filtered_books.iloc[book_indices][['Book-Title', 'Book-Author', 'Image-URL-M']]

#(7) testing
book_title = 'The Hobbit'
print(recommend_by_genre(book_title))

Filtered Books Shape: (2101, 8)
TF-IDF Matrix Shape: (2101, 1031)
Genre Similarity Shape: (2101, 2101)
Using closest match: The Hobbit: or There and Back Again
                                            Book-Title     Book-Author  \
339                                   The Silmarillion  J.R.R. TOLKIEN   
381  The Fellowship of the Ring (The Lord of the Ri...  J.R.R. TOLKIEN   
574                The Hobbit: or There and Back Again  J.R.R. Tolkien   
659  The Hobbit : The Enchanting Prelude to The Lor...  J.R.R. TOLKIEN   
778  The Return of the King (The Lord of the Rings,...  J.R.R. TOLKIEN   

                                           Image-URL-M  
339  http://images.amazon.com/images/P/0345325818.0...  
381  http://images.amazon.com/images/P/0345339703.0...  
574  http://images.amazon.com/images/P/0618002219.0...  
659  http://images.amazon.com/images/P/0345339681.0...  
778  http://images.amazon.com/images/P/0345339738.0...  


key changes:
1.	Batch Processing for Similarity Calculations to avoid RAM overload.
2.	TF-IDF Vectorization limited to filtered books only.

**hybrid recommendations**

In [20]:
def hybrid_recommend(book_name):
    #collaborative filtering recommendations
    collab_recommendations = recommend_by_colab(book_name)

    #genre-based recommendations
    genre_recommendations = recommend_by_genre(book_name)['Book-Title'].tolist()

    #combine results
    combined = list(set(collab_recommendations + genre_recommendations))
    return combined[:5]  #return top 5 unique recommendations

#eg: Hybrid recommendations
print(hybrid_recommend('The Hobbit'))

Using closest match: The Hobbit: or There and Back Again
['Asking for Trouble: A Novel', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', 'Border Music', 'The Hobbit : The Enchanting Prelude to The Lord of the Rings', 'Blood Orchid']


**saving models**

In [21]:
import pickle
pickle.dump(pt, open('pivot_table.pkl', 'wb'))
pickle.dump(similarity_scores, open('similarity_scores.pkl', 'wb'))
pickle.dump(filtered_ratings, open('filtered_ratings.pkl', 'wb'))

In [22]:
!pip install flask
!pip install flask_cors
!pip install requests

Collecting flask_cors
  Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl.metadata (5.5 kB)
Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl (14 kB)
Installing collected packages: flask_cors
Successfully installed flask_cors-5.0.0
