In [1]:
#Importing all the necessary libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import os
from tabulate import tabulate
#For model building
import scipy
import math
import sklearn
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity # importing consine_similarity score from metrics module of seaborn lib.
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neighbors import NearestNeighbors


# This is to supress the warning messages
import warnings
warnings.filterwarnings('ignore') # for ignoring the warnings

In [2]:
Books = pd.read_csv('Dataset/Books.csv' , sep=";", on_bad_lines='skip', encoding='latin-1')

In [3]:
Books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
Books.shape

(271360, 8)

In [5]:
Books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [6]:
Books= Books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']]

In [7]:
Books.shape

(271360, 5)

In [8]:
Ratings = pd.read_csv('Dataset/Ratings.csv' , sep=";", on_bad_lines='skip', encoding='latin-1')

In [9]:
Ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [10]:
Ratings.shape

(1149780, 3)

In [11]:
Users = pd.read_csv('Dataset/Users.csv' , sep=";", on_bad_lines='skip', encoding='latin-1')

In [12]:
Users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [13]:
print(Books.shape)
print(Ratings.shape)
print(Users.shape)

(271360, 5)
(1149780, 3)
(278858, 3)


In [14]:
Books.rename(columns={"Book-Title":'Title',
                      'Book-Author':'Author',
                     "Year-Of-Publication":'Year',
                     "Publisher":"Publisher"},inplace=True)

In [15]:
Books.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [16]:
Users.rename(columns={"User-ID":'User_ID'}, inplace=True)

In [17]:
Users.head()

Unnamed: 0,User_ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [18]:
Ratings.rename(columns={"User-ID":'User_ID',
                      'Book-Rating':'Rating'},inplace=True)

In [19]:
Ratings.head()

Unnamed: 0,User_ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [20]:
print(Books.shape, Users.shape, Ratings.shape, sep='\n')

(271360, 5)
(278858, 3)
(1149780, 3)


In [21]:
Ratings['User_ID'].value_counts()   #checking the count of ratings by the User_ID

User_ID
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: count, Length: 105283, dtype: int64

In [22]:
#checking number of unique User IDs
Ratings['User_ID'].value_counts().shape
Ratings['User_ID'].unique().shape

(105283,)

In [23]:
##storing users who had at least rated more than 200 books
x= Ratings['User_ID'].value_counts() > 200

In [24]:
x[x].shape

(899,)

In [25]:
##storing the  list of User IDs
y= x[x].index
y

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
       260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727, 268622,
       188951],
      dtype='int64', name='User_ID', length=899)

In [26]:
Ratings = Ratings[Ratings['User_ID'].isin(y)]

In [27]:
Ratings.head()

Unnamed: 0,User_ID,ISBN,Rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [28]:
##merging the ratings dataframe with books
Ratings_with_Books = Ratings.merge(Books, on='ISBN').merge(Users, on='User_ID')

In [29]:
Ratings_with_Books.head()

Unnamed: 0,User_ID,ISBN,Rating,Title,Author,Year,Publisher,Location,Age
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,"gilbert, arizona, usa",48.0
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,"gilbert, arizona, usa",48.0
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,"gilbert, arizona, usa",48.0
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,"gilbert, arizona, usa",48.0
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,"gilbert, arizona, usa",48.0


In [30]:
Ratings_with_Books.shape

(487671, 9)

In [31]:
Number_Rating = Ratings_with_Books.groupby('Title')['Rating'].count().reset_index()

In [32]:
Number_Rating.head()

Unnamed: 0,Title,Rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [33]:
Number_Rating.rename(columns={'Rating':'Number of Ratings'}, inplace=True)

In [34]:
Number_Rating.head()

Unnamed: 0,Title,Number of Ratings
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [35]:
Number_Rating.shape

(160269, 2)

In [36]:
Final_Rating= Ratings_with_Books.merge(Number_Rating, on='Title')

In [37]:
Final_Rating.head()

Unnamed: 0,User_ID,ISBN,Rating,Title,Author,Year,Publisher,Location,Age,Number of Ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,"gilbert, arizona, usa",48.0,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,"gilbert, arizona, usa",48.0,7
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,"gilbert, arizona, usa",48.0,1
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,"gilbert, arizona, usa",48.0,1
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,"gilbert, arizona, usa",48.0,13


In [38]:
Final_Rating = Final_Rating[Final_Rating['Number of Ratings'] >= 50]

In [39]:
Final_Rating.head()

Unnamed: 0,User_ID,ISBN,Rating,Title,Author,Year,Publisher,Location,Age,Number of Ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,"gilbert, arizona, usa",48.0,82
13,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,"gilbert, arizona, usa",48.0,133
15,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,"gilbert, arizona, usa",48.0,108
18,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,"gilbert, arizona, usa",48.0,108
24,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,"gilbert, arizona, usa",48.0,79


In [40]:
Final_Rating.shape

(61853, 10)

In [41]:
Final_Rating.drop_duplicates(['User_ID','Title'],inplace=True)
Final_Rating.shape

(59850, 10)

In [42]:
Final_Rating.isnull().sum()

User_ID                  0
ISBN                     0
Rating                   0
Title                    0
Author                   0
Year                     0
Publisher                0
Location                 0
Age                  13247
Number of Ratings        0
dtype: int64

In [43]:
Final_Rating.dtypes

User_ID                int64
ISBN                  object
Rating                 int64
Title                 object
Author                object
Year                  object
Publisher             object
Location              object
Age                  float64
Number of Ratings      int64
dtype: object

**Top-K**

In [45]:
agg_final_rating_df = Final_Rating.groupby('Title').agg(Average_Rating = ('Rating','mean'),Number_of_ratings=('Rating','count')).reset_index()

In [46]:
agg_final_rating_df.head()

Unnamed: 0,Title,Average_Rating,Number_of_ratings
0,1984,3.347222,72
1,1st to Die: A Novel,2.196078,153
2,2nd Chance,2.347826,115
3,4 Blondes,0.971831,71
4,84 Charing Cross Road,4.568627,51


In [47]:
def recommend_top_k(n=5):
  top_recommendations = agg_final_rating_df.sort_values(['Average_Rating','Number_of_ratings'],ascending=False).head(10)
  return top_recommendations

In [48]:
recommended_top_10 = recommend_top_k(10)

In [49]:
print("Your Top 10 Recommendations : \n")
recommended_top_10.head(10)

Your Top 10 Recommendations : 



Unnamed: 0,Title,Average_Rating,Number_of_ratings
596,The Little Prince,5.307692,52
241,Harry Potter and the Sorcerer's Stone (Book 1),4.723684,76
4,84 Charing Cross Road,4.568627,51
240,Harry Potter and the Prisoner of Azkaban (Book 3),4.451128,133
238,Harry Potter and the Goblet of Fire (Book 4),4.444444,117
239,Harry Potter and the Order of the Phoenix (Boo...,4.180952,105
103,Carrie,3.962264,53
30,A Wrinkle in Time,3.890411,73
237,Harry Potter and the Chamber of Secrets (Book 2),3.786127,173
642,The Secret Garden,3.784615,65


In [50]:
def collaborative_filtering(User_id, Final_Rating, top_n=5):
    # Create user-item matrix
    user_item_matrix = Final_Rating.pivot(index="User_ID", columns="ISBN", values="Rating").fillna(0)
    sparse_matrix = csr_matrix(user_item_matrix.values)

    # Fit the KNN model
    model_knn = NearestNeighbors(metric="cosine", algorithm="brute")
    model_knn.fit(sparse_matrix)

    # Find the nearest neighbors
    user_vector = user_item_matrix.loc[User_id].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(user_vector, n_neighbors=top_n + 1)

    # Generate recommendations
    recommended_book_ids = user_item_matrix.iloc[indices.flatten()[1:]].mean(axis=0).nlargest(top_n).index
    recommended_books = Final_Rating[Final_Rating["ISBN"].isin(recommended_book_ids)]
    
    # Format results using tabulate
    print("Collaborative Filtering Recommendations:\n")
    print(tabulate(recommended_books[["Title", "Author"]].drop_duplicates(), headers='keys', tablefmt='fancy_grid'))
    
    return recommended_books[["Title", "Author"]].drop_duplicates()

In [51]:
collab_rec = collaborative_filtering(277427, Final_Rating, top_n=10)
# print(tabulate("Collaborative Filtering Recommendations:\n", collab_rec.drop_duplicates()))

Collaborative Filtering Recommendations:

╒═══════╤══════════════════════════════════════╤════════════════════════╕
│       │ Title                                │ Author                 │
╞═══════╪══════════════════════════════════════╪════════════════════════╡
│    32 │ Girl in Hyacinth Blue                │ Susan Vreeland         │
├───────┼──────────────────────────────────────┼────────────────────────┤
│    65 │ Me Talk Pretty One Day               │ David Sedaris          │
├───────┼──────────────────────────────────────┼────────────────────────┤
│   126 │ The Da Vinci Code                    │ Dan Brown              │
├───────┼──────────────────────────────────────┼────────────────────────┤
│   143 │ Lord of the Flies                    │ William Gerald Golding │
├───────┼──────────────────────────────────────┼────────────────────────┤
│   340 │ Pay It Forward                       │ Catherine Ryan Hyde    │
├───────┼──────────────────────────────────────┼──────────────────────

In [52]:
# Content-Based Filtering
def content_based_recommendation(book_title, Final_Rating, top_n=5):
    tfidf = TfidfVectorizer(stop_words="english")
    Final_Rating["Title"] = Final_Rating["Title"].fillna("")
    content_based_final_df = Final_Rating.drop_duplicates(subset="Title")
    tfidf_matrix = tfidf.fit_transform(content_based_final_df["Title"])

    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    indices = pd.Series(content_based_final_df.index, index=content_based_final_df["Title"]).drop_duplicates()

    if book_title not in indices:
        raise ValueError(f"'{book_title}' not found in the dataset.")

    idx = indices[book_title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n + 1]
    book_indices = [i[0] for i in sim_scores]

    recommendations = content_based_final_df.iloc[book_indices][["Title", "Author"]]
    
    # Display using tabulate for a cleaner console output
    print(tabulate(recommendations, headers='keys', tablefmt='fancy_grid'))
    return recommendations

In [53]:
content_rec = content_based_recommendation("Girl in Hyacinth Blue", Final_Rating, top_n=5)
# print("Content-Based Recommendations:\n", content_rec.drop_duplicates())

╒═══════╤══════════════════════════════════════════════════════════════════════════╤═══════════════════╕
│       │ Title                                                                    │ Author            │
╞═══════╪══════════════════════════════════════════════════════════════════════════╪═══════════════════╡
│  5168 │ The Two Towers (The Lord of the Rings, Part 2)                           │ J. R. R. Tolkien  │
├───────┼──────────────────────────────────────────────────────────────────────────┼───────────────────┤
│ 19069 │ The Return of the King (The Lord of the Rings, Part 3)                   │ J. R. R. Tolkien  │
├───────┼──────────────────────────────────────────────────────────────────────────┼───────────────────┤
│  5167 │ The Fellowship of the Ring (The Lord of the Rings, Part 1)               │ J. R. R. Tolkien  │
├───────┼──────────────────────────────────────────────────────────────────────────┼───────────────────┤
│    75 │ The Hobbit : The Enchanting Prelude to The Lo