In [2]:
# import required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the raw book catalogue from the CSV that sits next to this notebook
books_csv_path = "Books.csv"
books = pd.read_csv(books_csv_path)

# Data Understanding And Preprocessing

In [3]:
# Preview the first five rows to see which columns the dataset provides
books.head(5)

Unnamed: 0.1,Unnamed: 0,ISBN,book_id,Publication Year,Author,Title,AvgRating,Image-URL,Image-URL-S
0,0,195153448,1,2008,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,1,2005018,2,1997,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.44,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,2,60973129,3,2005,Stephenie Meyer,"Twilight (Twilight, #1)",3.57,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,3,374157065,4,1960,Harper Lee,To Kill a Mockingbird,4.25,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,4,393045218,5,1925,F. Scott Fitzgerald,The Great Gatsby,3.89,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [4]:
# Dataset dimensions as (number of rows, number of columns)
row_count, column_count = books.shape
(row_count, column_count)

(10000, 9)

The dataset has 9 columns and 10,000 rows.

In [5]:
# Drop the "Unnamed: 0" column (presumably a row index that was saved into the
# CSV - it just counts 0, 1, 2, ... in the preview) since it carries no
# information about the books.
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
books = books.drop(columns="Unnamed: 0", errors="ignore")

In [6]:
# Give the columns used later in the notebook short snake_case names
column_renames = {
    "Publication Year": "year",
    "Author": "author",
    "Title": "title",
    "AvgRating": "average_rating",
    "Image-URL": "image_url",
}
books.rename(columns=column_renames, inplace=True)

In [7]:
# Count the missing values in every column (isna is the canonical name of isnull)
books.isna().sum()

Unnamed: 0,0
ISBN,0
book_id,0
year,0
author,0
title,0
average_rating,0
image_url,0
Image-URL-S,0


The dataset has no null values.

In [8]:
# Count the rows that are exact copies of an earlier row
duplicate_mask = books.duplicated()
duplicate_mask.sum()

np.int64(0)

The dataset has no duplicate rows.

In [9]:
# Inspect the dtype pandas inferred for each column
column_dtypes = books.dtypes
column_dtypes

Unnamed: 0,0
ISBN,object
book_id,int64
year,int64
author,object
title,object
average_rating,float64
image_url,object
Image-URL-S,object


In [10]:
# Downcast the numeric columns from 64-bit to 32-bit types
compact_dtypes = {
    "book_id": "int32",
    "year": "int32",
    "average_rating": "float32",
}
for column_name, dtype_name in compact_dtypes.items():
    books[column_name] = books[column_name].astype(dtype_name)

# Popularity Based Filtering

In [11]:
# Rank every book by average rating (best first) and keep the first 30 rows
ranked_by_rating = books.sort_values(by="average_rating", ascending=False)
popular_books = ranked_by_rating.iloc[:30]

In [12]:
# Show the five highest-rated books
popular_books.head(5)

Unnamed: 0,ISBN,book_id,year,author,title,average_rating,image_url,Image-URL-S
3627,8807015188,3628,2005,Bill Watterson,The Complete Calvin and Hobbes,4.82,https://images.gr-assets.com/books/1473064526m...,https://images.gr-assets.com/books/1473064526s...
861,451456548,862,2014,Brandon Sanderson,"Words of Radiance (The Stormlight Archive, #2)",4.77,https://images.gr-assets.com/books/1391535251m...,https://images.gr-assets.com/books/1391535251s...
3274,1586480456,3275,2003,"J.K. Rowling, Mary GrandPré","Harry Potter Boxed Set, Books 1-5 (Harry Potte...",4.77,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
7946,1565071298,7947,2002,"Anonymous, Lane T. Dennis, Wayne A. Grudem",ESV Study Bible,4.76,https://images.gr-assets.com/books/1410151002m...,https://images.gr-assets.com/books/1410151002s...
8853,425125467,8854,1993,Francine Rivers,Mark of the Lion Trilogy,4.76,https://images.gr-assets.com/books/1349032180m...,https://images.gr-assets.com/books/1349032180s...


# Content-based Filtering

In [15]:
# Join title and author (separated by a space) into the text that gets vectorized
title_and_author = books["title"] + " " + books["author"]
books["text"] = title_and_author

In [16]:
# Turn each book's combined text into a TF-IDF vector, ignoring English stop words
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(raw_documents=books["text"])

# Pairwise cosine similarity between every pair of book vectors
cosine_sim = cosine_similarity(X=tfidf_matrix)

In [19]:
# function to recommend similar books
def recommend(book_title, top_n=10):
    """Return the books most similar to ``book_title``.

    Similarity scores are read from the module-level ``cosine_sim`` matrix,
    which is indexed by row position in the module-level ``books`` frame.

    Parameters
    ----------
    book_title : str
        Exact value of the ``title`` column to look up. If several rows share
        this title, the first one is used as the query.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``author``, ``year`` and ``average_rating`` columns of
        up to ``top_n`` books, most similar first. The queried row itself is
        never part of the result.

    Raises
    ------
    IndexError
        If no row of ``books`` has the given title.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    title_matches = (books["title"] == book_title).to_numpy()
    if not title_matches.any():
        # Same exception type as the former `.index[0]` lookup, but with a
        # message that tells the user what actually went wrong.
        raise IndexError(f"no book titled {book_title!r} in the dataset")

    # Work with the row *position* (argmax of a boolean mask is the first True):
    # cosine_sim and .iloc are positional, so an index label would only be
    # correct while `books` still has its default 0..n-1 index.
    query_pos = int(title_matches.argmax())
    scores = cosine_sim[query_pos]

    # Exclude the queried row explicitly instead of dropping whatever sorts
    # first: with tied (or all-zero) scores the query is not guaranteed to be
    # the top entry. sorted() is stable, so ties keep their dataset order.
    candidates = (pos for pos in range(len(scores)) if pos != query_pos)
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)

    return books.iloc[ranked[:top_n]][["title", "author", "year", "average_rating"]]

In [20]:
# Try the recommender on a title that exists in the dataset
query_title = "The Complete Calvin and Hobbes"
recommend(query_title)

Unnamed: 0,title,author,year,average_rating
1009,The Essential Calvin and Hobbes: A Calvin and ...,Bill Watterson,1988,4.65
6589,The Authoritative Calvin and Hobbes: A Calvin ...,Bill Watterson,1990,4.73
6919,The Indispensable Calvin and Hobbes,Bill Watterson,1992,4.73
779,Calvin and Hobbes,"Bill Watterson, G.B. Trudeau",1987,4.61
6360,There's Treasure Everywhere: A Calvin and Hobb...,Bill Watterson,1996,4.74
4482,It's a Magical World: A Calvin and Hobbes Coll...,Bill Watterson,1996,4.75
1787,The Calvin and Hobbes Tenth Anniversary Book,Bill Watterson,1995,4.63
5579,The Calvin and Hobbes Lazy Sunday Book,Bill Watterson,1989,4.66
5206,The Days Are Just Packed: A Calvin and Hobbes ...,Bill Watterson,1993,4.68
7253,Homicidal Psycho Jungle Cat: A Calvin and Hobb...,Bill Watterson,1994,4.71
