# Book Search Engine

In [47]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ipywidgets as widgets
from IPython.display import display 


In [48]:
# Load the preprocessed Book-Crossing CSV from the Kaggle input directory.
DATA_PATH = "/kaggle/input/bookcrossing-dataset/Books Data with Category Language and Summary/Preprocessed_data.csv"
prepro_data = pd.read_csv(DATA_PATH)

# Preview the first three rows to check the columns that were read.
prepro_data.head(3)

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l,Summary,Language,Category,city,state,country
0,0,2,"stockton, california, usa",18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,Provides an introduction to classical myths pl...,en,['Social Science'],stockton,california,usa
1,1,8,"timmins, ontario, canada",34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
2,2,11400,"ottawa, ontario, canada",49.0,2005018,0,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],ottawa,ontario,canada


In [49]:
# Columns that play no role in title search; every other column is kept.
COLUMNS_TO_DROP = [
    'Unnamed: 0', 'location', 'age', 'year_of_publication', 'publisher',
    'img_s', 'img_l', 'img_m', 'Summary', 'Language',
    'city', 'state', 'country', 'user_id', 'isbn', 'rating',
]
new_book = prepro_data.drop(columns=COLUMNS_TO_DROP)


In [50]:
# Preview the first three rows of the trimmed table.
new_book.head(3)

Unnamed: 0,book_title,book_author,Category
0,Classical Mythology,Mark P. O. Morford,['Social Science']
1,Clara Callan,Richard Bruce Wright,['Actresses']
2,Clara Callan,Richard Bruce Wright,['Actresses']


In [51]:
# Count missing values in each column.
new_book.isnull().sum()

book_title     0
book_author    1
Category       0
dtype: int64

In [52]:
# Count rows that repeat an earlier row across all columns.
new_book.duplicated().sum()

771338

In [53]:
# (rows, columns) of the trimmed table.
new_book.shape

(1031175, 3)

In [54]:
# Handle missing data: rows without an author get the placeholder 'Unknown'.
new_book = new_book.assign(book_author=new_book['book_author'].fillna('Unknown'))

In [55]:
# Preview the first three rows again after filling missing authors.
new_book.head(3)

Unnamed: 0,book_title,book_author,Category
0,Classical Mythology,Mark P. O. Morford,['Social Science']
1,Clara Callan,Richard Bruce Wright,['Actresses']
2,Clara Callan,Richard Bruce Wright,['Actresses']


In [56]:
# Number of distinct book titles (nunique does not count missing values by default).
check = new_book['book_title'].nunique()
check

241090

In [57]:
# Normalise titles with a regex so that punctuation does not get in the way of matching.
# Anything that is not an ASCII letter, an ASCII digit or a space is removed.
_TITLE_NOISE = re.compile("[^a-zA-Z0-9 ]")


def clean_book_title(title):
    """Return `title` with every character other than a-z, A-Z, 0-9 and space removed.

    Removed characters are deleted, not replaced, so spaces that surrounded
    punctuation are left as they were (runs of spaces are not collapsed).
    """
    cleaned = _TITLE_NOISE.sub("", title)
    return cleaned

In [58]:
# Run the cleaner over every raw title and store the result in a new column.
cleaned_titles = new_book['book_title'].apply(clean_book_title)
new_book['clean_book_title'] = cleaned_titles

In [59]:
# Display the table, which now includes the clean_book_title column.
new_book

Unnamed: 0,book_title,book_author,Category,clean_book_title
0,Classical Mythology,Mark P. O. Morford,['Social Science'],Classical Mythology
1,Clara Callan,Richard Bruce Wright,['Actresses'],Clara Callan
2,Clara Callan,Richard Bruce Wright,['Actresses'],Clara Callan
3,Clara Callan,Richard Bruce Wright,['Actresses'],Clara Callan
4,Clara Callan,Richard Bruce Wright,['Actresses'],Clara Callan
...,...,...,...,...
1031170,As Hogan Said . . . : The 389 Best Things Anyo...,Randy Voorhees,['Humor'],As Hogan Said The 389 Best Things Anyone S...
1031171,All Elevations Unknown: An Adventure in the He...,Sam Lightner,['Nature'],All Elevations Unknown An Adventure in the Hea...
1031172,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,9,Why stop A guide to Texas historical roadside ...
1031173,The Are You Being Served? Stories: 'Camping In...,Jeremy Lloyd,['Fiction'],The Are You Being Served Stories Camping In an...


In [60]:
# Keep one row per (cleaned title, author) pair, then discard the raw title
# because the cleaned title is used from here on.
new_unique_book = (
    new_book
    .drop_duplicates(subset=['clean_book_title', 'book_author'])
    .drop(columns=['book_title'])
)

In [61]:
# Display the de-duplicated table.
new_unique_book

Unnamed: 0,book_author,Category,clean_book_title
0,Mark P. O. Morford,['Social Science'],Classical Mythology
1,Richard Bruce Wright,['Actresses'],Clara Callan
15,Carlo D'Este,['1940-1949'],Decision in Normandy
18,Gina Bari Kolata,['Medical'],Flu The Story of the Great Influenza Pandemic ...
29,E. J. W. Barber,['Design'],The Mummies of Urumchi
...,...,...,...
1031170,Randy Voorhees,['Humor'],As Hogan Said The 389 Best Things Anyone S...
1031171,Sam Lightner,['Nature'],All Elevations Unknown An Adventure in the Hea...
1031172,Claude Dooley,9,Why stop A guide to Texas historical roadside ...
1031173,Jeremy Lloyd,['Fiction'],The Are You Being Served Stories Camping In an...


In [None]:
# Put the cleaned title first so search results read title -> category -> author.
COLUMN_ORDER = ['clean_book_title', 'Category', 'book_author']
new_unique_book = new_unique_book[COLUMN_ORDER]

new_unique_book

Unnamed: 0,clean_book_title,Category,book_author
0,Classical Mythology,['Social Science'],Mark P. O. Morford
1,Clara Callan,['Actresses'],Richard Bruce Wright
15,Decision in Normandy,['1940-1949'],Carlo D'Este
18,Flu The Story of the Great Influenza Pandemic ...,['Medical'],Gina Bari Kolata
29,The Mummies of Urumchi,['Design'],E. J. W. Barber
...,...,...,...
1031170,As Hogan Said The 389 Best Things Anyone S...,['Humor'],Randy Voorhees
1031171,All Elevations Unknown An Adventure in the Hea...,['Nature'],Sam Lightner
1031172,Why stop A guide to Texas historical roadside ...,9,Claude Dooley
1031173,The Are You Being Served Stories Camping In an...,['Fiction'],Jeremy Lloyd


In [63]:

# Build a TF-IDF matrix over the cleaned titles using unigrams and bigrams.
# The fitted `vectorizer` is kept so that queries can be transformed with the
# same vocabulary as the titles.
title_corpus = new_unique_book['clean_book_title']
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(title_corpus)


In [64]:
"""now let's apply cosine similarity"""

def search(title, top_n=10):
    """Return the books whose cleaned titles are most similar to `title`.

    The query is cleaned with `clean_book_title`, transformed with the fitted
    `vectorizer`, and compared to every row of `tfidf` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text query typed by the user.
    top_n : int, default 10
        Maximum number of rows to return. It is capped at the number of
        indexed books so that small corpora do not raise.

    Returns
    -------
    pandas.DataFrame
        Rows of `new_unique_book`, ordered from most to least similar.
    """
    query_vector = vectorizer.transform([clean_book_title(title)])
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # argpartition requires 1 <= top_n <= len(similarity).
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return new_unique_book.iloc[:0]

    # argpartition finds the top_n candidates in linear time, but in arbitrary order.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    # Rank only those candidates by score, best match first.
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    return new_unique_book.iloc[ranked]

In [65]:

# Shortest query (in characters) that triggers a search.
MIN_QUERY_LENGTH = 5

# `disabled` is the Text widget's trait name; the misspelt `disable` keyword
# was silently ignored.
book_input = widgets.Text(value='Harry Potter', description='Book Title', disabled=False)

# Output area that holds the most recent search results.
book_list = widgets.Output()


def on_type(data):
    """Refresh the result list whenever the text box value changes.

    `data` is the change dictionary passed by `observe`; its "new" entry holds
    the current text. Previous results are always cleared, and a new search
    runs only once the text has at least MIN_QUERY_LENGTH characters.
    """
    with book_list:
        book_list.clear_output()
        title = data["new"]
        if len(title) >= MIN_QUERY_LENGTH:
            display(search(title))


# Observe only changes to `value`, then render the input box above the results.
book_input.observe(on_type, names='value')
display(book_input, book_list)

Text(value='Harry Potter', description='Book Title')

Output()