
## Inspecting the datasets


In [3]:
# goodreads_books.josn file is very large.
# So reading it in a streaming fashion(line by line) without unzipping it

import gzip
import json

with gzip.open("goodreads_books.json.gz") as f:
    line = f.readline()   #fetching the first line
    
json.loads(line) #converts a json string to a python dict


{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

## Data Cleaning and Preprocessing

In [4]:
#function to parse a single line in books metadata and return relevant fields
def parse_fields(line):
    data=json.loads(line)
    return{
        "book_id":data["book_id"],
        "title":data["title_without_series"],
        "ratings":data["ratings_count"],
        "cover_image":data["image_url"],
        "link":data["url"]
    }

In [5]:
books = []
with gzip.open("goodreads_books.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)
        
        #in case the ratings field for a book is missing or not an int
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            continue
        
        books.append(fields)

In [1]:
import pandas as pd

books = pd.DataFrame.from_dict(books)

#no. of unique books in the dataset
print(books.book_id.size)



NameError: name 'books' is not defined

In [7]:
books["ratings"] = pd.to_numeric(books["ratings"])

#to minimize the search space for our search engine

#get rid of chars other than alphanumeric and space
books["mod_title"] = books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True) 

#replace more than 1 spaces in a row with a single space
books["mod_title"] = books["mod_title"].str.replace("\s+", " ", regex=True)

#lowercase all chars
books["mod_title"] = books["mod_title"].str.lower()

#removing null titles
books = books[books["mod_title"].str.len() > 0]


In [8]:
books

Unnamed: 0,book_id,title,ratings,cover_image,link,mod_title
0,5333265,W.C. Fields: A Life on Film,3,https://images.gr-assets.com/books/1310220028m...,https://www.goodreads.com/book/show/5333265-w-...,wc fields a life on film
1,1333909,Good Harbor,10,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1333909.Go...,good harbor
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://images.gr-assets.com/books/1304100136m...,https://www.goodreads.com/book/show/7327624-th...,the unschooled wizard sun wolf and starhawk 12
3,6066819,Best Friends Forever,51184,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/6066819-be...,best friends forever
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://images.gr-assets.com/books/1413219371m...,https://www.goodreads.com/book/show/287140.Run...,runic astrology starcraft and timekeeping in t...
...,...,...,...,...,...,...
2360126,3084038,"This Sceptred Isle, Vol. 10: The Age of Victor...",12,https://images.gr-assets.com/books/1494763458m...,https://www.goodreads.com/book/show/3084038-th...,this sceptred isle vol 10 the age of victoria ...
2360127,26168430,Sherlock Holmes and the July Crisis,6,https://images.gr-assets.com/books/1440592011m...,https://www.goodreads.com/book/show/26168430-s...,sherlock holmes and the july crisis
2360128,2342551,The Children's Classic Poetry Collection,36,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/2342551.Th...,the childrens classic poetry collection
2360129,22017381,"101 Nights: Volume One (101 Nights, #1-3)",70,https://images.gr-assets.com/books/1398621236m...,https://www.goodreads.com/book/show/22017381-1...,101 nights volume one 101 nights 13


In [9]:
#saving the cleaned data for future use
books.to_json("books_data.json")

## Building the Search Engine

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

#building a tf-idf matrix of book titles
tfidf = vectorizer.fit_transform(books["mod_title"])

In [11]:
tfidf

<2346577x397156 sparse matrix of type '<class 'numpy.float64'>'
	with 11162314 stored elements in Compressed Sparse Row format>

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def search(query):
    #pre-processing the query
    processed_query=re.sub("[^a-zA-Z0-9 ]","",query.lower())
    
    #converting the query into tf-idf vector
    query_vec=vectorizer.transform([processed_query]) 
    
    #calculating query similarity to all the titles
    similarity = cosine_similarity(query_vec, tfidf).flatten() 
    
    #finding the indices of top10 most similar titles
    indices = np.argpartition(similarity, -10)[-10:]
    
    search_results = books.iloc[indices]
    
    #sorting the results in descending order of ratings
    search_results = search_results.sort_values("ratings",ascending=False)
    
    #returning the top-5 results
    # return search_results.head(5).style.format({'link': make_clickable, 'cover_image': show_image})
    
    #returning the book_id column of all the rows in result df
    return search_results.iloc[0,:]["book_id"]


#utility functions to format data frame
def make_clickable(val):
    return "<a href={}>See on GoodReads</a>".format(val)

def show_image(val):
    return "<img src={} width=55>".format(val)

In [13]:
search("the invisible man")

Unnamed: 0,book_id,title,ratings,cover_image,link,mod_title
1760839,22514478,The Invisible Man,800,,See on GoodReads,the invisible man
1057805,176938,The Invisible Man,330,,See on GoodReads,the invisible man
1015443,13393440,The Invisible Man,63,,See on GoodReads,the invisible man
1576337,8554342,The Invisible Man,55,,See on GoodReads,the invisible man
1778344,8430737,The Invisible Man,44,,See on GoodReads,the invisible man


In [41]:
my_books=pd.read_csv("goodreads_library_export.csv")
my_books

Unnamed: 0,Title,Author,Author l-f,Additional Authors,ISBN,ISBN13,My Rating,Average Rating,Publisher,Binding,...,Date Read,Date Added,Bookshelves,Bookshelves with positions,Exclusive Shelf,My Review,Spoiler,Private Notes,Read Count,Owned Copies
0,The Silent Patient,Alex Michaelides,"Michaelides, Alex",,1250301696,9781250000000.0,5,4.17,Celadon Books,Hardcover,...,,12/6/2022,,,read,,,,1,0
1,"A Killer's Mind (Zoe Bentley Mystery, #1)",Mike Omer,"Omer, Mike",,,,5,4.19,Thomas & Mercer,Kindle Edition,...,,12/6/2022,,,read,,,,1,0
2,The Secret of the Old Clock (Nancy Drew Myster...,Carolyn Keene,"Keene, Carolyn","Russell H. Tandy, Sara Paretsky",1557091552,9781557000000.0,5,3.98,Applewood Books,Hardcover,...,,12/6/2022,,,read,,,,1,0
3,The Stephen King Universe: A Guide to the Worl...,Stanley Wiater,"Wiater, Stanley","Christopher Golden, Hank Wagner",1580631606,9781581000000.0,5,4.16,Renaissance Books,Paperback,...,,12/6/2022,,,read,,,,1,0
4,"Uzumaki: Spiral into Horror, Vol. 1",Junji Ito,"Ito, Junji",,1569317143,9781569000000.0,5,4.36,Viz Media,Paperback,...,,12/6/2022,,,read,,,,1,0
5,The Amityville Horror,Jay Anson,"Anson, Jay",,0553116606,9780553000000.0,5,3.84,Bantam Books,Mass Market Paperback,...,,12/6/2022,,,read,,,,1,0
6,The Guernsey Literary and Potato Peel Pie Society,Mary Ann Shaffer,"Shaffer, Mary Ann",Annie Barrows,1984801813,9781985000000.0,5,4.18,Dial Press,Paperback,...,,12/6/2022,,,read,,,,1,0
7,Dial A for Aunties (Aunties #1),Jesse Q. Sutanto,"Sutanto, Jesse Q.",,0593336739,9780593000000.0,5,3.76,Berkley,Hardcover,...,,12/6/2022,,,read,,,,1,0
8,The Unlikely Pilgrimage of Harold Fry (Harold ...,Rachel Joyce,"Joyce, Rachel",,0812993292,9780813000000.0,5,3.92,Random House,Hardcover,...,,12/6/2022,,,read,,,,1,0
9,The House in the Cerulean Sea,T.J. Klune,"Klune, T.J.",,1250217288,9781250000000.0,4,4.44,Tor Books,Hardcover,...,,12/6/2022,,,read,,,,1,0


In [42]:
#dropping other columns
my_books=my_books[["Title","My Rating"]]
my_books

Unnamed: 0,Title,My Rating
0,The Silent Patient,5
1,"A Killer's Mind (Zoe Bentley Mystery, #1)",5
2,The Secret of the Old Clock (Nancy Drew Myster...,5
3,The Stephen King Universe: A Guide to the Worl...,5
4,"Uzumaki: Spiral into Horror, Vol. 1",5
5,The Amityville Horror,5
6,The Guernsey Literary and Potato Peel Pie Society,5
7,Dial A for Aunties (Aunties #1),5
8,The Unlikely Pilgrimage of Harold Fry (Harold ...,5
9,The House in the Cerulean Sea,4


### Using the Search Engine

In [43]:
#searching for the book_ids for the liked titles in the books dataset using search engine
my_books["book_id"] = my_books["Title"].apply(lambda x: search(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_books["book_id"] = my_books["Title"].apply(lambda x: search(x))


In [44]:
my_books

Unnamed: 0,Title,My Rating,book_id
0,The Silent Patient,5,11439409
1,"A Killer's Mind (Zoe Bentley Mystery, #1)",5,9118158
2,The Secret of the Old Clock (Nancy Drew Myster...,5,32979
3,The Stephen King Universe: A Guide to the Worl...,5,10612
4,"Uzumaki: Spiral into Horror, Vol. 1",5,25152
5,The Amityville Horror,5,293101
6,The Guernsey Literary and Potato Peel Pie Society,5,6979801
7,Dial A for Aunties (Aunties #1),5,213980
8,The Unlikely Pilgrimage of Harold Fry (Harold ...,5,13227454
9,The House in the Cerulean Sea,4,17934610


In [45]:
#initializing my user_id to -1
my_books["user_id"]=-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_books["user_id"]=-1


In [46]:
#renaming the columns
my_books.columns=["title","my_rating","book_id","user_id"]
my_books

Unnamed: 0,title,my_rating,book_id,user_id
0,The Silent Patient,5,11439409,-1
1,"A Killer's Mind (Zoe Bentley Mystery, #1)",5,9118158,-1
2,The Secret of the Old Clock (Nancy Drew Myster...,5,32979,-1
3,The Stephen King Universe: A Guide to the Worl...,5,10612,-1
4,"Uzumaki: Spiral into Horror, Vol. 1",5,25152,-1
5,The Amityville Horror,5,293101,-1
6,The Guernsey Literary and Potato Peel Pie Society,5,6979801,-1
7,Dial A for Aunties (Aunties #1),5,213980,-1
8,The Unlikely Pilgrimage of Harold Fry (Harold ...,5,13227454,-1
9,The House in the Cerulean Sea,4,17934610,-1


In [47]:
my_books.to_csv("my_books.csv")