# Book Search

Data Source: https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home

In [1]:
import pandas as pd

In [2]:
# Count newline bytes in the file with `wc -l`.
# NOTE(review): the file is gzip-compressed, so this counts newline bytes in the
# compressed stream, not JSON records. For a record count, decompress first,
# e.g. `gzip -cd goodreads_books.json.gz | wc -l`.

!wc -l goodreads_books.json.gz

 7588375 goodreads_books.json.gz


In [3]:
# How large is the file? List the directory with human-readable sizes and keep
# only the row for this file.

!ls -lh | grep goodreads_books.json.gz

-rw-r--r--@ 1 stephenkipkurui  staff   1.9G Jul 15 12:10 goodreads_books.json.gz


In [4]:
# Read only the first line of the gzip archive instead of loading the whole
# file into memory, to see what a single record looks like.

import gzip

# Mode 'r' on gzip.open is binary, so `line` is a bytes object.
with gzip.open('goodreads_books.json.gz', 'r') as f:
    line = f.readline()

In [5]:
# Show the raw bytes of the first line read from the archive
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [6]:
# Parse the raw line with the json module to get a Python dict

import json
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [7]:
# Function to extract the selected fields from one JSON record

def parse_fields(line):
    """Parse one JSON record and keep only the fields used for book search.

    Parameters
    ----------
    line : str or bytes
        A single JSON document describing one book.

    Returns
    -------
    dict
        Keys 'book_id', 'title', 'ratings', 'url' and 'cover_image', in that
        order. Values are copied unchanged from the record; 'title' comes from
        'title_without_series', 'ratings' from 'ratings_count' and
        'cover_image' from 'image_url'.

    Raises
    ------
    KeyError
        If the record lacks one of the source fields.
    """
    record = json.loads(line)
    # (output key, source key) pairs, listed in output order.
    wanted = (
        ('book_id', 'book_id'),
        ('title', 'title_without_series'),
        ('ratings', 'ratings_count'),
        ('url', 'url'),
        ('cover_image', 'image_url'),
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}

In [8]:
# Only books with more than this many ratings are kept.
MIN_RATINGS = 15

books_titles = []

# Iterate over the archive line by line so the whole file is never held in
# memory; each line is one JSON record.
with gzip.open('goodreads_books.json.gz', 'r') as f:
    for line in f:
        fields = parse_fields(line)

        try:
            ratings = int(fields['ratings'])
        except ValueError:
            # The ratings count is not an integer string: skip this record.
            continue

        if ratings > MIN_RATINGS:
            books_titles.append(fields)

### Turn into a DataFrame

In [12]:
# Build a DataFrame with one row per kept book; the dict keys become the columns
titles = pd.DataFrame.from_dict(books_titles)
titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...


In [13]:
# Summarise the frame: row count, per-column non-null counts and dtypes, memory usage
titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1308957 entries, 0 to 1308956
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   book_id      1308957 non-null  object
 1   title        1308957 non-null  object
 2   ratings      1308957 non-null  object
 3   url          1308957 non-null  object
 4   cover_image  1308957 non-null  object
dtypes: object(5)
memory usage: 49.9+ MB


In [18]:
# Convert ratings from strings to a numeric dtype
titles['ratings'] = pd.to_numeric(titles['ratings'])

# Replace every character that is not an ASCII letter or digit with a space,
# then lower-case the result into a new column.
# Raw strings are used for the patterns so that backslash escapes such as \s
# reach the regex engine without triggering Python's invalid-escape warning.
titles['modified_title'] = (
    titles['title']
    .str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)
    .str.lower()
)

# Collapse every run of whitespace in the modified title into a single space
titles['modified_title'] = titles['modified_title'].str.replace(r'\s+', ' ', regex=True)

In [22]:
# Keep only titles that still contain at least one character after normalisation.
# Strip before measuring: a title made up solely of non-alphanumeric characters
# is reduced to a single space, which has length 1 and would otherwise pass.
titles = titles[titles['modified_title'].str.strip().str.len() > 0]

In [21]:
# Save the titles DataFrame to books_titles.json
titles.to_json('books_titles.json')

In [24]:
# Display the frame, now including the modified_title column
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,modified_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 1 2
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,all s fairy in love and war avalon web of magi...
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devil s notebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://www.goodreads.com/book/show/17805813-o...,https://images.gr-assets.com/books/1379766592m...,ondine ondine quartet 0 5
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://www.goodreads.com/book/show/331839.Jac...,https://s.gr-assets.com/assets/nophoto/book/11...,jacqueline kennedy onassis friend of the arts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://www.goodreads.com/book/show/2685097-th...,https://s.gr-assets.com/assets/nophoto/book/11...,the spaniard s blackmailed bride
1308955,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the children s classic poetry collection


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Fit on the normalised titles and keep the resulting matrix;
# search() compares query vectors against it.
tfidf = vectorizer.fit_transform(titles['modified_title'])

In [53]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    """Wrap ``val`` in an HTML anchor labelled "Goodreads" that opens in a new tab.

    ``val`` is inserted into the ``href`` attribute as-is (no escaping).
    """
    anchor_template = '<a target = "_blank" href="{}">Goodreads</a>'
    return anchor_template.format(val)

def show_image(val):
    """Build an HTML ``<img>`` tag, 50 wide, whose source is ``val``.

    ``val`` is inserted into the ``src`` attribute as-is (no escaping).
    """
    img_template = '<img src = "{}" width = 50></img>'
    return img_template.format(val)

def search(book_query, vectorizer):
    """Return the most-rated books among the titles closest to ``book_query``.

    The query is normalised the same way as the ``modified_title`` column
    (lower-cased, non-alphanumerics turned into spaces, whitespace collapsed)
    so that multi-word queries keep their word boundaries.

    Relies on the notebook-level ``titles`` DataFrame and ``tfidf`` matrix.

    Parameters
    ----------
    book_query : str
        Free-text title query.
    vectorizer : object
        Fitted vectorizer exposing ``transform``; it should be the one that
        produced ``tfidf`` so the query vector is comparable to it.

    Returns
    -------
    pandas Styler
        Up to 5 rows of ``titles``: the (at most) 10 rows with the highest
        cosine similarity to the query, sorted by ``ratings`` descending,
        with ``url`` rendered as a link and ``cover_image`` as an image.
    """
    # Replace (not delete) non-alphanumerics: deleting them also deletes the
    # spaces, fusing e.g. "king leopold" into the single token "kingleopold".
    processed = re.sub(r'[^a-zA-Z0-9]', ' ', book_query.lower())
    processed = re.sub(r'\s+', ' ', processed).strip()

    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition needs k within bounds, so cap it at the number of titles.
    top_k = min(10, similarity.size)
    indices = np.argpartition(similarity, -top_k)[-top_k:]

    results = titles.iloc[indices]
    results = results.sort_values('ratings', ascending = False)
    return results.head(5).style.format({'url': make_clickable, 'cover_image': show_image})

In [67]:
# Call the search function: of the titles closest to 'leopold',
# show the 5 with the most ratings
search('leopold', vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,modified_title
753839,347610,King Leopold's Ghost,25849,Goodreads,,king leopold s ghost
388257,10474352,King Leopold's Ghost,608,Goodreads,,king leopold s ghost
688670,953016,King Leopold's Ghost,226,Goodreads,,king leopold s ghost
1083960,27457,King Leopold's Ghost,84,Goodreads,,king leopold s ghost
68789,18680452,Leopold Blue,41,Goodreads,,leopold blue


In [66]:
# Presumably Goodreads book_id values (as strings) of books the user liked;
# '347610' is the book_id of "King Leopold's Ghost" in the search result above.
liked_books = ['17152735','400510', '347610']