In [2]:
#Count how many lines our zipped folder contains
!type "C:\Users\priyanka.roychoudhur\Desktop\DS\BooksRecommendation\goodreads_books.json.gz" | find /c /v ""

14471322


In [3]:
#Size of the file 1 GB = 1,000,000,000 B
!dir goodreads_books.json.gz /O:E

 Volume in drive C is OS
 Volume Serial Number is C084-E4B0

 Directory of C:\Users\priyanka.roychoudhur\desktop\DS\BooksRecommendation

12/12/2022  15:06     2,078,599,273 goodreads_books.json.gz
               1 File(s)  2,078,599,273 bytes
               0 Dir(s)  1,520,926,404,608 bytes free


In [4]:
#to avoid overloading the memory by loading the entire dataset, we read it line by line

import gzip #to unzip the zipped folder

with gzip.open("goodreads_books.json.gz") as f:
    line = f.readline()
    
line #one line from the json file

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [5]:
import json

data = json.loads(line) # convert 1st entry of the json file to dictionary
data

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [6]:
# Define a function to reduce filesize by selecting only relevant features
def get_fields(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"], 
        "title": data["title_without_series"], 
        "ratings": data["ratings_count"], 
        "url": data["url"], 
        "publication_year": data["publication_year"],
        "cover_image": data["image_url"]
    }


In [7]:
books = []

with gzip.open("goodreads_books.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = get_fields(line)
        
        books.append(fields)

In [8]:
books[0]

{'book_id': '5333265',
 'title': 'W.C. Fields: A Life on Film',
 'ratings': '3',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'publication_year': '1984',
 'cover_image': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg'}

In [9]:
#load the dictionary into a pandas datatframe
import pandas as pd

books_df = pd.DataFrame.from_dict(books)

In [10]:
books_df.shape

(2360655, 6)

In [11]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 6 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   book_id           object
 1   title             object
 2   ratings           object
 3   url               object
 4   publication_year  object
 5   cover_image       object
dtypes: object(6)
memory usage: 108.1+ MB


In [12]:
#We see that rating variable is an object. Rating is an ordinal variable i.e. it has a natural ordering 1<2<3<4<5.
#So we convert this variable to numerical type

books_df["ratings"] = pd.to_numeric(books_df["ratings"])
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 6 columns):
 #   Column            Dtype  
---  ------            -----  
 0   book_id           object 
 1   title             object 
 2   ratings           float64
 3   url               object 
 4   publication_year  object 
 5   cover_image       object 
dtypes: float64(1), object(5)
memory usage: 108.1+ MB


In [13]:
books_df.head()

Unnamed: 0,book_id,title,ratings,url,publication_year,cover_image
0,5333265,W.C. Fields: A Life on Film,3.0,https://www.goodreads.com/book/show/5333265-w-...,1984.0,https://images.gr-assets.com/books/1310220028m...
1,1333909,Good Harbor,10.0,https://www.goodreads.com/book/show/1333909.Go...,2001.0,https://s.gr-assets.com/assets/nophoto/book/11...
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140.0,https://www.goodreads.com/book/show/7327624-th...,1987.0,https://images.gr-assets.com/books/1304100136m...
3,6066819,Best Friends Forever,51184.0,https://www.goodreads.com/book/show/6066819-be...,2009.0,https://s.gr-assets.com/assets/nophoto/book/11...
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,15.0,https://www.goodreads.com/book/show/287140.Run...,,https://images.gr-assets.com/books/1413219371m...


In [14]:
#Looking at any missing values in the data
(books_df.isnull().sum()/books_df.shape[0]*100).sort_values(ascending=False)

ratings             0.022197
book_id             0.000000
title               0.000000
url                 0.000000
publication_year    0.000000
cover_image         0.000000
dtype: float64

In [15]:
#Dropping data with null values in rating as we will need the ratings data
books_df["ratings"].isnull().sum()
books_df2 = books_df.dropna() #dropping 524 rows
books_df2.isnull().sum()

book_id             0
title               0
ratings             0
url                 0
publication_year    0
cover_image         0
dtype: int64

In [16]:
#cleaning titles of books

#remove anything in the titles that is not lowercase,uppercase, digits or a space
books_df2["clean_title"] = books_df2["title"].str.replace("[^a-zA-Z0-9 ]", "", regex = True) 

books_df2[["clean_title", "title"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df2["clean_title"] = books_df2["title"].str.replace("[^a-zA-Z0-9 ]", "", regex = True)


Unnamed: 0,clean_title,title
0,WC Fields A Life on Film,W.C. Fields: A Life on Film
1,Good Harbor,Good Harbor
2,The Unschooled Wizard Sun Wolf and Starhawk 12,"The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,Best Friends Forever,Best Friends Forever
4,Runic Astrology Starcraft and Timekeeping in t...,Runic Astrology: Starcraft and Timekeeping in ...
...,...,...
2360650,This Sceptred Isle Vol 10 The Age of Victoria ...,"This Sceptred Isle, Vol. 10: The Age of Victor..."
2360651,Sherlock Holmes and the July Crisis,Sherlock Holmes and the July Crisis
2360652,The Childrens Classic Poetry Collection,The Children's Classic Poetry Collection
2360653,101 Nights Volume One 101 Nights 13,"101 Nights: Volume One (101 Nights, #1-3)"


In [17]:
# #make all strings lowercase
books_df2["clean_title"] = books_df2["clean_title"].str.lower()

# books_df2[["clean_title", "title"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df2["clean_title"] = books_df2["clean_title"].str.lower()


In [18]:
#Removing any extra whitespace
books_df2["clean_title"] = books_df2["clean_title"].str.replace("\s+", " ", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df2["clean_title"] = books_df2["clean_title"].str.replace("\s+", " ", regex=True)


In [None]:
books_df2["clean_title"].value_counts() 

In [20]:
#Keepings only rows where there is a book title
books_df2 = books_df2[books_df2["clean_title"].str.len()>0] # this drops number of rows from 2360131 to 2360124

books_df2.shape

(2346577, 7)

In [21]:
books_df2.to_json("books_titles.json")

Machine learning algorithms often use numerical data, so when dealing with textual data or any natural language processing (NLP) task, a sub-field of ML/AI dealing with text, that data first needs to be converted to a vector of numerical data by a process known as vectorization. TF-IDF vectorization involves calculating the TF-IDF score for every word in your corpus relative to that document and then putting that information into a vector (see image below using example documents “A” and “B”). Thus each document in your corpus would have its own vector, and the vector would have a TF-IDF score for every single word in the entire collection of documents.

(TF)Term frequency works by looking at the frequency of a particular term you are concerned with relative to the document.  
(IDF) Inverse document frequency looks at how common (or uncommon) a word is amongst the corpus.

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Convert a collection of raw documents to a matrix of TF-IDF features. 
#quantify the importance or relevance of string representations (words, phrases, lemmas, etc)  in a document amongst a collection of documents 

#initialise it 
vectoriser = TfidfVectorizer() #To compute the cosine similarity, we need the word count of the words in each document.

tfidf = vectoriser.fit_transform(books_df2["clean_title"])# create the document term matrix

#used the TfidfVectorizer() instead of CountVectorizer(), because it  downweights words that occur frequently across docuemnts.


Counting the maximum number of common words between the documents. But as the size of the document increases, the number of common words tend to increase even if the documents talk about different topics. 
Cosine similarity is a metric used to determine how similar the documents are irrespective of their size. Cosine similarity measures the cosine of the angle between two vectors projected in a multi-dimensional space.

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np

def search(query):
    clean_query = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectoriser.transform([clean_query]) # converts the query into a vector
    similarity = cosine_similarity (query_vec, tfidf).flatten() # to get a numpy array
    indices = np.argpartition(similarity, -5)[-5:] # get the 5 most similar books and for that we want the indices
    results = books_df2.iloc[indices] # give us the book title rows from our dataset
    results = results.sort_values("ratings", ascending = False) #sort the rows with highest rated title at the top
    return results.head()

In [27]:
search("Handmaid's Tale")

Unnamed: 0,book_id,title,ratings,url,publication_year,cover_image,clean_title
128635,41567,The Handmaid's Tale,1134.0,https://www.goodreads.com/book/show/41567.The_...,2006,https://images.gr-assets.com/books/1320425957m...,the handmaids tale
304964,34729940,The Handmaid's Tale,378.0,https://www.goodreads.com/book/show/34729940-t...,2017,https://images.gr-assets.com/books/1492272668m...,the handmaids tale
1977105,6508785,The Handmaid's Tale,203.0,https://www.goodreads.com/book/show/6508785-th...,1996,https://s.gr-assets.com/assets/nophoto/book/11...,the handmaids tale
552188,998834,Handmaid's Tale,15.0,https://www.goodreads.com/book/show/998834.Han...,1988,https://s.gr-assets.com/assets/nophoto/book/11...,handmaids tale
1508664,33844877,The Handmaid's Tale,1.0,https://www.goodreads.com/book/show/33844877-t...,2009,https://images.gr-assets.com/books/1484245774m...,the handmaids tale


In [25]:
#name = input()

KeyboardInterrupt: Interrupted by user

In [None]:
#Now this gives us a way where we can search for a book title in the list of books and get their book id.