## Data
The data from Goodread website is used fro this analysis, which is scrapped by researchers at UCSD.
[link to data](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home)

In [5]:
#number of lines in the json file
!wc -l goodreads_books.json.gz

'wc' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
!ls -lh | grep goodreads_books.json.gz

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
#read the json file without unzipping
import gzip
import json
import pandas as pd


with gzip.open('book-recommendation/goodreads_books.json.gz', 'r') as f:
    l = f.readline()

In [13]:
l

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [20]:
def parse_fields(line):
    line = json.loads(line) #load streams
    return{
        'book_id':line['book_id'],
        'title':line['title_without_series'],
        'rating' : line['ratings_count'],
        'url' : line['url'],
        'book_cover': line['image_url']
    }

In [21]:
book_titles = []
with gzip.open('book-recommendation/goodreads_books.json.gz', 'r') as f:
    while True:
        l = f.readline()
        if not l:
            break
        fileld = parse_fields(l)
        #throw an error in rating filed is empty
        try:
            rating = int(fileld['rating'])
        except ValueError:
            continue
        
        #consider books with more than 5 ratings
        if rating > 5:
            book_titles.append(fileld)
        
    

In [34]:
titles = pd.DataFrame.from_dict(book_titles)

In [35]:
titles.head(4)

Unnamed: 0,book_id,title,rating,url,book_cover
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...


## Data cleaning

#### Converting ratings into number

In [36]:
titles['rating']= pd.to_numeric(titles['rating'])

#### Removing special characters from title column

In [37]:
titles['mod_tit'] = titles['title'].str.replace('[^a-zA-Z)-9 ]','',regex=True)

#### Convert title colimn to lowecase

In [38]:
titles['mod_tit'] = titles['mod_tit'].str.lower()

#### Remove consecutive spaces into one space

In [39]:
titles['mod_tit'] = titles['mod_tit'].str.replace('\s+',' ',regex=True)

#### Remove rows with no title

In [40]:
titles = titles[titles['mod_tit'].str.len()> 0]

In [41]:
titles.to_json('book-recommendation/book_titles.json')

In [42]:
titles.head(4)

Unnamed: 0,book_id,title,rating,url,book_cover,mod_tit
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,"the unschooled wizard sun wolf and starhawk, 1-2)"
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...


## Building the search engine
To build the search engine, we need to make a tf-idf matrix of book titles and also we need to find a similarity metric to calculate the simillarity between the search query and tf-idf matrix.

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(titles['mod_tit'])

In [52]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re  #regex


#adding a link to url in goodreads
def show_url(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val)


def search(query, vectorizer):
    
    process_query = re.sub('[^A-Za-z0-9 ]','',query.lower()) #remove special chars from query and convert to lowercase
    query_vec = vectorizer.transform([process_query]) #vectorize the query
    similarity = cosine_similarity(query_vec,tfidf).flatten() #calculate cosine similarity
    indices = np.argpartition(similarity,-10)[-10:] #get indices of most relevant books 
    results = titles.iloc[indices] 
    results = results.sort_values('rating',ascending =False)
    return results.head(5).style.format({'url':show_url,'book_cover':show_image})#return 5 most relavent books

In [60]:
search('franny and zooey', vectorizer)

Unnamed: 0,book_id,title,rating,url,book_cover,mod_tit
477424,77531,Franny and Zooey,3549,Goodreads,,franny and zooey
284354,7710230,Franny and Zooey,871,Goodreads,,franny and zooey
543301,884056,Franny and Zooey,356,Goodreads,,franny and zooey
929057,2417959,Franny and Zooey,90,Goodreads,,franny and zooey
937818,9793805,Franny and Zooey,83,Goodreads,,franny and zooey


## Building the recommendation system
In order to build the recomm system we need to have a list of books that we liked. Then find users of goodreads that also liked those books(ratings over 4). the next step is to find the liked books of readers with similar taste. This way it is possible to recommend similar books to our liked books

#### Creating list of liked books

In [55]:
search('rumi',vectorizer)

Unnamed: 0,book_id,title,rating,url,book_cover,mod_tit
244457,265540,Rumi: Poems,556,Goodreads,,rumi poems
1427808,67381,The Rumi Collection,385,Goodreads,,the rumi collection
1005530,20211434,The Essential Rumi,320,Goodreads,,the essential rumi
1167949,67373,The Essential Rumi,250,Goodreads,,the essential rumi
886000,25477771,The Essential Rumi,101,Goodreads,,the essential rumi


In [2]:
liked = ['265540','7967885','77531']

#### Mapping book-id in book json file with book-id in goodread interaction file

In [3]:
book_mapping = {}
with open('book-recommendation/book_id_map.csv','r') as f:
    while True:
        l = f.readline()
        if not l:
            break
        
        book_id_csv, book_id = l.strip().split(',')  #strip() remove any new line chars
        book_mapping[book_id_csv] = book_id

In [4]:
len(book_mapping)

2360651

#### Finding user with the same taste

In [5]:
users = set()
with open('book-recommendation/goodreads_interactions.csv','r') as f:
    while True:
        l = f.readline()
        if not l:
            break
            
        user_id,csv_id,_,rating,_ = l.split(',')
        if user_id in users:
            continue
        try:
            rating = int(rating)
        except ValueError:
            continue
            
        book_id = book_mapping[csv_id]
        if book_id in liked and rating >= 4:
            users.add(user_id)

In [6]:
rec_books = []
with open('book-recommendation/goodreads_interactions.csv','r') as f:
    while True:
        l = f.readline()
        if not l:
            break
        user_id,csv_id,_,rating,_ = l.split(',')
        if user_id in users:
            book_id = book_mapping[csv_id]
            rec_books.append([user_id,book_id,rating])

#### Convert recommended array into dataframe

In [7]:

recomms = pd.DataFrame(rec_books,columns =['user_id','book_id','rating'])
recomms['book_id'] = recomms['book_id'].astype(str)

In [40]:
recomms.to_csv('book-recommendation/recommended_books.csv')

#### Find top recommended books

In [8]:
top_rec = recomms['book_id'].value_counts().head(10)
top_rec = top_rec.index.values

#### Finding book titles related to recommended book ids

In [9]:
book_titles = pd.read_json('book-recommendation/book_titles.json')
book_titles['book_id'] = book_titles['book_id'].astype(str)
book_titles.head(5)

Unnamed: 0,book_id,title,rating,url,book_cover,mod_tit
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,"the unschooled wizard sun wolf and starhawk, 1-2)"
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls


#### Find the titles of recommended books

In [10]:
book_titles[book_titles['book_id'].isin(top_rec)]

Unnamed: 0,book_id,title,rating,url,book_cover,mod_tit
386663,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,"the hunger games the hunger games, 1)"
477424,77531,Franny and Zooey,3549,https://www.goodreads.com/book/show/77531.Fran...,https://images.gr-assets.com/books/1229566527m...,franny and zooey
546297,5107,The Catcher in the Rye,2086945,https://www.goodreads.com/book/show/5107.The_C...,https://images.gr-assets.com/books/1398034300m...,the catcher in the rye
630937,4671,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,https://images.gr-assets.com/books/1490528560m...,the great gatsby
838525,5470,1984,2023937,https://www.goodreads.com/book/show/5470.1984,https://images.gr-assets.com/books/1348990566m...,1984
1048745,7613,Animal Farm,1928931,https://www.goodreads.com/book/show/7613.Anima...,https://images.gr-assets.com/books/1424037542m...,animal farm
1077226,2657,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird
1196415,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...,harry potter and the sorcerers stone harry pot...
1229158,136251,Harry Potter and the Deathly Hallows (Harry Po...,1784684,https://www.goodreads.com/book/show/136251.Har...,https://images.gr-assets.com/books/1474171184m...,harry potter and the deathly hallows harry pot...
1354988,15881,Harry Potter and the Chamber of Secrets (Harry...,1821802,https://www.goodreads.com/book/show/15881.Harr...,https://images.gr-assets.com/books/1474169725m...,harry potter and the chamber of secrets harry ...


## Making the recommendatiom more personalised

In [19]:
all_rec = recomms['book_id'].value_counts()

In [20]:
all_rec

77531       305
4671        277
2657        245
5470        245
5107        231
           ... 
7179451       1
23260922      1
237704        1
20609217      1
531408        1
Name: book_id, Length: 220824, dtype: int64

In [21]:
all_rec

77531       305
4671        277
2657        245
5470        245
5107        231
           ... 
7179451       1
23260922      1
237704        1
20609217      1
531408        1
Name: book_id, Length: 220824, dtype: int64

In [22]:
all_rec =all_rec.to_frame()

In [26]:
 all_rec = all_rec.reset_index()
all_rec

Unnamed: 0,index,book_id
0,77531,305
1,4671,277
2,2657,245
3,5470,245
4,5107,231
...,...,...
220819,7179451,1
220820,23260922,1
220821,237704,1
220822,20609217,1


In [28]:
all_rec.columns = ['book_id','book_count']
all_rec

Unnamed: 0,book_id,book_count
0,77531,305
1,4671,277
2,2657,245
3,5470,245
4,5107,231
...,...,...
220819,7179451,1
220820,23260922,1
220821,237704,1
220822,20609217,1


#### Merging all recommendations with our book title dataframe

In [29]:
all_rec = all_rec.merge(book_titles, how='inner',on='book_id')

#### Define a score for each book based on counts and penalize the book if it is very popular(divide by the number of ratings)

In [30]:
all_rec['score'] = all_rec['book_count']*(all_rec['book_count']/all_rec['rating'])

In [31]:
all_rec.sort_values('score', ascending=False)

Unnamed: 0,book_id,book_count,title,rating,url,book_cover,mod_tit,score
0,77531,305,Franny and Zooey,3549,https://www.goodreads.com/book/show/77531.Fran...,https://images.gr-assets.com/books/1229566527m...,franny and zooey,26.211609
32,7967885,155,The Catcher in the Rye,1538,https://www.goodreads.com/book/show/7967885-th...,https://images.gr-assets.com/books/1327197188m...,the catcher in the rye,15.620936
238,265540,69,Rumi: Poems,556,https://www.goodreads.com/book/show/265540.Rumi,https://images.gr-assets.com/books/1320479183m...,rumi poems,8.562950
450,77530,48,"Raise High the Roof Beam, Carpenters and Seymo...",843,https://www.goodreads.com/book/show/77530.Rais...,https://s.gr-assets.com/assets/nophoto/book/11...,"raise high the roof beam, carpenters and seymo...",2.733096
15009,32191854,4,Vulgar Tongues: An Alternative History of Engl...,6,https://www.goodreads.com/book/show/32191854-v...,https://images.gr-assets.com/books/1477088826m...,vulgar tongues an alternative history of engli...,2.666667
...,...,...,...,...,...,...,...,...
210087,16140036,1,"This Man (This Man, #1)",80440,https://www.goodreads.com/book/show/16140036-t...,https://images.gr-assets.com/books/1352790466m...,"this man this man, 1)",0.000012
189690,20448515,1,"Bared to You (Crossfire, #1)",84614,https://www.goodreads.com/book/show/20448515-b...,https://images.gr-assets.com/books/1433411511m...,"bared to you crossfire, 1)",0.000012
74843,784911,1,"Rise of the Evening Star (Fablehaven, #2)",84861,https://www.goodreads.com/book/show/784911.Ris...,https://images.gr-assets.com/books/1386633982m...,"rise of the evening star fablehaven, 2)",0.000012
148726,411053,1,Better Homes and Gardens New Cook Book,97060,https://www.goodreads.com/book/show/411053.Bet...,https://s.gr-assets.com/assets/nophoto/book/11...,better homes and gardens new cook book,0.000010


In [32]:
popular_recs = all_rec[all_rec['book_count'] > 75].sort_values('score', ascending =False).head(10)

In [33]:
popular_recs

Unnamed: 0,book_id,book_count,title,rating,url,book_cover,mod_tit,score
0,77531,305,Franny and Zooey,3549,https://www.goodreads.com/book/show/77531.Fran...,https://images.gr-assets.com/books/1229566527m...,franny and zooey,26.211609
32,7967885,155,The Catcher in the Rye,1538,https://www.goodreads.com/book/show/7967885-th...,https://images.gr-assets.com/books/1327197188m...,the catcher in the rye,15.620936
160,77532,83,Nine Stories,3251,https://www.goodreads.com/book/show/77532.Nine...,https://images.gr-assets.com/books/1407030892m...,nine stories,2.11904
163,6759,82,Infinite Jest,47926,https://www.goodreads.com/book/show/6759.Infin...,https://images.gr-assets.com/books/1446876799m...,infinite jest,0.1403
171,18521,80,A Room of One's Own,61613,https://www.goodreads.com/book/show/18521.A_Ro...,https://images.gr-assets.com/books/1327883012m...,a room of ones own,0.103874
88,10975,106,The Sound and the Fury,126251,https://www.goodreads.com/book/show/10975.The_...,https://images.gr-assets.com/books/1433089995m...,the sound and the fury,0.088997
164,338798,82,Ulysses,78309,https://www.goodreads.com/book/show/338798.Uly...,https://images.gr-assets.com/books/1428891345m...,ulysses,0.085865
168,46164,81,Tender Is the Night,80994,https://www.goodreads.com/book/show/46164.Tend...,https://images.gr-assets.com/books/1438797669m...,tender is the night,0.081006
123,77013,92,As I Lay Dying,105220,https://www.goodreads.com/book/show/77013.As_I...,https://images.gr-assets.com/books/1451810782m...,as i lay dying,0.080441
124,4009,92,Nine Stories,105950,https://www.goodreads.com/book/show/4009.Nine_...,https://s.gr-assets.com/assets/nophoto/book/11...,nine stories,0.079887


In [37]:
def show_url(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    return '<img src="{}" width=50></img>'.format(val)
popular_recs[~popular_recs['book_id'].isin(liked)].style.format({'url':show_url,'book_cover':show_image})

Unnamed: 0,book_id,book_count,title,rating,url,book_cover,mod_tit,score
160,77532,83,Nine Stories,3251,Goodreads,,nine stories,2.11904
163,6759,82,Infinite Jest,47926,Goodreads,,infinite jest,0.1403
171,18521,80,A Room of One's Own,61613,Goodreads,,a room of ones own,0.103874
88,10975,106,The Sound and the Fury,126251,Goodreads,,the sound and the fury,0.088997
164,338798,82,Ulysses,78309,Goodreads,,ulysses,0.085865
168,46164,81,Tender Is the Night,80994,Goodreads,,tender is the night,0.081006
123,77013,92,As I Lay Dying,105220,Goodreads,,as i lay dying,0.080441
124,4009,92,Nine Stories,105950,Goodreads,,nine stories,0.079887
