In [69]:
# Libraries
import pandas as pd
import csv

---

## Exploratory Data Analysis

**"liked_books.csv"**
- File containing our liked books

In [61]:
# Load our own liked books; the first CSV column is used as the row index.
liked_books = pd.read_csv("liked_books.csv", index_col=0)

# Data cleaning: store both ID columns as strings so they can be compared
# with the string IDs parsed from the other data files.
liked_books = liked_books.astype({"book_id": str, "user_id": str})

liked_books.head(5)

Unnamed: 0,book_id,title,rating,user_id
35132,4030991,Shoe Dog,5,-1
117471,35239798,The Courage to be Disliked,5,-1
134880,25899336,When Breath Becomes Air,5,-1
176533,6856680,Outliers,5,-1
206194,8238259,Cloud Atlas,4,-1


**"book_id_map.csv"** : 
- File that maps the book IDs used in "goodreads_interactions.csv" to the book IDs used in "goodreads_books.json"

In [15]:
!head book_id_map.csv

book_id_csv,book_id
0,34684622
1,34536488
2,34017076
3,71730
4,30422361
5,33503613
6,33517540
7,34467031
8,6383669


In [16]:
## Create a dictionary which maps the book IDs used in the interactions CSV
## (column "book_id_csv") to the Goodreads book IDs (column "book_id")
csv_book_mapping = {}

# newline="" is the mode recommended by the csv module for reading.
with open("book_id_map.csv", "r", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the "book_id_csv,book_id" header row

    for row in reader:
        # A blank line yields an empty row; skip it instead of failing
        # on the two-value unpacking below.
        if not row:
            continue

        csv_id, book_id = (field.strip() for field in row)
        csv_book_mapping[csv_id] = book_id

**"goodreads_interactions.csv"**
- File containing every user's interactions with books (read flag, rating, review flag); we use it to find other users who liked the same books as us

In [17]:
!head goodreads_interactions.csv

user_id,book_id,is_read,rating,is_reviewed
0,948,1,5,0
0,947,1,5,1
0,946,1,5,0
0,945,1,5,0
0,944,1,5,0
0,943,1,5,0
0,942,1,5,0
0,941,1,5,0
0,940,1,5,0


## Find other users who liked the same books as us

In [122]:
# Create a dictionary mapping each user ID to the number of our liked books
# that the user also rated 4 or higher

overlap_users = {}
liked_books_set = set(liked_books["book_id"])


with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Ignore blank lines rather than failing on the unpacking below.
            continue

        user_id, csv_id, _, rating, _ = line.split(',')
        try:
            rating = int(rating)
        except ValueError:
            # Non-numeric rating, e.g. the header row: skip the line.
            # Only ValueError is caught so that real errors (and Ctrl-C)
            # are not silently swallowed.
            continue

        # Translate the CSV book ID to the Goodreads book ID; None if unknown.
        book_id = csv_book_mapping.get(csv_id)

        if book_id in liked_books_set and rating >= 4:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

We keep only the users whose overlap with our liked books is large enough. This improves the quality of the recommendations and reduces the amount of data processed in the next step.

In [136]:
# Keep only the users who share at least 20% of our liked books.
# The result must hold user IDs (the dictionary keys), not the overlap counts,
# because it is later used for `user_id in filtered_overlap_users` lookups;
# a set also makes those lookups constant-time.
min_overlap = liked_books.shape[0] / 5
filtered_overlap_users = {
    user for user, shared_count in overlap_users.items() if shared_count >= min_overlap
}
len(filtered_overlap_users)

209

## Find what other books those users liked

In [141]:
# Collect [user_id, book_id, rating] for every interaction of an overlapping
# user with a book that is not already in our liked list
# (no rating threshold is applied here)

rec_lines = []

with open("goodreads_interactions.csv", 'r') as f:
    reader = csv.reader(f)
    header = next(reader)  # consume the column-name row

    for row in reader:
        user_id, csv_id, _, rating, _ = row

        # Only interactions from the filtered overlapping users are relevant.
        if user_id not in filtered_overlap_users:
            continue

        # Drop books with no known mapping and books we already like.
        book_id = csv_book_mapping.get(csv_id)
        if not book_id or book_id in liked_books_set:
            continue

        rec_lines.append([user_id, book_id, rating])

In [85]:
# Build a dataframe of the collected interactions and find which books
# appear the most

recs = pd.DataFrame(rec_lines, columns=["user_id", "book_id", "rating"])
recs["book_id"] = recs["book_id"].astype(str)

# Book IDs of the 10 most frequent books.
# `Index.values` (plural) returns the index labels as an array;
# `Index.value` does not exist and raises AttributeError.
top_recs = recs["book_id"].value_counts().head(10)
top_recs = top_recs.index.values

Unnamed: 0,user_id,book_id,rating
0,7,13536858,3
1,7,2767052,5
2,7,6148028,5
3,7,7260188,5
4,7,13227454,0
...,...,...,...
24232839,876023,16304,0
24232840,876023,485894,3
24232841,876023,7613,4
24232842,876023,17568801,0


In [91]:
# Load the book metadata and store book_id as a string, the same type used
# for the book IDs in the other dataframes.
books_titles = pd.read_json("books_titles.json").astype({"book_id": str})

In [79]:
# Show the metadata rows of the most frequently recommended books
books_titles.loc[books_titles["book_id"].isin(top_recs)]

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
284473,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
396828,7624,Lord of the Flies,1638289,https://www.goodreads.com/book/show/7624.Lord_...,https://images.gr-assets.com/books/1327869409m...,lord of the flies
401395,5107,The Catcher in the Rye,2086945,https://www.goodreads.com/book/show/5107.The_C...,https://images.gr-assets.com/books/1398034300m...,the catcher in the rye
463463,4671,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,https://images.gr-assets.com/books/1490528560m...,the great gatsby
510423,18373,Flowers for Algernon,320856,https://www.goodreads.com/book/show/18373.Flow...,https://images.gr-assets.com/books/1367141311m...,flowers for algernon
569831,5,Harry Potter and the Prisoner of Azkaban (Harr...,1876252,https://www.goodreads.com/book/show/5.Harry_Po...,https://images.gr-assets.com/books/1499277281m...,harry potter and the prisoner of azkaban harry...
615314,5470,1984,2023937,https://www.goodreads.com/book/show/5470.1984,https://images.gr-assets.com/books/1348990566m...,1984
770177,7613,Animal Farm,1928931,https://www.goodreads.com/book/show/7613.Anima...,https://images.gr-assets.com/books/1424037542m...,animal farm
790927,2657,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird
878545,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...,harry potter and the sorcerers stone harry pot...


The problem with the above list of recommended books is that it is very generic. These books are very popular and aren't necessarily tailored to our specific list of liked books.

In [123]:
# Frequency table with one row per recommended book:
# "book_id" and the number of times it appears in recs ("book_count")
all_recs = (
    recs["book_id"]
    .value_counts()
    .rename_axis("book_id")
    .reset_index(name="book_count")
)

In [124]:
# Display the frequency table of candidate books
all_recs

Unnamed: 0,book_id,book_count
0,18373,22055
1,2657,19049
2,3,17690
3,2767052,17527
4,5470,17332
...,...,...
1079466,1835714,1
1079467,1120892,1
1079468,946827,1
1079469,1537130,1


In [125]:
# Attach the book metadata to each candidate; the inner join drops
# candidates whose book_id has no row in books_titles.
all_recs = pd.merge(all_recs, books_titles, on="book_id", how="inner")

In [126]:
# Score = book_count * (book_count / ratings): a book that is frequent among
# the overlapping users but has fewer ratings overall on Goodreads scores higher.
all_recs["score"] = all_recs["book_count"].mul(
    all_recs["book_count"].div(all_recs["ratings"])
)

In [127]:
# Normalised titles ("mod_title") of the books we already like.
# Match against the book_id column, not the DataFrame itself: iterating a
# DataFrame yields its column labels, so no book ID would ever match.
liked_books_titles = books_titles[books_titles["book_id"].isin(liked_books["book_id"])]["mod_title"]
liked_books_titles

118243       when breath becomes air
510423          flowers for algernon
1072710    pimp the story of my life
Name: mod_title, dtype: object

In [130]:
# Drop candidates whose normalised title equals that of a book we already like
# (this removes other instances of the same book)
all_recs = all_recs.loc[~all_recs["mod_title"].isin(liked_books_titles)]

In [None]:
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens `val` in a new tab.

    `val` is inserted into the href attribute as-is (no escaping).
    """
    return f'<a target="_blank" href="{val}">Goodreads</a>'

def show_cover(val):
    """Return an HTML <img> tag, 50 wide, whose source is `val`.

    `val` is inserted into the src attribute as-is (no escaping).
    """
    return f'<img src="{val}" width = 50></img>'