In [193]:
# Libraries
import pandas as pd
import csv
from scipy.sparse import coo_matrix

---

## Exploratory Data Analysis

**"liked_books.csv"**
- File containing our liked books

In [2]:
liked_books = pd.read_csv("liked_books.csv", index_col = 0)

## Data Cleaning
liked_books["book_id"] = liked_books["book_id"].astype(str)
liked_books["user_id"] = liked_books["user_id"].astype(str)

liked_books.head(5)

Unnamed: 0,book_id,title,rating,user_id
35132,4030991,Shoe Dog,5,-1
117471,35239798,The Courage to be Disliked,5,-1
134880,25899336,When Breath Becomes Air,5,-1
176533,6856680,Outliers,5,-1
206194,8238259,Cloud Atlas,4,-1


**"book_id_map.csv"** : 
- File which maps book id's in "goodreads_interactions.csv" to the book id's in "goodreads_books.json"

In [3]:
!head book_id_map.csv

book_id_csv,book_id
0,34684622
1,34536488
2,34017076
3,71730
4,30422361
5,33503613
6,33517540
7,34467031
8,6383669


In [6]:
## Create a Dictionary which maps the book ID's
csv_book_mapping = {}

with open("book_id_map.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        
        csv_id, book_id = map(str.strip, line.split(","))
        
        
        csv_book_mapping[csv_id] = book_id

**"goodreads_interactions.csv"**
- File containing data on other users who liked the same books as us

In [7]:
!head goodreads_interactions.csv

user_id,book_id,is_read,rating,is_reviewed
0,948,1,5,0
0,947,1,5,1
0,946,1,5,0
0,945,1,5,0
0,944,1,5,0
0,943,1,5,0
0,942,1,5,0
0,941,1,5,0
0,940,1,5,0


## Find other users who liked the same books as us

In [16]:
#._ Create a dictionary which contains user ID's and how many liked books they share in common with our set of liked books

overlap_users = {}
liked_books_set = set(liked_books["book_id"])
liked_books_count = liked_books.shape[0]

with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.strip().split(',')
        try:
            rating = int(rating)
        except ValueError:
            continue
            
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in liked_books_set and rating >= 4:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

We will filter the overlapping users for the sake of improving the quality of recommendations and for saving computation.

In [57]:
#._ filter the dictionary to contain only users who share at least 20% of our liked books

filtered_overlap_users = {key: value for key, value in overlap_users.items() if value >= liked_books_count/5}
len(filtered_overlap_users)

209

## Find what other books those users liked

In [72]:
#._ Construct a list of all the books overlapping users also liked

rec_lines = []
filtered_overlap_users_set = set(filtered_overlap_users)

with open("goodreads_interactions.csv",'r') as f:
    reader = csv.reader(f)
    header = next(reader)
    
    for row in reader:
        user_id, csv_id, _, rating, _ = row
        
        if user_id in filtered_overlap_users_set:

            book_id = csv_book_mapping.get(csv_id)

            if book_id and book_id not in liked_books_set:
                rec_lines.append([user_id, book_id, rating])

In [191]:
#._ Construct a dataframe of the users books and ratings along with our liked books

recs = pd.DataFrame(rec_lines, columns = ["user_id", "book_id", "rating"])
recs = pd.concat([liked_books[["user_id", "book_id", "rating"]], recs])

## Variable coercion
recs["user_id"] = recs["user_id"].astype(str)
recs["book_id"] = recs["book_id"].astype(str)
recs["rating"] = recs["rating"].astype(int)

## Index creation for users and books
recs["user_index"] = recs["user_id"].astype("category").cat.codes
recs["book_index"] = recs["book_id"].astype("category").cat.codes

In [201]:
len(recs["user_index"]) * len(recs["book_index"])

84679836004

The above output is the amount of individual elements we would have to store in a dense matrix. For the sake of conserving as much of our RAM as possible, we will instead construct a sparse matrix.

## Construct a Sparse Matrix

In [202]:
ratings_mat_coo = coo_matrix((recs["rating"], (recs["user_index"], recs["book_index"])))
ratings_mat_coo

<210x122890 sparse matrix of type '<class 'numpy.int64'>'
	with 290998 stored elements in COOrdinate format>

In [203]:
ratings_mat = ratings_mat_coo.tocsr()
ratings_mat

<210x122890 sparse matrix of type '<class 'numpy.int64'>'
	with 290998 stored elements in Compressed Sparse Row format>

We converted our sparse matrix to csr format because the majority of utilities ran in this script are only compatible with sparse matrices in csr format

In [125]:
all_recs = all_recs.merge(books_titles, how = "inner", on = "book_id")

In [126]:
# if a book is very popular in our set and less popular on goodreads, it has a higher score
all_recs["score"] = all_recs["book_count"] * (all_recs["book_count"] / all_recs["ratings"])

In [127]:
liked_books_titles = books_titles[books_titles["book_id"].isin(liked_books)]["mod_title"]
liked_books_titles

118243       when breath becomes air
510423          flowers for algernon
1072710    pimp the story of my life
Name: mod_title, dtype: object

In [130]:
# remove other instances of books in our liked books list
all_recs = all_recs[~all_recs["mod_title"].isin(liked_books_titles)]

In [None]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_cover(val):
    return '<img src="{}" width = 50></img>'.format(val)