In [219]:
# Libraries
import pandas as pd
import numpy as np
import csv
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

---

## Exploratory Data Analysis

**"liked_books.csv"**
- File containing our liked books

In [291]:
liked_books = pd.read_csv("liked_books.csv", index_col = 0)

## Data Cleaning
liked_books["book_id"] = liked_books["book_id"].astype(str)
liked_books["user_id"] = liked_books["user_id"].astype(str)

liked_books.head(5)

Unnamed: 0,book_id,title,mod_title,rating,user_id
35132,4030991,Shoe Dog,shoe dog phil knight,5,-1
117471,35239798,The Courage to be Disliked,the courage to be disliked ichiro kishimi fumi...,5,-1
134880,25899336,When Breath Becomes Air,when breath becomes air paul kalanithi abraham...,5,-1
176533,6856680,Outliers,outliers malcolm gladwell,5,-1
206194,8238259,Cloud Atlas,cloud atlas david mitchell,4,-1


**"book_id_map.csv"** : 
- File which maps book id's in "goodreads_interactions.csv" to the book id's in "goodreads_books.json"

In [3]:
!head book_id_map.csv

book_id_csv,book_id
0,34684622
1,34536488
2,34017076
3,71730
4,30422361
5,33503613
6,33517540
7,34467031
8,6383669


In [6]:
## Create a Dictionary which maps the book ID's
csv_book_mapping = {}

with open("book_id_map.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        
        csv_id, book_id = map(str.strip, line.split(","))
        
        
        csv_book_mapping[csv_id] = book_id

**"goodreads_interactions.csv"**
- File containing data on other users who liked the same books as us

In [7]:
!head goodreads_interactions.csv

user_id,book_id,is_read,rating,is_reviewed
0,948,1,5,0
0,947,1,5,1
0,946,1,5,0
0,945,1,5,0
0,944,1,5,0
0,943,1,5,0
0,942,1,5,0
0,941,1,5,0
0,940,1,5,0


## Find other users who liked the same books as us

In [16]:
#._ Create a dictionary which contains user ID's and how many liked books they share in common with our set of liked books

overlap_users = {}
liked_books_set = set(liked_books["book_id"])
liked_books_count = liked_books.shape[0]

with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.strip().split(',')
        try:
            rating = int(rating)
        except ValueError:
            continue
            
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in liked_books_set and rating >= 4:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

We will filter the overlapping users for the sake of improving the quality of recommendations and for saving computation.

In [57]:
#._ filter the dictionary to contain only users who share at least 20% of our liked books

filtered_overlap_users = {key: value for key, value in overlap_users.items() if value >= liked_books_count/5}
len(filtered_overlap_users)

209

## Find what other books those users liked

In [204]:
#._ Construct a list of all the books overlapping users also liked

rec_lines = []
filtered_overlap_users_set = set(filtered_overlap_users)

with open("goodreads_interactions.csv",'r') as f:
    reader = csv.reader(f)
    header = next(reader)
    
    for row in reader:
        user_id, csv_id, _, rating, _ = row
        
        if user_id in filtered_overlap_users_set:

            book_id = csv_book_mapping.get(csv_id)

            if book_id:
                rec_lines.append([user_id, book_id, rating])

In [205]:
#._ Construct a dataframe of the users books and ratings along with our liked books

recs = pd.DataFrame(rec_lines, columns = ["user_id", "book_id", "rating"])
recs = pd.concat([liked_books[["user_id", "book_id", "rating"]], recs])

## Variable coercion
recs["user_id"] = recs["user_id"].astype(str)
recs["book_id"] = recs["book_id"].astype(str)
recs["rating"] = recs["rating"].astype(int)

## Index creation for users and books
recs["user_index"] = recs["user_id"].astype("category").cat.codes
recs["book_index"] = recs["book_id"].astype("category").cat.codes

In [206]:
len(recs["user_index"]) * len(recs["book_index"])

85067305569

The above output is the amount of individual elements we would have to store in a dense matrix. For the sake of conserving as much of our RAM as possible, we will instead construct a sparse matrix.

## Construct a Sparse Matrix

In [207]:
ratings_mat_coo = coo_matrix((recs["rating"], (recs["user_index"], recs["book_index"])))
ratings_mat_coo

<210x122890 sparse matrix of type '<class 'numpy.int64'>'
	with 291663 stored elements in COOrdinate format>

In [208]:
ratings_mat = ratings_mat_coo.tocsr()
ratings_mat

<210x122890 sparse matrix of type '<class 'numpy.int64'>'
	with 291663 stored elements in Compressed Sparse Row format>

We converted our sparse matrix to csr format because the majority of utilities ran in this script are only compatible with sparse matrices in csr format

In [226]:
#._ Find the users who are most similar to us in terms of liked books

similarity = cosine_similarity(ratings_mat[0,:], ratings_mat).flatten()
similarity[0:100]

array([1.        , 0.04520878, 0.0231263 , 0.03227147, 0.06064665,
       0.02977542, 0.07078707, 0.02122546, 0.03851848, 0.04726033,
       0.03815264, 0.0546714 , 0.02647368, 0.06641602, 0.0277758 ,
       0.02822264, 0.05754852, 0.08054442, 0.03831864, 0.03697016,
       0.02140355, 0.03214207, 0.04201714, 0.08582603, 0.05167651,
       0.03385579, 0.06197533, 0.10986547, 0.03374985, 0.01453874,
       0.02681954, 0.01964493, 0.06703279, 0.08181598, 0.04166791,
       0.05478098, 0.06566191, 0.02328752, 0.06298765, 0.01751794,
       0.04870192, 0.01623584, 0.05555714, 0.06294617, 0.0189607 ,
       0.0433072 , 0.04729731, 0.0406229 , 0.05371389, 0.03780212,
       0.0445879 , 0.03016056, 0.0309742 , 0.07357517, 0.04678338,
       0.03385329, 0.03373383, 0.03764961, 0.05682389, 0.0690278 ,
       0.03174416, 0.04696181, 0.04874628, 0.01393375, 0.05217653,
       0.01827346, 0.06083418, 0.05021814, 0.07299048, 0.04066322,
       0.03376713, 0.0535619 , 0.02988941, 0.04352443, 0.04626

Here we calculated the similarity between our ratings and the other 209 users' ratings using a cosine similarity function

In [257]:
indices = np.argpartition(similarity, -15)[-15:]

similar_users = recs[recs["user_index"].isin(indices)]
similar_users = similar_users[similar_users["user_index"] != 0]

In [264]:
book_recs = similar_users.groupby("book_id").rating.agg(["count", "mean"])
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,3.50
1005,1,0.00
100629,1,0.00
10065,1,5.00
100915,4,4.25
...,...,...
98248,1,5.00
9923372,1,0.00
99300,1,2.00
99561,4,1.00


In [270]:
books_titles = pd.read_json("books_titles.json")

# Data Cleaning
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [283]:
book_recs = book_recs.merge(books_titles, how = "inner", on = "book_id")
book_recs

Unnamed: 0,book_id,count,mean,title,author,ratings,url,cover_image,mod_title,mod_author
0,1,4,3.50,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling Mary GrandPre,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...,jk rowling mary grandpre
1,1005,1,0.00,Think and Grow Rich,Napoleon Hill,87634,https://www.goodreads.com/book/show/1005.Think...,https://s.gr-assets.com/assets/nophoto/book/11...,think and grow rich napoleon hill,napoleon hill
2,100629,1,0.00,The Universe in a Single Atom: The Convergence...,Dalai Lama XIV,6310,https://www.goodreads.com/book/show/100629.The...,https://images.gr-assets.com/books/1320558690m...,the universe in a single atom the convergence ...,dalai lama xiv
3,10065,1,5.00,Wayside School Is Falling Down (Wayside School...,Louis Sachar Joel Schick,46885,https://www.goodreads.com/book/show/10065.Ways...,https://images.gr-assets.com/books/1405055796m...,wayside school is falling down wayside school ...,louis sachar joel schick
4,100915,4,4.25,"The Lion, the Witch, and the Wardrobe (Chronic...",C.S. Lewis,1575387,https://www.goodreads.com/book/show/100915.The...,https://images.gr-assets.com/books/1353029077m...,the lion the witch and the wardrobe chronicles...,cs lewis
...,...,...,...,...,...,...,...,...,...,...
1428,98248,1,5.00,The Non-Runner's Marathon Trainer,David A. Whitsett Forrest A. Dolgener Tanjala ...,1095,https://www.goodreads.com/book/show/98248.The_...,https://images.gr-assets.com/books/1296236131m...,the nonrunners marathon trainer david a whitse...,david a whitsett forrest a dolgener tanjala jo...
1429,9923372,1,0.00,Maine,J. Courtney Sullivan,24340,https://www.goodreads.com/book/show/9923372-maine,https://images.gr-assets.com/books/1295996313m...,maine j courtney sullivan,j courtney sullivan
1430,99300,1,2.00,The Yellow Wallpaper and Other Stories,Charlotte Perkins Gilman,65857,https://www.goodreads.com/book/show/99300.The_...,https://images.gr-assets.com/books/1327909237m...,the yellow wallpaper and other stories charlot...,charlotte perkins gilman
1431,99561,4,1.00,Looking for Alaska,John Green,804587,https://www.goodreads.com/book/show/99561.Look...,https://images.gr-assets.com/books/1394798630m...,looking for alaska john green,john green


In [288]:
#._ Ranking the book recommendations

book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

book_recs = book_recs[~book_recs["book_id"].isin(liked_books["book_id"])]



In [290]:
liked_books

Unnamed: 0,book_id,title,rating,user_id
35132,4030991,Shoe Dog,5,-1
117471,35239798,The Courage to be Disliked,5,-1
134880,25899336,When Breath Becomes Air,5,-1
176533,6856680,Outliers,5,-1
206194,8238259,Cloud Atlas,4,-1
345891,11987,The Myth of Sisyphus and Other Essays,4,-1
411052,75855,Utopia,3,-1
583667,18373,Flowers for Algernon,5,-1
893083,133518,The Things They Carried,5,-1
996572,275612,My Bloody Life: The Making of a Latin King,5,-1


In [125]:
all_recs = all_recs.merge(books_titles, how = "inner", on = "book_id")

In [126]:
# if a book is very popular in our set and less popular on goodreads, it has a higher score
all_recs["score"] = all_recs["book_count"] * (all_recs["book_count"] / all_recs["ratings"])

In [265]:
liked_books_titles = books_titles[books_titles["book_id"].isin(liked_books)]["mod_title"]
liked_books_titles

Series([], Name: mod_title, dtype: object)

Series([], Name: mod_title, dtype: object)