In [1]:
import pandas as pd

my_books = pd.read_csv("my_books.csv", index_col=0)
my_books["book_id"] = my_books["book_id"].astype(str)

In [2]:
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,13271378,5,Brother
1,-1,11047557,5,The Lord of the Rings
2,-1,85301,5,Unaccustomed Earth
3,-1,18144590,5,The Alchemist
4,-1,77203,5,The Kite Runner
5,-1,243714,5,Interpreter of Maladies
6,-1,893172,1,Percy Jackson and the Lightning Thief (Percy J...
7,-1,18918647,5,1984
8,-1,11468377,5,"Thinking, Fast and Slow"
9,-1,4677,3,The Great Gatsby


In [3]:
csv_book_mapping = {}
with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [4]:
# Each element in a set is unique
my_books_set = set(my_books["book_id"])

In [5]:
import time

overlap_users = {}

tick = time.perf_counter()
with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in my_books_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1
tock = time.perf_counter()

elapsed_time_s = int(tock - tick)
m, s = divmod(elapsed_time_s, 60)
print("elapsed time to find overlap users = " + str(m) + " minutes and " + str(s) + " seconds")

elapsed time to find overlap users = 4 minutes and 35 seconds


In [6]:
# Reduce the overlapped set by filtering out people who read less than 20% of the books the user
my_num_books = my_books.shape[0]
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_num_books / 5])

In [7]:
num_overlap_users = len(overlap_users)
num_filtered_overlap_users = len(filtered_overlap_users)
reduction = -100 * (num_filtered_overlap_users - num_overlap_users) / num_overlap_users

In [8]:
overlap_users_info = {"number of overlap users (at least one common read book)": [num_overlap_users],
                      "number of filtered overlap users (read more than 20% more books)": [num_filtered_overlap_users],
                      "reduction (%)": [reduction]}
overlap_users_info_df = pd.DataFrame.from_dict(overlap_users_info)
overlap_users_info_df

Unnamed: 0,number of overlap users (at least one common read book),number of filtered overlap users (read more than 20% more books),reduction (%)
0,178488,1342,99.248129


In [9]:
interactions_list = []

# Determine the book list of the people who read at least one of the books that the user read and has read more than 20% more books than the user
tick = time.perf_counter()
with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])
tock = time.perf_counter()

elapsed_time_s = int(tock - tick)
m, s = divmod(elapsed_time_s, 60)
print("elapsed time to find overlap and filtered users' read book list = " + str(m) + " minutes and " + str(s) + " seconds")

elapsed time to find overlap and filtered users' read book list = 3 minutes and 3 seconds


In [10]:
num_interactions = len(interactions_list)
print("number of people to collaborate with = " + str(num_filtered_overlap_users))
print("number of ratings to use = " + str(num_interactions))

number of people to collaborate with = 1342
number of ratings to use = 2839027


In [11]:
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])
# add user's read books to interactions
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [12]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,13271378,5
1,-1,11047557,5
2,-1,85301,5
3,-1,18144590,5
4,-1,77203,5
...,...,...,...
2839022,875017,14756,4
2839023,875017,5139,4
2839024,875017,5191,4
2839025,875017,85301,3


In [13]:
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)

interactions["rating"] = pd.to_numeric(interactions["rating"])

interactions["user_index"] = interactions["user_id"].astype("category").cat.codes
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [14]:
# Number of unique users (will be one more than num_filtered_overlap_users because added user)
num_unique_users = len(interactions.user_id.unique())
num_book_ratings = len(interactions.book_id.unique())
matrix_size = num_unique_users * num_book_ratings
print("matrix_size = num_users * num_book_ratings = " + str(num_unique_users) + " x " + str(num_book_ratings) + " = " + str(matrix_size))

matrix_size = num_users * num_book_ratings = 1343 x 570134 = 765689962


In [15]:
from scipy.sparse import coo_matrix

# Need to make matrix sparse to reduce computational overhead (dense to sparse matrix)
# Data, row, column
ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [16]:
ratings_mat_coo

<1343x570134 sparse matrix of type '<class 'numpy.int64'>'
	with 2839038 stored elements in COOrdinate format>

In [17]:
ratings_mat = ratings_mat_coo.tocsr()

In [18]:
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,13271378,5,0,71341
1,-1,11047557,5,0,22088
2,-1,85301,5,0,535954
3,-1,18144590,5,0,173010
4,-1,77203,5,0,511400
5,-1,243714,5,0,279484
6,-1,893172,1,0,546282
7,-1,18918647,5,0,197424
8,-1,11468377,5,0,31841
9,-1,4677,3,0,427066


In [19]:
my_index = 0

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [21]:
similarity[0]

1.0

In [22]:
import numpy as np

indices = np.argpartition(similarity, -15)[-15:]

In [23]:
indices

array([ 707, 1225, 1261,  709,  956, 1194, 1125, 1310, 1139, 1221,  681,
       1145, 1176, 1300,    0], dtype=int64)

In [24]:
similar_users = interactions[interactions["user_index"].isin(indices)].copy()

In [25]:
similar_users = similar_users[similar_users["user_id"] != "-1"]

In [26]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
2094031,326216,342251,0,681,391343
2094032,326216,10210,0,681,4288
2094033,326216,110824,0,681,22949
2094034,326216,227186,0,681,250520
2094035,326216,263862,0,681,315985
...,...,...,...,...,...
2839022,875017,14756,4,1310,99717
2839023,875017,5139,4,1310,436098
2839024,875017,5191,4,1310,437032
2839025,875017,85301,3,1310,535954


In [27]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])

In [28]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100286,1,0.000000
100322,1,0.000000
10032672,1,0.000000
10054335,1,0.000000
100915,1,0.000000
...,...,...
99561,3,2.333333
99664,1,0.000000
998133,1,2.000000
9994765,1,0.000000


In [29]:
books_titles = pd.read_json("book_info_df.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")

In [30]:
book_recs

Unnamed: 0,book_id,count,mean,title,num_ratings,url,cover_image,language_code,average_rating,publisher,num_pages,publication_year,mod_title
0,100286,1,0.000000,A Short Guide to a Happy Life,6217,https://www.goodreads.com/book/show/100286.A_S...,https://images.gr-assets.com/books/1320522593m...,,3.96,random house,50.0,2000.0,a short guide to a happy life
1,100322,1,0.000000,Assata: An Autobiography,11057,https://www.goodreads.com/book/show/100322.Assata,https://images.gr-assets.com/books/1328857268m...,eng,4.47,chicago review press,274.0,1999.0,assata an autobiography
2,10032672,1,0.000000,The Language of Flowers,142582,https://www.goodreads.com/book/show/10032672-t...,https://images.gr-assets.com/books/1333577674m...,eng,4.07,ballantine books,323.0,2011.0,the language of flowers
3,10054335,1,0.000000,Rules of Civility,78912,https://www.goodreads.com/book/show/10054335-r...,https://images.gr-assets.com/books/1311705045m...,eng,3.99,viking adult,335.0,2011.0,rules of civility
4,100915,1,0.000000,"The Lion, the Witch, and the Wardrobe (Chronic...",1575387,https://www.goodreads.com/book/show/100915.The...,https://images.gr-assets.com/books/1353029077m...,eng,4.19,harpercollins publishers,206.0,2005.0,the lion the witch and the wardrobe chronicles...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1674,99561,3,2.333333,Looking for Alaska,804587,https://www.goodreads.com/book/show/99561.Look...,https://images.gr-assets.com/books/1394798630m...,eng,4.08,speak,221.0,2006.0,looking for alaska
1675,99664,1,0.000000,The Painted Veil,24606,https://www.goodreads.com/book/show/99664.The_...,https://images.gr-assets.com/books/1320421719m...,eng,3.91,,246.0,,the painted veil
1676,998133,1,2.000000,The Gathering,14548,https://www.goodreads.com/book/show/998133.The...,https://s.gr-assets.com/assets/nophoto/book/11...,eng,3.04,grove press black cat,261.0,2007.0,the gathering
1677,9994765,1,0.000000,"My Soul to Take (African Immortals, #4)",634,https://www.goodreads.com/book/show/9994765-my...,https://s.gr-assets.com/assets/nophoto/book/11...,,4.22,,,,my soul to take african immortals 4


In [31]:
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["num_ratings"])
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]
my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()
my_books["mod_title"] = my_books["mod_title"].str.replace("\s+", " ", regex=True)
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]
book_recs = book_recs[book_recs["mean"] >= 4]
book_recs = book_recs[book_recs["count"]> 2]

In [32]:
top_recs = book_recs.sort_values("mean", ascending=False)

In [33]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,num_ratings,url,cover_image,language_code,average_rating,publisher,num_pages,publication_year,mod_title,adjusted_count,score
11,10235,3,4.666667,"Mountains Beyond Mountains: The Quest of Dr. Paul Farmer, A Man Who Would Cure the World",53936,Goodreads,,eng,4.21,,333.0,,mountains beyond mountains the quest of dr paul farmer a man who would cure the world,0.000167,0.000779
877,2657,5,4.4,To Kill a Mockingbird,3255518,Goodreads,,eng,4.26,harper perennial modern classics,324.0,2006.0,to kill a mockingbird,8e-06,3.4e-05
162,128029,10,4.2,A Thousand Splendid Suns,835172,Goodreads,,eng,4.34,,372.0,,a thousand splendid suns,0.00012,0.000503
434,1774836,3,4.0,The Palace of Illusions,17085,Goodreads,,eng,4.11,doubleday,360.0,2008.0,the palace of illusions,0.000527,0.002107
1226,4214,4,4.0,Life of Pi,1012471,Goodreads,,,3.88,seal books,460.0,2006.0,life of pi,1.6e-05,6.3e-05
1657,968,3,4.0,"The Da Vinci Code (Robert Langdon, #2)",1465770,Goodreads,,eng,3.79,anchor,481.0,2006.0,the da vinci code robert langdon 2,6e-06,2.5e-05


In [34]:
print(top_recs)

      book_id  count      mean  \
11      10235      3  4.666667   
877      2657      5  4.400000   
162    128029     10  4.200000   
434   1774836      3  4.000000   
1226     4214      4  4.000000   
1657      968      3  4.000000   

                                                  title  num_ratings  \
11    Mountains Beyond Mountains: The Quest of Dr. P...        53936   
877                               To Kill a Mockingbird      3255518   
162                            A Thousand Splendid Suns       835172   
434                             The Palace of Illusions        17085   
1226                                         Life of Pi      1012471   
1657             The Da Vinci Code (Robert Langdon, #2)      1465770   

                                                    url  \
11    https://www.goodreads.com/book/show/10235.Moun...   
877   https://www.goodreads.com/book/show/2657.To_Ki...   
162   https://www.goodreads.com/book/show/128029.A_T...   
434   https://www.go

In [35]:
print(type(top_recs))

<class 'pandas.core.frame.DataFrame'>


In [36]:
test_string = "13271378,5,Brother;11047557,5,The Lord of the Rings;85301,5,Unaccustomed Earth;18144590,5,The Alchemist"
print(test_string)

13271378,5,Brother;11047557,5,The Lord of the Rings;85301,5,Unaccustomed Earth;18144590,5,The Alchemist


In [37]:
test_string_list = []
p1 = 0
p2 = 0
for letter in test_string:
    p2 += 1
    if letter == ';':
        test_string_list.append(test_string[p1:p2 - 1].split(","))
        p1 = p2

In [38]:
for e in test_string_list:
    print(e)

['13271378', '5', 'Brother']
['11047557', '5', 'The Lord of the Rings']
['85301', '5', 'Unaccustomed Earth']


In [39]:
my_books = pd.DataFrame(test_string_list, columns=['book_id', 'rating', 'title'])
my_books["book_id"] = my_books["book_id"].astype(str)
my_books.insert(0, 'user_id', -1)
print(my_books)

   user_id   book_id rating                  title
0       -1  13271378      5                Brother
1       -1  11047557      5  The Lord of the Rings
2       -1     85301      5     Unaccustomed Earth
