In [1]:
import pandas as pd
import numpy as np


In [2]:
#Count how many lines our file contains
!type "C:\Users\priyanka.roychoudhur\Desktop\DS\BooksRecommendation\goodreads_interactions.csv" | find /c /v ""

228648343


In [3]:
#read a few (non-header) rows to look at how our data looks like:

test = pd.read_csv("goodreads_interactions.csv", nrows=4)

test

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
0,0,948,1,5,0
1,0,947,1,5,1
2,0,946,1,5,0
3,0,945,1,5,0


In [4]:
#to avoid overloading the memory by loading the entire dataset, we read it line by line

with open("goodreads_interactions.csv") as f: #default is to read the file "r"
    line = f.readline()
    
line #one line from the csv file

'user_id,book_id,is_read,rating,is_reviewed\n'

In [5]:
#I only need the user_id, book_id and rating from this dataset pertaining to users who have also read the same books as me

#Based on the search script
my_books = ["4408", "31147619", "29983711", "9401317", "9317691", "8153988", "20494944"]


In [6]:
#The books id in the goodreads list of books data and the interactions data are different so we need to map the ids across two datasets

!type "C:\Users\priyanka.roychoudhur\Desktop\DS\BooksRecommendation\book_id_map.csv" | find /c /v ""

2360651


In [7]:
!dir goodreads_interactions.csv /O:E

 Volume in drive C is OS
 Volume Serial Number is C084-E4B0

 Directory of C:\Users\priyanka.roychoudhur\desktop\DS\BooksRecommendation

12/12/2022  15:06     4,318,621,741 goodreads_interactions.csv
               1 File(s)  4,318,621,741 bytes
               0 Dir(s)  1,521,501,212,672 bytes free


In [8]:
#Look at what the mapping dataset looks like

mapping_csv = pd.read_csv("book_id_map.csv")
mapping_csv

Unnamed: 0,book_id_csv,book_id
0,0,34684622
1,1,34536488
2,2,34017076
3,3,71730
4,4,30422361
...,...,...
2360645,2360645,19517100
2360646,2360646,18597299
2360647,2360647,18584882
2360648,2360648,18518801


In [9]:
#Read in the mapping data
mapping_data = {}

with open("book_id_map.csv", "r") as f:  #default is to read the file "r"
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",") # if we don't use strip then there is white sace in the end "\n"
        mapping_data[csv_id] = book_id #assigning keys and values


In [10]:
len(mapping_data)

2360651

In [11]:
mapping_data["945"]

'45'

In [17]:
#First we would want to identify only those users who have something in common with us in terms of the books they have read. 
#We will create a list with only the above common users

common_users = [] 

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.strip().split(",") # _ indicate that part of a function result is being deliberately ignored
        
        try:
            rating = int(rating)
        except ValueError:
            continue
            
        book_id = mapping_data.get(csv_id) #use get as otherwise python throws a key error
        
        if book_id in my_books and rating >= 4:
            common_users.append(user_id)



In [18]:
#remove duplicates so we have unique users in our list

common_users_unique = list(set(common_users))
print(len(common_users))
print(len(common_users_unique))

2046
2029


In [19]:
#Now we have identified the user ids of the users who have read the same books as in our book list, 
# we can get the rest of the information 

interactions = []

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in common_users_unique:
            book_id = mapping_data.get(csv_id)
            interactions.append([user_id, book_id, rating])


In [22]:
#Create our dataframe which now has a more manageable size

df = pd.DataFrame(interactions, columns =["user_id", "book_id", "rating"])

df.shape

(1530257, 3)

In [25]:
df.head()

Unnamed: 0,user_id,book_id,rating
0,284,977284,3
1,284,890054,4
2,284,837153,3
3,284,1586480,4
4,284,41814,5


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1530257 entries, 0 to 1530256
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1530257 non-null  object
 1   book_id  1530257 non-null  object
 2   rating   1530257 non-null  object
dtypes: object(3)
memory usage: 35.0+ MB


In [27]:
#No missing values 
#Convert rating data to numeric

df["rating"] = df["rating"].astype(float)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1530257 entries, 0 to 1530256
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   user_id  1530257 non-null  object 
 1   book_id  1530257 non-null  object 
 2   rating   1530257 non-null  float64
dtypes: float64(1), object(2)
memory usage: 35.0+ MB


In [40]:
#Look at what are the books id that are most read by the users
top_books = df["book_id"].value_counts().head(20)

In [45]:
top_books

2767052     1092
29983711    1089
2657        1074
3           1048
4671        1028
18143977     977
38447        911
5470         907
77203        886
5107         886
22557272     882
15881        874
6148028      863
136251       856
7260188      849
2            827
1            817
19063        810
6            805
11870085     803
Name: book_id, dtype: int64

In [58]:
#Bring up the details of these top 10 books from our goodreads dataset

top_books_index = top_books.index.values # get the top books id


books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

books_titles = books_titles.drop("cover_image", axis = 1)

books_recs = books_titles[books_titles["book_id"].isin(top_books_index)]

In [62]:
books_recs = books_recs.sort_values("ratings", ascending=False).reset_index(drop = True)

books_recs


Unnamed: 0,book_id,title,ratings,url,publication_year,clean_title
0,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,2008.0,the hunger games the hunger games 1
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,1997.0,harry potter and the sorcerers stone harry pot...
2,2657,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,2006.0,to kill a mockingbird
3,4671,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,2004.0,the great gatsby
4,11870085,The Fault in Our Stars,2429317,https://www.goodreads.com/book/show/11870085-t...,2012.0,the fault in our stars
5,5107,The Catcher in the Rye,2086945,https://www.goodreads.com/book/show/5107.The_C...,2001.0,the catcher in the rye
6,5470,1984,2023937,https://www.goodreads.com/book/show/5470.1984,1950.0,1984
7,6148028,"Catching Fire (The Hunger Games, #2)",1854746,https://www.goodreads.com/book/show/6148028-ca...,2009.0,catching fire the hunger games 2
8,77203,The Kite Runner,1848782,https://www.goodreads.com/book/show/77203.The_...,2004.0,the kite runner
9,15881,Harry Potter and the Chamber of Secrets (Harry...,1821802,https://www.goodreads.com/book/show/15881.Harr...,1999.0,harry potter and the chamber of secrets harry ...


In [72]:
def make_clickable(val):
    # target _blank to open new window
    return '<a target="_blank" href="{}">{}</a>'.format(val, val)

#books_recs['url'] = books_recs['url'].apply(format_url)
books_recs.style.format({'url': make_clickable})

Unnamed: 0,book_id,title,ratings,url,publication_year,clean_title
0,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-the-hunger-games,2008.0,the hunger games the hunger games 1
1,3,"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",4765497,https://www.goodreads.com/book/show/3.Harry_Potter_and_the_Sorcerer_s_Stone,1997.0,harry potter and the sorcerers stone harry potter 1
2,2657,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Kill_a_Mockingbird,2006.0,to kill a mockingbird
3,4671,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_Great_Gatsby,2004.0,the great gatsby
4,11870085,The Fault in Our Stars,2429317,https://www.goodreads.com/book/show/11870085-the-fault-in-our-stars,2012.0,the fault in our stars
5,5107,The Catcher in the Rye,2086945,https://www.goodreads.com/book/show/5107.The_Catcher_in_the_Rye,2001.0,the catcher in the rye
6,5470,1984,2023937,https://www.goodreads.com/book/show/5470.1984,1950.0,1984
7,6148028,"Catching Fire (The Hunger Games, #2)",1854746,https://www.goodreads.com/book/show/6148028-catching-fire,2009.0,catching fire the hunger games 2
8,77203,The Kite Runner,1848782,https://www.goodreads.com/book/show/77203.The_Kite_Runner,2004.0,the kite runner
9,15881,"Harry Potter and the Chamber of Secrets (Harry Potter, #2)",1821802,https://www.goodreads.com/book/show/15881.Harry_Potter_and_the_Chamber_of_Secrets,1999.0,harry potter and the chamber of secrets harry potter 2
