# Data Preparation for GoodReads Dataset
## (Efficient) Loading, Cleaning, Preprocessing

### Setup

In [12]:
import ast
import gzip
import json
import os
import re
import sys

import numpy as np
import pandas as pd
from IPython.display import display
from tqdm.notebook import tqdm


Dataset directory

In [2]:
# Directory that holds the raw GoodReads input files opened by the reader cell below.
DIR = './data'

#### Load Data
Using pandas to read the JSON data (~using the already unzipped JSON file to improve performance~), with the intention of switching to Dask later.

**only using 30k rows of books and 30k reviews**

In [3]:
# File name (relative to DIR) of the genre JSON that `genre_reader` iterates over.
in_fn_genres = 'goodreads_book_genres_initial.json'

In [4]:
# File names (relative to DIR) of the book, review and author inputs.
# The book and review files are gzip-compressed JSON; the author file is plain JSON.
in_fn_books = 'goodreads_books_mystery_thriller_crime.json.gz'
in_fn_reviews = 'goodreads_reviews_mystery_thriller_crime.json.gz'
in_fn_authors = 'goodreads_book_authors.json'

In [5]:
# Size constants; presumably row limits for a reduced working set.
# NOTE(review): none of the cells visible in this notebook reference them - confirm they are still needed.
NBOOKS = 50_000
NREVIEWS = NBOOKS * 2
NAUTHORS = NBOOKS * 5

---
Read the JSON files in chunks and save them as CSV for easier tf.data processing.

In [6]:
# Directory that receives the cleaned CSV files (genres.csv, books.csv, reviews.csv) written below.
OUT_DIR = './data/cleaned/'

In [50]:
# Create lazy, chunked readers over the line-delimited JSON inputs.
# The dtypes are set here so that every chunk is already stored compactly
# when it is read, instead of being converted afterwards.
chunk_size = 500  # number of JSON lines per chunk

book_reader = pd.read_json(
    os.path.join(DIR, in_fn_books),
    lines=True,
    chunksize=chunk_size,
    dtype={
        "title": 'string',
        "description": 'string',
        "text_reviews_count": 'uint',
        'average_rating': 'float',
    },
)

genre_reader = pd.read_json(
    os.path.join(DIR, in_fn_genres),
    lines=True,
    chunksize=chunk_size,
    dtype={"book_id": 'uint32'},
)

# The mapping used to be {'author': 'name'}, i.e. column name and dtype were
# mixed up ('name' is not a dtype), so it never had any effect.
# NOTE(review): assumes the author records carry a `name` column - confirm
# against the author JSON. This reader is not consumed in the cells shown here.
authors_reader = pd.read_json(
    os.path.join(DIR, in_fn_authors),
    lines=True,
    chunksize=chunk_size,
    dtype={'name': 'string'},
)

review_reader = pd.read_json(
    os.path.join(DIR, in_fn_reviews),
    lines=True,
    chunksize=chunk_size,
    dtype={
        'book_id': 'uint32',
        'rating': 'uint8',
        'review_text': 'string',
    },
)

In [21]:
# Write the first genre chunk together with the CSV header.
#
# The chunk is pulled from the reader *before* the file is opened, so an
# exhausted reader raises without touching an existing genres.csv.
chunk = next(genre_reader).set_index("book_id")

# Expand the per-book `genres` dict into one column per genre key.
genre_columns = chunk['genres'].apply(pd.Series)
chunk = pd.concat([chunk.drop(['genres'], axis=1), genre_columns], axis=1)

# Replace the comma-separated genre labels by identifier-friendly column names.
chunk = chunk.rename(columns={
    "comics, graphic": "comics_graphic",
    "history, historical fiction, biography": "history_historical_fiction_biography",
    "fantasy, paranormal": "fantasy_paranormal",
    "mystery, thriller, crime": "mystery_thriller_crime",
})

# Make sure the output directory exists on a fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

# Mode 'w' (instead of 'a'): the header chunk always starts a fresh file, so
# re-running the notebook does not stack a second header and duplicate rows
# on top of an older genres.csv.
# NOTE(review): `line_terminator` was renamed to `lineterminator` in newer
# pandas releases; rename the keyword when upgrading pandas.
with open(os.path.join(OUT_DIR, "genres.csv"), 'w') as f:
    chunk.to_csv(f, header=True, line_terminator='\n')

In [None]:
# Append the remaining genre chunks (no header) to the genres.csv started above.
genres_csv_path = os.path.join(OUT_DIR, "genres.csv")

# Column order of the header that is already on disk. The genre columns of a
# chunk are derived from the dict keys found in *that* chunk, so without
# re-aligning them a chunk with a missing or differently ordered genre would
# be written under the wrong header columns.
header_columns = pd.read_csv(genres_csv_path, index_col="book_id", nrows=0).columns

with open(genres_csv_path, 'a') as f:
    no_lines = 2360656  # hard-coded line count of the genre file; only used for the progress bar
    for chunk in tqdm(genre_reader, total=no_lines/chunk_size):
        chunk = chunk.set_index("book_id")
        # Expand the per-book `genres` dict into one column per genre key.
        chunk = pd.concat([chunk.drop(['genres'], axis=1), chunk['genres'].apply(pd.Series)], axis=1)
        chunk = chunk.rename(columns={
            "comics, graphic": "comics_graphic",
            "history, historical fiction, biography": "history_historical_fiction_biography",
            "fantasy, paranormal": "fantasy_paranormal",
            "mystery, thriller, crime": "mystery_thriller_crime",
        })

        # A column that the header does not know cannot be appended to the CSV;
        # fail loudly instead of silently dropping or shifting data.
        unexpected = chunk.columns.difference(header_columns)
        if len(unexpected) > 0:
            raise ValueError(
                "genre chunk has columns missing from the genres.csv header: "
                + ", ".join(map(str, unexpected))
            )

        # Same columns, same order as the header; genres absent from this chunk become empty fields.
        chunk = chunk.reindex(columns=header_columns)
        chunk.to_csv(f, header=False, line_terminator='\n')

---

In [40]:
def prepare_book_df(df):
    """Reduce a raw book chunk to the columns that are exported to books.csv.

    The input frame is left untouched; a new frame is returned. (The previous
    version re-indexed and extended the caller's frame in place and then wrote
    through ``.loc[:, col]`` on a column slice of it.)

    Args:
        df: Chunk of book records with at least the columns ``book_id``,
            ``authors`` (a non-empty list of dicts that each carry an
            ``author_id``), ``title``, ``text_reviews_count``,
            ``average_rating`` and ``description``.

    Returns:
        DataFrame indexed by ``book_id`` with the columns ``title``,
        ``text_reviews_count``, ``average_rating``, ``description`` and
        ``author_id`` (in this order).

    Raises:
        IndexError: if a record has an empty ``authors`` list.
    """
    books = df.set_index("book_id")

    # Only the first entry of the author list is kept as the book's author.
    first_author_id = books['authors'].apply(lambda authors: authors[0]['author_id']).astype('int64')
    books = books.assign(author_id=first_author_id)

    # Explicit copy so that the column replacements below act on an independent frame.
    books = books[['title', 'text_reviews_count', 'average_rating', 'description', 'author_id']].copy()

    # Titles are stored as UTF-8 bytes, so `to_csv` writes them in their
    # ``b'...'`` form and the loading step further down has to undo that wrapper.
    books['title'] = books['title'].str.encode('utf-8')

    # Keep each record on a single CSV line.
    books['description'] = books['description'].replace(r'\n', ' ', regex=True)
    return books

In [41]:
# Write the first book chunk together with the CSV header.
#
# The chunk is pulled from the reader *before* the file is opened, so an
# exhausted reader raises without touching an existing books.csv.
chunk = prepare_book_df(next(book_reader))

# Make sure the output directory exists on a fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

# Mode 'w' (instead of 'a'): the header chunk always starts a fresh file, so
# re-running the notebook does not stack a second header and duplicate rows
# on top of an older books.csv.
with open(os.path.join(OUT_DIR, "books.csv"), 'w') as f:
    chunk.to_csv(f, header=True, line_terminator='\n')

In [42]:
# Append every remaining book chunk (without header) to books.csv.
no_lines = 219235  # hard-coded line count of the book file; only used for the progress bar
books_csv_path = os.path.join(OUT_DIR, "books.csv")
with open(books_csv_path, 'a') as f:
    book_chunks = tqdm(book_reader, total=no_lines/chunk_size)
    for chunk in book_chunks:
        chunk = prepare_book_df(chunk)
        chunk.to_csv(f, line_terminator='\n', header=False)

  0%|          | 0/2192.35 [00:00<?, ?it/s]

---

In [51]:
def prepare_review_df(df):
    """Reduce a raw review chunk to the columns that are exported to reviews.csv.

    The input frame is left untouched; a new frame is returned. (The previous
    version re-indexed the caller's frame and dropped its columns in place.)

    Args:
        df: Chunk of review records with at least the columns ``review_id``,
            ``review_text`` and the dropped columns ``date_added``,
            ``date_updated``, ``read_at``, ``started_at``, ``n_votes`` and
            ``n_comments``.

    Returns:
        DataFrame indexed by ``review_id`` without the date and vote/comment
        count columns, and with newlines in ``review_text`` replaced by spaces.

    Raises:
        KeyError: if ``review_id`` or one of the dropped columns is missing.
    """
    unused_columns = ['date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments']
    reviews = df.set_index("review_id").drop(columns=unused_columns)

    # Keep each record on a single CSV line.
    reviews['review_text'] = reviews['review_text'].replace(r'\n', ' ', regex=True)
    return reviews

In [52]:
# Write the first review chunk together with the CSV header.
#
# The chunk is pulled from the reader *before* the file is opened, so an
# exhausted reader raises without touching an existing reviews.csv.
chunk = prepare_review_df(next(review_reader))

# Make sure the output directory exists on a fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

# Mode 'w' (instead of 'a'): the header chunk always starts a fresh file, so
# re-running the notebook does not stack a second header and duplicate rows
# on top of an older reviews.csv.
with open(os.path.join(OUT_DIR, "reviews.csv"), 'w') as f:
    chunk.to_csv(f, header=True, line_terminator='\n')

In [None]:
# Append every remaining review chunk (without header) to reviews.csv.
no_lines = 1849236  # hard-coded line count of the review file; only used for the progress bar
reviews_csv_path = os.path.join(OUT_DIR, "reviews.csv")
with open(reviews_csv_path, 'a') as f:
    review_chunks = tqdm(review_reader, total=no_lines/chunk_size)
    for chunk in review_chunks:
        chunk = prepare_review_df(chunk)
        chunk.to_csv(f, line_terminator='\n', header=False)

- genres (= books) => 2.2 Mio
- authors => 830k
- books (mystery,crime,thriller) => 220k


---
Read the cleaned CSV files, merge them into one DataFrame and save the result as a pickle file.

In [None]:
# Directory for model artifacts (not referenced in the cells shown in this notebook).
MODEL_DIR = './models/'
# NOTE(review): this rebinds DIR from the raw-data directory ('./data') to the
# cleaned-CSV directory (same path as OUT_DIR). Re-running the JSON reader cell
# after this one would therefore look for the raw files in './data/cleaned/'.
DIR = './data/cleaned/'

In [None]:
# Names of the cleaned CSV files (relative to DIR) produced by the export cells above.
# csv_fn_genres is defined for completeness; the cells shown here only load books and reviews.
csv_fn_books = 'books.csv'
csv_fn_reviews = 'reviews.csv'
csv_fn_genres = 'genres.csv'

In [None]:
chunk_size = 500

# Column dtypes passed to read_csv when loading books.csv.
book_dtype = dict(
    title='string',
    description='string',
    text_reviews_count='uint',
    average_rating='float',
)

# Column dtypes passed to read_csv when loading reviews.csv.
review_dtype = dict(
    book_id='uint32',
    rating='uint8',
    review_text='string',
)

In [None]:
def _decode_bytes_repr(text):
    """Turn a title stored as ``b'...'`` / ``b"..."`` text back into the original string.

    The export step wrote the titles as UTF-8 *bytes*, so the CSV contains
    their Python literal form. Parsing that literal restores non-ASCII
    characters (stored as ``\\x..`` escapes) and escaped quotes. The previous
    ``str.strip("b'\\"")`` left those escapes in place and also removed any
    leading/trailing ``b`` or quote characters that belong to the title itself.

    Values that are not such a literal are returned unchanged.
    """
    looks_like_bytes_literal = (
        isinstance(text, str)
        and text[:2] in ("b'", 'b"')
        and text[-1:] == text[1:2]
    )
    if not looks_like_bytes_literal:
        return text
    try:
        return ast.literal_eval(text).decode('utf-8')
    except (ValueError, SyntaxError, UnicodeDecodeError):
        # Not a well-formed bytes literal after all: keep the raw text.
        return text


df_books = pd.read_csv(os.path.join(DIR, csv_fn_books), dtype=book_dtype, low_memory=True)
# Undo the bytes wrapper and keep the nullable string dtype requested in book_dtype.
df_books['title'] = df_books['title'].map(_decode_bytes_repr, na_action='ignore').astype('string')
df_books = df_books.set_index('book_id')
df_books.head()

Unnamed: 0_level_0,title,text_reviews_count,average_rating,description,author_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6066814,"Crowner Royal (Crowner John Mystery, #13)",15,3.93,"London, 1196. At the command of Richard the Li...",37778
33394837,The House of Memory (Pluto's Snitch #2),60,4.33,,242185
29074697,The Slaughtered Virgin of Zenopolis (Inspector...,23,3.49,"BATHS, BANKS AND ROMAN INSURRECTION Detective ...",15104629
1902202,"Dead in the Morning (Patrick Grant, #1)",8,3.3,"Gerald breezily introduced his wife, Helen, to...",190988
9671977,Aristotele e i misteri di Eleusi,3,3.54,"""I misteri di Eleusi"" e il quinto romanzo di A...",337108


In [None]:
# Inspect dtypes, non-null counts and memory footprint of the loaded books.
df_books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 219235 entries, 6066814 to 26168430
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   title               219235 non-null  string 
 1   text_reviews_count  219235 non-null  uint32 
 2   average_rating      219235 non-null  float64
 3   description         198488 non-null  string 
 4   author_id           219235 non-null  int64  
dtypes: float64(1), int64(1), string(2), uint32(1)
memory usage: 9.2 MB


In [None]:
# Load the cleaned reviews and index them by their review id.
reviews_csv_path = os.path.join(DIR, csv_fn_reviews)
df_reviews = pd.read_csv(reviews_csv_path, dtype=review_dtype, low_memory=True).set_index('review_id')
df_reviews.head()

Unnamed: 0_level_0,user_id,book_id,rating,review_text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5e212a62bced17b4dbe41150e5bb9037,8842281e1d1347389f2ab93d60773d4d,6392944,3,I haven't read a fun mystery book in a while a...
2ede853b14dc4583f96cf5d120af636f,8842281e1d1347389f2ab93d60773d4d,28684704,3,"A fun, fast paced science fiction thriller. I ..."
8e4d61801907e591018bdc3442a9cf2b,8842281e1d1347389f2ab93d60773d4d,32283133,0,http://www.telegraph.co.uk/culture/10...
022bb6daffa49adc27f6b20b6ebeb37d,8842281e1d1347389f2ab93d60773d4d,17860739,4,An amazing and unique creation: JJ Abrams and ...
0e317947e1fd341f573192111bb2921d,8842281e1d1347389f2ab93d60773d4d,8694005,3,The Name of the Rose is a thrilling Dan Brown-...


In [None]:
# Inspect dtypes, non-null counts and memory footprint of the loaded reviews.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 735000 entries, 5e212a62bced17b4dbe41150e5bb9037 to a91f52d910b3988d02d9c72d4731de3c
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      735000 non-null  object
 1   book_id      735000 non-null  uint32
 2   rating       735000 non-null  uint8 
 3   review_text  734832 non-null  string
dtypes: object(1), string(1), uint32(1), uint8(1)
memory usage: 20.3+ MB


TODO: Maybe the Data should be indexed/grouped by BookID, not ReviewID

In [None]:
# Attach the book metadata to every review: the review's `book_id` column is
# matched against the index of df_books (default inner join), and the
# review_id index of df_reviews is kept.
df_join = df_reviews.merge(df_books, left_on="book_id", right_index=True)
df_join.head()

Unnamed: 0_level_0,user_id,book_id,rating,review_text,title,text_reviews_count,average_rating,description,author_id
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5e212a62bced17b4dbe41150e5bb9037,8842281e1d1347389f2ab93d60773d4d,6392944,3,I haven't read a fun mystery book in a while a...,"The Murder on the Links (Hercule Poirot, #2)",42,3.8,"On a French golf course, a millionaire is foun...",123715
ee9cf5e49795718b9bb086c1cde23116,24d87e891f6f60ae101c1de158cbe672,6392944,4,"A fun, fast read! I just adore reading books w...","The Murder on the Links (Hercule Poirot, #2)",42,3.8,"On a French golf course, a millionaire is foun...",123715
b5a6af59482018d0814c373723080adb,d083ade0fb4502508b1f3ce59bf1c7db,6392944,4,"Full of twists and turns, way better than the ...","The Murder on the Links (Hercule Poirot, #2)",42,3.8,"On a French golf course, a millionaire is foun...",123715
2ede853b14dc4583f96cf5d120af636f,8842281e1d1347389f2ab93d60773d4d,28684704,3,"A fun, fast paced science fiction thriller. I ...",Dark Matter,1026,4.1,"""Are you happy with your life?"" Those are the ...",442240
31db9238ec11ca671a65a50643f952b7,01ec1a320ffded6b2dd47833f2c8e4fb,28684704,5,"Mind-bending and completely unique, take a cha...",Dark Matter,1026,4.1,"""Are you happy with your life?"" Those are the ...",442240


In [None]:
# Inspect dtypes and non-null counts of the joined review/book frame.
df_join.info()

<class 'pandas.core.frame.DataFrame'>
Index: 735000 entries, 5e212a62bced17b4dbe41150e5bb9037 to a91f52d910b3988d02d9c72d4731de3c
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             735000 non-null  object 
 1   book_id             735000 non-null  uint32 
 2   rating              735000 non-null  uint8  
 3   review_text         734832 non-null  string 
 4   title               735000 non-null  string 
 5   text_reviews_count  735000 non-null  uint32 
 6   average_rating      735000 non-null  float64
 7   description         703758 non-null  string 
 8   author_id           735000 non-null  int64  
dtypes: float64(1), int64(1), object(1), string(3), uint32(2), uint8(1)
memory usage: 45.6+ MB


In [None]:
# Eyeball 20 random joined rows (no random_state is passed, so the rows differ between runs).
df_join.sample(20)

Unnamed: 0_level_0,user_id,book_id,rating,review_text,title,text_reviews_count,average_rating,description,author_id
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
84b45c5862e66c41fde9f1922fd946f7,f10d3a1771caeffebde9fd239547f067,17899948,5,I didn't know anything about the storyline and...,Rebecca,4127,4.21,Last night I dreamt I went to Manderley again ...,2001717
5ba79c8f80f076d432d1eba5ea5b1453,11f5af0f11e1b73863c7a887ef0f2d37,70897,0,This is probably one of those have-to-be-there...,The Secret History,862,4.08,Truly deserving of the accolade Modern Classic...,8719
10e8c869bb59daf5d2f3fae4ab364d9c,c8c04f0d14b8a9c482b8d979a41f963d,37304,4,Davenport meets doctor Weather Karkinnen at a ...,"Winter Prey (Lucas Davenport, #5)",424,4.25,"It is winter in the remote, dark Wisconsin woo...",4610
c0a1b5b9e12ccac1d0e3ce7844f9dfc2,4a1a45c12198b9ec99c57efa91e957ce,17802724,4,I won this book in a first reads giveaway. I w...,The Husband's Secret,18009,3.92,At the heart of The Husband's Secret is a lett...,322069
49fda46bc527b4b2a0fa0342bf574881,ac74ae567651e7502fb4408fa0da9840,23168811,4,The author is becoming at ease with his main c...,Even the Dead (Quirke #7),161,3.77,"A suspicious death, a pregnant woman suddenly ...",116405
d9b941dfcb7ff9601aa54f1083b0e3d3,65dac923d96ed9e0e3667dd9850d16be,25812109,4,4.5 stars hell yes. everything but the roman...,The Female of the Species,2026,4.25,A contemporary YA novel that examines rape cul...,5351825
305a063bd1f0ce66b7ab3839971f6573,8f3e35dcd36b81c7161baa9c28d42b1d,13147906,1,"Well, it took me forever to listen to this. I ...",Gone Girl,636,4.03,"What are you thinking, Amy? The question I've ...",2383
8bf21fcb2c3122f1fb2b14545ffea48a,95e5bfa350481de0f8e073792a170219,215238,3,Certainly not his best book. author wrote a di...,Play Dead,804,3.74,Terrible secrets lead basketball star David Ba...,24689
5c01ae2e360087907e3d0a5a8698fb2d,c99afa5c1c37afb44d1f99aea34f6204,431,3,"Ok, just finished the book and I still not sur...",The New York Trilogy,1601,3.92,"Paul Auster's signature work, The New York Tri...",296961
5f843238746d321436891c3784e292ea,b0d0e10a1941aeb47ddbd465cbc9c11a,36007655,5,A new author for me and I have marked the firs...,Running out of Time,5,4.23,Restaurant manager Dodie O'Dell's themed food ...,6187623


before:

Index: 22726 entries, 0e317947e1fd341f573192111bb2921d to 921812c9edc173c6d12e000723b9e667

after:

Index: 735000 entries, 5e212a62bced17b4dbe41150e5bb9037 to a91f52d910b3988d02d9c72d4731de3c
(= number of reviews)

after dropNA():
703594 


In [None]:
# Non-null counts per column before the rows with missing values are dropped in the next cell.
df_join.info()

<class 'pandas.core.frame.DataFrame'>
Index: 735000 entries, 5e212a62bced17b4dbe41150e5bb9037 to a91f52d910b3988d02d9c72d4731de3c
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             735000 non-null  object 
 1   book_id             735000 non-null  uint32 
 2   rating              735000 non-null  uint8  
 3   review_text         734832 non-null  string 
 4   title               735000 non-null  string 
 5   text_reviews_count  735000 non-null  uint32 
 6   average_rating      735000 non-null  float64
 7   description         703758 non-null  string 
 8   author_id           735000 non-null  int64  
dtypes: float64(1), int64(1), object(1), string(3), uint32(2), uint8(1)
memory usage: 45.6+ MB


In [None]:
# Discard every joined row that has a missing value in any column.
df_join = df_join.dropna()

In [None]:
# Summary statistics of the numeric columns after the NA rows were dropped.
df_join.describe()

Unnamed: 0,book_id,rating,text_reviews_count,average_rating,author_id
count,703594.0,703594.0,703594.0,703594.0,703594.0
mean,13178350.0,3.672068,2735.980951,3.869114,2033713.0
std,10540830.0,1.193176,9487.515139,0.278711,3437501.0
min,164.0,0.0,0.0,0.0,14.0
25%,1744612.0,3.0,38.0,3.72,14473.0
50%,13129730.0,4.0,216.0,3.89,156327.0
75%,22323390.0,5.0,1148.0,4.05,3057577.0
max,36467170.0,5.0,78438.0,5.0,17330590.0


In [None]:
# Alias only: `data` refers to the same DataFrame object as df_join (no copy is made).
data = df_join

In [None]:
# Spot-check 5 random rows of the final frame (no random_state is passed, so the rows differ between runs).
data.sample(5)

Unnamed: 0_level_0,user_id,book_id,rating,review_text,title,text_reviews_count,average_rating,description,author_id
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
544b25ef2e5bca5ed23de6e2ce334633,d991d4797cca0fff1cb9a1d6638eb47a,18770398,4,At one point it seemed I would be reading the ...,Night Film,2497,3.78,"Brilliant, haunting, breathtakingly suspensefu...",2362
14f2a43d1c9379f4fe1f0f0ee7ce9beb,34da92c91d6b2a2caddf03b8b714acb9,12875355,2,As a huge Austen fan I have been wanting to re...,Death Comes to Pemberley,6916,3.21,The world is classic Jane Austen. The mystery ...,344522
1ce7b0f009b45367e206bdb7fca2ecc4,1c28c471aa84a1ca3a8882a637d7a76d,34032606,5,This latest entry in the Laurel McKay series h...,"Dying for a Diamond (Laurel McKay Mysteries, #6)",14,4.53,"Warm breezes, tropical seas, a handsome new hu...",4115315
07f6064388bcbc76c324f89bc54f896b,c8d8cc9c1058c0caef590bf32001256c,25079805,4,After XO I was a little hesitant to jump into ...,"Solitude Creek (Kathryn Dance, #4)",14,3.85,"Jeffery Deaver, ""the master of manipulation"" (...",1612
53dc886b45ef2cffe422ca9fe619cfb7,7543b5a00e6dac232dd2a1c0ada849b9,12079574,4,Great start to a new series! I was apprehensiv...,Stay At Home Dead (Stay At Home Dad Mystery #1),107,3.88,"When Deuce Winters, a stay-at-home dad in slee...",6429000


In [None]:
# Final check of dtypes and non-null counts before the frame is persisted.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 703594 entries, 5e212a62bced17b4dbe41150e5bb9037 to a91f52d910b3988d02d9c72d4731de3c
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             703594 non-null  object 
 1   book_id             703594 non-null  uint32 
 2   rating              703594 non-null  uint8  
 3   review_text         703594 non-null  string 
 4   title               703594 non-null  string 
 5   text_reviews_count  703594 non-null  uint32 
 6   average_rating      703594 non-null  float64
 7   description         703594 non-null  string 
 8   author_id           703594 non-null  int64  
dtypes: float64(1), int64(1), object(1), string(3), uint32(2), uint8(1)
memory usage: 43.6+ MB


In [None]:
# Persist the joined frame as a pickle inside DIR (which was rebound to './data/cleaned/' above).
data.to_pickle(os.path.join(DIR, "joined_df.pkl"))