In [2]:
import pandas as pd
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from scipy.sparse import csr_matrix

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

In [3]:
# Read both raw inputs from the local data/ folder:
# the book metadata table and the per-book review token strings.
metadata = pd.read_csv(filepath_or_buffer='data/metadata.csv')
reviews = pd.read_csv(filepath_or_buffer='data/gr_reviews_per_book.csv')

In [4]:
# Preview the first five metadata rows to see which columns are available.
metadata.head(n=5)

Unnamed: 0,isbn,average_rating,similar_books,description,link,num_pages,book_id,ratings_count,title,publication_year,name
0,312853122.0,4.0,[],,https://www.goodreads.com/book/show/5333265-w-...,256.0,5333265,3.0,W.C. Fields: A Life on Film,1984.0,Ronald J. Fields
1,743509986.0,3.23,['Hope Will Find You: My Search for the Wisdom...,"Anita Diamant's international bestseller ""The ...",https://www.goodreads.com/book/show/1333909.Go...,,1333909,10.0,Good Harbor,2001.0,Anita Diamant
2,,4.03,"['Through a Brazen Mirror', 'The Hound and the...",Omnibus book club edition containing the Ladie...,https://www.goodreads.com/book/show/7327624-th...,600.0,7327624,140.0,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",1987.0,Barbara Hambly
3,743294297.0,3.49,"['The Opposite of Me', 'Dune Road', 'A Summer ...",Addie Downs and Valerie Adler were eight when ...,https://www.goodreads.com/book/show/6066819-be...,368.0,6066819,51184.0,Best Friends Forever,2009.0,Jennifer Weiner
4,850308712.0,3.4,[],,https://www.goodreads.com/book/show/287140.Run...,,287140,15.0,Runic Astrology: Starcraft and Timekeeping in ...,,Nigel Pennick


In [6]:
# Trim metadata to the columns carried forward
# (drops isbn, similar_books, link, title and name).
metadata_subset = metadata.drop(
    labels=['isbn', 'similar_books', 'link', 'title', 'name'],
    axis='columns',
)

In [7]:
# Confirm the remaining columns after the drop.
metadata_subset.head(n=5)

Unnamed: 0,average_rating,description,num_pages,book_id,ratings_count,publication_year
0,4.0,,256.0,5333265,3.0,1984.0
1,3.23,"Anita Diamant's international bestseller ""The ...",,1333909,10.0,2001.0
2,4.03,Omnibus book club edition containing the Ladie...,600.0,7327624,140.0,1987.0
3,3.49,Addie Downs and Valerie Adler were eight when ...,368.0,6066819,51184.0,2009.0
4,3.4,,,287140,15.0,


In [13]:
# Count missing values per column of the trimmed metadata.
metadata_subset.isnull().sum()

average_rating         524
description         412249
num_pages           764133
book_id                  0
ratings_count          524
publication_year    599625
dtype: int64

In [8]:
# Preview the first five rows of the review data as loaded.
reviews.head(n=5)

Unnamed: 0.1,Unnamed: 0,book_id,string_tokens
0,0,1,one best book series think get better suspense...
1,1,2,first read book worst one harry potter series ...
2,2,3,remember trying time read always gave page ski...
3,3,5,one definitely good second one much happened r...
4,4,6,best harry potter book far followed closely bo...


In [9]:
# Remove the stray 'Unnamed: 0' column (it appears to be an index that was
# written into the CSV). Reassigning instead of using inplace=True, together
# with errors='ignore', keeps this cell safe to re-run: a second execution is
# a no-op rather than a KeyError.
reviews = reviews.drop(columns=['Unnamed: 0'], errors='ignore')

In [10]:
# Check that only book_id and string_tokens remain.
reviews.head(n=5)

Unnamed: 0,book_id,string_tokens
0,1,one best book series think get better suspense...
1,2,first read book worst one harry potter series ...
2,3,remember trying time read always gave page ski...
3,5,one definitely good second one much happened r...
4,6,best harry potter book far followed closely bo...


In [12]:
# Count missing values per column of the review data.
reviews.isnull().sum()

book_id          0
string_tokens    0
dtype: int64

## NLP Processing the Descriptions

In [11]:
# NOTE(review): disabled because it raised
# "AttributeError: 'float' object has no attribute 'lower'" -- rows with a
# missing description hold NaN (a float), which has no .lower().
# `metadata_subset['description'].str.lower()` would leave those rows as NaN
# instead of raising.
# # Lower casing
# metadata_subset['description']= metadata_subset['description'].apply(lambda x: x.lower())
# metadata_subset['description'][2]

AttributeError: 'float' object has no attribute 'lower'

In [None]:
# NOTE(review): this cell is disabled. Points to address before re-enabling it:
#   * `stopwords = stopwords.words("english")` rebinds the imported `stopwords`
#     corpus object to a list, so running the cell a second time would fail;
#     use a different name (e.g. `stop_words`).
#   * the lower-casing step inside `preprocessing` is itself commented out, so
#     tokens are compared with the stop-word list in their original casing.
#   * `description` contains missing values; `tokenizer.tokenize` presumably
#     cannot handle NaN -- TODO confirm and fill/skip those rows first.
# tokenizer = RegexpTokenizer(r"(?u)\w{3,}") # This pattern finds words that are at least 3 letters long
# stopwords = stopwords.words("english")
# lemmatizer = WordNetLemmatizer()

# def preprocessing(text, tokenizer, stopwords, lemmatizer):
#     # Make everything in the df["Text"] column into a lower-case string
#     #text = ["".join(item for item in lst).lower() for lst in text]

#     # Tokenize
#     tokens = tokenizer.tokenize(text)
    
#     # Remove stopwords
#     tokens = [token for token in tokens if token not in stopwords]
    
#     # Lemmatize
#     tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
#     return tokens

In [None]:
# NOTE(review): disabled together with the preprocessing cell. This is the only
# place `metadata_subset_nlp` is created, so any later use of that name needs
# this cell re-enabled first. Missing descriptions (NaN) would reach
# `preprocessing` through `.apply` -- TODO decide how they should be handled.
# # I'm saving off the nlp'd metadata as a separate df so I can use the non-nlp'd descriptions in the returned recommendations
# metadata_subset_nlp = metadata_subset.copy()
# metadata_subset_nlp['nlp_description'] = metadata_subset_nlp['description'].apply(lambda x: preprocessing(x, tokenizer, stopwords, lemmatizer))
# metadata_subset_nlp.head()

In [None]:
# Disabled: `metadata_subset_nlp` is only created in the commented-out cell
# above, so running this on a fresh kernel raises NameError. Re-enable it
# together with that cell.
# metadata_subset_nlp.head()

## Merge the two datasets

In [22]:
# Left join on book_id: every review row is kept and gains the metadata columns
# (left as NaN where no metadata value is available).
df = reviews.merge(metadata_subset, how='left', on='book_id')

In [15]:
# Preview the merged reviews + metadata frame.
df.head(n=5)

Unnamed: 0,book_id,string_tokens,average_rating,description,num_pages,ratings_count,publication_year
0,1,one best book series think get better suspense...,4.54,The war against Voldemort is not going well: e...,652.0,1713866.0,2006.0
1,2,first read book worst one harry potter series ...,4.47,Harry Potter is due to start his fifth year at...,870.0,1766895.0,2004.0
2,3,remember trying time read always gave page ski...,4.45,Harry Potter's life is miserable. His parents ...,320.0,4765497.0,1997.0
3,5,one definitely good second one much happened r...,4.53,Harry Potter's third year at Hogwarts is full ...,435.0,1876252.0,2004.0
4,6,best harry potter book far followed closely bo...,4.53,Harry Potter is midway through his training as...,734.0,1792561.0,2002.0


In [16]:
# Count missing values per column after the merge.
df.isnull().sum()

book_id                0
string_tokens          0
average_rating         1
description          305
num_pages           2002
ratings_count          1
publication_year    3084
dtype: int64

## Combining the Descriptions + Reviews NLP Text

In [None]:
# NOTE(review): disabled. As written it cannot run against the current `df`:
# the merge uses `metadata_subset`, which has no 'nlp_description' column, and
# `preprocessing` returns a list, so `str + ', ' + list` would raise TypeError.
# Joining the token list back into a string and concatenating the two columns
# directly would also avoid the row-by-row iterrows loop.
# for index, row in df.iterrows():
#     df.at[index, 'all_tokens'] = row['string_tokens'] + ', ' + row['nlp_description']

# df.head()

In [None]:
# NOTE(review): disabled along with the combined-token step. 'nlp_description'
# is not a column of the current `df`, so this drop would raise KeyError as written.
# df = df.drop(columns = ['string_tokens', 'description', 'nlp_description'])

## Vectorizing Reviews

In [29]:
# Split each whitespace-delimited token string into a list of tokens.
# The vectorised .str accessor replaces the per-row lambda and leaves a
# missing value as NaN instead of raising AttributeError.
df['list_tokens'] = df['string_tokens'].str.split()
df.head()

Unnamed: 0,book_id,string_tokens,average_rating,description,num_pages,ratings_count,publication_year,list_tokens
0,1,one best book series think get better suspense...,4.54,The war against Voldemort is not going well: e...,652.0,1713866.0,2006.0,"[one, best, book, series, think, get, better, ..."
1,2,first read book worst one harry potter series ...,4.47,Harry Potter is due to start his fifth year at...,870.0,1766895.0,2004.0,"[first, read, book, worst, one, harry, potter,..."
2,3,remember trying time read always gave page ski...,4.45,Harry Potter's life is miserable. His parents ...,320.0,4765497.0,1997.0,"[remember, trying, time, read, always, gave, p..."
3,5,one definitely good second one much happened r...,4.53,Harry Potter's third year at Hogwarts is full ...,435.0,1876252.0,2004.0,"[one, definitely, good, second, one, much, hap..."
4,6,best harry potter book far followed closely bo...,4.53,Harry Potter is midway through his training as...,734.0,1792561.0,2002.0,"[best, harry, potter, book, far, followed, clo..."


In [37]:
# Summarise column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25475 entries, 0 to 25474
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   book_id           25475 non-null  int64  
 1   string_tokens     25475 non-null  object 
 2   average_rating    25474 non-null  float64
 3   description       25170 non-null  object 
 4   num_pages         23473 non-null  float64
 5   ratings_count     25474 non-null  float64
 6   publication_year  22391 non-null  float64
 7   list_tokens       25475 non-null  object 
dtypes: float64(4), int64(1), object(3)
memory usage: 1.6+ MB


In [33]:
# Cap on vocabulary size: TfidfVectorizer keeps only the top terms ranked by
# term frequency across the corpus.
MAX_TFIDF_FEATURES = 100

tfidf = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
doc_term_matrix = tfidf.fit_transform(df['string_tokens'])

# Reuse df's index so the later index-based merge pairs rows by label rather
# than relying on both frames happening to share a default RangeIndex.
df_doc_term_matrix = pd.DataFrame.sparse.from_spmatrix(
    doc_term_matrix,
    index=df.index,
    columns=tfidf.get_feature_names_out(),
)

In [34]:
# Preview the TF-IDF weights (one column per vocabulary term).
df_doc_term_matrix.head(n=5)

Unnamed: 0,actually,also,always,another,author,back,bad,best,better,bit,...,want,wanted,way,well,whole,work,world,would,writing,year
0,0.043907,0.076664,0.06414,0.032151,0.004186,0.035596,0.023336,0.039199,0.034591,0.045042,...,0.032335,0.014827,0.064098,0.072012,0.045205,0.013542,0.047176,0.071711,0.022042,0.082924
1,0.041008,0.0937,0.060527,0.018772,0.009517,0.041439,0.023422,0.035759,0.046144,0.056163,...,0.0388,0.019949,0.076559,0.07362,0.041634,0.01415,0.053947,0.073806,0.024361,0.067344
2,0.036545,0.053299,0.054523,0.016347,0.014983,0.044315,0.01968,0.031475,0.037297,0.029757,...,0.032211,0.01981,0.054758,0.065961,0.037399,0.013968,0.123113,0.076544,0.041103,0.126902
3,0.034729,0.091613,0.076617,0.026325,0.005597,0.047452,0.027907,0.071474,0.052153,0.030464,...,0.032427,0.011519,0.061239,0.059839,0.044875,0.009951,0.067974,0.067215,0.027612,0.071605
4,0.030269,0.081345,0.058291,0.025084,0.011102,0.043056,0.028742,0.044107,0.042506,0.04532,...,0.033135,0.011424,0.066903,0.057447,0.045566,0.010389,0.101918,0.080373,0.030484,0.073607


In [18]:
# The raw token string has already been vectorised, so drop it before building
# the final frame. errors='ignore' makes the cell re-runnable: once the column
# is gone, a second execution is a no-op instead of a KeyError.
df = df.drop(columns=['string_tokens'], errors='ignore')

In [35]:
# Attach the TF-IDF term columns to the remaining book-level columns,
# pairing rows by index.
df_final = pd.merge(df_doc_term_matrix, df, left_index=True, right_index=True)
df_final.head(n=5)

Unnamed: 0,actually,also,always,another,author,back,bad,best,better,bit,...,writing,year,book_id,string_tokens,average_rating,description,num_pages,ratings_count,publication_year,list_tokens
0,0.043907,0.076664,0.06414,0.032151,0.004186,0.035596,0.023336,0.039199,0.034591,0.045042,...,0.022042,0.082924,1,one best book series think get better suspense...,4.54,The war against Voldemort is not going well: e...,652.0,1713866.0,2006.0,"[one, best, book, series, think, get, better, ..."
1,0.041008,0.0937,0.060527,0.018772,0.009517,0.041439,0.023422,0.035759,0.046144,0.056163,...,0.024361,0.067344,2,first read book worst one harry potter series ...,4.47,Harry Potter is due to start his fifth year at...,870.0,1766895.0,2004.0,"[first, read, book, worst, one, harry, potter,..."
2,0.036545,0.053299,0.054523,0.016347,0.014983,0.044315,0.01968,0.031475,0.037297,0.029757,...,0.041103,0.126902,3,remember trying time read always gave page ski...,4.45,Harry Potter's life is miserable. His parents ...,320.0,4765497.0,1997.0,"[remember, trying, time, read, always, gave, p..."
3,0.034729,0.091613,0.076617,0.026325,0.005597,0.047452,0.027907,0.071474,0.052153,0.030464,...,0.027612,0.071605,5,one definitely good second one much happened r...,4.53,Harry Potter's third year at Hogwarts is full ...,435.0,1876252.0,2004.0,"[one, definitely, good, second, one, much, hap..."
4,0.030269,0.081345,0.058291,0.025084,0.011102,0.043056,0.028742,0.044107,0.042506,0.04532,...,0.030484,0.073607,6,best harry potter book far followed closely bo...,4.53,Harry Potter is midway through his training as...,734.0,1792561.0,2002.0,"[best, harry, potter, book, far, followed, clo..."


In [38]:
# Frequency of each publication year, most common first.
df.loc[:, 'publication_year'].value_counts()

publication_year
2013.0    3209
2014.0    2833
2012.0    2612
2015.0    2606
2016.0    2054
          ... 
1966.0       1
1960.0       1
1965.0       1
1975.0       1
16.0         1
Name: count, Length: 70, dtype: int64

## Scaling Certain Columns

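Candidate numeric columns to scale: `average_rating`, `num_pages`, `ratings_count`, `publication_year`? **TODO:** decide which of these to scale. Note that `num_pages` and `publication_year` still contain missing values after the merge, and `publication_year` includes an implausible value (`16.0`) that should be cleaned before scaling.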