In [123]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import datetime
from dateutil.parser import parse
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from string import punctuation

## Data Cleaning

In [213]:
df = pd.read_csv('booksdata.csv')
# df.shape

(31246, 18)

In [214]:
df = df.drop("web-scraper-order",axis=1)
df = df.drop("web-scraper-start-url",axis=1)
df = df.drop("bookreview_user",axis=1)
df = df.drop("books-href",axis=1)

df.bookreview_2.fillna(df.bookreview_1, inplace=True)
df = df.drop("bookreview_1",axis=1)

In [215]:
def remove_punctuations(item):
    for p in punctuation:
        item = item.strip().replace(p,'')
    return item

def remove_stopwords(s):
    return [w for w in s if not w in stop_words] 

stop_words = set(stopwords.words('english')) 

In [216]:
df.rename(columns = {'booktitle':'Title','bookauthor':'Author','booksummary':'Summary','booklength':'Length','books':'Book_ID','bookgenre':'Genre','bookavgrating':'Overall_Rating','bookratingcount':'Rating_Count','bookcover-src':'Cover_URL','publishdate':'Publish_Date','bookreviewcount':'Review_Count','bookreview_rating':'User_Rating','bookreview_2':'User_Review'}, inplace = True)

df['Author_ID'] = df['Author']
lb_make = LabelEncoder()
df['Author_ID'] = lb_make.fit_transform(df['Author_ID'])

df['Length'] = df['Length'].str.replace("pages","")
df[['Length']] = df[['Length']].astype(float)

today = datetime.date.today()
df['Publish_Date'] = df['Publish_Date'].apply(lambda x: x.split('\n')[1].replace('th','').replace('rd','').replace('nd','').strip())
df['Publish_Days'] = df['Publish_Date'].apply(lambda x: int((today - parse(x).date()).days ))

df['Rating_Count'] = df['Rating_Count'].apply(lambda x: int(x.split('\n')[0].strip().replace(',','')))

df['Review_Count'] = df['Review_Count'].apply(lambda x: int(x.split('\n')[0].strip().replace(',','')))

df['Genre'] = df['Genre'].apply(lambda x: x.replace('[{"bookgenre":','')) \
                    .apply(lambda x: x.replace('{"bookgenre":','')) \
                    .apply(lambda x: x.replace('}','')) \
                    .apply(lambda x: x.replace(']','')) \
                    .apply(lambda x: x.replace('"','')) \
                    .apply(lambda x: x.replace(',',' '))

df["Summary_Length"]= df["Summary"].str.len() 
df["Review_Length"]= df["User_Review"].str.len() 

df["Summary"] = df['Summary'].apply(remove_punctuations) \
                             .apply(lambda x: x.lower())
df['Summary'].replace({'donald': 'trump','american': 'america','woman': 'women'}, inplace=True, regex=True)
df['Summary_Tokens'] = df['Summary'].apply(word_tokenize).apply(set).apply(list)
df['Summary_Tokens'] = df['Summary_Tokens'].apply(remove_stopwords)

df[['User_Review']] = df[['User_Review']].astype(str)
df["User_Review"] = df['User_Review'].apply(remove_punctuations) \
                                     .apply(lambda x: x.lower())
df['User_Review'].replace({'donald': 'trump','characters': 'character','felt':'feel'},inplace=True, regex=True)
df['Review_Tokens'] = df['User_Review'].apply(word_tokenize).apply(set).apply(list)
df['Review_Tokens'] = df['Review_Tokens'].apply(remove_stopwords)

df[['User_Rating']] = df[['User_Rating']].astype(str)
df['User_Rating'] = df['User_Rating'].apply(lambda x: x.replace('did not like it','1')) \
                    .apply(lambda x: x.replace('it was ok','2')) \
                    .apply(lambda x: x.replace('really liked it','4')) \
                    .apply(lambda x: x.replace('liked it','3')) \
                    .apply(lambda x: x.replace('it was amazing','5')) 

df = df[['Book_ID','Title','Author_ID','Author','Length','Publish_Date','Publish_Days','Cover_URL','Overall_Rating','Rating_Count','Review_Count','Genre','Summary','Summary_Length','Summary_Tokens','User_Rating','User_Review','Review_Length','Review_Tokens']]
df = df.sort_values(['Book_ID','Author_ID'])
df.head(5)

Unnamed: 0,Book_ID,Title,Author_ID,Author,Length,Publish_Date,Publish_Days,Cover_URL,Overall_Rating,Rating_Count,Review_Count,Genre,Summary,Summary_Length,Summary_Tokens,User_Rating,User_Review,Review_Length,Review_Tokens
385,1,The Guardians,42,John Grisham,375.0,October 15 2019,47,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",5,what a great legal thriller john grisham has l...,444.0,"[karma, john, american, bono, long, death, gri..."
531,1,The Guardians,42,John Grisham,375.0,October 15 2019,47,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",5,loved this book again i’m not disappointed one...,61.0,"[love, loved, one, ’, book, disappointed, bit]"
540,1,The Guardians,42,John Grisham,375.0,October 15 2019,47,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",5,a deftly written and thoughtprovoking book tha...,3093.0,"[15, review, immensely, michael, half, corners..."
546,1,The Guardians,42,John Grisham,375.0,October 15 2019,47,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",5,no stranger than the truth\n\nbased on the tru...,289.0,"[us, someone, called, story, guardians, appear..."
641,1,The Guardians,42,John Grisham,375.0,October 15 2019,47,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",3,the guardians • john grisham • started finish...,1811.0,"[grisham, goal, convictions, broken, normally,..."


In [217]:
df.to_csv('Clean_Reviews.csv')

## Task 1

## Count word frequency - Fiction vs Non-Fiction

In [330]:
## Summary - Fiction
count = []
# Count the frequency of the words

for i in range(50):
    count += df[df["Book_ID"]< 51].drop_duplicates(subset="Book_ID")["Summary_Tokens"].iloc[i]
    
count = [x.lower() for x in count]
word_freq = nltk.FreqDist(count)

# 100 most frequent words
top_words = word_freq.most_common(100)
top_words

[('new', 31),
 ('one', 25),
 ('’', 21),
 ('times', 21),
 ('york', 21),
 ('time', 18),
 ('1', 18),
 ('world', 17),
 ('bestselling', 17),
 ('women', 16),
 ('never', 15),
 ('novel', 14),
 ('life', 13),
 ('young', 13),
 ('man', 13),
 ('story', 13),
 ('home', 13),
 ('family', 13),
 ('author', 13),
 ('murder', 12),
 ('two', 11),
 ('next', 11),
 ('first', 11),
 ('left', 10),
 ('ever', 10),
 ('begins', 10),
 ('back', 10),
 ('help', 10),
 ('find', 10),
 ('finds', 10),
 ('something', 10),
 ('another', 9),
 ('years', 9),
 ('town', 9),
 ('comes', 9),
 ('crime', 9),
 ('becomes', 9),
 ('three', 9),
 ('may', 9),
 ('could', 9),
 ('night', 8),
 ('even', 8),
 ('desperate', 8),
 ('death', 8),
 ('war', 8),
 ('house', 8),
 ('place', 8),
 ('like', 8),
 ('make', 8),
 ('lives', 8),
 ('series', 8),
 ('thriller', 8),
 ('former', 8),
 ('must', 8),
 ('day', 8),
 ('father', 8),
 ('outside', 7),
 ('behind', 7),
 ('soon', 7),
 ('police', 7),
 ('far', 7),
 ('become', 7),
 ('beautiful', 7),
 ('takes', 7),
 ('mother', 

In [331]:
## Summary - Fiction
count = []
# Count the frequency of the words

for i in range(50):
    count += df[df["Book_ID"]>50].drop_duplicates(subset="Book_ID")["Summary_Tokens"].iloc[i]
    
count = [x.lower() for x in count]
word_freq = nltk.FreqDist(count)

# 100 most frequent words
top_words = word_freq.most_common(100)
top_words

[('new', 25),
 ('america', 25),
 ('book', 24),
 ('’', 24),
 ('first', 23),
 ('life', 22),
 ('one', 22),
 ('president', 21),
 ('us', 20),
 ('world', 20),
 ('time', 19),
 ('story', 19),
 ('trump', 19),
 ('history', 19),
 ('personal', 15),
 ('people', 14),
 ('like', 14),
 ('york', 13),
 ('times', 13),
 ('way', 13),
 ('political', 13),
 ('2016', 12),
 ('americas', 12),
 ('author', 12),
 ('”', 12),
 ('many', 12),
 ('“', 12),
 ('never', 12),
 ('years', 12),
 ('account', 11),
 ('comes', 11),
 ('shows', 11),
 ('election', 11),
 ('work', 11),
 ('also', 11),
 ('public', 11),
 ('even', 11),
 ('1', 11),
 ('tells', 10),
 ('women', 10),
 ('united', 10),
 ('including', 10),
 ('states', 10),
 ('politics', 10),
 ('would', 10),
 ('war', 10),
 ('country', 10),
 ('written', 10),
 ('presidency', 10),
 ('deeply', 9),
 ('administration', 9),
 ('back', 9),
 ('make', 9),
 ('past', 9),
 ('star', 9),
 ('government', 9),
 ('memoir', 9),
 ('take', 9),
 ('campaign', 9),
 ('presidential', 9),
 ('obama', 9),
 ('human

In [346]:
df_test = df[df["Book_ID"]<51]
## Review - Fiction
count = []

# Count the frequency of the words
for i in range(len(df_test)):
    count += df_test['Review_Tokens'].iloc[i]

count = [x.lower() for x in count]
word_freq = nltk.FreqDist(count)

# 100 most frequent words
top_words = word_freq.most_common(100)
top_words

[('book', 9928),
 ('one', 7764),
 ('read', 7676),
 ('story', 6569),
 ('character', 6450),
 ('like', 5537),
 ('books', 4716),
 ('’', 4547),
 ('time', 4195),
 ('really', 4045),
 ('good', 4026),
 ('series', 3903),
 ('much', 3901),
 ('would', 3663),
 ('first', 3539),
 ('love', 3536),
 ('get', 3376),
 ('reading', 3339),
 ('also', 3290),
 ('feel', 3224),
 ('well', 3223),
 ('even', 3200),
 ('way', 3139),
 ('know', 3029),
 ('great', 2991),
 ('novel', 2962),
 ('many', 2854),
 ('end', 2847),
 ('another', 2746),
 ('new', 2735),
 ('think', 2703),
 ('still', 2688),
 ('people', 2599),
 ('could', 2581),
 ('plot', 2569),
 ('always', 2557),
 ('life', 2532),
 ('see', 2454),
 ('two', 2425),
 ('back', 2422),
 ('going', 2416),
 ('little', 2396),
 ('never', 2331),
 ('stars', 2257),
 ('writing', 2235),
 ('make', 2205),
 ('author', 2171),
 ('loved', 2167),
 ('next', 2163),
 ('enjoyed', 2154),
 ('find', 2072),
 ('say', 2054),
 ('things', 1999),
 ('bit', 1991),
 ('last', 1972),
 ('found', 1955),
 ('interesting'

In [347]:
df_test = df[df["Book_ID"]>50]
## Review - Fiction
count = []

# Count the frequency of the words
for i in range(len(df_test)):
    count += df_test['Review_Tokens'].iloc[i]

count = [x.lower() for x in count]
word_freq = nltk.FreqDist(count)

# 100 most frequent words
top_words = word_freq.most_common(100)
top_words

[('book', 10459),
 ('read', 6773),
 ('one', 4878),
 ('like', 4718),
 ('would', 3888),
 ('much', 3806),
 ('’', 3786),
 ('people', 3734),
 ('also', 3384),
 ('many', 3362),
 ('time', 3354),
 ('really', 3197),
 ('reading', 3163),
 ('well', 3160),
 ('good', 3135),
 ('even', 3096),
 ('life', 3093),
 ('trump', 2884),
 ('us', 2840),
 ('think', 2840),
 ('way', 2840),
 ('great', 2732),
 ('know', 2730),
 ('get', 2614),
 ('first', 2542),
 ('history', 2482),
 ('feel', 2417),
 ('could', 2412),
 ('interesting', 2333),
 ('story', 2318),
 ('made', 2310),
 ('president', 2216),
 ('make', 2158),
 ('years', 2138),
 ('things', 2137),
 ('books', 2136),
 ('still', 2131),
 ('never', 2101),
 ('lot', 2097),
 ('political', 2036),
 ('world', 2008),
 ('new', 1946),
 ('see', 1868),
 ('work', 1851),
 ('love', 1835),
 ('better', 1823),
 ('say', 1801),
 ('times', 1752),
 ('found', 1752),
 ('dont', 1751),
 ('written', 1737),
 ('american', 1709),
 ('”', 1708),
 ('“', 1699),
 ('little', 1698),
 ('always', 1677),
 ('though

In [None]:
top_fiction_summary_words_count = [('world',5950),('life',5572),('story',5193),('america',4417),('women',4300),('bestselling',4216),('author',4140),('like',3764),('personal',3554),('history',3361),('president',3350),('people',3263),('family',3071),('trump',2896),('war',2827),('man',2781),('office',2326),('news',2097),('country',2090),('2016',1992),('love',1998),('series',1867),('mother',1853),('political',2056)]
top_nonfiction_summary_words_count = [('world',5561),('life',5219),('story',4805),('america',3993),('women',4013),('bestselling',3759),('author',3744),('like',3343),('personal',3159),('history',3031),('president',2983),('people',3022),('family',2893),('trump',2621),('war',2654),('man',2492),('office',2098),('news',1946),('country',1822),('2016',1787),('love',1890),('series',1722),('mother',1706),('political',1738)]
top_fiction_review_words_count = [('story',4618),('like',5273),('time',3916),('people',3195),('life',2960),('love',2777),('character',3760),('thought',1653),('interesting',2202),('series',2162),('world',2053),('work',1971),('author',1836),('want',1790),('best',1838),('history',1699),('man',1641),('family',1597),('trump',1571),('plot',1371),('great',2981),('long',1704),('end',2258),('enjoyed',1827),('feel',2902)]
top_nonfiction_review_words_count = [('story',4269),('like',4982),('time',3633),('people',3138),('life',2665),('love',2594),('character',3450),('thought',1605),('interesting',2069),('series',2130),('world',1763),('work',1709),('author',1741),('want',1614),('best',1678),('history',1564),('man',1500),('family',1500),('trump',1414),('plot',1250),('great',2742),('long',1581),('end',2111),('enjoyed',1696),('feel',2739)]

In [None]:
# Lift ratios for Top words in Fiction book summary
top_fiction_summary_words =[]
for word, count in top_fiction_summary_words_count:
    top_fiction_summary_words.append(word) 

word_df = pd.DataFrame(columns = top_fiction_summary_words)

def word_count(item):
    if checker in item:
        return 1
    else:
        return 0

for word in top_fiction_summary_words:
    checker = word
    word_df[word] = df[df["Book_ID"]<51]['Summary'].drop_duplicates().apply(word_count)       

lift_df = pd.DataFrame(columns = top_fiction_summary_words)

for i in range(len(top_fiction_summary_words)):
    new_list = []
    for j in range(len(top_fiction_summary_words)):
        if (i!=j):
            num = ((word_df[top_fiction_summary_words[i]] + word_df[top_fiction_summary_words[j]]) > 1).sum()
            dem = word_df[top_fiction_summary_words[j]].sum()*word_df[top_fiction_summary_words[i]].sum()
            lift = num * len(word_df) / dem
            lift_df.loc[top_fiction_summary_words[i],top_fiction_summary_words[j]] = lift

print ('Lift Matrix:\n')
lift_df

In [None]:
# Lift ratios for Top words in Fiction book reviews
top_fiction_review_words =[]
for word, count in top_fiction_review_words_count:
    top_fiction_review_words.append(word) 

word_df = pd.DataFrame(columns = top_fiction_review_words)

def word_count(item):
    if word in item:
        return 1
    else:
        return 0

for i in range(len(df["User_Review"])):
    if df.loc[i,"Book_ID"] < 51: 
        for word in top_fiction_review_words:
            word_df[word] = df['User_Review'].apply(word_count)        

lift_df = pd.DataFrame(columns = top_fiction_review_words)

for i in range(len(top_fiction_review_words)):
    new_list = []
    for j in range(len(top_fiction_review_words)):
        if (i!=j):
            num = ((word_df[top_fiction_review_words[i]] + word_df[top_fiction_review_words[j]]) > 1).sum()
            dem = word_df[top_fiction_review_words[j]].sum()*word_df[top_fiction_review_words[i]].sum()
            lift = num * len(word_df) / dem
            lift_df.loc[top_fiction_review_words[i],top_fiction_review_words[j]] = lift

print ('Lift Matrix:\n')
lift_df

In [None]:
# Lift ratios for Top words in Non-Fiction book summary
top_nonfiction_summary_words =[]
for word, count in top_nonfiction_summary_words_count:
    top_nonfiction_summary_words.append(word) 

word_df = pd.DataFrame(columns = top_nonfiction_summary_words)

def word_count(item):
    if word in item:
        return 1
    else:
        return 0

for i in range(len(df["Summary"])):
    if df.loc[i,"Book_ID"] > 50: 
        for word in top_nonfiction_summary_words:
            word_df[word] = df['Summary'].apply(word_count)        

lift_df = pd.DataFrame(columns = top_nonfiction_summary_words)

for i in range(len(top_nonfiction_summary_words)):
    new_list = []
    for j in range(len(top_nonfiction_summary_words)):
        if (i!=j):
            num = ((word_df[top_nonfiction_summary_words[i]] + word_df[top_nonfiction_summary_words[j]]) > 1).sum()
            dem = word_df[top_nonfiction_summary_words[j]].sum()*word_df[top_nonfiction_summary_words[i]].sum()
            lift = num * len(word_df) / dem
            lift_df.loc[top_nonfiction_summary_words[i],top_nonfiction_summary_words[j]] = lift

print ('Lift Matrix:\n')
lift_df

In [None]:
# Lift ratios for Top words in Non-Fiction book reviews
top_nonfiction_review_words =[]
for word, count in top_nonfiction_review_words_count:
    top_nonfiction_review_words.append(word) 

word_df = pd.DataFrame(columns = top_nonfiction_review_words)

def word_count(item):
    if word in item:
        return 1
    else:
        return 0

for i in range(len(df["User_Review"])):
    if df.loc[i,"Book_ID"] > 50: 
        for word in top_nonfiction_review_words:
            word_df[word] = df['User_Review'].apply(word_count)        

lift_df = pd.DataFrame(columns = top_nonfiction_review_words)

for i in range(len(top_nonfiction_review_words)):
    new_list = []
    for j in range(len(top_nonfiction_review_words)):
        if (i!=j):
            num = ((word_df[top_nonfiction_review_words[i]] + word_df[top_nonfiction_review_words[j]]) > 1).sum()
            dem = word_df[top_nonfiction_review_words[j]].sum()*word_df[top_nonfiction_review_words[i]].sum()
            lift = num * len(word_df) / dem
            lift_df.loc[top_nonfiction_review_words[i],top_nonfiction_review_words[j]] = lift

print ('Lift Matrix:\n')
lift_df

In [240]:
df["Book_ID"].value_counts()

98    349
55    329
85    329
36    329
52    329
     ... 
96    177
65    176
97    169
64    158
89     69
Name: Book_ID, Length: 100, dtype: int64