In [123]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import datetime
from dateutil.parser import parse
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from string import punctuation

## Data Cleaning

In [170]:
df = pd.read_csv('booksdata.csv')
df.shape

(31246, 18)

In [171]:
df = df.drop("web-scraper-order",axis=1)
df = df.drop("web-scraper-start-url",axis=1)
df = df.drop("bookreview_user",axis=1)
df = df.drop("books-href",axis=1)

df.bookreview_2.fillna(df.bookreview_1, inplace=True)
df = df.drop("bookreview_1",axis=1)

In [172]:
def remove_punctuations(item):
    for p in punctuation:
        item = item.strip().replace(p,'')
    return item

def remove_stopwords(s):
    return [w for w in s if not w in stop_words] 

stop_words = set(stopwords.words('english')) 

In [173]:
df.rename(columns = {'booktitle':'Title','bookauthor':'Author','booksummary':'Summary','booklength':'Length','books':'Book_ID','bookgenre':'Genre','bookavgrating':'Overall_Rating','bookratingcount':'Rating_Count','bookcover-src':'Cover_URL','publishdate':'Publish_Date','bookreviewcount':'Review_Count','bookreview_rating':'User_Rating','bookreview_2':'User_Review'}, inplace = True)

df['Author_ID'] = df['Author']
lb_make = LabelEncoder()
df['Author_ID'] = lb_make.fit_transform(df['Author_ID'])

df['Length'] = df['Length'].str.replace("pages","")
df[['Length']] = df[['Length']].astype(float)

today = datetime.date.today()
df['Publish_Date'] = df['Publish_Date'].apply(lambda x: x.split('\n')[1].replace('th','').replace('rd','').replace('nd','').strip())
df['Publish_Days'] = df['Publish_Date'].apply(lambda x: int((today - parse(x).date()).days ))

df['Rating_Count'] = df['Rating_Count'].apply(lambda x: int(x.split('\n')[0].strip().replace(',','')))

df['Review_Count'] = df['Review_Count'].apply(lambda x: int(x.split('\n')[0].strip().replace(',','')))

df['Genre'] = df['Genre'].apply(lambda x: x.replace('[{"bookgenre":','')) \
                    .apply(lambda x: x.replace('{"bookgenre":','')) \
                    .apply(lambda x: x.replace('}','')) \
                    .apply(lambda x: x.replace(']','')) \
                    .apply(lambda x: x.replace('"','')) \
                    .apply(lambda x: x.replace(',',' '))

df["Summary_Length"]= df["Summary"].str.len() 
df["Review_Length"]= df["User_Review"].str.len() 

df["Summary"] = df['Summary'].apply(remove_punctuations) \
                             .apply(lambda x: x.lower())
df['Summary_Tokens'] = df['Summary'].apply(word_tokenize).apply(set).apply(list)
df['Summary_Tokens'] = df['Summary_Tokens'].apply(remove_stopwords)

df[['User_Review']] = df[['User_Review']].astype(str)
df["User_Review"] = df['User_Review'].apply(remove_punctuations) \
                                     .apply(lambda x: x.lower())
df['Review_Tokens'] = df['User_Review'].apply(word_tokenize).apply(set).apply(list)
df['Review_Tokens'] = df['Review_Tokens'].apply(remove_stopwords)

df[['User_Rating']] = df[['User_Rating']].astype(str)
df['User_Rating'] = df['User_Rating'].apply(lambda x: x.replace('did not like it','1')) \
                    .apply(lambda x: x.replace('it was ok','2')) \
                    .apply(lambda x: x.replace('really liked it','4')) \
                    .apply(lambda x: x.replace('liked it','3')) \
                    .apply(lambda x: x.replace('it was amazing','5')) 

df = df[['Book_ID','Title','Author_ID','Author','Length','Publish_Date','Publish_Days','Cover_URL','Overall_Rating','Rating_Count','Review_Count','Genre','Summary','Summary_Length','Summary_Tokens','User_Rating','User_Review','Review_Length','Review_Tokens']]
df = df.sort_values(['Book_ID','Author_ID'])
df.head(5)

Unnamed: 0,Book_ID,Title,Author_ID,Author,Length,Publish_Date,Publish_Days,Cover_URL,Overall_Rating,Rating_Count,Review_Count,Genre,Summary,Summary_Length,Summary_Tokens,User_Rating,User_Review,Review_Length,Review_Tokens
385,1,The Guardians,42,John Grisham,375.0,October 15 2019,46,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",5,what a great legal thriller john grisham has l...,444.0,"[karma, john, american, bono, long, death, gri..."
531,1,The Guardians,42,John Grisham,375.0,October 15 2019,46,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",5,loved this book again i’m not disappointed one...,61.0,"[love, loved, one, ’, book, disappointed, bit]"
540,1,The Guardians,42,John Grisham,375.0,October 15 2019,46,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",5,a deftly written and thoughtprovoking book tha...,3093.0,"[15, review, immensely, michael, half, corners..."
546,1,The Guardians,42,John Grisham,375.0,October 15 2019,46,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",5,no stranger than the truth\n\nbased on the tru...,289.0,"[us, someone, called, story, guardians, appear..."
641,1,The Guardians,42,John Grisham,375.0,October 15 2019,46,https://i.gr-assets.com/images/S/compressed.ph...,4.27,13106,1182,Fiction Mystery Thriller Thriller Legal Thriller,in the small north florida town of seabrook a ...,1424,"[founded, languished, convictions, small, else...",3,the guardians • john grisham • started finish...,1811.0,"[grisham, goal, convictions, broken, normally,..."


In [176]:
df.to_csv('Clean_Reviews.csv')

## Task 1