In [1]:
#****************************************************************************************#
# Program name      : Sentiment Analysis for GoodRead Books                              #
# Project           : Online Book Review System                                          #
# Description       : IDMP_Project                                                       #
# Produced by       : APOORVA GUPTA AND PRIYA GARG                                       #
# Date              : 12/11/2021                                                         #
#****************************************************************************************#

## Importing Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd 
import numpy as np

import re
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag

from nltk.corpus import stopwords
# nltk.download('wordnet')
from nltk.corpus import wordnet

from nltk.stem import WordNetLemmatizer

from textblob import TextBlob

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.metrics import confusion_matrix, accuracy_score

import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix,classification_report

## Data 

### Getting Review Data for top 200 most popular books

In [3]:
# Load the 200 most popular books and the raw review dump.
books_df = pd.read_csv('../data/top_200_books.csv')
reviews_df = pd.read_csv("../data/review_mystery_thriller_crime.csv")
# 'Unnamed: 0' is presumably a stale index column from an earlier to_csv — TODO confirm.
reviews_df = reviews_df.drop(columns=['Unnamed: 0'])

# Distinct book ids, in the order they first appear in books_df.
books = books_df['id'].unique()

# (title, book_id) lookup used to attach titles to the reviews.
# Keeping one row per book_id guarantees the left merge below cannot
# duplicate a review if a book id is listed more than once.
book_titles = (
    books_df[['title', 'id']]
    .rename(columns={'id': 'book_id'})
    .drop_duplicates(subset=['book_id'])
)

# For each popular book, take its reviews ordered by vote count (highest first).
# The pieces are collected first and concatenated once, instead of re-copying
# the accumulated frame on every iteration; this also keeps the column dtypes
# of reviews_df rather than starting from an empty object-dtype frame.
per_book_reviews = [
    reviews_df[reviews_df['book_id'] == book].sort_values(by=['n_votes'], ascending=False)
    for book in books
]
if per_book_reviews:
    reviews_updated = pd.concat(per_book_reviews, ignore_index=True)
else:
    # No books listed: fall back to an empty frame with the expected columns.
    reviews_updated = pd.DataFrame(columns=reviews_df.columns)

reviews = reviews_updated.merge(book_titles,on=['book_id'],how='left')

#### Relevant Reviews: Reviews which have at least one upvote. Sentiment analysis is performed only on relevant reviews. This filters out gibberish reviews, which make no sense and are therefore irrelevant to our analysis.

In [4]:
# Relevance filter: keep a review only if it has one or more upvotes.
reviews = reviews.loc[reviews['n_votes'].ge(1)]
# Persist the filtered reviews (index included) for the following steps.
reviews.to_csv('../data/relevant_reviews.csv')

### Preparing Data

#### The rating column contains the rating a user gave to a specific book. It is a good estimator of the user's sentiment towards the book, so we use it as the target variable for our analysis.

In [5]:
# Reload the relevant reviews written to disk earlier.
reviews = pd.read_csv('../data/relevant_reviews.csv')

# Three-class target derived from the star rating:
#   rating <= 2 -> -1 (negative)
#   rating == 3 ->  0 (neutral)
#   rating >= 4 ->  1 (positive)
# Rows matching none of the conditions get np.select's default value, 0.
conditions = [
    reviews['rating'].le(2),
    reviews['rating'].eq(3),
    reviews['rating'].ge(4),
]
values = [-1, 0, 1]

# attach the target variable as a new column
reviews = reviews.assign(sentiment=np.select(conditions, values))

In [6]:
# Working frame for the models: book id, review text (forced to str) and target label.
df = (
    reviews.loc[:, ['book_id', 'review_text', 'sentiment']]
    .copy()
    .assign(review_text=lambda frame: frame['review_text'].astype(str))
)

## Processing Texts for Reviews

#### Functions required

In [7]:
# strip everything that is not an ASCII letter from a review
def remove_punctuation(text):
    """Replace each run of non-letter characters in ``text`` with one space.

    Only ``A-Z`` and ``a-z`` are kept; digits, punctuation and whitespace
    runs all collapse to a single space.
    """
    non_letters = re.compile('[^A-Za-z]+')
    return non_letters.sub(' ', text)


# dictionary mapping the first letter of an NLTK POS tag to the WordNet POS constant
dict_pos = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

# Lazily-filled cache for the English stopword list (see _english_stopwords).
_english_stopword_cache = None


def _english_stopwords():
    """Return the English stopwords as a frozenset, loading them only once."""
    global _english_stopword_cache
    if _english_stopword_cache is None:
        _english_stopword_cache = frozenset(stopwords.words('english'))
    return _english_stopword_cache


# tokenize, tag pos and remove stopwords
def token_stop_pos_all(text):
    """Tokenize ``text``, POS-tag it and drop English stopwords.

    Parameters
    ----------
    text : str
        Review text to process.

    Returns
    -------
    list of (str, str or None)
        One ``(word, wordnet_pos)`` pair per kept token, in order. The
        second item is the ``dict_pos`` entry for the first letter of the
        token's tag, or ``None`` when that letter is not in ``dict_pos``.
    """
    # The stopword set is loaded once and reused, not rebuilt for every token.
    stop_words = _english_stopwords()
    # Tagging happens before stopword removal, so tags see the full sentence.
    tagged = pos_tag(word_tokenize(text))
    return [
        # tag[:1] (rather than tag[0]) yields '' -> None for an empty tag
        (word, dict_pos.get(tag[:1]))
        for word, tag in tagged
        if word.lower() not in stop_words
    ]


# Lemmatisation
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatise(tagged_words):
    """Join ``(word, pos)`` pairs into one space-separated string of lemmas.

    Words with a falsy ``pos`` are kept unchanged; all others are passed
    through ``wordnet_lemmatizer.lemmatize`` with that ``pos``. The result
    starts with a single space and every word is preceded by another space,
    so an empty input yields ``" "``.
    """
    pieces = [" "]
    for token, wn_pos in tagged_words:
        base = wordnet_lemmatizer.lemmatize(token, pos=wn_pos) if wn_pos else token
        pieces.append(" " + base)
    return "".join(pieces)


In [8]:
# Run the text pipeline; each step reads the column produced by the previous one:
#   review_text -> cleaned_review   (non-letters replaced by spaces)
#   cleaned_review -> words_tagged_pos (tokenized, POS-tagged, stopwords removed)
#   words_tagged_pos -> lemma_review   (lemmatised, joined back into a string)
df = df.assign(
    cleaned_review=lambda frame: frame['review_text'].apply(remove_punctuation),
    words_tagged_pos=lambda frame: frame['cleaned_review'].apply(token_stop_pos_all),
    lemma_review=lambda frame: frame['words_tagged_pos'].apply(lemmatise),
)

# df.head()

## Base Model

In [9]:
# Baseline: give every review a random label drawn from {-1, 0, 1}.
# The global NumPy generator is seeded first so the draw is repeatable.
np.random.seed(42)
df['pred_baseline'] = np.random.randint(low=-1, high=2, size=len(df))

## Rule Based Sentiment Analysis using TextBlob

In [10]:
# TextBlob helpers: subjectivity and polarity of a review
def get_subjectivity(review):
    """Return the ``subjectivity`` field of TextBlob's sentiment for ``review``."""
    sentiment = TextBlob(review).sentiment
    return sentiment.subjectivity
def get_polarity(review):
    """Return the ``polarity`` field of TextBlob's sentiment for ``review``."""
    sentiment = TextBlob(review).sentiment
    return sentiment.polarity


# Turn a polarity score into a class label
def prediction(score):
    """Map a polarity ``score`` to a sentiment label.

    Returns ``-1`` when the score is below zero, ``0`` when it equals zero
    and ``1`` in every other case.
    """
    if score < 0:
        return -1
    return 0 if score == 0 else 1

In [11]:
# Score each lemmatised review with TextBlob, then label it from its polarity.
df = df.assign(
    subjectivity_score=lambda frame: frame['lemma_review'].apply(get_subjectivity),
    polarity_score=lambda frame: frame['lemma_review'].apply(get_polarity),
    # final rule-based label: sign of the polarity score
    pred_rule_based=lambda frame: frame['polarity_score'].apply(prediction),
)

## Naive Bayes

In [13]:
# train test splits
x = df['lemma_review']
y = df['sentiment']

# split is not random, so as to preserve order:
# the first 75% of the rows are used for training, the last 25% for testing
x, x_test, y, y_test = train_test_split(x,y, test_size=0.25, random_state=42,shuffle=False)

# Vectorize text reviews to numbers (bag-of-words counts).
# The vocabulary is learned on the training rows only and reused for the test rows.
# The matrices are kept sparse: MultinomialNB accepts sparse input, and
# densifying a documents x vocabulary matrix with .toarray() costs a lot of memory.
vec = CountVectorizer(stop_words='english')
x = vec.fit_transform(x)
x_test = vec.transform(x_test)

In [14]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training counts.
model = MultinomialNB()
model.fit(X=x, y=y)

MultinomialNB()

In [15]:
# Predict on the training rows and on the test rows, in that order.
predicted_NB_train, predicted_NB_test = (model.predict(part) for part in (x, x_test))

# The split kept the row order, so train labels followed by test labels line up with df.
df['prediction_nb'] = [*predicted_NB_train, *predicted_NB_test]

## Logistic Regression Classification

In [16]:
# train-test split: first 75% of the rows for training, the remainder for testing.
# Both parts are cut at the same position so every row lands in exactly one of
# them; taking head(int(.75*n)) and tail(int(.25*n)) separately loses a row
# whenever n is not a multiple of 4.
split_at = int(.75*len(df))
train = df.iloc[:split_at]
test = df.iloc[split_at:]

In [17]:
# Bag-of-words counts for logistic regression. The token pattern \b\w+\b also
# keeps one-character tokens. The vocabulary is fitted on the training rows
# first and then applied to the test rows.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix, test_matrix = (
    vectorizer.fit_transform(train['lemma_review'].astype(str)),
    vectorizer.transform(test['lemma_review'].astype(str)),
)

In [18]:
# Feature matrices and target labels for the logistic-regression model.
X_train, X_test = train_matrix, test_matrix
y_train, y_test = train['sentiment'], test['sentiment']

In [19]:
# Fit a logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): warnings are silenced globally in this notebook, so a
# convergence warning would not be visible here — confirm the default
# iteration limit is sufficient.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

LogisticRegression()

In [20]:
# Predict on both partitions so that every row of df receives a label.
predicted_lr_train, predicted_lr_test = lr.predict(X_train), lr.predict(X_test)

# train rows come first in df, test rows last, so concatenating in that order lines up
df['prediction_lr'] = [*predicted_lr_train, *predicted_lr_test]

### Comparing all models 

In [22]:
# Reports over every review. For the two trained models these rows include the
# training portion, so their scores here are optimistic.
print('BASE LINE\n',classification_report(df['sentiment'], df['pred_baseline']))
print('RULE BASED\n',classification_report(df['sentiment'], df['pred_rule_based']))
print('NAIVE BAYES\n',classification_report(df['sentiment'], df['prediction_nb']))
print('LOGISTIC REGRESSION\n',classification_report(df['sentiment'], df['prediction_lr']))

# Held-out reports for the trained models. Both splits keep the row order and
# put the test rows last, so the last len(predicted_*_test) rows of df are
# exactly the rows the corresponding model was not fitted on.
nb_holdout = df.tail(len(predicted_NB_test))
lr_holdout = df.tail(len(predicted_lr_test))
print('NAIVE BAYES (held-out rows only)\n',classification_report(nb_holdout['sentiment'], nb_holdout['prediction_nb']))
print('LOGISTIC REGRESSION (held-out rows only)\n',classification_report(lr_holdout['sentiment'], lr_holdout['prediction_lr']))

BASE LINE
               precision    recall  f1-score   support

          -1       0.13      0.35      0.19      2658
           0       0.18      0.34      0.24      3970
           1       0.69      0.33      0.45     15216

    accuracy                           0.33     21844
   macro avg       0.33      0.34      0.29     21844
weighted avg       0.53      0.33      0.38     21844

RULE BASED
               precision    recall  f1-score   support

          -1       0.28      0.32      0.30      2658
           0       0.17      0.08      0.11      3970
           1       0.73      0.81      0.77     15216

    accuracy                           0.62     21844
   macro avg       0.39      0.40      0.39     21844
weighted avg       0.57      0.62      0.59     21844

NAIVE BAYES
               precision    recall  f1-score   support

          -1       0.81      0.37      0.51      2658
           0       0.72      0.27      0.40      3970
           1       0.78      0.97      

In [5]:
# Export the true label together with each model's prediction, one row per review.
final_pred = df.loc[
    :,
    ['book_id', 'sentiment', 'pred_baseline', 'pred_rule_based', 'prediction_nb', 'prediction_lr'],
].copy()

final_pred.to_csv('../data/all_predictions_sentiments.csv')