## Sentiment Analysis using Naive Bayes

#### Importing required libraries

In [1]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import sklearn as sk
import matplotlib as mt
import seaborn as sns

In [6]:
import os

#os.chdir('../')

%pwd

'd:\\Projects\\Learning\\Sentiment-Analysis-Naive-Bayes'

#### Reading the data set

In [8]:
# Load the Alexa reviews from the tab-separated file (hence sep='\t').
# The path is relative, so it is resolved against the kernel's working
# directory (the %pwd cell above shows the project root in the recorded run).

alexa_reviews = pd.read_csv('artifacts/data_ingestion/extracted_data/amazon_alexa.tsv',sep='\t')

In [9]:
# Head of the data set

alexa_reviews.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [10]:
# Info of alexa_reviews data set

alexa_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3149 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [11]:
# Rating column distribution

alexa_reviews['rating'].value_counts()

rating
5    2286
4     455
1     161
3     152
2      96
Name: count, dtype: int64

- Most reviews (2,286 of 3,150) have a rating of 5.
- The ratings are heavily imbalanced, so class-imbalance handling might be required.

#### Data preprocessing

In [12]:
# Sentiment analysis only needs the label source ('rating') and the review
# text ('verified_reviews'), so every other column is dropped.
# .copy() makes the result an independent frame: later cells assign columns
# on it, which would otherwise trigger pandas' SettingWithCopyWarning.

alexa_reviews = alexa_reviews[['rating','verified_reviews']].copy()

# Head of the modified data set

alexa_reviews.head()

Unnamed: 0,rating,verified_reviews
0,5,Love my Echo!
1,5,Loved it!
2,4,"Sometimes while playing a game, you can answer..."
3,5,I have had a lot of fun with this thing. My 4 ...
4,5,Music


In [13]:
# Binarise the star rating into a sentiment label:
# ratings 1-3 -> 0, ratings 4-5 -> 1.
# NOTE: 'rating' is overwritten in place, so re-running this cell on the
# already-binarised values (0 and 1 are both <= 3) would turn every label into 0.

alexa_reviews['rating'] = (~alexa_reviews['rating'].le(3)).astype('int64')

In [14]:
alexa_reviews['rating'].value_counts()

rating
1    2741
0     409
Name: count, dtype: int64

In [15]:
from nltk.stem import WordNetLemmatizer

import nltk

# One-time setup, left commented out: presumably fetches the WordNet data the
# lemmatizer relies on — TODO confirm it is installed before running.
# nltk.download('wordnet')

# Single shared lemmatizer instance, used by preprocess_text further down

lemmatizer = WordNetLemmatizer()

In [16]:
# Characters that remove_punctuations strips from each token:
# the standard library's string.punctuation.

punc_to_remove = string.punctuation

In [17]:
# English stop words from the NLTK corpus, stored as a set because
# preprocess_text tests every token for membership.

stop_words = set(stopwords.words('english'))

In [18]:
#pip install autocorrect

In [19]:
from autocorrect import Speller

# English spell-corrector; preprocess_text calls it once per token

speller = Speller(lang='en')

In [20]:
def remove_punctuations(word, punctuation=None):
    """Return ``word`` with every punctuation character removed.

    Parameters
    ----------
    word : str
        Text to clean; it is scanned character by character.
    punctuation : iterable of str, optional
        Characters to strip. When omitted, the notebook-level
        ``punc_to_remove`` is used, which is the original behaviour.

    Returns
    -------
    str
        The remaining characters, in their original order.
    """
    # Build the lookup once per call instead of once per character
    banned = set(punc_to_remove if punctuation is None else punctuation)

    return "".join(letter for letter in word if letter not in banned)

In [21]:
# A generic function to pre-process a text

def preprocess_text(text):
    """Normalise one review into a cleaned, space-separated token string.

    Steps, in order: lower-case, strip punctuation from every
    whitespace-separated token, drop stop words, lemmatize, spell-correct.

    Parameters
    ----------
    text : object
        Raw review. Non-string values are converted with ``str``. A missing
        value (``None`` or a float NaN) is treated as an empty review.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` for a missing
        review.
    """

    # A missing review must not be stringified: str(float('nan')) is 'nan',
    # which would flow through the pipeline as if it were a real word.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Convert the text to lower case

    text_lower = str(text).lower()

    # Remove punctuation from every whitespace-separated token

    text_no_punc = " ".join(remove_punctuations(word) for word in text_lower.split())

    # Remove stop words (tokens emptied by the previous step vanish on re-split)

    text_no_stop_word = " ".join(word for word in text_no_punc.split() if word not in stop_words)

    # Perform lemmatization on the remaining tokens

    text_lemma = " ".join(lemmatizer.lemmatize(word) for word in text_no_stop_word.split())

    # Perform spelling correction token by token

    correct_text = " ".join(speller(word) for word in text_lemma.split())

    return correct_text

In [22]:
# Run the cleaning pipeline on every review; the function is passed directly
# because it already takes exactly one argument.
alexa_reviews['pre_processed_review'] = alexa_reviews['verified_reviews'].apply(preprocess_text)

In [24]:
alexa_reviews.head(5)

Unnamed: 0,rating,verified_reviews,pre_processed_review
0,1,Love my Echo!,love echo
1,1,Loved it!,loved
2,1,"Sometimes while playing a game, you can answer...",sometimes playing game answer question correct...
3,1,I have had a lot of fun with this thing. My 4 ...,lot fun thing 4 yr old learns dinosaur control...
4,1,Music,music


#### Performing train test split

In [25]:
from sklearn.model_selection import train_test_split

# Target: the binary sentiment label, kept as a one-column data frame

y = alexa_reviews['rating'].to_frame()

# Features: the cleaned review text, also as a one-column data frame

X = alexa_reviews['pre_processed_review'].to_frame()

In [26]:
X.shape

(3150, 1)

In [27]:
y.shape

(3150, 1)

In [28]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (2741 vs 409) — consider passing stratify=y.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Report the shape of each of the four splits

for split_name, split_frame in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape ", split_frame.shape)

X_train shape  (2520, 1)
X_test shape  (630, 1)
y_train shape  (2520, 1)
y_test shape  (630, 1)


#### Using TF-IDF vectorizer

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer created with no arguments; it is fitted on the training
# reviews in the next cell.

tdf_idf_vec = TfidfVectorizer()

In [31]:
# Fit the vectorizer on the training reviews only and transform them in one
# step.

X_train_tf_idf = tdf_idf_vec.fit_transform(X_train['pre_processed_review'])

# Reuse the already-fitted vectorizer on the test reviews (transform only, no
# re-fit), so both matrices share the same columns (3347 in the recorded run).

X_test_tf_idf = tdf_idf_vec.transform(X_test['pre_processed_review'])

In [34]:
X_train_tf_idf.shape

(2520, 3347)

In [35]:
X_test_tf_idf.shape

(630, 3347)

#### Using SMOTE to handle class imbalance

In [38]:
# Rating distribution in train set before class imbalance handling

y_train['rating'].value_counts()

rating
1    2195
0     325
Name: count, dtype: int64

In [42]:
from imblearn.over_sampling import SMOTE

# random_state fixes the seed passed to SMOTE
smote = SMOTE(random_state=7)

# Handle class imbalance: only the training matrix and labels are resampled;
# the test split is left untouched. In the recorded run class 0 grows from
# 325 to 2195 rows, matching class 1.

X_train_bal,y_train_bal = smote.fit_resample(X_train_tf_idf,y_train)

In [78]:
# Rating distribution in train set after class imbalance handling

y_train_bal['rating'].value_counts()

rating
1    2195
0    2195
Name: count, dtype: int64

#### Model building

In [57]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier created with no arguments
# (library defaults).

nb_model = MultinomialNB()

In [58]:
# Train on the SMOTE-balanced training data. np.ravel flattens the label
# frame into a 1-D array (presumably a single 'rating' column, as in y_train).

nb_model.fit(X_train_bal,np.ravel(y_train_bal))

#### Model evaluation

In [59]:
# Predict the binary label for each test review
# (0 = ratings 1-3, 1 = ratings 4-5, per the mapping cell).

y_test_pred = nb_model.predict(X_test_tf_idf)

# Show the first five predictions as a quick check

y_test_pred[:5]

array([0, 0, 1, 1, 1], dtype=int64)

In [60]:
# Rating distribution in test set

y_test['rating'].value_counts()

rating
1    546
0     84
Name: count, dtype: int64

In [61]:
# Rating distribution in predicted test set:
# wrap the predictions in a data frame whose single column is named 'rating'

y_test_pred = pd.DataFrame(y_test_pred, columns=['rating'])

y_test_pred['rating'].value_counts()

rating
1    519
0    111
Name: count, dtype: int64

In [62]:
from sklearn import metrics

# Accuracy of the predicted labels against the true test labels.
# NOTE(review): 546 of the 630 test rows are class 1, so always predicting 1
# would already score about 0.867 — read this alongside the other metrics.
# The result shares its name with the sklearn function; harmless here because
# the function is always reached through `metrics.`.

accuracy_score = metrics.accuracy_score(y_test,y_test_pred)

print(accuracy_score)

0.8904761904761904


In [63]:
# Calculating precision_score on the test set.
# No pos_label is passed — presumably the library default (label 1, i.e.
# ratings 4-5) is the class being scored; confirm.

precision_score = metrics.precision_score(y_test,y_test_pred)

print(precision_score)

0.9595375722543352


In [64]:
# Calculating recall_score on the test set.
# No pos_label is passed — presumably the library default (label 1, i.e.
# ratings 4-5) is the class being scored; confirm.

recall_score = metrics.recall_score(y_test,y_test_pred)

print(recall_score)

0.9120879120879121


In [65]:
# Calculating f1 score on the test set.
# No pos_label is passed — presumably the library default (label 1, i.e.
# ratings 4-5) is the class being scored; confirm.

f1_score = metrics.f1_score(y_test,y_test_pred)

print(f1_score)

0.9352112676056338
