Testing the Naive-Bayes algorithm for sentiment analysis on a subset of the IMDB review data. For this round of testing, we are using a slightly larger sample size of 5000.

Start by loading the necessary libraries as well as the IMDB dataset.

In [1]:
# Import libraries:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from wordcloud import WordCloud

import re
import nltk
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB, GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download necessary NLTK resources

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [67]:
# Download the dataset (IMDB Reviews)

df = pd.read_csv("IMDB-Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [54]:
# Check the size of the data:

df.shape

(50000, 2)

In [55]:
# Get some basic details about the df using info()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Now that we've ascertained that there are no null values to remove, we can get some more details on the break-down of the reviews.

In [56]:
# Check the number of positive and negative reviews in the data:

df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


There is an equal number of positive and negative reviews, which means the dataset is balanced.
For this round of testing, we will use a larger subset of the data instead of the full volume. Our sample size will be 5000.

In [68]:
# Sample the data.
df = df.sample(5000)

# Reset the index.
df.reset_index(drop=True, inplace=True)

# Check the size of the sample dataset.
df.shape

# Check the value counts to ensure we have a relatively balanced sample.
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,2506
negative,2494


We will update the qualitative sentiment values to quantitative binary values: 0 for negative and 1 for positive.

In [69]:
# Replace the sentiment values with binary values.
df['sentiment'] = df['sentiment'].replace({'positive':1, 'negative':0})

df

  df['sentiment'] = df['sentiment'].replace({'positive':1, 'negative':0})


Unnamed: 0,review,sentiment
0,Many experienced and excellent actors mixed to...,1
1,To confess having fantasies about Brad Pitt is...,0
2,Can anyone give me a reason why only one Ameri...,0
3,"I have never read the book, but had always hea...",1
4,I think I read this someplace: Joe Johnston (d...,1
...,...,...
4995,"I sat down to watch ""Midnight Cowboy"" thinking...",1
4996,"If you made a genre flick in the late 80s, you...",0
4997,One of the greatest lessons I ever had in how ...,0
4998,"I saw this movie in its own time period, when ...",0


# Data Preprocessing

We will start cleaning the data to convert the text to lowercase and remove URL links, special characters, and punctuation. We will also expand contractions.

In [7]:
pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (113 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.9/113.9 kB[0m 

In [17]:
import contractions

In [70]:
# Define a function to preprocess the data.

def clean_up(text):
  # converting to lowercase, removing URL links, special characters, and punctuation marks
  text = text.lower() # convert to lowercase
  text = re.sub('https?://\S+|www\.\S+', '', text) # remove URL links
  text = re.sub(r"\b\d+\b", "", text) # remove numbers
  text = re.sub('<.*?>+', '', text) # remove special characters
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punctuations
  text = re.sub('\n', '', text)
  text = re.sub('[’“”…]', '', text)

  # removing contractions
  text = contractions.fix(text)

  return text

In [71]:
dt = df['review'].apply(clean_up)

For testing purposes, I am going to keep every version of the review as we continue to preprocess the data. I will convert the dataframe into a datatable.

In [72]:
dt = pd.DataFrame(dt)
dt['sentiment']=df['sentiment']

dt

Unnamed: 0,review,sentiment
0,many experienced and excellent actors mixed to...,1
1,to confess having fantasies about brad pitt is...,0
2,can anyone give me a reason why only one ameri...,0
3,i have never read the book but had always hear...,1
4,i think i read this someplace joe johnston dir...,1
...,...,...
4995,i sat down to watch midnight cowboy thinking i...,1
4996,if you made a genre flick in the late 80s you ...,0
4997,one of the greatest lessons i ever had in how ...,0
4998,i saw this movie in its own time period when h...,0


This is the stage of preprocessing where we will remove the stopwords; however, removing some stopwords (i.e. but, however, etc.) may change the meaning of a review so I will be keeping a version of the reviews with all stopwords intact.

In [73]:
# Create stopwords

stop_words = set(stopwords.words('english'))
dt['with_sw'] = dt['review']
dt['no_sw'] = dt['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [74]:
dt

Unnamed: 0,review,sentiment,with_sw,no_sw
0,many experienced and excellent actors mixed to...,1,many experienced and excellent actors mixed to...,many experienced excellent actors mixed togeth...
1,to confess having fantasies about brad pitt is...,0,to confess having fantasies about brad pitt is...,confess fantasies brad pitt pretty tough admis...
2,can anyone give me a reason why only one ameri...,0,can anyone give me a reason why only one ameri...,anyone give reason one american dies movie sup...
3,i have never read the book but had always hear...,1,i have never read the book but had always hear...,never read book always heard good things movie...
4,i think i read this someplace joe johnston dir...,1,i think i read this someplace joe johnston dir...,think read someplace joe johnston director fil...
...,...,...,...,...
4995,i sat down to watch midnight cowboy thinking i...,1,i sat down to watch midnight cowboy thinking i...,sat watch midnight cowboy thinking would anoth...
4996,if you made a genre flick in the late 80s you ...,0,if you made a genre flick in the late 80s you ...,made genre flick late 80s basically chance wou...
4997,one of the greatest lessons i ever had in how ...,0,one of the greatest lessons i ever had in how ...,one greatest lessons ever watch movie happened...
4998,i saw this movie in its own time period when h...,0,i saw this movie in its own time period when h...,saw movie time period baby wedlock ruined life...


Next, we will lemmatize both sets of review data (with and without stop words).

In [75]:
# Lemmatization:

lemmatizer = WordNetLemmatizer()

dt['with_sw_lem'] = dt['with_sw'].apply(lemmatizer.lemmatize)
dt['no_sw_lem'] = dt['no_sw'].apply(lemmatizer.lemmatize)

In [76]:
dt

Unnamed: 0,review,sentiment,with_sw,no_sw,with_sw_lem,no_sw_lem
0,many experienced and excellent actors mixed to...,1,many experienced and excellent actors mixed to...,many experienced excellent actors mixed togeth...,many experienced and excellent actors mixed to...,many experienced excellent actors mixed togeth...
1,to confess having fantasies about brad pitt is...,0,to confess having fantasies about brad pitt is...,confess fantasies brad pitt pretty tough admis...,to confess having fantasies about brad pitt is...,confess fantasies brad pitt pretty tough admis...
2,can anyone give me a reason why only one ameri...,0,can anyone give me a reason why only one ameri...,anyone give reason one american dies movie sup...,can anyone give me a reason why only one ameri...,anyone give reason one american dies movie sup...
3,i have never read the book but had always hear...,1,i have never read the book but had always hear...,never read book always heard good things movie...,i have never read the book but had always hear...,never read book always heard good things movie...
4,i think i read this someplace joe johnston dir...,1,i think i read this someplace joe johnston dir...,think read someplace joe johnston director fil...,i think i read this someplace joe johnston dir...,think read someplace joe johnston director fil...
...,...,...,...,...,...,...
4995,i sat down to watch midnight cowboy thinking i...,1,i sat down to watch midnight cowboy thinking i...,sat watch midnight cowboy thinking would anoth...,i sat down to watch midnight cowboy thinking i...,sat watch midnight cowboy thinking would anoth...
4996,if you made a genre flick in the late 80s you ...,0,if you made a genre flick in the late 80s you ...,made genre flick late 80s basically chance wou...,if you made a genre flick in the late 80s you ...,made genre flick late 80s basically chance wou...
4997,one of the greatest lessons i ever had in how ...,0,one of the greatest lessons i ever had in how ...,one greatest lessons ever watch movie happened...,one of the greatest lessons i ever had in how ...,one greatest lessons ever watch movie happened...
4998,i saw this movie in its own time period when h...,0,i saw this movie in its own time period when h...,saw movie time period baby wedlock ruined life...,i saw this movie in its own time period when h...,saw movie time period baby wedlock ruined life...


With the lemmatized data, I will create two sets of data (one with stop words and one without). From here, we will tokenize both sets of reviews and create our train/test datasets. First, we will work with the lemmatized data WITHOUT stopwords.

In [77]:
nb = dt.drop(columns=['review', 'no_sw', 'with_sw', 'with_sw_lem'])
nb.columns=['sentiment', 'review']

nb

Unnamed: 0,sentiment,review
0,1,many experienced excellent actors mixed togeth...
1,0,confess fantasies brad pitt pretty tough admis...
2,0,anyone give reason one american dies movie sup...
3,1,never read book always heard good things movie...
4,1,think read someplace joe johnston director fil...
...,...,...
4995,1,sat watch midnight cowboy thinking would anoth...
4996,0,made genre flick late 80s basically chance wou...
4997,0,one greatest lessons ever watch movie happened...
4998,0,saw movie time period baby wedlock ruined life...


Let's tokenize the data.

In [79]:
tok_reviews = nb['review'].apply(lambda x: x.split())
tok_reviews.head(5)

Unnamed: 0,review
0,"[many, experienced, excellent, actors, mixed, ..."
1,"[confess, fantasies, brad, pitt, pretty, tough..."
2,"[anyone, give, reason, one, american, dies, mo..."
3,"[never, read, book, always, heard, good, thing..."
4,"[think, read, someplace, joe, johnston, direct..."


# Feature extraction using Bag of Words Vectorization.

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(nb['review'])



In [81]:
text_counts

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 416938 stored elements and shape (5000, 52386)>

In [82]:
# Split data into train and test sets.
X = text_counts
y = nb['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=30)

# Naive Bayes Modeling

We will test through 3 different Naive Bayes models: ComplementNB, MultinomialNB, and BernoulliNB.

1. Complement NB Model:

In [83]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report, confusion_matrix
CNB = ComplementNB()
CNB.fit(X_train, y_train)

from sklearn import metrics
predicted = CNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('Complement NB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

Complement NB model accuracy is 83.90%
------------------------------------------------
Confusion Matrix:
     0    1
0  438   57
1  104  401
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       495
           1       0.88      0.79      0.83       505

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000



2. Multinomial NB Model:

In [84]:
from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB()
MNB.fit(X_train, y_train)

predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('Multinomial NB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

Multinomial NB model accuracy is 83.90%
------------------------------------------------
Confusion Matrix:
     0    1
0  438   57
1  104  401
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       495
           1       0.88      0.79      0.83       505

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000



3. Bernoulli NB Model:

In [85]:
from sklearn.naive_bayes import BernoulliNB

BNB = BernoulliNB()
BNB.fit(X_train, y_train)

predicted = BNB.predict(X_test)
accuracy_score_bnb = metrics.accuracy_score(predicted,y_test)

print('Bernoulli NB model accuracy = ' + str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

Bernoulli NB model accuracy = 84.70%
------------------------------------------------
Confusion Matrix:
     0    1
0  424   71
1   82  423
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       495
           1       0.86      0.84      0.85       505

    accuracy                           0.85      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.85      0.85      0.85      1000



# Feature Extraction using TF-IDF

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
text_count_2 = tfidf.fit_transform(nb['review'])

In [87]:
x_train, x_test, y_train, y_test = train_test_split(text_count_2, nb['sentiment'],test_size=0.20,random_state=30)

In [88]:
# fitting the model with CNB
CNB.fit(x_train, y_train)
accuracy_score_cnb = metrics.accuracy_score(CNB.predict(x_test), y_test)
print('accuracy_score_cnb = '+str('{:4.2f}'.format(accuracy_score_cnb*100))+'%')

accuracy_score_cnb = 84.60%


In [89]:
#fitting the model with MNB
MNB.fit(x_train, y_train)
accuracy_score_mnb = metrics.accuracy_score(MNB.predict(x_test), y_test)

print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')

accuracy_score_mnb = 84.60%


In [90]:
#fitting the model with BNB
BNB.fit(x_train, y_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(x_test), y_test)
print('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')

accuracy_score_bnb = 85.40%


# Re-running this analysis with the sample dataset that INCLUDES stopwords.

In [91]:
nb_SW = dt.drop(columns=['review', 'no_sw', 'with_sw', 'no_sw_lem'])
nb_SW.columns=['sentiment', 'review']

nb_SW

Unnamed: 0,sentiment,review
0,1,many experienced and excellent actors mixed to...
1,0,to confess having fantasies about brad pitt is...
2,0,can anyone give me a reason why only one ameri...
3,1,i have never read the book but had always hear...
4,1,i think i read this someplace joe johnston dir...
...,...,...
4995,1,i sat down to watch midnight cowboy thinking i...
4996,0,if you made a genre flick in the late 80s you ...
4997,0,one of the greatest lessons i ever had in how ...
4998,0,i saw this movie in its own time period when h...


Tokenize the data (keeping stop words)

In [92]:
tok_review_SW = nb_SW['review'].apply(lambda x: x.split())
tok_review_SW.head(5)

Unnamed: 0,review
0,"[many, experienced, and, excellent, actors, mi..."
1,"[to, confess, having, fantasies, about, brad, ..."
2,"[can, anyone, give, me, a, reason, why, only, ..."
3,"[i, have, never, read, the, book, but, had, al..."
4,"[i, think, i, read, this, someplace, joe, john..."


# Bag of Words (with stop words)

In [93]:
token_SW = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token_SW.tokenize)
text_counts_SW = cv.fit_transform(nb_SW['review'])



In [94]:
text_counts_SW

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 422976 stored elements and shape (5000, 52395)>

In [95]:
# Split data into train and test sets.
X = text_counts_SW
y = nb_SW['sentiment']

X_SW_train, X_SW_test, y_SW_train, y_SW_test = train_test_split(X, y, test_size=0.20,random_state=30)

# Naive Bayes Modeling (keeping stopwords)

1. Complement NB:

In [96]:
CNB_SW = ComplementNB()
CNB_SW.fit(X_SW_train, y_SW_train)

from sklearn import metrics
predicted = CNB_SW.predict(X_SW_test)
accuracy_score = metrics.accuracy_score(predicted, y_SW_test)

print('Complement NB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_SW_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_SW_test, predicted))

Complement NB model accuracy is 84.00%
------------------------------------------------
Confusion Matrix:
     0    1
0  439   56
1  104  401
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       495
           1       0.88      0.79      0.83       505

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000



2. Multinomial NB:

In [97]:
MNB_SW = MultinomialNB()
MNB_SW.fit(X_SW_train, y_SW_train)

predicted = MNB_SW.predict(X_SW_test)
accuracy_score = metrics.accuracy_score(predicted, y_SW_test)

print('Multinomial NB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_SW_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_SW_test, predicted))

Multinomial NB model accuracy is 84.00%
------------------------------------------------
Confusion Matrix:
     0    1
0  439   56
1  104  401
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       495
           1       0.88      0.79      0.83       505

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000



3. Bernoulli NB:

In [98]:
BNB_SW = BernoulliNB()
BNB_SW.fit(X_SW_train, y_SW_train)

predicted = BNB_SW.predict(X_SW_test)
accuracy_score_bnb = metrics.accuracy_score(predicted,y_SW_test)

print('Bernoulli NB model accuracy = ' + str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_SW_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_SW_test, predicted))

Bernoulli NB model accuracy = 84.80%
------------------------------------------------
Confusion Matrix:
     0    1
0  424   71
1   81  424
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       495
           1       0.86      0.84      0.85       505

    accuracy                           0.85      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.85      0.85      0.85      1000



# TF-IDF (keeping stopwords)

In [99]:
tfidf_SW = TfidfVectorizer()
text_count_2 = tfidf_SW.fit_transform(nb_SW['review'])

In [100]:
x_SW_train, x_SW_test, y_SW_train, y_SW_test = train_test_split(text_count_2, nb_SW['sentiment'],test_size=0.20,random_state=30)

In [101]:
# fitting the model with CNB
CNB.fit(x_SW_train, y_SW_train)
accuracy_score_cnb = metrics.accuracy_score(CNB.predict(x_SW_test), y_SW_test)
print('accuracy_score_cnb = '+str('{:4.2f}'.format(accuracy_score_cnb*100))+'%')

accuracy_score_cnb = 82.80%


In [102]:
#fitting the model with MNB
MNB.fit(x_SW_train, y_SW_train)
accuracy_score_mnb = metrics.accuracy_score(MNB.predict(x_SW_test), y_SW_test)

print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')

accuracy_score_mnb = 82.80%


In [103]:
#fitting the model with BNB
BNB.fit(x_SW_train, y_SW_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(x_SW_test), y_SW_test)
print('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')

accuracy_score_bnb = 83.90%


# Conclusions after testing with datasets where stop words had been removed versus dataset that kept stop words

The accuracy and F1 scores for the dataset that retained stopwords were slightly smaller than the dataset where the stopwords had been removed. The percentage difference was nominal.