## Importing Packages

In [3]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.datasets import imdb

In [2]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB

## Loading Datasets

In [44]:
# Keras' bundled IMDB dataset, returned already split into train and test sets.
# Note: these lowercase x_/y_ names are separate from the X_train/Y_train
# variables that train_test_split creates from the CSV later in the notebook.
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    path="imdb.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
)

In [3]:
# Fetch the vocabulary lookup that accompanies the Keras IMDB dataset.
# Per the Keras documentation this is a dict mapping each word to its integer index.
Words = imdb.get_word_index(path="imdb_word_index.json")

In [4]:
# Number of distinct entries in the word index.
len(Words)

88584

In [4]:
import pandas as pd
import numpy as np

# Read the raw reviews; the code below relies on a 'sentiment' column, and the
# train/test split later also uses the 'review' column.
imdb_data = pd.read_csv('IMDB Dataset.csv')

# Binary target: 1 where sentiment is exactly 'positive', 0 for every other value.
imdb_data['Label'] = np.where(imdb_data['sentiment'].eq('positive'), 1, 0)
imdb_data

Unnamed: 0,review,sentiment,Label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [47]:
# Class balance check: percentage of reviews labelled positive (Label == 1).
(imdb_data['Label'] * 100).mean()

50.0

In [5]:
from sklearn.model_selection import train_test_split


# Hold out part of the CSV data for evaluation. No test_size is passed, so
# scikit-learn's default split proportion applies; random_state=0 keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    imdb_data['review'],
    imdb_data['Label'],
    random_state=0,
)

## Multinomial Naive Bayes

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score

# Bag-of-words features: learn the vocabulary on the training reviews only,
# then encode the test reviews with that same vocabulary.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Multinomial Naive Bayes with light additive smoothing (alpha = 0.1).
clfrNB = MultinomialNB(alpha=0.1).fit(X_train_vectorized, Y_train)

# Accuracy on the held-out reviews.
predicted_labels = clfrNB.predict(X_test_vectorized)
score = accuracy_score(Y_test, predicted_labels)

In [50]:
# Held-out accuracy of MultinomialNB trained on raw word counts.
score

0.84664

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the training reviews and collect its vocabulary terms.
vect = TfidfVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
feature_names = np.array(vect.get_feature_names_out())

# Largest TF-IDF weight each term reaches in any training review (column-wise max).
# The reduction is done on the sparse matrix itself: densifying the full
# (reviews x terms) matrix with .toarray() just to take this same maximum would
# require a very large amount of memory for a vocabulary of this size.
max_tfidf = X_train_vectorized.max(axis=0).toarray().ravel()

# Term positions ordered from smallest to largest maximum weight, and the
# names of the 20 lowest- and 20 highest-scoring terms.
sorted_tfidf_index = max_tfidf.argsort()
Sindex = feature_names[sorted_tfidf_index[:20]]
Lindex = feature_names[sorted_tfidf_index[:-21:-1]]

# Maximum weights in ascending order, aligned with sorted_tfidf_index.
Series = max_tfidf[sorted_tfidf_index]

# 20 smallest maximum weights, ordered by score and then alphabetically by term
# (the secondary key makes the order deterministic when scores tie).
Sdf = (
    pd.DataFrame(Series[:20], index=Sindex)
    .reset_index()
    .sort_values([0, "index"], ascending=[True, True])
    .set_index('index')
)
Sseries = Sdf[0]

# 20 largest maximum weights, ordered by descending score and then alphabetically.
Ldf = (
    pd.DataFrame(Series[:-21:-1], index=Lindex)
    .reset_index()
    .sort_values([0, "index"], ascending=[False, True])
    .set_index('index')
)
Lseries = Ldf[0]

In [50]:
# The 20 terms with the largest maximum TF-IDF weight (score descending, then alphabetical).
Lseries

index
trivialboring                                                               0.990542
pokemon                                                                     0.926929
ghoulies                                                                    0.913663
robot                                                                       0.890573
blahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblah    0.883381
nr                                                                          0.879292
esperanto                                                                   0.864549
uzumakis                                                                    0.859902
ernest                                                                      0.859825
bad                                                                         0.851258
wei                                                                         0.843160
darkman                                                    

In [51]:
# The 20 terms with the smallest maximum TF-IDF weight (score ascending, then alphabetical).
Sseries

index
bishoff          0.015378
brawled          0.015378
chokeslammed     0.015378
clotheslining    0.015378
crossface        0.015378
dudleys          0.015378
ganged           0.015378
gloated          0.015378
goaded           0.015378
hurracanrana     0.015378
nwo              0.015378
powerbomb        0.015378
riksihi          0.015378
rollup           0.015378
somersaulted     0.015378
somersaulting    0.015378
sprinted         0.015378
superkicked      0.015378
suplexing        0.015378
turnbuckles      0.015378
Name: 0, dtype: float64

In [52]:
# Encode the test reviews with the TF-IDF vectorizer fitted in the cell above
# (vect and X_train_vectorized must still be the TF-IDF versions here).
X_test_vectorized = vect.transform(X_test)

# Same MultinomialNB setup as the count-based run, now on TF-IDF weights.
clfrNB = MultinomialNB(alpha=0.1).fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB.predict(X_test_vectorized)

# Held-out accuracy, shown as the cell output.
score = accuracy_score(Y_test, predicted_labels)
score

0.86416

## Bernoulli Naive Bayes

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score

# Bag-of-words features: vocabulary learned on the training reviews only,
# then reused to encode the test reviews.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Bernoulli Naive Bayes with additive smoothing alpha = 0.1.
clfrNB2 = BernoulliNB(alpha=0.1).fit(X_train_vectorized, Y_train)

# Accuracy on the held-out reviews.
predicted_labels = clfrNB2.predict(X_test_vectorized)
score = accuracy_score(Y_test, predicted_labels)

In [8]:
# Held-out accuracy of BernoulliNB trained on word-count features.
score

0.85272

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the training reviews and collect its vocabulary terms.
# This refits the vectorizer so that vect / X_train_vectorized hold TF-IDF
# features again for the BernoulliNB cell that follows.
vect = TfidfVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
feature_names = np.array(vect.get_feature_names_out())

# Largest TF-IDF weight each term reaches in any training review (column-wise max).
# The reduction is done on the sparse matrix itself: densifying the full
# (reviews x terms) matrix with .toarray() just to take this same maximum would
# require a very large amount of memory for a vocabulary of this size.
max_tfidf = X_train_vectorized.max(axis=0).toarray().ravel()

# Term positions ordered from smallest to largest maximum weight, and the
# names of the 20 lowest- and 20 highest-scoring terms.
sorted_tfidf_index = max_tfidf.argsort()
Sindex = feature_names[sorted_tfidf_index[:20]]
Lindex = feature_names[sorted_tfidf_index[:-21:-1]]

# Maximum weights in ascending order, aligned with sorted_tfidf_index.
Series = max_tfidf[sorted_tfidf_index]

# 20 smallest maximum weights, ordered by score and then alphabetically by term
# (the secondary key makes the order deterministic when scores tie).
Sdf = (
    pd.DataFrame(Series[:20], index=Sindex)
    .reset_index()
    .sort_values([0, "index"], ascending=[True, True])
    .set_index('index')
)
Sseries = Sdf[0]

# 20 largest maximum weights, ordered by descending score and then alphabetically.
Ldf = (
    pd.DataFrame(Series[:-21:-1], index=Lindex)
    .reset_index()
    .sort_values([0, "index"], ascending=[False, True])
    .set_index('index')
)
Lseries = Ldf[0]

In [24]:
# The 20 terms with the smallest maximum TF-IDF weight (score ascending, then alphabetical).
Sseries

index
bishoff          0.015378
brawled          0.015378
chokeslammed     0.015378
clotheslining    0.015378
crossface        0.015378
dudleys          0.015378
ganged           0.015378
gloated          0.015378
goaded           0.015378
hurracanrana     0.015378
nwo              0.015378
powerbomb        0.015378
riksihi          0.015378
rollup           0.015378
somersaulted     0.015378
somersaulting    0.015378
sprinted         0.015378
superkicked      0.015378
suplexing        0.015378
turnbuckles      0.015378
Name: 0, dtype: float64

In [25]:
# The 20 terms with the largest maximum TF-IDF weight (score descending, then alphabetical).
Lseries

index
trivialboring                                                               0.990542
pokemon                                                                     0.926929
ghoulies                                                                    0.913663
robot                                                                       0.890573
blahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblah    0.883381
nr                                                                          0.879292
esperanto                                                                   0.864549
uzumakis                                                                    0.859902
ernest                                                                      0.859825
bad                                                                         0.851258
wei                                                                         0.843160
darkman                                                    

In [28]:
# Encode the test reviews with the TF-IDF vectorizer fitted in the cell above.
X_test_vectorized = vect.transform(X_test)

# NOTE(review): per the scikit-learn docs BernoulliNB binarises its inputs
# (default binarize=0.0), so TF-IDF weights collapse to term presence/absence.
# That would explain why this accuracy is identical to the count-based
# BernoulliNB run (0.85272) -- confirm before reading anything into the comparison.
clfrNB2 = BernoulliNB(alpha=0.1).fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB2.predict(X_test_vectorized)

# Held-out accuracy, shown as the cell output.
score = accuracy_score(Y_test, predicted_labels)
score

0.85272

## Complement Naive Bayes

In [9]:
# Bag-of-words features: vocabulary learned on the training reviews only,
# then reused to encode the test reviews.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Complement Naive Bayes with its default hyper-parameters.
clfrNB3 = ComplementNB().fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB3.predict(X_test_vectorized)

# Held-out accuracy, shown as the cell output.
score = accuracy_score(Y_test, predicted_labels)
score

0.8464

In [65]:
# TF-IDF features: statistics learned on the training reviews only, then
# applied unchanged to the test reviews.
vect = TfidfVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Complement Naive Bayes with its default hyper-parameters.
clfrNB3 = ComplementNB().fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB3.predict(X_test_vectorized)

# Held-out accuracy, shown as the cell output.
score = accuracy_score(Y_test, predicted_labels)
score

0.86224