## Importing Packages

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.datasets import imdb

In [25]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB

## Loading Datasets

In [10]:
import pandas as pd
import numpy as np

# Load the tab-separated phrase/sentiment training file into a DataFrame.
RT_TRAIN_PATH = 'RT_train.tsv'
rt_data = pd.read_csv(RT_TRAIN_PATH, sep='\t')

# Bare last expression so the notebook renders the (truncated) frame.
rt_data

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [11]:
# Mean of the Sentiment column scaled by 100.
# NOTE(review): Sentiment is a multi-valued label (1, 2 and 3 all appear in the
# preview above), so this is 100x the average label, not a percentage - confirm intent.
(rt_data['Sentiment'] * 100).mean()

206.3578110982955

In [12]:
from sklearn.model_selection import train_test_split


# Split phrases (inputs) and sentiment labels (targets) into train/test parts.
# random_state=0 pins the shuffle so the split is repeatable across runs.
phrases = rt_data['Phrase']
sentiments = rt_data['Sentiment']
X_train, X_test, Y_train, Y_test = train_test_split(phrases, sentiments, random_state=0)

## Multinomial Naive Bayes

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score

# Bag-of-words baseline: learn the vocabulary on the training phrases only,
# then encode both splits with that same vocabulary.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Multinomial NB with smoothing alpha=0.1; `score` is accuracy on the held-out split.
clfrNB = MultinomialNB(alpha=0.1).fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB.predict(X_test_vectorized)
score = accuracy_score(y_true=Y_test, y_pred=predicted_labels)

In [14]:
# Display the test accuracy of the count-based MultinomialNB fitted in the previous cell.
score

0.6003588363449955

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the training phrases and keep the vocabulary as an
# array so it can be fancy-indexed below.
vect = TfidfVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
feature_names = np.array(vect.get_feature_names_out())

# Maximum TF-IDF weight of every feature (column), reduced directly on the sparse
# matrix. Only this 1 x n_features result is densified: calling .toarray() on the
# whole document-term matrix would allocate one float per (document, feature) pair
# and can exhaust memory.
max_tfidf = X_train_vectorized.max(0).toarray()[0]

# Feature positions ordered by that maximum: first 20 = smallest, last 20
# (walked backwards) = largest.
sorted_tfidf_index = max_tfidf.argsort()
smallest_positions = sorted_tfidf_index[:20]
largest_positions = sorted_tfidf_index[:-21:-1]
Sindex = feature_names[smallest_positions]
Lindex = feature_names[largest_positions]

# 20 smallest maxima: score ascending, ties broken alphabetically by feature name.
Sdf = pd.DataFrame({'index': Sindex, 0: max_tfidf[smallest_positions]})
Sdf = Sdf.sort_values([0, 'index'], ascending=[True, True]).set_index('index')
Sseries = Sdf[0]

# 20 largest maxima: score descending, ties broken alphabetically by feature name.
Ldf = pd.DataFrame({'index': Lindex, 0: max_tfidf[largest_positions]})
Ldf = Ldf.sort_values([0, 'index'], ascending=[False, True]).set_index('index')
Lseries = Ldf[0]

In [17]:
# Display the 20 features with the largest maximum TF-IDF weight
# (score descending, then alphabetical).
Lseries

index
cheated         1.0
cheatfully      1.0
cheats          1.0
journalist      1.0
journalistic    1.0
journalists     1.0
journey         1.0
jovial          1.0
joy             1.0
joyful          1.0
joyless         1.0
joylessly       1.0
joyous          1.0
jr              1.0
juan            1.0
judaism         1.0
judd            1.0
judge           1.0
judgment        1.0
zzzzzzzzz       1.0
Name: 0, dtype: float64

In [18]:
# Display the 20 features with the smallest maximum TF-IDF weight
# (score ascending, then alphabetical).
Sseries

index
managing            0.228233
cineasts            0.261518
ackerman            0.271927
grounds             0.288356
uproariously        0.300387
ashamed             0.304002
preposterousness    0.306089
documented          0.310805
helpful             0.310888
shamefully          0.312022
downplays           0.315960
clotted             0.318975
suspenser           0.320186
woozy               0.320806
fools               0.321918
provocateur         0.322897
devise              0.324900
trimmed             0.325637
fuddled             0.326940
errs                0.328282
Name: 0, dtype: float64

In [19]:
# MultinomialNB on TF-IDF features. This cell relies on `vect` and
# `X_train_vectorized` still being the TF-IDF objects built in the earlier cell;
# only the test split is encoded here.
X_test_vectorized = vect.transform(X_test)

clfrNB = MultinomialNB(alpha=0.1).fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB.predict(X_test_vectorized)
score = accuracy_score(y_true=Y_test, y_pred=predicted_labels)

# Last expression: show the held-out accuracy.
score

0.5998205818275022

## Bernoulli Naive Bayes

In [20]:
# Re-encode both splits as token counts (vocabulary learned from the training
# phrases only), replacing the vectorizer and matrices from the previous section.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Bernoulli NB with smoothing alpha=0.1; `score` is accuracy on the held-out split.
clfrNB2 = BernoulliNB(alpha=0.1).fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB2.predict(X_test_vectorized)
score = accuracy_score(y_true=Y_test, y_pred=predicted_labels)

In [21]:
# Display the test accuracy of the count-based BernoulliNB fitted in the previous cell.
score

0.5989491221325132

In [22]:
# Rebuild the TF-IDF vectorizer and training matrix (the Bernoulli cell that
# follows reuses `vect` and `X_train_vectorized`), and keep the vocabulary as an
# array so it can be fancy-indexed below.
vect = TfidfVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
feature_names = np.array(vect.get_feature_names_out())

# Maximum TF-IDF weight of every feature (column), reduced directly on the sparse
# matrix. Only this 1 x n_features result is densified: calling .toarray() on the
# whole document-term matrix would allocate one float per (document, feature) pair
# and can exhaust memory.
max_tfidf = X_train_vectorized.max(0).toarray()[0]

# Feature positions ordered by that maximum: first 20 = smallest, last 20
# (walked backwards) = largest.
sorted_tfidf_index = max_tfidf.argsort()
smallest_positions = sorted_tfidf_index[:20]
largest_positions = sorted_tfidf_index[:-21:-1]
Sindex = feature_names[smallest_positions]
Lindex = feature_names[largest_positions]

# 20 smallest maxima: score ascending, ties broken alphabetically by feature name.
Sdf = pd.DataFrame({'index': Sindex, 0: max_tfidf[smallest_positions]})
Sdf = Sdf.sort_values([0, 'index'], ascending=[True, True]).set_index('index')
Sseries = Sdf[0]

# 20 largest maxima: score descending, ties broken alphabetically by feature name.
Ldf = pd.DataFrame({'index': Lindex, 0: max_tfidf[largest_positions]})
Ldf = Ldf.sort_values([0, 'index'], ascending=[False, True]).set_index('index')
Lseries = Ldf[0]

In [23]:
# BernoulliNB on TF-IDF features. This cell relies on `vect` and
# `X_train_vectorized` still being the TF-IDF objects built in the previous cell.
# NOTE(review): per the scikit-learn docs BernoulliNB thresholds its inputs
# (binarize defaults to 0.0), so TF-IDF weights reduce to presence/absence -
# consistent with this accuracy being identical to the count-based run above.
X_test_vectorized = vect.transform(X_test)

clfrNB2 = BernoulliNB(alpha=0.1).fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB2.predict(X_test_vectorized)
score = accuracy_score(y_true=Y_test, y_pred=predicted_labels)

# Last expression: show the held-out accuracy.
score

0.5989491221325132

## Complement Naive Bayes

In [26]:
# Token-count features for Complement NB: vocabulary learned from the training
# phrases only, then applied to both splits.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# NOTE(review): alpha is left at the library default here, unlike the alpha=0.1
# used for the Multinomial/Bernoulli models, so the scores are not like-for-like.
clfrNB3 = ComplementNB().fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB3.predict(X_test_vectorized)
score = accuracy_score(y_true=Y_test, y_pred=predicted_labels)

# Last expression: show the held-out accuracy.
score

0.48717160066641035

In [27]:
# TF-IDF features for Complement NB: weights learned from the training phrases
# only, then applied to both splits.
vect = TfidfVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# NOTE(review): alpha is left at the library default here, unlike the alpha=0.1
# used for the Multinomial/Bernoulli models, so the scores are not like-for-like.
clfrNB3 = ComplementNB().fit(X_train_vectorized, Y_train)
predicted_labels = clfrNB3.predict(X_test_vectorized)
score = accuracy_score(y_true=Y_test, y_pred=predicted_labels)

# Last expression: show the held-out accuracy.
score

0.51759579648853