In [1]:
import pandas as pd
import numpy as np

## Loading The Data

In [2]:
# Only the first 1000 reviews are used in this notebook, so let the CSV parser
# stop after 1000 rows (nrows) instead of parsing the full file and slicing it.
# The resulting frame holds the same first 1000 rows with the same 0..999 index.
df = pd.read_csv("movie_reviewdataset_50K.csv", nrows=1000)

## Cleaning The Data (NLTK)

In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [4]:
# Shared NLP helpers used by CleanReview below.
ps = PorterStemmer()                             # Porter stemmer applied to each kept token
en_stopwords = set(stopwords.words('english'))   # a set, so membership tests are fast
tokenizer = RegexpTokenizer(r'\w+')              # tokens are runs of word characters

In [5]:
def CleanReview(review):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    The text is lower-cased, split with the module-level ``tokenizer``,
    filtered against ``en_stopwords``, and every surviving token is stemmed
    with ``ps``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces.
    """
    words = tokenizer.tokenize(review.lower())
    return ' '.join(ps.stem(word) for word in words if word not in en_stopwords)

In [31]:
# Run CleanReview on every review and store the result back in the same column.
# NOTE: this overwrites the raw text, so the cell is not idempotent; re-running
# it feeds already-cleaned text through CleanReview again.
df['review'] = df['review'].apply(CleanReview)

# Vectorization

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
# Token-count vectorizer with all default settings.
cv = CountVectorizer()

In [36]:
# Learn the vocabulary from the cleaned reviews and build the document-term
# count matrix; .toarray() converts the sparse result into a dense NumPy array.
reviews = cv.fit_transform(df['review']).toarray()
# Shape is (number of reviews, vocabulary size).
print(reviews.shape)

(1000, 12442)


## Dividing Dataset Into Training Data & Testing Data

In [7]:
from sklearn.model_selection import train_test_split

In [42]:
# Hold out 20% of the reviews for testing. random_state pins the shuffle so the
# split (and therefore every score reported below) is reproducible across
# kernel restarts instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    reviews, df['sentiment'], test_size=0.2, random_state=42
)

# 1) Multinomial Naive Bayes

In [44]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB

In [47]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
mnb = MultinomialNB()

In [51]:
# Train on the count features and sentiment labels of the training split.
mnb.fit(X_train,y_train)

In [72]:
# Predicted sentiment label for every review in the test split.
mnb.predict(X_test)

array(['negative', 'negative', 'positive', 'positive', 'positive',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'negative', 'positive', 'negative', 'negative', 'positive',
       'positive', 'positive', 'negative', 'positive', 'negative',
       'positive', 'positive', 'negative', 'negative', 'positive',
       'positive', 'negative', 'positive', 'positive', 'positive',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'negative', 'negative', 'positive', 'negative', 'positive',
       'negative', 'positive', 'negative', 'positive', 'negative',
       'positive', 'negative', 'negative', 'positive', 'positive',
       'positive', 'negative', 'positive', 'positive', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'positive', 'negative', 'positive', 'negative', 'negative',
       'negative', 'positive', 'negative', 'positive', 'negative',
       'negative', 'negative', 'positive', 'positive', 'positi

In [76]:
# Mean accuracy of the multinomial model on the test split.
mnb.score(X_test,y_test)

0.835

# 2) Multivariate Bernoulli Event Model Naive Bayes

In [59]:
# Bernoulli Naive Bayes; binarize=0.0 thresholds the count features so that any
# value above 0 is treated as 1 (term present) and 0 stays 0 (term absent).
bnb = BernoulliNB(binarize=0.0)

In [62]:
# Train on the same training split used for the multinomial model.
bnb.fit(X_train,y_train)

In [75]:
# Per-class probability estimates, one row per test review; the column order
# follows bnb.classes_.
bnb.predict_proba(X_test)

array([[9.99995956e-01, 4.04360515e-06],
       [9.99801974e-01, 1.98025836e-04],
       [2.00515155e-03, 9.97994848e-01],
       [9.68121467e-01, 3.18785331e-02],
       [2.21303135e-03, 9.97786969e-01],
       [9.99982056e-01, 1.79444638e-05],
       [9.85879107e-01, 1.41208930e-02],
       [9.91237722e-01, 8.76227751e-03],
       [6.79239927e-01, 3.20760073e-01],
       [2.64204685e-02, 9.73579531e-01],
       [9.99999998e-01, 1.58202954e-09],
       [2.94184219e-04, 9.99705816e-01],
       [9.99999979e-01, 2.08659626e-08],
       [9.98142723e-01, 1.85727714e-03],
       [4.80755686e-01, 5.19244314e-01],
       [3.88642616e-11, 1.00000000e+00],
       [1.92951840e-02, 9.80704816e-01],
       [9.97419373e-01, 2.58062715e-03],
       [2.50499475e-01, 7.49500525e-01],
       [9.94873578e-01, 5.12642244e-03],
       [1.08441738e-02, 9.89155826e-01],
       [4.36269787e-03, 9.95637302e-01],
       [9.99999695e-01, 3.04528245e-07],
       [9.99999942e-01, 5.81991598e-08],
       [1.969976

In [68]:
# Predicted sentiment label for every review in the test split.
bnb.predict(X_test)

array(['negative', 'negative', 'positive', 'negative', 'positive',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'negative', 'positive', 'negative', 'negative', 'positive',
       'positive', 'positive', 'negative', 'positive', 'negative',
       'positive', 'positive', 'negative', 'negative', 'positive',
       'positive', 'negative', 'positive', 'positive', 'positive',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'positive', 'negative', 'positive', 'negative', 'negative',
       'positive', 'positive', 'negative', 'positive', 'negative',
       'positive', 'negative', 'negative', 'negative', 'positive',
       'positive', 'negative', 'positive', 'positive', 'negative',
       'negative', 'negative', 'negative', 'positive', 'negative',
       'positive', 'negative', 'positive', 'negative', 'negative',
       'negative', 'positive', 'negative', 'positive', 'negative',
       'negative', 'negative', 'negative', 'positive', 'positi

In [71]:
# Mean accuracy of the Bernoulli model on the test split.
bnb.score(X_test,y_test)

0.79

# Generating Confusion Matrix 

In [79]:
"""
ACCURACY = (TP+TN)/Total Examples
PRECISION = TP/(TP+FP)
RECALL = TP/(TP+FN)
F-MEASURE = 2*TP/(2*TP+FP+FN)
"""

'\nACCURACY = (TP+TN)/Total Examples\nPRECISION = TP/(TP+FP)\nRECALL = TP/(TP+FN)\nF-MEASURE = 2*TP/(2*TP+FP+FN)\n'

In [None]:
from sklearn