### Multinomial Event Model

In [1]:
# Toy training corpus. Each review sits next to its sentiment label
# (1 - Positive, 0 - Negative Class) so text and label stay visibly aligned.
labelled_reviews = [
    ("This was awesome, an awesome movie", 1),
    ("Great movie! I liked it a lot", 1),
    ("Happy Ending! awesome acting by the hero", 1),
    ("loved it! truly great", 1),
    ("bad not upto the mark", 0),
    ("could have been better", 0),
    ("That movie was surely not a bad one!", 1),
    ("Surely a disappointing movie", 0),
]

# Split the pairs back into the parallel lists used by the rest of the notebook.
x = [text for text, _ in labelled_reviews]
y = [label for _, label in labelled_reviews]

In [2]:
# Held-out reviews used to probe the trained classifiers. The scoring cells
# further down compare predictions for these three against the labels [1, 0, 1].
x_test = ["I was happy & happy and I loved the acting in the movie",
          "The movie I saw was not good",
          "The movie I saw was not bad"]

#### 1. Cleaning

In [3]:
# import clean_text as ct

In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [5]:
# Tokenizer that keeps runs of word characters (\w+), so punctuation is dropped.
tk = RegexpTokenizer(r'\w+')
# English stop-word list from the NLTK corpus.
# NOTE(review): presumably requires the corpus locally (nltk.download('stopwords')) — confirm.
sw = stopwords.words('english')
# Snowball stemmer for English, used to reduce each word to its stem.
ss = SnowballStemmer('english')

In [6]:
def clean_text(review, tokenizer=None, stop_words=None, stemmer=None):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    The review is lowercased, HTML line breaks (``<br />``) are replaced by a
    space, the text is tokenized, stop words are removed (the word 'not' is
    always kept because it carries the negation), and the remaining words are
    stemmed and joined with single spaces.

    Parameters
    ----------
    review : str
        Raw review text.
    tokenizer : object with ``tokenize(str) -> list[str]``, optional
        Defaults to the notebook-level ``tk``.
    stop_words : container of str, optional
        Words to discard. Defaults to the notebook-level ``sw``.
    stemmer : object with ``stem(str) -> str``, optional
        Defaults to the notebook-level ``ss``.

    Returns
    -------
    str
        The cleaned review; an empty string if no token survives.
    """
    # Fall back to the notebook globals so existing clean_text(review) calls
    # keep working unchanged.
    tokenizer = tk if tokenizer is None else tokenizer
    stop_words = sw if stop_words is None else stop_words
    stemmer = ss if stemmer is None else stemmer

    review = review.lower()
    # str.replace returns a new string. The result must be re-assigned,
    # otherwise "<br />" stays in the text and leaks a "br" token.
    review = review.replace("<br />", " ")

    words = tokenizer.tokenize(review)
    kept_words = [word for word in words if word == 'not' or word not in stop_words]

    return ' '.join(stemmer.stem(word) for word in kept_words)

In [7]:
# Run every training and test review through the same cleaning function.
x_clean = list(map(clean_text, x))
xt_clean = list(map(clean_text, x_test))

print(x_clean)
print(xt_clean)

['awesom awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad not upto mark', 'could better', 'movi sure not bad one', 'sure disappoint movi']
['happi happi love act movi', 'movi saw not good', 'movi saw not bad']


#### 2. Vectorization
- scikit-learn's Naive Bayes estimators take numeric feature vectors (a dense array or a sparse matrix) in `fit()`, so we convert the text reviews into count vectors using the Bag of Words model.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# ngram_range=(1,2): the vocabulary holds single words and adjacent word pairs
# (e.g. 'not' and 'not bad'), so some negation context is captured.
cv = CountVectorizer(ngram_range=(1,2))

In [10]:
# Learn the vocabulary from the cleaned training reviews and build their count
# matrix; .toarray() turns the result into a plain array for printing.
x_vec = cv.fit_transform(x_clean).toarray()
print(x_vec)
print(x_vec.shape)

[[0 0 2 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0
  0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
  0 1 1 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0
  0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 0
  1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1
  0 0 0 0 0]]
(8, 41)


In [11]:
# transform() only (no fit): the test reviews are encoded with the vocabulary
# learned from the training data, so both matrices have the same columns.
xt_vec = cv.transform(xt_clean).toarray()
print(xt_vec)
print(xt_vec.shape)

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0
  0 0 0 0 0]]
(3, 41)


In [12]:
# Vocabulary in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back to the old method only on versions that lack it.
# .tolist() yields plain Python strings, so the output is still a list.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names)

['act', 'act hero', 'awesom', 'awesom act', 'awesom awesom', 'awesom movi', 'bad', 'bad not', 'bad one', 'better', 'could', 'could better', 'disappoint', 'disappoint movi', 'end', 'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero', 'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi', 'movi like', 'movi sure', 'not', 'not bad', 'not upto', 'one', 'sure', 'sure disappoint', 'sure not', 'truli', 'truli great', 'upto', 'upto mark']


#### 3. Multinomial Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
# Multinomial Naive Bayes with default settings (the repr below shows alpha=1.0).
mnb = MultinomialNB()
print(type(mnb))
mnb

<class 'sklearn.naive_bayes.MultinomialNB'>


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
# Train on the bag-of-words counts of the training reviews and their labels.
mnb.fit(x_vec, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Predicting on training data (sanity check: the output should match y)
mnb.predict(x_vec)

array([1, 1, 1, 1, 0, 0, 1, 0])

In [17]:
# Predicting on testing data
print(x_test)
mnb.predict(xt_vec)  # => Against the expected labels [1, 0, 1], the 2nd review ("not good") is misclassified as positive; the 3rd ("not bad") is predicted positive, as expected

['I was happy & happy and I loved the acting in the movie', 'The movie I saw was not good', 'The movie I saw was not bad']


array([1, 1, 1])

In [18]:
# Per-review class probabilities; each row sums to 1.
# NOTE(review): columns presumably follow mnb.classes_ (0 then 1) — confirm.
mnb.predict_proba(xt_vec)

array([[0.07946391, 0.92053609],
       [0.35589878, 0.64410122],
       [0.33724552, 0.66275448]])

In [19]:
# Accuracy on the training data, then on the three test reviews
# ([1,0,1] are the expected labels for x_test: positive, negative, positive).
print(mnb.score(x_vec, y))
print(mnb.score(xt_vec, [1,0,1]))

1.0
0.6666666666666666


#### 4. Multivariate Bernoulli Naive Bayes

In [20]:
from sklearn.naive_bayes import BernoulliNB

In [21]:
# binarize=0.0 is the threshold used to turn the features into booleans:
# any count above 0 becomes 1, so only word presence/absence is modelled.
bnb = BernoulliNB(binarize=0.0)

In [22]:
# Show the estimator's parameters.
print(bnb)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


In [23]:
# Train on the same count matrix and labels as the multinomial model.
bnb.fit(x_vec, y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [24]:
# Predict the test reviews; compare with the multinomial model's predictions above.
bnb.predict(xt_vec)

array([1, 1, 1])

In [25]:
# Accuracy on the training data, then on the three test reviews
# ([1,0,1] are the expected labels for x_test: positive, negative, positive).
print(bnb.score(x_vec, y))
print(bnb.score(xt_vec, [1,0,1]))

1.0
0.6666666666666666


In [26]:
# Per-review class probabilities from the Bernoulli model; each row sums to 1.
# NOTE(review): columns presumably follow bnb.classes_ (0 then 1) — confirm.
bnb.predict_proba(xt_vec)

array([[0.03451745, 0.96548255],
       [0.19618255, 0.80381745],
       [0.2026999 , 0.7973001 ]])