### Multinomial Event Model


In [2]:
# Toy training corpus: four positive reviews followed by three negative ones.
x = [
    "Awesome, this was an awesome movie",
    "Great movie! I liked it a lot",
    "Happy Ending! awesome acting by the hero",
    "loved it! truly great",
    "bad not upto the mark",
    "could have been better",
    "Surely a Disappointing movie",
]

# Labels aligned index-for-index with x: 1 = positive class, 0 = negative class.
y = [1] * 4 + [0] * 3

In [3]:
# Two unseen reviews, kept separate from the training corpus.
x_test = [
    "I was happy & happy and I loved the acting in the movie",
    "The movie I saw was bad",
]

### 1. Cleaning


In [20]:
## copy it from 02_file.
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import sys

# Shared text-processing objects, built once and reused by getCleanReview.
ps = PorterStemmer()                               # Porter stemmer applied to every kept token
eng_stopwords = set(stopwords.words('english'))    # a set, for fast membership tests
tokenizer = RegexpTokenizer(pattern=r'\w+')        # tokens are runs of word characters

def getCleanReview(review):
    """Turn one raw review string into a space-separated string of stemmed tokens.

    The text is lowercased, split with the module-level ``tokenizer``,
    filtered against ``eng_stopwords`` (the word "not" is always kept),
    and each surviving token is stemmed with ``ps``.

    Args:
        review: Raw review text.

    Returns:
        The stemmed, stopword-filtered tokens joined by single spaces.
    """
    kept_stems = []
    for word in tokenizer.tokenize(review.lower()):
        # "not" is retained even when it appears in the stopword set.
        if word == 'not' or word not in eng_stopwords:
            kept_stems.append(ps.stem(word))
    return ' '.join(kept_stems)

In [22]:
# Run the cleaning function over every training and test review.
x_clean = list(map(getCleanReview, x))
xt_clean = list(map(getCleanReview, x_test))

# Show both cleaned corpora, training first, one list per line.
print(x_clean, xt_clean, sep='\n')

['awesom awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad not upto mark', 'could better', 'sure disappoint movi']
['happi happi love act movi', 'movi saw bad']


### 2. Vectorization

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# Count both single tokens and adjacent token pairs (unigrams + bigrams).
cv = CountVectorizer(ngram_range=(1, 2))

# fit_transform learns the vocabulary from the training text and returns a
# sparse count matrix; toarray() densifies it so it prints readably.
x_vec = cv.fit_transform(x_clean).toarray()
print(x_vec, x_vec.shape, sep='\n')

[[0 0 2 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0]]
(7, 36)


In [34]:
# NOTE(review): scratch dict; no other cell in the visible notebook reads `d`.
d = dict(a=1, b=2)

In [39]:
# List the learned n-grams alphabetically; position i in this list matches
# column i of the printed count matrices.
print(sorted(cv.vocabulary_.keys()))

['act', 'act hero', 'awesom', 'awesom act', 'awesom awesom', 'awesom movi', 'bad', 'bad not', 'better', 'could', 'could better', 'disappoint', 'disappoint movi', 'end', 'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero', 'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi', 'movi like', 'not', 'not upto', 'sure', 'sure disappoint', 'truli', 'truli great', 'upto', 'upto mark']


#### Vectorization on the test set: 


In [41]:
# Reuse the vocabulary fitted on the training text (transform only, no re-fit);
# n-grams that were not seen during fitting get no column and are dropped.
xt_vec = cv.transform(xt_clean).toarray()
print(xt_vec, xt_vec.shape, sep='\n')

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]]
(2, 36)


### 3. Multinomial Naive Bayes

In [42]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB, GaussianNB

In [43]:
# Multinomial Naive Bayes classifier, constructed with no arguments (library defaults).
mnb = MultinomialNB()
print(mnb)

MultinomialNB()


In [44]:
# Train on the training count vectors (x_vec) and their 0/1 sentiment labels (y).
mnb.fit(x_vec,y)

MultinomialNB()

In [45]:
# Predict a class label (1 = positive, 0 = negative) for each test review.
mnb.predict(xt_vec)

array([1, 0])

In [46]:
# Per-class probabilities for each test review, one row per review.
# Columns follow mnb.classes_, i.e. label 0 (negative) then label 1 (positive).
mnb.predict_proba(xt_vec)

array([[0.07661933, 0.92338067],
       [0.59643134, 0.40356866]])

In [47]:
# Mean accuracy on the same data the model was fitted on: a sanity check only,
# not an estimate of performance on unseen reviews.
mnb.score(x_vec,y)

1.0

### 4. Multivariate Bernoulli Event Model Naive Bayes
- For comparison, fit Bernoulli NB on the same count vectors; it binarizes each count into word presence/absence:

In [50]:
## Create an instance of BernoulliNB.
# binarize=0.0 thresholds every feature at 0, so any count above 0 is treated as 1 (present).
bnb = BernoulliNB(binarize=0.0)

In [52]:
## Train on the same count vectors and labels that were used for mnb.
bnb.fit(x_vec,y)

BernoulliNB()

In [53]:
# Per-class probabilities for each test review, one row per review.
# Columns follow bnb.classes_, i.e. label 0 (negative) then label 1 (positive).
bnb.predict_proba(xt_vec)

array([[0.0581288, 0.9418712],
       [0.6220379, 0.3779621]])

In [54]:
# Predict a class label (1 = positive, 0 = negative) for each test review.
bnb.predict(xt_vec)

array([1, 0])

In [55]:
# Mean accuracy on the same data the model was fitted on: a sanity check only,
# not an estimate of performance on unseen reviews.
bnb.score(x_vec,y)

1.0