# MOVIE REVIEW PREDICTION USING NAIVE BAYES ALGORITHM (using scikit-learn)

In [50]:
# Toy training corpus: short movie reviews, positive ones first.
x = [
    # positive reviews
    "This was an awesome movie",
    "Great movie! I liked it a lot",
    "Happy Ending! awesome acting by the hero",
    "loved it! truly great",
    # negative reviews
    "bad not upto the mark",
    "could have better",
    "this movie was a disappointment",
]

In [67]:
# Labels aligned with x: 1 = positive review, 0 = negative review.
# The first four reviews are positive, the last three negative.
y = [1] * 4 + [0] * 3

### NLTK pipeline: tokenization, stopword removal, stemming (followed by vectorization with scikit-learn)

In [52]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [53]:
# Shared preprocessing objects used by getStemmedReviews.

# Porter stemmer: reduces a word to its stem (e.g. "awesome" -> "awesom").
ps = PorterStemmer()

# NLTK's English stopword list (is, am, has, the, ...), kept as a set for
# fast membership tests.
# NOTE(review): needs the NLTK "stopwords" corpus to be available locally
# (nltk.download('stopwords')) -- confirm for the target environment.
eng_stopwords = set(stopwords.words('english'))

# Keeps runs of word characters (letters, digits, underscore) as tokens,
# which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')

In [54]:
def getStemmedReviews(review):
    """Normalize a single review string for vectorization.

    The review is lowercased, split into tokens with the module-level
    ``tokenizer``, stripped of any token found in ``eng_stopwords``, and each
    remaining token is stemmed with ``ps``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces, e.g.
        "This was an awesome movie" -> "awesom movi".
    """
    words = tokenizer.tokenize(review.lower())
    # Filter and stem in one pass; order of the surviving tokens is kept.
    stems = (ps.stem(word) for word in words if word not in eng_stopwords)
    return ' '.join(stems)

In [55]:
# Apply the preprocessing function to every training review.
x_clean = list(map(getStemmedReviews, x))


In [69]:
# Show the cleaned reviews to check the preprocessing result.
print(x_clean)

['awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad upto mark', 'could better', 'movi disappoint']


In [57]:
from sklearn.feature_extraction.text import CountVectorizer

In [58]:
# Vectorization: convert each cleaned review into a row of token counts so
# the classifier can work with numerical data.
# ngram_range=(1, 2) keeps both single words (unigrams) and pairs of
# consecutive words (bigrams) as features.
cv = CountVectorizer(ngram_range=(1, 2))
x_counts = cv.fit_transform(x_clean)  # learns the vocabulary and counts terms
x_vec = x_counts.toarray()            # dense array, one row per review
print(x_vec)

[[0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]]


In [70]:
# Vocabulary learned by the vectorizer, in column order of x_vec.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert to a plain list to keep the printed output unchanged.
print(cv.get_feature_names_out().tolist())

['act', 'act hero', 'awesom', 'awesom act', 'awesom movi', 'bad', 'bad upto', 'better', 'could', 'could better', 'disappoint', 'end', 'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero', 'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi', 'movi disappoint', 'movi like', 'truli', 'truli great', 'upto', 'upto mark']


### MULTINOMIAL NAIVE BAYES USING SCIKIT-LEARN

In [71]:
from sklearn.naive_bayes import MultinomialNB

In [78]:
# Multinomial Naive Bayes classifier, trained on the count features in x_vec
# with the 0/1 sentiment labels in y.
mb = MultinomialNB()
mb.fit(x_vec,y) # training; fit() is the last expression, so the cell displays the fitted estimator

MultinomialNB()

In [73]:
# Unseen reviews to classify; they go through the same preprocessing as the
# training reviews.
test_x = [
    "I was happy happy and I loved the acting in the movie",
    "The movie I saw was bad",
]
test_clean = list(map(getStemmedReviews, test_x))
print(test_clean)

['happi happi love act movi', 'movi saw bad']


In [74]:
# Use transform (not fit_transform) so the test reviews are encoded with the
# vocabulary learned from the training data; the column count therefore
# matches x_vec.
xt_counts = cv.transform(test_clean)
xt_vec = xt_counts.toarray()
print(xt_vec.shape)

(2, 31)


In [75]:

# Feature names in column order of x_vec / xt_vec.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() turns the returned
# NumPy array into a plain list so the cell output looks the same as before.
cv.get_feature_names_out().tolist()

['act',
 'act hero',
 'awesom',
 'awesom act',
 'awesom movi',
 'bad',
 'bad upto',
 'better',
 'could',
 'could better',
 'disappoint',
 'end',
 'end awesom',
 'great',
 'great movi',
 'happi',
 'happi end',
 'hero',
 'like',
 'like lot',
 'lot',
 'love',
 'love truli',
 'mark',
 'movi',
 'movi disappoint',
 'movi like',
 'truli',
 'truli great',
 'upto',
 'upto mark']

In [76]:
# Predicted label for each test review (1 = positive, 0 = negative).
mb.predict(xt_vec)

array([1, 0])

In [77]:
# Class probabilities for each test review, one row per review.
# Columns follow the order of mb.classes_, i.e. [P(label 0), P(label 1)]
# for the 0/1 labels used here.
mb.predict_proba(xt_vec)

array([[0.10741537, 0.89258463],
       [0.63165588, 0.36834412]])