### Textual data cleaning

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Initializing objects
tokenizer=RegexpTokenizer(r'\w+') # To extract all the words
ps=PorterStemmer()
en_stopwords=set(stopwords.words('english'))
print(en_stopwords)

{'than', 'are', 'his', 'ain', 're', 'below', 'myself', 'herself', 'about', 'there', 'so', 'had', 'hasn', 'by', 'her', 'too', 'doesn', 'further', 'few', 'against', 'from', 'hers', 'just', "wasn't", "don't", 'itself', 'can', 'how', 'why', 'mustn', 'o', "couldn't", 'or', 'other', 'does', 'couldn', 'into', 'same', 'over', 'is', "shan't", 'doing', "should've", 'd', 'that', 'were', 'hadn', 'both', 'as', 'no', 'wasn', "that'll", 'with', 'mightn', "mustn't", 'ours', "didn't", 'those', 'ourselves', "aren't", 'on', 'themselves', 'the', 'where', 'now', 'out', 'nor', 'have', 'again', 'until', 'an', 'needn', 'himself', 'yourselves', 'our', 'they', 'in', 'should', 'you', "needn't", "isn't", 'isn', 'will', 'them', 'but', 'once', 'not', 'what', 'has', "she's", 'be', 'before', 'during', 'above', 'theirs', 'me', 'she', 'your', 'between', 'more', "you'll", 'if', 'been', 'which', 'after', "haven't", 'i', 'own', 'down', 'don', "hadn't", 'through', 'and', 'under', 'when', 'shan', 'we', 'only', 'some', 'll',

In [3]:
# Removing negative words from stopwords
negative_words=["not", "won't", "wasn't", "weren't", "nor", "neither", "mustn't", "mightn't", "shouldn't", "haven't", "hasn't", "hadn't", "don't", "didn't", "shan't", "couldn't"] 
en_stopwords=[word for word in en_stopwords if word not in negative_words]
print(en_stopwords)

['than', 'are', 'his', 'ain', 're', 'below', 'myself', 'herself', 'about', 'there', 'so', 'had', 'hasn', 'by', 'her', 'too', 'doesn', 'further', 'few', 'against', 'from', 'hers', 'just', 'itself', 'can', 'how', 'why', 'mustn', 'o', 'or', 'other', 'does', 'couldn', 'into', 'same', 'over', 'is', 'doing', "should've", 'd', 'that', 'were', 'hadn', 'both', 'as', 'no', 'wasn', "that'll", 'with', 'mightn', 'ours', 'those', 'ourselves', "aren't", 'on', 'themselves', 'the', 'where', 'now', 'out', 'have', 'again', 'until', 'an', 'needn', 'himself', 'yourselves', 'our', 'they', 'in', 'should', 'you', "needn't", "isn't", 'isn', 'will', 'them', 'but', 'once', 'what', 'has', "she's", 'be', 'before', 'during', 'above', 'theirs', 'me', 'she', 'your', 'between', 'more', "you'll", 'if', 'been', 'which', 'after', 'i', 'own', 'down', 'don', 'through', 'and', 'under', 'when', 'shan', 'we', 'only', 'some', 'll', 'to', 'each', 'did', 'up', 'most', 'wouldn', 'shouldn', 'him', 'because', "wouldn't", 'aren', "d

In [4]:
def getStemmedReview(review):
    review=review.lower()
    review=review.replace('<br/>', ' ')
    # Tokenize
    tokens=tokenizer.tokenize(review)
    filtered_tokens=[token for token in tokens if token not in en_stopwords]
    stemmed_tokens=[ps.stem(token) for token in filtered_tokens]
    stemmed_review=' '.join(stemmed_tokens)
    return stemmed_review

In [5]:
def getStemmedDocument(outputFile, inputFile):
    out=open(outputFile, 'w', encoding='utf-8')
    with open(inputFile, encoding='utf-8') as f:
        reviews=f.readlines()
    for review in reviews:
        stemmed_review=getStemmedReview(review)
        print(stemmed_review, file=out)
    out.close()

### Using multinomial NB

In [6]:
X=['This was an awesome movie.',
  'Great movie! I liked it a lot.',
   'Bad, not upto the mark.',
   'Happy ending. Awesome acting by the hero.',
   'Surely a disappointing movie',
   'Loved it! Truely great.',
   'Could have been better.'
  ]
Y=[1, 1, 0, 1, 0, 1, 0] # 1-Positive 0-Negative

In [7]:
Xt=['I was happy and I loved the acting in the movie',
   'The movie that I saw was not bad.']

In [8]:
# Cleaning
X_clean=[getStemmedReview(i) for i in X]
Xt_clean=[getStemmedReview(i) for i in Xt]

In [9]:
X_clean

['awesom movi',
 'great movi like lot',
 'bad not upto mark',
 'happi end awesom act hero',
 'sure disappoint movi',
 'love trueli great',
 'could better']

In [10]:
Xt_clean

['happi love act movi', 'movi saw not bad']

In [11]:
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [12]:
X_vec=cv.fit_transform(X_clean).toarray()
print(X_vec)
print(cv.get_feature_names()) # Vocab

[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1]
 [1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0]
 [0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
['act', 'awesom', 'bad', 'better', 'could', 'disappoint', 'end', 'great', 'happi', 'hero', 'like', 'lot', 'love', 'mark', 'movi', 'not', 'sure', 'trueli', 'upto']


In [13]:
Xt_vec=cv.transform(Xt_clean).toarray()
print(Xt_vec)

[[1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0]]


In [14]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()

In [15]:
mnb.fit(X_vec, Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
mnb.predict(Xt_vec)

array([1, 0])

In [17]:
# Posterior probalilities
mnb.predict_proba(Xt_vec)

array([[0.10761126, 0.89238874],
       [0.766035  , 0.233965  ]])

### Using multivariate berouli NB

In [18]:
from sklearn.naive_bayes import BernoulliNB
bnb=BernoulliNB(binarize=0.0)

In [19]:
bnb.fit(X_vec, Y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [20]:
mnb.predict(Xt_vec)

array([1, 0])

In [21]:
# Posterior probalilities
mnb.predict_proba(Xt_vec)

array([[0.10761126, 0.89238874],
       [0.766035  , 0.233965  ]])