# Create an NLP Pipeline to 'Clean' Reviews Data
    => Load Input File and Read Reviews
    => Tokenize
    => Remove Stopwords
    => Perform Stemming
    => Write cleaned data to output file

In [13]:
# Raw sample review: mixed case, punctuation and an embedded "<br /><br />"
# HTML break -- the kinds of noise the cleaning steps in this notebook handle.
sample_text = " I loved this movie since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all.It's a movie to watch with your family by far.<br /><br />My Papa rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language. "

# NLTK

In [27]:
# Create NLTK pipeline
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import sys

In [21]:
# Init objects (module-level; getCleanedReview reads all three as globals)
# Tokenizer built on the pattern \w+, i.e. tokens are runs of word characters
tokenizer = RegexpTokenizer(r'\w+')
# English stopword list, stored as a set for the membership test used when filtering tokens
en_stopwords = set(stopwords.words('english'))
# Porter stemmer applied to every token that survives stopword removal
ps = PorterStemmer()

In [31]:
# Save this function as a Python script "clean_text"
def getCleanedReview(review):
    """Return a cleaned, stemmed version of a single review string.

    The review is lowercased, every "<br /><br />" marker is replaced by a
    space, and the text is split with the module-level ``tokenizer``. Tokens
    found in ``en_stopwords`` are dropped, the remaining ones are stemmed with
    ``ps`` and the stems are joined back together with single spaces.
    """
    # Normalise case first, then strip the double HTML line break.
    text = review.lower().replace("<br /><br />"," ")

    stems = []
    for word in tokenizer.tokenize(text):
        # Skip stopwords before stemming so only content words are stemmed.
        if word in en_stopwords:
            continue
        stems.append(ps.stem(word))

    return ' '.join(stems)



In [None]:
# Write one function that accepts an input file and writes a cleaned output file of movie reviews
def getStemmedDocument(inputFile,outputFile):
    """Clean every review in ``inputFile`` and write the results to ``outputFile``.

    Each line of the input file is treated as one review. Every review is
    passed through ``getCleanedReview`` and the cleaned text is written to the
    output file, one review per line, in the same order as the input.

    Args:
        inputFile: path of the UTF-8 text file to read (one review per line).
        outputFile: path of the file to create or overwrite with the cleaned
            reviews (written as UTF-8).
    """
    # Read everything before opening the output, so the input is fully
    # consumed before any output file is created or truncated.
    with open(inputFile,encoding="utf8") as f:
        reviews = f.readlines()

    # The with-block closes the output file even if cleaning a review raises.
    with open(outputFile,"w",encoding="utf8") as out:
        for review in reviews:
            print(getCleanedReview(review),file=out)
    
# Read command-line arguments: <input file> <output file>
# NOTE(review): meant for when this cell is saved and run as a standalone
# script; inside a notebook kernel sys.argv holds the kernel's own arguments.

inputFile = sys.argv[1]
outputFile = sys.argv[2]
getStemmedDocument(inputFile,outputFile)
        
        

# Multinomial Event Model

In [70]:
# Toy training corpus: seven short reviews; y holds one label per review, in the same order
x = [
    "This was an awesome movie",
    "Great movie! I liked it a lot",
    "Happy Ending! awesome acting by the hero",
    "loved it! truly great",
    "bad not upto the mark",
    "could have better",
    "Surely a Disaapointing movie"
]

y = [1,1,1,1,0,0,0] # 1-Positive,0-Negative reviews

In [81]:
# Two reviews that are not part of the training list, used to try out the fitted classifiers
x_test = ["I was happy & happy and I loved the acting in the movie",
         "The movie I saw was not good"]

### 1.Cleaning

In [39]:
# Import the cleaning function that you saved earlier as a script
#import clean_text as ct

In [82]:
# Run every training and test review through the cleaning pipeline.
# (If the function was saved in the clean_text script and imported as ct,
#  call ct.getCleanedReview here instead.)
x_clean = list(map(getCleanedReview, x))
xt_clean = list(map(getCleanedReview, x_test))

In [73]:
# Inspect the cleaned training reviews (lowercased, stopwords removed, stemmed)
print(x_clean)

['awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad upto mark', 'could better', 'sure disaapoint movi']


### 2.Vectorisation

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
# ngram_range=(1,2): the vocabulary contains both single tokens and pairs of adjacent tokens
cv = CountVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned training reviews and build their count
# matrix; toarray() turns the result into a dense array so it prints in full
x_vec = cv.fit_transform(x_clean).toarray()
print(x_vec)
print(x_vec.shape)

[[0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0]]
(7, 33)


In [83]:
# List the learned vocabulary (column order of the count matrix).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() when it exists and fall back otherwise.
if hasattr(cv, "get_feature_names_out"):
    # tolist() converts the returned array into a plain list of str for printing
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names)

['act', 'act hero', 'awesom', 'awesom act', 'awesom movi', 'bad', 'bad upto', 'better', 'could', 'could better', 'disaapoint', 'disaapoint movi', 'end', 'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero', 'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi', 'movi like', 'sure', 'sure disaapoint', 'truli', 'truli great', 'upto', 'upto mark']


In [84]:
## Vectorization on the test set

# Use transform (not fit_transform) so the test reviews are encoded with the
# vocabulary already learned from the training reviews; refitting here would
# produce a different feature space from the one the classifiers are fitted on.
xt_vec = cv.transform(xt_clean).toarray()
print(xt_vec)
print(xt_vec.shape)

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]
(2, 33)


## 3. Multinomial Naive Bayes

In [65]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB


In [67]:
# Multinomial Naive Bayes with default settings; printing the estimator shows the parameter values in use
mnb = MultinomialNB()
print(mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [85]:
# Training: fit the multinomial model on the training count vectors and their labels
mnb.fit(x_vec,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [86]:
# Predict a class label for each vectorized test review
mnb.predict(xt_vec)

array([1, 1])

In [88]:
# Per-class probabilities for the test reviews (one row per review)
mnb.predict_proba(xt_vec)

array([[0.0836554 , 0.9163446 ],
       [0.38255034, 0.61744966]])

In [96]:
# Score on the same x_vec/y the model was fitted on, so this is training accuracy, not a held-out estimate
mnb.score(x_vec,y)

1.0

### 4.Multivariate Bernoulli Event Model Naive Bayes

In [91]:
# Bernoulli Naive Bayes works on binary features; binarize=0.0 is the threshold
# applied to the input, so any count above 0 is treated as "feature present"
bnb = BernoulliNB(binarize =0.0)
print(bnb)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


In [92]:
# Fit the Bernoulli model on the same training count vectors and labels
bnb.fit(x_vec,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [93]:
# Per-class probabilities for the test reviews (one row per review)
bnb.predict_proba(xt_vec)

array([[0.09025538, 0.90974462],
       [0.44248591, 0.55751409]])

In [94]:
# Predict a class label for each vectorized test review
bnb.predict(xt_vec)

array([1, 1])

In [95]:
# Score on the same x_vec/y the model was fitted on, so this is training accuracy, not a held-out estimate
bnb.score(x_vec,y)

1.0