In [49]:
# -*- coding: utf-8 -*-
"""
How to Prepare Text Data for Machine Learning with scikit-learn
"""

import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Reviews.csv')
dataset = dataset.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
dataset2 = ('Monster_page1_results.csv')

In [40]:
#bs4 for Removing HTML tags from the text
from bs4 import BeautifulSoup
#re helps in Removing Alphanumeric Text and Special Characters
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sparshk10/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sparshk10/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [41]:
#removing punctuations in the text
    
def removeApostrophe(review):
    phrase = re.sub(r"won't", "will not", review)
    phrase = re.sub(r"can\'t", "can not", review)
    phrase = re.sub(r"n\'t", " not", review)
    phrase = re.sub(r"\'re", " are", review)
    phrase = re.sub(r"\'s", " is", review)
    phrase = re.sub(r"\'d", " would", review)
    phrase = re.sub(r"\'ll", " will", review)
    phrase = re.sub(r"\'t", " not", review)
    phrase = re.sub(r"\'ve", " have", review)
    phrase = re.sub(r"\'m", " am", review)
    return phrase

In [42]:
#removing HTML tags
 
def removeHTMLTags(review):
    soup = BeautifulSoup(review, 'lxml')
    return soup.get_text()

In [43]:
#removing special characters
 
def removeSpecialChars(review):
    return re.sub('[^a-zA-Z]', ' ', review)


In [44]:
#removing Alpha numeric words

def removeAlphaNumericWords(review):
    return re.sub("\S*\d\S*", "", review).strip()

In [3]:
# Tokenization (parsing text into a list of words)
# Stopwords Removal (is, am, are)
# Lowercasing (so that uppercase and locerse versions of one word are equal)
# Lemmatization (removes the inflectional endings of the word by using the vocabulary and morphological analysis of words)

def doTextCleaning(review):
    review = removeHTMLTags(review)
    review = removeApostrophe(review)
    review = removeAlphaNumericWords(review)
    review = removeSpecialChars(review) 
 
    review = review.lower()  # Lower casing
    review = review.split()  # Tokenization
    
    #Removing Stopwords and Lemmatization
    lmtzr = WordNetLemmatizer()
    review = [lmtzr.lemmatize(word, 'v') for word in review if not word in set(stopwords.words('english'))]
    
    review = " ".join(review)    
    return review

In [2]:
# Creates document corpus (collection of all reviews)

corpus = []   
for index, row in tqdm(dataset.iterrows()):
    review = doTextCleaning(row['Text'])
    corpus.append(review)

NameError: name 'tqdm' is not defined

In [47]:
# Converts the reviews into Numeric Vectors

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
 
# Creating the transform with Tri-gram
cv = CountVectorizer(ngram_range=(1,3), max_features = 2)
 
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,6].values

In [50]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
 
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
 
# Creating Naive Bayes classifier
classifier = GaussianNB()
 
# Fitting the training set into the Naive Bayes classifier
classifier.fit(X_train, y_train)

GaussianNB()

In [51]:
#Predict sentiment for new Review
def predictNewReview():
    newReview = input("Type the Review: ")
    
    if newReview =='':
        print('Invalid Review')  
    else:
        newReview = doTextCleaning(newReview)
        reviewVector = cv.transform([newReview]).toarray()  
        prediction =  classifier.predict(reviewVector)
        if prediction[0] == 1:
            print( "Positive Review" )
        else:        
            print( "Negative Review")

In [52]:
predictNewReview()


Type the Review: fd
Negative Review


In [53]:
predictNewReview()


Type the Review: the cookies are bad
Negative Review


In [54]:
predictNewReview()


Type the Review: the cookies are good
Negative Review


In [55]:
predictNewReview()


Type the Review: I love the shrimp poppers!
Negative Review


In [56]:
predictNewReview()


Type the Review: Lemons are bad
Negative Review
