##### Importing Necessary Packages

In [210]:
import nltk
import nltk.corpus
import string
import re
import numpy as np
import pandas as pd
import sklearn
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('wordnet')
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfTransformer
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\varma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\varma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Loading Data into Pandas Data Frame

In [211]:
#Replace the paths of training_file and testing_file with your local paths
training_file="C:/Users/varma/OneDrive/Documents/1643662645_8986752_1567602457_1187546_train_file.dat"
testing_file = "C:/Users/varma/OneDrive/Documents/1643662645_9617953_1567602457_126649_test.dat"

In [212]:
#load data from training file to pandas dataframe 
data = pd.read_table(training_file,names=["Sentiment", "Review"])
#fill null values in dataframe with text and inplace condition true
data.fillna("No review", inplace = True)
data.shape

(18506, 2)

# Preprocessing/Cleaning Data

In [213]:
#Convert text into lower case
data.Review=data.Review.str.lower()
#remove digits in text
data.Review=data.Review.str.replace('\d+', '')
#remove punctuation in the text data
data.Review=data.Review.str.replace('[{}]'.format(string.punctuation), '')
#strip leading and trailing whitespaces
data.Review=data.Review.str.strip()
print(data)

       Sentiment                                             Review
0              1  this book is such a life saver  it has been so...
1              1  i bought this a few times for my older son and...
2              1  this is great for basics but i wish the space ...
3              1  this book is perfect  im a first time new mom ...
4              1  during your postpartum stay at the hospital th...
...          ...                                                ...
18501         -1  i really liked this monitor at first but the s...
18502         -1  apparently you get what you pay for  ive used ...
18503         -1  the old saying holds true with this product  y...
18504         -1  we did a great deal of research before purchas...
18505         -1  i ordered these after having great success wit...

[18506 rows x 2 columns]


In [214]:
#Remove the stopwords from review data
stop_word = stopwords.words('english')
data.Review = data.Review.apply(lambda review: ' '.join([text for text in review.split() if text not in (stop_word)]))

In [215]:
#Performing stemming on review data
stemmer = SnowballStemmer("english")
data.Review = data.Review.apply(lambda review: ' '.join([stemmer.stem(text) for text in review.split()]))

In [216]:
#perform lemmatization on review data
lemmatizer = WordNetLemmatizer()
data.Review = data.Review.apply(lambda review: ' '.join([lemmatizer.lemmatize(text, pos ='v') for text in review.split()]))

# Traing the Model

In [217]:
#split data into training and test sets with test size as 0.25 of total data
X = data['Review']
y = data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size = 0.25)

###  Bag of Words and TF-IDF Representation

In [218]:
#bag_of_words is the vector containg the words of each review.where each position of the vector represnts a word and the value of that position represents the number of times that word is used in the review
bag_of_words = CountVectorizer(analyzer = "word",ngram_range = (1,2),max_features = 10000) 
bow_word_features= bag_of_words.fit_transform(X_train)
bow_word_features=bow_word_features.toarray()
# The tf_df transorm reduces the value of a given word in proportion to the number of documents that it appears in.
tf_idf =  TfidfTransformer()
tfidf_word_features = tf_idf.fit_transform(bow_word_features).toarray()
print(tfidf_word_features)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.05420796 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [219]:
#Checking the feature words
feature_words = bag_of_words.get_feature_names()
print(feature_words)

['aa', 'abil', 'abl', 'abl figur', 'abl find', 'abl fit', 'abl get', 'abl hear', 'abl hold', 'abl keep', 'abl move', 'abl open', 'abl pull', 'abl pump', 'abl push', 'abl put', 'abl remov', 'abl see', 'abl sit', 'abl sleep', 'abl take', 'abl use', 'absolut', 'absolut ador', 'absolut love', 'absolut recommend', 'absorb', 'abus', 'ac', 'accept', 'access', 'accessori', 'accid', 'accident', 'accommod', 'accomod', 'accomplish', 'accord', 'account', 'accumul', 'accur', 'ach', 'acid', 'acid reflux', 'across', 'across floor', 'act', 'action', 'activ', 'activ mat', 'actual', 'actual buy', 'actual fit', 'actual get', 'actual hold', 'actual like', 'actual make', 'actual stay', 'actual take', 'actual use', 'actual work', 'ad', 'ad bonus', 'adapt', 'adaptor', 'add', 'addit', 'address', 'aden', 'aden anai', 'adequ', 'adher', 'adhes', 'adjust', 'adjust babi', 'adjust fit', 'adjust height', 'adjust size', 'adjust strap', 'admit', 'ador', 'adult', 'adult toilet', 'advanc', 'advantag', 'advent', 'adverti

In [220]:
#Dataframe to see how tfidf values for the words.Here we see the count,word feature and tfidf values for the first review in data frame.
bow_dictionary = pd.DataFrame()
bow_dictionary['word'] = feature_words
bow_dictionary['count'] = bow_word_features[0]
bow_dictionary['tfidf_features'] = tfidf_word_features[0]
bow_dictionary.sort_values(by=['count'], ascending=False, inplace=True)
print(bow_dictionary.head(10))

              word  count  tfidf_features
4989          make      4        0.282670
1439         china      2        0.373606
3272           get      2        0.112274
8427       teether      1        0.161725
7079        return      1        0.105721
3702  green sprout      1        0.224666
5285   money worth      1        0.202989
3699         green      1        0.149032
6521       product      1        0.079064
6520        produc      1        0.187345


# Logistic Regression Classifier

In [221]:
#using the tfidf_word_features  to train the model.Passing the sentiment values in 'y_train' and tfidf_word_features with the help of fit function trains the logistic regression model
model = LogisticRegression(C = 1.1,random_state=0)
bag_of_words_model=model.fit(bow_word_features, y_train)
tf_idf_model = model.fit(tfidf_word_features, y_train)

In [222]:
#Applying the bag_of_words vectorizer to create word feature vectors for reviews in test data set
test_data_features = bag_of_words.transform(X_test)
test_data_features = test_data_features.toarray()
#Using tf_idf transform the word feature vectors are transformed in the same was as the review data in training data set
test_data_tfidf_features = tf_idf.fit_transform(test_data_features)
test_data_tfidf_features = test_data_tfidf_features.toarray()
#predict the sentiment using predict function
predicted_sentiment = tf_idf_model.predict(test_data_tfidf_features)
#matches the correctly identified sentiment values by comparing it with y_test data
correctly_predicted = predicted_sentiment == y_test
#using numpy mean to get accuracy of correctly predicted sentiment values
accuracy = np.mean(correctly_predicted) * 100
print (accuracy)

87.65939053382321


# Testing the Model

In [223]:
#load test data from the test file
test_data = pd.read_csv(testing_file,sep='\n\t\r',names=["Review"],skip_blank_lines=False) 
test_data.fillna("No review", inplace = True)

In [224]:
#perform the same preprocessing steps in the same way it was done with data in training file
test_data.Review=test_data.Review.str.lower()
test_data.Review=test_data.Review.str.replace('\d+', '')
test_data.Review=test_data.Review.str.replace('[{}]'.format(string.punctuation), '')
test_data.Review=test_data.Review.str.strip()
test_data.Review = test_data.Review.apply(lambda review: ' '.join([text for text in review.split() if text not in (stop_word)]))
test_data.Review = test_data.Review.apply(lambda review: ' '.join([stemmer.stem(text) for text in review.split()]))
test_data.Review = test_data.Review.apply(lambda review: ' '.join([lemmatizer.lemmatize(text, pos ='v') for text in review.split()]))
print(test_data)

                                                  Review
0      perfect new parent abl keep track babi fee sle...
1      help know exact babi day go mother law watch g...
2      want altern print daili log sheet nanni fill w...
3      month old son spend half day mother half neigh...
4      babi tracker brand book absolut best tracker a...
...                                                  ...
18501  wtf piec dont fit togeth instruct look noth li...
18502  ive go coupl video babi monitor compar one oth...
18503  monitor cheap doesnt work well half night stay...
18504  monitor work even attempt contact custom suppo...
18505  short stori disappoint qualiti think engin pro...

[18506 rows x 1 columns]


In [225]:
#apply bag_of_words vectorizer to transform the test data into word vectors in the same way as the training data set
test_file_features= bag_of_words.transform(test_data.Review)
test_file_features=test_file_features.toarray()
#apply the tf_idf transform so that word vectors from bag_of_words vectorizer are transformed in the same way as the training data set
test_file_tfidf_features = tf_idf.transform(test_file_features).toarray()
#Perform sentiment prediction of review
sentiment_prediction = tf_idf_model.predict(test_file_tfidf_features)
#output the predicted values
sp_list=list(sentiment_prediction)
for i in range(0,len(sp_list)):
    print(sp_list[i])

1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
-1
1
-1
1
1
1
1
-1
1
1
1
1
1
1
1
1
-1
1
-1
-1
1
1
1
1
1
1
1
-1
-1
-1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
-1
1
1
1
1
1
1
-1
1
1
1
1
-1
1
1
-1
-1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
-1
1
1
-1
1
1
-1
1
1
-1
1
1
1
1
-1
-1
1
-1
1
1
1
1
1
1
1
1
-1
1
-1
1
1
1
1
1
1
1
-1
1
-1
1
1
-1
-1
-1
-1
-1
-1
1
1
1
-1
1
1
1
1
-1
1
1
1
1
1
1
1
-1
1
-1
-1
-1
-1
-1
1
1
1
1
1
1
1
1
-1
1
1
1
-1
-1
1
1
-1
-1
1
1
-1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
-1
-1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
-1
1
-1
1
1
-1
1
1
1
1
1
-1
1
-1
1
1
1
1
1
-1
-1
1
1
1
1
1
1
1
1
1
-1
1
1
-1
1
1
1
1
1
-1
1
1
1
1
1
1
1
-1
1
1
-1
1
-1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
-1
1
1
1
-1
1
1
-1
1
1
1
1
1
1
-1
1
1
1
1


1
1
1
1
1
-1
-1
1
1
1
1
1
1
1
1
1
1
1
1
-1
-1
1
-1
1
1
1
1
1
-1
-1
-1
-1
1
1
-1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
-1
1
1
1
-1
1
1
1
1
1
-1
1
1
1
-1
1
1
1
1
1
-1
-1
1
1
-1
1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
1
1
-1
-1
-1
1
1
-1
1
-1
-1
-1
1
1
1
-1
1
-1
-1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
-1
-1
1
1
1
1
1
1
1
1
-1
1
1
1
-1
1
1
1
1
1
-1
1
1
1
1
1
1
-1
1
1
-1
1
1
1
1
1
1
1
-1
-1
1
1
1
1
-1
1
-1
1
1
1
-1
1
1
1
1
1
1
1
1
1
-1
1
1
1
-1
1
-1
1
-1
1
-1
1
-1

1
1
1
1
1
1
-1
1
-1
-1
1
1
1
1
1
1
1
1
-1
1
1
1
-1
1
1
-1
1
1
-1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
-1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
-1
-1
1
1
1
1
1
1
1
-1
1
1
1
1
-1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
-1
1
1
1
-1
1
1
1
1
-1
-1
1
1
1
1
1
1
1
1
1
1
-1
1
-1
1
-1
1
1
1
1
1
1
-1
-1
1
1
-1
1
1
1
1
1
1
1
1
1
-1
-1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
1
1
1
1
1
1
1
1
1
1
1
-1
1
1
-1
1
1
1
-1
-1
1
-1
1
1
1
-1
1
1
1
1
1
1
1
-1
1
1
1
-1
-1
1
-1
1
-1
-1
1
1
1
-1
1
1
-1
1
-1
1
1
1
-1
1
1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
-1
1
-1
-1
1
1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
1
-1
1
-1
1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
1
-1
-1
-1
-1
-1


-1
-1
1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
1
-1
-1
1
1
-1
-1
-1
-1
-1
-1
1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
1
-1
-1
1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
1
-1
-1
-1
-