In [53]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import pandas as pd
import re
import csv


In [54]:
# Load the labelled training reviews and the unlabelled test reviews.
# train_file.dat has no header row; its two columns are named "sentiment"
# and "review" here (read_table splits on tabs by default).
train_data = pd.read_table("train_file.dat", header=None, names=["sentiment", "review"])

# test1.csv holds one raw review per line. The previous
# pd.read_csv(..., sep='\n') call is rejected by newer pandas releases
# (ValueError: the line terminator may not be used as the separator), so the
# file is read line by line instead. Blank lines are skipped, mirroring
# read_csv's default skip_blank_lines=True. No CSV quote processing is
# applied: quote characters are kept verbatim in the review text.
with open("test1.csv", encoding="utf-8") as test_file:
    test_lines = [line.rstrip("\n") for line in test_file if line.strip()]
test_data = pd.DataFrame({"review_new": test_lines})

print(train_data)
print(test_data)

       sentiment                                             review
0              1  This book is such a life saver.  It has been s...
1              1  I bought this a few times for my older son and...
2              1  This is great for basics, but I wish the space...
3              1  This book is perfect!  I'm a first time new mo...
4              1  During your postpartum stay at the hospital th...
...          ...                                                ...
18501         -1  I really liked this monitor at first, but the ...
18502         -1  Apparently you get what you pay for.  I've use...
18503         -1  The old saying holds true with this product --...
18504         -1  We did a great deal of research before purchas...
18505         -1  I ordered these after having great success wit...

[18506 rows x 2 columns]
                                              review_new
0      Perfect for new parents. We were able to keep ...
1      Helps me know exactly how my babies d

In [55]:
# Normalise the review text of both the training and the test data:
# lowercase it, then remove every character that is not a-z or a space
# (digits, punctuation and other special characters).
def _clean_reviews(reviews):
    """Return `reviews` lowercased with everything except a-z and space removed.

    Parameters
    ----------
    reviews : pandas.Series
        Raw review texts; may contain missing or non-string values.

    Returns
    -------
    pandas.Series
        Cleaned strings on the same index as `reviews`.

    Missing values are mapped to the empty string. Previously a NaN review
    was passed through str() and became the literal word "nan", which then
    entered the vocabulary as a spurious token.
    """
    return (
        reviews.fillna("")
        .astype(str)
        .str.lower()
        .str.replace("[^a-z ]", "", regex=True)
    )


train_data['review'] = _clean_reviews(train_data['review'])
test_data['review_new'] = _clean_reviews(test_data['review_new'])

# Show the first cleaned review of each frame as a sanity check.
print(train_data.review[0])
print()
print(test_data.review_new[0])

this book is such a life saver  it has been so helpful to be able to go back to track trends answer pediatrician questions or communicate with each other when you are up at different times of the night with a newborn  i think it is one of those things that everyone should be required to have before they leave the hospital  we went through all the pages of the newborn version then moved to the infant version and will finish up the second infant book third total right as our baby turns   see other things that are must haves for baby at 

perfect for new parents we were able to keep track of babys feeding sleep and diaper change schedule for the first two and a half months of her life made life easier when the doctor would ask questions about habits because we had it all right there


In [56]:
# Training data: drop English stop words, stem the remaining words, and
# tokenize each review into a list of stems.
# `stemmer` and `stopWords` stay at notebook level because the test-data
# cell reuses them.
stemmer = SnowballStemmer("english")
stopWords = set(stopwords.words('english'))

train_tokens = []
for raw_text in train_data['review']:
    # Stop words are filtered on the unstemmed word, then the survivors are stemmed.
    stems = [stemmer.stem(term) for term in raw_text.split() if term not in stopWords]
    train_tokens.append(word_tokenize(str(' '.join(stems))))

train_data['review'] = train_tokens
print(train_data['review'][0])


['book', 'life', 'saver', 'help', 'abl', 'go', 'back', 'track', 'trend', 'answer', 'pediatrician', 'question', 'communic', 'differ', 'time', 'night', 'newborn', 'think', 'one', 'thing', 'everyon', 'requir', 'leav', 'hospit', 'went', 'page', 'newborn', 'version', 'move', 'infant', 'version', 'finish', 'second', 'infant', 'book', 'third', 'total', 'right', 'babi', 'turn', 'see', 'thing', 'must', 'have', 'babi']


In [57]:
# Test data: same preparation as for the training reviews - remove English
# stop words, stem what is left, then tokenize each review into a list.
test_stemmed = (
    ' '.join(stemmer.stem(term) for term in text.split() if term not in stopWords)
    for text in test_data['review_new']
)
test_data['review_new'] = [word_tokenize(str(joined)) for joined in test_stemmed]
print(test_data['review_new'][0])

['perfect', 'new', 'parent', 'abl', 'keep', 'track', 'babi', 'feed', 'sleep', 'diaper', 'chang', 'schedul', 'first', 'two', 'half', 'month', 'life', 'made', 'life', 'easier', 'doctor', 'would', 'ask', 'question', 'habit', 'right']


In [58]:
# Fit a TF-IDF vectorizer on the training reviews and build the training
# feature matrix X and label vector Y.
vectorizer = TfidfVectorizer()

# At this point each review is a list of tokens, but TfidfVectorizer expects
# one string per document. Values that are already strings are left alone so
# that re-running this cell does not split them into single characters
# (" ".join on a str joins its characters).
train_data['review'] = [
    review if isinstance(review, str) else " ".join(review)
    for review in train_data['review'].values
]

x_train = vectorizer.fit_transform(train_data['review'])
print(x_train.shape)

# Keep the features sparse. Densifying the (18506, 27864) matrix with
# .toarray() needs roughly 4 GB as float64, and LogisticRegression.fit
# accepts sparse matrices directly.
X = x_train
Y = train_data['sentiment']


(18506, 27864)


In [59]:
# Project the test reviews onto the vocabulary learned from the training
# data (transform only - the vectorizer is not refitted).

# Each review is a list of tokens here; join it back into one string per
# document. Values that are already strings are kept unchanged so that
# re-running this cell does not split them into single characters.
test_data['review_new'] = [
    review if isinstance(review, str) else " ".join(review)
    for review in test_data['review_new'].values
]

# The result stays sparse: predict() accepts sparse input, and a dense copy
# of a matrix this wide costs gigabytes of memory for no benefit.
x_test = vectorizer.transform(test_data['review_new'])
print(x_test.shape)

(18506, 27864)


In [60]:
# Fit a logistic-regression classifier on the training features X and the
# sentiment labels Y; the solver's iteration cap is set to 20000.
log_reg = LogisticRegression(max_iter=20000)
log_reg.fit(X=X, y=Y)


LogisticRegression(max_iter=20000)

In [61]:
# Use the fitted classifier to predict a label for every test feature row.
prediction = log_reg.predict(X=x_test)

In [62]:
# Write the predicted labels to output.txt, one label per line.
with open("output.txt", "w") as out_file:
    out_file.writelines(str(element) + "\n" for element in prediction)
