In [153]:
#Assigning texts from files to lists
import numpy as np

file = open("Train data.dat", encoding='utf-8')
testfile = open("Test data.dat", encoding='utf-8')

#Training data
def labtxt_split(f): # to store labels(sentimental value) and reviews seperately from train.dat
    
    labels = []
    texts = []
    
    for x in f:
        labels.append(x[:2]) #takes first 0-2 characters, that is +1,-1 from train.dat
        texts.append(x[3:].strip()) #takes all remaining texts of each record from train.dat
    return np.array(labels), texts

train_labels, train_texts = labtxt_split(file) 
print(train_labels) #to show list of sentimental values alone
print(len(train_labels)) #18506 records

print(train_texts[0]) #printing first review after seperating sentimental values

#Testing data
test_texts = [] # to store reviews from test.dat
for x in testfile:
    test_texts.append(x)

print(len(test_texts)) #to check for 18506 records same as train.dat

['+1' '+1' '+1' ... '-1' '-1' '-1']
18506
This book is such a life saver.  It has been so helpful to be able to go back to track trends, answer pediatrician questions, or communicate with each other when you are up at different times of the night with a newborn.  I think it is one of those things that everyone should be required to have before they leave the hospital.  We went through all the pages of the newborn version, then moved to the infant version, and will finish up the second infant book (third total) right as our baby turns 1.  See other things that are must haves for baby at [...]
18506


In [154]:
#data pre-processing (cleaning)
import re
NON_ALPHANUM = re.compile(r'[\W]') #extracting non alphanumerics
NON_ASCII = re.compile(r'[^a-z0-1\s]') #extracting set of all non alphabets and puntuation marks
def make_standard(reviews):
    standardizing = []
    for text in reviews:
        lower = text.lower() #converting text to lowercase
        no_punct = NON_ALPHANUM.sub(r' ', lower) #removing non alphanumerics 
        clear_txt = NON_ASCII.sub(r'', no_punct) 
        standardizing.append(clear_txt)
    return standardizing


train_texts = make_standard(train_texts)
test_texts = make_standard(test_texts)

print(train_texts[0]) #to check if all texts are lowercase and contains no special characters


this book is such a life saver   it has been so helpful to be able to go back to track trends  answer pediatrician questions  or communicate with each other when you are up at different times of the night with a newborn   i think it is one of those things that everyone should be required to have before they leave the hospital   we went through all the pages of the newborn version  then moved to the infant version  and will finish up the second infant book  third total  right as our baby turns 1   see other things that are must haves for baby at      


In [155]:
#count vectorizer to convert, arrange and transform words to fixed vectors 
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True, ngram_range=(1,2)) #initializing count vectorizer with unigrams and bigrams
#binary=True to eliminate no. of occurences of a word and store as either 0 or 1

x = cv.fit_transform(train_texts) #to calc mean & SD and transform train data
x_test = cv.transform(test_texts) #to transform test data

x_test

<18506x414238 sparse matrix of type '<class 'numpy.int64'>'
	with 2585228 stored elements in Compressed Sparse Row format>

In [156]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#splitting the training set into 2 subsets - 75% for training and 25% for testing on the same
x_train, x_val, y_train, y_val = train_test_split(x, train_labels, train_size = 0.75) 
    
for c in [0.01, 0.05, 0.25, 0.5, 1]:

    lr = LogisticRegression(C=c, solver='lbfgs', max_iter=200, class_weight="balanced") #to stop max iterations reached error
        
    lr.fit(x_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, lr.predict(x_val))))


Accuracy for C=0.01: 0.8778906418845904
Accuracy for C=0.05: 0.8949643397449751
Accuracy for C=0.25: 0.8977739355954182
Accuracy for C=0.5: 0.8966933218067863
Accuracy for C=1: 0.8958288307758807


In [157]:
#predicting sentiment for test data using the defined classifier
predictions = lr.predict(x_test)

predictions.tolist()

print(len(predictions)) #to check if predictions of all records exists

18506


In [158]:
#writing the predicted values in a new file
file = open("Test_predict1.dat","w", encoding='utf-8')

for i in predictions:
    file.write(str(i)+'\n')
    
#printing the written prediction values from file
with open("Test_predict1.dat") as test:
    for s in test:
        print(s)
        
test.close()

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

-1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

-1

+1

+1

+1

+1

+1

+1

-1

+1

-1

+1

+1

+1

+1

-1

+1

+1

+1

+1

+1

+1

-1

+1

-1

+1

-1

-1

+1

+1

+1

+1

+1

+1

+1

-1

-1

+1

+1

+1

+1

+1

-1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

-1

-1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

-1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

+1

-1

+1

+1

+1

+1

-1

+1

+1

+1

-1

-1

+1

+1

+1

-1

+1

+1

+1

+1

+1

-1

+1

+1

-1

+1

+1

-1

+1

+1

+1

+1

-1

-1

+1

+1

+1

+1

+1

+1

+1

+1

+1

-1

+1

+1

+1

+1

-1

+1

+1

-1

+1

+1

-1

+1

+1

-1

-1

+1

-1

-1

+1

-1

+1

-1

+1

-1

+1

+1

+1

+1

+1

-1

-1

+1

+1

+1

-1

+1

-1

+1

+1

+1

-1

+1

-1

+1

+1

-1

-1

-1

+1

-1

-1

+1

+1

+1

+1

+1

+1

+1

-1

-1

+1

+1

+1

+1

+1

+1

+1

-1

+1

-1

-1

