In [1]:


%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

# Open the SQLite database that holds the `Reviews` table.
# The path is a raw string: in a normal literal '\d' is an invalid escape
# sequence (DeprecationWarning today, SyntaxWarning on Python 3.12+).
con = sqlite3.connect(r'E:\database.sqlite')

# Load only the clearly positive or negative reviews:
# rows with Score == 3 are excluded by the query.
filtered_data = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3
""", con)
def partition(x):
    """Turn a numeric review score into a sentiment label.

    Returns 'negative' for scores below 3 and 'positive' for every other
    score.
    """
    return 'negative' if x < 3 else 'positive'

# Replace the numeric Score column with sentiment labels:
# scores below 3 become 'negative', all other scores become 'positive'.
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative



In [2]:
# Number of rows and attributes in the filtered data. A bare expression is
# only echoed when it is the last line of a cell, so the shape is printed
# explicitly to make it visible next to the preview.
print(filtered_data.shape)
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
# Fetch every non-neutral review written by one user, ordered by product.
# - The user id is a single-quoted SQL string literal; double quotes denote
#   identifiers in SQL and only work here through a SQLite compatibility quirk.
# - The result is not named `display`, so IPython's display() helper is not
#   shadowed by a DataFrame.
user_reviews = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId = 'AR5J8UI46CURR'
ORDER BY ProductID
""", con)
user_reviews

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [4]:
# Order the reviews by ProductId, ascending (the remaining sort options are
# left at their pandas defaults).
sorted_data = filtered_data.sort_values(by='ProductId')

In [5]:
# De-duplication: rows sharing the same user, profile name, timestamp and
# review text are treated as one review; only the first occurrence (in the
# order of `sorted_data`) is kept.
# `subset` is passed as a list because the documented type is a column label
# or a sequence of labels; a set is unordered and not a sequence.
final = sorted_data.drop_duplicates(
    subset=["UserId", "ProfileName", "Time", "Text"],
    keep='first',
)
final.shape

(364173, 10)

In [6]:
# Share of the filtered reviews (in percent) that is still present in `final`.
len(final['Id']) / len(filtered_data['Id']) * 100

69.25890143662969

In [7]:
# Fetch two specific reviews by Id.
# - `Id IN (...)` replaces `Score != 3 AND Id=44737 OR Id=64422`: AND binds
#   tighter than OR, so the old filter skipped the Score check for Id 64422.
# - The result is not named `display`, so IPython's display() helper is not
#   shadowed by a DataFrame.
# NOTE(review): these two rows appear to be picked because their
# HelpfulnessNumerator exceeds HelpfulnessDenominator - confirm intent.
selected_reviews = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id IN (44737, 64422)
ORDER BY ProductID
""", con)
selected_reviews

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [8]:
# Keep only rows whose helpfulness numerator does not exceed the denominator.
# .copy() makes `final` an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator].copy()

In [9]:
# Number of rows and columns left before the text pre-processing phase.
print(final.shape)

# Class balance: how many positive and how many negative reviews remain.
final.loc[:, 'Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

In [10]:
# Print the position and text of the first review that contains an HTML tag.
import re

for i, sent in enumerate(final['Text'].values):
    if re.search('<.*?>', sent) is not None:
        print(i)
        print(sent)
        break

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [11]:
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Download the NLTK stop-word corpus (needed by stopwords.words below).
nltk.download('stopwords')
# English stop words, held in a set so membership tests are cheap.
stop = set(stopwords.words('english'))
# Snowball stemmer for English, used to reduce words to their stem.
sno = nltk.stem.SnowballStemmer('english')

def cleanhtml(sentence):
    """Return `sentence` with every HTML tag replaced by a single space.

    A tag is the shortest run of characters that starts with '<' and ends
    with '>'.
    """
    return re.sub('<.*?>', ' ', sentence)
def cleanpunc(sentence):
    """Remove or space out punctuation and special characters in `sentence`.

    - Question mark, exclamation mark, single quote, double quote, hash and
      pipe are deleted.
    - Full stop, comma, parentheses, backslash and forward slash are each
      replaced by one space.

    The original character classes used '|' as a separator (inside [...] it
    is just a literal pipe) and wrote the backslash as '\\|', which is an
    escaped pipe, so backslashes were never matched. The classes below list
    each character once and escape the backslash properly.
    """
    cleaned = re.sub(r'[?!\'"#|]', r'', sentence)
    cleaned = re.sub(r'[.,)(\\/]', r' ', cleaned)
    return cleaned
# Show the stop-word set, a separator line, and a sample stem as a quick check
# that the stemmer is working.
print(stop)
print('************************************')
print(sno.stem('tasty'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nvjc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{'from', "shouldn't", 'more', 'same', 'some', 'other', 'herself', 'so', 'any', 'myself', 'own', 'too', 'just', 'when', 'mightn', 'doesn', "you're", 'is', 'most', 'such', 'very', 'yourselves', 'off', "didn't", 've', "couldn't", 'than', 'above', "hadn't", 're', 'doing', "mustn't", "you'll", 'been', 'd', 'itself', 'each', 'down', 'he', 'no', 'and', 'hasn', 'i', "needn't", 'was', 'yourself', 'were', 's', 'be', 'wouldn', 'o', "doesn't", 'during', 'did', 'him', 'ma', 'what', 'under', 'with', "isn't", 'there', "that'll", 'about', 'after', 'does', 'both', 'm', 'up', 'where', 'this', "mightn't", 'ain', 'all', 'shouldn', 'at', 'your', 'had', 'further', "aren't", 'through', 'again', "wasn't", 'me', "weren't", 'now', 'will', "don't", 'she', 'or', 'himself', 'haven', 'having', 'in', 'am', 'for', 'until', 'here', 'whom', 'can', 'yours', 'their

In [12]:
# Pre-process every review in `final`:
#   1. replace HTML tags with spaces,
#   2. split into words and strip/space out punctuation,
#   3. keep purely alphabetic words longer than two characters that are not
#      stop words,
#   4. lower-case and stem each kept word.
# This takes a while because it visits every word of every review.
# NOTE(review): stems are stored as UTF-8 bytes, so CleanedText ends up holding
# bytes objects (rendered as b'...'); confirm what downstream code expects
# before switching to plain str.
final_string = []        # one cleaned, space-joined review per row of `final`
all_positive_words = []  # stems taken from positive reviews
all_negative_words = []  # stems taken from negative reviews

# Texts and labels are iterated together, so the Score array is fetched once
# instead of once per word.
for sent, score in zip(final['Text'].values, final['Score'].values):
    filtered_sentence = []
    for w in cleanhtml(sent).split():
        for cleaned_words in cleanpunc(w).split():
            if not (cleaned_words.isalpha() and len(cleaned_words) > 2):
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            s = sno.stem(lowered).encode('utf8')
            filtered_sentence.append(s)
            if score == 'positive':
                all_positive_words.append(s)
            elif score == 'negative':
                all_negative_words.append(s)
    # Final cleaned review: the kept stems joined by single spaces.
    final_string.append(b" ".join(filtered_sentence))

In [13]:
# Add the pre-processed review text as a new CleanedText column
# (final_string holds one entry per row of `final`, in row order).
final['CleanedText']=final_string

In [15]:
# Preview the first three rows; the pre-processed review is in the
# CleanedText column.
final.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,b'witti littl book make son laugh loud recit c...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grew read sendak book watch realli rosi movi...
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...


In [17]:
# Persist the cleaned table to a local SQLite file for later reuse.
# Any existing `Reviews` table in that file is replaced, and the DataFrame
# index is written as a column as well.
conn = sqlite3.connect('final.sqlite')
conn.text_factory = str
c = conn.cursor()
final.to_sql('Reviews', conn, if_exists='replace')

In [18]:
# Fresh cursor on the final.sqlite connection.
# NOTE(review): `c` is not used by any of the visible cells - confirm it is needed.
c = conn.cursor()

In [19]:
# Reload the stored table; the saved DataFrame index comes back as an
# ordinary column named `index` and `final` gets a fresh 0..n-1 index.
final = pd.read_sql_query("""SELECT * FROM Reviews;""", conn)
final.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,b'witti littl book make son laugh loud recit c...
1,138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grew read sendak book watch realli rosi movi...
2,138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...
3,138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,b'great littl book read nice rhythm well good ...
4,138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,b'book poetri month year goe month cute littl ...


In [21]:
## Encoding the labels

# Label encoding the "Score"
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the sentiment labels, then convert them to integer codes
# (in the preview below 'positive' shows up as 1).
label = LabelEncoder()
label.fit(final['Score'])
score_labels = label.transform(final['Score'])

# Store the integer-coded labels next to the original text labels.
final['score_labels'] = score_labels

In [22]:
# Export the full labelled table to CSV in the working directory.
final.to_csv('complete_raw_df.csv')

In [23]:
# Preview the table with the new score_labels column.
final.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText,score_labels
0,138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,b'witti littl book make son laugh loud recit c...,1
1,138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grew read sendak book watch realli rosi movi...,1
2,138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...,1
3,138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,b'great littl book read nice rhythm well good ...,1
4,138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,b'book poetri month year goe month cute littl ...,1


In [24]:
# Split the table by encoded label (0 -> neg, 1 -> pos).
# `final` already carries an `index` column from the SQLite round trip, so
# reset_index() stores the old row labels in a new `level_0` column; the stale
# `index` column is then dropped.
is_negative = final['score_labels'] == 0
is_positive = final['score_labels'] == 1
neg = final.loc[is_negative].reset_index().drop(columns='index')
pos = final.loc[is_positive].reset_index().drop(columns='index')

In [25]:
# Shapes of the negative and positive subsets, in that order.
print(neg.shape, pos.shape)

(57110, 13) (307061, 13)


In [26]:
# Draw a balanced, reproducible sample: the same number of negative and
# positive reviews, chosen without replacement under a fixed seed.
sample_size = 5000
np.random.seed(100)

# Row labels are drawn from each frame's own index rather than from a
# hard-coded row count, so the cell stays correct when the upstream row counts
# change. With the 0..n-1 index produced by reset_index() the draws are
# identical to sampling from np.arange(n).
indices = np.random.choice(neg.index.values, replace=False, size=sample_size)
neg = neg.loc[indices]
print("Negative reviews shape:", neg.shape)
indices = np.random.choice(pos.index.values, replace=False, size=sample_size)
pos = pos.loc[indices]
print("Positive reviews shape", pos.shape)

# Stack the two samples into one frame (negatives first, then positives).
fil_df = pd.concat([neg, pos])

Negative reviews shape: (5000, 13)
Positive reviews shape (5000, 13)


In [27]:
# Order the sampled reviews chronologically by their Time value.
fil_df = fil_df.sort_values(by='Time')


In [28]:
# Remove the `level_0` helper column left behind by the earlier reset_index().
final_df = fil_df.drop(columns='level_0')

In [29]:
final_df.to_csv("final_df.csv", index=False)

In [30]:
final_df = pd.read_csv('final_df.csv', encoding='cp1252')

In [31]:
# Preview the reloaded sample (rows are in the order they were written).
final_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText,score_labels
0,361319,B00005IX96,AGUF1WPEG4GSM,"""lchang44""",5,8,negative,1055376000,"Almost expired, Sept '03",I returned this due to the very short/brief ex...,b'return due short brief expir date sept would...,0
1,193108,B0000DJDJR,A3F6UNXVI9LSMA,"Samuel H. Wheeler ""bigdaddysam""",7,7,positive,1066348800,Best Ice Cream Ever!!!,As a kid this ice cream was a favorite treat. ...,b'kid ice cream favorit treat ive told highest...,1
2,30629,B00008RCMI,A19E94CF5O1LY7,Andrew Arnold,0,0,positive,1067040000,"I've chewed this gum many times, but used?","Nothing against the product, but it does bothe...",b'noth product bother link top page buy use ch...,1
3,434425,B0000CA4TK,A5VIGE8EO86RI,"captmorgan1670 ""captmorgan1670""",4,4,positive,1068422400,&iexcl;Oye - Tremendo caf&eacute; chico !,I live in exile in Denver now - I couldn't sur...,b'live exil denver couldnt surviv without bit ...,1
4,333669,B0000UBTYG,A6BS5D5YPF2HW,MT,3,3,positive,1072137600,A Great Chai!!!,I think that this is a great Chai! Far from sw...,b'think great chai far sweet found unsweet exa...,1


In [58]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [59]:
def model_nb(train_x, train_y, test_x, test_y, sim_cv=False):
    """Train a multinomial naive Bayes classifier and print its evaluation.

    Parameters
    ----------
    train_x, train_y
        Training features and labels, passed to ``MultinomialNB.fit``.
    test_x, test_y
        Held-out features and labels used for the printed evaluation.
    sim_cv : bool, default False
        * False - fit with the default ``alpha`` and print the confusion
          matrix, accuracy and classification report on the test data.
        * True - pick ``alpha`` from the odd integers 1..49 by 10-fold
          cross-validated accuracy on the training data, plot the
          misclassification error per ``alpha``, refit with the best value
          and print accuracy and classification report on the test data.

    Returns
    -------
    None
        Results are printed (and plotted when ``sim_cv`` is True).
    """
    if not sim_cv:
        print("Naive bayes with out cross - validation(with default <alpha> value)")
        clf = MultinomialNB()
        clf.fit(train_x, train_y)
        pred = clf.predict(test_x)
        print("-"*100)
        print("The confusion matrix is:", confusion_matrix(test_y, pred))
        print("-"*100)
        print("The accuracy of the model is:", accuracy_score(test_y, pred)*100)
        print("-"*100)
        # classification_report() returns text. Nesting print() inside print()
        # emitted the report first and then "The classification report is None".
        print("The classification report is")
        print(classification_report(test_y, pred))
        return None

    # Imported locally: cross_val_score is not imported anywhere in the
    # visible part of the notebook, so this branch used to raise NameError.
    from sklearn.model_selection import cross_val_score

    print("Naive bayes with cross - validation(with choice of <alpha> value)")
    # Candidate smoothing values: the odd integers 1, 3, ..., 49.
    alpha = list(range(1, 50, 2))

    # Mean 10-fold cross-validated accuracy for each candidate.
    cv_scores = []
    for k in alpha:
        nb = MultinomialNB(alpha=k)
        scores = cross_val_score(nb, train_x, train_y, cv=10, scoring='accuracy')
        cv_scores.append(scores.mean())

    # Misclassification error = 1 - accuracy.
    errors = [1 - score for score in cv_scores]

    # Best alpha = first candidate with the lowest error.
    optimal_alpha = alpha[errors.index(min(errors))]
    print('\nThe optimal valus of <alpha> is %d.' % optimal_alpha)

    # Plot misclassification error against alpha, annotating every point.
    plt.plot(alpha, errors)
    for xy in zip(alpha, np.round(errors, 3)):
        plt.annotate('(%s, %s)' % xy, xy=xy, textcoords='data')
    plt.xlabel('values of alpha')
    plt.ylabel('Misclassification Error')
    plt.title("Naive Bayes")
    plt.show()

    print("the misclassification error for each <alpha> value is :{0} \n\n\n".format(np.round(errors,3)))
    print("Fitting the model with <alpha> value {0}".format(optimal_alpha))

    # Refit on the full training data with the selected alpha and evaluate on
    # the test data.
    nb_model = MultinomialNB(alpha=optimal_alpha)
    nb_model.fit(train_x, train_y)
    pred = nb_model.predict(test_x)

    acc = accuracy_score(test_y, pred) * 100
    print('\nThe accuracy of the naive bayes classifier for the <alpha> value = %d is %f%%' % (optimal_alpha, acc))
    print("-"*100)
    print("The classification report is")
    print(classification_report(test_y, pred))
    return None

In [60]:
## Index at which the data is to be sliced: the first 70% of the rows go to
## the training set and the rest to the test set. The row count is taken from
## final_df instead of being hard-coded as 10000.
index = round(len(final_df) * 70 / 100)
train_y = final_df['score_labels'].values[:index]
test_y = final_df['score_labels'].values[index:]

In [62]:
def sample_data(vectors, dfra, amount):
    """Pick `amount` evenly spaced rows from a frame and its feature matrix.

    Parameters
    ----------
    vectors
        Array-like of per-row features, positionally aligned with `dfra`
        (row i of `vectors` belongs to row i of `dfra`).
    dfra
        pandas DataFrame (or Series) to sample from.
    amount
        Number of rows to keep; must satisfy 0 < amount <= len(dfra).

    Returns
    -------
    (fil_df, fil_vec)
        The selected rows of `dfra` and the matching rows of `vectors`.

    Raises
    ------
    ValueError
        If `amount` is not in the range 1..len(dfra).

    Notes
    -----
    The population size comes from `dfra` instead of a hard-coded 10000, and
    positions are computed in integer arithmetic: np.arange with a float step
    and dtype=int silently truncates the step. Rows are selected by position
    (.iloc), so the frame's index labels do not have to be 0..n-1.
    """
    total = len(dfra)
    amount = int(amount)
    if not 0 < amount <= total:
        raise ValueError(
            "amount must be between 1 and %d, got %d" % (total, amount)
        )
    # Position i of the sample maps to floor(i * total / amount).
    index = np.arange(amount, dtype=np.int64) * total // amount
    fil_df = dfra.iloc[index]
    fil_vec = vectors[index]
    return fil_df, fil_vec

In [63]:
## Computing the bag-of-words matrix from the review text.
# CountVectorizer is created with its default settings, so despite the old
# "binary BOW" label the features are term counts, not 0/1 flags.
# NOTE(review): the vectorizer is fitted on the raw `Text` column, not on
# `CleanedText`, and on all rows before the train/test split - confirm that
# both are intended.
count_vect = CountVectorizer() 
final_counts = count_vect.fit_transform(final_df['Text'].values)
# Convert the sparse result to a dense NumPy array (needs far more memory).
final_counts = final_counts.toarray()

In [64]:
# Cache the dense bag-of-words matrix on disk.
np.save('final_bow.npy',final_counts)

In [65]:
# Load the cached bag-of-words matrix back into memory as `bow`.
bow = np.load('final_bow.npy')

In [67]:
## Train - test split (time-based slicing): rows before `index` are used for
## training, the remaining rows for testing.
train_x, test_x = bow[:index], bow[index:]

print("The index at which the data to be splitted:", index)
print("The train_x shape is: {0} train_y shape is {1}".format(train_x.shape, train_y.shape))
print("The test_x shape is: {0} test_y shape is {1}".format(test_x.shape, test_y.shape))

The index at which the data to be splitted: 7000
The train_x shape is: (7000, 21333) train_y shape is (7000,)
The test_x shape is: (3000, 21333) test_y shape is (3000,)


In [68]:
## Fit and evaluate naive Bayes without cross-validation
## (sim_cv is left at its default of False).
model_nb(train_x, train_y, test_x, test_y)

Naive bayes with out cross - validation(with default <alpha> value)
----------------------------------------------------------------------------------------------------
The confusion matrix is: [[1389  232]
 [ 190 1189]]
----------------------------------------------------------------------------------------------------
The accuracy of the model is: 85.93333333333332
----------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

          0       0.88      0.86      0.87      1621
          1       0.84      0.86      0.85      1379

avg / total       0.86      0.86      0.86      3000

The classification report is None
