In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import string
import sqlite3
import nltk

import re # Tutorial about Python regular expression: http://pymotw.com/2/re/

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from tqdm import tqdm
import os


In [2]:
con = sqlite3.connect('database.sqlite')

In [4]:
filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3""", con) 
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
# Give reviews with Score>3 a positive rating(1), and reviews with a score<3 a negative rating(0).
def partition(x):
    if x < 3:
        return 0
    return 1

#changing reviews with score less than 3 to be positive and vice-versa
filtered_data['Score'] = filtered_data['Score'].apply(partition)
filtered_data.shape

(525814, 10)

In [6]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [7]:
display = pd.read_sql_query("""Select UserId, ProductId,ProfileName, Time, Score, Text, Count(*)
from Reviews
group by UserId
Having count(*) > 1
""", con)

In [8]:
print(display.shape)
display.head()

(80668, 7)


Unnamed: 0,UserId,ProductId,ProfileName,Time,Score,Text,Count(*)
0,#oc-R115TNMSPFT9I7,B007Y59HVM,Breyton,1331510400,2,Overall its just OK when considering the price...,2
1,#oc-R11D9D7SHXIJB9,B005HG9ET0,"Louis E. Emory ""hoppy""",1342396800,5,"My wife has recurring extreme muscle spasms, u...",3
2,#oc-R11DNU2NBKQ23Z,B007Y59HVM,Kim Cieszykowski,1348531200,1,This coffee is horrible and unfortunately not ...,2
3,#oc-R11O5J5ZVQE25C,B005HG9ET0,Penguin Chick,1346889600,5,This will be the bottle that you grab from the...,3
4,#oc-R12KPBODL2B5ZD,B007OSBE1U,Christopher P. Presta,1348617600,1,I didnt like this coffee. Instead of telling y...,2


In [9]:
display['Count(*)'].sum()

393063

#  [2] Exploratory Data Analysis
## [2.1] Data Cleaning: Deduplication

It is observed (as shown in the table below) that the reviews data had many duplicate entries. Hence it was necessary to remove duplicates in order to get unbiased results for the analysis of the data.  Following is an example:

In [10]:
display = pd.read_sql_query("""
Select * from Reviews
where Score != 3 and UserId = "AR5J8UI46CURR"
order by ProductID
""", con)

display.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [11]:
#Sorting data according to ProductId in ascending order
sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True)

In [12]:
# Deduplication of entries
final = sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"})
final.shape

(364173, 10)

In [13]:
# checking to see how much % of data still remain
(final['Id'].size*1)/(filtered_data.size*1)*100

6.9258901436629685

In [14]:
display = pd.read_sql_query("""
Select * from Reviews
where Score != 3 and Id = 44737 or Id = 64422
order by ProductId
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [15]:
display = pd.read_sql_query("""
Select * from Reviews
where Score != 3 and HelpfulnessNumerator > HelpfulnessDenominator
order by ProductId
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [16]:
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]

#Before starting the next phase of preprocessing lets see the number of entries left

final.shape

(364171, 10)

In [17]:
#How many positive and negative reviews are present in our dataset?

final['Score'].value_counts()

1    307061
0     57110
Name: Score, dtype: int64

# [3] Preprocessing
## [3.1]. Preprocessing Review Text
Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)
After which we collect the words used to describe positive and negative reviews## BAg of words

In [18]:
# https://stackoverflow.com/a/47091490/4084039

import re

def decontracted(phrase):
    
    phrase = re.sub(r"won't", 'will not', phrase)
    phrase = re.sub(r"can't", 'can not', phrase)
    phrase = re.sub(r"n't", ' not', phrase)
    phrase = re.sub(r"'re", ' are', phrase)
    phrase = re.sub(r"'d'", ' would', phrase)
    phrase = re.sub(r"'ll", ' will', phrase)
    phrase = re.sub(r"'t", ' not', phrase)
    phrase = re.sub(r"'m", ' am', phrase)
    phrase = re.sub(r"'ve", ' have', phrase)
#     phrase = re.sub(r"'t", 'can not', phrase)
    
    return phrase
        

In [22]:
# https://stackoverflow.com/questions/16206380/python-beautifulsoup-how-to-remove-all-tags-from-an-element
from bs4 import BeautifulSoup

In [20]:
nltk.download('stopwords')
stopwords = stopwords.words('english')
stopwords



[nltk_data] Downloading package stopwords to C:\Users\Saurabh
[nltk_data]     Satyarth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [23]:
# from tqdm import tqdm
# preprocessed_reviews = []
# # tqdm is for printing the status bar
# for sentence in tqdm(final['Text'].values):
#     sentence = re.sub(r'http\S+', '', sentence)
#     sentence = BeautifulSoup(sentence, 'html').get_text()
#     sentence = decontracted(sentence)
#     sentence = re.sub('\S*\d\S*', '', sentence)
#     sentence = re.sub('[^A-Za-z]', ' ', sentence)
#     sentence = " ".join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
#     preprocessed_reviews.append(sentence.strip())

100%|████████████████████████████████████████████████████████████████████████| 364171/364171 [02:43<00:00, 2227.91it/s]


In [24]:
for review in preprocessed_reviews[:4]:
    print(review)
    print("=" * 70)
    

witty little book makes son laugh loud recite car driving along always sing refrain learned whales india drooping roses love new words book introduces silliness classic book willing bet son still able recite memory college
grew reading sendak books watching really rosie movie incorporates love son loves however miss hard cover version paperbacks seem kind flimsy takes two hands keep pages open
fun way children learn months year learn poems throughout school year like handmotions invent poem
great little book read aloud nice rhythm well good repetition little ones like lines chicken soup rice child gets go months year go wonderful places like bombay nile eating well know get eat kids maurice sendak version ice skating treat roses heads long time even know came surprise came little witty book


# [3.2] Preprocessing Review Summary


In [25]:
## Similartly you can do preprocessing for review summary also.
# preprocessed_summary = []
# for sentence in tqdm(final['Summary'].values):
#     sentence = re.sub(r'http\S+', '', sentence)
#     sentence = BeautifulSoup(sentence, 'html').get_text()
#     sentence = decontracted(sentence)
#     sentence = re.sub('\S*\d\S*', '', sentence)
#     sentence = re.sub('[^A-Za-z]', ' ', sentence)
#     sentence = " ".join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
#     preprocessed_summary.append(sentence.strip())

100%|████████████████████████████████████████████████████████████████████████| 364171/364171 [01:06<00:00, 5446.07it/s]


In [26]:
for summary in preprocessed_summary[:4]:
    print(summary)
    print("=" * 30)
    

every book educational
love book miss hard cover version
chicken soup rice months
good swingy rhythm reading aloud


In [27]:
final['cleanedtext'] = preprocessed_reviews

In [28]:
final['cleanedsummary'] = preprocessed_summary
final.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,cleanedtext,cleanedsummary
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,1,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witty little book makes son laugh loud recite ...,every book educational
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,1,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew reading sendak books watching really rosi...,love book miss hard cover version
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,1,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn months year learn poems...,chicken soup rice months


In [31]:
conn = sqlite3.connect('final.sqlite')
c = conn.cursor()
conn.text_factory = str
final.to_sql('Reviews', conn, schema=None, if_exists='replace')

# [4] Featurization
 ## [4.1] BAG OF WORDS

In [35]:
count_vect = CountVectorizer()
final_count = count_vect.fit_transform(preprocessed_reviews)

print(final_count.get_shape())
print(count_vect.get_feature_names()[:10])

(4986, 13037)
['aa', 'aahhhs', 'aback', 'abandon', 'abates', 'abbott', 'abby', 'abdominal', 'abiding', 'ability']


### Bi-grams and n-Grams

In [36]:
#bi-grams, tri-gram and n-gram

# removing stopwords like 'not' should be avoided before building n-gram

count_vect = CountVectorizer(ngram_range=(1,2))
final_bigram = count_vect.fit_transform(preprocessed_reviews)

print(final_bigram.get_shape())
print(count_vect.get_feature_names()[:10])

(4986, 139094)
['aa', 'aa sumatra', 'aahhhs', 'aahhhs get', 'aback', 'aback brand', 'abandon', 'abates', 'abates steeping', 'abbott']


### TF-IDF

In [46]:
# term frequency : 
# inverse document frequency

tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(preprocessed_reviews)

print(final_tf_idf.get_shape())
features =  tf_idf_vect.get_feature_names()
print(features[:10])

(4986, 139094)
['aa', 'aa sumatra', 'aahhhs', 'aahhhs get', 'aback', 'aback brand', 'abandon', 'abates', 'abates steeping', 'abbott']


In [47]:
# convert a row in sparsematrix to numpy array
sparse = final_tf_idf[2,:].toarray()[0]
print(sparse)
print(len(sparse))

[0. 0. 0. ... 0. 0. 0.]
139094


In [48]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

top_tfidf = top_tfidf_feats(sparse, features, 25)

In [49]:
top_tfidf

Unnamed: 0,feature,tfidf
0,printed,0.223244
1,windows,0.19886
2,windows everywhere,0.130098
3,shop program,0.130098
4,product windows,0.130098
5,car windows,0.130098
6,designed signs,0.130098
7,windows printed,0.130098
8,printed reverse,0.130098
9,printed beautifully,0.130098


### Word2Vec

In [50]:
# Train your own Word2Vec model using your own text corpus
list_of_sentence = []
for sentence in preprocessed_reviews:
    list_of_sentence.append(sentence.split())
len(list_of_sentence)

4986

In [52]:
import gensim
w2v_model = gensim.models.Word2Vec(list_of_sentence, min_count=5, size=50, 
                                   workers=4)

In [60]:
words = list(w2v_model.wv.vocab)
print(len(words))
print(words[:50])

3836
['product', 'available', 'course', 'total', 'pretty', 'stinky', 'right', 'nearby', 'used', 'ca', 'beat', 'great', 'received', 'shipment', 'could', 'hardly', 'wait', 'try', 'love', 'call', 'instead', 'removed', 'easily', 'daughter', 'designed', 'printed', 'use', 'car', 'windows', 'beautifully', 'shop', 'program', 'going', 'lot', 'fun', 'everywhere', 'like', 'tv', 'computer', 'really', 'good', 'idea', 'final', 'outstanding', 'window', 'everybody', 'asks', 'bought', 'made', 'two']


In [54]:
w2v_model.wv.most_similar('tasty')

[('snack', 0.9984371066093445),
 ('crisp', 0.9983736276626587),
 ('crispy', 0.9983733892440796),
 ('fried', 0.998332679271698),
 ('artificial', 0.9983131289482117),
 ('light', 0.9982814788818359),
 ('greasy', 0.9982805848121643),
 ('texture', 0.9982209801673889),
 ('crunch', 0.9982200264930725),
 ('love', 0.9982093572616577)]

In [55]:
w2v_model.wv.most_similar('like')

[('sweet', 0.9967730045318604),
 ('taste', 0.9951509833335876),
 ('tastes', 0.9922884702682495),
 ('flavor', 0.9918922185897827),
 ('strong', 0.9913098216056824),
 ('milk', 0.9854931831359863),
 ('bitter', 0.9852263331413269),
 ('add', 0.9842127561569214),
 ('chocolate', 0.9839362502098083),
 ('green', 0.9835146069526672)]

In [56]:
count_vect_feat = count_vect.get_feature_names()
count_vect_feat.index('like')

68140

In [58]:
count_vect_feat[68140]

'like'

### Avg W2V, TFIDF-W2V

In [63]:
sentence_vec = []
for sentence in preprocessed_reviews:
    count_word = 0
    sent_vector = np.zeros(50)
    for word in sentence:
        try:
            vec = w2v_model.wv[word]
            sent_vector += vec
            count_word += 1
        except:
            pass
    sent_vector /= count_word
    sentence_vec.append(sent_vector)
print(len(sentence_vec))
print(len(sentence_vec[0]))

4986
50
