In [5]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import string
import sqlite3
import nltk

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

import re # Tutorial about Python regular expression: http://pymotw.com/2/re/

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os


In [6]:
con = sqlite3.connect('database.sqlite')

In [7]:
filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 LIMIT 5000""", con) 
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [8]:
# Give reviews with Score>3 a positive rating(1), and reviews with a score<3 a negative rating(0).
def partition(x):
    if x < 3:
        return 0
    return 1

#changing reviews with score less than 3 to be positive and vice-versa
filtered_data['Score'] = filtered_data['Score'].apply(partition)
filtered_data.shape

(5000, 10)

In [9]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [10]:
display = pd.read_sql_query("""Select UserId, ProductId,ProfileName, Time, Score, Text, Count(*)
from Reviews
group by UserId
Having count(*) > 1
""", con)

In [11]:
print(display.shape)
display.head()

(80668, 7)


Unnamed: 0,UserId,ProductId,ProfileName,Time,Score,Text,Count(*)
0,#oc-R115TNMSPFT9I7,B007Y59HVM,Breyton,1331510400,2,Overall its just OK when considering the price...,2
1,#oc-R11D9D7SHXIJB9,B005HG9ET0,"Louis E. Emory ""hoppy""",1342396800,5,"My wife has recurring extreme muscle spasms, u...",3
2,#oc-R11DNU2NBKQ23Z,B007Y59HVM,Kim Cieszykowski,1348531200,1,This coffee is horrible and unfortunately not ...,2
3,#oc-R11O5J5ZVQE25C,B005HG9ET0,Penguin Chick,1346889600,5,This will be the bottle that you grab from the...,3
4,#oc-R12KPBODL2B5ZD,B007OSBE1U,Christopher P. Presta,1348617600,1,I didnt like this coffee. Instead of telling y...,2


In [12]:
display['Count(*)'].sum()

393063

#  [2] Exploratory Data Analysis
## [2.1] Data Cleaning: Deduplication

It is observed (as shown in the table below) that the reviews data had many duplicate entries. Hence it was necessary to remove duplicates in order to get unbiased results for the analysis of the data.  Following is an example:

In [13]:
display = pd.read_sql_query("""
Select * from Reviews
where Score != 3 and UserId = "AR5J8UI46CURR"
order by ProductID
""", con)

display.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [14]:
#Sorting data according to ProductId in ascending order
sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True)

In [15]:
# Deduplication of entries
final = sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"})
final.shape

(4986, 10)

In [16]:
# checking to see how much % of data still remain
(final['Id'].size*1)/(filtered_data.size*1)*100

9.972

In [17]:
display = pd.read_sql_query("""
Select * from Reviews
where Score != 3 and Id = 44737 or Id = 64422
order by ProductId
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [18]:
display = pd.read_sql_query("""
Select * from Reviews
where Score != 3 and HelpfulnessNumerator > HelpfulnessDenominator
order by ProductId
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [19]:
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]

#Before starting the next phase of preprocessing lets see the number of entries left

final.shape

(4986, 10)

In [20]:
#How many positive and negative reviews are present in our dataset?

final['Score'].value_counts()

1    4178
0     808
Name: Score, dtype: int64

# [3] Preprocessing
## [3.1]. Preprocessing Review Text
Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)
After which we collect the words used to describe positive and negative reviews## BAg of words

In [21]:
count_vect = CountVectorizer()
final_count = count_vect.fit_transform(final['Text'].values)

In [22]:
print(final_count.shape)
print(final_count.get_shape())

(4986, 13510)
(4986, 13510)


In [23]:
# print(count_vect.get_feature_names())


In [24]:
sent_0 = final['Text'].values[0]
print(sent_0)
print("="* 50)

sent_1000 = final['Text'].values[1000]
print(sent_1000)
print("="*50)

sent_1500 = final['Text'].values[1500]
print(sent_1500)
print("="*50)

sent_4900 = final['Text'].values[4900]
print(sent_4900)
print("="*50)

Why is this $[...] when the same product is available for $[...] here?<br />http://www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDY<br /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.
I recently tried this flavor/brand and was surprised at how delicious these chips are.  The best thing was that there were a lot of "brown" chips in the bsg (my favorite), so I bought some more through amazon and shared with family and friends.  I am a little disappointed that there are not, so far, very many brown chips in these bags, but the flavor is still very good.  I like them better than the yogurt and green onion flavor because they do not seem to be as salty, and the onion flavor is better.  If you haven't eaten Kettle chips before, I recommend that you try a bag before buying bulk.  They are thicker and crunchier than Lays but just as fresh out of the bag.
Wow.  So far, two two-star reviews.  One obviously had no 

In [25]:
# remove urls from text python: https://stackoverflow.com/a/40823105/4084039
sent_0 = re.sub(r'http\S', "", sent_0)
sent_1000  = re.sub(r'http\S', "", sent_1000)
sent_1500 = re.sub(r'http\S', "", sent_1500)
sent_4900 = re.sub(r'http\S', "", sent_4900)

print(sent_0)


Why is this $[...] when the same product is available for $[...] here?<br />//www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDY<br /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.


In [27]:
# https://stackoverflow.com/questions/16206380/python-beautifulsoup-how-to-remove-all-tags-from-an-element
from bs4 import BeautifulSoup

soup = BeautifulSoup(sent_0, 'html')
text = soup.get_text()
print(text)
print("=" * 50)

soup = BeautifulSoup(sent_1000, 'html')
text = soup.get_text()
print(text)
print("=" * 50)

soup = BeautifulSoup(sent_1500, 'html')
text = soup.get_text()
print(text)
print("=" * 50)

soup = BeautifulSoup(sent_4900, 'html')
text = soup.get_text()
print(text)
print("=" * 50)

Why is this $[...] when the same product is available for $[...] here?//www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDYThe Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.
I recently tried this flavor/brand and was surprised at how delicious these chips are.  The best thing was that there were a lot of "brown" chips in the bsg (my favorite), so I bought some more through amazon and shared with family and friends.  I am a little disappointed that there are not, so far, very many brown chips in these bags, but the flavor is still very good.  I like them better than the yogurt and green onion flavor because they do not seem to be as salty, and the onion flavor is better.  If you haven't eaten Kettle chips before, I recommend that you try a bag before buying bulk.  They are thicker and crunchier than Lays but just as fresh out of the bag.
Wow.  So far, two two-star reviews.  One obviously had no idea what they were ord

In [30]:
# https://stackoverflow.com/a/47091490/4084039

import re

def decontracted(phrase):
    
    phrase = re.sub(r"won't", 'will not', phrase)
    phrase = re.sub(r"can't", 'can not', phrase)
    phrase = re.sub(r"n't", ' not', phrase)
    phrase = re.sub(r"'re", ' are', phrase)
    phrase = re.sub(r"'d'", ' would', phrase)
    phrase = re.sub(r"'ll", ' will', phrase)
    phrase = re.sub(r"'t", ' not', phrase)
    phrase = re.sub(r"'m", ' am', phrase)
    phrase = re.sub(r"'ve", ' have', phrase)
#     phrase = re.sub(r"'t", 'can not', phrase)
    
    return phrase
        

In [31]:
sent_1500 = decontracted(sent_1500)
print(sent_1500)


Wow.  So far, two two-star reviews.  One obviously had no idea what they were ordering; the other wants crispy cookies.  Hey, Iam sorry; but these reviews do nobody any good beyond reminding us to look  before ordering.<br /><br />These are chocolate-oatmeal cookies.  If you donot like that combination, donot order this type of cookie.  I find the combo quite nice, really.  The oatmeal sort of "calms" the rich chocolate flavor and gives the cookie sort of a coconut-type consistency.  Now let's also remember that tastes differ; so, I have given my opinion.<br /><br />Then, these are soft, chewy cookies -- as advertised.  They are not "crispy" cookies, or the blurb would say "crispy," rather than "chewy."  I happen to like raw cookie dough; however, I donot see where these taste like raw cookie dough.  Both are soft, however, so is this the confusion?  And, yes, they stick together.  Soft cookies tend to do that.  They arenot individually wrapped, which would add to the cost.  Oh yeah, c

In [32]:
#remove words with numbers python: https://stackoverflow.com/a/18082370/4084039
sent_0 = re.sub('\S*\d\S*', '', sent_0).strip()
print(sent_0)

Why is this $[...] when the same product is available for $[...] here?<br  /><br />The Victor  and  traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.


In [33]:
#remove spacial character: https://stackoverflow.com/a/5843547/4084039
sent_0 = re.sub('[^A-Za-z0-9]+', " ", sent_0)
print(sent_0)

Why is this when the same product is available for here br br The Victor and traps are unreal of course total fly genocide Pretty stinky but only right nearby 


In [36]:
nltk.download('stopwords')
stopwords = stopwords.words('english')
stopwords



[nltk_data] Downloading package stopwords to C:\Users\Saurabh
[nltk_data]     Satyarth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [40]:
# from tqdm import tqdm
preprocessed_reviews = []
# tqdm is for printing the status bar
for sentence in tqdm(final['Text'].values):
    sentence = re.sub(r'http\S+', '', sentence)
    sentence = BeautifulSoup(sentence, 'html').get_text()
    sentence = decontracted(sentence)
    sentence = re.sub('\S*\d\S*', '', sentence)
    sentence = re.sub('[^A-Za-z]', ' ', sentence)
    sentence = " ".join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    preprocessed_reviews.append(sentence.strip())

100%|████████████████████████████████████████████████████████████████████████████| 4986/4986 [00:02<00:00, 1835.22it/s]


In [44]:
for review in preprocessed_reviews[:4]:
    print(review)
    print("=" * 70)
    

product available victor traps unreal course total fly genocide pretty stinky right nearby
used victor fly bait seasons ca beat great product
received shipment could hardly wait try product love slickers call instead stickers removed easily daughter designed signs printed reverse use car windows printed beautifully print shop program going lot fun product windows everywhere surfaces like tv screens computer monitors
really good idea final product outstanding use decals car window everybody asks bought decals made two thumbs


# [3.2] Preprocessing Review Summary


In [43]:
## Similartly you can do preprocessing for review summary also.
preprocessed_summary = []
for sentence in tqdm(final['Summary'].values):
    sentence = re.sub(r'http\S+', '', sentence)
    sentence = BeautifulSoup(sentence, 'html').get_text()
    sentence = decontracted(sentence)
    sentence = re.sub('\S*\d\S*', '', sentence)
    sentence = re.sub('[^A-Za-z]', ' ', sentence)
    sentence = " ".join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    preprocessed_summary.append(sentence.strip())

100%|████████████████████████████████████████████████████████████████████████████| 4986/4986 [00:01<00:00, 2918.19it/s]


In [45]:
for summary in preprocessed_summary[:4]:
    print(summary)
    print("=" * 30)
    

thirty bucks
flies begone
wow make slickers
great product


# [4] Featurization
 ## [4.1] BAG OF WORDS

In [46]:
count_vect = CountVectorizer()
final_count = count_vect.fit_transform(preprocessed_reviews)

final_count.get_shape()

(4986, 13037)

### Bi-grams and n-Grams