In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import sqlite3
import string
from sklearn import metrics

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import roc_curve,auc,confusion_matrix

from nltk.corpus import stopwords
import re # regular expression
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os

In [53]:
#pip install --user gensim --upgrade
#np.__version__

In [54]:
#pip install --user gensim

## 1. READING THE DATA FROM SQLITE

In [55]:
#first we need to create the connection with data data base ....

con = sqlite3.connect('database.sqlite')


entire_data = pd.read_sql_query("""SELECT * FROM Reviews WHERE Score != 3 """,con)
filtered_data = pd.read_sql_query("""SELECT * FROM Reviews WHERE Score != 3 LIMIT 5000""",con)
print(filtered_data.shape)
filtered_data.head(10)

(5000, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [56]:
# First we need to create a connection with the database

con = sqlite3.connect('database.sqlite')

filtered_data = pd.read_sql_query("""SELECT * FROM Reviews WHERE SCORE != 3 LIMIT 5000""",con)
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# DATA CLEANING

## DATA CLEANING:De-duplication

#### Now we are trying to find the duplicate rows in our filtered data frame

In [57]:
duplicate_df = filtered_data[filtered_data.duplicated(['UserId','ProfileName','Time','Summary','Text'])]
duplicate_df.shape

(11, 10)

**CONCLUSION:** We can clearly observe that there are 11 duplicate rows in the data frame.
From the above observation we can conclude that there are a lot of duplicate records in the data set as we can see that the time of commenting is same by a single user in this case for multiple products. This means that the user meant to comment on a singular product but the comment got tagged to all the different variations of the same product .Say waffle of different falvours but user wanted to comment for chocolate flavour but since amazon has different product id's for each variations too so we have multiple records and the reviews get duplicated. 

* **Moreover we need to remove this duplicates as part of De-duplication of the data.**

* To check that we can type :   https://www.amazon.com/dp/B003ANFMY8 where dp stands for detailed page and the product id is called ASIN Amazon Standard Identification Number.


**Before Removing duplicates we need to do the following**

* Sort the data with respect to the product id and then only keep the first record and the remaining are supposed to be dropped.

In [58]:
#Sorting the data wrt ProductId
sorted_df = filtered_data.sort_values(by='ProductId',inplace=False,ascending=True,na_position = 'last',axis=0)
sorted_df.shape

(5000, 10)

In [59]:
cleaned_df = sorted_df.drop_duplicates(subset=['UserId','ProfileName','Time','Summary','Text'],keep='first')

In [60]:
print('The percentage of data preserved after dropping duplicates is: {}'.format((cleaned_df['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100))

The percentage of data preserved after dropping duplicates is: 99.78


**Now we need to ensure that the rows must contain Helpfullness Numerator <= Helpfullness Denominator**

In [61]:
cleaned_df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

**Now we are trying to find rows where 'HelpfulnessNumerator' is greater than 'HelpfulnessDenominator' . These are corrupted records as this scenario is not possible.**

* HelpfulnessNumerator = Number of people who found the review posted by a person helpful.
* HelpfulnessDenominator = HelpfulnessNumerator + No of ppl who found found the review not helpful"

In [62]:
cleaned_df = cleaned_df.loc[cleaned_df['HelpfulnessNumerator']<=cleaned_df['HelpfulnessDenominator']]

In [63]:
cleaned_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
2546,2774,B00002NCJC,A196AJHU9EASJN,Alex Chaffee,0,0,4,1282953600,thirty bucks?,Why is this $[...] when the same product is av...
2547,2775,B00002NCJC,A13RRPGE79XFFH,reader48,0,0,5,1281052800,Flies Begone,We have used the Victor fly bait for 3 seasons...
1145,1244,B00002Z754,A3B8RCEI0FXFI6,B G Chase,10,10,5,962236800,WOW Make your own 'slickers' !,I just received my shipment and could hardly w...
1146,1245,B00002Z754,A29Z5PI9BW2PU3,Robbie,7,7,5,961718400,Great Product,This was a really good idea and the final prod...
2942,3204,B000084DVR,A1UGDJP1ZJWVPF,"T. Moore ""thoughtful reader""",1,1,5,1177977600,Good stuff!,I'm glad my 45lb cocker/standard poodle puppy ...


In [64]:
def pos_neg(x):
    if x<3:
        return 0
    return 1


initial_score = filtered_data['Score']
final_score = initial_score.map(pos_neg)
filtered_data['Score'] = final_score

filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...


Now , we have a dataframe which is cleaned of all the corruptions. 
**Objective :** We need to create a column Score with only Positive or negative values which will be our Target Column which we are trying to predict. We can use the Score column which is alreday present and map it to positive if the rating is greater then 3 and neagtive if rating is less than 3.
    

In [65]:
def pos_neg(num):
    if num>3:
        return 'positive'
    return 'negative'

actual_score = cleaned_df['Score']
converted_score = actual_score.map(pos_neg)
cleaned_df['Score'] = converted_score
cleaned_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
2546,2774,B00002NCJC,A196AJHU9EASJN,Alex Chaffee,0,0,positive,1282953600,thirty bucks?,Why is this $[...] when the same product is av...
2547,2775,B00002NCJC,A13RRPGE79XFFH,reader48,0,0,positive,1281052800,Flies Begone,We have used the Victor fly bait for 3 seasons...
1145,1244,B00002Z754,A3B8RCEI0FXFI6,B G Chase,10,10,positive,962236800,WOW Make your own 'slickers' !,I just received my shipment and could hardly w...
1146,1245,B00002Z754,A29Z5PI9BW2PU3,Robbie,7,7,positive,961718400,Great Product,This was a really good idea and the final prod...
2942,3204,B000084DVR,A1UGDJP1ZJWVPF,"T. Moore ""thoughtful reader""",1,1,positive,1177977600,Good stuff!,I'm glad my 45lb cocker/standard poodle puppy ...


In [66]:
cleaned_df['Score'].value_counts()

positive    4180
negative     809
Name: Score, dtype: int64

In [67]:
final = cleaned_df.copy()
final.shape

(4989, 10)

In [68]:
import nltk
from nltk.corpus import stopwords
#print(stopwords.words('english'))

In [69]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [70]:
#nltk.download('stopwords')

# [3].  Text Preprocessing.

Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)<br>

After which we collect the words used to describe positive and negative reviews

In [71]:
final.shape

(4989, 10)

In [72]:
# printing some random reviews
sent_0 = final['Text'].values[0]
print(sent_0)
print("="*50)

sent_1000 = final['Text'].values[1000]
print(sent_1000)
print("="*50)

sent_1500 = final['Text'].values[1500]
print(sent_1500)
print("="*50)

sent_4900 = final['Text'].values[4900]
print(sent_4900)
print("="*50)

Why is this $[...] when the same product is available for $[...] here?<br />http://www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDY<br /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.
These are delicious.  The 2 oz bags are a bit big for school lunches, definitely more than a serving.
When I eat junk food, I'd like to enjoy it, the guilty pleasure it is.<br /><br />These cookies are neither healthy or tasty.  Stick to Chewy Chocolate Chip cookies.
I've tried quite a few coffees and many end up tasting the same. However, this and the Italian Blend by Timothy's are two that are very good black or with cream/sugar. If you're looking for a simple coffee without all the flavors, this is the one for you.


In [73]:
# remove urls from text python: https://stackoverflow.com/a/40823105/4084039
sent_0 = re.sub(r"http\S+", "", sent_0)
sent_1000 = re.sub(r"http\S+", "", sent_1000)
sent_150 = re.sub(r"http\S+", "", sent_1500)
sent_4900 = re.sub(r"http\S+", "", sent_4900)

print(sent_0)

Why is this $[...] when the same product is available for $[...] here?<br /> /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.


In [74]:
# https://stackoverflow.com/questions/16206380/python-beautifulsoup-how-to-remove-all-tags-from-an-element
from bs4 import BeautifulSoup

soup = BeautifulSoup(sent_0, 'lxml')
text = soup.get_text()
print(text)
print("="*50)

soup = BeautifulSoup(sent_1000, 'lxml')
text = soup.get_text()
print(text)
print("="*50)

soup = BeautifulSoup(sent_1500, 'lxml')
text = soup.get_text()
print(text)
print("="*50)

soup = BeautifulSoup(sent_4900, 'lxml')
text = soup.get_text()
print(text)

Why is this $[...] when the same product is available for $[...] here? />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.
These are delicious.  The 2 oz bags are a bit big for school lunches, definitely more than a serving.
When I eat junk food, I'd like to enjoy it, the guilty pleasure it is.These cookies are neither healthy or tasty.  Stick to Chewy Chocolate Chip cookies.
I've tried quite a few coffees and many end up tasting the same. However, this and the Italian Blend by Timothy's are two that are very good black or with cream/sugar. If you're looking for a simple coffee without all the flavors, this is the one for you.


In [75]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [76]:
sent_1500 = decontracted(sent_1500)
print(sent_1500)
print("="*50)

When I eat junk food, I would like to enjoy it, the guilty pleasure it is.<br /><br />These cookies are neither healthy or tasty.  Stick to Chewy Chocolate Chip cookies.


In [77]:
#remove words with numbers python: https://stackoverflow.com/a/18082370/4084039
sent_0 = re.sub("\S*\d\S*", "", sent_0).strip()
print(sent_0)

Why is this $[...] when the same product is available for $[...] here?<br /> /><br />The Victor  and  traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.


In [78]:
#remove spacial character: https://stackoverflow.com/a/5843547/4084039
sent_1500 = re.sub('[^A-Za-z0-9]+', ' ', sent_1500)
print(sent_1500)

When I eat junk food I would like to enjoy it the guilty pleasure it is br br These cookies are neither healthy or tasty Stick to Chewy Chocolate Chip cookies 


In [79]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [80]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
# <br /><br /> ==> after the above steps, we are getting "br br"
# we are including them into stop words list
# instead of <br /> if we have <br/> these tags would have revmoved in the 1st step

stopwords = set(['br','i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])




In [81]:
sno = nltk.stem.SnowballStemmer('english')

In [82]:
from bs4 import BeautifulSoup
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


# Combining all the above stundents 
from tqdm import tqdm
preprocessed_reviews = []
# tqdm is for printing the status bar
for sentence in tqdm(final['Text'].values):
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = decontracted(sentence)
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # https://gist.github.com/sebleier/554280
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    sentence = ' '.join((sno.stem(w)) for w in sentence.split()) 
    preprocessed_reviews.append(sentence.strip())

100%|█████████████████████████████████████████████████████████████████████████████| 4989/4989 [00:12<00:00, 403.32it/s]


In [83]:
preprocessed_reviews[100]

'not know scienc diet refer junk food one higher qualiti food market lot new organ natur cat food not healthi pet crack regardless veterinarian want put year old cat prescript food high count crystal urin comparison scienc diet matur adult activ longev closest prescript food cat younger one older food seem enjoy graze day indoor cat tri best not overfe younger cat not issu urin sinc went food happi'

In [84]:
sno.stem('science')

'scienc'

In [85]:
preprocessed_reviews[100]

'not know scienc diet refer junk food one higher qualiti food market lot new organ natur cat food not healthi pet crack regardless veterinarian want put year old cat prescript food high count crystal urin comparison scienc diet matur adult activ longev closest prescript food cat younger one older food seem enjoy graze day indoor cat tri best not overfe younger cat not issu urin sinc went food happi'

In [86]:
final['Text'].values[100]

"I don't know why Science Diet is being referred to as junk food, as it is one of the higher quality foods on the market. A lot of the new organic and natural cat foods are not as healthy for pets as they are cracked up to be. But regardless, my veterinarian wanted me to put my 2 year old cat on a prescription food because of the high count of crystals in her urine. After some comparisons, Science Diet Mature Adult active longevity is the closest to the prescription food. Both my cats, the younger one and the older (10) are on this food and they seem to enjoy it. They both graze all day and are indoor cats so i try my best not to overfeed them. My younger cat has not had issues with her urine since she went on this food and we are all happy :)"

## Pre-processing for Summary column

In [87]:
final.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [88]:
final.Summary.values[100]

'Keeps my cats happy and healthy'

In [89]:
# Combining all the above stundents 
from tqdm import tqdm
preprocessed_summary = []
# tqdm is for printing the status bar
for sentence in tqdm(final['Summary'].values):
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = decontracted(sentence)
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    sentence = ' '.join((sno.stem(w)) for w in sentence.split())  
    preprocessed_summary.append(sentence.strip())

100%|████████████████████████████████████████████████████████████████████████████| 4989/4989 [00:03<00:00, 1269.78it/s]


In [90]:
preprocessed_summary[100]

'keep cat happi healthi'

## Now we will insert all the words on the basis of classes

In [91]:
final['cleaned_text'] = preprocessed_reviews

In [92]:
positive_df = final["cleaned_text"][final['Score']=="positive"]

In [93]:
positive_df.shape

(4180,)

In [94]:
negative_df = final["cleaned_text"][final['Score']=="negative"]

In [95]:
negative_df.shape

(809,)

In [96]:
positive_words = []
for sen in positive_df:
    for word in sen.split():
        positive_words.append(word)
        
print(positive_words)

['product', 'avail', 'victor', 'trap', 'unreal', 'cours', 'total', 'fli', 'genocid', 'pretti', 'stinki', 'right', 'nearbi', 'use', 'victor', 'fli', 'bait', 'season', 'ca', 'not', 'beat', 'great', 'product', 'receiv', 'shipment', 'could', 'hard', 'wait', 'tri', 'product', 'love', 'slicker', 'call', 'instead', 'sticker', 'remov', 'easili', 'daughter', 'design', 'sign', 'print', 'revers', 'use', 'car', 'window', 'print', 'beauti', 'print', 'shop', 'program', 'go', 'lot', 'fun', 'product', 'window', 'everywher', 'surfac', 'like', 'tv', 'screen', 'comput', 'monitor', 'realli', 'good', 'idea', 'final', 'product', 'outstand', 'use', 'decal', 'car', 'window', 'everybodi', 'ask', 'bought', 'decal', 'made', 'two', 'thumb', 'glad', 'cocker', 'standard', 'poodl', 'puppi', 'love', 'stuff', 'trust', 'brand', 'superior', 'nutrit', 'compar', 'label', 'previous', 'feed', 'pedigre', 'most', 'corn', 'littl', 'dude', 'healthi', 'happi', 'high', 'energi', 'glossi', 'coat', 'also', 'superior', 'nutrit', 'pr




In [97]:
negative_words = []
for sen in negative_df:
    for word in sen.split():
        negative_words.append(word)
        
print(negative_words)

['descript', 'product', 'discept', 'product', 'repres', 'powder', 'not', 'powder', 'granul', 'noth', 'shred', 'coconut', 'not', 'even', 'dissolv', 'high', 'speed', 'commerci', 'blender', 'unless', 'use', 'product', 'manufactur', 'dark', 'chocol', 'coat', 'coconut', 'patti', 'useless', 'intent', 'use', 'addit', 'healthi', 'shake', 'end', 'ruin', 'shake', 'resort', 'chew', 'undissolv', 'tasteless', 'coconut', 'piec', 'rather', 'drink', 'shake', 'addit', 'way', 'product', 'packag', 'no', 'protect', 'cardboard', 'prevent', 'slash', 'top', 'packag', 'box', 'open', 'could', 'rate', 'product', 'zero', 'star', 'would', 'redeem', 'qualiti', 'rather', 'inexpens', 'gave', 'one', 'packag', 'away', 'free', 'patient', 'love', 'coconut', 'gave', 'back', 'two', 'day', 'later', 'complain', 'terribl', 'bought', 'brand', 'onlin', 'indian', 'groceri', 'store', 'usual', 'excel', 'product', 'abl', 'turn', 'cream', 'butter', 'use', 'super', 'blender', 'ad', 'water', 'bare', 'flavor', 'usual', 'buy', 'chao', 




In [98]:
freq_positive_words = nltk.FreqDist(positive_words)
freq_negative_words = nltk.FreqDist(negative_words)
print("Most Common Positive words :{}".format(freq_positive_words.most_common(20)))
print("Most Common Negative words :{}".format(freq_negative_words.most_common(20)))

Most Common Positive words :[('not', 3593), ('like', 1814), ('tast', 1631), ('good', 1569), ('flavor', 1558), ('love', 1461), ('great', 1427), ('use', 1262), ('one', 1198), ('product', 1197), ('tri', 1157), ('coffe', 1015), ('food', 1011), ('chip', 992), ('make', 971), ('would', 913), ('get', 829), ('tea', 790), ('bag', 736), ('buy', 726)]
Most Common Negative words :[('not', 1283), ('like', 441), ('tast', 434), ('product', 412), ('would', 324), ('one', 280), ('tri', 278), ('flavor', 274), ('food', 238), ('use', 230), ('good', 208), ('no', 205), ('tea', 188), ('buy', 185), ('order', 184), ('chip', 181), ('get', 178), ('bag', 177), ('even', 167), ('make', 160)]


## 4. FEATURIZATION

### 4.1 BAG OF WORDS

In [99]:
#BoW
count_vect = CountVectorizer() #in scikit-learn
count_vect.fit(preprocessed_reviews)
print("some feature names ", count_vect.get_feature_names()[:10])
print('='*50)

some feature names  ['aa', 'aahhh', 'aback', 'abandon', 'abat', 'abbi', 'abbott', 'abdomin', 'abid', 'abil']


In [100]:
final_counts = count_vect.transform(preprocessed_reviews)
print("the type of count vectorizer ",type(final_counts))
print("the shape of out text BOW vectorizer ",final_counts.get_shape())
print("the number of unique words ", final_counts.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (4989, 8998)
the number of unique words  8998


## [4.2] Bi-Grams and n-Grams.

In [101]:
#bi-gram, tri-gram and n-gram

#removing stop words like "not" should be avoided before building n-grams
# count_vect = CountVectorizer(ngram_range=(1,2))
# please do read the CountVectorizer documentation http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# you can choose these numebrs min_df=10, max_features=5000, of your choice
count_vect = CountVectorizer(ngram_range=(1,2), min_df=10, max_features=5000)
final_bigram_counts = count_vect.fit_transform(preprocessed_reviews)
print("the type of count vectorizer ",type(final_bigram_counts))
print("the shape of out text BOW vectorizer ",final_bigram_counts.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_bigram_counts.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (4989, 3021)
the number of unique words including both unigrams and bigrams  3021


## [4.3] TF-IDF

In [102]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_vect.fit(preprocessed_reviews)
print("some sample features(unique words in the corpus)",tf_idf_vect.get_feature_names()[0:10])
print('='*50)

final_tf_idf = tf_idf_vect.transform(preprocessed_reviews)
print("the type of count vectorizer ",type(final_tf_idf))
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_tf_idf.get_shape()[1])

some sample features(unique words in the corpus) ['abil', 'abl', 'abl find', 'abl get', 'absolut', 'absolut best', 'absolut delici', 'absolut love', 'absolut no', 'absorb']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (4989, 3021)
the number of unique words including both unigrams and bigrams  3021


## [4.4] Word2Vec

In [103]:
# Train your own Word2Vec model using your own text corpus
i=0
list_of_sentance=[]
for sentance in preprocessed_reviews:
    list_of_sentance.append(sentance.split())

In [105]:
is_your_ram_gt_16g=False
want_to_use_google_w2v = False
want_to_train_w2v = True

if want_to_train_w2v:
    # min_count = 5 considers only words that occured atleast 5 times
    w2v_model=Word2Vec(list_of_sentance,min_count=5,vector_size=50, workers=4)
    print(w2v_model.wv.most_similar('great'))
    print('='*50)
    print(w2v_model.wv.most_similar('worst'))
    
elif want_to_use_google_w2v and is_your_ram_gt_16g:
    if os.path.isfile('GoogleNews-vectors-negative300.bin'):
        w2v_model=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
        print(w2v_model.wv.most_similar('great'))
        print(w2v_model.wv.most_similar('worst'))
    else:
        print("you don't have gogole's word2vec file, keep want_to_train_w2v = True, to train your own w2v ")

[('excel', 0.9541078209877014), ('think', 0.9457134008407593), ('especi', 0.9440923929214478), ('overal', 0.9427210092544556), ('altern', 0.9413254261016846), ('sacrific', 0.93986976146698), ('snack', 0.9396160840988159), ('defin', 0.939296305179596), ('wonder', 0.9356966614723206), ('hamburg', 0.9355344772338867)]
[('terribl', 0.9973497986793518), ('english', 0.9969518780708313), ('normal', 0.9966275095939636), ('wow', 0.9965056777000427), ('south', 0.9964181184768677), ('stand', 0.9964109063148499), ('aw', 0.9961926937103271), ('lech', 0.9961819648742676), ('oh', 0.9961560964584351), ('similar', 0.9961084127426147)]


In [107]:
w2v_words = list(w2v_model.wv.key_to_index)
print("number of words that occured minimum 5 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])

number of words that occured minimum 5 times  2998
sample words  ['not', 'like', 'tast', 'flavor', 'good', 'product', 'love', 'great', 'use', 'one', 'tri', 'food', 'would', 'chip', 'make', 'coffe', 'get', 'tea', 'bag', 'buy', 'no', 'eat', 'time', 'realli', 'order', 'best', 'much', 'mix', 'price', 'find', 'amazon', 'dog', 'also', 'littl', 'brand', 'cup', 'chocol', 'even', 'well', 'better', 'store', 'box', 'water', 'go', 'hot', 'free', 'recommend', 'drink', 'year', 'look']


## [4.4.1] Converting text into vectors using wAvg W2V, TFIDF-W2V

#### [4.4.1.1] Avg W2v

# average Word2Vec
# compute average word2vec for each review.
sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sentance): # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

#### [4.4.1.2] TFIDF weighted W2v

In [110]:
# S = ["abc def pqr", "def def def abc", "pqr pqr def"]
model = TfidfVectorizer()
model.fit(preprocessed_reviews)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(model.get_feature_names(), list(model.idf_)))

In [111]:
# TF-IDF weighted Word2Vec
tfidf_feat = model.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in tqdm(list_of_sentance): # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words and word in tfidf_feat:
            vec = w2v_model.wv[word]
#             tf_idf = tf_idf_matrix[row, tfidf_feat.index(word)]
            # to reduce the computation we are 
            # dictionary[word] = idf value of word in whole courpus
            # sent.count(word) = tf valeus of word in this review
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1

100%|█████████████████████████████████████████████████████████████████████████████| 4989/4989 [00:28<00:00, 173.90it/s]


In [112]:
final.shape[1]

11