In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## **Importing Libraries**

In [2]:
# Importing all the Libraries
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd  
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

import re
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os

## **Loading Data**

In [3]:
path = '/content/drive/MyDrive/Project/Reviews.csv'
df = pd.read_csv(path)

In [4]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
df.shape

(568454, 10)

## **EDA**

In [6]:
# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    if x < 3:
        return 0
    return 1

actual_score = df['Score']
positiveNegative = actual_score.map(partition) 
df['Score'] = positiveNegative
print("Number of data points in our data", df.shape)
df.head()

Number of data points in our data (568454, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [7]:
# Sorting data according to ProductId in ascending order
sorted_data = df.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
sorted_data.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
515425,515426,141278509X,AB1A5EGHHVA9M,CHelmic,1,1,1,1332547200,The best drink mix,This product by Archer Farms is the best drink...
24749,24750,2734888454,A13ISQV0U9GZIC,Sandikaye,1,1,0,1192060800,made in china,My dogs loves this chicken but its a product f...
24750,24751,2734888454,A1C298ITT645B6,Hugh G. Pritchard,0,0,1,1195948800,Dog Lover Delites,Our dogs just love them. I saw them in a pet ...
308076,308077,2841233731,A3QD68O22M2XHQ,LABRNTH,0,0,1,1345852800,Great recipe book for my babycook,This book is easy to read and the ingredients ...
150528,150529,6641040,A25ACLV5KPB4W,"Matt Hetling ""Matt""",0,1,1,1108425600,"Nice cadence, catchy rhymes",In June<br />I saw a charming group<br />of ro...


In [8]:
# Deduplication of entries
final = sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape

(393933, 10)

In [9]:
# Checking to see how much % of data still remains
(final['Id'].size*1.0)/(df['Id'].size*1.0)*100

69.29901100176971

In [10]:
final = final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [11]:
# Before starting the next phase of preprocessing lets see the number of entries left
print(final.shape)

# How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()

(393931, 10)


1    336824
0     57107
Name: Score, dtype: int64

## **Text Preprocessing**

In [12]:
import nltk
nltk.download('stopwords')

# set of stop words
stop = set(stopwords.words('english'))

# set of snow ball stemmers in english
sno=nltk.stem.SnowballStemmer('english') 

# function to clean html tags in a sentence
def cleanhtml(sentence): 
    cleannr=re.compile('<.*?>')
    cleantext=re.sub(cleannr,'',sentence)
    return cleantext

# function to clean punctuation in the sentence
def cleanpunc(sentence) : 
    cleaned=re.sub(r'[? | ! | \' |" | #]',r'',sentence)
    cleaned=re.sub(r'[. | , | ) | ( | \ | / ]' ,r' ',sentence)
    return cleaned

print(stop)
print('**' * 50)
print(sno.stem('taste'))

{'no', "mightn't", 'll', 'more', 'mightn', 'whom', 'until', 'she', "needn't", 'into', 'once', 've', "aren't", 'him', 'again', 'which', 'some', "don't", 'ain', 'now', 'yourself', 'shan', 'your', 'had', "that'll", 'i', 'from', 'those', 'because', 'our', 'off', 'over', "wouldn't", "wasn't", 'isn', 'y', 'been', 'in', 'same', 'yours', "shouldn't", 'with', 're', "she's", 'during', 'above', 'why', 'very', "didn't", 'hasn', 'his', 'd', 'and', 'under', 'this', "you're", 'each', 'doing', 'they', 'the', 'my', "isn't", 'you', 'themselves', 'where', "hasn't", "couldn't", "you've", 'are', 'their', 'is', 'couldn', 'itself', 'hadn', 'wouldn', "it's", "you'd", 'was', 'we', 'then', 'doesn', 'up', 'does', 'than', 'such', 'didn', 'as', 'theirs', 'do', 'wasn', 'needn', 'not', 'them', 'm', 'but', 'having', 'most', "haven't", 'ma', "mustn't", 'its', 't', 'after', 'aren', 'should', 'between', 'were', 'have', 'while', 'all', 'down', 'both', 'too', 'through', 'won', 'weren', 'or', 'will', 'be', 'mustn', 'there'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [14]:
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [15]:
from tqdm import tqdm # tqdm is for printing the status bar

i=0
str1=' '
preprocessed_reviews=[]
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here
s=''

for sentance in tqdm(final['Text'].values):
    filtered_sentence=[]
    sentance=cleanhtml(sentance)# removing the html tags
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    for w in sentance.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words) > 2)):
                if (cleaned_words.lower() not in stopwords):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (final['Score'].values[i]) == 1 :
                      all_positive_words.append(s) # list all the positive words
                    if (final['Score'].values[i]) == 0 :
                       all_negative_words.append(s) # list all the negative words
                else:
                    continue
            else:
                continue
    str1=b" ".join(filtered_sentence) # final string of the filtered sentence
    
    preprocessed_reviews.append(str1.strip())
    i+=1

100%|██████████| 393931/393931 [13:48<00:00, 475.71it/s]


In [16]:
final['preprocessed_reviews'] = preprocessed_reviews

In [17]:
final['preprocessed_reviews'] = final['preprocessed_reviews'].str.decode("utf-8")

In [18]:
final['preprocessed_reviews'].iloc[0]

'product archer farm best drink mix ever mix flavor packet water bottl contain natur sweetner stevia real fruit flavor food color color fruit veget color pure natur tast great eight packet box contain calori per packet thank archer farm'

In [19]:
final.to_csv('/content/drive/MyDrive/Project/preprocessed_reviews.csv',index=False)

In [20]:
final['Summary']=final['Summary'].apply(str)

## **Preprocess Summary**

In [21]:
from tqdm import tqdm # tqdm is for printing the status bar

i=0
str1=' '
preprocessed_summary=[]
all_positive_summary=[] # store words from +ve reviews here
all_negative_summary=[] # store words from -ve reviews here
s=''

for sentance in tqdm(final['Summary'].values):
    filtered_summary=[]
    sentance = cleanhtml(sentance)# removing the html tags
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    for w in sentance.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words) > 2)):
                if (cleaned_words.lower() not in stopwords):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_summary.append(s)
                    if (final['Score'].values[i]) == 1 :
                      all_positive_summary.append(s) # list all the positive words
                    if (final['Score'].values[i]) == 0 :
                      all_negative_summary.append(s) # list all the negative words
                else:
                    continue
            else:
                continue
    str1=b" ".join(filtered_summary) # final string of the filtered sentence
    
    preprocessed_summary.append(str1.strip())
    i+=1

100%|██████████| 393931/393931 [02:48<00:00, 2338.26it/s]


In [22]:
final['preprocessed_summary'] = preprocessed_summary 

In [23]:
final['preprocessed_summary']=final['preprocessed_summary'].str.decode("utf-8")

In [24]:
final['preprocessed_summary'].iloc[0]

'best drink mix'

In [25]:
final.to_csv('/content/drive/MyDrive/Project/preprocessed_data.csv',index=False)

In [26]:
all_positive_summary[0]

b'best'

In [27]:
all_positive_words[0:10]

[b'product',
 b'archer',
 b'farm',
 b'best',
 b'drink',
 b'mix',
 b'ever',
 b'mix',
 b'flavor',
 b'packet']

In [28]:
all_negative_words[0:10]

[b'dog',
 b'love',
 b'chicken',
 b'product',
 b'china',
 b'wont',
 b'buy',
 b'anymor',
 b'hard',
 b'find']

In [29]:
all_negative_summary[0:10]

[b'made',
 b'china',
 b'stori',
 b'great',
 b'softcov',
 b'book',
 b'disappoint',
 b'awesom',
 b'book',
 b'poor']

In [30]:
all_positive_summary[0:10]

[b'best',
 b'drink',
 b'mix',
 b'dog',
 b'lover',
 b'delit',
 b'great',
 b'recip',
 b'book',
 b'babycook']

## **Featurization**

In [31]:
# Bag of Words
count_vect = CountVectorizer() 
count_vect.fit(preprocessed_reviews)
print("some feature names ", count_vect.get_feature_names()[500:510])
print('='*50)

final_counts = count_vect.transform(preprocessed_reviews)
print("the type of count vectorizer ",type(final_counts))
print("the shape of out text BOW vectorizer ",final_counts.get_shape())
print("the number of unique words ", final_counts.get_shape()[1])

some feature names  ['acetaia', 'acetaminophen', 'acetatewaterzinc', 'aceto', 'aceton', 'acetonitril', 'acetum', 'acetyl', 'acetylcholin', 'ach']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (393931, 91969)
the number of unique words  91969


In [32]:
data = pd.read_csv('/content/drive/MyDrive/Project/preprocessed_data.csv')

In [33]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,preprocessed_reviews,preprocessed_summary
0,515426,141278509X,AB1A5EGHHVA9M,CHelmic,1,1,1,1332547200,The best drink mix,This product by Archer Farms is the best drink...,product archer farm best drink mix ever mix fl...,best drink mix
1,24750,2734888454,A13ISQV0U9GZIC,Sandikaye,1,1,0,1192060800,made in china,My dogs loves this chicken but its a product f...,dog love chicken product china wont buy anymor...,made china
2,24751,2734888454,A1C298ITT645B6,Hugh G. Pritchard,0,0,1,1195948800,Dog Lover Delites,Our dogs just love them. I saw them in a pet ...,dog love saw pet store tag attach regard made ...,dog lover delit
3,308077,2841233731,A3QD68O22M2XHQ,LABRNTH,0,0,1,1345852800,Great recipe book for my babycook,This book is easy to read and the ingredients ...,book easi read ingredi avail store unlik recip...,great recip book babycook
4,150529,6641040,A25ACLV5KPB4W,"Matt Hetling ""Matt""",0,1,1,1108425600,"Nice cadence, catchy rhymes",In June<br />I saw a charming group<br />of ro...,junei saw charm groupof rose beginto droopi pe...,nice cadenc catchi rhyme


In [34]:
data['preprocessed_reviews'].isnull().sum()

3

In [35]:
data['preprocessed_reviews']=data['preprocessed_reviews'].fillna(method='bfill')
data['preprocessed_reviews'].isnull().sum()

0

In [36]:
# bi-gram, tri-gram and n-gram
# Used only 5000 points beause we have memory error
count_vect = CountVectorizer(ngram_range=(1,2), min_df=10, max_features=5000)
final_bigram_counts = count_vect.fit_transform(data['preprocessed_reviews'])[:5000]
print("the type of count vectorizer ",type(final_bigram_counts))
print("the shape of out text BOW vectorizer ",final_bigram_counts.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_bigram_counts.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (5000, 5000)
the number of unique words including both unigrams and bigrams  5000


In [37]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_vect.fit(data['preprocessed_reviews'])
print("some sample features(unique words in the corpus)",tf_idf_vect.get_feature_names()[0:10])
print('='*50)

final_tf_idf = tf_idf_vect.transform(data['preprocessed_reviews'])
print("the type of count vectorizer ",type(final_tf_idf))
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_tf_idf.get_shape()[1])

some sample features(unique words in the corpus) ['aaa', 'aaaaa', 'aaah', 'aafco', 'aah', 'ab', 'aback', 'abandon', 'abbey', 'abbi']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (393931, 222706)
the number of unique words including both unigrams and bigrams  222706


In [38]:
data = pd.read_csv('/content/drive/MyDrive/Project/preprocessed_data.csv')
data['preprocessed_reviews']=data['preprocessed_reviews'].fillna(method='bfill')
data['preprocessed_reviews'].isnull().sum()

0

In [39]:
i=0
list_of_sentance=[]
for sentance in data['preprocessed_reviews']:
    list_of_sentance.append(sentance.split())

In [40]:
# Using Google News Word2Vectors
is_your_ram_gt_16g=False
want_to_use_google_w2v = False
want_to_train_w2v = True

if want_to_train_w2v:
    w2v_model=Word2Vec(list_of_sentance,min_count=5,size=50, workers=4)
    print(w2v_model.wv.most_similar('great'))
    print('='*50)
    print(w2v_model.wv.most_similar('worst'))
    
elif want_to_use_google_w2v and is_your_ram_gt_16g:
    if os.path.isfile('GoogleNews-vectors-negative300.bin'):
        w2v_model=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
        print(w2v_model.wv.most_similar('great'))
        print(w2v_model.wv.most_similar('worst'))
    else:
        print("you don't have gogole's word2vec file, keep want_to_train_w2v = True, to train your own w2v ")



[('fantast', 0.8723997473716736), ('terrif', 0.8663351535797119), ('awesom', 0.8448942303657532), ('good', 0.8439229726791382), ('excel', 0.8396443724632263), ('wonder', 0.7959756851196289), ('perfect', 0.7726341485977173), ('nice', 0.764541745185852), ('fabul', 0.724461019039154), ('amaz', 0.690619945526123)]
[('nastiest', 0.8893260955810547), ('greatest', 0.7726911902427673), ('best', 0.7426216006278992), ('disgust', 0.742313027381897), ('horrid', 0.7276912331581116), ('horribl', 0.7272164821624756), ('tastiest', 0.7081483006477356), ('aw', 0.7035646438598633), ('sweetest', 0.6952811479568481), ('vile', 0.6933640837669373)]


In [41]:
w2v_words = list(w2v_model.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])

number of words that occured minimum 5 times  23481
sample words  ['product', 'archer', 'farm', 'best', 'drink', 'mix', 'ever', 'flavor', 'packet', 'water', 'bottl', 'contain', 'natur', 'sweetner', 'stevia', 'real', 'fruit', 'food', 'color', 'veget', 'pure', 'tast', 'great', 'eight', 'box', 'calori', 'per', 'thank', 'dog', 'love', 'chicken', 'china', 'wont', 'buy', 'anymor', 'hard', 'find', 'made', 'usa', 'one', 'isnt', 'bad', 'good', 'take', 'chanc', 'till', 'know', 'go', 'import', 'saw']


In [42]:
# Average Word2Vec
sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sentance): # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

100%|██████████| 393931/393931 [19:42<00:00, 333.01it/s]

393931
50





In [43]:
model = TfidfVectorizer()
model.fit(data['preprocessed_reviews'])
dictionary = dict(zip(model.get_feature_names(), list(model.idf_)))

In [None]:
# TF-IDF weighted Word2Vec
tfidf_feat = model.get_feature_names() # tfidf words/col-names

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in tqdm(list_of_sentance): # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words and word in tfidf_feat:
            vec = w2v_model.wv[word] 
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1

  7%|▋         | 26723/393931 [46:35<7:03:46, 14.44it/s]