In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# The Amazon Fine Food Reviews dataset consists of reviews of fine foods from Amazon.

* Number of reviews: 568,454
* Number of users: 256,059
* Number of products: 74,258
* Timespan: Oct 1999 - Oct 2012
* Number of Attributes/Columns in data: 10

**Attribute Information:**

* Id
* ProductId - unique identifier for the product
* UserId - unique identifier for the user
* ProfileName
* HelpfulnessNumerator - number of users who found the review helpful
* HelpfulnessDenominator - number of users who indicated whether they found the review helpful or not
* Score - rating between 1 and 5
* Time - timestamp for the review
* Summary - brief summary of the review
* Text - text of the review

**Objective:**

Given a review, determine whether the review is positive (Rating of 4 or 5) or negative (rating of 1 or 2).


[Q] How to determine if a review is positive or negative?

[Ans] We could use the Score/Rating. A rating of 4 or 5 could be considered a positive review. A rating of 1 or 2 could be considered negative. A rating of 3 is neutral and is ignored. This is an approximate, proxy way of determining the polarity (positivity/negativity) of a review.

**Loading the data**

The dataset is available in two forms

.csv file
SQLite Database
In order to load the data, we have used the SQLite database, as it is easier to query and visualise the data efficiently.

Here, as we only want to get the global sentiment of the recommendations (positive or negative), we will purposefully ignore all Scores equal to 3. If the score is above 3, then the recommendation will be set to "positive". Otherwise, it will be set to "negative".

In [None]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

In [None]:
# using the SQLite Table to read data.
con = sqlite3.connect('/kaggle/input/amazon-fine-food-reviews/database.sqlite')
#con = sqlite3.connect('database.sqlite') 

#filtering only positive and negative reviews i.e.
# not taking into consideration those reviews with Score=3

df = pd.read_sql_query("SELECT * FROM Reviews WHERE Score !=3", con)

# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    """Map a numeric review score to its sentiment label.

    Reviews with a score of 3 are excluded by the SQL query that loads the
    data, so a score below 3 (1 or 2) is a negative review and any other
    score (4 or 5) is a positive review.

    Parameters
    ----------
    x : int
        The review score.

    Returns
    -------
    str
        ``'negative'`` when ``x < 3``, otherwise ``'positive'``.
    """
    # The original returned 'positive' for x < 3, which inverted every label.
    if x < 3:
        return 'negative'
    return 'positive'


# replace the numeric Score of every review with its 'positive' / 'negative' label
actualScore = df['Score']
positivenegative = actualScore.map(partition)
df['Score']  = positivenegative

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# Rows that share the same user, profile name, timestamp and review text are
# treated as duplicates; only the first occurrence of each is kept.
dedup_keys = ['UserId', 'ProfileName', 'Time', 'Text']
final = df.drop_duplicates(subset=dedup_keys, keep='first')
final.shape

In [None]:
# Percentage of the loaded reviews that remain after de-duplication.
final['Id'].size / df['Id'].size * 100

**Exploratory Data Analysis**
**Data Cleaning: Deduplication**
It is observed (as shown in the table below) that the reviews data had many duplicate entries. Hence it was necessary to remove duplicates in order to get unbiased results for the analysis of the data. Following is an example:

In [None]:
# Fetch two example rows (by Id) to illustrate the duplicate entries.
# The Id filter uses IN (...) because SQL's AND binds tighter than OR: the
# previous "Score !=3 AND Id=44737 OR Id=64422" applied the Score filter to
# the first Id only and returned Id 64422 regardless of its score.
# NOTE(review): the variable name `display` shadows IPython's display()
# helper for the rest of the session; it is kept so any cell that reads this
# variable keeps working -- consider renaming.
display = pd.read_sql_query("""Select * from
            Reviews Where Score !=3 AND Id IN (44737, 64422)
            Order by ProductId""", con)
display

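As can be seen above, the same user has multiple reviews with the same values for HelpfulnessNumerator, HelpfulnessDenominator, Score, Time, Summary and Text. On further analysis it was also found that some rows have a HelpfulnessNumerator greater than their HelpfulnessDenominator, which is not possible, so those rows are removed below.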

In [None]:
# Keep only rows where the helpful votes do not exceed the total votes cast.
helpfulness_is_consistent = final['HelpfulnessNumerator'] <= final['HelpfulnessDenominator']
final = final.loc[helpfulness_is_consistent]

In [None]:
final.shape

In [None]:
final['Score'].value_counts()

# Text Preprocessing: Stemming, stop-word removal and Lemmatization.
Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

* Begin by removing the html tags
* Remove any punctuations or limited set of special characters like , or . or # etc.
* Check if the word is made up of english letters and is not alpha-numeric
* Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
* Convert the word to lowercase
* Remove Stopwords
* Finally Snowball Stemming the word (it was observed to be better than Porter Stemming)
After which we collect the words used to describe positive and negative reviews

In [None]:
# find sentences containing HTML tags
import re
# Print the position and text of the first review that contains an
# HTML-like tag (anything of the form <...>), then stop.
i = 0
for sent in final['Text'].values:
    if re.search('<.*?>', sent) is not None:
        print(i)
        print(sent)
        break
    i += 1

In [None]:
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english'))
sno = nltk.stem.SnowballStemmer('english')

def cleanhtml(sentence):
    """Strip HTML-like tags (anything matching ``<...>``) from a review.

    Each tag is replaced with a single space instead of being deleted, so
    words that are separated only by a tag (e.g. ``good<br />taste``) are not
    fused into one token when the text is later split on whitespace.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    str
        The text with every tag replaced by a space.
    """
    return re.sub(r'<.*?>', ' ', sentence)

def cleanpunc(sentence):
    """Remove or blank out a limited set of punctuation characters.

    The characters ``? ! ' " # |`` are deleted, then ``. , ( ) /`` are each
    replaced by a single space. Every other character is left untouched.
    """
    without_marks = re.sub(r'[?!\'"#|]', '', sentence)
    return re.sub(r'[.,()/]', ' ', without_marks)

print(stop)
print("******************")
print(sno.stem('tasty'))

In [None]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase
# this code takes a while to run as it needs to run on 500k sentences.
# For every review: strip HTML tags, split into words, strip punctuation,
# keep purely alphabetic words longer than 2 characters, lowercase them,
# drop stop words and Snowball-stem what is left.
final_string = []        # one cleaned, space-joined string per review
all_positive_words = []  # stemmed words taken from positive reviews
all_negative_words = []  # stemmed words taken from negative reviews

# Extract the arrays once: the previous version re-evaluated
# final['Score'].values for every kept word.
review_texts = final['Text'].values
review_labels = final['Score'].values

for sent, label in zip(review_texts, review_labels):
    filtered_sentence = []
    sent = cleanhtml(sent)  # remove HTML tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            # skip anything that is not purely alphabetic or is too short
            if not cleaned_words.isalpha() or len(cleaned_words) <= 2:
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            # Tokens are kept as str; the earlier .encode('utf8') stored
            # bytes objects in the word lists and in the cleaned text.
            s = sno.stem(lowered)
            filtered_sentence.append(s)
            if label == 'positive':
                all_positive_words.append(s)
            elif label == 'negative':
                all_negative_words.append(s)
    # final string of cleaned words for this review
    final_string.append(" ".join(filtered_sentence))

In [None]:
# Add a CleanedText column holding the pre-processed text of each review.
# `final` was produced by boolean filtering, so assigning a column on it in
# place makes pandas emit SettingWithCopyWarning; DataFrame.assign returns a
# new frame with the extra column instead.
final = final.assign(CleanedText=final_string)

In [None]:
final.head(3)
#below the processed review can be seen in the CleanedText Column 

**Bag of Words (BoW)**

In [None]:
# Draw a balanced sample of 2000 positive and 2000 negative reviews and use
# it for everything that follows. random_state fixes the draw so that a
# re-run produces the same reviews.
data_pos = final[final["Score"]=="positive"].sample(n=2000, random_state=0)
data_neg = final[final["Score"]=="negative"].sample(n=2000, random_state=0)
final = pd.concat([data_pos, data_neg])

In [None]:
score = final["Score"]

In [None]:
score.shape

In [None]:
#bow
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['CleanedText'].values)

In [None]:
(final_counts.get_shape()[1])

In [None]:
type(final_counts)

In [None]:
final_counts.get_shape()

**Bi-Grams and n-Grams.**
* Motivation

Now that we have our list of words describing positive and negative reviews lets analyse them.

We begin analysis by getting the frequency distribution of the words as shown below

In [None]:
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)
print("Most Common Positive Words : ",freq_dist_positive.most_common(20))
print("Most Common Negative Words : ",freq_dist_negative.most_common(20))

**Observation**:- From the above it can be seen that the most common positive and negative words overlap; e.g. 'like' could be used as 'not like', etc.
So, it is a good idea to consider pairs of consecutive words (bi-grams) or a sequence of n consecutive words (n-grams).

In [None]:
#bi-gram, tri-gram and n-gram
#removing stop words like "not" should be avoided before building n-grams

count_vect = CountVectorizer(ngram_range = (1,2))
final_bigram_counts = count_vect.fit_transform(final["CleanedText"].values)

In [None]:
final_bigram_counts.get_shape()

In [None]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(final['CleanedText'].values)

In [None]:
final_tf_idf.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# Scale each TF-IDF feature to unit variance. Centering is disabled
# (with_mean=False) because the input is a sparse matrix.
std = StandardScaler(with_mean = False)
std_data = std.fit_transform(final_tf_idf)

# t-SNE is given a dense array; toarray() returns a plain ndarray.
std_data = std_data.toarray()

# tsne
model = TSNE(n_components = 2, perplexity = 50, random_state = 0)
tsne_data = model.fit_transform(std_data)

# Build the plotting frame column by column so dim1/dim2 stay numeric.
# Stacking the coordinates with the string labels via np.vstack would turn
# every value into a string. The labels come from `score`; the previously
# referenced `score_4000` is never defined in this notebook.
tsne_df = pd.DataFrame({"dim1": tsne_data[:, 0],
                        "dim2": tsne_data[:, 1],
                        "score": score.values})
sns.FacetGrid(tsne_df, hue = "score", size = 6).map(plt.scatter, "dim1", "dim2").add_legend()
plt.title("TSNE for TF-IDF")
plt.show()



In [None]:
features = tf_idf_vect.get_feature_names()
len(features)

In [None]:
# convert a row of the sparse matrix to a numpy array
print(final_tf_idf[3,: ].toarray()[0])

In [None]:
def top_tfidf_feats(row, features, top_n=25):
    """Return the `top_n` largest tf-idf values in `row` with their feature names.

    `row` is indexed by feature position and `features` maps a position to a
    feature name. The result is a DataFrame with columns 'feature' and
    'tfidf', ordered from the highest value to the lowest.
    """
    ranked = np.argsort(row)[::-1]  # positions ordered by descending value
    keep = ranked[:top_n]
    pairs = list(zip((features[k] for k in keep), (row[k] for k in keep)))
    table = pd.DataFrame(pairs)
    table.columns = ['feature', 'tfidf']
    return table

top_tfidf = top_tfidf_feats(final_tf_idf[1, :].toarray()[0], 
features, 25)

In [None]:
top_tfidf

Observations:- This representation also looks like the BoW one: the +ve and -ve reviews overlap massively.

In [None]:
# Using Google News Word2Vectors
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

## in this project we are using a pretrained model by google
# its 3.3G file, once you load this into your memory 
# it occupies ~9Gb, so please do this step only if you have >12G of ram
# we will provide a pickle file which contains a dict , 
# and it contains all our courpus words as keys and  model[word] as values
# To use this code-snippet, download "GoogleNews-vectors-negative300.bin" 
# from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# it's 1.9GB in size.

#model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
import gensim
# Tokenise every review for Word2Vec training: strip HTML tags, split on
# whitespace, strip punctuation and keep the lowercased alphabetic tokens.
# No stop-word removal or stemming is applied here.
list_of_sent = [
    [
        token.lower()
        for raw_word in cleanhtml(review).split()
        for token in cleanpunc(raw_word).split()
        if token.isalpha()
    ]
    for review in final['Text'].values
]


In [None]:
print(final['Text'].values[0])
print("*****************************************************************")
print(list_of_sent[0])

In [None]:
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=4)  

In [None]:
# Matrix of the learned vectors, one row per vocabulary word. Indexing goes
# through the model's `wv` attribute, matching how single word vectors are
# looked up elsewhere in this notebook; indexing the model object directly
# is deprecated in gensim.
w2v = w2v_model.wv[w2v_model.wv.vocab]

In [None]:
w2v.shape

In [None]:
words = list(w2v_model.wv.vocab)
print(len(words))

In [None]:
w2v_model.wv.most_similar('tasty')

In [None]:
w2v_model.wv.most_similar('like')

**Avg W2V, TFIDF-W2V**

In [None]:
# average Word2Vec
# compute average word2vec for each review.

In [None]:
from sklearn.preprocessing import StandardScaler

std_data = StandardScaler(with_mean = False).fit_transform(final_bigram_counts)
std_data.shape


In [None]:
type(std_data)

In [None]:
from sklearn.manifold import TSNE

# t-SNE configuration:
#   n_components = 2  -> project to two dimensions for plotting
#   perplexity   = 30
#   n_iter       = 5000 optimisation iterations
#   random_state = 0  -> reproducible embedding
model = TSNE(n_components=2, random_state=0, perplexity=30, n_iter=5000)

# std_data is still a sparse matrix at this point, and TSNE's default
# Barnes-Hut method requires dense input, so densify it for this call.
bow_dense = std_data.toarray() if hasattr(std_data, "toarray") else np.asarray(std_data)
tsne_data = model.fit_transform(bow_dense)

# Build the plotting frame column by column so dim1/dim2 stay numeric;
# np.vstack with the string labels would convert the coordinates to strings.
tsne_df = pd.DataFrame({"dim1": tsne_data[:, 0],
                        "dim2": tsne_data[:, 1],
                        "score": score.values})

# Only the two coordinates are mapped to plt.scatter: a third positional
# argument would be taken as the marker size, and "score" is a text label.
sns.FacetGrid(tsne_df, hue="score", size=6).map(plt.scatter, "dim1", "dim2").add_legend()
plt.title("TSNE for bag of words")
plt.show()

In [None]:
# convert sparse to dense as tsne takes dense vector
# (the line above was previously missing its '#', which made this cell a
# SyntaxError). The hasattr guard lets the cell be re-run after std_data has
# already been converted.
if hasattr(std_data, "toarray"):
    std_data = std_data.toarray()

In [None]:
#type(std_data)

* from sklearn.manifold import TSNE
* 
* model = TSNE(n_components=2, random_state=0, perplexity=30, n_iter=1000)
* 
* # configuring the parameteres
* # the number of components = 2
* # default perplexity = 30
* # default learning rate = 200
* # default Maximum number of iterations for the optimization = 1000
* 
* tsne_data = model.fit_transform(std_data)
* 
* #create a new data frame which help us in ploting the result data
* tsne_data = np.vstack((tsne_data.T, score)).T
* tsne_df = pd.DataFrame(data = tsne_data, columns=("dim1", "dim2", "score"))
* 
* sns.FacetGrid(tsne_df, hue="score", size=6).map(plt.scatter, "dim1", "dim2", "score")
* plt.title("TSNE for bag of words")
* plt.show()

**Observation**:- Here, we are unable to simply draw a hyperplane that separates the +ve and -ve reviews because they overlap each other. We will need some alternative way to separate the reviews.

In [None]:
score.head()

In [None]:
# average Word2Vec
# compute average word2vec for each review.
# Average Word2Vec: represent each review by the mean of the vectors of its
# words that are present in the Word2Vec vocabulary.
sent_vectors = []  # the avg-w2v for each sentence/review is stored in this list
vector_size = w2v_model.wv.vector_size  # dimensionality of the trained vectors
for sent in list_of_sent:  # for each review/sentence
    sent_vec = np.zeros(vector_size)
    cnt_words = 0  # num of words with a valid vector in the sentence/review
    for word in sent:
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            # word is not in the Word2Vec vocabulary
            continue
        sent_vec += vec
        cnt_words += 1
    # A review with no known word keeps the all-zero vector; dividing by a
    # zero count would fill the row with NaN.
    if cnt_words:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)

print(len(sent_vectors))
print(len(sent_vectors[0]))

In [None]:
#tsne
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0, perplexity = 20, n_iter = 5000)

# np.asarray turns the list of per-review vectors into one 2-D array.
tsne_data = model.fit_transform(np.asarray(sent_vectors))

# Build the plotting frame column by column so dim1/dim2 stay numeric
# (np.vstack with the string labels would convert them to strings). The
# labels come from `score`; the previously referenced `score_4000` is never
# defined in this notebook.
tsne_df = pd.DataFrame({"dim1": tsne_data[:, 0],
                        "dim2": tsne_data[:, 1],
                        "score": score.values})

# Plotting the result of tsne
sns.FacetGrid(tsne_df, hue="score", size=6).map(plt.scatter, 'dim1', 'dim2').add_legend()
plt.title("TSNE for Average Word2vec")
plt.show()

In [None]:
np.seterr(divide='ignore', invalid='ignore')

In [None]:
# TF-IDF weighted Word2Vec
tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

# Fitted term -> column index mapping of the vectorizer. A dict lookup
# replaces tfidf_feat.index(word), which scanned the whole feature list for
# every word of every review.
tfidf_col = tf_idf_vect.vocabulary_

# NOTE(review): final_tf_idf was fitted on the stemmed CleanedText, while
# list_of_sent holds unstemmed words, so any word whose surface form is not
# a TF-IDF term is skipped below -- confirm this is intended.

tfidf_sent_vectors = [] # the tfidf-w2v for each sentence/review is stored in this list
vector_size = w2v_model.wv.vector_size  # dimensionality of the trained vectors

for row, sent in enumerate(list_of_sent): # for each review/sentence
    sent_vec = np.zeros(vector_size) # accumulator with the same length as a word vector
    weight_sum = 0 # sum of the tf-idf weights of the words that were used
    for word in sent: # for each word in a review/sentence
        col = tfidf_col.get(word)
        if col is None:
            continue  # word has no TF-IDF column
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            continue  # word has no Word2Vec vector
        # obtain the tf-idf of the word in this sentence/review
        tf_idf = final_tf_idf[row, col]
        sent_vec += (vec * tf_idf)
        weight_sum += tf_idf
    # A review with no usable word keeps the all-zero vector; dividing by a
    # zero weight sum would fill the row with NaN.
    if weight_sum:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)

In [None]:
#to know length of tfidf vector
len(tfidf_sent_vectors)

In [None]:
np.isnan(tfidf_sent_vectors)

In [None]:
# To replace nan with 0 and inf with large finite number

tfidf_sent_vectors = np.nan_to_num(tfidf_sent_vectors)

* from sklearn.manifold import TSNE
* model = TSNE(n_components=2, random_state=0, perplexity = 50, n_iter = 5000)
* 
* tsne_data = model.fit_transform(tfidf_sent_vectors)
* 
* tsne_data = np.vstack((tsne_data.T, score_4000)).T
* tsne_df = pd.DataFrame(data=tsne_data, columns=("dim1", "dim2", "score"))
* 
* # Ploting the result of tsne
* sns.FacetGrid(tsne_df, hue="score", size=6).map(plt.scatter, 'dim1', 'dim2').add_legend()
* plt.title("TSNE for TF-IDF Word2vec")
* plt.show()

Observation:- This plot also looks like the BoW, TF-IDF and average Word2Vec plots. The +ve and -ve reviews are not well separated; they overlap each other.

Conclusions:-

None of the t-SNE representations gives a clear separation between the +ve and -ve reviews.
We cannot simply draw a plane to separate the -ve and +ve reviews. However, a visual representation of the data alone is not enough to decide whether such a plane exists.
We will need some alternative method to look into how the -ve and +ve reviews can be separated.