In [32]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

# Using the SQLite Table to read data
con = sqlite3.connect('./amazon-fine-food-reviews/database.sqlite')

# filtering only positive and negative reviews i.e.
# not taking into consideration those reviews with Score=3
filtered_data = pd.read_sql_query("""
SELECT * FROM Reviews
WHERE Score != 3
""", con)

# give reviews with Score>3 a positive rating, and reviews with score<3 as a negative raging

def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

# Changing reviews with score less than 3 to be positive and vice-versa
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition)
filtered_data['Score'] = positiveNegative

[nltk_data] Downloading package stopwords to /home/ritesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
filtered_data.shape
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# Data Cleaning : Deduplication

In [34]:
display = pd.read_sql_query("""
SELECT * FROM Reviews
WHERE Score != 3 AND USerId="AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [35]:
# Sorting data according to ProductId in ascending order
sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True)

In [36]:
final = sorted_data.drop_duplicates(subset={"UserId", "ProfileName", "Time", "Text"}, keep='first', inplace=False)
final.shape

(364173, 10)

In [37]:
# Checking how much data is remaining
(final['Id'].size * 1.0)/(filtered_data['Id'].size*1.0) * 100

69.25890143662969

Observation:- It was also seen that in two rows given below the value of HelpfulnessNumerator is greater than HelpfulnessDenominator which is not practically possible hence these two rows too are removed from calculations

In [38]:
display = pd.read_sql_query("""
SELECT * FROM Reviews
WHERE Score != 3 AND Id=44737 Or Id=64422
ORDER BY ProductID
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [39]:
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]

In [40]:
# Before starting the next phase of preprocessing lets see the number of 
print(final.shape)

# How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

In [41]:

# BoW
count_vect = CountVectorizer()  ## in Scikit-learn
final_counts = count_vect.fit_transform(final['Text'].values)

In [42]:
type(final_counts)

scipy.sparse.csr.csr_matrix

In [43]:
final_counts.get_shape()

(364171, 115281)

## Text Preprocessing

In [44]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english'))    # Set of stopwords
sno = nltk.stem.SnowballStemmer('english')

def cleanhtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r'', cleaned)
    return cleaned

print(stop)
print("*******************************************")
print(sno.stem('tasty'))

{'against', 'into', 'here', 'needn', 'down', 'before', 'they', 'no', 'above', 'we', 'wasn', 'about', 'same', 'whom', 'does', 'after', 'your', "weren't", 'over', 'too', 'my', 'she', 'herself', 'both', 'will', "doesn't", 'weren', 'should', 'did', 'again', 'hers', 'its', 'of', 'he', "you've", 'couldn', 'be', 'or', 'once', 'was', 'up', "she's", "that'll", 'for', 'during', 'are', "it's", 'under', 'any', 'hasn', 'isn', 'you', 'themselves', 'don', 'in', 'am', 'how', 'their', 'her', 'didn', 'doesn', 'on', 've', "needn't", 'i', 'by', 'but', "isn't", 'as', "hadn't", 'all', 'some', 'when', 'this', 'what', 'were', 'yourselves', 'through', 'mustn', "shan't", "wouldn't", 'to', 'few', 'himself', 'ma', 'shan', 'hadn', 't', 'wouldn', 'until', 'out', 'now', "should've", 'myself', 'yourself', 'll', 'him', 'between', 'had', 'if', "hasn't", 'from', 'o', 'won', 'such', 'haven', "wasn't", 'that', 'so', 'with', "you're", "didn't", 'me', 'm', 'our', 'because', 'y', 'which', 'being', 'the', "couldn't", 'them', 

In [46]:
# Step by step code for Sentiment analysis

i = 0
str1 = ' '
final_string = []
all_positive_words = []
all_negative_words = []
s = ''

for sent in final['Text'].values:
    filtered_sentence = []
    #print(sent)
    sent = cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if ((cleaned_words.isalpha()) and (len(cleaned_words) > 2)):
                if (cleaned_words.lower() not in stop):
                    s = (sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (final["Score"].values)[i] == 'positive':
                        all_positive_words.append(s)   # list of all words used
                    if (final["Score"].values)[i] == 'negative':
                        all_negative_words.append(s)  # list of all words used
                else:
                    continue
            else:
                continue
    #print(filtered_sentence)
    str1 = b" ".join(filtered_sentence)   # final string of cleaned words
    # print("********************************************")
    
    final_string.append(str1)
    i += 1
    
        

In [47]:
final["CleneadText"] = final_string   # Adding a column of CleanedText which 

In [48]:
final.head(3)

# store final.table into an SQLite table for future
conn = sqlite3.connect('final.sqlite')
c = conn.cursor()
conn.text_factory = str
final.to_sql("Reviews", conn, flavor=None, schema=None, if_exists="replace")

TypeError: to_sql() got an unexpected keyword argument 'flavor'

## Bi-Grams and n-Grams

In [49]:
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)
print("Most common Positive words: ", freq_dist_positive.most_common(20))
print("Most common Negative words: ", freq_dist_negative.most_common(20))

Most common Positive words:  [(b'like', 139072), (b'tast', 128077), (b'good', 112017), (b'flavor', 108653), (b'love', 107018), (b'use', 103603), (b'great', 103095), (b'one', 96537), (b'product', 90273), (b'tri', 86411), (b'tea', 83290), (b'coffe', 77985), (b'make', 74914), (b'get', 72029), (b'food', 64244), (b'would', 55457), (b'time', 54848), (b'buy', 54031), (b'realli', 52611), (b'eat', 51790)]
Most common Negative words:  [(b'tast', 34300), (b'like', 32225), (b'product', 28003), (b'one', 20521), (b'flavor', 19368), (b'would', 17947), (b'tri', 17691), (b'use', 15263), (b'good', 14908), (b'coffe', 14579), (b'get', 13770), (b'buy', 13711), (b'order', 12832), (b'food', 12643), (b'dont', 11808), (b'tea', 11574), (b'even', 11042), (b'box', 10734), (b'make', 9823), (b'time', 9720)]


In [50]:
## Bi-gram, tri-gram and n-gram

# removing stop words like "Not" should be avoided before building n-grams
count_vect = CountVectorizer(ngram_range=(1, 2))  # in Scikit-learn
final_bigram_counts = count_vect.fit_transform(final['Text'].values)

In [53]:
print(final_bigram_counts.get_shape())

(364171, 2910192)


## TF-IDF

In [54]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1, 2))
final_tf_idf = tf_idf_vect.fit_transform(final["Text"].values)

In [55]:
final_tf_idf.get_shape()

(364171, 2910192)

In [56]:
features = tf_idf_vect.get_feature_names()
len(features)

2910192

In [57]:
features[100000:100010]

['ales until',
 'ales ve',
 'ales would',
 'ales you',
 'alessandra',
 'alessandra ambrosia',
 'alessi',
 'alessi added',
 'alessi also',
 'alessi and']

In [58]:
# convert a row in sparsematrix to a numpy array
print(final_tf_idf[3, :].toarray()[0])

[0. 0. 0. ... 0. 0. 0.]


In [60]:
def top_tfidf_feats(row, features, top_n=25):
    '''Get top n tfidf values in row and return them with their corresponding'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['features', 'tfidf']
    return df
top_tfidf = top_tfidf_feats(final_tf_idf[1, :].toarray()[0], features, 25)

In [61]:
top_tfidf

Unnamed: 0,features,tfidf
0,sendak books,0.173437
1,rosie movie,0.173437
2,paperbacks seem,0.173437
3,cover version,0.173437
4,these sendak,0.173437
5,the paperbacks,0.173437
6,pages open,0.173437
7,really rosie,0.168074
8,incorporates them,0.168074
9,paperbacks,0.168074


# Word2Vec

In [63]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

model = KeyedVectors.load_word