# Sentiment-Analysis-Amazon-fine-foods

    Loading the data

In [30]:
import pandas as pd

# Read the raw Amazon fine-food reviews dump and show its (rows, columns) size.
df = pd.read_csv(filepath_or_buffer='data/Reviews.csv')
df.shape

(568454, 10)

In [31]:
# Display the column labels of the loaded review frame.
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

## Dropping duplicates

In [32]:
# Rows count as duplicates when they match on every column except the
# row identifier and the product identifier.
imp_cols = {column for column in df.columns if column not in ('Id', 'ProductId')}
df = df.drop_duplicates(subset=imp_cols)
print('Dimension after eliminating duplicates', df.shape)

Dimension after eliminating duplicates (396309, 10)


***
    1. Neglecting 3 star reviews 
    2. Sorting by time-stamp
    3. Extracting Reviews and Summary and concatenating them
    4. Defining <3 score as negative and >3 as positive

In [33]:
# Discard 3-star reviews, then order the remaining rows by time-stamp.
df = df[df.Score != 3].sort_values(by='Time')

# Each document is the review text followed by its summary.
X = [
    str(body) + ' ' + str(headline)
    for body, headline in zip(df.Text.tolist(), df.Summary.tolist())
]

# Label 1 for scores above 3, label 0 for everything else that remains.
Y = [1 if score > 3 else 0 for score in df.Score.tolist()]

# The frame is no longer needed once X and Y exist.
del df
len(X), len(Y)

(366402, 366402)

## Cleaning the data
* Removing HTML tags
* Converting to lower case
* Tokenizing and removing stopwords with punctuation marks
* Also removing non alpha numeric data

In [34]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

# Lower-case every document and strip HTML tags.
X = [re.sub('<[^>]*>', '', i.lower()) for i in X]

# English stop words plus punctuation marks and other characters to discard.
stop_word = stopwords.words('english') + \
    ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']
# Membership is tested once per token, so use a set for constant-time
# lookups instead of scanning the list each time.
stop_word_set = frozenset(stop_word)

# Tokenize each document, keeping only alphanumeric tokens that are not stop words.
for j in range(len(X)):
    X[j] = [i for i in wordpunct_tokenize(X[j]) if (i not in stop_word_set) and i.isalnum()]
print(X[5])

['one', 'movie', 'movie', 'collection', 'filled', 'comedy', 'action', 'whatever', 'else', 'want', 'call', 'great']


* Stemming to normalize the words

In [6]:
from nltk.stem import PorterStemmer

# Replace every token of every document, in place, with its Porter stem.
ps = PorterStemmer()
for position, tokens in enumerate(X):
    X[position] = list(map(ps.stem, tokens))

***

    Storing the cleaned data to avoid running above operations in the future

In [7]:
import pickle

# Cache the cleaned documents and labels as one (X, Y) tuple so the
# cleaning cells above need not be rerun.
# The `with` block closes the file on exit; no explicit close() is needed.
with open('clean-data-XY.pkl','wb') as fp:
    pickle.dump((X, Y), fp)
del X, Y

    Loading the data

In [8]:
import pickle

# Restore the cached (X, Y) tuple written by the cell above.
# NOTE: pickle.load can execute arbitrary code; only load files this
# notebook produced itself.
# The `with` block closes the file on exit; no explicit close() is needed.
with open('clean-data-XY.pkl','rb') as fp:
    X,Y = pickle.load(fp)
len(X), len(Y)

(366402, 366402)

In [9]:
# Turn each token list back into one space-separated string.
X = list(map(' '.join, X))
X[0]

'witti littl book make son laugh loud recit car drive along alway sing refrain learn whale india droop rose love new word book introduc silli classic book will bet son still abl recit memori colleg everi book educ'

# Baseline models:
* Throughout the models, precision and recall will be my metrics
* Feature used: Bag of words

In [10]:
# Collects (model name, precision, recall) tuples for the baseline models.
bs_metric = list()

### Count based Bag-of-words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based bag of words; min_df=3 drops terms seen in fewer than 3 training documents.
model = CountVectorizer(min_df=3,binary=False)

# First 70% of the documents form the training split, the rest the test split.
l = int(0.7*len(X))

# Learn the vocabulary on the training split only and vectorize it in the
# same pass (fit_transform) instead of fitting and then transforming again.
BOW_tr = model.fit_transform(X[:l])
BOW_ts = model.transform(X[l:])
BOW_tr.shape, BOW_ts.shape

((256481, 27224), (109921, 27224))

### Occurrence-based Bag-of-words

In [12]:
# Occurrence-based bag of words: binary=True records presence, not counts.
model = CountVectorizer(min_df=3,binary=True)

# Learn the vocabulary on the training split only and vectorize it in the
# same pass (fit_transform) instead of fitting and then transforming again.
BBOW_tr = model.fit_transform(X[:l])
BBOW_ts = model.transform(X[l:])
BBOW_tr.shape, BBOW_ts.shape

((256481, 27224), (109921, 27224))

## 1. Multinomial Naive Bayes
    Doing Grid-search on hyperparameter alpha

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Grid-search the smoothing parameter alpha on the count-BOW training split,
# then predict the test split with the refit best model.
clf = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid={'alpha': [0.5, 0.25, 0.125, 1, 2, 3, 4]},
)
hX = clf.fit(BOW_tr, Y[:l]).predict(BOW_ts)

In [14]:
from sklearn.metrics import precision_recall_fscore_support, roc_curve

# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
bs_metric.append(('Multinomial NB with Count-BOW',) + m[:2])
m

(array([ 0.7340398 ,  0.94389242]),
 array([ 0.73643813,  0.94323636]),
 array([ 0.73523701,  0.94356427]),
 array([19282, 90639]))

We can see high precision and recall for positive class, but low for negative class

## 2. Bernoulli Naive Bayes
    Doing grid search on hyperparameter alpha

In [15]:
from sklearn.naive_bayes import BernoulliNB

# Grid-search the smoothing parameter alpha on the binary-BOW training split,
# then predict the test split with the refit best model.
clf = GridSearchCV(
    estimator=BernoulliNB(),
    param_grid={'alpha': [0.5, 0.25, 0.125, 1, 2, 3, 4]},
)
hX = clf.fit(BBOW_tr, Y[:l]).predict(BBOW_ts)

In [16]:
# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
bs_metric.append(('Bernoullis NB with Binary-BOW',) + m[:2])
m

(array([ 0.69271864,  0.9411627 ]),
 array([ 0.72627321,  0.93146438]),
 array([ 0.70909919,  0.93628843]),
 array([19282, 90639]))

The metrics are lower than with MultinomialNB, which was expected

## 3. Logistic Regression

    L1 regularizer with gridsearch on hyperparameter c

In [17]:
from sklearn.linear_model import LogisticRegression

# L1-regularized logistic regression, grid-searching the inverse
# regularization strength C on the binary-BOW training split.
# The solver is pinned to liblinear because it supports the L1 penalty;
# relying on the library default fails on scikit-learn releases whose
# default solver (lbfgs) rejects penalty='l1'.
# liblinear ignores n_jobs, so parallelism is requested from the grid
# search instead.
clf = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    {'C':[0.25,0.5,0.75,1,2,3,4]},
    n_jobs=-1,
)
clf.fit(BBOW_tr,Y[:l])
hX = clf.predict(BBOW_ts)

In [18]:
# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
bs_metric.append(('Logistic Regression-L1 regularizer',) + m[:2])
m

(array([ 0.8501605 ,  0.94384969]),
 array([ 0.72798465,  0.97270491]),
 array([ 0.78434331,  0.95806008]),
 array([19282, 90639]))

The metrics have improved drastically compared with Naive Bayes

    L2 regularizer with gridsearch on hyperparameter C

In [19]:
# L2-regularized logistic regression, grid-searching C on the binary-BOW
# training split, then predicting the test split.
clf = GridSearchCV(
    estimator=LogisticRegression(penalty='l2', n_jobs=-1),
    param_grid={'C': [0.25, 0.5, 0.75, 1, 2, 3, 4]},
)
hX = clf.fit(BBOW_tr, Y[:l]).predict(BBOW_ts)

In [20]:
# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
bs_metric.append(('Logistic Regression-L2 regularizer',) + m[:2])
m

(array([ 0.85201603,  0.94381133]),
 array([ 0.72767348,  0.97311312]),
 array([ 0.78495105,  0.95823827]),
 array([19282, 90639]))

The metrics are almost equal to LR with l1 regularizer

### Summary

In [21]:
# Print one line per baseline model: name, per-class precision, per-class recall.
# The loop variable is deliberately not called `model`, so the notebook-level
# `model` object is not overwritten by a string.
print('Model --- Precision --- Recall')
for model_name, pre, rec in bs_metric:
    print('{} --- {} --- {}'.format(model_name,pre,rec))

Model --- Precision --- Recall
Multinomial NB with Count-BOW --- [ 0.7340398   0.94389242] --- [ 0.73643813  0.94323636]
Bernoullis NB with Binary-BOW --- [ 0.69271864  0.9411627 ] --- [ 0.72627321  0.93146438]
Logistic Regression-L1 regularizer --- [ 0.8501605   0.94384969] --- [ 0.72798465  0.97270491]
Logistic Regression-L2 regularizer --- [ 0.85201603  0.94381133] --- [ 0.72767348  0.97311312]


***
# Feature engineering
* tf-idf
* W2V
* tf-idf based W2V

In [22]:
# Collects (model name, precision, recall) tuples for the engineered-feature models.
fe_metric = list()

### Tf-idf:

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf features; min_df=3 drops terms seen in fewer than 3 training documents.
model = TfidfVectorizer(min_df=3,strip_accents='unicode')

# Fit on the training split only, so the vocabulary and idf weights never
# see the test documents, and vectorize it in the same pass.
tfidf_tr = model.fit_transform(X[:l])
tfidf_ts = model.transform(X[l:])
tfidf_tr.shape, tfidf_ts.shape

((256481, 27222), (109921, 27222))

## 1. Multinomial Naive Bayes
    Doing Grid-search on hyperparameter alpha

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Grid-search the smoothing parameter alpha on the tf-idf training split,
# then predict the test split with the refit best model.
clf = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid={'alpha': [0.5, 0.25, 0.125, 1, 2, 3, 4]},
)
hX = clf.fit(tfidf_tr, Y[:l]).predict(tfidf_ts)

In [25]:
from sklearn.metrics import precision_recall_fscore_support, roc_curve

# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
fe_metric.append(('MultinomialNB with tfidf',) + m[:2])
m

(array([ 0.91731827,  0.87431927]),
 array([ 0.32854476,  0.99370028]),
 array([ 0.48380938,  0.93019509]),
 array([19282, 90639]))

After using tf-idf the precision for negative class has increased and the positive class has decreased. The recall for negative class has drastically decreased but for positive class has increased very much.
        
    Checking for overfitting

In [26]:
# Score the current classifier on its own training split; comparing these
# numbers with the test-split metrics is the overfitting check.
hX = clf.predict(tfidf_tr)
m = precision_recall_fscore_support(y_true=Y[:l], y_pred=hX)
m

(array([ 0.92301067,  0.89851194]),
 array([ 0.36251269,  0.994671  ]),
 array([ 0.52057098,  0.9441494 ]),
 array([ 38429, 218052]))

## 2. Bernoulli Naive Bayes
    Doing grid search on hyperparameter alpha

In [27]:
from sklearn.naive_bayes import BernoulliNB

# Grid-search the smoothing parameter alpha on the tf-idf training split,
# then predict the test split with the refit best model.
clf = GridSearchCV(
    estimator=BernoulliNB(),
    param_grid={'alpha': [0.5, 0.25, 0.125, 1, 2, 3, 4]},
)
hX = clf.fit(tfidf_tr, Y[:l]).predict(tfidf_ts)

In [28]:
# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
fe_metric.append(('BernoullisNB with tfidf',) + m[:2])
m

(array([ 0.69263106,  0.94117122]),
 array([ 0.72632507,  0.93143128]),
 array([ 0.70907802,  0.93627592]),
 array([19282, 90639]))

## 3. Logistic Regression

    L1 regularizer with gridsearch on hyperparameter c

In [29]:
from sklearn.linear_model import LogisticRegression

# L1-regularized logistic regression, grid-searching the inverse
# regularization strength C on the tf-idf training split.
# The solver is pinned to liblinear because it supports the L1 penalty;
# relying on the library default fails on scikit-learn releases whose
# default solver (lbfgs) rejects penalty='l1'.
# liblinear ignores n_jobs, so parallelism is requested from the grid
# search instead.
clf = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    {'C':[0.25,0.5,0.75,1,2,3,4]},
    n_jobs=-1,
)
clf.fit(tfidf_tr,Y[:l])
hX = clf.predict(tfidf_ts)

In [30]:
# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
fe_metric.append(('LR(L1) with tfidf',) + m[:2])
m

(array([ 0.8576021 ,  0.94763914]),
 array([ 0.74712167,  0.97360959]),
 array([ 0.79855876,  0.96044884]),
 array([19282, 90639]))

    L2 regularizer with hyper parameter tuning

In [31]:
from sklearn.linear_model import LogisticRegression

# L2-regularized logistic regression, grid-searching C on the tf-idf
# training split, then predicting the test split.
clf = GridSearchCV(
    estimator=LogisticRegression(penalty='l2', n_jobs=-1),
    param_grid={'C': [0.25, 0.5, 0.75, 1, 2, 3, 4]},
)
hX = clf.fit(tfidf_tr, Y[:l]).predict(tfidf_ts)

In [32]:
# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
fe_metric.append(('LR(L2) with tfidf',) + m[:2])
m

(array([ 0.86021505,  0.9468019 ]),
 array([ 0.74266155,  0.97432672]),
 array([ 0.79712767,  0.96036713]),
 array([19282, 90639]))

### Summary

In [33]:
# Print one line per tf-idf model: name, per-class precision, per-class recall.
# The loop variable is deliberately not called `model`, so the notebook-level
# `model` object is not overwritten by a string.
print('Model --- Precision --- Recall')
for model_name, pre, rec in fe_metric:
    print('{} --- {} --- {}'.format(model_name,pre,rec))

Model --- Precision --- Recall
MultinomialNB with tfidf --- [ 0.91731827  0.87431927] --- [ 0.32854476  0.99370028]
BernoullisNB with tfidf --- [ 0.69263106  0.94117122] --- [ 0.72632507  0.93143128]
LR(L1) with tfidf --- [ 0.8576021   0.94763914] --- [ 0.74712167  0.97360959]
LR(L2) with tfidf --- [ 0.86021505  0.9468019 ] --- [ 0.74266155  0.97432672]


***
### Word 2 Vec
    Using gensim library
    Keeping vector size=100

In [34]:
# Start a fresh metric list for the Word2Vec experiments.
fe_metric = list()

In [35]:
import nltk

# Split each cleaned document on whitespace to get the token lists for Word2Vec.
# Do not use wordpunct_tokenize here; it caused problems.
Tokenized_X = [document.split() for document in X]

In [36]:
from gensim.models.word2vec import Word2Vec

# Train 100-dimensional word vectors on the tokenized documents and cache
# the trained model in a binary (pickle) file.
# NOTE(review): gensim >= 4.0 renamed the `size` argument to `vector_size`;
# confirm the installed version before upgrading.
model = Word2Vec(Tokenized_X,size=100,workers=4,min_count=1)
# The `with` block closes the file on exit; no explicit close() is needed.
with open('Normal-W2V-100.pkl','wb') as fp:
    pickle.dump(model,fp)

In [37]:
# Reload the cached Word2Vec model and sanity-check it by listing the
# nearest neighbours of 'good'.
# NOTE: pickle.load can execute arbitrary code; only load files this
# notebook produced itself.
with open('Normal-W2V-100.pkl','rb') as fp:
    model = pickle.load(fp)
# Query through the keyed vectors (`model.wv`); calling most_similar on the
# model object itself is deprecated in gensim.
model.wv.most_similar('good')

[('great', 0.7985106110572815),
 ('decent', 0.7620522975921631),
 ('nice', 0.6822870373725891),
 ('excel', 0.6805440187454224),
 ('bad', 0.6473750472068787),
 ('tasti', 0.6393276453018188),
 ('fantast', 0.629787027835846),
 ('awesom', 0.6213588118553162),
 ('like', 0.61072838306427),
 ('terrif', 0.6032482385635376)]

In [11]:
# Creating the sentence vector

In [38]:
# numpy supplies the array helpers; importing scipy under the alias `np`
# only worked through re-exports of numpy functions.
import numpy as np

# Each document becomes the mean of its 100-dimensional word vectors.
# Rows start at zero so that a document with no tokens keeps a zero vector.
sen_vector = np.zeros((len(Tokenized_X), 100))
for index, tokens in enumerate(Tokenized_X):
    if not tokens:
        # Nothing to average; dividing by len(tokens) == 0 would produce NaN.
        continue
    vector = np.zeros(100)
    for word in tokens:
        # Look vectors up through `model.wv`; indexing the model object
        # directly is deprecated in gensim.
        vector += model.wv[word]
    sen_vector[index] = vector / len(tokens)

## Logistic Regression

    L1 regularizer with gridsearch on hyperparameter c

In [39]:
from sklearn.linear_model import LogisticRegression

# L1-regularized logistic regression on the averaged word vectors,
# grid-searching the inverse regularization strength C.
# The solver is pinned to liblinear because it supports the L1 penalty;
# relying on the library default fails on scikit-learn releases whose
# default solver (lbfgs) rejects penalty='l1'.
# liblinear ignores n_jobs, so parallelism is requested from the grid
# search instead.
clf = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    {'C':[0.25,0.5,0.75,1,2,3,4]},
    n_jobs=-1,
)
clf.fit(sen_vector[:l],Y[:l])
hX = clf.predict(sen_vector[l:])

In [40]:
# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
fe_metric.append(('LR(L1) with W2V',) + m[:2])
m

(array([ 0.83357558,  0.92968297]),
 array([ 0.65434084,  0.97220843]),
 array([ 0.73316288,  0.95047027]),
 array([19282, 90639]))

    L2 regularizer with hyper parameter tuning

In [41]:
from sklearn.linear_model import LogisticRegression

# L2-regularized logistic regression on the averaged word vectors,
# grid-searching C, then predicting the test split.
clf = GridSearchCV(
    estimator=LogisticRegression(penalty='l2', n_jobs=-1),
    param_grid={'C': [0.25, 0.5, 0.75, 1, 2, 3, 4]},
)
hX = clf.fit(sen_vector[:l], Y[:l]).predict(sen_vector[l:])

In [42]:
# Per-class (precision, recall, f-score, support) on the test split.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
# Keep only precision and recall for the summary table.
fe_metric.append(('LR(L2) with W2V',) + m[:2])
m

(array([ 0.83316822,  0.92965816]),
 array([ 0.65423711,  0.9721312 ]),
 array([ 0.73294019,  0.9504204 ]),
 array([19282, 90639]))

### Summary

In [43]:
# Print one line per Word2Vec model: name, per-class precision, per-class recall.
# The loop variable is deliberately not called `model`, so the trained
# Word2Vec object held in `model` is not overwritten by a string.
print('Model --- Precision --- Recall')
for model_name, pre, rec in fe_metric:
    print('{} --- {} --- {}'.format(model_name,pre,rec))

Model --- Precision --- Recall
LR(L1) with W2V --- [ 0.83357558  0.92968297] --- [ 0.65434084  0.97220843]
LR(L2) with W2V --- [ 0.83316822  0.92965816] --- [ 0.65423711  0.9721312 ]


### W2V without stemming
Before running the cells below, rerun the cells above up to (but not including) the stemming step. Do not run the PorterStemmer cell: the objective here is to construct W2V on non-stemmed data.

In [9]:
from gensim.models.word2vec import Word2Vec

# Training W2V with un-stemmed data.
# X must hold one token list per document here. If it holds joined strings,
# Word2Vec iterates over characters and silently learns character vectors,
# so fail loudly instead.
if len(X) > 0 and isinstance(X[0], str):
    raise TypeError('X must contain token lists, not joined strings')
# NOTE(review): gensim >= 4.0 renamed the `size` argument to `vector_size`;
# confirm the installed version before upgrading.
model = Word2Vec(X,size=100,workers=4,min_count=1)

Creating the sentence vector

In [12]:
# numpy supplies the array helpers; importing scipy under the alias `np`
# only worked through re-exports of numpy functions.
import numpy as np

# Each document becomes the mean of its 100-dimensional word vectors.
# Rows start at zero so that a document with no tokens keeps a zero vector.
sen_vector = np.zeros((len(X), 100))
for index, tokens in enumerate(X):
    if not tokens:
        # Nothing to average; dividing by len(tokens) == 0 would produce NaN.
        continue
    vector = np.zeros(100)
    for word in tokens:
        # Look vectors up through `model.wv`; indexing the model object
        # directly is deprecated in gensim.
        vector += model.wv[word]
    sen_vector[index] = vector / len(tokens)

In [20]:
# Using l1 regularizer

In [16]:
# First 70% of the documents form the training split, the rest the test split.
l = int(0.7*len(X))
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# L1-regularized logistic regression on the averaged word vectors,
# grid-searching the inverse regularization strength C.
# The solver is pinned to liblinear because it supports the L1 penalty;
# relying on the library default fails on scikit-learn releases whose
# default solver (lbfgs) rejects penalty='l1'.
# liblinear ignores n_jobs, so parallelism is requested from the grid
# search instead.
clf = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    {'C':[0.25,0.5,0.75,1,2,3,4]},
    n_jobs=-1,
)
clf.fit(sen_vector[:l],Y[:l])
hX = clf.predict(sen_vector[l:])

In [19]:
from sklearn.metrics import precision_recall_fscore_support

# Per-class (precision, recall, f-score, support) on the test split.
# The result is only displayed; it is not appended to fe_metric.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
m

(array([ 0.8328127 ,  0.93141558]),
 array([ 0.66367597,  0.97165679]),
 array([ 0.73868622,  0.95111073]),
 array([19282, 90639]))

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# First 70% of the documents form the training split, the rest the test split.
l = int(len(X) * 0.7)

# L2-regularized logistic regression on the averaged word vectors,
# grid-searching C, then predicting the test split.
clf = GridSearchCV(
    estimator=LogisticRegression(penalty='l2', n_jobs=-1),
    param_grid={'C': [0.25, 0.5, 0.75, 1, 2, 3, 4]},
)
hX = clf.fit(sen_vector[:l], Y[:l]).predict(sen_vector[l:])

In [22]:
from sklearn.metrics import precision_recall_fscore_support

# Per-class (precision, recall, f-score, support) on the test split.
# The result is only displayed; it is not appended to fe_metric.
m = precision_recall_fscore_support(y_true=Y[l:], y_pred=hX)
m

(array([ 0.83209828,  0.93144483]),
 array([ 0.66388341,  0.97150233]),
 array([ 0.73853343,  0.95105197]),
 array([19282, 90639]))

### W2V with weights from tfidf