# Sentiment Analysis on the summary 

Here in this notebook I will try to perform sentiment analysis by only using the summary part of the review from the original dataset

In [1]:
# importing required libraries

import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Loading the de-duplicated reviews

data = pd.read_pickle('deduped_reviews')

In [3]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


This is how the original dataframe looks
<br><br>
In the cell below there are some utility functions that'll help me throughout. Please go through their function documentation to know more about them.

In [4]:
# Data cleaning and utility functions

def rearrange_score():
    '''This function will make every score greater than 3 as positive
        and something less than 3 as negative'''
    score = data.Score.tolist()
    for i in range(len(score)):
        if(score[i]>3):
            score[i]=1
        else:
            score[i]=0
    data['Score'] = score
    
def remove_htmltags(df,cn):
    '''This function will remove the html tags'''
    col = df[cn].tolist()
    from bs4 import BeautifulSoup
    for i in range(len(col)):
        soup = BeautifulSoup(col[i], "lxml")        
        col[i] = soup.get_text()
    df[cn] = col
    return df

def remove_punctuation(df,cn):
    '''This function will remove almost every puntuation marks except \' '''
    col = df[cn].tolist()
    import re
    for i in range(len(col)):
        col[i] = re.sub('[^A-Za-z0-9\s\']+', '', col[i])
    df[cn] = col
    return df

def drop_cols(df,cols):
    '''This function will drop the unnecessary columns'''
    df = df.drop(labels=cols,axis=1)
    return df

def make_lower(df,cn):
    '''This function makes the contents of the column(cn) of dataframe(df) into lowercase'''
    col = df[cn].tolist()
    for i in range(len(col)):
        col[i] = col[i].lower()
    df[cn]=col
    return df

In [5]:
# I've noticed an anamoly in a certain summary. Let's remove it first from the dataframe

summary = data['Summary'].tolist()
index_to_remove = list()

import math
for i in range(len(summary)):
    try:
        if(math.isnan(summary[i])):
            index_to_remove.append(i)
    except:
        _ = None
        
data = data.drop(data.index[index_to_remove])

In [6]:
# Dropping all the rows that have rating as 3 star since I've already decided anything >3 is positive and anything <3 as negative

data = data[data.Score != 3]

In [7]:
data.shape

(366401, 10)

In [8]:
# Calling the functions

# deciding which columns are unnecessary
cols_to_drop = set(data.columns) - {'Summary','Score'}

# Removing unnecessary columns
data = drop_cols(data,list(cols_to_drop))

# Rearraging score
# less than 3: Class 0
# greater than 3: Class 1
rearrange_score()

# Removing html tags from the summary  column of the dataframe
data = remove_htmltags(data,'Summary')

# Removing selective punctuation marks from the summary column of the dataframe
data = remove_punctuation(data,'Summary')

# turning the uppercase portions of the summary into lowercase
data = make_lower(data,'Summary')

In [9]:
# how my data looks after doing all the actions in the above cells

data.head()

Unnamed: 0,Score,Summary
0,1,good quality dog food
1,0,not as advertised
2,1,delight says it all
3,0,cough medicine
4,1,great taffy


In [10]:
# Saving the dataframe

data.to_pickle('only_summary') 

In [11]:
# reloading the saved dataframe

data = pd.read_pickle('only_summary') 

In [12]:
# Randomizing the dataset

data = data.sample(frac=1)

In the cell below I have written some utility functions and algorithms that will help me build and evaluate models in the later part of this notebook

In [13]:
# Utility functions and algorithm

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

def metric(observed,predicted):
    '''Prints different metric btaking in observed and predicted value'''
    pre_rec = precision_recall_fscore_support(observed,predicted)
    roc_auc = roc_auc_score(observed,predicted)
    print ('---Precision:---\n{}\n---Recall:---\n{}\n---fscore:---\n{}\n---AUC:---\n{}'.format(pre_rec[0],pre_rec[1],pre_rec[2],roc_auc))
    
    
def lr_classifier(X_train,X_test,y_train,param):
    '''Logistic regression with hyperparameter tuning'''
    lr = LogisticRegression(class_weight= 'balanced',n_jobs=-1,penalty='l1')
    clf = GridSearchCV(lr,param)
    clf.fit(X_train,y_train)

    lr_parameters = lr.get_params()
    lr_parameters['C'] = clf.best_params_['C']

    lr.set_params(**lr_parameters)
    print ('\n---Parameters for LR---\n{}'.format(lr.get_params))

    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    
    return (y_pred) 

def nb_classifier(X_train,X_test,y_train,param):
    '''Naive Bayes with hyper parameter tuning'''
    nb = MultinomialNB(class_prior=[1,1])
    clf = GridSearchCV(nb,param)
    clf.fit(X_train,y_train)
    
    nb_parameters = nb.get_params()
    nb_parameters['alpha'] = clf.best_params_['alpha']

    nb.set_params(**nb_parameters)
    print ('\n---Parameters for NB---\n{}'.format(nb.get_params))

    nb.fit(X_train,y_train)
    y_pred = nb.predict(X_test)
    
    return (y_pred)



## Tfidf on summary

In [14]:
# Creating tfidf features using tfidf-vectorizer of scikit-learn 

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=0)
tfidf_features = tfidf_vectorizer.fit_transform(data['Summary'])
tfidf_features.get_shape()

(366401, 40152)

In [15]:
l = int(0.8*data.shape[0])
print ('Size of training set -- {}\nSize of test set -- {}'.format(l,data.shape[0]-l))

Size of training set -- 293120
Size of test set -- 73281


In [16]:
# Performing Naive Bayes on tfidf approach

alpha = [0.125,0.25,0.5,1,2,4,8]
parameter = {'alpha':alpha}

y_pred = nb_classifier(tfidf_features[:l],tfidf_features[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for NB---
<bound method BaseEstimator.get_params of MultinomialNB(alpha=4, class_prior=[1, 1], fit_prior=True)>

===METRICS===
---Precision:---
[ 0.60781893  0.93332352]
---Recall:---
[ 0.64435913  0.92292138]
---fscore:---
[ 0.62555588  0.92809331]
---AUC:---
0.7836402578155206


In [17]:
# Performing Logistic regression on tfidf approach

parameter = {'C':[0.125,0.25,0.5,1,2,4,8]}

y_pred = lr_classifier(tfidf_features[:l],tfidf_features[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for LR---
<bound method BaseEstimator.get_params of LogisticRegression(C=4, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)>

===METRICS===
---Precision:---
[ 0.62445528  0.96787898]
---Recall:---
[ 0.8377105   0.90659981]
---fscore:---
[ 0.71553138  0.93623774]
---AUC:---
0.8721551511771695


## W2V on summary

In [18]:
# Reloading the data before creating the w2v features 

data = pd.read_pickle('only_summary')

In [19]:
# Randomizing the dataframe 

data = data.sample(frac=1)

In [20]:
# Import necessary libraries and loading google's w2v model

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [21]:
'has' in model.vocab.keys()

True

In the cell below we have 2 utility funtions. 1st one is used for standardizing the data and the 2nd one is to get the mean weighted 300 dim vector.

In [22]:
# Utility functions for W2v model

from sklearn.preprocessing import StandardScaler

def center_scale(X):
    '''This function standardizes the features'''
    X = StandardScaler().fit_transform(X)
    return X

def get_avg_vector(df):
    '''This fucntion created mean weighted w2v vectors on the summary part of the dataframe'''
    summary = df['Summary'].tolist()
    vectorlist = list()
    
    for i in range(len(summary)):
        
        sen_vec = np.zeros(shape=(300,))
        N = 0
        
        for word in summary[i].split():
            if (word in model.vocab.keys()):
                sen_vec = sen_vec + model[word]
                N = N + 1
        
        if(N != 0):
            vectorlist.append(sen_vec)
        else:
            vectorlist.append(np.zeros(shape=(300,)))

    return (vectorlist)

In [23]:
# Getting the 300 dim mean weighted vector

avg_w2v = get_avg_vector(data)

Again we have 2 utility functions specifically for w2v models

In [24]:
def check_nan(X):
    '''This functions checks and returns the position of NaN values if present any'''
    import math
    positions = list()
    for i in range(len(X)):
        if(math.isnan(X[i][0])):
            positions.append(i)
    
    return positions

def modify_nan(X,positions):
    '''this function modifies those nan values'''
    for i in positions:
        X[i] = np.zeros(shape=(300,))
    
    return X

In [25]:
# I had found out that there are some nan values that are being generated while getting the 300 dim vectors

# this gets the location of the nan
pos = check_nan(avg_w2v)

# this modifies those nan in those locations
avg_w2v = modify_nan(avg_w2v,pos)

In [26]:
# Standardizing the values

avg_w2v = center_scale(avg_w2v)

In [27]:
# Performing Logistic regression on tfidf approach

parameter = {'C':[0.125,0.25,0.5,1,2,4,8]}

y_pred = lr_classifier(avg_w2v[:l],avg_w2v[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for LR---
<bound method BaseEstimator.get_params of LogisticRegression(C=2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)>

===METRICS===
---Precision:---
[ 0.48675546  0.96648708]
---Recall:---
[ 0.84677003  0.83191451]
---fscore:---
[ 0.61816581  0.89416583]
---AUC:---
0.8393422699775088
