In [6]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Loading the de-duplicated reviews
data = pd.read_pickle('deduped_reviews')

In [3]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
# Data cleaning and utility functions

def rearrange_score():
    '''This function will make every score greater than 3 as positive
        and something less than 3 as negative'''
    score = data.Score.tolist()
    for i in range(len(score)):
        if(score[i]>3):
            score[i]=1
        else:
            score[i]=0
    data['Score'] = score
    
def remove_htmltags(df,cn):
    '''This function will remove the html tags'''
    col = df[cn].tolist()
    from bs4 import BeautifulSoup
    for i in range(len(col)):
        soup = BeautifulSoup(col[i], "lxml")        
        col[i] = soup.get_text()
    df[cn] = col
    return df

def remove_punctuation(df,cn):
    '''This function will remove almost every puntuation marks except \' '''
    col = df[cn].tolist()
    import re
    for i in range(len(col)):
        col[i] = re.sub('[^A-Za-z0-9\s\']+', '', col[i])
    df[cn] = col
    return df

def drop_cols(df,cols):
    '''This function will drop the unnecessary columns'''
    df = df.drop(labels=cols,axis=1)
    return df

def make_lower(df,cn):
    col = df[cn].tolist()
    for i in range(len(col)):
        col[i] = col[i].lower()
    df[cn]=col
    return df

In [7]:
# I've noticed an anamoly in a certain summary. Let's remove it first from the dataframe

summary = data['Summary'].tolist()
index_to_remove = list()

import math
for i in range(len(summary)):
    try:
        if(math.isnan(summary[i])):
            index_to_remove.append(i)
    except:
        _ = None
        
data = data.drop(data.index[index_to_remove])

In [8]:
data = data[data.Score != 3]

In [9]:
data.shape

(366401, 10)

In [10]:
# Calling the functions

cols_to_drop = set(data.columns) - {'Summary','Score'}

data = drop_cols(data,list(cols_to_drop))

rearrange_score()

data = remove_htmltags(data,'Summary')

data = remove_punctuation(data,'Summary')

data = make_lower(data,'Summary')

In [11]:
data.head()

Unnamed: 0,Score,Summary
0,1,good quality dog food
1,0,not as advertised
2,1,delight says it all
3,0,cough medicine
4,1,great taffy


In [12]:
data.to_pickle('only_summary') # Saving the dataframe

In [13]:
data = pd.read_pickle('only_summary')

In [14]:
# Randomizing the dataset

data = data.sample(frac=1)

In [25]:
# Utility functions and algorithm

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

def metric(observed,predicted):
    '''Prints different metric btaking in observed and predicted value'''
    pre_rec = precision_recall_fscore_support(observed,predicted)
    roc_auc = roc_auc_score(observed,predicted)
    print ('---Precision:---\n{}\n---Recall:---\n{}\n---fscore:---\n{}\n---AUC:---\n{}'.format(pre_rec[0],pre_rec[1],pre_rec[2],roc_auc))
    
    
def lr_classifier(X_train,X_test,y_train,param):
    '''Logistic regression with hyperparameter tuning'''
    lr = LogisticRegression(class_weight= 'balanced',n_jobs=-1,penalty='l1')
    clf = GridSearchCV(lr,param)
    clf.fit(X_train,y_train)

    lr_parameters = lr.get_params()
    lr_parameters['C'] = clf.best_params_['C']

    lr.set_params(**lr_parameters)
    print ('\n---Parameters for LR---\n{}'.format(lr.get_params))

    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    
    return (y_pred) 

def nb_classifier(X_train,X_test,y_train,param):
    '''Naive Bayes with hyper parameter tuning'''
    nb = MultinomialNB(class_prior=[1,1])
    clf = GridSearchCV(nb,param)
    clf.fit(X_train,y_train)
    
    nb_parameters = nb.get_params()
    nb_parameters['alpha'] = clf.best_params_['alpha']

    nb.set_params(**nb_parameters)
    print ('\n---Parameters for NB---\n{}'.format(nb.get_params))

    nb.fit(X_train,y_train)
    y_pred = nb.predict(X_test)
    
    return (y_pred)



## Tfidf on summary

In [16]:
# Creating tfidf features
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=0)
tfidf_features = tfidf_vectorizer.fit_transform(data['Summary'])
tfidf_features.get_shape()

(366401, 40152)

In [23]:
l = int(0.8*data.shape[0])
print ('Size of training set -- {}\nSize of test set -- {}'.format(l,data.shape[0]-l))

Size of training set -- 293120
Size of test set -- 73281


In [18]:
# Performing Naive Bayes on tfidf approach

alpha = [0.125,0.25,0.5,1,2,4,8]
parameter = {'alpha':alpha}

y_pred = nb_classifier(tfidf_features[:l],tfidf_features[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for NB---
<bound method BaseEstimator.get_params of MultinomialNB(alpha=4, class_prior=[1, 1], fit_prior=True)>

===METRICS===
---Precision:---
[ 0.60732334  0.93425911]
---Recall:---
[ 0.64561867  0.92345703]
---fscore:---
[ 0.62588577  0.92882666]
---AUC:---
0.7845378497734821


In [19]:
# Performing Logistic regression on tfidf approach

parameter = {'C':[0.125,0.25,0.5,1,2,4,8]}

y_pred = lr_classifier(tfidf_features[:l],tfidf_features[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for LR---
<bound method BaseEstimator.get_params of LogisticRegression(C=4, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)>

===METRICS===
---Precision:---
[ 0.62178937  0.96666437]
---Recall:---
[ 0.82932629  0.9075025 ]
---fscore:---
[ 0.71071698  0.93614966]
---AUC:---
0.8684143954831502


## W2V on summary

In [10]:
data = pd.read_pickle('only_summary')

In [11]:
data = data.sample(frac=1)

In [1]:
# Import necessary libraries and loading google's w2v model

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)



In [4]:
'has' in model.vocab.keys()

True

In [14]:
# Utility functions for W2v model
from sklearn.preprocessing import StandardScaler

def center_scale(X):
    '''This function standardizes the features'''
    X = StandardScaler().fit_transform(X)
    return X

def get_avg_vector(df):
    
    summary = df['Summary'].tolist()
    vectorlist = list()
    
    for i in range(len(summary)):
        
        sen_vec = np.zeros(shape=(300,))
        N = 0
        
        for word in summary[i].split():
            if (word in model.vocab.keys()):
                sen_vec = sen_vec + model[word]
                N = N + 1
        
        if(N != 0):
            vectorlist.append(sen_vec)
        else:
            vectorlist.append(np.zeros(shape=(300,)))

    return (vectorlist)

In [8]:
np.zeros(shape=(5,)) / 25

array([ 0.,  0.,  0.,  0.,  0.])

In [15]:
# Getting the 300 dim mean weighted vector

avg_w2v = get_avg_vector(data)

In [21]:
def check_nan(X):
    '''This functions checks and returns the position of NaN values if present any'''
    import math
    positions = list()
    for i in range(len(X)):
        if(math.isnan(X[i][0])):
            positions.append(i)
    
    return positions

def modify_nan(X,positions):
    for i in positions:
        X[i] = np.zeros(shape=(300,))
    
    return X

In [20]:
pos = check_nan(avg_w2v)

avg_w2v = modify_nan(avg_w2v,pos)

In [22]:
# Standardizing the values

avg_w2v = center_scale(avg_w2v)

In [26]:
# Performing Logistic regression on tfidf approach

parameter = {'C':[0.125,0.25,0.5,1,2,4,8]}

y_pred = lr_classifier(avg_w2v[:l],avg_w2v[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for LR---
<bound method BaseEstimator.get_params of LogisticRegression(C=0.125, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)>

===METRICS===
---Precision:---
[ 0.48226881  0.96822094]
---Recall:---
[ 0.85424258  0.82883977]
---fscore:---
[ 0.61649239  0.89312511]
---AUC:---
0.841541176023826
