In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Loading the deduped dataframe
data = pd.read_pickle('deduped_reviews')
#data = data.sort_values(by='Time')
print (data.shape)

(396309, 10)


In [3]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
# Utility functions for pre-processing

def print_df(df,nn):
    '''Prints random nn samples'''
    print (df.sample(n=nn))
    
# Appending summary and text
def append_summary_text():
    '''This function will append the summary and the text'''
    summary = data.Summary.tolist()
    text = data.Text.tolist()
    for i in range(len(summary)):
        if(summary[i][-1]!='.' or summary[i][-1]!='!'):
            summary[i] = summary[i] + '.'
        text[i] = summary[i]+' '+text[i]
    data['Text'] = text

def rearrange_score():
    '''This function will make every score greater than 3 as positive
        and something less than 3 as negative'''
    score = data.Score.tolist()
    for i in range(len(score)):
        if(score[i]>3):
            score[i]=1
        else:
            score[i]=0
    data['Score'] = score
    
def drop_cols(df,cols):
    df = df.drop(labels=cols,axis=1)
    return df

def remove_htmltags(df,cn):
    col = df[cn].tolist()
    from bs4 import BeautifulSoup
    for i in range(len(col)):
        soup = BeautifulSoup(col[i], "lxml")        
        col[i] = soup.get_text()
    df[cn] = col
    return df

def remove_punctuation(df,cn):
    col = df[cn].tolist()
    import re
    for i in range(len(col)):
        col[i] = re.sub('[^A-Za-z0-9\s\']+', '', col[i])
    df[cn] = col
    return df

from nltk.corpus import stopwords
stop_word = stopwords.words('english')
#print (type(stop_word))

def notin(word):
    return (not(word in set(stop_word)))

def remove_stopwords(df,cn):
    col = df[cn].tolist()
    for i in range(len(col)):
        col[i] = ' '.join(j for j in col[i].split() if notin(j))
    df[cn] = col
    return df

def stemming(df,cn):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    col = df[cn].tolist()
    for i in range(len(col)):
        col[i] = ' '.join(stemmer.stem(j) for j in col[i].split())
    df[cn] = col
    return df

def make_lower(df,cn):
    col = df[cn].tolist()
    for i in range(len(col)):
        col[i] = col[i].lower()
    df[cn]=col
    return df

In [5]:
# How the data looks
print_df(data,5)

            Id   ProductId          UserId                ProfileName  \
375938  375939  B0000DBN1L  A3D6TFYRMIV3ZL              Themis-Athena   
29857    29858  B0045CTYNI  A3B6D3UQIMVOUK  ViVeriVeniversumVivusVici   
290881  290882  B005HG9ESG  A2L0WJMOT484GM                   reviewer   
428009  428010  B003KRHDMI  A1NZ4QSS7JJ1UY            Packard 1 "Ren"   
284048  284049  B0051COPH6  A2VH0UT5EQFB6P                 Loveguitar   

        HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
375938                    11                      12      5  1163116800   
29857                      5                       5      5  1318636800   
290881                     0                       0      4  1345680000   
428009                     1                       1      5  1293753600   
284048                     0                       0      4  1342224000   

                                  Summary  \
375938  India's Original Spiced Milk Tea.   
29857          Addic

In [6]:
# Discarding the three star reviews
data = data.loc[data.Score!=3]

In [7]:
print (data.shape)
# Previously there were 396309 reviews. Now 366402. (396309-366402) = ~30k 3 star reviews

(366402, 10)


In [8]:
print (data.isna().sum())
# Checking where the null values are

Id                         0
ProductId                  0
UserId                     0
ProfileName               11
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                    1
Text                       0
dtype: int64


In [9]:
data = data.fillna('Unavailable')
# We are filling the null values with unavailable as filling with string type is fine

In [10]:
print (data.isna().sum())
# No null values anymore

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64


In [11]:
append_summary_text() # Calling this function would append the summary infront of the text
data = make_lower(data,'Text')

In [12]:
# Let's check how the dataframe looks now
print_df(data,5)

            Id   ProductId          UserId            ProfileName  \
70095    70096  B007I7Z3Z0   AKGQ6RM68SQY1  Catherine Diane "CDI"   
381117  381118  B001EQ4HVC   APPPI44BNFTMF         Ellen A. Paige   
20100    20101  B004U49R24   AVU1ILDDYW301               G. Hearn   
138719  138720  B0000GHNTK  A1X1CEGHTHMBL1                  jjceo   
384195  384196  B000EVWQZW  A2AJ8FFJED971Z        atticusthebaker   

        HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
70095                      1                       2      1  1343606400   
381117                     8                       8      5  1196553600   
20100                      1                       1      2  1336521600   
138719                     0                       0      5  1340323200   
384195                     2                       4      1  1288310400   

                                 Summary  \
70095                          SULFITES!   
381117  S'Malts, as in "some more malts"   
20

In [13]:
rearrange_score() # Calling this function would rearrange the score 1 for >3 and 0 for <3

In [14]:
# Let's check how the dataframe looks
print_df(data,3)

            Id   ProductId          UserId                       ProfileName  \
431965  431966  B001P22GHC  A2JOLGWW0J7K6G                             angel   
75687    75688  B004MO6NI8   AOYUWL023B8TZ  Donna "a very truthful reviewer"   
121057  121058  B001JH93BU  A35CSDXCHC44SY              Frequent Contributor   

        HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
431965                     2                       2      1  1275609600   
75687                      0                       0      1  1330387200   
121057                    52                      55      1  1264204800   

                          Summary  \
431965  Pre-portioned yummy-ness!   
75687                 Great Taste   
121057                Fun once...   

                                                     Text  
431965  pre-portioned yummy-ness!. i initially began t...  
75687   great taste. i enjoyed the taste of no fear, b...  
121057  fun once.... a lot of the reviews/testimo

In [15]:
# We are going the drop the columns that we don't need
cols_to_drop = set(data.columns) - {'Score','Text','HelpfulnessNumerator','HelpfulnessDenominator'}

data = drop_cols(data,list(cols_to_drop))

In [16]:
print_df(data,3)

        HelpfulnessNumerator  HelpfulnessDenominator  Score  \
443525                     1                       3      1   
48714                      2                       2      0   
325477                     0                       0      1   

                                                     Text  
443525  these candy pebbles rock!. (sing to "tiny bubb...  
48714   awful taste, not even sure it's real honey, be...  
325477  it is good.. i don't write a lot of reviews......  


In [17]:
data = remove_htmltags(data,'Text') # This will remove the html tags that we have in the 'Text' field

In [18]:
print (data.Text.iloc[290620]);print('-'*125); print(data.Text.iloc[293]);print('-'*125); print(data.Text.iloc[156971])
# We have picked three random reviews and they seem free from tags

delicious!!!. i am a runner and a single mom so this soup is perfect for my taste and nutritional needs.  i eat them for lunch, a healthy snack or sometimes breakfast.  i have tried many cup-o-soups, but this one is my favorite.  it isn't overly salty and has nutritional yeast in it, which a flavor i love.  i subscribe to this product.  i appreciate having it delivered to me.  i can't always find it in stores, and even if they carry it isn't always on the shelf.  i highly recommend this soup.
-----------------------------------------------------------------------------------------------------------------------------
absolutely delicious!. these individually wrapped pieces are delicious (although i wish there were a few more). mildly sweet with no sugar-free aftertaste, even a traditional "marzipan-aholic" should enjoy these.
-----------------------------------------------------------------------------------------------------------------------------
yum?. i quite liked it, the flavor is

In [19]:
data = remove_punctuation(data,'Text') # This will remove the puntuation except apostrophe

In [20]:
print (data.Text.iloc[290620]);print('-'*125); print(data.Text.iloc[293]);print('-'*125); print(data.Text.iloc[156971])
# We have picked three random reviews and they seem free from punctuations

delicious i am a runner and a single mom so this soup is perfect for my taste and nutritional needs  i eat them for lunch a healthy snack or sometimes breakfast  i have tried many cuposoups but this one is my favorite  it isn't overly salty and has nutritional yeast in it which a flavor i love  i subscribe to this product  i appreciate having it delivered to me  i can't always find it in stores and even if they carry it isn't always on the shelf  i highly recommend this soup
-----------------------------------------------------------------------------------------------------------------------------
absolutely delicious these individually wrapped pieces are delicious although i wish there were a few more mildly sweet with no sugarfree aftertaste even a traditional marzipanaholic should enjoy these
-----------------------------------------------------------------------------------------------------------------------------
yum i quite liked it the flavor is a bit harsh at first but then i

In [21]:
data = remove_stopwords(data,'Text') # This will remove the stopwords

In [22]:
print (data.Text.iloc[290620]);print('-'*125); print(data.Text.iloc[293]);print('-'*125); print(data.Text.iloc[156971])
# We have picked three random reviews and they seem free from punctuations

delicious runner single mom soup perfect taste nutritional needs eat lunch healthy snack sometimes breakfast tried many cuposoups one favorite overly salty nutritional yeast flavor love subscribe product appreciate delivered can't always find stores even carry always shelf highly recommend soup
-----------------------------------------------------------------------------------------------------------------------------
absolutely delicious individually wrapped pieces delicious although wish mildly sweet sugarfree aftertaste even traditional marzipanaholic enjoy
-----------------------------------------------------------------------------------------------------------------------------
yum quite liked flavor bit harsh first becomes mellow certainly caffeinated good expensive buy though liked


In [23]:
# Saving the processed dataframe
data.to_pickle('processed_reviews')

In [24]:
data = pd.read_pickle('processed_reviews')

In [25]:
# Stemming
data = stemming(data,'Text')

In [26]:
# Saving the stemmed data too for later usage
data.to_pickle('processed_stemmed_reviews')

In [27]:
data = pd.read_pickle('processed_stemmed_reviews')
print (data.Text.iloc[290620]);print('-'*125); print(data.Text.iloc[293]);print('-'*125); print(data.Text.iloc[156971])
# We have picked three random reviews and they seem free from punctuations

delici runner singl mom soup perfect tast nutrit need eat lunch healthi snack sometim breakfast tri mani cuposoup one favorit overli salti nutrit yeast flavor love subscrib product appreci deliv can't alway find store even carri alway shelf highli recommend soup
-----------------------------------------------------------------------------------------------------------------------------
absolut delici individu wrap piec delici although wish mildli sweet sugarfre aftertast even tradit marzipanahol enjoy
-----------------------------------------------------------------------------------------------------------------------------
yum quit like flavor bit harsh first becom mellow certainli caffein good expens buy though like


In [28]:
# Utility functions and algorithm

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

def metric(observed,predicted):
    '''Prints different metric btaking in observed and predicted value'''
    pre_rec = precision_recall_fscore_support(observed,predicted)
    roc_auc = roc_auc_score(observed,predicted)
    print ('---Precision:---\n{}\n---Recall:---\n{}\n---fscore:---\n{}\n---AUC:---\n{}'.format(pre_rec[0],pre_rec[1],pre_rec[2],roc_auc))
    
    
def lr_classifier(X_train,X_test,y_train,param):
    '''Logistic regression with hyperparameter tuning'''
    lr = LogisticRegression(class_weight= 'balanced',n_jobs=-1,penalty='l1')
    clf = GridSearchCV(lr,param)
    clf.fit(X_train,y_train)

    lr_parameters = lr.get_params()
    lr_parameters['C'] = clf.best_params_['C']

    lr.set_params(**lr_parameters)
    print ('\n---Parameters for LR---\n{}'.format(lr.get_params))

    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    
    return (y_pred) 

def nb_classifier(X_train,X_test,y_train,param):
    '''Naive Bayes with hyper parameter tuning'''
    nb = MultinomialNB(class_prior=[1,1])
    clf = GridSearchCV(nb,param)
    clf.fit(X_train,y_train)
    
    nb_parameters = nb.get_params()
    nb_parameters['alpha'] = clf.best_params_['alpha']

    nb.set_params(**nb_parameters)
    print ('\n---Parameters for NB---\n{}'.format(nb.get_params))

    nb.fit(X_train,y_train)
    y_pred = nb.predict(X_test)
    
    return (y_pred) 

def rf_classifier(X_train,X_test,y_train,param):
    '''Random forest with hyperparameter tuning as number of trees'''
    rf = RandomForestClassifier(n_jobs=-1,class_weight='balanced',verbose=0)
    clf = GridSearchCV(rf,param)
    clf.fit(X_train,y_train)

    rf_parameters = rf.get_params()
    rf_parameters['n_estimators'] = clf.best_params_['n_estimators']

    rf.set_params(**rf_parameters)
    print ('\n---Parameters for RF---\n{}'.format(rf.get_params))

    rf.fit(X_train,y_train)
    y_pred = rf.predict(X_test)

    return (y_pred)

def xgb_classifier(X_train,X_test,y_train,param):
    
    xg = XGBClassifier(silent=True,nthread=4) # Change this n_thread according to the number of cores
    clf = GridSearchCV(xg,param)
    clf.fit(X_train,y_train)

    xg_parameters = xg.get_params()
    xg_parameters['n_estimators'] = clf.best_params_['n_estimators']
    xg_parameters['max_depth'] = clf.best_params_['max_depth']

    xg.set_params(**xg_parameters)
    print ('\n---Parameters for xgboost---\n{}'.format(xg.get_params))

    xg.fit(X_train,y_train)
    y_pred = xg.predict(X_test)

    return (y_pred)



### BOW approach on text

In [29]:
l = int(0.8*data.shape[0])
print ('Size of training set -- {}\nSize of test set -- {}'.format(l,data.shape[0]-l))

Size of training set -- 293121
Size of test set -- 73281


In [30]:
from sklearn.feature_extraction.text import CountVectorizer
text_vectorizer = CountVectorizer()
text_features = text_vectorizer.fit_transform(data.Text)
text_features.get_shape()

(366402, 278683)

In [31]:
# Performing Logistic regression on BOW approach

C = np.arange(1,11,1)
parameter = {'C':list(C)}

y_pred = lr_classifier(text_features[:l],text_features[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for LR---
<bound method BaseEstimator.get_params of LogisticRegression(C=2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)>

===METRICS===
---Precision:---
[ 0.69488575  0.97097284]
---Recall:---
[ 0.84724925  0.93213392]
---fscore:---
[ 0.76354071  0.95115707]
---AUC:---
0.8896915865782747


In [32]:
# Performing Naive Bayes on BOW approach

alpha = [0.125,0.25,0.5,1,2,4,8]
parameter = {'alpha':alpha}

y_pred = nb_classifier(text_features[:l],text_features[l:],data.Score[:l],parameter)

print ('\n===METRICS===')

metric(data.Score[l:],y_pred)


---Parameters for NB---
<bound method BaseEstimator.get_params of MultinomialNB(alpha=1, class_prior=[1, 1], fit_prior=True)>

===METRICS===
---Precision:---
[ 0.68177071  0.95364434]
---Recall:---
[ 0.75057492  0.93608713]
---fscore:---
[ 0.71452027  0.94478418]
---AUC:---
0.8433310239409232


[Note]:
We cannot apply Random forest, xgboost or any of it's variations since the dimensionality is huge when we are applying BOW

### Tf-idf approach on text

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_text_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_text_features = tfidf_text_vectorizer.fit_transform(data['Text'])
tfidf_text_features.get_shape()

(366402, 278683)

In [34]:
# Performing Logistic regression on Tfidf approach

C = np.arange(1,11,1)
parameter = {'C':list(C)}

y_pred = lr_classifier(tfidf_text_features[:l],tfidf_text_features[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for LR---
<bound method BaseEstimator.get_params of LogisticRegression(C=6, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)>

===METRICS===
---Precision:---
[ 0.68664831  0.97325688]
---Recall:---
[ 0.86016275  0.92839048]
---fscore:---
[ 0.76367349  0.9502944 ]
---AUC:---
0.8942766127385838


In [35]:
# Performing Naive Bayes on tfidf approach

alpha = [0.125,0.25,0.5,1,2,4,8]
parameter = {'alpha':alpha}

y_pred = nb_classifier(tfidf_text_features[:l],tfidf_text_features[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for NB---
<bound method BaseEstimator.get_params of MultinomialNB(alpha=0.25, class_prior=[1, 1], fit_prior=True)>

===METRICS===
---Precision:---
[ 0.62240332  0.94213774]
---Recall:---
[ 0.68901468  0.92374344]
---fscore:---
[ 0.65401729  0.93284993]
---AUC:---
0.80637906370348


[Note]: Due to the dimensionality of the data, we cannot apply any form of decision tree based methods. The same reason was for BOW approach

### W2V Approach

In [36]:
# Loading unstemmed data 
data = pd.read_pickle('processed_reviews')

In [37]:
# Import necessary libraries and loading google's w2v model

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [38]:
type(model)

gensim.models.keyedvectors.KeyedVectors

In [40]:
# Utility functions for W2v model
from sklearn.preprocessing import StandardScaler

def center_scale(X):
    '''This function standardizes the features'''
    X = StandardScaler().fit_transform(X)
    return X

def get_avg_sen_vector(sentence,weight=1):
    '''This function produces the mean weighted sentence vector'''
    _l = 0
    sen_vec = np.zeros(shape=(300,))
    for i in sentence.split():
        try:
            sen_vec += model[i]*weight
            _l += 1
        except:
            a=None
    sen_vec = sen_vec/_l
    return (sen_vec)
    
def get_vector(df,way='avg'):
    '''This function gets the vector of the whole dataset be it mean weighted or idf weighted'''
    w2v_vectors = []
    text = df.Text.tolist()
    for i in range(len(text)):
        if(way=='avg'):
            w2v_vectors.append(get_avg_sen_vector(text[i]))
    return w2v_vectors

In [41]:
# Getting the 300 dim mean weighted vector
avg_w2v = get_vector(data,'avg')

In [42]:
print ('Dimensions: {} x {}'.format(len(avg_w2v),len(avg_w2v[0])))

Dimensions: 366402 x 300


In [43]:
import math
for i in range(len(avg_w2v)):
    for j in range(len(avg_w2v[i])):
        if(math.isnan(avg_w2v[i][j] or math.isinf(avg_w2v[i][j]))):
            print (i,j)

172301 0
172301 1
172301 2
172301 3
172301 4
172301 5
172301 6
172301 7
172301 8
172301 9
172301 10
172301 11
172301 12
172301 13
172301 14
172301 15
172301 16
172301 17
172301 18
172301 19
172301 20
172301 21
172301 22
172301 23
172301 24
172301 25
172301 26
172301 27
172301 28
172301 29
172301 30
172301 31
172301 32
172301 33
172301 34
172301 35
172301 36
172301 37
172301 38
172301 39
172301 40
172301 41
172301 42
172301 43
172301 44
172301 45
172301 46
172301 47
172301 48
172301 49
172301 50
172301 51
172301 52
172301 53
172301 54
172301 55
172301 56
172301 57
172301 58
172301 59
172301 60
172301 61
172301 62
172301 63
172301 64
172301 65
172301 66
172301 67
172301 68
172301 69
172301 70
172301 71
172301 72
172301 73
172301 74
172301 75
172301 76
172301 77
172301 78
172301 79
172301 80
172301 81
172301 82
172301 83
172301 84
172301 85
172301 86
172301 87
172301 88
172301 89
172301 90
172301 91
172301 92
172301 93
172301 94
172301 95
172301 96
172301 97
172301 98
172301 99
172301 100

In [44]:
print (avg_w2v[172301])

[ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  

So we notice that there's something wrong with the location 172301. Everything is NaN here. However, one entry cannot harm our model that much. Hence let's change the nan there and move on.

In [45]:
avg_w2v[172301] = list(np.zeros(shape=(300,)))

In [46]:
# Centering and scaling the data, so that the optimisation algorithm converges faster
avg_w2v = center_scale(avg_w2v)

In [47]:
# Performing Logistic regression on mean weighted w2v approach

C = [0.125,0.25,0.5,1,2,4,8]
parameter = {'C':C}

y_pred = lr_classifier(avg_w2v[:l],avg_w2v[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for LR---
<bound method BaseEstimator.get_params of LogisticRegression(C=8, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)>

===METRICS===
---Precision:---
[ 0.52353286  0.97278063]
---Recall:---
[ 0.86874226  0.85576442]
---fscore:---
[ 0.65334087  0.91052835]
---AUC:---
0.8622533409420308


In [None]:
# Random forest on mean weighted w2v 

parameter = {'n_estimators':[50,100,150,200,250]}

y_pred = rf_classifier(avg_w2v[:l],avg_w2v[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)

In [None]:
# Xgboost on mean weighted w2v

parameter = {'n_estimators':[50,100,150,200,250], 'max_depth':[1,2,3,4,5]}

y_pred = xgb_classifier(avg_w2v[:l],avg_w2v[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)

In [None]:
# Now we would try to do tfidf weighted w2v

In [48]:
# Utility function for tfidf weighted w2v

def get_tfidfweighted_w2v(df):
    '''This function will take in the dataframe and return the tfidf weighted word 2 vec'''
    
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(min_df=0)
    tfidf_features = tfidf_vectorizer.fit_transform(df['Text'])
    tfidf_vec = []
    
    check = 0; words_not_in = [] # Debugging purposes
    
    text = df['Text'].tolist()
    
    for loc in range(len(text)):
        sentence = text[loc]
        sen_vec = np.zeros(shape=(300,)); N = 0
        for word in sentence.split():
            try:
                if(word in tfidf_vectorizer.vocabulary_.keys()):
                    x = loc
                    y = tfidf_vectorizer.vocabulary_[word]
                    tfidf_weight = tfidf_features[x,y]
                    #print (tfidf_weight)
                    sen_vec += tfidf_weight * model[word]
                    N += tfidf_weight
            except:
                #print ('Entered')
                check = check + 1
                words_not_in.append(word)
        sen_vec = sen_vec / N
        tfidf_vec.append(sen_vec)
        
    return tfidf_vec,words_not_in

In [49]:
tfidf_w2v,word_not_in =  get_tfidfweighted_w2v(data)

In [50]:
for i in range(len(tfidf_w2v)):
    for j in range(len(tfidf_w2v[i])):
        if(math.isnan(tfidf_w2v[i][j] or math.isinf(tfidf_w2v[i][j]))):
            print (i,j)

172301 0
172301 1
172301 2
172301 3
172301 4
172301 5
172301 6
172301 7
172301 8
172301 9
172301 10
172301 11
172301 12
172301 13
172301 14
172301 15
172301 16
172301 17
172301 18
172301 19
172301 20
172301 21
172301 22
172301 23
172301 24
172301 25
172301 26
172301 27
172301 28
172301 29
172301 30
172301 31
172301 32
172301 33
172301 34
172301 35
172301 36
172301 37
172301 38
172301 39
172301 40
172301 41
172301 42
172301 43
172301 44
172301 45
172301 46
172301 47
172301 48
172301 49
172301 50
172301 51
172301 52
172301 53
172301 54
172301 55
172301 56
172301 57
172301 58
172301 59
172301 60
172301 61
172301 62
172301 63
172301 64
172301 65
172301 66
172301 67
172301 68
172301 69
172301 70
172301 71
172301 72
172301 73
172301 74
172301 75
172301 76
172301 77
172301 78
172301 79
172301 80
172301 81
172301 82
172301 83
172301 84
172301 85
172301 86
172301 87
172301 88
172301 89
172301 90
172301 91
172301 92
172301 93
172301 94
172301 95
172301 96
172301 97
172301 98
172301 99
172301 100

Let's convert the null values again

In [51]:
tfidf_w2v[172301] = list(np.zeros(shape=(300,)))
tfidf_w2v[257702] = list(np.zeros(shape=(300,)))

In [52]:
# Centering and scaling the data, so that the optimisation algorithm converges faster
tfidf_w2v = center_scale(tfidf_w2v)

In [53]:
# Performing Logistic regression on mean weighted w2v approach

C = [0.125,0.25,0.5,1,2,4,8]
parameter = {'C':C}

y_pred = lr_classifier(tfidf_w2v[:l],tfidf_w2v[l:],data.Score[:l],parameter)

print ('\n===METRICS===')
metric(data.Score[l:],y_pred)


---Parameters for LR---
<bound method BaseEstimator.get_params of LogisticRegression(C=2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)>

===METRICS===
---Precision:---
[ 0.45024493  0.9631903 ]
---Recall:---
[ 0.82920573  0.81529649]
---fscore:---
[ 0.58360309  0.88309419]
---AUC:---
0.8222511109951934
