In [None]:
# Install the third-party packages used further down (shell escapes; no versions are pinned).
!pip install seaborn
!pip install gensim
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides numerical warnings such as divide-by-zero in later cells.
warnings.filterwarnings('ignore')

### Objective: 
To classify each review from the Amazon Fine Food Reviews dataset as negative or positive using the Naive Bayes algorithm.

### Result: 
After trying several ways of converting a review into a vector, I found that the tf-idf representation combined 
with Multinomial Naive Bayes performs best (accuracy ≈ 89.75%) compared with the other representations.
    

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split,cross_val_score,TimeSeriesSplit,cross_val_predict
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score,make_scorer

from sklearn.metrics import confusion_matrix,make_scorer
from sklearn.model_selection import cross_validate
from collections import OrderedDict
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Load the pre-cleaned Amazon review table and discard every row that has a missing value.
df=pd.read_csv('cleaned_amazon_reviews.csv').dropna()

# Working sample: point this at a subset of df to run on fewer reviews.
df_sample=df

In [3]:
# Oldest reviews first: the later train/test split is time based and relies on this order.
df_sample=df_sample.sort_values('Time')

# using Binary BOW and Bernoulli Naive Bayes

In [4]:
# Cleaned review texts as a plain list, one string per review, for the vectorizer.
list_of_sentence=list(df_sample['CleanText'].values)

from sklearn.feature_extraction.text import CountVectorizer
# Binary bag of words (presence/absence) over unigrams and bigrams.
# NOTE(review): the vocabulary is fitted on all reviews, i.e. before the train/test split.
bow_model=CountVectorizer(binary=True,ngram_range=(1,2))
bow_review_matrix=bow_model.fit_transform(list_of_sentence)

In [5]:
# Feature matrix, and label vector encoded as 1 for 'positive' and 0 for every other value.
X=bow_review_matrix
y=df_sample['class'].eq('positive').astype('int64')

In [6]:
# Time-based 70/30 split: rows are already in chronological order, so shuffling is disabled.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

In [7]:
def _confusion_cells(y_true,y_pred):
    '''Flattened confusion matrix of the two label vectors; callers unpack it as (tn, fp, fn, tp).'''
    return confusion_matrix(y_true,y_pred).ravel()

def tnr(y_true,y_pred):
    '''True negative rate: tn / (tn + fp).'''
    tn,fp,_,_=_confusion_cells(y_true,y_pred)
    return tn/(tn+fp)

def fpr(y_true,y_pred):
    '''False positive rate: fp / (tn + fp).'''
    tn,fp,_,_=_confusion_cells(y_true,y_pred)
    return fp/(tn+fp)

def fnr(y_true,y_pred):
    '''False negative rate: fn / (fn + tp).'''
    _,_,fn,tp=_confusion_cells(y_true,y_pred)
    return fn/(fn+tp)

def tpr(y_true,y_pred):
    '''True positive rate: tp / (fn + tp).'''
    _,_,fn,tp=_confusion_cells(y_true,y_pred)
    return tp/(fn+tp)

# Rebind each name to the scorer object wrapping the function of the same name.
tnr,fpr,fnr,tpr=[make_scorer(rate) for rate in (tnr,fpr,fnr,tpr)]

# Metrics evaluated on every cross-validation fold.
scoring={
    'accuracy':'accuracy',
    'tnr':tnr,
    'fnr':fnr,
    'fpr':fpr,
    'tpr':tpr,
    'precision':'precision',
    'recall':'recall',
    'f1':'f1',
}


In [8]:
from  sklearn.metrics import accuracy_score
# Time-ordered folds for cross-validation on the training part.
tscv=TimeSeriesSplit(n_splits=10)
# Class-balanced per-sample weights (a reweighting; no rows are duplicated).
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train)
# Smoothing values to compare.
alpha_list=[0.7,0.8,0.9,1,1.2,1.4]
# One list per metric; entry i is the mean fold score obtained with alpha_list[i].
# These names previously held the scorer objects, which stay reachable through `scoring`.
accuracy,tpr,tnr,fpr,fnr,f1,precision,recall=[[] for _ in range(8)]
fold_score_targets=[
    ('test_accuracy',accuracy),
    ('test_tnr',tnr),
    ('test_fnr',fnr),
    ('test_fpr',fpr),
    ('test_tpr',tpr),
    ('test_f1',f1),
    ('test_precision',precision),
    ('test_recall',recall),
]
for alpha in alpha_list:
    nb_clf_model=BernoulliNB(alpha=alpha)
    cv_dict=cross_validate(
        nb_clf_model,X_train,y_train,
        scoring=scoring,
        cv=tscv.split(X_train),
        fit_params={'sample_weight':weight_vector_y_train},
    )
    for fold_key,mean_scores in fold_score_targets:
        mean_scores.append(np.mean(cv_dict[fold_key]))


In [9]:
# One row per alpha, one column per cross-validated metric.
score_columns=OrderedDict([
    ('Accuracy',accuracy),
    ('TNR',tnr),
    ('TPR',tpr),
    ('FNR',fnr),
    ('FPR',fpr),
    ('Precesion',precision),
    ('Recall',recall),
    ('F1',f1),
])
score_df=pd.DataFrame(score_columns,index=pd.Index(alpha_list,name='alpha'))
score_df

Unnamed: 0_level_0,Accuracy,TNR,TPR,FNR,FPR,Precesion,Recall,F1
alpha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.7,0.899909,0.442153,0.977935,0.022065,0.557847,0.9113,0.977935,0.943053
0.8,0.898064,0.428957,0.978101,0.021899,0.571043,0.909413,0.978101,0.942083
0.9,0.896381,0.416034,0.978462,0.021538,0.583966,0.907536,0.978462,0.94121
1.0,0.894851,0.404465,0.978788,0.021212,0.595535,0.905831,0.978788,0.940417
1.2,0.891863,0.381213,0.97961,0.02039,0.618787,0.902355,0.97961,0.938879
1.4,0.889392,0.361819,0.980396,0.019604,0.638181,0.899409,0.980396,0.937614


In [10]:
def confusion_score(y_true,y_predicted,sample_weight=None):
    '''
    Compute the four rates derived from a binary confusion matrix.

    Parameters
    ----------
    y_true : true labels.
    y_predicted : predicted labels.
    sample_weight : optional per-sample weights, forwarded to confusion_matrix
        (previously this argument was accepted but ignored).

    Returns
    -------
    dict with the keys 'tnr', 'tpr', 'fpr' and 'fnr'.
    '''
    tn, fp, fn, tp = confusion_matrix(y_true,y_predicted,sample_weight=sample_weight).ravel()
    negatives=tn+fp  # all samples whose true label is the negative class
    positives=fn+tp  # all samples whose true label is the positive class
    return {'tnr':tn/negatives,'tpr':tp/positives,'fpr':fp/negatives,'fnr':fn/positives}
  

In [11]:
#test stage/evaluation stage
# Ask for the smoothing value. On blank input, or when no stdin is available
# (for example a headless "run all"), fall back to the alpha with the best mean
# cross-validated accuracy in score_df, so the cell no longer crashes there.
try:
    alpha_text=input('please enter optimal alpha  ').strip()
except (EOFError,NotImplementedError):
    alpha_text=''
if alpha_text:
    optimal_alpha=float(alpha_text)
else:
    optimal_alpha=float(score_df['Accuracy'].idxmax())

# Retrain on the whole training part with class-balanced sample weights.
nb_clf_model=BernoulliNB(alpha=optimal_alpha)
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train)
nb_clf_model.fit(X_train,y_train,sample_weight=weight_vector_y_train)

# Evaluate on the held-out part; every metric below is weighted with class-balanced
# sample weights computed from the test labels.
weight_vector_y_test=compute_sample_weight(class_weight='balanced',y=y_test)
y_predicted=nb_clf_model.predict(X_test)
accuracy=accuracy_score(y_test,y_predicted,normalize=True,sample_weight=weight_vector_y_test)
precesion=precision_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
recall=recall_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
f1=f1_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
confusion_result=confusion_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
print('Accuracy= ',accuracy*100,' %')
print('TNR= ',confusion_result['tnr']*100,' %')
print('TPR= ',confusion_result['tpr']*100,' %')
print('FNR= ',confusion_result['fnr']*100,' %')
print('FPR= ',confusion_result['fpr']*100,' %')
print('Precesion= ',precesion*100,' %')
print('Recall= ',recall*100,' %')
print('F1= ',f1*100,' %')

please enter optimal alpha  0.7
Accuracy=  85.2849958808  %
TNR=  74.7099377838  %
TPR=  95.8600539778  %
FNR=  4.13994602223  %
FPR=  25.2900622162  %
Precesion=  79.1250202552  %
Recall=  95.8600539778  %
F1=  86.6922935674  %


### feature importance

In [21]:
# Per-class log-probability of every vocabulary term, next to the term itself.
# get_feature_names() no longer exists in recent scikit-learn releases, so use
# get_feature_names_out() when the vectorizer provides it and the old method otherwise.
if hasattr(bow_model,'get_feature_names_out'):
    feature_names=bow_model.get_feature_names_out()
else:
    feature_names=bow_model.get_feature_names()
feature_prob=pd.DataFrame({'negative':nb_clf_model.feature_log_prob_[0,:],'positive':nb_clf_model.feature_log_prob_[1,:],'feature':feature_names})


In [22]:
# The 100 terms with the highest log-probability under the positive class.
feature_prob.sort_values('positive',ascending=False)['feature'].head(100)

1850292        not
1563628       like
2736080       tast
1620760       love
1185785       good
1217211      great
2787022        the
2826533       this
1042761     flavor
1910210        one
2974007        use
2907733        tri
2147763    product
1651550       make
1149117        get
383137         buy
2849770       time
251097        best
3135268      would
1019034       find
2231983     realli
2128916      price
88702       amazon
1786405       much
854640         eat
74927         also
807909        dont
1588739      littl
1933196      order
2637615      store
            ...   
274010         bit
1972621     packag
2674125      sugar
562145        come
2817112      thing
1615976        lot
1499477       know
761033      differ
1664342       mani
417932        cant
2937892        two
2380079        say
632769       could
675431         cup
925109       everi
2625958      still
1598803      local
499957      chocol
83555        alway
3110998    without
3045051      water
2901342     

In [23]:
# The 100 terms with the highest log-probability under the negative class.
feature_prob.sort_values('negative',ascending=False)['feature'].head(100)

1850292           not
2736080          tast
1563628          like
2787022           the
2147763       product
1910210           one
3135268         would
2907733           tri
2826533          this
1185785          good
1042761        flavor
383137            buy
1149117           get
2974007           use
807909           dont
1933196         order
917016           even
1651550          make
1786405          much
2849770          time
2231983        realli
1620760          love
1608367          look
88702          amazon
315343         bought
322642            box
854640            eat
776139     disappoint
1069061          food
2179667       purchas
              ...    
826659          drink
3045051         water
1089200         found
1348038         howev
2513803         smell
2247587     recommend
2752048           tea
376064            but
1397810       ingredi
417932           cant
173671           away
3150313          year
499957         chocol
2482747          sinc
2800629   

### How the confusion matrix helps us recognise a "dumb" model

In [51]:
# Same classifier fitted WITHOUT sample weights, to show the effect of class imbalance.
# NOTE(review): relies on optimal_alpha, X_train/X_test and confusion_score left behind by
# earlier cells; the cell's execution count is out of sequence, so confirm which ones were live.
nb_clf_model=BernoulliNB(alpha=optimal_alpha).fit(X_train,y_train)
y_predicted=nb_clf_model.predict(X_test)
confusion_result=confusion_score(y_test,y_predicted)
for rate_name in ('tnr','tpr','fnr','fpr'):
    print(rate_name.upper()+'= ',confusion_result[rate_name]*100,' %')
# TNR and FPR come out poorly when Naive Bayes is fitted on the unbalanced data.

TNR=  23.7654839975  %
TPR=  94.4437566544  %
FNR=  5.55624334563  %
FPR=  76.2345160025  %


#### Here we can see how poor the TNR and FPR are when Naive Bayes is trained on unbalanced data: the positive class has an undue advantage over the negative class.

# using count based BOW and multinomial naive bayes

In [24]:
# Cleaned review texts as a plain list, one string per review, for the vectorizer.
list_of_sentence=list(df_sample['CleanText'].values)

from sklearn.feature_extraction.text import CountVectorizer
# Count-based bag of words over unigrams and bigrams.
# NOTE(review): the vocabulary is fitted on all reviews, i.e. before the train/test split.
bow_model=CountVectorizer(binary=False,ngram_range=(1,2))
bow_review_matrix=bow_model.fit_transform(list_of_sentence)

In [25]:
# Feature matrix, and label vector encoded as 1 for 'positive' and 0 for every other value.
X=bow_review_matrix
y=df_sample['class'].eq('positive').astype('int64')

In [26]:
# Time-based 70/30 split: rows are already in chronological order, so shuffling is disabled.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

In [27]:
def _confusion_cells(y_true,y_pred):
    '''Flattened confusion matrix of the two label vectors; callers unpack it as (tn, fp, fn, tp).'''
    return confusion_matrix(y_true,y_pred).ravel()

def tnr(y_true,y_pred):
    '''True negative rate: tn / (tn + fp).'''
    tn,fp,_,_=_confusion_cells(y_true,y_pred)
    return tn/(tn+fp)

def fpr(y_true,y_pred):
    '''False positive rate: fp / (tn + fp).'''
    tn,fp,_,_=_confusion_cells(y_true,y_pred)
    return fp/(tn+fp)

def fnr(y_true,y_pred):
    '''False negative rate: fn / (fn + tp).'''
    _,_,fn,tp=_confusion_cells(y_true,y_pred)
    return fn/(fn+tp)

def tpr(y_true,y_pred):
    '''True positive rate: tp / (fn + tp).'''
    _,_,fn,tp=_confusion_cells(y_true,y_pred)
    return tp/(fn+tp)

# Rebind each name to the scorer object wrapping the function of the same name.
tnr,fpr,fnr,tpr=[make_scorer(rate) for rate in (tnr,fpr,fnr,tpr)]

# Metrics evaluated on every cross-validation fold.
scoring={
    'accuracy':'accuracy',
    'tnr':tnr,
    'fnr':fnr,
    'fpr':fpr,
    'tpr':tpr,
    'precision':'precision',
    'recall':'recall',
    'f1':'f1',
}


In [28]:
from  sklearn.metrics import accuracy_score
# Time-ordered folds for cross-validation on the training part.
tscv=TimeSeriesSplit(n_splits=10)
# Class-balanced per-sample weights (a reweighting; no rows are duplicated).
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train)
# Smoothing values to compare.
alpha_list=[0.7,0.8,0.9,1,1.2,1.4]
# One list per metric; entry i is the mean fold score obtained with alpha_list[i].
# These names previously held the scorer objects, which stay reachable through `scoring`.
accuracy,tpr,tnr,fpr,fnr,f1,precision,recall=[[] for _ in range(8)]
fold_score_targets=[
    ('test_accuracy',accuracy),
    ('test_tnr',tnr),
    ('test_fnr',fnr),
    ('test_fpr',fpr),
    ('test_tpr',tpr),
    ('test_f1',f1),
    ('test_precision',precision),
    ('test_recall',recall),
]
for alpha in alpha_list:
    nb_clf_model=MultinomialNB(alpha=alpha)
    cv_dict=cross_validate(
        nb_clf_model,X_train,y_train,
        scoring=scoring,
        cv=tscv.split(X_train),
        fit_params={'sample_weight':weight_vector_y_train},
    )
    for fold_key,mean_scores in fold_score_targets:
        mean_scores.append(np.mean(cv_dict[fold_key]))


In [29]:
# One row per alpha, one column per cross-validated metric.
score_columns=OrderedDict([
    ('Accuracy',accuracy),
    ('TNR',tnr),
    ('TPR',tpr),
    ('FNR',fnr),
    ('FPR',fpr),
    ('Precesion',precision),
    ('Recall',recall),
    ('F1',f1),
])
score_df=pd.DataFrame(score_columns,index=pd.Index(alpha_list,name='alpha'))
score_df

Unnamed: 0_level_0,Accuracy,TNR,TPR,FNR,FPR,Precesion,Recall,F1
alpha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.7,0.916976,0.677417,0.956944,0.043056,0.322583,0.945633,0.956944,0.950998
0.8,0.916909,0.681498,0.95596,0.04404,0.318502,0.946472,0.95596,0.950902
0.9,0.916789,0.684731,0.955072,0.044928,0.315269,0.947174,0.955072,0.950786
1.0,0.916541,0.68724,0.954166,0.045834,0.31276,0.947753,0.954166,0.950594
1.2,0.915762,0.690262,0.952456,0.047544,0.309738,0.948485,0.952456,0.950051
1.4,0.91493,0.693008,0.950779,0.049221,0.306992,0.949121,0.950779,0.949474


In [30]:
def confusion_score(y_true,y_predicted,sample_weight=None):
    '''
    Compute the four rates derived from a binary confusion matrix.

    Parameters
    ----------
    y_true : true labels.
    y_predicted : predicted labels.
    sample_weight : optional per-sample weights, forwarded to confusion_matrix
        (previously this argument was accepted but ignored).

    Returns
    -------
    dict with the keys 'tnr', 'tpr', 'fpr' and 'fnr'.
    '''
    tn, fp, fn, tp = confusion_matrix(y_true,y_predicted,sample_weight=sample_weight).ravel()
    negatives=tn+fp  # all samples whose true label is the negative class
    positives=fn+tp  # all samples whose true label is the positive class
    return {'tnr':tn/negatives,'tpr':tp/positives,'fpr':fp/negatives,'fnr':fn/positives}
  

In [31]:
#test stage/evaluation stage
# Ask for the smoothing value. On blank input, or when no stdin is available
# (for example a headless "run all"), fall back to the alpha with the best mean
# cross-validated accuracy in score_df, so the cell no longer crashes there.
try:
    alpha_text=input('please enter optimal alpha  ').strip()
except (EOFError,NotImplementedError):
    alpha_text=''
if alpha_text:
    optimal_alpha=float(alpha_text)
else:
    optimal_alpha=float(score_df['Accuracy'].idxmax())

# Retrain on the whole training part with class-balanced sample weights.
nb_clf_model=MultinomialNB(alpha=optimal_alpha)
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train)
nb_clf_model.fit(X_train,y_train,sample_weight=weight_vector_y_train)

# Evaluate on the held-out part; every metric below is weighted with class-balanced
# sample weights computed from the test labels.
weight_vector_y_test=compute_sample_weight(class_weight='balanced',y=y_test)
y_predicted=nb_clf_model.predict(X_test)
accuracy=accuracy_score(y_test,y_predicted,normalize=True,sample_weight=weight_vector_y_test)
precesion=precision_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
recall=recall_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
f1=f1_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
confusion_result=confusion_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
print('Accuracy= ',accuracy*100,' %')
print('TNR= ',confusion_result['tnr']*100,' %')
print('TPR= ',confusion_result['tpr']*100,' %')
print('FNR= ',confusion_result['fnr']*100,' %')
print('FPR= ',confusion_result['fpr']*100,' %')
print('Precesion= ',precesion*100,' %')
print('Recall= ',recall*100,' %')
print('F1= ',f1*100,' %')

please enter optimal alpha  0.7
Accuracy=  87.3790182404  %
TNR=  80.8082506586  %
TPR=  93.9497858222  %
FNR=  6.05021417783  %
FPR=  19.1917493414  %
Precesion=  83.0373970853  %
Recall=  93.9497858222  %
F1=  88.1571822686  %


### feature importance

In [38]:
# Per-class log-probability of every vocabulary term, next to the term itself.
# get_feature_names() no longer exists in recent scikit-learn releases, so use
# get_feature_names_out() when the vectorizer provides it and the old method otherwise.
if hasattr(bow_model,'get_feature_names_out'):
    feature_names=bow_model.get_feature_names_out()
else:
    feature_names=bow_model.get_feature_names()
feature_prob=pd.DataFrame({'negative':nb_clf_model.feature_log_prob_[0,:],'positive':nb_clf_model.feature_log_prob_[1,:],'feature':feature_names})

In [39]:
# The 100 terms with the highest log-probability under the positive class.
feature_prob.sort_values('positive',ascending=False)['feature'].head(100)

1850292        not
1563628       like
2736080       tast
1185785       good
1620760       love
2787022        the
1042761     flavor
2974007        use
1217211      great
1910210        one
2147763    product
2907733        tri
2752048        tea
2826533       this
543608       coffe
1651550       make
1149117        get
1069061       food
3135268      would
383137         buy
2849770       time
854640         eat
2231983     realli
1019034       find
251097        best
2128916      price
88702       amazon
1786405       much
1933196      order
1588739      littl
            ...   
2821173      think
992023     favorit
1815777       need
274010         bit
315343      bought
2482747       sinc
2027065    perfect
1475198       keep
441136         cat
2817112      thing
2937892        two
761033      differ
562145        come
1499477       know
1615976        lot
2522081      snack
1664342       mani
2380079        say
632769       could
1339075        hot
1099117       free
2625958     

In [40]:
# The 100 terms with the highest log-probability under the negative class.
feature_prob.sort_values('negative',ascending=False)['feature'].head(100)

1850292          not
2736080         tast
1563628         like
2147763      product
2787022          the
1910210          one
1042761       flavor
3135268        would
2907733          tri
543608         coffe
2974007          use
1185785         good
2826533         this
1149117          get
383137           buy
1933196        order
2752048          tea
807909          dont
1069061         food
322642           box
917016          even
1651550         make
88702         amazon
1786405         much
2849770         time
190642           bag
2231983       realli
854640           eat
1608367         look
1620760         love
             ...    
2544849       someth
1438069         item
2817112        thing
1748609          mix
1924247         open
1763894        money
675431           cup
761033        differ
2625958        still
181608          back
2454181         ship
701518           day
3150313         year
2704921        sweet
1089200        found
2657844        stuff
603397       

# using tfidf-BOW and multinomial naive bayes

In [41]:
# Cleaned review texts as a plain list, one string per review, for the vectorizer.
list_of_sentence=list(df_sample['CleanText'].values)

from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighted bag of words over unigrams and bigrams.
# NOTE(review): vocabulary and IDF are fitted on all reviews, i.e. before the train/test split.
tfidf_model=TfidfVectorizer(ngram_range=(1,2))
tfidf_bow_review_matrix=tfidf_model.fit_transform(list_of_sentence)

In [42]:
# Feature matrix, and label vector encoded as 1 for 'positive' and 0 for every other value.
X=tfidf_bow_review_matrix
y=df_sample['class'].eq('positive').astype('int64')

In [43]:
# Time-based 70/30 split: rows are already in chronological order, so shuffling is disabled.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

In [44]:
def _confusion_cells(y_true,y_pred):
    '''Flattened confusion matrix of the two label vectors; callers unpack it as (tn, fp, fn, tp).'''
    return confusion_matrix(y_true,y_pred).ravel()

def tnr(y_true,y_pred):
    '''True negative rate: tn / (tn + fp).'''
    tn,fp,_,_=_confusion_cells(y_true,y_pred)
    return tn/(tn+fp)

def fpr(y_true,y_pred):
    '''False positive rate: fp / (tn + fp).'''
    tn,fp,_,_=_confusion_cells(y_true,y_pred)
    return fp/(tn+fp)

def fnr(y_true,y_pred):
    '''False negative rate: fn / (fn + tp).'''
    _,_,fn,tp=_confusion_cells(y_true,y_pred)
    return fn/(fn+tp)

def tpr(y_true,y_pred):
    '''True positive rate: tp / (fn + tp).'''
    _,_,fn,tp=_confusion_cells(y_true,y_pred)
    return tp/(fn+tp)

# Rebind each name to the scorer object wrapping the function of the same name.
tnr,fpr,fnr,tpr=[make_scorer(rate) for rate in (tnr,fpr,fnr,tpr)]

# Metrics evaluated on every cross-validation fold.
scoring={
    'accuracy':'accuracy',
    'tnr':tnr,
    'fnr':fnr,
    'fpr':fpr,
    'tpr':tpr,
    'precision':'precision',
    'recall':'recall',
    'f1':'f1',
}


In [45]:
from  sklearn.metrics import accuracy_score
# Time-ordered folds for cross-validation on the training part.
tscv=TimeSeriesSplit(n_splits=10)
# Class-balanced per-sample weights (a reweighting; no rows are duplicated).
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train)
# Smoothing values to compare.
alpha_list=[0.7,0.8,0.9,1,1.2,1.4]
# One list per metric; entry i is the mean fold score obtained with alpha_list[i].
# These names previously held the scorer objects, which stay reachable through `scoring`.
accuracy,tpr,tnr,fpr,fnr,f1,precision,recall=[[] for _ in range(8)]
fold_score_targets=[
    ('test_accuracy',accuracy),
    ('test_tnr',tnr),
    ('test_fnr',fnr),
    ('test_fpr',fpr),
    ('test_tpr',tpr),
    ('test_f1',f1),
    ('test_precision',precision),
    ('test_recall',recall),
]
for alpha in alpha_list:
    nb_clf_model=MultinomialNB(alpha=alpha)
    cv_dict=cross_validate(
        nb_clf_model,X_train,y_train,
        scoring=scoring,
        cv=tscv.split(X_train),
        fit_params={'sample_weight':weight_vector_y_train},
    )
    for fold_key,mean_scores in fold_score_targets:
        mean_scores.append(np.mean(cv_dict[fold_key]))


In [46]:
# One row per alpha, one column per cross-validated metric.
score_columns=OrderedDict([
    ('Accuracy',accuracy),
    ('TNR',tnr),
    ('TPR',tpr),
    ('FNR',fnr),
    ('FPR',fpr),
    ('Precesion',precision),
    ('Recall',recall),
    ('F1',f1),
])
score_df=pd.DataFrame(score_columns,index=pd.Index(alpha_list,name='alpha'))
score_df

Unnamed: 0_level_0,Accuracy,TNR,TPR,FNR,FPR,Precesion,Recall,F1
alpha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.7,0.914084,0.63274,0.959644,0.040356,0.36726,0.941118,0.959644,0.949515
0.8,0.913572,0.630166,0.959383,0.040617,0.369834,0.940868,0.959383,0.949214
0.9,0.912965,0.627448,0.959058,0.040942,0.372552,0.94055,0.959058,0.948857
1.0,0.912573,0.624542,0.959033,0.040967,0.375458,0.940213,0.959033,0.948641
1.2,0.911994,0.619544,0.959109,0.040891,0.380456,0.93963,0.959109,0.948326
1.4,0.911129,0.614205,0.958941,0.041059,0.385795,0.93892,0.958941,0.947833


In [47]:
def confusion_score(y_true,y_predicted,sample_weight=None):
    '''
    Compute the four rates derived from a binary confusion matrix.

    Parameters
    ----------
    y_true : true labels.
    y_predicted : predicted labels.
    sample_weight : optional per-sample weights, forwarded to confusion_matrix
        (previously this argument was accepted but ignored).

    Returns
    -------
    dict with the keys 'tnr', 'tpr', 'fpr' and 'fnr'.
    '''
    tn, fp, fn, tp = confusion_matrix(y_true,y_predicted,sample_weight=sample_weight).ravel()
    negatives=tn+fp  # all samples whose true label is the negative class
    positives=fn+tp  # all samples whose true label is the positive class
    return {'tnr':tn/negatives,'tpr':tp/positives,'fpr':fp/negatives,'fnr':fn/positives}
  

In [48]:
#test stage/evaluation stage
# Ask for the smoothing value. On blank input, or when no stdin is available
# (for example a headless "run all"), fall back to the alpha with the best mean
# cross-validated accuracy in score_df, so the cell no longer crashes there.
try:
    alpha_text=input('please enter optimal alpha  ').strip()
except (EOFError,NotImplementedError):
    alpha_text=''
if alpha_text:
    optimal_alpha=float(alpha_text)
else:
    optimal_alpha=float(score_df['Accuracy'].idxmax())

# Retrain on the whole training part with class-balanced sample weights.
nb_clf_model=MultinomialNB(alpha=optimal_alpha)
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train)
nb_clf_model.fit(X_train,y_train,sample_weight=weight_vector_y_train)

# Evaluate on the held-out part; every metric below is weighted with class-balanced
# sample weights computed from the test labels.
weight_vector_y_test=compute_sample_weight(class_weight='balanced',y=y_test)
y_predicted=nb_clf_model.predict(X_test)
accuracy=accuracy_score(y_test,y_predicted,normalize=True,sample_weight=weight_vector_y_test)
precesion=precision_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
recall=recall_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
f1=f1_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
confusion_result=confusion_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
print('Accuracy= ',accuracy*100,' %')
print('TNR= ',confusion_result['tnr']*100,' %')
print('TPR= ',confusion_result['tpr']*100,' %')
print('FNR= ',confusion_result['fnr']*100,' %')
print('FPR= ',confusion_result['fpr']*100,' %')
print('Precesion= ',precesion*100,' %')
print('Recall= ',recall*100,' %')
print('F1= ',f1*100,' %')

please enter optimal alpha  0.7
Accuracy=  89.7325604264  %
TNR=  89.2046409955  %
TPR=  90.2604798574  %
FNR=  9.73952014262  %
FPR=  10.7953590045  %
Precesion=  89.317431703  %
Recall=  90.2604798574  %
F1=  89.7864795853  %


### feature importance


In [49]:
# Per-class log-probability of every vocabulary term, next to the term itself.
# get_feature_names() no longer exists in recent scikit-learn releases, so use
# get_feature_names_out() when the vectorizer provides it and the old method otherwise.
if hasattr(tfidf_model,'get_feature_names_out'):
    feature_names=tfidf_model.get_feature_names_out()
else:
    feature_names=tfidf_model.get_feature_names()
feature_prob=pd.DataFrame({'negative':nb_clf_model.feature_log_prob_[0,:],'positive':nb_clf_model.feature_log_prob_[1,:],'feature':feature_names})

In [50]:
# The 100 terms with the highest log-probability under the positive class.
feature_prob.sort_values('positive',ascending=False)['feature'].head(100)

1620760       love
1217211      great
2736080       tast
2752048        tea
1563628       like
1185785       good
1850292        not
1042761     flavor
543608       coffe
2974007        use
2826533       this
2147763    product
2787022        the
1910210        one
2907733        tri
1651550       make
1149117        get
251097        best
2128916      price
1019034       find
383137         buy
1069061       food
1933196      order
88702       amazon
2849770       time
2231983     realli
854640         eat
2637615      store
799305         dog
3135268      would
            ...   
3031509       want
1030844      first
2454181       ship
3045051      water
1815777       need
1099117       free
205151         bar
3116145     wonder
1598803      local
618307       cooki
849287        easi
1475198       keep
3051460        way
417932        cant
1339075        hot
1105181      fresh
1641490       made
922660        ever
83555        alway
936199       excel
274010         bit
1615976     

In [51]:
# The 100 terms with the highest log-probability under the negative class.
feature_prob.sort_values('negative',ascending=False)['feature'].head(100)

1850292           not
2736080          tast
1563628          like
2147763       product
2787022           the
3135268         would
543608          coffe
1910210           one
1042761        flavor
2907733           tri
383137            buy
1933196         order
322642            box
2826533          this
2752048           tea
1149117           get
1185785          good
807909           dont
776139     disappoint
2974007           use
917016           even
190642            bag
1069061          food
187587            bad
88702          amazon
2179667       purchas
315343         bought
1972621        packag
1786405          much
1608367          look
              ...    
675431            cup
2637615         store
2454181          ship
1969625          pack
1445720           ive
74927            also
181608           back
944856         expect
2817112         thing
2657844         stuff
1588739         littl
1903232           old
1748609           mix
570250        compani
761033    

# using avg-w2v and multinomial naive bayes

In [52]:
import gensim
# Word2Vec is trained on a list of token lists, so split every cleaned review on whitespace.
sentence_list=[sentence.split() for sentence in df_sample['CleanText'].values]
# Train the embedding model with size=50 and min_count=5.
# NOTE(review): newer gensim releases name the `size` argument `vector_size` - confirm the installed version.
w2v_model=gensim.models.Word2Vec(sentence_list,min_count=5,size=50)

In [53]:
def get_sentence_vec(list_of_sentence):
    '''
    Average the Word2Vec vectors of the words of each sentence.

    Parameters
    ----------
    list_of_sentence : iterable of str
        Sentences; each one is split on whitespace into words.

    Returns
    -------
    list of numpy arrays of length 50, one per sentence, in input order.
    A sentence in which no word is known to the global `w2v_model` gets the
    all-zero vector (previously 0/0 produced a vector of NaN).
    '''
    vec_list=[]
    for sentence in list_of_sentence:
        vec=np.zeros(50)
        count=0
        for word in sentence.split():
            try:
                word_vec=w2v_model.wv[word]
            except KeyError:
                # word is not in the Word2Vec vocabulary: skip it
                continue
            vec=vec+word_vec
            count+=1
        if count:
            vec=vec/count
        vec_list.append(vec)
    return vec_list

# Every cleaned review, one string per entry.
list_of_sentence=list(df_sample['CleanText'].values)

# Average-Word2Vec sentence vector for each review.
w2v_data=get_sentence_vec(list_of_sentence)


In [54]:
# Put the sentence vectors in a frame and replace every NaN entry with 0.
dataframe=pd.DataFrame(w2v_data).fillna(0)
# Rescale every column to the range [0, 1].
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split.
w2v_data=MinMaxScaler(feature_range=(0,1)).fit_transform(dataframe)

In [55]:
# Feature matrix, and label vector encoded as 1 for 'positive' and 0 for every other value.
X=w2v_data
y=df_sample['class'].eq('positive').astype('int64')

In [56]:
# Time-based 70/30 split: rows are already in chronological order, so shuffling is disabled.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

In [57]:
def _confusion_cells(y_true,y_pred):
    '''Flattened confusion matrix of the two label vectors; callers unpack it as (tn, fp, fn, tp).'''
    return confusion_matrix(y_true,y_pred).ravel()

def tnr(y_true,y_pred):
    '''True negative rate: tn / (tn + fp).'''
    tn,fp,_,_=_confusion_cells(y_true,y_pred)
    return tn/(tn+fp)

def fpr(y_true,y_pred):
    '''False positive rate: fp / (tn + fp).'''
    tn,fp,_,_=_confusion_cells(y_true,y_pred)
    return fp/(tn+fp)

def fnr(y_true,y_pred):
    '''False negative rate: fn / (fn + tp).'''
    _,_,fn,tp=_confusion_cells(y_true,y_pred)
    return fn/(fn+tp)

def tpr(y_true,y_pred):
    '''True positive rate: tp / (fn + tp).'''
    _,_,fn,tp=_confusion_cells(y_true,y_pred)
    return tp/(fn+tp)

# Rebind each name to the scorer object wrapping the function of the same name.
tnr,fpr,fnr,tpr=[make_scorer(rate) for rate in (tnr,fpr,fnr,tpr)]

# Metrics evaluated on every cross-validation fold.
scoring={
    'accuracy':'accuracy',
    'tnr':tnr,
    'fnr':fnr,
    'fpr':fpr,
    'tpr':tpr,
    'precision':'precision',
    'recall':'recall',
    'f1':'f1',
}


In [58]:
from  sklearn.metrics import accuracy_score
# Time-ordered folds for cross-validation on the training part.
tscv=TimeSeriesSplit(n_splits=10)
# Class-balanced per-sample weights (a reweighting; no rows are duplicated).
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train)
# Smoothing values to compare.
alpha_list=[0.7,0.8,0.9,1,1.2,1.4]
# One list per metric; entry i is the mean fold score obtained with alpha_list[i].
# These names previously held the scorer objects, which stay reachable through `scoring`.
accuracy,tpr,tnr,fpr,fnr,f1,precision,recall=[[] for _ in range(8)]
fold_score_targets=[
    ('test_accuracy',accuracy),
    ('test_tnr',tnr),
    ('test_fnr',fnr),
    ('test_fpr',fpr),
    ('test_tpr',tpr),
    ('test_f1',f1),
    ('test_precision',precision),
    ('test_recall',recall),
]
for alpha in alpha_list:
    nb_clf_model=MultinomialNB(alpha=alpha)
    cv_dict=cross_validate(
        nb_clf_model,X_train,y_train,
        scoring=scoring,
        cv=tscv.split(X_train),
        fit_params={'sample_weight':weight_vector_y_train},
    )
    for fold_key,mean_scores in fold_score_targets:
        mean_scores.append(np.mean(cv_dict[fold_key]))


In [59]:
# One row per alpha, one column per cross-validated metric.
score_columns=OrderedDict([
    ('Accuracy',accuracy),
    ('TNR',tnr),
    ('TPR',tpr),
    ('FNR',fnr),
    ('FPR',fpr),
    ('Precesion',precision),
    ('Recall',recall),
    ('F1',f1),
])
score_df=pd.DataFrame(score_columns,index=pd.Index(alpha_list,name='alpha'))
score_df

Unnamed: 0_level_0,Accuracy,TNR,TPR,FNR,FPR,Precesion,Recall,F1
alpha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.7,0.854202,0.127347,0.985602,0.014398,0.872653,0.862293,0.985602,0.919125
0.8,0.854202,0.127347,0.985602,0.014398,0.872653,0.862293,0.985602,0.919125
0.9,0.854202,0.127347,0.985602,0.014398,0.872653,0.862293,0.985602,0.919125
1.0,0.854202,0.127347,0.985602,0.014398,0.872653,0.862293,0.985602,0.919125
1.2,0.854202,0.127347,0.985602,0.014398,0.872653,0.862293,0.985602,0.919125
1.4,0.854197,0.127319,0.985602,0.014398,0.872681,0.862289,0.985602,0.919122


In [60]:
def confusion_score(y_true,y_predicted,sample_weight=None):
    '''
    Compute the four rates derived from a binary confusion matrix.

    Parameters
    ----------
    y_true : true labels.
    y_predicted : predicted labels.
    sample_weight : optional per-sample weights, forwarded to confusion_matrix
        (previously this argument was accepted but ignored).

    Returns
    -------
    dict with the keys 'tnr', 'tpr', 'fpr' and 'fnr'.
    '''
    tn, fp, fn, tp = confusion_matrix(y_true,y_predicted,sample_weight=sample_weight).ravel()
    negatives=tn+fp  # all samples whose true label is the negative class
    positives=fn+tp  # all samples whose true label is the positive class
    return {'tnr':tn/negatives,'tpr':tp/positives,'fpr':fp/negatives,'fnr':fn/positives}
  

In [61]:
#test stage/evaluation stage
# Ask for the smoothing value. On blank input, or when no stdin is available
# (for example a headless "run all"), fall back to the alpha with the best mean
# cross-validated accuracy in score_df, so the cell no longer crashes there.
try:
    alpha_text=input('please enter optimal alpha  ').strip()
except (EOFError,NotImplementedError):
    alpha_text=''
if alpha_text:
    optimal_alpha=float(alpha_text)
else:
    optimal_alpha=float(score_df['Accuracy'].idxmax())

# Retrain on the whole training part with class-balanced sample weights.
nb_clf_model=MultinomialNB(alpha=optimal_alpha)
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train)
nb_clf_model.fit(X_train,y_train,sample_weight=weight_vector_y_train)

# Evaluate on the held-out part; every metric below is weighted with class-balanced
# sample weights computed from the test labels.
weight_vector_y_test=compute_sample_weight(class_weight='balanced',y=y_test)
y_predicted=nb_clf_model.predict(X_test)
accuracy=accuracy_score(y_test,y_predicted,normalize=True,sample_weight=weight_vector_y_test)
precesion=precision_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
recall=recall_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
f1=f1_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
confusion_result=confusion_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
print('Accuracy= ',accuracy*100,' %')
print('TNR= ',confusion_result['tnr']*100,' %')
print('TPR= ',confusion_result['tpr']*100,' %')
print('FNR= ',confusion_result['fnr']*100,' %')
print('FPR= ',confusion_result['fpr']*100,' %')
print('Precesion= ',precesion*100,' %')
print('Recall= ',recall*100,' %')
print('F1= ',f1*100,' %')

please enter optimal alpha  0.7
Accuracy=  79.3156133555  %
TNR=  83.4090017376  %
TPR=  75.2222249734  %
FNR=  24.7777750266  %
FPR=  16.5909982624  %
Precesion=  81.9296200725  %
Recall=  75.2222249734  %
F1=  78.4327834176  %


# using tf-idf weighted-w2v and multinomial naive bayes


In [6]:
import gensim

# Cleaned review texts, one string per review, in df_sample row order
list_of_sentence=list(df_sample['CleanText'].values)

# TF-IDF over unigrams and bigrams, fitted on every review
tfidf_model=TfidfVectorizer(ngram_range=(1,2))
tfidf_review_matrix=tfidf_model.fit_transform(list_of_sentence)

# Word2Vec is trained on a list of token lists, so split each review on whitespace
sentence_list=[text.split() for text in df_sample['CleanText'].values]
# NOTE(review): `size` is the keyword used here for the vector length; newer gensim
# releases may expect `vector_size` instead - confirm against the installed version.
w2v_model=gensim.models.Word2Vec(sentence_list,min_count=5,size=50)

In [7]:
def get_sentence_vec_tfidf(list_of_sentence):
    '''
    Build one TF-IDF-weighted average Word2Vec vector per sentence.

    Every whitespace-separated word of a sentence contributes its Word2Vec
    vector (module-level ``w2v_model``) multiplied by the TF-IDF value found in
    the module-level ``tfidf_review_matrix`` at (sentence row, word column).
    The weighted vectors are summed and divided by the sum of the weights used.
    Words missing from the TF-IDF vocabulary or from Word2Vec are skipped.

    Parameters
    ----------
    list_of_sentence : iterable of str
        Sentences in the same order as the rows of ``tfidf_review_matrix``:
        the i-th sentence is looked up in row i.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``w2v_model.wv.vector_size`` per sentence. A
        sentence whose weights sum to zero (for instance no usable word) gets
        an all-zero vector instead of the result of a division by zero.
    '''
    vocabulary=tfidf_model.vocabulary_  # term -> column index of the fitted vectorizer
    dimension=w2v_model.wv.vector_size
    vec_list=[]
    for row,sentence in enumerate(list_of_sentence):
        sent_vec=np.zeros(dimension)
        weighted_sum=0
        for word in sentence.split():
            try:
                tfidf_value=tfidf_review_matrix[row,vocabulary[word]]
                vec=w2v_model.wv[word]
            except KeyError:
                # only unknown words are skipped; any other error is left to surface
                continue
            sent_vec=sent_vec+(vec*tfidf_value)
            weighted_sum+=tfidf_value
        if weighted_sum!=0:
            sent_vec=sent_vec/weighted_sum
        vec_list.append(sent_vec)
    return vec_list
tfidfw2v_data=get_sentence_vec_tfidf(list_of_sentence) # one TF-IDF-weighted Word2Vec sentence vector per entry of list_of_sentence


In [13]:
# Put the sentence vectors in a DataFrame and replace every NaN entry with 0
dataframe=pd.DataFrame(tfidfw2v_data).fillna(0)
# Rescale each column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows here, i.e. before the train/test split done in a later cell.
tfidfw2v_data=MinMaxScaler(
    feature_range=(0, 1),
).fit_transform(dataframe)

In [14]:
# Feature matrix, and binary label vector: 1 where class is 'positive', 0 otherwise
X=tfidfw2v_data
y=df_sample['class'].eq('positive').astype('int64')

In [15]:
# 70/30 hold-out without shuffling (time-based split): the row order is kept,
# so the last 30% of the rows form the test set
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

In [16]:
def tnr(y_true,y_pred):
    '''True negative rate, TN / (TN + FP), read from the raveled confusion matrix.'''
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(y_true,y_pred).ravel()
    negatives = true_neg + false_pos
    return true_neg/negatives

def fpr(y_true,y_pred):
    '''False positive rate, FP / (TN + FP), read from the raveled confusion matrix.'''
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(y_true,y_pred).ravel()
    negatives = true_neg + false_pos
    return false_pos/negatives

def fnr(y_true,y_pred):
    '''False negative rate, FN / (FN + TP), read from the raveled confusion matrix.'''
    _true_neg, _false_pos, false_neg, true_pos = confusion_matrix(y_true,y_pred).ravel()
    positives = false_neg + true_pos
    return false_neg/positives

def tpr(y_true,y_pred):
    '''True positive rate, TP / (FN + TP), read from the raveled confusion matrix.'''
    _true_neg, _false_pos, false_neg, true_pos = confusion_matrix(y_true,y_pred).ravel()
    positives = false_neg + true_pos
    return true_pos/positives

# Wrap the four rate functions as scorers; from here on each name refers to the scorer object
tnr,fpr,fnr,tpr=[make_scorer(rate_function) for rate_function in (tnr,fpr,fnr,tpr)]

# Metrics evaluated on every cross-validation fold
scoring=dict(
    accuracy='accuracy',
    tnr=tnr,
    fnr=fnr,
    fpr=fpr,
    tpr=tpr,
    precision='precision',
    recall='recall',
    f1='f1',
)


In [17]:
from  sklearn.metrics import accuracy_score
# Training stage: time-series cross-validation over the candidate alpha values
tscv=TimeSeriesSplit(n_splits=10)
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train) # class-balanced per-sample weights
alpha_list=[0.7,0.8,0.9,1,1.2,1.4] # alpha values tried during cross-validation

# One list per metric; each receives the mean fold score for every alpha, in alpha_list order
accuracy,tpr,tnr,fpr,fnr=[],[],[],[],[]
f1,precision,recall=[],[],[]

for alpha in alpha_list:
    nb_clf_model=MultinomialNB(alpha=alpha)
    cv_dict=cross_validate(
        nb_clf_model,
        X_train,
        y_train,
        scoring=scoring,
        cv=tscv.split(X_train),
        fit_params={'sample_weight':weight_vector_y_train},
    )
    for score_list,score_key in (
        (accuracy,'test_accuracy'),
        (tnr,'test_tnr'),
        (fnr,'test_fnr'),
        (fpr,'test_fpr'),
        (tpr,'test_tpr'),
        (f1,'test_f1'),
        (precision,'test_precision'),
        (recall,'test_recall'),
    ):
        score_list.append(np.mean(cv_dict[score_key]))


In [18]:
# Mean cross-validation scores: one row per alpha, one column per metric
score_df=pd.DataFrame(
    OrderedDict([
        ('Accuracy',accuracy),
        ('TNR',tnr),
        ('TPR',tpr),
        ('FNR',fnr),
        ('FPR',fpr),
        ('Precesion',precision),
        ('Recall',recall),
        ('F1',f1),
    ]),
    index=alpha_list,
)
score_df=score_df.rename_axis('alpha')
score_df

Unnamed: 0_level_0,Accuracy,TNR,TPR,FNR,FPR,Precesion,Recall,F1
alpha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.7,0.750464,0.783054,0.744559,0.255441,0.216946,0.948853,0.744559,0.82913
0.8,0.750464,0.783054,0.744559,0.255441,0.216946,0.948853,0.744559,0.82913
0.9,0.750464,0.783054,0.744559,0.255441,0.216946,0.948853,0.744559,0.82913
1.0,0.750464,0.783054,0.744559,0.255441,0.216946,0.948853,0.744559,0.82913
1.2,0.750464,0.783054,0.744559,0.255441,0.216946,0.948853,0.744559,0.82913
1.4,0.750464,0.783054,0.744559,0.255441,0.216946,0.948853,0.744559,0.82913


In [19]:
def confusion_score(y_true,y_predicted,sample_weight=None):
    '''
    Compute the four rates of a binary confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_predicted : array-like
        Predicted labels.
    sample_weight : array-like or None, optional
        Per-sample weights forwarded to ``confusion_matrix``.
        With ``None`` (the default) every sample counts once.

    Returns
    -------
    dict
        Keys ``'tnr'``, ``'tpr'``, ``'fpr'`` and ``'fnr'``:
        tn/(tn+fp), tp/(fn+tp), fp/(tn+fp) and fn/(fn+tp).

    Raises
    ------
    ValueError
        If the raveled confusion matrix does not hold exactly four cells.
    '''
    # forward the weights so that the rates are built from weighted counts
    tn, fp, fn, tp = confusion_matrix(y_true,y_predicted,sample_weight=sample_weight).ravel()
    negatives=tn+fp  # everything whose true label is the negative class
    positives=fn+tp  # everything whose true label is the positive class
    return {'tnr':tn/negatives,'tpr':tp/positives,'fpr':fp/negatives,'fnr':fn/positives}
  

In [20]:
# Test / evaluation stage
# Read the chosen alpha interactively, then refit the classifier on the whole training split
optimal_alpha=float(input('please enter optimal alpha  '))
weight_vector_y_train=compute_sample_weight(class_weight='balanced',y=y_train) # class-balanced per-sample weights
nb_clf_model=MultinomialNB(alpha=optimal_alpha)
nb_clf_model.fit(X_train,y_train,sample_weight=weight_vector_y_train)

# Evaluate on the test split; the test samples get class-balanced weights as well
y_predicted=nb_clf_model.predict(X_test)
weight_vector_y_test=compute_sample_weight(class_weight='balanced',y=y_test)
accuracy=accuracy_score(y_test,y_predicted,normalize=True,sample_weight=weight_vector_y_test)
precesion=precision_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
recall=recall_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
f1=f1_score(y_test,y_predicted,sample_weight=weight_vector_y_test)
confusion_result=confusion_score(y_test,y_predicted,sample_weight=weight_vector_y_test)

# Report every metric as a percentage, one line per metric
for metric_label,metric_value in (
    ('Accuracy= ',accuracy),
    ('TNR= ',confusion_result['tnr']),
    ('TPR= ',confusion_result['tpr']),
    ('FNR= ',confusion_result['fnr']),
    ('FPR= ',confusion_result['fpr']),
    ('Precesion= ',precesion),
    ('Recall= ',recall),
    ('F1= ',f1),
):
    print(metric_label,metric_value*100,' %')

please enter optimal alpha  0.7
Accuracy=  77.4580440742  %
TNR=  77.4614021767  %
TPR=  77.4546859717  %
FNR=  22.5453140283  %
FPR=  22.5385978233  %
Precesion=  77.4598883365  %
Recall=  77.4546859716  %
F1=  77.4572870667  %
