### Assignment 3: Multinomial Naïve Bayes for Fake Review Detection

Importing the data file

In [96]:
import numpy as np
import pandas as p
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

df=p.read_csv("deception_data_converted_final.tsv", delimiter='\t')
y1=df['sentiment'].values
y2=df['lie']
X=df['review'].values

Cross validating using 5 runs for different vectorizers and picking the vectroizer with the highest average score

In [97]:
##MNB TFIDF
mNB_tfidf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=True,binary=False, stop_words='english')),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tfidf_pipe,X,y1,cv=5)
print(sum(scores)/len(scores))

##MNB TF
mNB_tf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=False,binary=False, stop_words='english')),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tf_pipe,X,y1,cv=5)
print(sum(scores)/len(scores))

##MNB with Bool
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary=True, stop_words='english')),('nb', MultinomialNB())])
scores = cross_val_score(nb_clf_pipe, X, y1, cv=5)
avg=sum(scores)/len(scores)
print(avg)

0.8578947368421053
0.8368421052631578
0.8467836257309941


Splitting the data into train and test for sentiment prediction

In [98]:
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1, test_size=0.4, random_state=0)


Training the data using the tfidf vectorizer

In [99]:
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1',use_idf=True,binary=False, stop_words='english')
X_train_vec1 = unigram_tfidf_vectorizer.fit_transform(X_train1)
X_test_vec1 = unigram_tfidf_vectorizer.transform(X_test1)
print(X_test_vec1.shape)

(37, 931)


In [100]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# initialize the MNB model
nb_clf= MultinomialNB()

# use the training data to train the MNB model
# feature_log_prob_ stores the conditional probs for all categories
# if the labels are strings, the index is in alphabetic order
# e.g. 'f' comes before 't' in alphabet, so 'f' is in [0] dimension and 't' in [1]

nb_clf.fit(X_train_vec1,y_train1)
print(nb_clf.classes_)
print(nb_clf.feature_log_prob_.shape)

['n' 'p']
(2, 931)


Top 10 features for negative and positive reviews:

In [101]:
log_ratios = []
features = unigram_tfidf_vectorizer.get_feature_names()
neg_cond_prob = nb_clf.feature_log_prob_[0]
pos_cond_prob = nb_clf.feature_log_prob_[1]

for i in range(0, len(features)):
  log_ratio = pos_cond_prob[i] - neg_cond_prob[i]
  log_ratios.append(log_ratio)

ranks1 = sorted(zip(log_ratios, features))
print(ranks1[:10])
print(ranks1[-10:])

[(-0.6936468428417388, 'terrible'), (-0.6820915714871552, 'asked'), (-0.6451006621473727, 'bad'), (-0.621652775316961, 'minutes'), (-0.6128970832082654, 'took'), (-0.5836211500244861, 'come'), (-0.5622665938458997, 'said'), (-0.5573960510736526, 'indian'), (-0.5412291970875955, 'did'), (-0.5311061215048722, 'came')]
[(0.5277270244748582, 'japanese'), (0.5781783791278325, 'nice'), (0.5898845287400594, 'great'), (0.6142792786278921, 'atmosphere'), (0.629705703597617, 'friendly'), (0.632404171580343, 'noodle'), (0.6443672842543862, 'need'), (0.7344205547266469, 'fresh'), (0.8389046726461666, 'amazing'), (0.8957289492509446, 'best')]


In [102]:
nb_clf.score(X_test_vec1,y_test1)

0.8918918918918919

The confusion matrix:

In [103]:
# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
y_pred1 = nb_clf.fit(X_train_vec1, y_train1).predict(X_test_vec1)
cm=confusion_matrix(y_test1, y_pred1, labels=['n', 'p'])
print(cm)

[[17  1]
 [ 3 16]]


Checking accuracy, precision, recall, f1 score 

In [104]:
# print classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test1, y_pred1, average=None))
print(recall_score(y_test1, y_pred1, average=None))

from sklearn.metrics import classification_report
target_names = ['n', 'p']
print(classification_report(y_test1, y_pred1, target_names=target_names))

[0.85       0.94117647]
[0.94444444 0.84210526]
              precision    recall  f1-score   support

           n       0.85      0.94      0.89        18
           p       0.94      0.84      0.89        19

    accuracy                           0.89        37
   macro avg       0.90      0.89      0.89        37
weighted avg       0.90      0.89      0.89        37



Analyzing errors

In [105]:
err_cnt = 0
for i in range(0, len(y_test1)):
    if(y_test1[i]=='p' and y_pred1[i]=='n'):
        print(X_test1[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'This place was one of the best restaurant I have been. The price is little expensive, but the food and the service is best around the area. I went here with my family, and we ordered 4 dishes. They were all well cooked, and their taste were nicely balanced. Waiters came when we needed them without having to call for them. I would definitely recommend it to everyone visiting this area. '
'Ruby Tuesday is my favorite America Style Restaurant. The salad is awesome. And I like the baby pork ribs so much . So does the coconut shrimp.'
?
errors: 3


In [106]:
err_cnt = 0
for i in range(0, len(y_test1)):
    if(y_test1[i]=='n' and y_pred1[i]=='p'):
        print(X_test1[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'Carlo\'s Plate Shack was the worst dining experience of my life. Although my Southern Comfort Plate sounded to die for, the staff was extremely unhelpful at every turn. We started off with drinks, I had a sick Loganberry milkshake, and my friends had fresh brewed, but bland, iced tea (the ice likely melted and diluted). Eventually our server returned a half hour later to take our orders. I had the aforementioned Southern Comfort Plate, while my friends ordered the Buffalo Chicken Plate and the Hawaiian Plate Lunch. The Southern Comfort Plate came out first, a good 15 minutes before the others, and was extremely greasy. The other 2 ended up being nearly room temperature when they came out. Our server failed to return again to check on us until she brought our check rather abruptly. We want to give this place a chance, but it\'s rather difficult to subject ourselves to such brutal service and pay money.'
errors: 1


Cross validating using 5 runs for different vectorizers and picking the vectroizer with the highest average score

In [107]:
##MNB TFIDF
mNB_tfidf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=True,binary=False, stop_words='english')),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tfidf_pipe,X,y2,cv=5)
print(sum(scores)/len(scores))

##MNB TF
mNB_tf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=False,binary=False, stop_words='english')),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tf_pipe,X,y2,cv=5)
print(sum(scores)/len(scores))

##MNB with Bool
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary=False, stop_words='english')),('nb', MultinomialNB())])
scores = cross_val_score(nb_clf_pipe, X, y2, cv=5)
avg=sum(scores)/len(scores)
print(avg)

0.5871345029239766
0.5871345029239766
0.5865497076023393


Splitting the data into train and test for sentiment prediction

In [108]:
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.4, random_state=0)

Training the data using the tfidf vectorizer

In [109]:
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1',use_idf=True,binary=False, stop_words='english')
X_train_vec2 = unigram_tfidf_vectorizer.fit_transform(X_train2)
X_test_vec2 = unigram_tfidf_vectorizer.transform(X_test2)
print(X_test_vec2.shape)

(37, 931)


In [111]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# initialize the MNB model
nb_clf= MultinomialNB()

# use the training data to train the MNB model
# feature_log_prob_ stores the conditional probs for all categories
# if the labels are strings, the index is in alphabetic order
# e.g. 'f' comes before 't' in alphabet, so 'f' is in [0] dimension and 't' in [1]

nb_clf.fit(X_train_vec2,y_train2)
print(nb_clf.classes_)
print(nb_clf.feature_log_prob_.shape)

['f' 't']
(2, 931)


Top 10 features for false and true reviews:

In [112]:
log_ratios = []
features = unigram_tfidf_vectorizer.get_feature_names()
false_cond_prob = nb_clf.feature_log_prob_[0]
true_cond_prob = nb_clf.feature_log_prob_[1]

for i in range(0, len(features)):
  log_ratio = true_cond_prob[i] - false_cond_prob[i]
  log_ratios.append(log_ratio)

ranks2 = sorted(zip(log_ratios, features))
print(ranks2[:10])
print(ranks2[-10:])

[(-0.7321182906687804, 'want'), (-0.527468650204276, 'steak'), (-0.5144518712077053, 'plate'), (-0.5077382986850907, 'bring'), (-0.49123791430867936, 'free'), (-0.45086727053602704, 'price'), (-0.44610464947613604, 'casino'), (-0.43406695261205375, 'definitely'), (-0.41630393646993635, 'delicious'), (-0.4134992464866407, 'coming')]
[(0.40733562828624414, 'people'), (0.4239142405571048, 'environment'), (0.440190693196409, 'tables'), (0.45342542076306547, 'say'), (0.4577048038840559, 'thing'), (0.4686544382951672, 'finish'), (0.49654642367645785, 'flies'), (0.5110784275619329, 'worst'), (0.5373743623980118, 'glass'), (0.5850602438996582, 'did')]


In [113]:
nb_clf.score(X_test_vec2,y_test2)

0.5675675675675675

The confusion matrix:

In [79]:
# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
y_pred2 = nb_clf.fit(X_train_vec2, y_train2).predict(X_test_vec2)
cm=confusion_matrix(y_test2, y_pred2, labels=['f', 't'])
print(cm)

[[15  2]
 [14  6]]


Checking accuracy, precision, recall, f1 score

In [114]:
# print classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test2, y_pred2, average=None))
print(recall_score(y_test2, y_pred2, average=None))

from sklearn.metrics import classification_report
target_names = ['f', 't']
print(classification_report(y_test2, y_pred2, target_names=target_names))

[0.51724138 0.75      ]
[0.88235294 0.3       ]
              precision    recall  f1-score   support

           f       0.52      0.88      0.65        17
           t       0.75      0.30      0.43        20

    accuracy                           0.57        37
   macro avg       0.63      0.59      0.54        37
weighted avg       0.64      0.57      0.53        37



Error Analysis:

In [115]:
err_cnt = 0
for i in range(0, len(np.array(y_test2))):
    if(np.array(y_test2)[i]=='f' and np.array(y_pred2)[i]=='t'):
        print(X_test2[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'OMG. This restaurant is horrible. The receptionist did not greet us, we just stood there and waited for five minutes. The food came late and served not warm. Me and my pet ordered a bowl of salad and a cheese pizza. The salad was not fresh, the crust of a pizza was so hard like plastics. My dog didn\'t even eat that pizza. I hate this place!!!!!!!!!!'
'Two days ago, I went to the rooftop restaurant in NYC that served brunch. it was one of the best brunch that I have ever had. The view from the table was serene and I could see both the the Hudson River and the East River with outstanding views of Empire State Building, the Chryslers tower, Freedom tower and the Central park. A great place with great food and a perplexing view'
errors: 2


In [116]:
err_cnt = 0
for i in range(0, len(np.array(y_test2))):
    if(np.array(y_test2)[i]=='t' and np.array(y_pred2)[i]=='f'):
        print(X_test2[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'This place used to be great. I can\'t believe it\'s current state. Instead of the cool, dimly-lit lounge that I was used to, I was in a cheap, smelly bar. The music has no soul, the bartender is mean. This place no longer exudes a welcoming spirit. The crowd is awkward and old. I want my old hangout back!!'
'the staff at this restaurant is very unfriendly. the waitress for our table is extremely rude. we need to wait for one hour for our order to come. the place is noisy and the food isn\'t that good.'
'I went to this awesome restaurant in San Francisco (I forget the name), but it was on point. Huge beer list, quick seating, the menu was long but not over-whelming with great variety and unique options, and the staff was very friendly. They played great music the whole time, and the food was delicious. We ended up hanging out at the bar (the west coast has the best IPAs!) for a few hours after what was originally going to be a quick lunch, then went to a Phish show, pretty awesome day.