# HW_2 MNB and SVM for Fake Review Detection

In [199]:
import numpy as np
import pandas as p

# Step 1: Read in data - For both MNB and SVM

In [200]:
train = p.read_csv("/Users/shivangi/Downloads/deception_data_converted_final(1).tsv", delimiter='\t')
y = train['sentiment'].values
y1 = train['lie'].values
X = train['review'].values

# Step 2: Split train/test data for hold-out test - For both MNB and SVM

we'll first work on predicting the
1. sentiment, then
2. authenticity

In [201]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train[0])
print(y_train[0])
print(X_test[0])
print(y_test[0])

(73,) (73,) (19,) (19,)
'I went to Applebees (regrettably) once and it was a train-wreck. The server was in a terrible mood, the beers arrived after the dinner was delivered, the appetizer was wrong, food was bad, the check was wrong, and there were barely any other people inside! I was pretty baffled at how so many things just went wrong in the whole process, the restaurant wasn\'t even busy. '
n
'After I went shopping with some of my friend, we went to DODO restaurant for dinner. I found worm in one of the dishes .'
n


# Step 2.1 Data Checking - MNB and SVM

In [202]:
unique, counts = np.unique(y_train, return_counts = True)
print(np.asarray((unique, counts)))

[['n' 'p']
 [35 38]]


In [203]:
uniqueTest, countsTest = np.unique(y_test, return_counts=True)
print(np.asarray((uniqueTest, countsTest)))

[['n' 'p']
 [11 8]]


trainRatio = 35/38 = 92%
testRatio = 11/8 = 137.5%

It's a bit lopsided in test data but that should not be a problem because training data has almost 1:1 negative and positive records

# Step 3: Vectorization - MNB and SVM

In [204]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# several commonly used vectorizer setting

#  unigram boolean vectorizer, set minimum document frequency to 5
unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')

#  unigram term frequency vectorizer, set minimum document frequency to 5
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')

#  unigram and bigram term frequency vectorizer, set minimum document frequency to 5
gram12_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')

#  unigram tfidf vectorizer, set minimum document frequency to 5
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')

# Step 3.1: Vectorize the training data - MNB and SVM

We'll use count vectorizer for sentiment because multinomialNB uses frequency of words

In [205]:
# fit vocabulary in training documents and transform the training documents into vectors
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)

# check the content of a document vector
print(X_train_vec.shape)
print(X_train_vec[0].toarray())

# check the size of the constructed vocabulary
print(len(unigram_count_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_count_vectorizer.vocabulary_.items())[:10])

# check word index in vocabulary
print(unigram_count_vectorizer.vocabulary_.get('imaginative'))

(73, 70)
[[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 2 0]]
70
[('went', 68), ('terrible', 59), ('dinner', 16), ('food', 23), ('bad', 3), ('people', 45), ('just', 31), ('restaurant', 50), ('wasn', 67), ('friends', 26)]
None


# Step 3.2: Vectorize the test data - MNB and SVM

In [206]:
X_test_vec = unigram_count_vectorizer.transform(X_test)

# print out #examples and #features in the test set
print(X_test_vec.shape)

(19, 70)


# Step 4: Train a MNB classifier - MNB only

In [207]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf= MultinomialNB()

# use the training data to train the MNB model
nb_clf.fit(X_train_vec,y_train)

MultinomialNB()

# Step 4: Train a linear SVC classifier - SVM only

In [208]:
# import the LinearSVC module
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
svm_clf = LinearSVC(C=1)

# use the training data to train the model
svm_clf.fit(X_train_vec,y_train)

LinearSVC(C=1)

# Step 4.1 Interpret a trained MNB model - conditional probs - MNB

In [209]:
print(nb_clf.feature_log_prob_.shape)

print(unigram_count_vectorizer.vocabulary_.get('bad'))

# for i in range(0,1):
print(nb_clf.feature_log_prob_[0][unigram_count_vectorizer.vocabulary_.get('bad')])
print(nb_clf.feature_log_prob_[1][unigram_count_vectorizer.vocabulary_.get('bad')])

(2, 70)
3
-4.056604234239254
-5.424950017481403


# Step 4.1 Interpret the trained linear SVC model - SVM

In [210]:
## For category "0" (negative), get all features and their weights and sort them in increasing order
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_count_vectorizer.get_feature_names_out()))

## get the 10 features that are best indicators of negative sentiment (they are at the bottom of the ranked list negative_feature_ranks)
negative_10 = feature_ranks[:10]
print("negative words")
for i in range(0, len(negative_10)):
    print(negative_10[i])
print()

## get 10 features that are best indicators of positive sentiment (they are at the bottom of the ranked list positive_feature_ranks)
positive_10 = feature_ranks[-10:]
print("positive words")
for i in range(0, len(positive_10)):
    print(positive_10[i])
print()

negative words
(-0.752369028583336, 'cold')
(-0.6548064356265509, 'dishes')
(-0.6291139770843042, 'service')
(-0.5566375936209397, 'said')
(-0.5336867420691154, 'went')
(-0.521638769856277, 'meal')
(-0.5008944257517955, 'asked')
(-0.48791419931992813, 'ny')
(-0.4682526776670327, 'minutes')
(-0.45505653803669016, 'restaurant')

positive words
(0.3538775775757943, 'ask')
(0.3888561624326201, 'environment')
(0.4573940627397639, 'waiters')
(0.4662163169984037, 'great')
(0.5612563001744503, 'fresh')
(0.5899592689467185, 'delicious')
(0.6048695375063617, 'nice')
(0.6437193255535749, 'friendly')
(0.7861998220361109, 'need')
(0.9646194557255225, 'best')



# Step 4.2 Log ratio of conditional probs - MNB

In [211]:
log_ratios = []
features = unigram_count_vectorizer.get_feature_names_out()
neg_cond_prob = nb_clf.feature_log_prob_[0]
pos_cond_prob = nb_clf.feature_log_prob_[1]

for i in range(0, len(features)):
  log_ratio = pos_cond_prob[i] - neg_cond_prob[i]
  log_ratios.append(log_ratio)

exercise_C_ranks = sorted(zip(log_ratios, features))
print(exercise_C_ranks[:10])
print(exercise_C_ranks[-10:])

[(-2.262163659264246, 'terrible'), (-2.1668534794599212, 'asked'), (-2.0614929638020945, 'took'), (-1.9437099281457106, 'came'), (-1.9437099281457106, 'said'), (-1.4047134274130237, 'minutes'), (-1.368345783242149, 'bad'), (-1.368345783242149, 'wasn'), (-0.9628806751339853, 'cold'), (-0.9628806751339853, 'dine')]
[(1.11656086654585, 'sauce'), (1.3884945820294927, 'delicious'), (1.3884945820294927, 'need'), (1.5220259746540146, 'nice'), (1.5220259746540146, 'prices'), (1.6398090103103984, 'great'), (1.6398090103103988, 'friendly'), (1.84047970577255, 'fresh'), (1.9683130772824344, 'best'), (2.7006809709956614, 'amazing')]


# Step 5: Test the MNB classifier - MNB

In [212]:
# test the classifier on the test data set, print accuracy score

nb_clf.score(X_test_vec,y_test)

0.8947368421052632

In [213]:
# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
y_pred_mnb = nb_clf.fit(X_train_vec, y_train).predict(X_test_vec)
cm_mnb = confusion_matrix(y_test, y_pred_mnb, labels = ['n', 'p'])
print(cm_mnb)

[[9 2]
 [0 8]]


In [214]:
# print classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred_mnb, average=None))
print(recall_score(y_test, y_pred_mnb, average=None))

from sklearn.metrics import classification_report
target_names = ['0','1']
print(classification_report(y_test, y_pred_mnb, target_names = target_names))

[1.  0.8]
[0.81818182 1.        ]
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        11
           1       0.80      1.00      0.89         8

    accuracy                           0.89        19
   macro avg       0.90      0.91      0.89        19
weighted avg       0.92      0.89      0.90        19



# Step 5: Test the LinearSVC classifier

In [215]:
# test the classifier on the test data set, print accuracy score

svm_clf.score(X_test_vec,y_test)

0.7894736842105263

In [216]:
# print confusion matrix and classification report

from sklearn.metrics import confusion_matrix
y_pred_svm = svm_clf.predict(X_test_vec)
cm_svm = confusion_matrix(y_test, y_pred_svm, labels = ['n', 'p'])
print(cm_svm)
print()

from sklearn.metrics import classification_report
target_names = ['0','1']
print(classification_report(y_test, y_pred_svm, target_names = target_names))

[[10  1]
 [ 3  5]]

              precision    recall  f1-score   support

           0       0.77      0.91      0.83        11
           1       0.83      0.62      0.71         8

    accuracy                           0.79        19
   macro avg       0.80      0.77      0.77        19
weighted avg       0.80      0.79      0.78        19



# Step 5.1 Interpret the prediction result - MNB

In [217]:
## find the calculated posterior probability
posterior_probs_mnb = nb_clf.predict_proba(X_test_vec)

## find the posterior probabilities for the first test example
print(posterior_probs_mnb[0])

# find the category prediction for the first test example
y_pred_mnb = nb_clf.predict(X_test_vec)
print(y_pred_mnb[0])

# check the actual label for the first test example
print(y_test[0])

[0.86168678 0.13831322]
n
n


Because the posterior probability for category 'n' (negative) is the greatest, 0.862, the prediction should be "n". Because the actual label is also "n", this is a correct prediction.

# Step 5.1 Interpret the prediction result - SVM

In [218]:
## get the confidence scores for all test examples from each of the five binary classifiers
svm_confidence_scores = svm_clf.decision_function(X_test_vec)
## get the confidence score for the first test example
print(svm_confidence_scores[0])

## Confirm by printing out the actual prediction
print(y_test[0])
print(X[0])

-1.5476709994951778
n
'Mike\'s Pizza High Point, NY Service was very slow and the quality was low. You would think they would know at least how to make good pizza, not. Stick to pre-made dishes like stuffed pasta or a salad. You should consider dining else where.'


In [219]:
from sklearn.calibration import CalibratedClassifierCV
svm_calibrated = CalibratedClassifierCV(svm_clf) 
svm_calibrated.fit(X_train_vec, y_train)
y_test_proba_svm = svm_calibrated.predict_proba(X_test_vec)
y_test_proba_svm[0]

array([0.7286686, 0.2713314])

# Step 5.2 Error Analysis - MNB

In [220]:
# print out specific type of error for further analysis

# print out the negative examples that are mistakenly predicted as positive
# according to the confusion matrix, there should be 0 such examples
print("MNB error analysis")
print("negative examples that are mistakenly predicted as positive")
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i] == 'p' and y_pred_mnb[i] == 'n'):
        print(X_test[i])
        err_cnt = err_cnt + 1
print("errors:", err_cnt)

# print out the positive examples that are mistakenly predicted as negative
# according to the confusion matrix, there should be 2 such examples
print()
print("positive examples that are mistakenly predicted as negative")
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i] == 'n' and y_pred_mnb[i] == 'p'):
        print(X_test[i])
        err_cnt = err_cnt + 1
print("errors:", err_cnt)

MNB error analysis
negative examples that are mistakenly predicted as positive
errors: 0

positive examples that are mistakenly predicted as negative
'This place used to be great. I can\'t believe it\'s current state. Instead of the cool, dimly-lit lounge that I was used to, I was in a cheap, smelly bar. The music has no soul, the bartender is mean. This place no longer exudes a welcoming spirit. The crowd is awkward and old. I want my old hangout back!!'
'This diner was not at all up to par. I\'ve been to many diners, and get eggs benedict sometimes. There was nacho cheese on my eggs, and a plateful of watery runny eggs. And it smelled like smoke. And there was no heat, in the dead of winter. Their prices are not ANYWHERE near what is reasonable. Cool mom & pop place, but terrible food, smell, and prices.'
errors: 2


Since the results match with what we expected from the confusion matrix, the errors look correct

# Step 5.2 Error Analysis - SVM

In [221]:
# print out specific type of error for further analysis

# print out the negative examples that are mistakenly predicted as positive
# according to the confusion matrix, there should be 1 such examples
print("SVM error analysis")
print("negative examples that are mistakenly predicted as positive")
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i] == 'n' and y_pred_svm[i] == 'p'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)
print()
# print out the positive examples that are mistakenly predicted as negative
# according to the confusion matrix, there should be 3 such examples
# print("MNB error analysis")
print("positive examples that are mistakenly predicted as negative")
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i] == 'p' and y_pred_svm[i] == 'n'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

SVM error analysis
negative examples that are mistakenly predicted as positive
'the staff at this restaurant is very unfriendly. the waitress for our table is extremely rude. we need to wait for one hour for our order to come. the place is noisy and the food isn\'t that good.'
errors: 1
positive examples that are mistakenly predicted as negative
'This place was one of the best restaurant I have been. The price is little expensive, but the food and the service is best around the area. I went here with my family, and we ordered 4 dishes. They were all well cooked, and their taste were nicely balanced. Waiters came when we needed them without having to call for them. I would definitely recommend it to everyone visiting this area. '
'I ate at this restaurant called Banana Leaf. As I entered the restaurant I really liked the ambiance. I ordered noodle soup and fried rice with spicy black bean curry. The service was pretty fast and the food tasted amazing. There was a lot flavor in the food 

# Step 6: write the prediction output to file - MNB

In [222]:
y_pred_mnb = nb_clf.predict(X_test_vec)
output = open('/Users/shivangi/Documents/prediction_output_mnb.csv', 'w')
for x, value in enumerate(y_pred_mnb):
  output.write(str(value) + '\n') 
output.close()

# Step 6: write the prediction output to file - SVM

In [223]:
y_pred_svm = svm_clf.predict(X_test_vec)
output = open('/Users/shivangi/Documents/prediction_output_svm.csv', 'w')
for x, value in enumerate(y_pred_svm):
  output.write(str(value) + '\n') 
output.close()

# Cross Validation

In [224]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding = 'latin-1', binary = False)),('nb', MultinomialNB())])
scores_mnb = cross_val_score(nb_clf_pipe, X, y, cv = 5)
avg_mnb = sum(scores_mnb)/len(scores_mnb)
print(avg_mnb)

svm_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary = False)),('svm', LinearSVC(C=1))])
scores_svm = cross_val_score(svm_clf_pipe, X, y, cv=3)
avg_svm = sum(scores_svm)/len(scores_svm)
print(avg_svm)

0.8801169590643274
0.7935483870967742


# Repeat all steps for predicting the authenticity

# Step 2 - Split train/test data for hold-out test and Step 2.1 - Data Checking - MNB and SVM

In [225]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=0)

print(X1_train.shape, y1_train.shape, X1_test.shape, y1_test.shape)
print(X1_train[0])
print(y1_train[0])
print(X1_test[0])
print(y1_test[0])

unique1, counts1 = np.unique(y1_train, return_counts = True)
print(np.asarray((unique1, counts1)))

uniqueTest1, countsTest1 = np.unique(y1_test, return_counts=True)
print(np.asarray((uniqueTest1, countsTest1)))

(73,) (73,) (19,) (19,)
'I went to Applebees (regrettably) once and it was a train-wreck. The server was in a terrible mood, the beers arrived after the dinner was delivered, the appetizer was wrong, food was bad, the check was wrong, and there were barely any other people inside! I was pretty baffled at how so many things just went wrong in the whole process, the restaurant wasn\'t even busy. '
t
'After I went shopping with some of my friend, we went to DODO restaurant for dinner. I found worm in one of the dishes .'
f
[['f' 't']
 [36 37]]
[['f' 't']
 [10 9]]


trainRatio = 36/37 = 97.3% testRatio = 10/9 = 111.11%

It's a bit lopsided in test data but that should not be a problem because training data has almost 1:1 negative and positive records

We'll use boolean vectorizer for authenticity because Professor mentioned that she has seen it produce better results in short datasets

# Step 3.1: Vectorize the training data

In [226]:
# fit vocabulary in training documents and transform the training documents into vectors
X1_train_vec = unigram_bool_vectorizer.fit_transform(X1_train)

# check the content of a document vector
print(X1_train_vec.shape)
print(X1_train_vec[0].toarray())

# check the size of the constructed vocabulary
print(len(unigram_bool_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_bool_vectorizer.vocabulary_.items())[:10])

# check word index in vocabulary
print(unigram_bool_vectorizer.vocabulary_.get('imaginative'))

(73, 70)
[[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0]]
70
[('went', 68), ('terrible', 59), ('dinner', 16), ('food', 23), ('bad', 3), ('people', 45), ('just', 31), ('restaurant', 50), ('wasn', 67), ('friends', 26)]
None


In [227]:
X1_test_vec = unigram_bool_vectorizer.transform(X1_test)

# print out #examples and #features in the test set
print(X1_test_vec.shape)

(19, 70)


# Step 4: Train a MNB classifier

In [228]:
# initialize the MNB model
nb_clf1 = MultinomialNB()

# use the training data to train the MNB model
nb_clf1.fit(X1_train_vec, y1_train)

MultinomialNB()

# Step 4: Train a SVM classifier

In [229]:
# initialize the LinearSVC model
svm_clf1 = LinearSVC(C=1)

# use the training data to train the model
svm_clf1.fit(X1_train_vec,y1_train)

LinearSVC(C=1)

# Step 4.1 Interpret a trained MNB model - conditional probs

In [230]:
print(nb_clf1.feature_log_prob_.shape)

print(unigram_bool_vectorizer.vocabulary_.get('bad'))

# for i in range(0,1):
print(nb_clf1.feature_log_prob_[0][unigram_bool_vectorizer.vocabulary_.get('bad')])
print(nb_clf1.feature_log_prob_[1][unigram_bool_vectorizer.vocabulary_.get('bad')])

(2, 70)
3
-4.711779921046349
-4.135166556742355


# Step 4.1 Interpret the trained linear SVC model - SVM

In [231]:
## For category "0" (negative), get all features and their weights and sort them in increasing order
auth_feature_ranks = sorted(zip(svm_clf1.coef_[0], unigram_bool_vectorizer.get_feature_names_out()))

## get the 10 features that are best indicators of negative sentiment (they are at the bottom of the ranked list negative_feature_ranks)
false_10 = auth_feature_ranks[:10]
print("false words")
for i in range(0, len(false_10)):
    print(false_10[i])
print()

## get 10 features that are best indicators of positive sentiment (they are at the bottom of the ranked list positive_feature_ranks)
true_10 = auth_feature_ranks[-10:]
print("true words")
for i in range(0, len(true_10)):
    print(true_10[i])
print()

false words
(-1.4959444455936277, 'want')
(-1.3149676756434405, 'delicious')
(-1.2538386751664088, 'cold')
(-1.0948356958177885, 'high')
(-0.8856203894599162, 'meal')
(-0.8744710437247227, 'worth')
(-0.8472953732806963, 'said')
(-0.8312008757223224, 'definitely')
(-0.6775050231530999, 'dining')
(-0.6311361753681147, 'sauce')

true words
(0.4438421404653148, 'ordered')
(0.44657414108890164, 'dish')
(0.45984392656574846, 'overall')
(0.5298982521167177, 'bad')
(0.5358867020691211, 'amazing')
(0.6684545296157696, 'came')
(0.7259193693973914, 'ask')
(0.8004103257861921, 'bar')
(0.8334812232211545, 'good')
(0.9607665175274639, 'chicken')



# Step 4.2 Log ratio of conditional probs - MNB

In [232]:
log_ratios1 = []
features1 = unigram_bool_vectorizer.get_feature_names_out()
false_cond_prob = nb_clf1.feature_log_prob_[0]
truth_cond_prob = nb_clf1.feature_log_prob_[1]

for i in range(0, len(features1)):
  log_ratio1 = false_cond_prob[i] - truth_cond_prob[i]
  log_ratios1.append(log_ratio1)

exercise_C_ranks1 = sorted(zip(log_ratios1, features1))
print(exercise_C_ranks1[:10])
print(exercise_C_ranks1[-10:])

[(-1.0874389880699846, 'bar'), (-1.0874389880699846, 'environment'), (-1.018446116583033, 'people'), (-0.7465124010993911, 'good'), (-0.68197387996182, 'life'), (-0.6411518854415648, 'time'), (-0.5766133643039941, 'bad'), (-0.5766133643039941, 'nice'), (-0.5766133643039941, 'waitress'), (-0.4588303286476103, 'ask')]
[(0.7451424756783256, 'high'), (0.7451424756783256, 'meal'), (0.7451424756783256, 'sauce'), (0.7451424756783256, 'waiters'), (0.7451424756783256, 'worth'), (0.9274640324722805, 'delicious'), (0.9274640324722805, 'staff'), (1.0816147122995385, 'said'), (1.6206112130322259, 'cold'), (1.6206112130322259, 'want')]


# Step 5: Test the MNB classifier

In [233]:
# test the classifier on the test data set, print accuracy score

nb_clf1.score(X1_test_vec, y1_test)

0.47368421052631576

In [234]:
# print confusion matrix (row: ground truth; col: prediction)

y1_pred_mnb = nb_clf1.fit(X1_train_vec, y1_train).predict(X1_test_vec)
cm1_mnb = confusion_matrix(y1_test, y1_pred_mnb, labels = ['f', 't'])
print(cm1_mnb)

[[7 3]
 [7 2]]


In [235]:
# print classification report

print(precision_score(y1_test, y1_pred_mnb, average = None))
print(recall_score(y1_test, y1_pred_mnb, average = None))

target_names1 = ['0','1']
print(classification_report(y1_test, y1_pred_mnb, target_names = target_names1))

[0.5 0.4]
[0.7        0.22222222]
              precision    recall  f1-score   support

           0       0.50      0.70      0.58        10
           1       0.40      0.22      0.29         9

    accuracy                           0.47        19
   macro avg       0.45      0.46      0.43        19
weighted avg       0.45      0.47      0.44        19



# Step 5: Test the LinearSVC classifier

In [236]:
# test the classifier on the test data set, print accuracy score

svm_clf1.score(X1_test_vec,y1_test)

0.3684210526315789

In [237]:
# print confusion matrix and classification report

y1_pred_svm = svm_clf1.predict(X1_test_vec)
cm1_svm = confusion_matrix(y1_test, y1_pred_svm, labels = ['f', 't'])
print(cm1_svm)
print()

target_names = ['0','1']
print(classification_report(y1_test, y1_pred_svm, target_names = target_names))

[[5 5]
 [7 2]]

              precision    recall  f1-score   support

           0       0.42      0.50      0.45        10
           1       0.29      0.22      0.25         9

    accuracy                           0.37        19
   macro avg       0.35      0.36      0.35        19
weighted avg       0.35      0.37      0.36        19



# Step 5.1 Interpret the prediction result - MNB

In [238]:
## find the calculated posterior probability
posterior_probs1_mnb = nb_clf1.predict_proba(X1_test_vec)

## find the posterior probabilities for the first test example
print(posterior_probs1_mnb[0])

# find the category prediction for the first test example
y1_pred_mnb = nb_clf1.predict(X1_test_vec)
print(y1_pred_mnb[0])

# check the actual label for the first test example
print(y1_test[0])

[0.43542296 0.56457704]
t
f


Because the posterior probability for category 'f' (false) is the greatest, 0.565, the prediction should be "f". Because the actual label is "t", this is not as accurate of a prediction.

# Step 5.1 Interpret the prediction result - SVM

In [240]:
## get the confidence scores for all test examples from each of the five binary classifiers
svm_confidence_scores1 = svm_clf1.decision_function(X1_test_vec)
## get the confidence score for the first test example
print(svm_confidence_scores1[0])

## Confirm by printing out the actual prediction
print(y1_test[0])
print(X[0])

-0.10491385536768938
f
'Mike\'s Pizza High Point, NY Service was very slow and the quality was low. You would think they would know at least how to make good pizza, not. Stick to pre-made dishes like stuffed pasta or a salad. You should consider dining else where.'


In [241]:
svm_calibrated1 = CalibratedClassifierCV(svm_clf1) 
svm_calibrated1.fit(X1_train_vec, y1_train)
y1_test_proba_svm = svm_calibrated1.predict_proba(X1_test_vec)
y1_test_proba_svm[0]

array([0.53454464, 0.46545536])

# Step 5.2 Error Analysis - MNB

In [242]:
# print out specific type of error for further analysis

# print out the false examples that are mistakenly predicted as true
# according to the confusion matrix, there should be 3 such examples

err_cnt1 = 0
for i in range(0, len(y1_test)):
    if(y1_test[i] == 'f' and y1_pred_mnb[i] == 't'):
        print(X1_test[i])
        err_cnt1 = err_cnt1 + 1
print("errors:", err_cnt1)
print()
# print out the true examples that are mistakenly predicted as false
# according to the confusion matrix, there should be 7 such examples

err_cnt1 = 0
for i in range(0, len(y1_test)):
    if(y1_test[i] == 't' and y1_pred_mnb[i] == 'f'):
        print(X1_test[i])
        err_cnt1 = err_cnt1 + 1
print("errors:", err_cnt1)

'After I went shopping with some of my friend, we went to DODO restaurant for dinner. I found worm in one of the dishes .'
'I ate at this restaurant called Banana Leaf. As I entered the restaurant I really liked the ambiance. I ordered noodle soup and fried rice with spicy black bean curry. The service was pretty fast and the food tasted amazing. There was a lot flavor in the food which I truly enjoyed. Two thumbs up for Banana Leaf and I would totally recommend this restaurant.'
'OMG. This restaurant is horrible. The receptionist did not greet us, we just stood there and waited for five minutes. The food came late and served not warm. Me and my pet ordered a bowl of salad and a cheese pizza. The salad was not fresh, the crust of a pizza was so hard like plastics. My dog didn\'t even eat that pizza. I hate this place!!!!!!!!!!'
errors: 3

'This place used to be great. I can\'t believe it\'s current state. Instead of the cool, dimly-lit lounge that I was used to, I was in a cheap, smell

Since the results match with what we expected from the confusion matrix, the errors look correct

# Step 5.2 - Error Analysis - SVM

In [243]:
# print out specific type of error for further analysis

# print out the false examples that are mistakenly predicted as true
# according to the confusion matrix, there should be 5 such examples
print("SVM error analysis")
print("false examples that are mistakenly predicted as true")
err_cnt1 = 0
for i in range(0, len(y1_test)):
    if(y1_test[i] == 'f' and y1_pred_svm[i] == 't'):
        print(X1_test[i])
        err_cnt1 = err_cnt1 + 1
print("errors:", err_cnt1)
print()
# print out the true examples that are mistakenly predicted as false
# according to the confusion matrix, there should be 7 such examples
# print("MNB error analysis")
print("true examples that are mistakenly predicted as false")
err_cnt1 = 0
for i in range(0, len(y1_test)):
    if(y1_test[i] == 't' and y1_pred_svm[i] == 'f'):
        print(X1_test[i])
        err_cnt1 = err_cnt1 + 1
print("errors:", err_cnt1)

SVM error analysis
false examples that are mistakenly predicted as true
'I recently ate at a restaurant called White Castle and it was a dine in. I had to wait 20 minutes before the waiter came to my table to take my order even though it was not busy. I had to wait another 30 minutes for my order to come. I had ordered a veggie burger with fries and Iced tea. The veggie patty was not properly cooked, the lettuce had black patches and the tomatoes looked horrible. Overall the burger was a disaster. When I turned to the fries in the hopes that it would taste good--to my disappointment they were also not fried properly and had a raw taste. To top it all when picked up my iced tea took take a sip-- I found a fly swimming in my iced tea. Overall it was a really bad experience and I would not recommend White Castle to anybody. '
'I went there with two friends at 6pm. Long queue was there. But it didn\'t take us long to wait. The waiter was nice but worked in a hurry. We ordered \'Today\'s Sp

# Step 6: write the prediction output to file

In [244]:
y1_pred_mnb = nb_clf1.predict(X1_test_vec)
output1 = open('/Users/shivangi/Documents/prediction_output_auth_mnb.csv', 'w')
for x, value in enumerate(y1_pred_mnb):
  output1.write(str(value) + '\n') 
output1.close()

# Step 6: write the prediction output to file

In [245]:
y1_pred_svm = svm_clf1.predict(X1_test_vec)
output1 = open('/Users/shivangi/Documents/prediction_output_auth_svm.csv', 'w')
for x, value in enumerate(y1_pred_svm):
  output1.write(str(value) + '\n') 
output1.close()

# Cross Validation

In [247]:
# cross validation

nb_clf_pipe1 = Pipeline([('vect', CountVectorizer(encoding = 'latin-1', binary = True)),('nb', MultinomialNB())])
scores1 = cross_val_score(nb_clf_pipe1, X, y, cv = 5)
avg1 = sum(scores1)/len(scores1)
print(avg1)

svm_clf_pipe1 = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary = True)),('svm', LinearSVC(C=1))])
scores_svm1 = cross_val_score(svm_clf_pipe1, X, y, cv = 3)
avg_svm1 = sum(scores_svm1)/len(scores_svm1)
print(avg_svm1)

0.8251461988304094
0.8043010752688172
