In [50]:
import numpy as np
import pandas as pd
import eli5

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

print("Reading in data from `standardized_data.csv`")

# read data file in a DataFrame
df = pd.read_csv('./standardized_data.csv', sep='\t', header=0)

# split the data out into a test and train set
train_df, test_df = train_test_split(df, train_size=0.8,  random_state=np.random.seed(7))

train_text = train_df['text'].tolist()
train_targets = train_df['authenticity'].tolist()

test_text = test_df['text'].tolist()
test_targets = test_df['authenticity'].tolist()

print("Data split into train/test sets 80/20.")
print("Train length: ", len(train_text))
print("Test length: ", len(test_text))

clf = Pipeline([                
    ('count_vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer(norm='l2', use_idf = True)),
    ('svm', LinearSVC())
])

print("Fitting model")
clf = clf.fit(train_text, train_targets)

print("Predicting")
predicted = clf.predict(test_text)

print()

print(metrics.classification_report(test_targets, predicted))    
print(metrics.confusion_matrix(test_targets, predicted))
print(metrics.accuracy_score(test_targets, predicted))

correct_predictions = [test_text[index] for index, prediction in enumerate(predicted) if prediction == test_targets[index]]
incorrect_predictions = [test_text[index] for index, prediction in enumerate(predicted) if prediction != test_targets[index]]

eli5.show_prediction(
    estimator=clf.named_steps['svm'],
    doc=correct_predictions[233], # correct - authentic
    #doc=incorrect_predictions[5], # incorrect - authentic
    #doc=correct_predictions[7], # correct - deceptive
    #doc=incorrect_predictions[8], # inccorect - deceptive
    top=20,
    vec=clf.named_steps['count_vect'],
    force_weights=True,
    show_feature_values=True)


Reading in data from `standardized_data.csv`
Data split into train/test sets 80/20.
Train length:  1280
Test length:  320
Fitting model
Predicting

             precision    recall  f1-score   support

  AUTHENTIC       0.89      0.83      0.86       149
  DECEPTIVE       0.86      0.91      0.88       171

avg / total       0.87      0.87      0.87       320

[[124  25]
 [ 16 155]]
0.871875


Contribution?,Feature,Value
+5.611,was,7.000
+3.458,to,4.000
+2.364,my,1.000
+2.271,chicago,1.000
+1.441,the,9.000
+1.227,hotel,1.000
+1.027,will,2.000
+0.866,of,2.000
… 81 more positive …,… 81 more positive …,… 81 more positive …
… 136 more negative …,… 136 more negative …,… 136 more negative …

Contribution?,Feature,Value
… 81 more positive …,… 81 more positive …,… 81 more positive …
… 136 more negative …,… 136 more negative …,… 136 more negative …
-0.725,Highlighted in text (sum),
