In [24]:
import numpy as np
import pandas as pd
import eli5

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC, LinearSVR, SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from pprint import pprint
from extract_features import extract_features

class FeatureExtractor:
    """Stateless transformer step that delegates to ``extract_features``.

    Provides the ``fit``/``transform`` pair so that an instance can be
    placed as a step inside a ``Pipeline``.
    """

    def fit(self, X, y=None):
        """Learn nothing and hand back this same instance.

        ``X`` and ``y`` are accepted but not used.
        """
        return self

    def transform(self, X):
        """Return the result of calling ``extract_features`` on ``X``."""
        features = extract_features(X)
        return features



print("Reading in data from `standardized_data.csv`")

# Load the tab-separated data file; row 0 supplies the column names.
df = pd.read_csv('./standardized_data.csv', header=0, sep='\t')

# Seed NumPy's global RNG first. `np.random.seed` returns None, so the split
# is made with random_state=None and draws from that freshly seeded global
# generator.
np.random.seed(7)
train_df, test_df = train_test_split(df, train_size=0.8, random_state=None)

# Pull the 'text' and 'authenticity' columns of each partition out as lists.
train_text, train_targets = train_df['text'].tolist(), train_df['authenticity'].tolist()
test_text, test_targets = test_df['text'].tolist(), test_df['authenticity'].tolist()

print("Data split into train/test sets 80/20.")
print("Train length: ", len(train_text))
print("Test length: ", len(test_text))

# NOTE: things get a bit hacky from here on. This Pipeline is later given a
# get_feature_names method that points back at the DictVectorizer, which is
# why both steps are also kept in module-level names.
feat_extractor = FeatureExtractor()
dict_vectorizer = DictVectorizer()
feat_pipe = Pipeline(steps=[
    ('feat_extractor', feat_extractor),
    ('dict_vectorizer', dict_vectorizer),
])

def get_dict_vectorizer_names(self):
    """Return the feature names of the pipeline's ``dict_vectorizer`` step.

    Intended to be bound to a ``Pipeline`` instance as its
    ``get_feature_names`` method. The vectorizer is resolved through
    ``self`` instead of a module-level variable, so the names always come
    from the pipeline the method is bound to, even if that pipeline is a
    copy or the module-level name has been rebound.

    Args:
        self: Object exposing ``named_steps`` with a ``'dict_vectorizer'``
            entry that provides ``get_feature_names()``.

    Returns:
        Whatever that step's ``get_feature_names()`` returns.

    Raises:
        KeyError: If ``self.named_steps`` has no ``'dict_vectorizer'`` entry.
    """
    return self.named_steps['dict_vectorizer'].get_feature_names()

# Bind the helper to this particular Pipeline instance so that the instance
# gains a `get_feature_names` method.
feat_pipe.get_feature_names = get_dict_vectorizer_names.__get__(feat_pipe)


# Main pipeline: a weighted union of the dict-based features and 1- to 3-gram
# counts, then a TfidfTransformer, then a LinearSVC.
clf = Pipeline(steps=[
    ('union', FeatureUnion(
        transformer_list=[
            ('dict_vect', feat_pipe),
            ('count_vect', CountVectorizer(ngram_range=(1, 3))),
        ],
        # per-transformer multipliers applied to each branch's output
        transformer_weights={'dict_vect': 0.35, 'count_vect': 1.0},
    )),
    ('tfidf', TfidfTransformer()),
    ('svm', LinearSVC()),
])

# Train on the training split. fit() returns the pipeline itself, so no
# rebinding of `clf` is needed.
print("Fitting model")
clf.fit(train_text, train_targets)

# Label the held-out documents.
print("Predicting")
predicted = clf.predict(test_text)

print()

# Print, in order: the classification report, the confusion matrix and the
# accuracy, each computed from the true and predicted test labels.
for metric_fn in (
    metrics.classification_report,
    metrics.confusion_matrix,
    metrics.accuracy_score,
):
    print(metric_fn(test_targets, predicted))

# Partition the test predictions into hits and misses. Every entry is a
# (predicted label, document text, position in the test set) tuple.
correct_predictions = [
    (guess, text, position)
    for position, (guess, text, truth)
    in enumerate(zip(predicted, test_text, test_targets))
    if guess == truth
]
incorrect_predictions = [
    (guess, text, position)
    for position, (guess, text, truth)
    in enumerate(zip(predicted, test_text, test_targets))
    if guess != truth
]

# Explain a single test document using the fitted SVM step as the estimator
# and the feature union step as the vectorizer.
# NOTE(review): the 'tfidf' step that sits between 'union' and 'svm' in `clf`
# is not applied here - confirm that explaining on the un-transformed union
# output is intended.
# Other documents of interest (swap into `doc`):
#   test_text[133] - incorrect - authentic
#   test_text[3]   - correct - deceptive
#   test_text[306] - incorrect - deceptive
eli5.show_prediction(
    estimator=clf.named_steps['svm'],
    doc=test_text[4],  # correct - authentic
    vec=clf.named_steps['union'],
    top=20,
    force_weights=True,
)

  0%|          | 5/1280 [00:00<00:27, 46.36it/s]

Reading in data from `standardized_data.csv`
Data split into train/test sets 80/20.
Train length:  1280
Test length:  320
Fitting model
Starting feature extraction...


100%|██████████| 1280/1280 [00:40<00:00, 31.77it/s]
  1%|▏         | 4/320 [00:00<00:09, 34.16it/s]

Predicting
Starting feature extraction...


100%|██████████| 320/320 [00:10<00:00, 31.29it/s]



             precision    recall  f1-score   support

  AUTHENTIC       0.89      0.79      0.84       149
  DECEPTIVE       0.83      0.92      0.87       171

avg / total       0.86      0.86      0.86       320

[[117  32]
 [ 14 157]]
0.85625


100%|██████████| 1/1 [00:00<00:00, 51.36it/s]

Starting feature extraction...





Contribution?,Feature
+9.986,dict_vect__length
+9.904,dict_vect__avg_sentence_length
+4.260,count_vect__to
+1.404,count_vect__hotel
+1.373,dict_vect__count_personal_pronouns
+0.659,count_vect__recommend
+0.591,count_vect__at
+0.572,count_vect__of
+0.449,count_vect__while
+0.443,count_vect__rooms

Contribution?,Feature
+9.986,dict_vect__length
+9.904,dict_vect__avg_sentence_length
+3.567,count_vect: Highlighted in text (sum)
+1.373,dict_vect__count_personal_pronouns
… 32 more positive …,… 32 more positive …
… 54 more negative …,… 54 more negative …
-0.594,<BIAS>
-0.643,dict_vect__count_misspellings
-0.781,dict_vect__count_proper_noun
-1.560,dict_vect__count_of_sentences
