In [62]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [63]:
# Render matplotlib figures inline. As the saved output notes, `%pylab` also populates the
# interactive namespace from numpy and matplotlib.
# NOTE(review): the cells visible here only use the explicit `np` / `plt` imports, so
# `%matplotlib inline` is probably sufficient -- confirm no later cell relies on the
# star-imported names before switching.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


### Comparing Fake News Classifiers

I wrote a longer explanation of the methodology and approach for detecting fake news using scikit-learn on DataCamp (and you can [find the notebook on my GitHub](https://github.com/kjam/random_hackery/blob/master/Attempting%20to%20detect%20fake%20news.ipynb)). I would start there if you are curious as to why I chose the data, what I learned about the models and so forth.

In this notebook, I wanted to compare some of the features learned by each classifier to see if there was overlap or patterns in the features.

In [64]:
# Load the labelled corpus, skipping (and warning about) rows whose field count does not
# match the header. `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
# favour of `on_bad_lines`, so try the current keyword first and fall back for old pandas.
# NOTE(review): the saved output reports "expected 1 fields" for most rows, which suggests
# the file is not comma-delimited -- confirm the format and pass `sep=` if so, otherwise
# most of the data is silently dropped.
try:
    df = pd.read_csv('corpusTestAndTrain1.csv', encoding='latin-1', on_bad_lines='warn')
except TypeError:  # pandas < 1.3 does not accept `on_bad_lines`
    df = pd.read_csv('corpusTestAndTrain1.csv', encoding='latin-1', error_bad_lines=False)

b'Skipping line 4: expected 1 fields, saw 4\nSkipping line 5: expected 1 fields, saw 2\nSkipping line 6: expected 1 fields, saw 2\nSkipping line 7: expected 1 fields, saw 3\nSkipping line 8: expected 1 fields, saw 4\nSkipping line 9: expected 1 fields, saw 7\nSkipping line 10: expected 1 fields, saw 2\nSkipping line 11: expected 1 fields, saw 7\nSkipping line 12: expected 1 fields, saw 2\nSkipping line 13: expected 1 fields, saw 9\nSkipping line 16: expected 1 fields, saw 4\nSkipping line 17: expected 1 fields, saw 2\nSkipping line 19: expected 1 fields, saw 8\nSkipping line 20: expected 1 fields, saw 2\nSkipping line 21: expected 1 fields, saw 2\nSkipping line 22: expected 1 fields, saw 9\nSkipping line 23: expected 1 fields, saw 3\nSkipping line 24: expected 1 fields, saw 3\nSkipping line 25: expected 1 fields, saw 8\nSkipping line 26: expected 1 fields, saw 5\nSkipping line 27: expected 1 fields, saw 8\nSkipping line 28: expected 1 fields, saw 3\nSkipping line 29: expected 1 fields,

In [65]:
# Target vector: the `label` column of the loaded frame.
# NOTE(review): the saved run failed here with AttributeError (no `label` column),
# presumably because the CSV was parsed into a single field -- verify the load step.
y = df.label

AttributeError: 'DataFrame' object has no attribute 'label'

In [None]:
# Remove the target column from the frame now that it has been copied into `y`.
df = df.drop(columns=['label'])

In [None]:
# Hold out 25% of the articles for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.25,
    random_state=53,
)

In [None]:
# Bag-of-words counts: the vocabulary is learned from the training texts only and then
# re-used, unchanged, to encode the test texts.
count_vectorizer = CountVectorizer(stop_words='english')
count_train, count_test = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)

In [None]:
# Tf-idf features (max_df=0.7): fitted on the training texts only, then applied to the
# test texts.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train, tfidf_test = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

### Training models

Now I have my vectors and I can create my different classifiers. In my post I noted that there is definitely noise in the dataset, so we should expect to see that reflected in our features. Normally, I would spend some time cleaning the data, but this was a small proof of concept and investigation. I hoped merely that at least one model would be able to correct for the noise.

I will compare the following models (and training data):

- multinomialNB with counts (`mn_count_clf`)
- multinomialNB with tf-idf (`mn_tfidf_clf`)
- passive aggressive with tf-idf (`pa_tfidf_clf`)
- linear svc with tf-idf (`svc_tfidf_clf`)
- linear sgd with tf-idf (`sgd_tfidf_clf`)

For speed and clarity, I am primarily not doing parameter tuning, although this could be added as a step (perhaps in a scikit-learn Pipeline).

In [None]:
# Multinomial Naive Bayes model (alpha=0.1) for the count features; fitted in the next cell.
mn_count_clf = MultinomialNB(alpha=0.1) 

In [None]:
# Train on the count features and report accuracy on the held-out articles.
pred = mn_count_clf.fit(count_train, y_train).predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)



In [None]:
# Multinomial Naive Bayes model (alpha=0.1) for the tf-idf features; fitted in the next cell.
mn_tfidf_clf = MultinomialNB(alpha=0.1) 

In [None]:
# Train on the tf-idf features and report accuracy on the held-out articles.
pred = mn_tfidf_clf.fit(tfidf_train, y_train).predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)


In [None]:
# Passive-aggressive linear classifier for the tf-idf features. A fixed `random_state`
# (the same seed used for the train/test split) makes training, and therefore the reported
# accuracy and learned feature weights, repeatable across kernel restarts.
pa_tfidf_clf = PassiveAggressiveClassifier(random_state=53)

In [None]:
# Train the passive-aggressive model on tf-idf features and report held-out accuracy.
pa_tfidf_clf.fit(tfidf_train, y_train)
pred = pa_tfidf_clf.predict(tfidf_test)
# The accuracy used to be stored only in `score1` while `score` was printed, so this cell
# reported the previous model's accuracy. Bind `score` here; `score1` is kept as an alias
# in case another cell still reads it.
score = score1 = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)


In [None]:
# Linear support-vector classifier for the tf-idf features. The seed (same value as the
# train/test split) pins any randomness in the solver so reruns give the same model.
svc_tfidf_clf = LinearSVC(random_state=53)

In [None]:
# Train the linear SVC on tf-idf features and report accuracy on the held-out articles.
pred = svc_tfidf_clf.fit(tfidf_train, y_train).predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

In [None]:
# Linear model trained with stochastic gradient descent on the tf-idf features. A fixed
# `random_state` (the same seed used for the train/test split) makes training, and
# therefore the reported metrics and learned feature weights, repeatable across runs.
sgd_tfidf_clf = SGDClassifier(random_state=53)

In [None]:
# Train the SGD model on tf-idf features once, then report accuracy, the confusion matrix
# and the per-class precision/recall on the held-out articles. (The model used to be fitted
# and scored twice in this cell, and the whole `y_test` Series was printed.)
sgd_tfidf_clf.fit(tfidf_train, y_train)
pred = sgd_tfidf_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

print("classificationreport")
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

# Number of test-set predictions, shown as the cell's output.
len(pred)

In [None]:
# IPython help lookup: displays the docstring of the SGD model's `decision_function`
# (used for the ROC curves below).
sgd_tfidf_clf.decision_function?

In [None]:
# IPython help lookup: displays the docstring of the Naive Bayes model's `predict_proba`
# (used for the ROC curves below).
mn_count_clf.predict_proba?

In [None]:
# Draw one ROC curve per fitted model on a single, freshly cleared figure.
plt.figure(0).clf()

# Each row: (model, legend label, test matrix, score with class probabilities?).
# The Naive Bayes models are scored with predict_proba; the linear models with
# decision_function.
roc_inputs = [
    (mn_count_clf, 'multinomial nb count', count_test, True),
    (mn_tfidf_clf, 'multinomial nb tfidf', tfidf_test, True),
    (pa_tfidf_clf, 'passive aggressive', tfidf_test, False),
    (svc_tfidf_clf, 'svc', tfidf_test, False),
    (sgd_tfidf_clf, 'sgd', tfidf_test, False),
]

for estimator, curve_label, test_matrix, use_proba in roc_inputs:
    if use_proba:
        scores = estimator.predict_proba(test_matrix)[:, 1]
    else:
        scores = estimator.decision_function(test_matrix)
    fpr, tpr, _thresholds = metrics.roc_curve(y_test.values, scores, pos_label='REAL')
    plt.plot(fpr, tpr, label=curve_label)

plt.legend(loc=0)

### Introspecting models

My main goal for this notebook is not to compare accuracy, but to compare features learned. To do so, we can use the method shown in this [very useful StackOverflow answer](https://stackoverflow.com/a/26980472) to show significant features in a binary classifier. I will use a modified version to return top features for each label.

In [None]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    Return the ``n`` most heavily weighted features for each class of a binary classifier.

    See: https://stackoverflow.com/a/26980472

    Modified by @kjam to support a dict return.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` (or the older
        ``get_feature_names()``).
    classifier
        Fitted binary classifier exposing ``classes_`` and either ``coef_`` or, for
        Naive Bayes models that no longer provide ``coef_``, ``feature_log_prob_``.
    n : int
        Number of features to return per class; must be non-negative.

    Returns
    -------
    dict
        ``{classes_[0]: [(weight, feature), ...], classes_[1]: [(weight, feature), ...]}``.
        Each list is ordered strongest-first for its class: the most negative weights for
        ``classes_[0]`` and the most positive weights for ``classes_[1]``, so a position
        in either list can be read directly as a rank.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative, got {}".format(n))

    class_labels = classifier.classes_

    # Newer scikit-learn only offers get_feature_names_out(); older releases only offer
    # get_feature_names().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Linear models expose per-feature weights as coef_[0]. Naive Bayes models without
    # coef_ provide feature_log_prob_, whose second row (the log-probabilities for
    # classes_[1]) is what coef_[0] used to hold in the binary case.
    if hasattr(classifier, 'coef_'):
        weights = classifier.coef_[0]
    else:
        weights = classifier.feature_log_prob_[1]

    # Sort once, ascending by weight (ties broken by feature name).
    ranked = sorted(zip(weights, feature_names))
    topn_class1 = ranked[:n]
    # Reverse before slicing: this puts the strongest feature first and, unlike
    # ranked[-n:], yields an empty list (not the whole list) when n == 0.
    topn_class2 = ranked[::-1][:n]

    return {class_labels[0]: topn_class1,
            class_labels[1]: topn_class2
    }


# Example: the 10 most heavily weighted tf-idf features per class for the
# passive-aggressive model.
most_informative_feature_for_binary_classification(tfidf_vectorizer, pa_tfidf_clf, n=10)

In [None]:
# Pair every fitted model with the vectorizer whose features it was trained on, so the
# top-feature lookup below maps weights back to the right vocabulary.
classifiers = [(mn_count_clf, count_vectorizer),
               (mn_tfidf_clf, tfidf_vectorizer),
               (pa_tfidf_clf, tfidf_vectorizer),
               (svc_tfidf_clf, tfidf_vectorizer),
               (sgd_tfidf_clf, tfidf_vectorizer)]

In [None]:
# Top-10 features per class for every model, keyed by the fitted model object itself.
results = {
    clf: most_informative_feature_for_binary_classification(vct, clf, n=10)
    for clf, vct in classifiers
}

In [None]:
# Display the raw per-model mapping of label -> [(weight, feature), ...].
results

But this is both a bit hard to read and compare. What I really want is to see these possibly with ranks and compare the tokens to one another. Let's transform the data to look better for what we are trying to measure.

In [None]:
# Re-shape the per-model results into label -> token -> [(rank, model class name), ...],
# where rank is the token's 1-based position in that model's list for the label.
comparable_results = {'REAL': {}, 'FAKE': {}}
for model, per_label in results.items():
    model_name = model.__class__.__name__
    for label, ranked_features in per_label.items():
        token_hits = comparable_results[label]
        for position, weighted_token in enumerate(ranked_features, start=1):
            token = weighted_token[1]
            token_hits.setdefault(token, []).append((position, model_name))

Now these are a bit easier to compare and read:

In [None]:
# Display, for each top 'FAKE' token, the (rank, model class name) pairs that selected it.
comparable_results['FAKE']

I immediately noticed the multinomial models had picked up quite a bit of noise from the dataset. These models likely would have benefited from some preprocessing. I also noticed that *most* of the models had picked up what I would consider noise, such as `2016` and the words `print` and `share` (which are clearly scraping artifacts).

Let's see if we can score the tokens by popularity and rank. I also wanted to add in a warning message in case I had overlap between my real and fake tokens. (This may be the case if you take a larger n-features from each)

In [None]:
# Collapse the per-model placements into one record per token: the label it was a top
# feature for, its mean rank across models, and how many models selected it. A token that
# shows up under both labels triggers a warning and the later record replaces the earlier.
agg_results = {}
for news_label, token_placements in comparable_results.items():
    for token, placements in token_placements.items():
        if token in agg_results:
            print("WARNING! DUPLICATE LABEL!!! {}".format(token))
        rank_values = [placement[0] for placement in placements]
        agg_results[token] = dict(
            label=news_label,
            agg_rank=np.mean(rank_values),
            count=len(placements),
        )

I can then put this into a dataframe, for easier transformations and viewing.

In [None]:
# One row per token, with `label`, `agg_rank` and `count` as columns.
comparison_df = pd.DataFrame(agg_results).transpose()

In [None]:
# Preview the first few token records.
comparison_df.head()

To investigate the top real and fake labels, I would advise sorting by count. Let's see my top 10 tokens for real and fake news ranked by the number of classifiers that used them as a top feature.

In [None]:
# Top 10 'REAL' tokens, ordered by how many models picked them (most first).
(
    comparison_df
    .loc[comparison_df['label'] == 'REAL']
    .sort_values(by='count', ascending=False)
    .head(10)
)

In [None]:
# Top 10 'FAKE' tokens, ordered by how many models picked them (most first).
(
    comparison_df
    .loc[comparison_df['label'] == 'FAKE']
    .sort_values(by='count', ascending=False)
    .head(10)
)

### Conclusion

As expected, the bag-of-words and TF-IDF vectors didn't do much to determine meaningful features to classify fake or real news. As outlined in my DataCamp post, this problem is a lot harder than simple text classification.

That said, I did learn a few things. Namely, that linear models handle noise in this case better than the Naive Bayes multinomial classifier did. Also, finding a good dataset that has been scraped from the web and tagged for this problem would likely be a great help, and worth more of my time than parameter tuning on a clearly noisy and error prone dataset.

If you spend some time researching and find anything interesting, feel free to share your findings and notes in the comments or you can always reach out on Twitter (I'm [@kjam](https://twitter.com/kjam)).

I hope you had some fun exploring a new NLP dataset with me!

### Appendix A: Top features

Once I realized the Naive Bayes classifiers had identified many noisy tokens in alphabetical order as top fake news features, I decided to see just how many "top features" the model had. 