In [130]:
import pandas as pd

import nltk
from nltk.corpus import stopwords

import scipy as sp
import numpy as np
import matplotlib.pyplot as plt

import sklearn as skl
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import HashingVectorizer


In [131]:
newsPred = pd.read_csv('factCheck.csv')
newsPred.columns = ['Json', 'Accuracy', 'Summary', 'Genre', 'KeywordName', 'Occupation', 'Location', 'PoliticalParty', 
                    '1', '2', '3', '4', '5', 'Source', 'Url']
newsPred.head()


Unnamed: 0,Json,Accuracy,Summary,Genre,KeywordName,Occupation,Location,PoliticalParty,1,2,3,4,5,Source,Url
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.,"<p>Del. Scott Surovell&rsquo;s floor <a href=""..."
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver,"Barack Obama campaign Web site, <a href=""http:..."
2,1123.json,FALSE,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release,"<p> News release attributed to Matt Barber, &l..."
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN,"<p>CNN transcript, <a href=""http://transcripts..."
4,12465.json,TRUE,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0,3,2,5,1,a an online opinion-piece,"<p>Right Wisconsin, <a href=""http://www.rightw..."


In [132]:
count_nan = len(newsPred) - newsPred.count()
count_nan

Json               0
Accuracy           0
Summary            0
Genre              0
KeywordName        0
Occupation        40
Location          28
PoliticalParty     0
1                  0
2                  0
3                  0
4                  0
5                  0
Source             0
Url                0
dtype: int64

The only text that matters to us is the summary. Since there are NaNs in the Occupation and Location columns, we are just going to delete them. 

In [133]:
del newsPred['Occupation']
del newsPred['Location']

In [134]:
newsPred.shape

(149, 13)

In [135]:
newsPred.dtypes

Json              object
Accuracy          object
Summary           object
Genre             object
KeywordName       object
PoliticalParty    object
1                  int64
2                  int64
3                  int64
4                  int64
5                  int64
Source            object
Url               object
dtype: object

In [136]:
print (newsPred['Accuracy'].unique())

['half-true' 'mostly-true' 'FALSE' 'TRUE' 'barely-true' 'pants-fire']


Transforming into binary classification to make it easier. 

In [137]:
newsPred['Accuracy'] = newsPred['Accuracy'].replace('half-true', 'TRUE')
newsPred['Accuracy'] = newsPred['Accuracy'].replace('mostly-true', 'TRUE')
newsPred['Accuracy'] = newsPred['Accuracy'].replace('barely-true', 'FALSE')
newsPred['Accuracy'] = newsPred['Accuracy'].replace('pants-fire', 'FALSE')
print (newsPred['Accuracy'].unique())

['TRUE' 'FALSE']


In [139]:
newsPred.head()

Unnamed: 0,Json,Accuracy,Summary,Genre,KeywordName,PoliticalParty,1,2,3,4,5,Source,Url
0,10540.json,True,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,democrat,0,0,1,1,0,a floor speech.,"<p>Del. Scott Surovell&rsquo;s floor <a href=""..."
1,324.json,True,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,democrat,70,71,160,163,9,Denver,"Barack Obama campaign Web site, <a href=""http:..."
2,1123.json,False,Health care reform legislation is likely to ma...,health-care,blog-posting,none,7,19,3,5,44,a news release,"<p> News release attributed to Matt Barber, &l..."
3,9028.json,True,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,democrat,15,9,20,19,2,an interview on CNN,"<p>CNN transcript, <a href=""http://transcripts..."
4,12465.json,True,The Chicago Bears have had more starting quart...,education,robin-vos,republican,0,3,2,5,1,a an online opinion-piece,"<p>Right Wisconsin, <a href=""http://www.rightw..."


In [140]:
y = newsPred.Accuracy
newsPred.drop("Accuracy", axis = 1)

Unnamed: 0,Json,Summary,Genre,KeywordName,PoliticalParty,1,2,3,4,5,Source,Url
0,10540.json,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,democrat,0,0,1,1,0,a floor speech.,"<p>Del. Scott Surovell&rsquo;s floor <a href=""..."
1,324.json,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,democrat,70,71,160,163,9,Denver,"Barack Obama campaign Web site, <a href=""http:..."
2,1123.json,Health care reform legislation is likely to ma...,health-care,blog-posting,none,7,19,3,5,44,a news release,"<p> News release attributed to Matt Barber, &l..."
3,9028.json,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,democrat,15,9,20,19,2,an interview on CNN,"<p>CNN transcript, <a href=""http://transcripts..."
4,12465.json,The Chicago Bears have had more starting quart...,education,robin-vos,republican,0,3,2,5,1,a an online opinion-piece,"<p>Right Wisconsin, <a href=""http://www.rightw..."
5,2342.json,Jim Dunnam has not lived in the district he re...,candidates-biography,republican-party-texas,republican,3,1,1,3,1,a press release.,"<p> The Associated Press, news article,<a href..."
6,153.json,I'm the only person on this stage who has work...,ethics,barack-obama,democrat,70,71,160,163,9,"a Democratic debate in Philadelphia, Pa.","U.S. Senate, <a href="" http://frwebgate.access..."
7,5602.json,"However, it took $19.5 million in Oregon Lotte...",jobs,oregon-lottery,organization,0,0,1,0,1,a website,"<p> Oregon Lottery, &quot;<a href=""http://www...."
8,9741.json,Says GOP primary opponents Glenn Grothman and ...,"energy,message-machine-2014,voting-record",duey-stroebel,republican,0,0,0,1,0,an online video,"<p>Stroebel campaign ad, <a href=""https://www...."
9,7115.json,"For the first time in history, the share of th...",elections,robert-menendez,democrat,1,3,1,3,0,a speech,"<p> Center for American Progress Action Fund, ..."


In [144]:
X_train, X_test, y_train, y_test = train_test_split(newsPred['Summary'], y, test_size = 0.2, random_state = 44)

We can start building classifiers. CountVectorizer and TfidfVectorizer are used to give us an idea if the words and tokens in the article summaries had a significant impact on if the news is fake or real. 

In [145]:
count_vectorizer = CountVectorizer(stop_words = 'english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

Having a max threshold at 0.7 removes words that appear in more than 70% of the summaries. The stop words parameter will remove English stop words from the data before making vectors. 

In [146]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 

# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

In [147]:
print(tfidf_vectorizer.get_feature_names()[-10:])


['workers', 'world', 'wrong', 'year', 'years', 'york', 'young', 'youre', 'youth', 'zero']


In [148]:
# Get the feature names of `count_vectorizer` 
print(count_vectorizer.get_feature_names()[:10])

['000', '054th', '06', '092', '10', '100', '12', '18', '19', '1990']


In [149]:
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

In [150]:
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

In [151]:
difference = set(count_df.columns) - set(tfidf_df.columns)
difference

set()

In [152]:
print(count_df.equals(tfidf_df))

False


In [153]:
count_df.head()


Unnamed: 0,000,054th,06,092,10,100,12,18,19,1990,...,workers,world,wrong,year,years,york,young,youre,youth,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
4,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [154]:
tfidf_df.head()

Unnamed: 0,000,054th,06,092,10,100,12,18,19,1990,...,workers,world,wrong,year,years,york,young,youre,youth,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.385023,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.504461,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [173]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

A classifc nlp algo is the Multinomial Naive Bayes. This compaires the TF-DF versus vag of words (CountVectorizer). 

In [174]:
clf = MultinomialNB() 

In [175]:
clf.fit(tfidf_train, y_train)
pred = clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


accuracy:   0.600


ValueError: At least one label specified must be in y_true

In [158]:
clf = MultinomialNB() 


In [159]:
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


accuracy:   0.533


ValueError: At least one label specified must be in y_true

In [160]:
linear_clf = PassiveAggressiveClassifier(n_iter=50)




In [161]:
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


accuracy:   0.567


ValueError: At least one label specified must be in y_true

In [162]:
clf = MultinomialNB(alpha=0.1)

In [163]:
last_score = 0
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
    if score > last_score:
        clf = nb_classifier
    print("Alpha: {:.2f} Score: {:.5f}".format(alpha, score))

Alpha: 0.00 Score: 0.60000
Alpha: 0.10 Score: 0.56667
Alpha: 0.20 Score: 0.56667
Alpha: 0.30 Score: 0.53333
Alpha: 0.40 Score: 0.56667
Alpha: 0.50 Score: 0.53333
Alpha: 0.60 Score: 0.56667
Alpha: 0.70 Score: 0.56667
Alpha: 0.80 Score: 0.60000
Alpha: 0.90 Score: 0.60000


  'setting alpha = %.1e' % _ALPHA_MIN)


In [164]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    See: https://stackoverflow.com/a/26980472
    
    Identify most important features if given a vectorizer and binary classifier. Set n to the number
    of weighted features you would like to show. (Note: current implementation merely prints and does not 
    return top classes.)
    """

    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)

    print()

    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)


most_informative_feature_for_binary_classification(tfidf_vectorizer, linear_clf, n=30)

FALSE -1.087422161 country
FALSE -1.03372719049 business
FALSE -0.963801456591 making
FALSE -0.959609743346 water
FALSE -0.909894963242 program
FALSE -0.826077579968 mandate
FALSE -0.823842602782 kidnapping
FALSE -0.823842602782 phoenix
FALSE -0.789042875122 homeless
FALSE -0.789042875122 veteran
FALSE -0.748040501396 high
FALSE -0.74751661374 mitt
FALSE -0.735567563031 construction
FALSE -0.713531844931 isis
FALSE -0.704152025858 going
FALSE -0.692183997928 provides
FALSE -0.692183997928 virtually
FALSE -0.675332747464 classes
FALSE -0.675332747464 offer
FALSE -0.675332747464 pe
FALSE -0.669494469549 man
FALSE -0.665445073621 ohio
FALSE -0.657667280792 use
FALSE -0.653584680326 burdened
FALSE -0.653584680326 growing
FALSE -0.653584680326 increases
FALSE -0.650622007145 men
FALSE -0.645981688425 change
FALSE -0.645981688425 likely
FALSE -0.645981688425 sex

TRUE 1.04327599889 100
TRUE 1.0193043549 million
TRUE 0.979341260718 texas
TRUE 0.914187160398 guaranteed
TRUE 0.893747937111 cost

In [165]:
feature_names = tfidf_vectorizer.get_feature_names()


In [166]:
sorted(zip(clf.coef_[0], feature_names), reverse=True)[:20]

[(-5.5953916863556792, 'says'),
 (-5.611278328560374, 'percent'),
 (-5.9289536657544994, 'obama'),
 (-5.9719099359283438, 'year'),
 (-5.9815939634806421, 'president'),
 (-6.0193922636410404, 'texas'),
 (-6.0945213527464563, 'million'),
 (-6.1265606045953565, 'health'),
 (-6.1314089966451997, '000'),
 (-6.1371966724060361, 'federal'),
 (-6.1406658533846592, 'women'),
 (-6.1461263301331739, 'american'),
 (-6.1483527330469583, 'care'),
 (-6.1596508293984629, 'cost'),
 (-6.1700796139998424, 'time'),
 (-6.1737399997824678, '100'),
 (-6.1994013399693717, 'hillary'),
 (-6.1994013399693717, 'clinton'),
 (-6.203175263079304, 'people'),
 (-6.2064483356969307, 'took')]

In [167]:
sorted(zip(clf.coef_[0], feature_names))[:20]


[(-6.9447321039254444, '06'),
 (-6.9447321039254444, '2010'),
 (-6.9447321039254444, '250'),
 (-6.9447321039254444, '34'),
 (-6.9447321039254444, '45'),
 (-6.9447321039254444, '53'),
 (-6.9447321039254444, '600'),
 (-6.9447321039254444, '700'),
 (-6.9447321039254444, '930'),
 (-6.9447321039254444, '98'),
 (-6.9447321039254444, 'able'),
 (-6.9447321039254444, 'accept'),
 (-6.9447321039254444, 'act'),
 (-6.9447321039254444, 'adjust'),
 (-6.9447321039254444, 'advantage'),
 (-6.9447321039254444, 'affordable'),
 (-6.9447321039254444, 'ago'),
 (-6.9447321039254444, 'agricultures'),
 (-6.9447321039254444, 'air'),
 (-6.9447321039254444, 'announced')]

In [168]:
tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))


In [169]:
hash_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
hash_train = hash_vectorizer.fit_transform(X_train)
hash_test = hash_vectorizer.transform(X_test)



In [170]:
clf = MultinomialNB(alpha=.01)

In [171]:
clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


accuracy:   0.600


ValueError: At least one label specified must be in y_true

In [172]:
clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

accuracy:   0.600


ValueError: At least one label specified must be in y_true