In [None]:
%matplotlib inline
import gensim
import pandas
import nltk.corpus
import nltk.sentiment
import sklearn.linear_model
import textblob
import random
import numpy
import sklearn.metrics
import sklearn.ensemble
import seaborn
import re
import collections

# Rough sentence boundary: terminal punctuation, any closing quotes, then
# whitespace. Written as a raw string so that the backslash in `\s` reaches
# the regex engine untouched instead of being an invalid string escape.
sentence_splitter=re.compile(r"""[.?!]['"]*\s+""",re.UNICODE)


This dataset gives me the opportunity to see if the fake news detection methods I investigated in [The Grammar of Truth and Lies](https://www.kaggle.com/petebleackley/the-grammar-of-truth-and-lies) are reproducible. In that notebook I used the grammatical structure of sentences, sentiment analysis and stop words to classify documents as real or fake news. Results were promising, but the question remains as to whether the techniques used will work on another sample. As before, we begin by extracting sentence structure features (concatenations of part of speech labels) from the documents, reducing the dimensionality with Latent Semantic Indexing, and classifying with Logistic Regression.

In [None]:
def sentence_structure_features(document):
    """Return one feature token per sentence of `document`.

    Each token is the sequence of part-of-speech tags of a sentence, in
    order, joined with underscores.
    """
    structures = []
    for sentence in textblob.blob.TextBlob(document).sentences:
        tag_sequence = [tag for (_token, tag) in sentence.pos_tags]
        structures.append('_'.join(tag_sequence))
    return structures

In [None]:
class SentenceStructureCorpus(object):
    """Fake and real news articles as sentence-structure features plus sentiment.

    Articles from Fake.csv come first and are labelled 0; articles from
    True.csv follow and are labelled 1. One tenth of the documents are drawn
    at random as a held-out test sample.

    Iterating over the instance yields the bag-of-words form of every
    document (training and test alike).

    Attributes set by the constructor:
        data        -- list of feature-token lists, one per document
        sentiments  -- array with one row of summed [pos, neg, neu] per document
        labels      -- array of 0.0 (fake) / 1.0 (real), aligned with `data`
        N           -- number of documents
        test_sample -- list of document indices held out for testing
        dictionary  -- gensim Dictionary built from `data`
    """

    def __init__(self, seed=None):
        """Load both CSV files, extract features and pick the test sample.

        seed -- optional seed for the train/test split. When None (the
                default) the module-level `random` generator is used, as
                before, so the split differs from run to run unless the
                caller seeded `random` beforehand.
        """
        lies=pandas.read_csv("../input/fake-and-real-news-dataset/Fake.csv")
        n_lies=lies.shape[0]
        self.vader=nltk.sentiment.vader.SentimentIntensityAnalyzer()
        print("Converting Fake News corpus")
        (self.data,sentiments)=self._convert(lies)
        truth = pandas.read_csv('../input/fake-and-real-news-dataset/True.csv')
        print('Converting Real News corpus')
        (true_data,true_sentiments)=self._convert(truth)
        self.data.extend(true_data)
        sentiments.extend(true_sentiments)
        self.sentiments=numpy.array(sentiments)
        self.N=len(self.data)
        self.labels=numpy.ones(self.N)
        self.labels[:n_lies]=0
        sampler=random if seed is None else random.Random(seed)
        self.test_sample=sampler.sample(range(self.N),self.N//10)
        print("Creating dictionary")
        self.dictionary=gensim.corpora.dictionary.Dictionary(self.data)

    def _convert(self,frame):
        """Return (features, sentiments) for every row of `frame` in one pass.

        Each document is the row's title and text joined by a newline.
        `sentence_structure_features` is looked up as a global at call time,
        so a later redefinition of that function changes what is extracted.
        """
        features=[]
        sentiments=[]
        for (title,text) in zip(frame['title'],frame['text']):
            document='{0}\n{1}'.format(title,text)
            features.append(sentence_structure_features(document))
            sentiments.append(self.analyse_sentiments(document))
        return (features,sentiments)

    def _training_indices(self):
        """Return, in ascending order, the indices not held out for testing."""
        # A set makes each membership test constant-time; testing against the
        # list made every training accessor quadratic in the corpus size.
        held_out=set(self.test_sample)
        return [i for i in range(self.N) if i not in held_out]

    def __iter__(self):
        """Yield the bag-of-words representation of every document."""
        return (self.dictionary.doc2bow(document) for document in self.data)

    def analyse_sentiments(self,document):
        """Return the summed VADER [pos, neg, neu] scores of `document`.

        The document is cut into pieces with `sentence_splitter`, each piece
        is scored separately, and the scores are added up column by column.
        """
        valences=numpy.array([[sent['pos'],sent['neg'],sent['neu']]
                             for sent in (self.vader.polarity_scores(sentence)
                                          for sentence in sentence_splitter.split(document))])
        return valences.sum(axis=0)

    def training_data(self):
        """Return bag-of-words vectors of the training documents."""
        return [self.dictionary.doc2bow(self.data[i])
                for i in self._training_indices()]

    def training_labels(self):
        """Return the labels of the training documents."""
        return self.labels[self._training_indices()]

    def training_sentiments(self):
        """Return the sentiment rows of the training documents."""
        return self.sentiments[self._training_indices()]

    def test_sentiments(self):
        """Return the sentiment rows of the test documents, in sample order."""
        return self.sentiments[self.test_sample]

    def test_data(self):
        """Return bag-of-words vectors of the test documents, in sample order."""
        return [self.dictionary.doc2bow(self.data[i])
                for i in self.test_sample]

    def test_labels(self):
        """Return the labels of the test documents, in sample order."""
        return self.labels[self.test_sample]


In [None]:
# Build the corpus (reads both CSV files and extracts features for every
# article), then fit the LSI model on it.
# NOTE: iterating over `ssf` yields every document, so the LSI projection is
# fitted on training and test articles alike.
ssf=SentenceStructureCorpus()
print("Training LSI")
# Pass the dictionary explicitly, as the later stopword experiment does, so
# gensim does not have to infer the id-to-word mapping from the corpus.
lsi=gensim.models.lsimodel.LsiModel(ssf,id2word=ssf.dictionary)

In [None]:
# Class balance check. Label 0 marks articles from Fake.csv and label 1
# articles from True.csv, so say so on the axis instead of leaving bare
# 0.0 / 1.0 ticks.
label_counts=pandas.Series(ssf.labels).value_counts()
ax=label_counts.plot.bar()
ax.set(xlabel="Label (0 = fake, 1 = real)",ylabel="Number of articles",title="Class balance");

The `True` and `False` samples are almost balanced (there are slightly more `False` samples)

In [None]:
# Project the training documents into LSI topic space. The dense matrix is
# transposed so that each row is one document.
vectors=gensim.matutils.corpus2dense(lsi[ssf.training_data()],lsi.num_topics).T
classifier=sklearn.linear_model.LogisticRegression(max_iter=200)
print("Training classifier")
classifier.fit(vectors,ssf.training_labels())
print("Testing classifier")
test_vectors=gensim.matutils.corpus2dense(lsi[ssf.test_data()],lsi.num_topics).T
# Rows of the confusion matrix are the actual labels, columns the predictions.
confusion=sklearn.metrics.confusion_matrix(ssf.test_labels(),
                                           classifier.predict(test_vectors))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates them to two significant figures.
seaborn.heatmap(confusion,annot=True,fmt='d')


We can see that almost all the `True` articles are classified as `True`, but that slightly more than half of the `False` articles are classified as `True`. This is qualitatively similar to the results from the original dataset.

In [None]:
def precision(cm):
    """Precision for class 1 from a 2x2 confusion matrix.

    `cm` is indexed as `cm[actual, predicted]`, the layout produced by
    `sklearn.metrics.confusion_matrix(actual, predicted)`. The result is the
    share of items predicted as class 1 that really are class 1.
    """
    true_positives = cm[1, 1]
    predicted_positives = cm[:, 1].sum()
    return true_positives / predicted_positives

def recall(cm):
    """Recall for class 1 from a 2x2 confusion matrix.

    `cm` is indexed as `cm[actual, predicted]`. The result is the share of
    items that really are class 1 which were also predicted as class 1.
    """
    true_positives = cm[1, 1]
    actual_positives = cm[1].sum()
    return true_positives / actual_positives

def accuracy(cm):
    """Accuracy from a 2x2 confusion matrix.

    Adds the two diagonal cells `cm[0,0]` and `cm[1,1]` and divides by the
    total of all cells.
    """
    correct = cm[0, 0] + cm[1, 1]
    total = cm.sum()
    return correct / total

def matthews(cm):
    """Matthews correlation coefficient from a 2x2 confusion matrix.

    `cm` is indexed as `cm[actual, predicted]`. Returns a value between -1
    and 1. When any row or column of the matrix sums to zero the coefficient
    is undefined; 0.0 is returned in that case instead of NaN.
    """
    # Work in floating point: the denominator multiplies four marginal
    # totals together, which overflows 64-bit integers for large counts.
    cm = numpy.asarray(cm, dtype=float)
    numerator = cm[0, 0] * cm[1, 1] - cm[1, 0] * cm[0, 1]
    denominator = numpy.sqrt(cm[0].sum() * cm[1].sum() * cm[:, 0].sum() * cm[:, 1].sum())
    if denominator == 0:
        return 0.0
    return numerator / denominator



In [None]:
precision(confusion)

In [None]:
recall(confusion)

In [None]:
accuracy(confusion)

In [None]:
matthews(confusion)

Precision is 60%, Recall is 94%, Accuracy is 66% and Matthews Coefficient is 40%. Precision and Recall are very similar to the original dataset, but Accuracy and Matthews Coefficient are lower.

Now let's use Sentiment Analysis.

In [None]:
# Logistic regression on the per-document sentiment totals alone.
sentiment_classifier=sklearn.linear_model.LogisticRegression(max_iter=200)
sentiment_classifier.fit(ssf.training_sentiments(),ssf.training_labels())
confusion=sklearn.metrics.confusion_matrix(ssf.test_labels(),
                                           sentiment_classifier.predict(ssf.test_sentiments()))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates them to two significant figures.
seaborn.heatmap(confusion,annot=True,fmt='d')

Again, Sentiment Analysis classifies more `False` articles correctly, but misclassifies more `True` articles. The loss of Recall is greater than in the original dataset.

In [None]:
precision(confusion)

In [None]:
recall(confusion)

In [None]:
accuracy(confusion)

In [None]:
matthews(confusion)

Precision is slightly increased to 62% (not as great an increase as in the original dataset), Recall is reduced to 51% (as opposed to 86% in the original dataset), Accuracy is 62% (78% in the original dataset) and Matthews Coefficient is only 24%. Sentiment Analysis is therefore a less reliable signal in this dataset than in the original one.

Now let's look at combining Sentence Structure Features with Sentiment Analysis.

In [None]:
# Append the three sentiment totals to each document's LSI vector.
enhanced_vectors=numpy.hstack([vectors,ssf.training_sentiments()])
combined_classifier=sklearn.linear_model.LogisticRegression(max_iter=200)
print("Training classifier")
combined_classifier.fit(enhanced_vectors,ssf.training_labels())
print("Testing classifier")
# Build the test matrix the same way: LSI vectors first, sentiments after.
enhanced_test_vectors=numpy.hstack([gensim.matutils.corpus2dense(lsi[ssf.test_data()],
                                                                 lsi.num_topics).T,
                                    ssf.test_sentiments()])
confusion=sklearn.metrics.confusion_matrix(ssf.test_labels(),
                                           combined_classifier.predict(enhanced_test_vectors))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates them to two significant figures.
seaborn.heatmap(confusion,annot=True,fmt='d')

In [None]:
precision(confusion)

In [None]:
recall(confusion)

In [None]:
accuracy(confusion)

In [None]:
matthews(confusion)

Precision has increased to 67%, Recall to 74%, Accuracy to 70% and Matthews Coefficient is 41%. This is a better result than either feature set alone for Precision and Accuracy, equal to Sentence Structure Features alone on Matthews Coefficient, and intermediate between the two for Recall. However, it does not perform as well on any metric as it did on the original dataset.

Now let us try Random Forest classifiers with each feature set.

In [None]:
# Random forest on the LSI sentence-structure vectors. A fixed random_state
# makes the forest itself repeatable for a given train/test split.
forest0=sklearn.ensemble.RandomForestClassifier(n_estimators=100,random_state=0)
forest0.fit(vectors,ssf.training_labels())
confusion=sklearn.metrics.confusion_matrix(ssf.test_labels(),
                                           forest0.predict(gensim.matutils.corpus2dense(lsi[ssf.test_data()],
                                                                                        lsi.num_topics).T))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates them to two significant figures.
seaborn.heatmap(confusion,annot=True,fmt='d')

In [None]:
precision(confusion)

In [None]:
recall(confusion)

In [None]:
accuracy(confusion)

In [None]:
matthews(confusion)

The Random Forest classifier does surprisingly well with Sentence Structure Features, achieving 97% precision, 94% Recall, 96% Accuracy and 92% Matthews Coefficient. This is much better than it performed on the original dataset. Note that in the discussion of this dataset, several people have reported surprisingly good results on this dataset with a variety of models.

In [None]:
# Random forest on the sentiment totals alone. A fixed random_state makes
# the forest itself repeatable for a given train/test split.
forest1=sklearn.ensemble.RandomForestClassifier(n_estimators=100,random_state=0)
forest1.fit(ssf.training_sentiments(),ssf.training_labels())
confusion=sklearn.metrics.confusion_matrix(ssf.test_labels(),
                                           forest1.predict(ssf.test_sentiments()))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates them to two significant figures.
seaborn.heatmap(confusion,annot=True,fmt='d')

In [None]:
precision(confusion)

In [None]:
recall(confusion)

In [None]:
accuracy(confusion)

In [None]:
matthews(confusion)

Again, the Random Forest classifier does not do as well with Sentiment Analysis alone - 73% Precision, 64% Recall, 72% Accuracy and 43% Matthews Coefficient. This is similar to the way it performed on the original dataset.

In [None]:
# Random forest on LSI vectors combined with sentiment totals. A fixed
# random_state makes the forest itself repeatable for a given split.
forest2=sklearn.ensemble.RandomForestClassifier(n_estimators=100,random_state=0)
forest2.fit(enhanced_vectors,ssf.training_labels())
confusion=sklearn.metrics.confusion_matrix(ssf.test_labels(),
                                           forest2.predict(enhanced_test_vectors))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates them to two significant figures.
seaborn.heatmap(confusion,annot=True,fmt='d')

In [None]:
precision(confusion)

In [None]:
recall(confusion)

In [None]:
accuracy(confusion)

In [None]:
matthews(confusion)

The results for combining Sentence Structure Features with Sentiment Analysis with a Random Forest classifier are practically the same as for Sentence Structure Features alone.

Now let us introduce Stopwords into our feature set, and classify with Logistic Regression.

In [None]:
# English stopword list from NLTK, used below as extra features alongside
# the sentence-structure tokens.
stopwords = nltk.corpus.stopwords.words("english")
# Bare expression: displays the list as the cell output.
stopwords

In [None]:
def sentence_structure_features(document):
    """Return sentence-structure tokens followed by the stopwords of `document`.

    The first part of the result holds one token per sentence (its
    part-of-speech tags joined with underscores). The second part holds the
    lower-cased form of every word of the document that appears in the
    module-level `stopwords` list, in document order and with repeats.

    This deliberately rebinds the global name used by
    `SentenceStructureCorpus`, so a corpus built after this definition
    contains the extended feature set.
    """
    # Parse the document once and reuse it for both sentences and words.
    blob = textblob.blob.TextBlob(document)
    # Set membership is constant-time; `stopwords` itself is a list.
    stopword_set = frozenset(stopwords)
    structures = ['_'.join((pos for (word,pos) in sentence.pos_tags))
                  for sentence in blob.sentences]
    function_words = [lowered
                      for lowered in (word.lower() for word in blob.words)
                      if lowered in stopword_set]
    return structures + function_words
# Rebuild the corpus so that it picks up the redefined feature extractor,
# then repeat the LSI + logistic regression pipeline on the new features.
# NOTE: this draws a fresh random test sample, independent of the one in `ssf`.
ssf2 = SentenceStructureCorpus()
transform = gensim.models.LsiModel(ssf2,id2word=ssf2.dictionary)
# Transposed so that each row is one document.
training_data = gensim.matutils.corpus2dense(transform[ssf2.training_data()],
                                             transform.num_topics).T
test_data = gensim.matutils.corpus2dense(transform[ssf2.test_data()],
                                         transform.num_topics).T
classifier = sklearn.linear_model.LogisticRegression(max_iter=200)
classifier.fit(training_data,ssf2.training_labels())
confusion = sklearn.metrics.confusion_matrix(ssf2.test_labels(),
                                             classifier.predict(test_data))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates them to two significant figures.
seaborn.heatmap(confusion,annot=True,fmt='d')

In [None]:
precision(confusion)

In [None]:
recall(confusion)

In [None]:
accuracy(confusion)

In [None]:
matthews(confusion)

These results are close to those achieved with the original dataset using Sentence Structure Features and Stopwords.

In general we can see that the performance of these algorithms is broadly consistent between both datasets. The most surprising thing is how well Random Forests perform on this dataset when compared both to their performance on the original dataset and to the performance of other algorithms on this dataset.