## Fake News Classifier

Based on the exercises in the https://www.datacamp.com/courses/natural-language-processing-fundamentals-in-python course.

## Import Data and Exploratory Analysis

In [1]:
# Import the necessary modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the two halves of the labeled news-story CSV and stack them into one dataframe.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# Row labels are kept as they are (no ignore_index), matching what append produced.
df1 = pd.read_csv('firsthalf.csv')
df2 = pd.read_csv('secondhalf.csv')
df = pd.concat([df1, df2])

In [3]:
# Preview the first five rows of df
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# Show the dataframe's dimensions as a (rows, columns) tuple
df.shape

(6336, 4)

## Split for Validation

In [5]:
# Target vector y: the label column of df
y = df["label"]

In [6]:
# Hold out 33% of the articles for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    y,
    test_size=0.33,
    random_state=53,
)

## CountVectorizer

This method puts together a list of all the words in the train data. This list is called a vocabulary. Each row in the train data is called a document. This method next creates a matrix (converted to a dataframe below) that has the same number of rows as the train data, but each word will get its own column. At the intersection of each column and row is a count of the number of times the word for that column occurred in the document for that row.

In [7]:
# count_vectorizer: bag-of-words vectorizer configured to drop English stop words
count_vectorizer = CountVectorizer(stop_words='english')

In [8]:
# Fit the vectorizer on the training text and transform that text in one step: count_train
count_train = count_vectorizer.fit_transform(X_train)

In [9]:
# Transform the test text with the already-fitted vectorizer (transform only, no refit): count_test
count_test = count_vectorizer.transform(X_test)

In [10]:
# Print the first 10 features of the count_vectorizer
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (replaced by
# get_feature_names_out()); confirm the pinned version before re-running.
print(count_vectorizer.get_feature_names()[:10])

['00', '000', '0000', '000035', '00006', '0001', '0001pt', '000ft', '000km', '001']


In [11]:
# Report how many features the fitted count vectorizer holds
feat = count_vectorizer.get_feature_names()
print("There are", len(feat), "features in the count vectorizer object for the train data")

There are 57219 features in the count vectorizer object for the train data


In [12]:
# Create the CountVectorizer DataFrame: count_df
# toarray() is the documented, portable way to densify a SciPy sparse matrix; the `.A`
# shortcut is deprecated on SciPy's newer sparse containers.
# Caution: this materializes every (document, term) cell in memory.
count_df = pd.DataFrame(count_train.toarray(),
                        columns=count_vectorizer.get_feature_names())

In [13]:
# Print a sample of the rows in count_df
# random_state pins the sample so the displayed rows are the same on every run
# (same seed value as the train/test split).
count_df.sample(frac=0.002, random_state=53)

Unnamed: 0,00,000,0000,000035,00006,0001,0001pt,000ft,000km,001,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
410,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3883,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
909,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1475,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4081,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TfidfVectorizer

This is a similar method. It combines CountVectorizer with a tf-idf transformation. The tf stands for term frequency, which counts the number of times that each word occurs in each document (row). The idf is an abbreviation for inverse document frequency, which gives less weight to terms that appear in many documents and more weight to rare terms. The column for each word becomes a vector in a matrix, with the documents forming its rows.

In [14]:
# tfidf_vectorizer: drops English stop words and caps document frequency with max_df=0.7
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

In [15]:
# Fit the tfidf vectorizer on the training text and transform that text in one step: tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

In [16]:
# Transform the test text with the already-fitted tfidf vectorizer (transform only, no refit): tfidf_test
tfidf_test = tfidf_vectorizer.transform(X_test)

In [17]:
# Print the first 10 features of the tfidf_vectorizer
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (replaced by
# get_feature_names_out()); confirm the pinned version before re-running.
print(tfidf_vectorizer.get_feature_names()[:10])

['00', '000', '0000', '000035', '00006', '0001', '0001pt', '000ft', '000km', '001']


In [18]:
# Report how many features the fitted tfidf vectorizer holds
tfidffeat = tfidf_vectorizer.get_feature_names()
print("There are", len(tfidffeat), "features in the tfidf vectorizer object for the train data")

There are 57219 features in the tfidf vectorizer object for the train data


In [19]:
# Print the first 5 vectors of the tfidf training data
# Slice the sparse matrix first so only five rows are densified, instead of converting
# the whole matrix just to display its head; toarray() replaces the `.A` shortcut.
print(tfidf_train[:5].toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.06164315 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [20]:
# Create the TfidfVectorizer DataFrame: tfidf_df
# toarray() is the documented, portable way to densify a SciPy sparse matrix; the `.A`
# shortcut is deprecated on SciPy's newer sparse containers.
# Caution: this materializes every (document, term) cell in memory.
tfidf_df = pd.DataFrame(tfidf_train.toarray(),
                        columns=tfidf_vectorizer.get_feature_names())

In [21]:
# Print a sample of the rows in tfidf_df
# random_state pins the sample so the displayed rows are the same on every run
# (same seed value as the train/test split).
tfidf_df.sample(frac=0.002, random_state=53)

Unnamed: 0,00,000,0000,000035,00006,0001,0001pt,000ft,000km,001,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
3679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Compare the Matrices

In [22]:
# Compare the column sets in both directions: difference
# A symmetric difference catches columns missing from either DataFrame, and the
# conclusion is printed only when the comparison actually supports it.
difference = set(count_df.columns) ^ set(tfidf_df.columns)
print(difference)
if not difference:
    print('This means the two matrices have the same columns')
else:
    print('The two matrices differ in', len(difference), 'columns')

set()
This means the two matrices have the same columns


In [23]:
# Check whether the DataFrames are equal (DataFrame.equals yields one bool for the whole frame)
print(count_df.equals(tfidf_df))
# NOTE(review): the message below is printed unconditionally; it only holds when the line above prints False.
print('This means the two matrices have the same rows and columns, but different values in their cells')

False
This means the two matrices have the same rows and columns, but different values in their cells


## Classifier for CountVectorizer

In Naive Bayes, the rows are the documents to be classified and the word vector columns are the features. The algorithm assumes, naively, that each feature occurs independently of all the other features, given the class. It then uses the Bayes formula from statistics to calculate, quite reliably, the probability that a document belongs to each target label (fake or real news), based on probabilities estimated from the features and labels of the training data.

In [24]:
# Instantiate a Multinomial Naive Bayes classifier with its default parameters: nb_classifier
nb_classifier = MultinomialNB()

In [25]:
# Fit the classifier to the count-vectorized training data
# (the call is the cell's last expression, so the fitted estimator's repr is displayed)
nb_classifier.fit(count_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Predict a label for every count-vectorized test document: pred
pred = nb_classifier.predict(count_test)

In [27]:
# Overall accuracy of the predictions against the test labels
print("accuracy score:", metrics.accuracy_score(y_test, pred))
print()
# Confusion matrix (label order fixed to FAKE, REAL), followed by the per-class report
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL']))
print()
print("classification report:")
print(metrics.classification_report(y_test, pred))

accuracy score: 0.8957436633189861

confusion matrix:
[[ 873  138]
 [  80 1000]]

classification report:
              precision    recall  f1-score   support

        FAKE       0.92      0.86      0.89      1011
        REAL       0.88      0.93      0.90      1080

   micro avg       0.90      0.90      0.90      2091
   macro avg       0.90      0.89      0.90      2091
weighted avg       0.90      0.90      0.90      2091



## Classifier for TfidfVectorizer

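This classifier uses a multinomial distribution, which models the probability of observing a particular set of counts across a certain number of categories and trials. The tfidf vector values are meant to give the classifier's naive Bayes formula a better start at predicting which target label each document belongs to, although with the default alpha the accuracy below (0.853) is lower than the CountVectorizer result above (0.896).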

In [28]:
# Create a fresh Multinomial Naive Bayes classifier: nb_classifier
# (rebinds the name, so the count-based model fitted earlier is no longer reachable through it)
nb_classifier = MultinomialNB()

In [29]:
# Fit the classifier to the tfidf-weighted training data
# (the call is the cell's last expression, so the fitted estimator's repr is displayed)
nb_classifier.fit(tfidf_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
# Predict a label for every tfidf-weighted test document: pred (overwrites the count-based predictions)
pred = nb_classifier.predict(tfidf_test)

In [31]:
# Overall accuracy of the predictions against the test labels
print("accuracy score:", metrics.accuracy_score(y_test, pred))
print()
# Confusion matrix (label order fixed to FAKE, REAL), followed by the per-class report
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL']))
print()
print("classification report:")
print(metrics.classification_report(y_test, pred))

accuracy score: 0.8531802965088474

confusion matrix:
[[ 737  274]
 [  33 1047]]

classification report:
              precision    recall  f1-score   support

        FAKE       0.96      0.73      0.83      1011
        REAL       0.79      0.97      0.87      1080

   micro avg       0.85      0.85      0.85      2091
   macro avg       0.87      0.85      0.85      2091
weighted avg       0.87      0.85      0.85      2091



## Smoothing

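The function below trains the classifier with a given value of the alpha (smoothing) parameter and returns its accuracy. Calling it over a range of alphas lets us search for the value that maximizes the accuracy of the classifier. Note that the accuracy is measured on the test set, so the best score found this way is an optimistic estimate.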

In [32]:
# Define train_and_predict()
def train_and_predict(alpha):
    """Train MultinomialNB with the given alpha and return its test accuracy.

    Reads the notebook-level tfidf_train, y_train, tfidf_test and y_test.

    Parameters
    ----------
    alpha : float
        Smoothing parameter passed straight to MultinomialNB.

    Returns
    -------
    float
        Accuracy of the fitted model's predictions on tfidf_test against y_test.
    """
    # fit() hands back the estimator itself, so construction and training chain
    model = MultinomialNB(alpha=alpha).fit(tfidf_train, y_train)
    predicted = model.predict(tfidf_test)
    return metrics.accuracy_score(y_test, predicted)

In [33]:
# Candidate alphas: 0.0 up to (but excluding) 1.0 in steps of 0.1
# NOTE(review): the grid includes alpha=0, which appears to be what triggers the
# "setting alpha = ..." warning seen in the loop output; confirm.
alphas = np.arange(0, 1, 0.1)

In [34]:
# Empty mapping that will hold alpha -> accuracy score
alphscordct = dict()

In [35]:
# Score every candidate alpha, recording each result in alphscordct and echoing it
for alpha in alphas:
    print('Alpha: ', alpha)
    score = train_and_predict(alpha)
    alphscordct[alpha] = score
    print('Score: ', score)
    print()

Alpha:  0.0
Score:  0.8919177427068389

Alpha:  0.1
Score:  0.9014825442372071

Alpha:  0.2
Score:  0.8952654232424677

Alpha:  0.30000000000000004
Score:  0.8923959827833573

Alpha:  0.4
Score:  0.8857006217120995

Alpha:  0.5
Score:  0.8809182209469153

Alpha:  0.6000000000000001
Score:  0.8761358201817312

Alpha:  0.7000000000000001


  'setting alpha = %.1e' % _ALPHA_MIN)


Score:  0.8699186991869918

Alpha:  0.8
Score:  0.8656145384983261

Alpha:  0.9
Score:  0.8574844571975132



In [36]:
# Rank the alphas from highest to lowest score and keep the top one
alphscor = sorted(alphscordct.keys(), key=lambda a: alphscordct[a], reverse=True)
bestalpha = alphscor[0]
print(bestalpha, "is the best alpha value for this model")

0.1 is the best alpha value for this model


In [37]:
# Rebuild the classifier with the best alpha and train it on the tfidf training data: nb_classifier
# (fit() returns the estimator itself, so nb_classifier is the fitted model)
nb_classifier = MultinomialNB(alpha=bestalpha).fit(tfidf_train, y_train)
# Predict the test labels with the refitted model: pred
pred = nb_classifier.predict(tfidf_test)

In [38]:
# Overall accuracy of the predictions against the test labels
print("accuracy score:", metrics.accuracy_score(y_test, pred))
print()
# Confusion matrix (label order fixed to FAKE, REAL), followed by the per-class report
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL']))
print()
print("classification report:")
print(metrics.classification_report(y_test, pred))

accuracy score: 0.9014825442372071

confusion matrix:
[[ 873  138]
 [  68 1012]]

classification report:
              precision    recall  f1-score   support

        FAKE       0.93      0.86      0.89      1011
        REAL       0.88      0.94      0.91      1080

   micro avg       0.90      0.90      0.90      2091
   macro avg       0.90      0.90      0.90      2091
weighted avg       0.90      0.90      0.90      2091



## Feature Weights

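This gives us the features used by the model, and the weights attached to those features, when predicting whether a news article is fake or real. The top 20 lists of features for both types of articles appear below. The fake features are mostly numbers and timestamps, while the real ones are primarily names of politicians or terms used in news reports about politics. Note that all 20 "fake" features share the same minimum weight, so their order is only the alphabetical tie-break of the sort. For a two-class MultinomialNB the weights are the log-probabilities of each term under the second class (REAL), so the "fake" list is best read as terms that are rarest in real articles rather than as the strongest indicators of fake news.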

In [39]:
# Get the class labels stored on the fitted classifier: class_labels
class_labels = nb_classifier.classes_

In [40]:
# Extract the feature names from the fitted tfidf vectorizer: feature_names
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (replaced by
# get_feature_names_out()); confirm the pinned version before re-running.
feature_names = tfidf_vectorizer.get_feature_names()

In [41]:
# Zip the feature names together with the per-feature weights and sort by weight: feat_with_weights
# feature_log_prob_[1] holds log P(feature | class_labels[1]). For a two-class model it is the
# same array the deprecated `coef_[0]` exposed (coef_ was removed from the naive Bayes
# estimators in scikit-learn 1.1), so the values and ordering are unchanged.
feat_with_weights = sorted(zip(nb_classifier.feature_log_prob_[1], feature_names))

In [42]:
# Print the first class label and the 20 lowest-weighted feat_with_weights entries
# The label is printed whole: the old `[:20]` sliced the label string, not a list.
# NOTE(review): equal weights are ordered by feature name, so ties list alphabetically.
print(class_labels[0], feat_with_weights[:20])

FAKE [(-12.645624315988039, '0000'), (-12.645624315988039, '000035'), (-12.645624315988039, '0001'), (-12.645624315988039, '0001pt'), (-12.645624315988039, '000km'), (-12.645624315988039, '0011'), (-12.645624315988039, '004s'), (-12.645624315988039, '005'), (-12.645624315988039, '006s'), (-12.645624315988039, '007'), (-12.645624315988039, '007s'), (-12.645624315988039, '008s'), (-12.645624315988039, '009'), (-12.645624315988039, '0099'), (-12.645624315988039, '00am'), (-12.645624315988039, '00p'), (-12.645624315988039, '00pm'), (-12.645624315988039, '014'), (-12.645624315988039, '015'), (-12.645624315988039, '01am')]


In [43]:
# Print the second class label and the 20 highest-weighted feat_with_weights entries
# The label is printed whole: the old `[-20:]` sliced the label string, not a list.
print(class_labels[1], feat_with_weights[-20:])

REAL [(-6.796871752001991, 'states'), (-6.744968914278907, 'rubio'), (-6.715233928796038, 'voters'), (-6.7113804591184945, 'house'), (-6.688499966355614, 'republicans'), (-6.6536495791034005, 'bush'), (-6.627623046863042, 'percent'), (-6.575508687311647, 'people'), (-6.565574971494339, 'new'), (-6.539521007839014, 'party'), (-6.464376056331245, 'state'), (-6.449112623035523, 'cruz'), (-6.4172252465408235, 'republican'), (-6.368564224937915, 'campaign'), (-6.3586088687509985, 'sanders'), (-6.340418204788652, 'president'), (-6.154020115959707, 'obama'), (-5.768689598834805, 'clinton'), (-5.624503296971422, 'said'), (-5.349386275175854, 'trump')]
