# Fake News Detection with Python



In [1]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

%matplotlib inline

In [2]:
news = pd.read_csv('news.csv',index_col=0)

In [3]:
news.head()

Unnamed: 0,title,text,label
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6335 entries, 8476 to 4330
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   6335 non-null   object
 1   text    6335 non-null   object
 2   label   6335 non-null   object
dtypes: object(3)
memory usage: 198.0+ KB


### Data pre-processing 

In [5]:
# merging title and text into one text
news['text'] = news['title']+' '+news['text']
del news['title']

# label conversion
news['label'] = news['label'].map({'REAL': 0,'FAKE': 1})

In [6]:
news.head()

Unnamed: 0,text,label
8476,You Can Smell Hillary’s Fear Daniel Greenfield...,1
10294,Watch The Exact Moment Paul Ryan Committed Pol...,1
3608,Kerry to go to Paris in gesture of sympathy U....,0
10142,Bernie supporters on Twitter erupt in anger ag...,1
875,The Battle of New York: Why This Primary Matte...,0


In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
#from nltk.stem.porter import PorterStemmer
import string

def text_preprocess(text):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns cleaned text in as string
    """
    # removing punctuation
    no_punc = [char for char in text if char not in string.punctuation]

    # rejoining the characters to form the string
    no_punc = ''.join(no_punc)
    
    # stemming and removing stopwords
    clean = [word for word in no_punc.split() if word.lower() not in stopwords.words('english')]
    
    # rejoining characters to form string
    clean = ' '.join(clean)
    
    return clean

[nltk_data] Downloading package stopwords to C:\Users\Rakibur
[nltk_data]     Rahman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# text pre-processing is being executed 
import time
t1 = time.time()
news['text'] = news['text'].apply(text_preprocess)
t2 = time.time()
print('Time taken: {:.1f} sec'.format(t2-t1))

Time taken: 1598.8 sec


In [9]:
# train-test splitting of data
# or only train data if test data is unseen

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =\
train_test_split(news['text'], news['label'],test_size=0.2, random_state=101)

### Cross-validation with default hyperparameters

In [10]:
# importing classifiers

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

clf = {'Logistic Regression': LogisticRegression(),
       'Passive-Aggressive': PassiveAggressiveClassifier(),
       'K Neighbors': KNeighborsClassifier(), 
       'Support Vector': SVC(), 
       'Multinomial NB': MultinomialNB(),
       'Decision Tree': DecisionTreeClassifier(), 
       'Random Forest': RandomForestClassifier(), 
       'XG Boost': XGBClassifier(use_label_encoder=False, eval_metric='auc')}

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import time

def cross_val_report(clf_,X_,y_,cv_):
    '''This function gives cross-validated training accuracy for given classifier'''
    ti = time.time()
    pipeline = Pipeline([('bow',CountVectorizer()), ('tfidf',TfidfTransformer()), ('clf',clf_)])
    acc_list = cross_val_score(pipeline, X_train, y_train, cv=cv_, scoring='accuracy')
    mean_ = round(100*acc_list.mean(),1)
    err_ = round(100*np.sqrt(acc_list.std()/(cv_-1)),1)
    accuracy_ = str(mean_)+' $\pm$ '+str(err_)
    
    tf = time.time()
    time_ = round(tf-ti,1)
    
    return accuracy_, time_

In [12]:
accuracy = []
time_taken = []

for i in range(len(clf)):
    accuracy_, time_ = \
    cross_val_report(list(clf.values())[i],X_train,y_train,10)
    accuracy.append(accuracy_)
    time_taken.append(time_)
    print(list(clf.keys())[i]+' took {} sec'.format(time_))

Logistic Regression took 29.2 sec
Passive-Aggressive took 23.7 sec
K Neighbors took 24.9 sec
Support Vector took 428.4 sec
Multinomial NB took 23.0 sec
Decision Tree took 67.3 sec
Random Forest took 73.1 sec
XG Boost took 302.7 sec


In [13]:
# dataframe for cross validation

cross_val_data = {'Train Accuracy (%)': accuracy, 'Time (sec)': time_taken}
df1 = pd.DataFrame(index=clf.keys(), data=cross_val_data, columns=cross_val_data.keys())

In [14]:
df1

Unnamed: 0,Train Accuracy (%),Time (sec)
Logistic Regression,91.5 $\pm$ 3.8,29.2
Passive-Aggressive,93.6 $\pm$ 3.0,23.7
K Neighbors,84.7 $\pm$ 3.6,24.9
Support Vector,93.1 $\pm$ 3.0,428.4
Multinomial NB,83.2 $\pm$ 3.6,23.0
Decision Tree,81.1 $\pm$ 4.1,67.3
Random Forest,91.1 $\pm$ 3.1,73.1
XG Boost,92.6 $\pm$ 3.7,302.7


### Hyperparameter tuning

In [15]:
# Dictionary of tunable hyperparameters of various classifiers

from scipy.stats import expon

hp = {'K Neighbors': dict(n_neighbors = list(range(1,31)),
                          metric = ['euclidean', 'manhattan', 'minkowski'], 
                          weights = ['uniform', 'distance']),
      'Passive-Aggressive': dict(C = 10.**np.arange(-3, 3),
                                 max_iter = list(range(1_000,10_100,100))),
      'Logistic Regression': dict(C = 10.**np.arange(-3, 3),
                                  penalty = ['l1', 'l2', 'elasticnet']),
      'Decision Tree': dict(criterion = ['gini', 'entropy'],
                            min_samples_leaf = list(range(1,10)),
                            max_depth = list(range(2,14,2))),
      'Support Vector': dict(C = expon(scale=100),      # param distribution for SVC 
                             gamma = expon(scale=.1)),
      'Random Forest': dict(n_estimators = [10,50,100,500,1000],
                            max_features = ['auto', 'sqrt', 'log2']),
      'XG Boost': dict(learning_rate = 0.1*np.arange(1,11), # default = 0.3
                       max_depth = list(range(2,31,2)),  # default = 6
                       min_child_weight = 10.**np.arange(-2,3), # default = 1
                       min_split_loss = 0.1**np.arange(0,5), # default = 0
                       colsample_bytree = 0.1*np.arange(1,11))} # default = 1

In [16]:
# creating a dictionary for best estimators

best_values = {}

In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def best_estimator(clf_name,X_,y_,cv_,rs_=False,n_iter=None):
    '''This function finds best estimator for a given classifier'''
    ti = time.time()
    if rs_:
        grid = RandomizedSearchCV(clf[clf_name], param_distributions=hp[clf_name], 
                                  n_iter=n_iter, scoring='accuracy', n_jobs=-1, cv=cv_)
    else:
        grid = GridSearchCV(clf[clf_name], param_grid=hp[clf_name], 
                        scoring='accuracy', n_jobs=-1, cv=cv_)
    pipeline = Pipeline([('bow',CountVectorizer()),('tfidf',TfidfTransformer()), 
                     ('grid',grid)])
    pipeline.fit(X_,y_)
    
    best_score = round(100*grid.best_score_,1)
    best_pipeline = Pipeline([('bow',CountVectorizer()),('tfidf',TfidfTransformer()), 
                           ('best_est',grid.best_estimator_)])
    
    tf = time.time()
    time_ = round(tf-ti,1)
    
    best_values[clf_name] = [best_score, best_pipeline, time_]
    
    print(clf_name+' took '+str(time_)+' sec; best train accuracy: '+str(best_score)+'%')

In [18]:
# hyperparameter tuning for Logistic Regression

best_estimator('Logistic Regression',X_train,y_train,cv_=10)

Logistic Regression took 21.0 sec; best train accuracy: 93.7%


In [19]:
# hyperparameter tuning for Passive-Aggressive

best_estimator('Passive-Aggressive',X_train,y_train,cv_=10)

Passive-Aggressive took 610.5 sec; best train accuracy: 94.1%


In [20]:
# hyperparameter tuning for K Neighbors

best_estimator('K Neighbors',X_train,y_train,cv_=10)

K Neighbors took 1287.2 sec; best train accuracy: 86.1%


In [21]:
# hyperparameter tuning for Support vector

best_estimator('Support Vector',X_train,y_train,cv_=10,rs_=True,n_iter=10)

Support Vector took 1106.1 sec; best train accuracy: 93.9%


In [22]:
# hyperparameter tuning for Decision Tree

best_estimator('Decision Tree',X_train,y_train,cv_=10)

Decision Tree took 637.5 sec; best train accuracy: 82.2%


In [23]:
# hyperparameter tuning for Random Forest

best_estimator('Random Forest',X_train,y_train,cv_=10)

Random Forest took 1322.9 sec; best train accuracy: 91.5%


In [24]:
# hyperparameter tuning for XG Boost

best_estimator('XG Boost',X_train,y_train,cv_=10,rs_=True,n_iter=15)

XG Boost took 2556.9 sec; best train accuracy: 92.7%


In [98]:
# dataframe for best training scores

def best_list(index):
    return [list(best_values.values())[i][index] for i in range(len(best_values))]
        
best_new = {'Best Score (%)': best_list(0), 'Search Time (sec)': best_list(2)}
df2 = pd.DataFrame(index=best_values.keys(), data=best_new, columns=best_new.keys())

# inclusion of default performance
df_best = pd.concat([df2,df1['Train Accuracy (%)'].apply(lambda i: float(i[0:4]))], axis=1)
df_best.rename(columns={'Train Accuracy (%)':'Default Accuracy (%)'}, inplace=True)

In [99]:
df_best

Unnamed: 0,Best Score (%),Search Time (sec),Default Accuracy (%)
Logistic Regression,93.7,21.0,91.5
Passive-Aggressive,94.1,610.5,93.6
K Neighbors,86.1,1287.2,84.7
Support Vector,93.9,1106.1,93.1
Decision Tree,82.2,637.5,81.1
Random Forest,91.5,1322.9,91.1
XG Boost,92.7,2556.9,92.6
Multinomial NB,,,83.2


### Performance on test data

In [53]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

def test_result(clf_name,X_,y_,X_new,y_new):
    '''This function evaluates various performance metrics on test data'''
    if clf_name in best_values.keys():
        best_pipeline = best_values[clf_name][1]
    elif clf_name in clf.keys():
        best_pipeline =\
        Pipeline([('bow',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',clf[clf_name])])
    
    best_pipeline.fit(X_,y_)
    y_pred = best_pipeline.predict(X_new)
    
    accuracy = round(accuracy_score(y_new,y_pred),3)
    precision = round(precision_score(y_new,y_pred),3)
    recall = round(recall_score(y_new,y_pred),3)
    f1 = round(f1_score(y_new,y_pred),3)
    
    return accuracy, precision,recall, f1

In [94]:
# create a dataframe of performance metrics

metrics = {'Accuracy': [], 'Precision': [], 'Recall': [], 'f1 Score': []}
for estimator in df_best.index:
    accuracy,precision,recall,f1 = test_result(estimator,X_train,y_train,X_test,y_test)
    metrics['Accuracy'].append(accuracy)
    metrics['Precision'].append(precision)
    metrics['Recall'].append(recall)
    metrics['f1 Score'].append(f1)

df_result = pd.DataFrame(index=df_best.index, data=metrics, columns=metrics.keys())

In [95]:
df_result

Unnamed: 0,Accuracy,Precision,Recall,f1 Score
Logistic Regression,0.94,0.914,0.967,0.94
Passive-Aggressive,0.942,0.926,0.956,0.941
K Neighbors,0.867,0.926,0.79,0.852
Support Vector,0.935,0.91,0.961,0.935
Decision Tree,0.802,0.767,0.85,0.806
Random Forest,0.914,0.912,0.91,0.911
XG Boost,0.923,0.917,0.923,0.92
Multinomial NB,0.846,0.975,0.7,0.815


Passive Aggressive Classifier is the best performer.
Liner Regression is the second best.

### Voting classifier of 3 estimators

In [82]:
from sklearn.ensemble import VotingClassifier

def voting_(voter_list,vote_type):
    '''This function creates a voting classifier of 3 estimators,
    which is subsequently trained and tested'''
    
    # creating voting classifier
    estimator_list = []
    for clf_name in voter_list:
        if clf_name in best_values.keys():
            estimator_list.append((clf_name,best_values[clf_name][1]))
        else:
            pipeline =\
            Pipeline([('bow',CountVectorizer()),('tfidf',TfidfTransformer()),
                      ('clf',clf[clf_name])])
            estimator_list.append((clf_name,pipeline))
    voting_clf = VotingClassifier(estimators=estimator_list,voting=vote_type)
    
    # training and testing
    voting_clf.fit(X_train,y_train)
    y_pred = voting_clf.predict(X_test)
    
    # evaluation of metrics
    accuracy = round(accuracy_score(y_test,y_pred),3)
    precision = round(precision_score(y_test,y_pred),3)
    recall = round(recall_score(y_test,y_pred),3)
    f1 = round(f1_score(y_test,y_pred),3)
    
    return accuracy, precision, recall, f1

Choose the bottom 3 performers: 'K Neighbors', 'Decision Tree', 'Multinomial NB'

In [96]:
voter_list = ['K Neighbors', 'Decision Tree', 'Multinomial NB']

# create a dataframe of performance metrics

voting_metrics = {'Accuracy': [], 'Precision': [], 'Recall': [], 'f1 Score': []}
for vote_type in ['hard','soft']:
        accuracy, precision, recall, f1 = voting_(voter_list,vote_type)
        voting_metrics['Accuracy'].append(accuracy)
        voting_metrics['Precision'].append(precision)
        voting_metrics['Recall'].append(recall)
        voting_metrics['f1 Score'].append(f1)

df_voting = pd.DataFrame(index=['Hard Voting','Soft Voting'],
                         data=voting_metrics, columns=voting_metrics.keys())

In [97]:
df_voting

Unnamed: 0,Accuracy,Precision,Recall,f1 Score
Hard Voting,0.897,0.964,0.818,0.885
Soft Voting,0.894,0.936,0.839,0.885


Voting classifier has much more improved predictions than individual voters