In [10]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

%matplotlib inline

In [11]:
news = pd.read_csv('news.csv',index_col=0)

In [12]:
news.head()

Unnamed: 0,title,text,label
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Data pre-processing 

In [13]:
# merging title and text into one text
news['text'] = news['title']+' '+news['text']
del news['title']

# label conversion
news['label'] = news['label'].map({'REAL': 0,'FAKE': 1})

In [14]:
news.head()

Unnamed: 0,text,label
8476,You Can Smell Hillary’s Fear Daniel Greenfield...,1
10294,Watch The Exact Moment Paul Ryan Committed Pol...,1
3608,Kerry to go to Paris in gesture of sympathy U....,0
10142,Bernie supporters on Twitter erupt in anger ag...,1
875,The Battle of New York: Why This Primary Matte...,0


In [15]:
news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6335 entries, 8476 to 4330
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6335 non-null   object
 1   label   6335 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 148.5+ KB


In [17]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

def text_preprocess(text):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns cleaned text in as string
    """
    # Check characters to see if they are in punctuation
    no_punc = [char for char in text if char not in string.punctuation]

    # Join the characters again to form the string
    no_punc = ''.join(no_punc)
    
    # Now just remove any stopwords
    no_stop = [word for word in no_punc.split() if word.lower() not in stopwords.words('english')]
    
    # Join the characters again to form the string
    no_stop = ' '.join(no_stop)
    
    return no_stop

[nltk_data] Downloading package stopwords to C:\Users\Rakibur
[nltk_data]     Rahman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# text pre-processing is being executed 

t1 = time.time()
news['text'] = news['text'].apply(text_preprocess)
t2 = time.time()
print('Time taken: {:.1f} sec'.format(t2-t1))

Time taken: 2018.0 sec


In [91]:
# train-test splitting of data
# or only train data if test data is unseen

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =\
train_test_split(news['text'], news['label'],test_size=0.2, random_state=101)

### Cross-validation with default hyperparameters

In [23]:
# importing classifiers

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

clf = {'Logistic Regression': LogisticRegression(),
       'Passive-Aggressive': PassiveAggressiveClassifier(),
       'K Neighbors': KNeighborsClassifier(), 
       'Support Vector': SVC(), 
       'Multinomial NB': MultinomialNB(),
       'Decision Tree': DecisionTreeClassifier(), 
       'Random Forest': RandomForestClassifier(), 
       'XG Boost': XGBClassifier(use_label_encoder=False, eval_metric='auc')}

In [79]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import time

def cross_val_report(clf_,X_,y_,cv_):
    '''This function gives cross-validated training accuracy for given classifier'''
    ti = time.time()
    pipeline = Pipeline([('bow',CountVectorizer()), ('tfidf',TfidfTransformer()), ('clf',clf_)])
    acc_list = cross_val_score(pipeline, X_train, y_train, cv=cv_, scoring='accuracy')
    mean_ = round(100*acc_list.mean(),1)
    err_ = round(100*np.sqrt(acc_list.std()/(cv_-1)),1)
    accuracy_ = str(mean_)+' $\pm$ '+str(err_)
    
    tf = time.time()
    time_ = round(tf-ti,1)
    
    return accuracy_, time_

In [46]:
accuracy = []
time_taken = []

for i in range(len(clf)):
    accuracy_, time_ = \
    cross_val_report(list(clf.values())[i],X_train,y_train,10)
    accuracy.append(accuracy_)
    time_taken.append(time_)
    print(list(clf.keys())[i]+' took {} sec'.format(time_))

Logistic Regression took 26.8 sec
Passive-Aggressive took 22.1 sec
K Neighbors took 24.3 sec
Support Vector took 439.0 sec
Multinomial NB took 23.2 sec
Decision Tree took 70.9 sec
Random Forest took 77.0 sec
XG Boost took 322.8 sec


In [194]:
# dataframe for cross validation

cross_val_data = {'Train Accuracy (%)': accuracy, 'Time (sec)': time_taken}
df1 = pd.DataFrame(index=clf.keys(), data=cross_val_data, columns=cross_val_data.keys())

In [195]:
df1

Unnamed: 0,Train Accuracy (%),Time (sec)
Logistic Regression,91.5 $\pm$ 3.8,26.8
Passive-Aggressive,93.8 $\pm$ 2.9,22.1
K Neighbors,84.7 $\pm$ 3.6,24.3
Support Vector,93.1 $\pm$ 3.0,439.0
Multinomial NB,83.2 $\pm$ 3.6,23.2
Decision Tree,81.5 $\pm$ 4.3,70.9
Random Forest,90.0 $\pm$ 2.7,77.0
XG Boost,92.6 $\pm$ 3.7,322.8


### Hyperparameter tuning

In [140]:
# Dictionary of tunable hyperparameters of various classifiers

from scipy.stats import expon

hp = {'K Neighbors': dict(n_neighbors = list(range(1,31)),
                          metric = ['euclidean', 'manhattan', 'minkowski'], 
                          weights = ['uniform', 'distance']),
      'Passive-Aggressive': dict(C = 10.**np.arange(-3, 3),
                                 max_iter = list(range(1_000,10_100,100))),
      'Logistic Regression': dict(C = 10.**np.arange(-3, 3),
                                  penalty = ['l1', 'l2', 'elasticnet']),
      'Decision Tree': dict(criterion = ['gini', 'entropy'],
                            min_samples_leaf = list(range(1,10)),
                            max_depth = list(range(2,14,2))),
      'Support Vector': dict(C = expon(scale=100),      # param distribution for SVC 
                             gamma = expon(scale=.1)),
      'Random Forest': dict(n_estimators = [10,50,100,500,1000],
                            max_features = ['auto', 'sqrt', 'log2']),
      'XG Boost': dict(learning_rate = 0.01*np.arange(1,11),
                       max_depth = list(range(2,31)),
                       min_child_weight = list(range(1,11,2)), 
                       gamma = 0.1**np.arange(0,5),
                       colsample_bytree = 0.1*np.arange(1,11))}

In [88]:
# creating a dictionary for best estimators

best_values = {}

In [130]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def best_estimator(clf_name,X_,y_,cv_,rs_=False,n_iter=None):
    '''This function finds best estimator for a given classifier'''
    ti = time.time()
    if rs_:
        grid = RandomizedSearchCV(clf[clf_name], param_distributions=hp[clf_name], 
                                  n_iter=n_iter, scoring='accuracy', n_jobs=-1, cv=cv_)
    else:
        grid = GridSearchCV(clf[clf_name], param_grid=hp[clf_name], 
                        scoring='accuracy', n_jobs=-1, cv=cv_)
    pipeline = Pipeline([('bow',CountVectorizer()),('tfidf',TfidfTransformer()), 
                     ('grid',grid)])
    pipeline.fit(X_,y_)
    
    best_score = round(100*grid.best_score_,1)
    best_pipeline = Pipeline([('bow',CountVectorizer()),('tfidf',TfidfTransformer()), 
                           ('best_est',grid.best_estimator_)])
    
    tf = time.time()
    time_ = round(tf-ti,1)
    
    best_values[clf_name] = [best_score, best_pipeline, time_]
    
    print(clf_name+' took '+str(time_)+' sec; best train accuracy: '+str(best_score)+'%')

In [93]:
# hyperparameter tuning for Logistic Regression

best_estimator('Logistic Regression',X_train,y_train,cv_=10)

Logistic Regression took 19.0 sec; best train accuracy: 93.7%


In [96]:
# hyperparameter tuning for Passive-Aggressive

best_estimator('Passive-Aggressive',X_train,y_train,cv_=10)

Passive-Aggressive took 545.4 sec; best train accuracy: 94.1%


In [97]:
# hyperparameter tuning for K Neighbors

best_estimator('K Neighbors',X_train,y_train,cv_=10)

K Neighbors took 1336.0 sec; best train accuracy: 86.1%


In [103]:
# hyperparameter tuning for Support vector

best_estimator('Support Vector',X_train,y_train,cv_=10,rs_=True,n_iter=10)

Support Vector took 967.9 sec; best train accuracy: 94.0%


In [141]:
# hyperparameter tuning for Decision Tree

best_estimator('Decision Tree',X_train,y_train,cv_=10)

Decision Tree took 586.8 sec; best train accuracy: 82.2%


In [101]:
# hyperparameter tuning for Random Forest

best_estimator('Random Forest',X_train,y_train,cv_=10)

Random Forest took 1324.9 sec; best train accuracy: 91.6%


In [105]:
# hyperparameter tuning for XG Boost

best_estimator('XG Boost',X_train,y_train,cv_=10,rs_=True,n_iter=10)

XG Boost took 1840.8 sec; best train accuracy: 93.0%


In [248]:
# dataframe for best training scores
        
best_score_list = [list(best_values.values())[i][0] for i in range(len(best_values))]
time_list = [list(best_values.values())[i][2] for i in range(len(best_values))]

best_new = {'Best Score (%)': best_score_list, 'Search Time (sec)': time_list}
df2 = pd.DataFrame(index=best_values.keys(), data=best_new, columns=best_new.keys())

# inclusion of default performance
df_best = pd.concat([df2,df1['Train Accuracy (%)'].apply(lambda i: float(i[0:4]))], axis=1)
df_best.rename(columns={'Train Accuracy (%)':'Default Accuracy (%)'}, inplace=True)

In [249]:
df_best

Unnamed: 0,Best Score (%),Search Time (sec),Default Accuracy (%)
Logistic Regression,93.7,19.0,91.5
Passive-Aggressive,94.1,545.4,93.8
K Neighbors,86.1,1336.0,84.7
Random Forest,91.6,1324.9,90.0
Support Vector,94.0,967.9,93.1
Decision Tree,82.2,586.8,81.5
XG Boost,93.0,1840.8,92.6
Multinomial NB,,,83.2


### Performance on test data

In [197]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

def test_result(clf_name,X_,y_,X_new,y_new):
    '''This function evaluates various performance metrics on test data'''
    if clf_name in best_values.keys():
        best_pipeline = best_values[clf_name][1]
    else:
        best_pipeline =\
        Pipeline([('bow',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',clf[clf_name])])
    
    best_pipeline.fit(X_,y_)
    y_pred = best_pipeline.predict(X_new)
    
    accuracy = round(accuracy_score(y_new,y_pred),3)
    precision = round(precision_score(y_new,y_pred),3)
    recall = round(recall_score(y_new,y_pred),3)
    f1 = round(f1_score(y_new,y_pred),3)
    
    return accuracy, precision,recall, f1

In [210]:
# create a dataframe of performance metrics

metrics = {}
for i in df_best.index:
    accuracy, precision,recall, f1 = test_result(i,X_train,y_train,X_test,y_test)
    metrics[i] = [accuracy, precision,recall, f1]
    
metrics_new = {'Accuracy': [list(metrics.values())[k][0] for k in range(len(df_best.index))],
               'Precision': [list(metrics.values())[k][1] for k in range(len(df_best.index))],
               'Recall': [list(metrics.values())[k][2] for k in range(len(df_best.index))],
               'f1 Score': [list(metrics.values())[k][3] for k in range(len(df_best.index))]}

df_result = pd.DataFrame(index=df_best.index, data=metrics_new, columns=metrics_new.keys())

In [211]:
df_result

Unnamed: 0,Accuracy,Precision,Recall,f1 Score
Logistic Regression,0.94,0.914,0.967,0.94
Passive-Aggressive,0.94,0.926,0.953,0.939
K Neighbors,0.867,0.926,0.79,0.852
Random Forest,0.915,0.911,0.914,0.912
Support Vector,0.938,0.912,0.964,0.937
Decision Tree,0.797,0.763,0.844,0.801
XG Boost,0.928,0.909,0.946,0.927
Multinomial NB,0.846,0.975,0.7,0.815


### Voting classifier of the 5 best estimators

In [283]:
from sklearn.ensemble import VotingClassifier

# picking up top 5 estimators
df_accuracy = df_best['Best Score (%)'].fillna(df_best['Default Accuracy (%)'])
top5 = df_accuracy[df_accuracy>=sorted(list(df_accuracy))[-5]].index

# creating voting classifier and pipeline
top_clf = []
for i in range(5):
    top_clf.append((top5[i],best_values[top5[i]][1]))

voting_clf = VotingClassifier(estimators=top_clf,voting='hard')
voting_pipeline = Pipeline([('bow',CountVectorizer()),('tfidf',TfidfTransformer()),
                            ('voting',voting_clf)])

# training and testing
voting_pipeline.fit(X_train,y_train)
y_pred = voting_pipeline.predict(X_test)

AttributeError: lower not found

In [263]:
# evaluation of metrics
accuracy = round(accuracy_score(y_test,y_pred),3)
precision = round(precision_score(y_test,y_pred),3)
recall = round(recall_score(y_test,y_pred),3)
f1 = round(f1_score(y_test,y_pred),3)

In [121]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

[[602  51]
 [ 22 592]]
