# Fake News Classifier

## Libraries used

In [3]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from time import time

## Read the data

In [4]:
#Read the fake data
df_fake=pd.read_csv('Fake.csv')
df_fake['label'] = 'Fake'
#Get shape and head
print(df_fake.shape)
df_fake.head()

(23481, 5)


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Fake


In [5]:
#Read the true data
df_true=pd.read_csv('True.csv')
df_true['label'] = 'True'
#Get shape and head
print(df_true.shape)
df_true.head()
len(df_true['label'])

(21417, 5)


21417

### Concat the two dataframes

In [72]:
data = [df_fake, df_true]

df_all = pd.concat(data)

df_all['label'] = df_all.label.map({'True': 1, 'Fake': 0})

print(df_all.shape)
df_all.head()

(44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [83]:
df = df_all.sample(frac=0.1).reset_index(drop=True)
df.head()

Unnamed: 0,title,text,subject,date,label
0,How 'Stop Trump' failed to halt the Republican...,(Reuters) - Republican John Hammond III had ma...,politicsNews,"May 4, 2016",1
1,"After question on foreign meddling in Brexit, ...",LONDON (Reuters) - Britain s democracy is one ...,worldnews,"October 23, 2017",1
2,Puerto Rico votes in favor of U.S. statehood a...,SAN JUAN (Reuters) - The economically struggli...,politicsNews,"June 11, 2017",1
3,YOU’RE FIRED! PRES TRUMP FIRES Obama’s Partisa...,"The acting Attorney General, Sally Yates, has ...",left-news,"Jan 30, 2017",0
4,"Greek police find detonators, bomb making mate...",ATHENS (Reuters) - Greek police found bomb mak...,worldnews,"November 28, 2017",1


In [84]:
df.tail()

Unnamed: 0,title,text,subject,date,label
4485,Nigeria NGOs slam civil society bill as grave ...,ABUJA (Reuters) - A bill proposed by Nigerian ...,worldnews,"December 13, 2017",1
4486,Kuwait's ruler accepts cabinet resignation: st...,KUWAIT (Reuters) - Kuwait s ruling emir accept...,worldnews,"October 30, 2017",1
4487,Russian PM says U.S.-Russia ties at low ebb bu...,MOSCOW (Reuters) - Russian Prime Minister Dmit...,worldnews,"November 30, 2017",1
4488,Kenya High Court rules minor candidate should ...,NAIROBI (Reuters) - The Kenyan election board ...,worldnews,"October 11, 2017",1
4489,WATCH Rep. Steve King’s Bombshell Answer on Wh...,Rep. Steve King of Iowa was being interviewed ...,politics,"Jul 20, 2017",0


In [85]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4490 entries, 0 to 4489
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    4490 non-null   object
 1   text     4490 non-null   object
 2   subject  4490 non-null   object
 3   date     4490 non-null   object
 4   label    4490 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 175.5+ KB


(4490, 5)

## Data pre-processing
### Steps:
#### 1. Remove Blank rows in Data, if any
#### 2. Change all the text to lower case
#### 3. Remove Stop words
#### 4. Word Tokenization
#### 5. Word Lemmatization

In [78]:
import nltk
#nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn

In [86]:
# 1) remove blank rows in data, if any
t = time()
df['text'].dropna(inplace=True)
test_time1 = time() - t
print("test time:  %0.3fs" % test_time1)
text.head()

test time:  0.003s


0    ankara reuters  turkey said on monday its form...
1    massive protests have been taking place boston...
2    while it s exciting to see a great television ...
3    trump says he s never talked to putin and the ...
4     disrupt the inauguration the majority have sp...
Name: text, dtype: object

In [87]:
# 2) change all the text to lower case (this is required as python interprets 'dog' and 'DOG' differently) and remove punctuations.
t = time()
df['text'] = [str(entry).lower() for entry in df['text']]
df['text'] = df.text.str.replace('[^\w\s]', '')
df['text'] = df.text.str.replace('br', '')
test_time2 = time() - t
print("test time:  %0.3fs" % test_time2)
text = df.text
text.head()

  df['text'] = df.text.str.replace('[^\w\s]', '')


test time:  1.183s


0    reuters  republican john hammond iii had made ...
1    london reuters  itain s democracy is one of th...
2    san juan reuters  the economically struggling ...
3    the acting attorney general sally yates has be...
4    athens reuters  greek police found bomb making...
Name: text, dtype: object

In [94]:
# 3) remove Stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
t = time()
df['text_without_stopwords'] = df['text'].apply(lambda x: ' '.join([item for item in x.split() if item not in (stop)]))
test_time3 = time() - t
print("test time:  %0.3fs" % test_time)
text_without_stopwords = df.text_without_stopwords
text_without_stopwords.head()

test time:  4.464s


0    reuters republican john hammond iii made secre...
1    london reuters itain democracy one secure worl...
2    san juan reuters economically struggling us is...
3    acting attorney general sally yates betrayed d...
4    athens reuters greek police found bomb making ...
Name: text_without_stopwords, dtype: object

In [95]:
# 4)word tokenization
t = time()
df['text_tokenized'] = df['text_without_stopwords'].apply(word_tokenize)
test_time4 = time() - t
print("test time:  %0.3fs" % test_time4)
text_tokenized = df['text_tokenized']
text_tokenized.head

test time:  5.457s


<bound method NDFrame.head of 0       [reuters, republican, john, hammond, iii, made...
1       [london, reuters, itain, democracy, one, secur...
2       [san, juan, reuters, economically, struggling,...
3       [acting, attorney, general, sally, yates, betr...
4       [athens, reuters, greek, police, found, bomb, ...
                              ...                        
4485    [abuja, reuters, bill, proposed, nigerian, law...
4486    [kuwait, reuters, kuwait, ruling, emir, accept...
4487    [moscow, reuters, russian, prime, minister, dm...
4488    [nairobi, reuters, kenyan, election, board, in...
4489    [rep, steve, king, iowa, interviewed, tucker, ...
Name: text_tokenized, Length: 4490, dtype: object>

In [110]:
# 5) word lemmatization using POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

def lem(sentence):
    lemmatized_text = []
    for i in sentence:
        lemmatized_text.append(lemmatizer.lemmatize(i, get_wordnet_pos(i)))
    return ' '.join(lemmatized_text)

#sentence = text_tokenized[0]
#print(sentence)
#print(lem(sentence))
#df['text_lemmatized'] = df.apply(lambda row: lem(row['text_tokenized']), axis=1)
#facts.apply(lambda row: final_pop(row['population'],row['population_growth']),axis=1)
t = time()
df['text_lemmatized'] = df.apply(lambda row: lem(row['text_tokenized']),axis=1)
test_time5= time() - t
print("test time:  %0.3fs" % test_time5)
#text_lemmatized = df['text_lemmatized']
#print(text_lemmatized.head)


test time:  204.897s


## The final text to be used is defined here. 
### We might use either the lemmatized one or the non-stopwords or the initial and compare the effectiveness of them.

In [125]:
df['text_final'] = df['text_lemmatized']

### Split the dataset into training and testing sets

In [126]:
#check the labels that are mixed
labels = df.label
labels.head()

0    1
1    1
2    1
3    0
4    1
Name: label, dtype: int64

In [127]:
x_train,x_test,y_train,y_test=train_test_split(df['text_final'], labels, test_size=0.2, random_state=7)

### Initialize the TF-IDF vectorizer

##### We initialize a TfidfVectorizer with stop words from the English language and a maximum document frequency of 0.7 (terms with a higher document frequency will be discarded). Stop words are the most common words in a language that are to be filtered out before processing the natural language data. And a TfidfVectorizer turns a collection of raw documents into a matrix of TF-IDF features.

In [128]:
# Initialise the vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(x_train) 
tfidf_test = tfidf_vectorizer.transform(x_test)

In [129]:
t = time()
tfidf_train_dense = tfidf_train.toarray()
tfidf_test_dense = tfidf_test.toarray()
test_time_tfidf = time() - t
print("test time:  %0.3fs" % test_time_tfidf)

test time:  1.498s


### Initialize a PassiveAggressiveClassifier

#### The Passive Aggressive Classifier is set to be the base which will be later compared to other classifiers such as Naive Bayes, SVM, Logistic Regression, Stochastic Gradient Descent and finally the Random Forest Classifier.

In [130]:
t = time()
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
#Predict on the test set and calculate accuracy
y_pred_pac=pac.predict(tfidf_test)
score_pac=accuracy_score(y_test,y_pred_pac)
print(f'Accuracy: {round(score_pac*100,2)}%')
#Check the test time
test_time_PAC = time() - t
print("test time:  %0.3fs" % test_time_PAC)

Accuracy: 97.55%
test time:  0.236s


##### print out a confusion matrix to gain insight into the number of false and true negatives and positives.

In [131]:
#Build confusion matrix
cm_pac = confusion_matrix(y_test,y_pred_pac, labels=[0,1])
print(cm_pac)

[[474  12]
 [ 10 402]]


In [132]:
print('so with this model, we have', cm_pac[0,0],'true positives',cm_pac[1,1],'true negatives,',cm_pac[1,0],'false positives, and',cm_pac[0,1],'false negatives.')


so with this model, we have 474 true positives 402 true negatives, 10 false positives, and 12 false negatives.


### Initialize a Naive Bayes classifier

In [133]:
#Import the GNB Classifier
from sklearn.naive_bayes import GaussianNB
t = time()
#Initialize a Naive Bayes Classifier
gnb = GaussianNB()
gnb.fit(tfidf_train_dense,y_train)
#Predict on the test set and calculate accuracy
y_pred_gnb = gnb.predict(tfidf_test_dense)
score_gnb=accuracy_score(y_test,y_pred_gnb)
print(f'Accuracy: {round(score_gnb*100,2)}%')
#Check the test time
test_time_NBC = time() - t
print("test time:  %0.3fs" % test_time_NBC)

Accuracy: 81.51%
test time:  17.878s


### Initialize an SVM classifier

In [135]:
#Import the SVM Classifier
from sklearn import svm
t = time()
#Initialize the SVM Classifier
svm = svm.SVC(C=1.0, kernel='linear', gamma='auto')
svm.fit(tfidf_train,y_train)
#Predict on the test set and calculate accuracy
y_pred_svm = svm.predict(tfidf_test)
score_svm = accuracy_score(y_test,y_pred_svm)
print(f'Accuracy: {round(score_svm*100,2)}%')
#Check the test time
test_time_SVM = time() - t
print("test time:  %0.3fs" % test_time_SVM)

Accuracy: 97.22%
test time:  11.474s


### Initialize a Logistic Regression classifier

In [136]:
#Import the Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
t = time()
#Initialize the Logistic Regression Classifier
log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
log_reg.fit(tfidf_train,y_train)
#Predict on the test set and calculate accuracy
y_pred_logreg = log_reg.predict(tfidf_test)
score_logreg = accuracy_score(y_test,y_pred_logreg)
print(f'Accuracy: {round(score_logreg*100,2)}%')
#Check the test time
test_time_LogReg = time() - t
print("test time:  %0.3fs" % test_time_LogReg)

[LibLinear]iter  1 act 8.354e+03 pre 7.452e+03 delta 3.903e+01 f 1.245e+04 |g| 7.224e+02 CG   5
iter  2 act 9.135e+02 pre 7.767e+02 delta 3.903e+01 f 4.095e+03 |g| 1.631e+02 CG   4
iter  3 act 1.205e+02 pre 1.096e+02 delta 3.903e+01 f 3.181e+03 |g| 5.091e+01 CG   4
iter  4 act 6.202e+00 pre 6.094e+00 delta 3.903e+01 f 3.061e+03 |g| 8.499e+00 CG   5
iter  5 act 7.448e-02 pre 7.449e-02 delta 3.903e+01 f 3.055e+03 |g| 7.466e-01 CG   6
Accuracy: 96.1%
test time:  0.687s


### Initialize a Random Forest classifier

In [137]:
#Import the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
t = time()
#Initialize the Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
random_forest.fit(tfidf_train,y_train)
#Predict on the test set and calculate accuracy
y_pred_randomforest=random_forest.predict(tfidf_test)
score_randomforest = accuracy_score(y_test,y_pred_randomforest)
print(f'Accuracy: {round(score_randomforest*100,2)}%')
#Check the test time
test_time_RandomForest = time() - t
print("test time:  %0.3fs" % test_time_RandomForest)

Accuracy: 93.21%
test time:  1.118s


### Initialize a Stochastic Gradient Descent classifier