# Fake News Detection System

In [1]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Load the labelled training articles from disk and preview the first rows
df_ip = pd.read_csv('./data/train.csv')
df_ip.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
# Count the missing (null) values in each column of the dataframe
df_ip.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [4]:
# Remove every row that has a missing value and renumber the remaining rows.
# dropna() returns a new frame, so the raw df_ip stays untouched; reset_index()
# keeps the old row labels in an extra 'index' column.
df = df_ip.dropna().reset_index()
df.isnull().sum()

index     0
id        0
title     0
author    0
text      0
label     0
dtype: int64

In [5]:
# Single PorterStemmer instance, used by clean_data() to stem each token
ps = PorterStemmer()

In [6]:
# Normalises raw text for vectorisation: strips everything except letters,
# lower-cases, drops English stop words and stems the remaining words.
def clean_data(df_ip):
    """Return a cleaned, stemmed copy of every text in ``df_ip``.

    For each text, every character that is not an ASCII letter (digits,
    punctuation, special characters) is replaced by a space. The text is then
    lower-cased and split on whitespace; English stop words are removed and
    each remaining word is stemmed with the module-level PorterStemmer ``ps``.

    Parameters
    ----------
    df_ip : iterable of str
        The texts to clean, e.g. a pandas Series or a list. Items are read in
        iteration order, so the Series does not need a 0..n-1 index.

    Returns
    -------
    list of str
        One space-joined string of stemmed words per input text, in order.
    """
    non_letters = re.compile('[^a-zA-Z]')
    # Build the stop-word set once; rebuilding it for every word (as a
    # per-token `set(stopwords.words(...))` would) dominates the run time.
    stop_words = set(stopwords.words('english'))

    result = []
    for text in df_ip:
        words = non_letters.sub(' ', text).lower().split()
        stemmed = [ps.stem(word) for word in words if word not in stop_words]
        result.append(' '.join(stemmed))  # rebuild a sentence from the stemmed words
    return result


In [7]:
# The article text is the only input feature; the label column is the target
X, y = df['text'], df['label']
X.isnull().sum()

0

In [8]:
# Clean every training article with clean_data() and preview the first five results
X_cleaned = clean_data(df['text'])
X_cleaned[:5]

['hous dem aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like also know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review email light unrel case know anthoni weiner sext teenag appar littl thing fact matter c

In [9]:
# Number of labels; should match the number of cleaned texts
y.shape

(18285,)

In [10]:
# Number of cleaned texts; should match the number of labels
len(X_cleaned)

18285

## Applying CountVectorizer to create a Bag of Words

In [11]:
# Bag of Words model: count unigrams, bigrams and trigrams, keeping at most
# 5000 features, then convert the result to a dense array.
cv = CountVectorizer(ngram_range=(1, 3), max_features=5000)
bof = cv.fit_transform(X_cleaned).toarray()

In [12]:
# (documents, features) of the Bag of Words matrix
bof.shape

(18285, 5000)

### get_params()
- Used to obtain the parameters for the model instance you're working with

In [13]:
# Show the full configuration of the CountVectorizer instance
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

## Applying TF-IDF

In [14]:
# TF-IDF features over unigrams, bigrams and trigrams, keeping at most 5000
# features, converted to a dense array.
tfidf_v = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
X_tfidf = tfidf_v.fit_transform(X_cleaned).toarray()

In [15]:
# (documents, features) of the TF-IDF matrix
X_tfidf.shape

(18285, 5000)

## Split the data set into train and test set

In [16]:
# Hold out 33% of the rows for testing. Both feature matrices are split with
# the same settings (same seed, same labels).
split_options = {'test_size': 0.33, 'random_state': 0}
X_BoW_train, X_BoW_test, y_BoW_train, y_BoW_test = train_test_split(
    bof, y, **split_options
)
X_TFIDF_train, X_TFIDF_test, y_TFIDF_train, y_TFIDF_test = train_test_split(
    X_tfidf, y, **split_options
)

## MultinomialNB

In [17]:
# Multinomial Naive Bayes classifier with its default settings
model_MNB = MultinomialNB()

In [18]:
model_MNB.fit(X_BoW_train, y_BoW_train) # Using Bag of Words
model_MNB.fit(X_TFIDF_train, y_TFIDF_train) # Using TF-IDF

In [19]:
y_pred_BoW = model_MNB.predict(X_BoW_test) # Predictions using Bag of Words
y_pred_TFIDF = model_MNB.predict(X_TFIDF_test) # Predictions using TF-IDF

In [20]:
# Score the Bag of Words predictions against the held-out labels
accuracy_BoW = accuracy_score(y_BoW_test, y_pred_BoW)
precision_BoW = precision_score(y_BoW_test, y_pred_BoW, average='binary')
recall_BoW = recall_score(y_BoW_test, y_pred_BoW, average='binary')
f1_BoW = f1_score(y_BoW_test, y_pred_BoW, average='binary')
conf_matrix_BoW = confusion_matrix(y_BoW_test, y_pred_BoW)

# Score the TF-IDF predictions against the held-out labels
accuracy_TFIDF = accuracy_score(y_TFIDF_test, y_pred_TFIDF)
precision_TFIDF = precision_score(y_TFIDF_test, y_pred_TFIDF, average='binary')
recall_TFIDF = recall_score(y_TFIDF_test, y_pred_TFIDF, average='binary')
f1_TFIDF = f1_score(y_TFIDF_test, y_pred_TFIDF, average='binary')
conf_matrix_TFIDF = confusion_matrix(y_TFIDF_test, y_pred_TFIDF)

# Print one report per representation, Bag of Words first
metric_reports = (
    ("Bag of Words Metrics:", accuracy_BoW, precision_BoW, recall_BoW, f1_BoW, conf_matrix_BoW),
    ("\nTF-IDF Metrics:", accuracy_TFIDF, precision_TFIDF, recall_TFIDF, f1_TFIDF, conf_matrix_TFIDF),
)
for heading, accuracy, precision, recall, f1, conf_matrix in metric_reports:
    print(heading)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", conf_matrix)


Bag of Words Metrics:
Accuracy: 0.907705053852527
Precision: 0.9325051759834369
Recall: 0.8510959939531368
F1 Score: 0.8899426990713297
Confusion Matrix:
 [[3226  163]
 [ 394 2252]]

TF-IDF Metrics:
Accuracy: 0.899917149958575
Precision: 0.935580204778157
Recall: 0.828798185941043
F1 Score: 0.8789579158316633
Confusion Matrix:
 [[3238  151]
 [ 453 2193]]


## Passive Aggressive Classifier Algorithm

In [21]:
# Passive Aggressive classifier: at most 1000 iterations, fixed seed
pac = PassiveAggressiveClassifier(max_iter=1000, random_state=42)

In [22]:
# Fit on the Bag of Words training features and predict the held-out rows
# straight away (fit() returns the estimator itself)
y_pred_BoW = pac.fit(X_BoW_train, y_BoW_train).predict(X_BoW_test)

In [23]:
# Refit the same estimator on the TF-IDF training features and predict the
# held-out rows (fit() returns the estimator itself)
y_pred_TFIDF = pac.fit(X_TFIDF_train, y_TFIDF_train).predict(X_TFIDF_test)

In [37]:
# Evaluation metrics for the Bag of Words predictions
accuracy_BoW = accuracy_score(y_BoW_test, y_pred_BoW)
precision_BoW = precision_score(y_BoW_test, y_pred_BoW, average='binary')
recall_BoW = recall_score(y_BoW_test, y_pred_BoW, average='binary')
f1_BoW = f1_score(y_BoW_test, y_pred_BoW, average='binary')
conf_matrix_BoW = confusion_matrix(y_BoW_test, y_pred_BoW)

# Evaluation metrics for the TF-IDF predictions
accuracy_TFIDF = accuracy_score(y_TFIDF_test, y_pred_TFIDF)
precision_TFIDF = precision_score(y_TFIDF_test, y_pred_TFIDF, average='binary')
recall_TFIDF = recall_score(y_TFIDF_test, y_pred_TFIDF, average='binary')
f1_TFIDF = f1_score(y_TFIDF_test, y_pred_TFIDF, average='binary')
conf_matrix_TFIDF = confusion_matrix(y_TFIDF_test, y_pred_TFIDF)

# Label/value pairs for each report, printed in the same order for both
bow_rows = [
    ("Accuracy:", accuracy_BoW),
    ("Precision:", precision_BoW),
    ("Recall:", recall_BoW),
    ("F1 Score:", f1_BoW),
    ("Confusion Matrix:\n", conf_matrix_BoW),
]
tfidf_rows = [
    ("Accuracy:", accuracy_TFIDF),
    ("Precision:", precision_TFIDF),
    ("Recall:", recall_TFIDF),
    ("F1 Score:", f1_TFIDF),
    ("Confusion Matrix:\n", conf_matrix_TFIDF),
]

print("Bag of Words Metrics:")
for metric_label, metric_value in bow_rows:
    print(metric_label, metric_value)

print("\nTF-IDF Metrics:")
for metric_label, metric_value in tfidf_rows:
    print(metric_label, metric_value)


Bag of Words Metrics:
Accuracy: 0.9375310687655344
Precision: 0.9200296186597556
Recall: 0.9391534391534392
F1 Score: 0.9294931737422854
Confusion Matrix:
 [[3173  216]
 [ 161 2485]]

TF-IDF Metrics:
Accuracy: 0.9514498757249379
Precision: 0.9424595712673938
Recall: 0.9470899470899471
F1 Score: 0.9447690857681432
Confusion Matrix:
 [[3236  153]
 [ 140 2506]]


# Using TF-IDF with Passive Aggressive Classifier Algorithm

In [25]:
# Load the articles to classify and preview the first rows
df_pred_ip = pd.read_csv('./data/test.csv')
df_pred_ip.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [26]:
# Count the missing (null) values in each column of the prediction data
df_pred_ip.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [27]:
# Remove every row that has a missing value and renumber the remaining rows.
# dropna() returns a new frame, so df_pred_ip stays untouched; reset_index()
# keeps the old row labels in an extra 'index' column.
df_pred = df_pred_ip.dropna().reset_index()
df_pred.isnull().sum()

index     0
id        0
title     0
author    0
text      0
dtype: int64

In [28]:
# Summarise the columns, non-null counts and dtypes after dropping missing rows
df_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4575 entries, 0 to 4574
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   4575 non-null   int64 
 1   id      4575 non-null   int64 
 2   title   4575 non-null   object
 3   author  4575 non-null   object
 4   text    4575 non-null   object
dtypes: int64(2), object(3)
memory usage: 178.8+ KB


In [29]:
# Clean every article to classify with clean_data() and preview the first five results
X_sample_cleaned = clean_data(df_pred['text'])
X_sample_cleaned[:5]

['palo alto calif year scorn polit process silicon valley leapt fray prospect presid donald j trump push tech commun move beyond tradit role donor embrac new exist agit activist distinguish ventur capit firm emblazon corpor home page earthi epithet one promin tech chieftain say consequ mr trump elect would rang disastr terribl anoth compar dictat nearli tech leader sign open letter decri mr trump campaign anger bigotri quit action peter thiel founder paypal palantir first outsid investor facebook spoke republican convent juli new york time report saturday mr thiel give million support mr trump candidaci even support flee also recent gave million super pac support senat rob portman republican freshman run ohio get involv polit use seen clash silicon valley valu system transform world make problem obsolet solv washington entrepreneur want alien whatev segment custom agre polit retic longer style bunch nerd use lot limelight said dave mcclure investor run tech incub call startup quot grea

In [30]:
# NOTE(review): X is the training text column (df['text']), not the prediction
# sample — presumably a leftover check; confirm whether df_pred['text'] was intended.
X.isnull().sum()

0

In [31]:
# Turn the cleaned text into TF-IDF features with the vectorizer that was
# already fitted on the training text. Use transform(), not fit_transform():
# refitting here would learn a new vocabulary and IDF weights from the unseen
# data, so the feature columns would no longer line up with the ones the
# classifier was trained on.
X_sample_tfidf = tfidf_v.transform(X_sample_cleaned).toarray()

In [32]:
# Predict using the Passive Aggressive Classifier
predicted_values = pac.predict(X_sample_tfidf)

In [33]:
# Peek at the predicted label for the first sample row
predicted_values[0]

0

In [34]:
# Show one article from the prediction set together with its predicted label
sample_idx = 398 # total entries (0,4574)
print("Sample Text:", df_pred['text'][sample_idx])

sample_prediction = predicted_values[sample_idx]
if sample_prediction == 0:
    print("\nPredicted Output: Authhentic News")
elif sample_prediction == 1:
    print("\nPredicted output: Fake News")

Sample Text: Wednesday on CBS’s “The Late Show,” host Stephen Colbert mocked MSNBC’s Rachel Maddow for her lengthy tease a night earlier that led up to the release of President Donald Trump’s 2005 tax forms.  “I hold in my hand something very significant,” Colbert said. “It is a joke  —   a joke that we have confirmed has been heard by Donald Trump. We believe this is the first time any joke dealing with Donald Trump has been released. ” Follow Breitbart. tv on Twitter @BreitbartVideo

Predicted Output: Authhentic News
