Dataset - https://www.kaggle.com/code/barkhaverma/fake-news-detection

In [1]:
# Description: This program detects real (0) and fake (1) news

In [2]:
# Import the libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [3]:
# Load the data
from google.colab import files
files.upload()

Saving Fake_News.csv to Fake_News.csv


In [4]:
df = pd.read_csv('Fake_News.csv')

In [5]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
df.shape

(20800, 5)

In [8]:
df.drop_duplicates(inplace=True)
df.shape

(20800, 5)

In [9]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [10]:
df.dropna(axis=0,inplace=True)
df.shape

(18285, 5)

In [12]:
#Combine important columns
df['combined'] = df['author'] + '' +df['title']
df.head()

Unnamed: 0,id,title,author,text,label,combines,combined
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell LucusHouse Dem Aide: We Didn’t Even Se...,Darrell LucusHouse Dem Aide: We Didn’t Even Se...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. FlynnFLYNN: Hillary Clinton, Big Wom...","Daniel J. FlynnFLYNN: Hillary Clinton, Big Wom..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.comWhy the Truth Might Get You ...,Consortiumnews.comWhy the Truth Might Get You ...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss15 Civilians Killed In Single U...,Jessica Purkiss15 Civilians Killed In Single U...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard PortnoyIranian woman jailed for fiction...,Howard PortnoyIranian woman jailed for fiction...


In [13]:
# now i'll download some stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
# I'm going to remove punctuation from the text - nopunc = [char for char in text not in string.punctuation]
# I'm returning the characters that are not in "string.punctuation"
# this should only return the words that are not in stopwords - clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
def process_text(text):
  nopunc = [char for char in text if char not in string.punctuation]
  nopunc = ''.join(nopunc)

  clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

  return clean_words

In [18]:
df['combined'].head().apply(process_text)

# this shows first five rows of the data
# these words are all clean. so that this list don't contain any "stopwords" or "punctuation"

0    [Darrell, LucusHouse, Dem, Aide, Didn’t, Even,...
1    [Daniel, J, FlynnFLYNN, Hillary, Clinton, Big,...
2     [ConsortiumnewscomWhy, Truth, Might, Get, Fired]
3    [Jessica, Purkiss15, Civilians, Killed, Single...
4    [Howard, PortnoyIranian, woman, jailed, fictio...
Name: combined, dtype: object

In [19]:
# now I'm going to convert the text to a matrix of token counts.
# to do that I need another library,
# we haven't applied the function to every single text and combined yet
# so, we are going to make process_text the analyzer for converting these strings into integer counts.

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
message_bow = CountVectorizer(analyzer=process_text).fit_transform(df['combined'])

In [22]:
# split the data into 80% training data sets and 20% testing data sets.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(message_bow, df['label'], test_size=0.20, random_state=0)

In [23]:
# now I want to shape of the "message_bow"

message_bow.shape

(18285, 45190)

In [24]:
# message bow contains same number of rows of our dataset.
# the column number has increased to 45190. That is because of the number of words in the dataset. Each column is a different word.

In [27]:
# lets create and train a naive bayes classifier model

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [28]:
print(classifier.predict(X_train))
print(y_train.values)

[1 1 1 ... 1 0 0]
[1 1 1 ... 1 0 0]


In [29]:
from sklearn.metrics import classification_report
pred = classifier.predict(X_train)
print(classification_report(y_train,pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8321
           1       1.00      0.97      0.98      6307

    accuracy                           0.99     14628
   macro avg       0.99      0.98      0.99     14628
weighted avg       0.99      0.99      0.99     14628



In [30]:
# now i want to see how well the dataset did on the test dataset. which should be data that it has never seen before.

from sklearn.metrics import classification_report
pred = classifier.predict(X_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      2040
           1       0.99      0.88      0.93      1617

    accuracy                           0.94      3657
   macro avg       0.95      0.93      0.94      3657
weighted avg       0.95      0.94      0.94      3657



In [31]:
# now it's visible that accuracy has decreased to 94% and we are working on 3657 rows

In [32]:
# reference - https://www.youtube.com/watch?v=Ns2Adgcd3yE&list=PLBhJnyA0V0uKUX93csc8u3uVsYqISJnsv&index=40