# Fake News Prediction using Machine Learning

## Downloading and Exploring a Kaggle Dataset

In [1]:
import opendatasets as od

In [2]:
dataset="https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification"

In [3]:
od.download(dataset)

Skipping, found downloaded files in ".\fake-news-classification" (use force=True to force download)


In [4]:
import os

In [5]:
data="fake-news-classification"

In [6]:
os.listdir(data)

['.ipynb_checkpoints', 'WELFake_Dataset.csv']

## Required Libraries For Fake News Prediction Model

In [7]:
import numpy as np 
import pandas as pd
# re=regular expression, it is used for finding a text in a paragraph or a document
import re
# nltk= natural language tool kit, which is used find stopwords like (is,an,the,me,myself....)
from nltk.corpus import stopwords
# porterStemmer which is used to find the rootword like (actress,actor,acting - act) reducing a word to rootword
from nltk.stem.porter import PorterStemmer
# TfidfVectorizer is used for converting Text to numeric.Tf is for calculating (n/N) which tell you most repeated words to detect
# Idf is for log(N/n) which gives the more value for less repeated words
from sklearn.feature_extraction.text import TfidfVectorizer
# which is used to split the data in train and test
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# to find the accuracy score
from sklearn.metrics import accuracy_score

## Data Loading and Preprocessing

In [8]:
df= pd.read_csv("fake-news-classification/WELFake_Dataset.csv")

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [10]:
df.drop("Unnamed: 0",axis=1)

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...
72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [11]:
df.shape

(72134, 4)

In [12]:
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [13]:
df[df.isnull().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,title,text,label
1,1,,Did they post their votes for Hillary already?,1
43,43,,True. Hillary needs a distraction and what bet...,1
162,162,,All eyes on Electoral delegates. The People kn...,1
185,185,,Cool,1
269,269,,A leading US senator: US Supporting War in Syr...,1
...,...,...,...,...
71484,71484,,Another Arab supremacist masturbation fantasy....,1
71521,71521,,I'm sure they drastically changed accounting m...,1
71540,71540,,It's easy to imagine Obama or Kerry pissing hi...,1
71570,71570,,Ever since the powers to be assassinated JFK A...,1


In [14]:
df.fillna(" ",inplace=True)

In [15]:
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [16]:
content=df["title"]+" "+df["text"]

In [17]:
content

0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1           Did they post their votes for Hillary already?
2        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3        Bobby Jindal, raised Hindu, uses story of Chri...
4        SATAN 2: Russia unvelis an image of its terrif...
                               ...                        
72129    Russians steal research on Trump in hack of U....
72130     WATCH: Giuliani Demands That Democrats Apolog...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    Trump tussle gives unpopular Mexican leader mu...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Length: 72134, dtype: object

## Feature Engineering

In [25]:
x= content
y=df["label"]

In [26]:
import nltk

In [27]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PHOENIX\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [29]:
porter_stem= PorterStemmer()

In [30]:
def stemming(content):
    # Remove non-alphabetic characters and replace them with spaces
    stemmed_content=re.sub('[^a-zA-Z]',' ', content)
    # covert the text to lower case
    stemmed_content=stemmed_content.lower()
    # split the text into individual words
    stemmed_content=stemmed_content.split()
    # perform stemming on each word using a stemming algorithm
    stemmed_content=[porter_stem.stem(word) for word in stemmed_content if not word in stopwords.words("english")]
    # join the stemmed words back into a single string
    stemmed_content= " ".join(stemmed_content)
    # return the stemmed content
    return stemmed_content

In [None]:
x= x.apply(stemming)

In [80]:
tfidf_vectorizer= TfidfVectorizer(stop_words="english")

In [81]:
model= tfidf_vectorizer.fit_transform(x)

In [82]:
model

<72134x244131 sparse matrix of type '<class 'numpy.float64'>'
	with 14114626 stored elements in Compressed Sparse Row format>

In [83]:
print(x)

0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1           Did they post their votes for Hillary already?
2        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3        Bobby Jindal, raised Hindu, uses story of Chri...
4        SATAN 2: Russia unvelis an image of its terrif...
                               ...                        
72129    Russians steal research on Trump in hack of U....
72130     WATCH: Giuliani Demands That Democrats Apolog...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    Trump tussle gives unpopular Mexican leader mu...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Length: 72134, dtype: object


In [84]:
print(model)

  (0, 135758)	0.013090502286751153
  (0, 185703)	0.018820280917057716
  (0, 67059)	0.02101346936420286
  (0, 36162)	0.04756187497559437
  (0, 133559)	0.03692124076047333
  (0, 212314)	0.04418086928888125
  (0, 149525)	0.024600038616467205
  (0, 19415)	0.022878534730813062
  (0, 149571)	0.02927971702118056
  (0, 142237)	0.03572362193056873
  (0, 159438)	0.043802318914474614
  (0, 43687)	0.02768888650006659
  (0, 13634)	0.05797284915269563
  (0, 216076)	0.02076285005137679
  (0, 89746)	0.01626078868521928
  (0, 115436)	0.06741299921497354
  (0, 137448)	0.06741299921497354
  (0, 124008)	0.06741299921497354
  (0, 178212)	0.04193443014012317
  (0, 92421)	0.051150393868900815
  (0, 109057)	0.037152586043446865
  (0, 109276)	0.037906990632699374
  (0, 111054)	0.038062377071206584
  (0, 202296)	0.04028524420556936
  (0, 151800)	0.018232642694925492
  :	:
  (72133, 165316)	0.06814758746308489
  (72133, 37452)	0.039747574400063775
  (72133, 154292)	0.028224092986985333
  (72133, 1)	0.02283311457

## Model Selection

In [85]:
x= model
y= df["label"]

In [86]:
x_train,x_test,y_train,y_test= train_test_split(x,y,stratify=y,test_size=0.1,random_state=2)

In [87]:
print(x.shape,x_train.shape,x_test.shape)

(72134, 244131) (64920, 244131) (7214, 244131)


## Model Training

In [88]:
LR= LogisticRegression()

In [89]:
LR.fit(x_train,y_train)

## Model Evaluation

In [90]:
xtrain_predict= LR.predict(x_train)

In [91]:
xtrain_accuracy= accuracy_score(y_train,xtrain_predict)

In [92]:
xtrain_accuracy

0.9643561306223044

In [93]:
xtest_predict= LR.predict(x_test)

In [94]:
xtest_accuracy= accuracy_score(y_test,xtest_predict)

In [95]:
xtest_accuracy

0.9503742722484059

In [101]:
input_data=("Bobby Jindal, raised Hindu, uses story of Christian conversion to woo evangelicals for potential 2016 bid","A dozen politically active pastors came here for a private dinner Friday night to hear a conversion story unique in the context of presidential politics: how Louisiana Gov. Bobby Jindal traveled from Hinduism to Protestant Christianity and, ultimately, became what he calls an “evangelical Catholic.”)
transform= tfidf_vectorizer.transform(input_data)
predict= LR.predict(input_data)
print(predict)

SyntaxError: unterminated string literal (detected at line 1) (2384629472.py, line 1)

In [None]:
transform= tfidf_vectorizer.transform(input_data)