<a href="https://colab.research.google.com/github/rahulchavan-07/fake-news-detection-logistic-model/blob/main/fake_news_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Dependencies

In [140]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [141]:
# Fetch the NLTK stopwords corpus (stored under nltk_data) so that
# stopwords.words('english') can be read by the cells below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [142]:
# Show the English stopword list; these are the words the stemming() function
# defined later drops from each document.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data preprocessing

In [143]:
# Load the dataset into a pandas DataFrame.
# NOTE(review): this is a hard-coded absolute path (looks like the Colab
# working directory — confirm); the CSV must exist there before this cell runs.
news_dataset=pd.read_csv('/content/fake_news_dataset.csv')

In [144]:
# (rows, columns) of the freshly loaded frame, as a quick sanity check.
news_dataset.shape

(20000, 7)

In [145]:
# Preview the first rows to see which columns are available and what the
# label values look like.
news_dataset.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [146]:
# Count missing values per column.
news_dataset.isnull().sum()

Unnamed: 0,0
title,0
text,0
date,0
source,1000
author,1000
category,0
label,0


In [147]:
# Replace every null with an empty string (in the recorded run only 'source'
# and 'author' contain nulls). These columns are concatenated as strings
# further down, and a null in any operand would make the whole result null.
news_dataset=news_dataset.fillna('')

In [148]:
# Re-check: every column should now report zero missing values.
news_dataset.isnull().sum()

Unnamed: 0,0
title,0
text,0
date,0
source,0
author,0
category,0
label,0


In [149]:
# Encode the label column: real -> 0, fake -> 1.
# Series.map returns NaN for every value that is not a key of the dict. That
# includes the already-encoded 0/1 values, so running this cell a second time
# would silently turn the whole column into NaN. Fail loudly instead.
LABEL_ENCODING = {'real': 0, 'fake': 1}
encoded_label = news_dataset['label'].map(LABEL_ENCODING)
unmapped_values = news_dataset.loc[encoded_label.isnull(), 'label'].unique()
if len(unmapped_values) > 0:
  raise ValueError(
      f"unexpected values in 'label' column: {list(unmapped_values)!r}; "
      "expected only 'real'/'fake' (was this cell already run?)"
  )
news_dataset['label'] = encoded_label.astype('int64')


In [150]:
# Build one text field per article by joining author, title, category and
# source with single spaces. Note that the article body ('text' column) is
# not part of this combined 'content' field.
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']+' '+news_dataset['category']+' '+news_dataset['source']

In [151]:
# Inspect the combined text before any cleaning or stemming.
print(news_dataset['content'])

0        Paula George Foreign Democrat final. Politics ...
1        Joseph Hill To offer down resource great point...
2        Julia Robinson Himself church myself carry. Bu...
3        Mr. David Foster DDS You unit its should. Scie...
4        Austin Walker Billion believe employee summer ...
                               ...                        
19995       Gary Miles House party born. Entertainment BBC
19996    Maria Mcbride Though nation people maybe price...
19997    Kristen Franklin Yet exist with experience uni...
19998    David Wise School wide itself item. Health Reu...
19999    James Peterson Offer chair cover senior born. ...
Name: content, Length: 20000, dtype: object


In [152]:
# Separating the features and the label.
# NOTE: in the visible cells both X and Y are reassigned after stemming (from
# the 'content' and 'label' columns) before anything reads them, so the values
# assigned here are not the ones the model is trained on.

X=news_dataset.drop(['label'],axis=1)
Y=news_dataset['label']

Stemming (process of reducing a word to its root word)

In [153]:
# Single shared Porter stemmer instance, used by stemming() below.
port_stem=PorterStemmer()

In [154]:
# Built once at definition time. The original evaluated
# stopwords.words('english') for every single token and then scanned the
# resulting list linearly; a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Anything that is not an ASCII letter. The original pattern was '[^a-zA-z]':
# the range 'A-z' also spans the characters [ \ ] ^ _ ` that sit between 'Z'
# and 'a' in ASCII, so those were wrongly kept as part of words.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')


def stemming(content):
  """Normalize a piece of text for vectorization.

  Replaces every non-letter character with a space, lower-cases the text,
  splits it on whitespace, drops English stopwords and reduces each remaining
  token to its Porter stem.

  Args:
    content: the raw text of one document (str).

  Returns:
    The stemmed tokens joined by single spaces; an empty string when no token
    survives the filtering.
  """
  tokens = _NON_LETTERS.sub(' ', content).lower().split()
  # Stopwords are matched on the lower-cased token, before stemming.
  stemmed_tokens = [port_stem.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS]
  return ' '.join(stemmed_tokens)

In [155]:
# Clean and stem every row of the combined text.
# NOTE: this overwrites 'content' in place, so re-running the cell stems the
# already-stemmed text again rather than the original strings.
news_dataset['content']=news_dataset['content'].apply(stemming)

In [156]:
# Inspect the text after cleaning: lower-cased, stopwords removed, stemmed.
print(news_dataset['content'])

0         paula georg foreign democrat final polit ny time
1        joseph hill offer resourc great point polit fo...
2                     julia robinson church carri busi cnn
3                    mr david foster dd unit scienc reuter
4        austin walker billion believ employe summer te...
                               ...                        
19995              gari mile hous parti born entertain bbc
19996    maria mcbride though nation peopl mayb price b...
19997    kristen franklin yet exist experi unit enterta...
19998            david wise school wide item health reuter
19999    jame peterson offer chair cover senior born he...
Name: content, Length: 20000, dtype: object


In [157]:
# Separating the data and the label: X is the stemmed text of each article,
# Y is its encoded label. `.values` extracts the underlying arrays.
X=news_dataset['content'].values
Y=news_dataset['label'].values

In [158]:
# Feature array at this point: one stemmed string per article.
print(X)


['paula georg foreign democrat final polit ny time'
 'joseph hill offer resourc great point polit fox news'
 'julia robinson church carri busi cnn' ...
 'kristen franklin yet exist experi unit entertain bbc'
 'david wise school wide item health reuter'
 'jame peterson offer chair cover senior born health daili news']


In [159]:
# Label array: 0 = real, 1 = fake.
print(Y)

[0 1 1 ... 0 1 1]


In [160]:
# Converting the textual data to numerical data (TF-IDF features).
# NOTE(review): the vectorizer is fitted on every row, including the rows that
# the later train/test split assigns to the test set, so vocabulary and IDF
# weights are learned partly from test data. Splitting first and fitting on
# the training rows only would avoid that leakage.
# NOTE: X is overwritten with the transformed matrix, so this cell cannot be
# re-run on its own without first rebuilding X from the text column.
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X=vectorizer.transform(X)

In [161]:
# X is now a sparse matrix; printing it lists (row, column) coordinates with
# their stored TF-IDF values.
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 185211 stored elements and shape (20000, 2328)>
  Coords	Values
  (0, 556)	0.37180031549650283
  (0, 733)	0.3641016731951959
  (0, 764)	0.4063074159876705
  (0, 824)	0.44401213632578984
  (0, 1543)	0.21026222618346077
  (0, 1614)	0.5083508318560718
  (0, 1660)	0.1940965164756313
  (0, 2125)	0.16327208507773133
  (1, 772)	0.19913632312396015
  (1, 860)	0.38719860070843864
  (1, 951)	0.41329598043087307
  (1, 1112)	0.37431640843942426
  (1, 1513)	0.15367884825798153
  (1, 1551)	0.38568705048394286
  (1, 1657)	0.38618692792566395
  (1, 1660)	0.18424599208143896
  (1, 1768)	0.3795220799151894
  (2, 295)	0.21305789992048524
  (2, 341)	0.4387720860642851
  (2, 403)	0.444989327740496
  (2, 424)	0.2289856809792094
  (2, 1122)	0.5512750318488429
  (2, 1810)	0.45577547447825084
  (3, 532)	0.37044470424332365
  (3, 538)	0.4460426793142536
  :	:
  (19997, 169)	0.18772122864156499
  (19997, 665)	0.17640253723597527
  (19997, 700)	0.36238

Splitting the dataset into training and test data

In [162]:
# Hold out 20% of the rows for testing. stratify=Y keeps the real/fake ratio
# the same in both splits; random_state pins the shuffle so reruns match.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Training model using logistic regression

In [163]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
model=LogisticRegression()

In [164]:
# Fit the classifier on the training split; the test split is only used for
# evaluation afterwards.
model.fit(X_train,Y_train)

Evaluation

Accuracy Score

In [165]:
# Accuracy score on the training data.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go
# first. Accuracy itself is symmetric, so the value is unchanged, but the
# order matters as soon as another metric (precision, recall, ...) is used.
X_train_prediction=model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [166]:
# Fraction of training rows the model classifies correctly.
print('accuracy score of training data',training_data_accuracy)

accuracy score of training data 0.6466875


In [167]:
# Accuracy score on the held-out test data.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go
# first. Accuracy itself is symmetric, so the value is unchanged, but the
# order matters as soon as another metric (precision, recall, ...) is used.
X_test_prediction=model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [168]:
# Accuracy on the held-out test split. In the recorded run this is about 0.50
# against about 0.65 on the training split, i.e. chance level for two classes:
# the model is not generalizing.
print(test_data_accuracy)

0.49625


Making a predictive system

In [169]:
# Use the first row of the test split as a stand-in for a new, unseen article.
# It is already TF-IDF encoded, so it can be passed to the model directly.
X_new = X_test[0]

# predict returns an array of labels; its first element is read below.
prediction=model.predict(X_new)

In [170]:
# Label 0 encodes a real article and 1 a fake one.
# The "real" message previously read 'The new is real' (typo for 'news').
if prediction[0] == 0:
  print('The news is real')
else:
  print('The news is fake')

The new is real


In [172]:
# Ground-truth label of the same test row, to compare with the prediction.
print(Y_test[0])

0
