In [36]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [37]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thora\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
print(stopwords.words('English'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [39]:
news_data=pd.read_csv("fake_news_dataset.csv")

In [40]:
news_data.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [41]:
news_data.shape

(20000, 7)

In [42]:
news_data.isnull().sum()

title          0
text           0
date           0
source      1000
author      1000
category       0
label          0
dtype: int64

In [43]:
news_data=news_data.fillna('')

In [44]:
news_data.isnull().sum()

title       0
text        0
date        0
source      0
author      0
category    0
label       0
dtype: int64

In [45]:
news_data['label'].value_counts()

label
fake    10056
real     9944
Name: count, dtype: int64

In [46]:
news_data['source'].value_counts()

source
Daily News      2439
BBC             2393
The Guardian    2382
CNN             2375
NY Times        2372
Fox News        2362
Reuters         2360
Global Times    2317
                1000
Name: count, dtype: int64

In [47]:
news_data['author'].value_counts()

author
                       1000
Michael Smith            12
John Smith               11
Christopher Johnson       9
David Brown               7
                       ... 
Tonya Anderson            1
Patricia Stewart          1
Stephen Parks             1
Desiree Castro            1
Tina Garrett              1
Name: count, Length: 17052, dtype: int64

In [48]:
news_data['label']=news_data['label'].replace({'real':1,'fake':0}).astype(int)

  news_data['label']=news_data['label'].replace({'real':1,'fake':0}).astype(int)


In [49]:
news_data.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,1
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,0
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,0
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,0
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,0


In [50]:
news_data['label'].value_counts()

label
0    10056
1     9944
Name: count, dtype: int64

In [51]:
news_data['content']=news_data['author']+' '+news_data['title']

In [52]:
news_data['content']

0                     Paula George Foreign Democrat final.
1          Joseph Hill To offer down resource great point.
2              Julia Robinson Himself church myself carry.
3                Mr. David Foster DDS You unit its should.
4        Austin Walker Billion believe employee summer ...
                               ...                        
19995                         Gary Miles House party born.
19996    Maria Mcbride Though nation people maybe price...
19997     Kristen Franklin Yet exist with experience unit.
19998                  David Wise School wide itself item.
19999        James Peterson Offer chair cover senior born.
Name: content, Length: 20000, dtype: object

In [53]:
port_stem=PorterStemmer()

In [54]:
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]',' ',content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word)for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content

In [55]:
news_data['content']=news_data['content'].apply(stemming)

In [57]:
X=news_data['content'].values
Y=news_data['label'].values

In [58]:
print(X)
print(Y)

['paula georg foreign democrat final'
 'joseph hill offer resourc great point' 'julia robinson church carri' ...
 'kristen franklin yet exist experi unit' 'david wise school wide item'
 'jame peterson offer chair cover senior born']
[1 0 0 ... 1 0 0]


In [56]:
vectorizer=TfidfVectorizer()

In [59]:
vectorizer.fit(X)

In [60]:
X=vectorizer.transform(X)

In [61]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [68]:
x_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 109497 stored elements and shape (16000, 2320)>

In [66]:
y_train

array([0, 0, 1, ..., 0, 1, 1])

In [62]:
model=LogisticRegression()

In [69]:
model.fit(x_train, y_train)

In [70]:
prediction=model.predict(x_test)

In [73]:
accuracy=accuracy_score(y_test,prediction)
print(accuracy)

0.5015
