# Fake News Prediction System
# *(Using Logistic Regression, Stemming, TfidfVectorizer)*

_____________________________________________________________________________________________

## Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Fetch the NLTK 'stopwords' package so that stopwords.words(...) can be
# called in the cells that follow.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thora\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Show the English stop-word list that stemming() filters out.
# The corpus file id is the lowercase 'english' (the same spelling stemming()
# uses); a capitalised name only resolves on case-insensitive filesystems.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Importing Dataset

In [4]:
# Load the dataset from a CSV file given by a relative path, i.e. it is
# expected to sit in the notebook's working directory.
news_data=pd.read_csv("WELFake_Dataset.csv")

## Data Analysis 

In [5]:
# Preview the first rows of the freshly loaded frame.
news_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [6]:
# (rows, columns) of the loaded frame.
news_data.shape

(72134, 4)

In [7]:
# Count missing values per column before any cleaning.
news_data.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [8]:
# Replace every missing value with an empty string so that 'title' and 'text'
# can be concatenated as plain strings later on.
news_data = news_data.fillna(value='')

In [9]:
# Confirm that no missing values remain after the fill.
news_data.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [10]:
# Preview the frame again now that missing values have been filled.
news_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


## Data Preprocessing

In [11]:
# Shrink the dataset by discarding a seeded random sample of fixed size from
# each class. The rows that are NOT sampled form the working set; since the
# same number of rows is removed from classes of different size, the remainder
# is not class-balanced.
# NOTE: this cell rebinds `news_data`. Running it a second time on the reduced
# frame fails, because fewer than ROWS_TO_DROP_PER_LABEL rows per class remain.
ROWS_TO_DROP_PER_LABEL = 30000
DROP_SAMPLING_SEED = 42


def _sampled_index_for_label(frame, label_value):
    """Return the index labels of a seeded random sample of rows with `label_value`."""
    rows_with_label = frame[frame['label'] == label_value]
    return rows_with_label.sample(
        n=ROWS_TO_DROP_PER_LABEL, random_state=DROP_SAMPLING_SEED
    ).index


rows_to_drop_0 = _sampled_index_for_label(news_data, 0)
rows_to_drop_1 = _sampled_index_for_label(news_data, 1)

# Remove the sampled rows, then renumber the surviving rows from 0 so that
# positional and label-based indexing agree again.
news_data = (
    news_data
    .drop(index=rows_to_drop_0)
    .drop(index=rows_to_drop_1)
    .reset_index(drop=True)
)

In [12]:
# Class distribution of the rows that remain after the reduction.
news_data['label'].value_counts()

label
1    7106
0    5028
Name: count, dtype: int64

In [13]:
# (rows, columns) after the reduction.
news_data.shape

(12134, 4)

In [14]:
# Merge the article body and its headline (in that order, separated by one
# space) into a single 'content' column used as the model's text input.
article_body = news_data['text']
article_title = news_data['title']
news_data['content'] = article_body + ' ' + article_title

In [15]:
# Inspect the combined text column.
news_data['content']

0          Did they post their votes for Hillary already? 
1        The most punchable Alt-Right Nazi on the inter...
2         21st Century Wire says Amid the tossing and t...
3        Is the European gravy train finally coming to ...
4        WASHINGTON  —   Several Republican senators on...
                               ...                        
12129    We continue to undercover the details of the O...
12130    Flip-flop: Vox warns of serious risk of Electi...
12131     (Story corrects third paragraph to show Mosul...
12133    Waking Times – by Alex Pietrowski \nThe most i...
Name: content, Length: 12134, dtype: object

## Stemming

In [16]:
# Single Porter stemmer instance, used by stemming() for every token.
port_stem=PorterStemmer()

In [17]:
# Built once when this cell runs, so each token is checked with a set
# membership test instead of requesting and scanning the stop-word list again.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
# Matches any single character that is not an ASCII letter.
_NON_LETTER = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise one document into a space-separated string of word stems.

    Processing steps, in order:
      1. replace every character other than a-z / A-Z with a space,
      2. lowercase the text,
      3. split it on whitespace,
      4. drop English stop words,
      5. reduce each remaining token to its Porter stem.

    Args:
        content: The document text as a string.

    Returns:
        The stems of the surviving tokens joined by single spaces. The result
        is an empty string when no token survives the filtering.
    """
    tokens = _NON_LETTER.sub(' ', content).lower().split()
    stems = [
        port_stem.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOP_WORDS
    ]
    return ' '.join(stems)

In [18]:
# Stem every document. The result overwrites the 'content' column, so the
# unstemmed text is no longer available from that column afterwards.
news_data['content']=news_data['content'].apply(stemming)

In [19]:
# Stemmed documents (features) and their labels (targets) as arrays.
X, Y = news_data['content'].values, news_data['label'].values

In [20]:
# Print the feature and label arrays for a quick visual check.
print(X)
print(Y)

['post vote hillari alreadi'
 'punchabl alt right nazi internet got thorough beatdown sen ben sass r neb twitter epic tweetstorm richard spencer alt right leader becom human punch bag got racism smack republican senat thursday white nationalist tweet goober conserv blame russia racial divis unit state spencer respond tweet sass sent wednesday sen ben sass share articl regard sen jame lankford r okla explain russian internet troll help fuel divis controversi donald trump ignit nfl athlet choos kneel rather stand nation anthem protest racial inequ polic brutal one love american vs american fight putin intel agenc stoke side everi divid http co h bwjhzokh ben sass bensass septemb spencer respond write mind goober conserv russian blame racial divis mind goober conserv russian blame racial divis http co czpgfl u richard spencer richardbspenc septemb sass tore spencer call clown one brown shirt pajama boy nazi oh let goober nongoob agre racist like blame putin agenc also love use divis tool 

## Vectorization

In [21]:
# TF-IDF vectorizer created with the library's default settings.
vectorizer=TfidfVectorizer()

In [22]:
# Learn the vocabulary and IDF weights from the stemmed documents.
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split further down, so held-out documents also contribute to the
# vocabulary and IDF statistics. Consider fitting on the training portion only.
vectorizer.fit(X)

In [23]:
# Convert the stemmed documents into the TF-IDF matrix used as model input.
# The text is read from the DataFrame column rather than from X itself, so
# executing this cell again does not feed the already-vectorised matrix back
# into transform().
X=vectorizer.transform(news_data['content'].values)

## Splitting Train Test Data

# Hold out 20% of the samples for evaluation; stratifying on Y keeps the label
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [25]:
# Training portion of the TF-IDF feature matrix.
x_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1810890 stored elements and shape (9707, 67111)>

In [26]:
# Labels that belong to the training portion.
y_train

array([1, 0, 0, ..., 1, 1, 1])

## Model Training

In [27]:
# Logistic regression classifier created with the library's default settings.
model=LogisticRegression()

In [28]:
# Fit the classifier on the training features and labels.
model.fit(x_train, y_train)

## Making Prediction

In [29]:
# Predict labels for the held-out test features.
prediction=model.predict(x_test)

In [32]:
# Fraction of held-out samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print("Accuracy = ", accuracy)

Accuracy =  0.9188298310671611
