In [1]:
import os
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Location of the WELFake CSV. Set the WELFAKE_CSV environment variable to run
# the notebook on another machine; when it is unset the original hardcoded
# path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "WELFAKE_CSV",
    "C:/Users/aksha/Desktop/Fake News Detector/WELFake_Dataset.csv",
)
news_df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check which columns were loaded.
news_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [4]:
# (rows, columns) of the dataset as loaded, before any cleaning.
news_df.shape

(72134, 4)

In [5]:
# Number of missing values in each column.
news_df.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [6]:

# Per-column missing-value counts before cleaning.
null_counts_before = news_df.isna().sum()
print("Number of null values before dropping:", null_counts_before)

# Remove every row that has a missing value in any column.
# This mutates news_df itself, so all later cells see the cleaned frame.
news_df.dropna(inplace=True)

# Same report after cleaning; every count should now be zero.
null_counts_after = news_df.isna().sum()
print("Number of null values after dropping:", null_counts_after)


Number of null values before dropping: Unnamed: 0      0
title         558
text           39
label           0
dtype: int64
Number of null values after dropping: Unnamed: 0    0
title         0
text          0
label         0
dtype: int64


In [7]:
# (rows, columns) after the rows with missing values were dropped.
news_df.shape

(71537, 4)

# separating the data & label

In [8]:
# Target: the 'label' column. Features: every remaining column.
# (Both names are reassigned further down, once the titles have been stemmed.)
Y = news_df['label']
X = news_df.drop(columns='label')

In [9]:
# Show the feature frame (every column except 'label').
print(X)

       Unnamed: 0                                              title  \
0               0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
2               2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3               3  Bobby Jindal, raised Hindu, uses story of Chri...   
4               4  SATAN 2: Russia unvelis an image of its terrif...   
5               5  About Time! Christian Group Sues Amazon and SP...   
...           ...                                                ...   
72129       72129  Russians steal research on Trump in hack of U....   
72130       72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131       72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132       72132  Trump tussle gives unpopular Mexican leader mu...   
72133       72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  
0      No comment is expected from Barack Obama Membe...  
2       Now, most

# Stemming:
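Stemming is the process of reducing a word to its root form by stripping suffixes.

Example: hanged, hanging, hangs → hang (an irregular form such as "hung" is left unchanged by a suffix-stripping stemmer; mapping it to "hang" would require lemmatization)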

Steps:
1. replace every non-alphabetic character with a space
2. lower case
3. splitting
4. removing stopwords
5. stemming

In [10]:
ps = PorterStemmer()

# Build the stopword lookup once. Evaluating stopwords.words('english') inside
# the comprehension reloads the word list for every token and scans it
# linearly; a frozenset gives a constant-time membership test instead.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Anything that is not an ASCII letter is treated as a separator.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(title):
    """Normalise a piece of text into a space-joined string of stemmed tokens.

    Steps, in order: replace every non-letter with a space, lower-case,
    split on whitespace, drop English stopwords, Porter-stem what remains.

    Parameters
    ----------
    title : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    tokens = _NON_LETTERS.sub(' ', title).lower().split()
    return ' '.join(
        ps.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS
    )

In [11]:
# Replace each raw title with its stemmed form. This overwrites the 'title'
# column, so re-running the cell feeds already-stemmed text through stemming()
# again; re-load the data first if a clean re-run is needed.
news_df['title'] = news_df['title'].apply(stemming)

In [13]:
# Inspect the stemmed titles.
news_df['title']

0        law enforc high alert follow threat cop white ...
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rais hindu use stori christian co...
4        satan russia unv imag terrifi new supernuk wes...
5        time christian group sue amazon splc design ha...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131         migrant refus leav train refuge camp hungari
72132    trump tussl give unpopular mexican leader much...
72133           goldman sach endors hillari clinton presid
Name: title, Length: 71537, dtype: object

# separating the data and label

In [14]:
# Features are the stemmed title strings; the target is the 'label' column
# (the detection cell at the end reports 0 as real and anything else as fake).
X, Y = news_df['title'].values, news_df['label'].values

In [15]:
# Show the array of stemmed titles.
print(X)

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video'
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video'
 'bobbi jindal rais hindu use stori christian convers woo evangel potenti bid'
 ... 'migrant refus leav train refuge camp hungari'
 'trump tussl give unpopular mexican leader much need shot arm'
 'goldman sach endors hillari clinton presid']


# converting the textual data to numerical data

In [16]:
# Learn the vocabulary and IDF weights from the stemmed titles and convert them
# to a TF-IDF matrix in one pass (equivalent to fit() followed by transform()).
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split performed later in the notebook, so its IDF statistics
# include the test rows. Consider splitting the raw titles first and fitting
# on the training part only.
vector = TfidfVectorizer()
X = vector.fit_transform(X)

In [17]:
# Show the TF-IDF matrix; it prints as "(row, column)  weight" entries.
print(X)

  (0, 18956)	0.19119199199155718
  (0, 18504)	0.12953081995001134
  (0, 17225)	0.25417143325832803
  (0, 17122)	0.24861338779858738
  (0, 9620)	0.2281779531036376
  (0, 7816)	0.2673841297708583
  (0, 6667)	0.4856722738685227
  (0, 6364)	0.2892696681391208
  (0, 5456)	0.31817689197461024
  (0, 3641)	0.24869119486730346
  (0, 1780)	0.334723411455833
  (0, 403)	0.3194027206998875
  (1, 18504)	0.13424895962371275
  (1, 17891)	0.35973057550211296
  (1, 16311)	0.1998676543814105
  (1, 14973)	0.1608430986300803
  (1, 14473)	0.35810711329768474
  (1, 13485)	0.2268087560849422
  (1, 12644)	0.2790489798405972
  (1, 11914)	0.1686662338838085
  (1, 11768)	0.22306828908813514
  (1, 7949)	0.26921648198053316
  (1, 6815)	0.2652110968000934
  (1, 2890)	0.36407357799494927
  (1, 2645)	0.3081355901867282
  :	:
  (71533, 1746)	0.4930846869221777
  (71533, 755)	0.39873664445651247
  (71534, 17478)	0.39738586728341585
  (71534, 14075)	0.35335720051740843
  (71534, 14071)	0.32382708160669216
  (71534, 10844

# Splitting the dataset into training & test data

In [20]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# balance the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [21]:
# (rows, columns) of the training part of the TF-IDF matrix.
X_train.shape

(57229, 19489)

# Training the Model: Logistic Regression

In [22]:
# Logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()
# Fit on the training split only; the test split stays unseen until evaluation.
model.fit(X_train,Y_train)

In [23]:
# on training set
train_y_pred = model.predict(X_train)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# is the same either way, but the documented order keeps this correct if the
# metric is ever swapped for an asymmetric one such as precision or recall.
print(accuracy_score(Y_train, train_y_pred))

0.9205123276660434


In [24]:
# on testing set
testing_y_pred = model.predict(X_test)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# is the same either way, but the documented order keeps this correct if the
# metric is ever swapped for an asymmetric one such as precision or recall.
print(accuracy_score(Y_test, testing_y_pred))

0.9017332960581493


# Detection System

In [25]:
# Take one row of the held-out TF-IDF matrix and classify it.
sample_index = 10
input_data = X_test[sample_index]
prediction = model.predict(input_data)

In [26]:
# Label 0 is reported as real news; any other label is reported as fake.
verdict = 'The News Is Real' if prediction[0] == 0 else 'The News is Fake'
print(verdict)

The News is Fake
