# About the Dataset

Importing the dependencies

1.   List item
2.   List item



In [3]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91851\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# printing the stopwords in English
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data pre-processing

# loading the dataset to pandas Dataframe

In [6]:
news_dataset = pd.read_csv('WELFake_Dataset.csv', on_bad_lines='skip', engine='python')

In [7]:
news_dataset.shape

(72154, 4)

In [8]:
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [9]:
# Counting the missing values in the dataset
news_dataset.isnull().sum()

Unnamed: 0      0
title         565
text           57
label          20
dtype: int64

In [10]:
# replacing the null value with empty string
news_dataset = news_dataset.fillna('')

In [11]:
# merging the author name and news title
news_dataset['content'] = news_dataset['title'] + ' ' + news_dataset['text']

In [12]:
print(news_dataset['content'])

0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1           Did they post their votes for Hillary already?
2        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3        Bobby Jindal, raised Hindu, uses story of Chri...
4        SATAN 2: Russia unvelis an image of its terrif...
                               ...                        
72149    Russians steal research on Trump in hack of U....
72150     WATCH: Giuliani Demands That Democrats Apolog...
72151    Migrants Refuse To Leave Train At Refugee Camp...
72152    Trump tussle gives unpopular Mexican leader mu...
72153    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: content, Length: 72154, dtype: object


In [13]:
print(news_dataset.columns)

Index(['Unnamed: 0', 'title', 'text', 'label', 'content'], dtype='object')


In [14]:
# seperating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [15]:
print(X)
print(Y)

      Unnamed: 0                                              title  \
0              0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1              1                                                      
2              2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3              3  Bobby Jindal, raised Hindu, uses story of Chri...   
4              4  SATAN 2: Russia unvelis an image of its terrif...   
...          ...                                                ...   
72149      72129  Russians steal research on Trump in hack of U....   
72150      72130   WATCH: Giuliani Demands That Democrats Apolog...   
72151      72131  Migrants Refuse To Leave Train At Refugee Camp...   
72152      72132  Trump tussle gives unpopular Mexican leader mu...   
72153      72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  \
0      No comment is expected from Barack Obama Membe...   
1         Did they post the

**Stemming :**

Stemming is the process of reducing a word to its Root word

*Example:*    actor, actress,acting --> act


In [16]:
port_stem = PorterStemmer()

In [17]:
english_stopwords = set(stopwords.words('english'))

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in english_stopwords]
    return ' '.join(stemmed_content)

In [18]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [19]:
print(news_dataset['content'])

0        law enforc high alert follow threat cop white ...
1                                post vote hillari alreadi
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rais hindu use stori christian co...
4        satan russia unv imag terrifi new supernuk wes...
                               ...                        
72149    russian steal research trump hack u democrat p...
72150    watch giuliani demand democrat apolog trump ra...
72151    migrant refus leav train refuge camp hungari m...
72152    trump tussl give unpopular mexican leader much...
72153    goldman sach endors hillari clinton presid gol...
Name: content, Length: 72154, dtype: object


In [20]:
news_dataset = news_dataset.dropna(subset=['label'])

In [21]:
#separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [22]:
print(X)

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video comment expect barack obama member fyf fukyoflag blacklivesmatt movement call lynch hang white peopl cop encourag other radio show tuesday night turn tide kill white peopl cop send messag kill black peopl america one f yoflag organ call sunshin radio blog show host texa call sunshin f ing opinion radio show snapshot fyf lolatwhitefear twitter page p show urg support call fyf tonight continu dismantl illus white snapshot twitter radio call invit fyf radio show air p eastern standard time show caller clearli call lynch kill white peopl minut clip radio show heard provid breitbart texa someon would like refer hannib alreadi receiv death threat result interrupt fyf confer call unidentifi black man said mother f ker start f ing like us bunch ni er takin one us roll said caus alreadi roll gang anyway six seven black mother f cker see white person lynch ass let turn tabl conspir cop start lose peopl state emerg

In [23]:
print(Y)

['1' '1' '1' ... '0' '0' '1']


In [24]:
Y.shape

(72154,)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [26]:
# converting the textual data to numerical data
# Ensure X is text before vectorization
X_text = news_dataset['content'].values   # raw text
vectorizer = TfidfVectorizer()
vectorizer.fit(X_text)
X = vectorizer.transform(X_text)


In [27]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13653900 stored elements and shape (72154, 162095)>
  Coords	Values
  (0, 938)	0.01910552418058514
  (0, 1282)	0.017364615787469322
  (0, 2130)	0.05245527850493085
  (0, 2782)	0.020232287339117336
  (0, 3613)	0.029903742758890916
  (0, 3998)	0.027472562800523327
  (0, 4263)	0.02386747704109067
  (0, 4334)	0.05055895390577549
  (0, 4845)	0.015140155259054535
  (0, 4861)	0.024864694749651122
  (0, 6011)	0.014597765824109333
  (0, 6504)	0.0573101769991896
  (0, 6842)	0.015891819449096088
  (0, 8434)	0.1265997857889395
  (0, 8973)	0.015517499527015192
  (0, 10474)	0.06692452526863618
  (0, 11425)	0.01896351536128849
  (0, 12721)	0.0158024505958719
  (0, 14066)	0.01834677995195762
  (0, 14673)	0.017851229844495997
  (0, 15436)	0.19280311089763172
  (0, 15493)	0.08125309240415281
  (0, 15605)	0.08888812544032272
  (0, 15880)	0.029336851329394658
  (0, 18056)	0.10843517154604479
  :	:
  (72153, 132554)	0.0317135476398895
  (72153, 

**Splitting the dataset to training and test data**

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

# Training the model: Logistic Regression

In [29]:
model = LogisticRegression()

In [30]:
model.fit(X_train , Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


**Accuracy Score**

In [31]:
# accuracy score on the training data
training_data_accuracy = accuracy_score(model.predict(X_train), Y_train)
test_data_accuracy = accuracy_score(model.predict(X_test), Y_test)

In [32]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9701851948096946


In [33]:
print('Accuracy score of the test data : ', training_data_accuracy)

Accuracy score of the test data :  0.9701851948096946


## Making a Predictive system

In [34]:
X_new = X_test[4]

prediction = model.predict(X_new)
print(prediction)

if (int(prediction[0])==0):
  print('The news is Real')
else:
  print('The news is Fake')

['0']
The news is Real


In [35]:
print(Y_test[4])

0


In [36]:
import pickle

In [37]:
pickle.dump(model, open('model.pkl', 'wb'))

In [38]:
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))