Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Printing the stopwords in english
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data pre-Processing


In [None]:
# Load the dataset into the pandas data frame
news_dataset = pd.read_csv('/content/WELFake_Dataset.csv')

In [None]:
# To get the Rows and columns
news_dataset.shape

(72134, 4)

In [None]:
# Print the first 5 rows of the data frame
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [None]:
# To check whether some values are missing
news_dataset.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,558
text,39
label,0


In [None]:
 # Replacing the missing values with empty string
 news_dataset = news_dataset.fillna('')

In [None]:
# Merging the title and text column
news_dataset['content']  = news_dataset['text']+' '+news_dataset['title']

In [None]:
print(news_dataset['content'])
# We will be using content data and labels to make prediction

0        No comment is expected from Barack Obama Membe...
1          Did they post their votes for Hillary already? 
2         Now, most of the demonstrators gathered last ...
3        A dozen politically active pastors came here f...
4        The RS-28 Sarmat missile, dubbed Satan 2, will...
                               ...                        
72129    WASHINGTON (Reuters) - Hackers believed to be ...
72130    You know, because in fantasyland Republicans n...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    MEXICO CITY (Reuters) - Donald Trump’s combati...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: content, Length: 72134, dtype: object


In [None]:
# Separate the features from the labels.
# `columns=` already says that a column (not a row) is being dropped, so no axis argument is needed.
# Note: X is reassigned further down from the stemmed 'content' column; this version is only inspected.
X = news_dataset.drop(columns='label')
Y = news_dataset['label']

In [None]:
print(X)
print(Y)

       Unnamed: 0                                              title  \
0               0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1               1                                                      
2               2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3               3  Bobby Jindal, raised Hindu, uses story of Chri...   
4               4  SATAN 2: Russia unvelis an image of its terrif...   
...           ...                                                ...   
72129       72129  Russians steal research on Trump in hack of U....   
72130       72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131       72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132       72132  Trump tussle gives unpopular Mexican leader mu...   
72133       72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  \
0      No comment is expected from Barack Obama Membe...   
1         Did t

Stemming: Process of reducing a word to its root word

In [None]:
port_stem = PorterStemmer()

In [None]:
# Everything that is not an ASCII letter (digits, punctuation, symbols) is replaced by a space.
_NON_LETTERS = re.compile('[^a-zA-Z]')

# English stop words, loaded on the first call to stemming() and reused afterwards.
_STOP_WORDS = None


def _english_stop_words():
  """Return the NLTK English stop words as a frozenset, building it only once."""
  global _STOP_WORDS
  if _STOP_WORDS is None:
    # A set gives constant-time membership tests; the original rebuilt the
    # whole stop-word list and scanned it linearly for every single word.
    _STOP_WORDS = frozenset(stopwords.words('english'))
  return _STOP_WORDS


def stemming(content):
  """Clean one document and reduce its words to their stems.

  This is a user-defined helper (not a library function). Steps:
    1. replace every character that is not a letter with a space,
    2. convert the text to lowercase,
    3. split it into a list of words,
    4. drop the English stop words and stem the remaining words with port_stem,
    5. join the stems back into one space-separated string.

  Parameters:
    content (str): raw text of one document.

  Returns:
    str: the stemmed words separated by single spaces.
  """
  stop_words = _english_stop_words()
  words = _NON_LETTERS.sub(' ', content).lower().split()
  stemmed_words = [port_stem.stem(word) for word in words if word not in stop_words]
  return ' '.join(stemmed_words)


In [None]:
# Take the 'content' column (the combined text and title) and apply the stemming function to every row
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [None]:
print(news_dataset['content'])

0        comment expect barack obama member fyf fukyofl...
1                                post vote hillari alreadi
2        demonstr gather last night exercis constitut p...
3        dozen polit activ pastor came privat dinner fr...
4        rs sarmat missil dub satan replac ss fli mile ...
                               ...                        
72129    washington reuter hacker believ work russian g...
72130    know fantasyland republican never question cit...
72131    migrant refus leav train refuge camp hungari t...
72132    mexico citi reuter donald trump comb style buf...
72133    goldman sach endors hillari clinton presid gol...
Name: content, Length: 72134, dtype: object


In [None]:
# Separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [None]:
print(X)

['comment expect barack obama member fyf fukyoflag blacklivesmatt movement call lynch hang white peopl cop encourag other radio show tuesday night turn tide kill white peopl cop send messag kill black peopl america one f yoflag organ call sunshin radio blog show host texa call sunshin f ing opinion radio show snapshot fyf lolatwhitefear twitter page p show urg support call fyf tonight continu dismantl illus white snapshot twitter radio call invit fyf radio show air p eastern standard time show caller clearli call lynch kill white peopl minut clip radio show heard provid breitbart texa someon would like refer hannib alreadi receiv death threat result interrupt fyf confer call unidentifi black man said mother f ker start f ing like us bunch ni er takin one us roll said caus alreadi roll gang anyway six seven black mother f cker see white person lynch ass let turn tabl conspir cop start lose peopl state emerg specul one two thing would happen big ass r war ni er go start backin alreadi ge

In [None]:
print(Y)

[1 1 1 ... 0 0 1]


In [None]:
Y.shape

(72134,)

In [None]:
# Convert the text into numerical feature vectors so that the model can work with it.
# TF-IDF = Term Frequency x Inverse Document Frequency:
#   - TF grows with the number of times a word appears in a document,
#   - IDF shrinks the weight of words that appear in many documents, since they carry little information.
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test split below,
# so the vocabulary and IDF weights also see the test documents - consider fitting on the training split only.
# NOTE: this cell rebinds X (array of strings -> sparse matrix); re-run the cell that defines X before re-running it.
vectorizer = TfidfVectorizer()

# fit_transform learns the vocabulary/IDF weights and converts the documents in one call
X = vectorizer.fit_transform(X)

In [None]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13656667 stored elements and shape (72134, 162203)>
  Coords	Values
  (0, 938)	0.019104619426517897
  (0, 1282)	0.017363778513914716
  (0, 2131)	0.052457780993620334
  (0, 2783)	0.020231302394732035
  (0, 3614)	0.029904475345647965
  (0, 3999)	0.02747310458208844
  (0, 4264)	0.023865946576073604
  (0, 4335)	0.05055646943154232
  (0, 4846)	0.01513932938772633
  (0, 4862)	0.02486341752399553
  (0, 6013)	0.014596932940161228
  (0, 6507)	0.057303304534120254
  (0, 6845)	0.01589099748716943
  (0, 8437)	0.12657603668480968
  (0, 8976)	0.015516676506767142
  (0, 10478)	0.06692816334064453
  (0, 11430)	0.018962618491219916
  (0, 12727)	0.01580162854760987
  (0, 14072)	0.018345912143817013
  (0, 14679)	0.01785037970922704
  (0, 15442)	0.19279395985841352
  (0, 15499)	0.08125624068348719
  (0, 15611)	0.0888918729364855
  (0, 15886)	0.029332963149593518
  (0, 18063)	0.10843561013885229
  :	:
  (72133, 132638)	0.031715743461707
  (72133

Splitting the dataset into training and test data

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)
# Hold out 20% of the data as test data; the remaining 80% is used for training.
# Y contains the labels 0 and 1. stratify = Y keeps the proportion of the two classes the same in the training and test sets; without it the split could be unbalanced.
# random_state fixes the seed of the shuffle, so every run produces the same split.

Training the Model: Logistic Regression Model

In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, Y_train)
# Fit the logistic regression model: it learns the weights of the sigmoid decision function from X_train and Y_train (nothing is plotted)

Evaluation : Accuracy score

In [None]:
# Accuracy score on training data
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but keeping the documented order avoids mistakes if another metric is swapped in later.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9625695322924429


In [None]:
# Accuracy score on test data
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but keeping the documented order avoids mistakes if another metric is swapped in later.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9478061967144936


Making a Predictive System

In [None]:
# Pick a single news item from the test set: index 7 is the eighth row of X_test (index 0 would be the first)
X_new = X_test[7]

prediction = model.predict(X_new)
print(prediction)

# NOTE(review): this assumes label 0 = fake and label 1 = real. In the sample rows printed earlier,
# sensational headlines carry label 1 while a Reuters report carries label 0 - verify the
# mapping against the dataset documentation before trusting these messages.
if prediction[0] == 0:
  print('The news is Fake')
else:
  print('The news is Real')

[0]
The news is Fake


In [None]:
print(Y_test[7]) # To check the result whether fake/real, Y contains the labels

0
