Importing Dependencies

In [6]:
import numpy as np
import pandas as pd
import re #for searching text in doc
from nltk.corpus import stopwords #those words which do not add values to paragraph, we need to remove that
from nltk.stem.porter import PorterStemmer #used to reduce words to their root form (called stemming)
from sklearn.feature_extraction.text import TfidfVectorizer #convert text to feature vector
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
# Download the NLTK stop-word corpus that stopwords.words() reads in the cells below
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Show the English stop-word list; these words are filtered out later in stemming()
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Preprocessing

In [12]:
# Path of the WELFake CSV file (a /content/drive location)
dataset_path = "/content/drive/MyDrive/Data Science/Projects/04 Fake News Prediction/WELFake_Dataset.csv"

# Load the whole file into a DataFrame and preview the first rows
news_dataset = pd.read_csv(dataset_path)
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [13]:
#counting the number of missing values in each column
news_dataset.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,558
text,39
label,0


In [15]:
#replacing the null (missing) values in every column with an empty string
news_dataset = news_dataset.fillna("")

In [16]:
#confirming that no missing values remain after the fill
news_dataset.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [19]:
#separate the data (all columns except 'label') and the label
#NOTE: x and y are reassigned further down from the stemmed 'title' column,
#so the values bound here are only intermediate
x = news_dataset.drop(['label'], axis=1)
y = news_dataset['label']


Stemming


In [21]:
#Stemming is the process of reducing a word to its root form
#e.g:
#acting, acted, acts -> act

In [22]:
#Single Porter stemmer instance, reused by stemming() for every word
port_stem = PorterStemmer()

In [29]:
# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile('[^a-zA-Z]')
# Lazily-filled cache so the NLTK stop-word list is loaded once, not once per word.
_STOP_WORD_CACHE = {}

def stemming(content, stop_words=None, stemmer=None):
  """Normalise a piece of text into a string of space-separated word stems.

  Non-letter characters are replaced by spaces, the text is lower-cased and
  split on whitespace, stop words are dropped, and each remaining word is
  stemmed.

  Args:
    content: Text to clean (str).
    stop_words: Optional collection of lower-case words to discard. Defaults
      to NLTK's English stop-word list, loaded once and cached as a frozenset.
    stemmer: Optional object exposing ``stem(word)``. Defaults to the
      notebook-level ``port_stem``.

  Returns:
    The stemmed words joined by single spaces; an empty string when no word
    survives the filtering.
  """
  if stop_words is None:
    # Previously stopwords.words('english') was re-evaluated and scanned as a
    # list for every single word; a cached set gives the same answer cheaply.
    if 'english' not in _STOP_WORD_CACHE:
      _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    stop_words = _STOP_WORD_CACHE['english']
  if stemmer is None:
    stemmer = port_stem

  words = _NON_LETTER_RE.sub(' ', content).lower().split()
  return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

In [30]:
#Stem every title. The column is overwritten in place, so re-running this cell
#would apply stemming() to text that has already been stemmed.
news_dataset['title'] = news_dataset['title'].apply(stemming)

In [31]:
#Inspect the stemmed titles
news_dataset['title']

Unnamed: 0,title
0,law enforc high alert follow threat cop white ...
1,
2,unbeliev obama attorney gener say charlott rio...
3,bobbi jindal rais hindu use stori christian co...
4,satan russia unv imag terrifi new supernuk wes...
...,...
72129,russian steal research trump hack u democrat p...
72130,watch giuliani demand democrat apolog trump ra...
72131,migrant refus leav train refuge camp hungari
72132,trump tussl give unpopular mexican leader much...


In [34]:
#separating the data (stemmed titles) and the label as arrays
x = news_dataset['title'].values
y = news_dataset['label'].values
#display the label array
y

array([1, 1, 1, ..., 0, 0, 1])

In [36]:
#converting the textual data to numerical data (TF-IDF feature vectors)
#NOTE: the vectorizer is fitted on the full corpus, i.e. before the train/test
#split performed further down, so the test rows also shape the vocabulary/IDF weights.
vectorizer = TfidfVectorizer()

#fit_transform learns the vocabulary and builds the matrix in one pass over the
#text instead of two. x is rebound from raw text to the sparse matrix here, so
#re-running this cell requires re-running the cell that defines x first.
x = vectorizer.fit_transform(x)

In [40]:
#Inspect the vectorized feature matrix
x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 629110 stored elements and shape (72134, 19639)>

Splitting the dataset into training and test data

In [42]:
#Hold out 20% of the rows for testing; stratify on the label so both parts keep
#the same class proportions, and fix the seed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [44]:
#Training the model using Logistic Regression (constructed with its default settings)
model = LogisticRegression()
#fit on the training split only
model.fit(x_train, y_train)

In [48]:
#Evaluation
#Accuracy score
#Accuracy on training data
x_train_prediction = model.predict(x_train)
#accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
#displayed as a percentage
training_data_accuracy*100

91.93858630668723

In [49]:
#Accuracy on test data
x_test_prediction = model.predict(x_test)
#accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
#displayed as a percentage
test_data_accuracy*100

90.0603035974215

In [57]:
#Simple predictive system: classify one sample and print a readable verdict.
#The first row of the test set stands in for "new" data here.
x_new = x_test[0]
prediction = model.predict(x_new)

#label 0 is reported as real news, any other label as fake news
print("Real News" if prediction[0] == 0 else "Fake News")

Fake News
