In [None]:
# FakeNewsDetection

In [21]:
#importing the dependency
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [22]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandipan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [23]:
#printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [24]:
#Data Pre-processing
#loading the dataset to a pandas DataFrame
news_dataset=pd.read_csv("fake_news_train.csv")

In [25]:
news_dataset.shape

(4998, 2)

In [26]:
#coverting variable to quantity data
vlabel=pd.get_dummies(news_dataset['label'],drop_first=True)
news_dataset=pd.concat((news_dataset,vlabel),axis=1)
news_dataset.drop(['label'],axis=1,inplace=True)
news_dataset.rename(columns={'real':'label'},inplace=True)

In [27]:
#printing the first 5 rows of the dataframe
news_dataset.head()
#here 1 indicates fake news and 0 indicates not fake news

Unnamed: 0,news_text,label
0,Get the latest from TODAY Sign up for our news...,0
1,2d Conan On The Funeral Trump Will Be Invited...,0
2,It’s safe to say that Instagram Stories has fa...,1
3,Much like a certain Amazon goddess with a lass...,1
4,At a time when the perfect outfit is just one ...,1


In [28]:
#counting the number of missing values in the dataset
news_dataset.isnull().sum()

news_text    0
label        0
dtype: int64

In [29]:
#replacing the null values with empty string
news_dataset=news_dataset.fillna('')

In [30]:
#seperating the data & label
X = news_dataset.drop(columns='label',axis=1)
Y= news_dataset['label']
print(X)
print(Y)

                                              news_text
0     Get the latest from TODAY Sign up for our news...
1     2d  Conan On The Funeral Trump Will Be Invited...
2     It’s safe to say that Instagram Stories has fa...
3     Much like a certain Amazon goddess with a lass...
4     At a time when the perfect outfit is just one ...
...                                                 ...
4993  A jury ruled against Bill Cosby in his sexual ...
4994                An unicorn was found in city centre
4995        An alien was spotted walking on the streets
4996  Twitter has not taken any legal action against...
4997  It appears that the index has embarked on a su...

[4998 rows x 1 columns]
0       0
1       0
2       1
3       1
4       1
       ..
4993    1
4994    0
4995    0
4996    1
4997    1
Name: label, Length: 4998, dtype: uint8


In [31]:
#Stemming:- Stemming is the process of reducing a word to its Root 
#example:- actor,actress,acting -->act
port_stem = PorterStemmer()
def stemming(news_text):
    stemmed_text=re.sub('[^a-zA-Z]',' ',news_text)
    stemmed_text=stemmed_text.lower()
    stemmed_text=stemmed_text.split()
    stemmed_text= [port_stem.stem(word) for word in stemmed_text if not word in stopwords.words('english')]
    stemmed_text= ' '.join(stemmed_text)
    return stemmed_text

In [32]:
news_dataset['news_text'] = news_dataset['news_text'].apply(stemming)

In [33]:
#separating the data and label
X=news_dataset['news_text'].values
Y = news_dataset['label'].values


In [34]:
print(X)

['get latest today sign newslett one ever truli get lose love one blake shelton except older brother richi die nov shelton note tweet monday chang life forev richi die car accid shelton home state oklahoma two year ago shelton sent messag th anniversari loss richi blake half brother share mother passeng car collid school bu ada south oklahoma citi richi driver redena mcmanu year old boy christoph mcmanu die shortli collis bu driver passeng uninjur accord polic report accid clearli remain blake told minut rememb pick phone call week dead tell someth pick phone call tell someth saw tv like constantli shock dead blake shelton play today halloween extravaganza new york citi oct getti imag blake wife miranda lambert wrote singl call inspir richi still two brother bond despit age differ share love countri music bedroom right across hallway mine littl blake said interview listen hank william jr waylon lynyrd skynyrd bob seeger whatev popular realli richi love music would sit go man guy hero c

In [35]:
print(Y)

[0 0 1 ... 0 1 1]


In [36]:
Y.shape

(4998,)

In [37]:
#converting textual data to numerical data
#TfidfVectorizer=Tf-terms frequency,idf- inverse documents frequency
#fit= feature
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X= vectorizer.transform(X)


In [38]:
print(X)

  (0, 42987)	0.036687095657634915
  (0, 42866)	0.03890145346743022
  (0, 42593)	0.035513346103561605
  (0, 42521)	0.026141771679327165
  (0, 42129)	0.04504218712712073
  (0, 42055)	0.03667094093049357
  (0, 41882)	0.05056312972930073
  (0, 41704)	0.030606019456130448
  (0, 41615)	0.10540773601124404
  (0, 40115)	0.1001512186965051
  (0, 39601)	0.04978496070613522
  (0, 39592)	0.03554291092105746
  (0, 39545)	0.04068890610986377
  (0, 39520)	0.037470012825171074
  (0, 39318)	0.05254586023753641
  (0, 38746)	0.02595676945197298
  (0, 38718)	0.07653339381700214
  (0, 38104)	0.04019229945273098
  (0, 37917)	0.0630993309457412
  (0, 36473)	0.03039537845555897
  (0, 36268)	0.03663869174258676
  (0, 35709)	0.05430841179035804
  (0, 35563)	0.06984333527623678
  (0, 35092)	0.10540773601124404
  (0, 34958)	0.04710751432010369
  :	:
  (4997, 43333)	0.1873800157134517
  (4997, 42013)	0.12475324869048751
  (4997, 39137)	0.15812300057816359
  (4997, 38017)	0.1253415554785386
  (4997, 37668)	0.157104

In [39]:
#SPLITING DATA SET TO TRAINING & TEST DATA
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)


In [40]:
#Traning the model: logistic Regression
model=LogisticRegression()
model.fit(X_train,Y_train)

LogisticRegression()

In [41]:
#Evaluation #accuracy score
#accuracy score for traning data
X_train_prediction =model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)
print('Accuracy score of the training data : ',training_data_accuracy)

Accuracy score of the training data :  0.8529264632316158


In [42]:
#accuracy score for testing data
X_test_prediction =model.predict(X_test)
testing_data_accuracy=accuracy_score(X_test_prediction,Y_test)
print('Accuracy score of the testing data : ',testing_data_accuracy)

Accuracy score of the testing data :  0.757


In [43]:
#Making a predictive System
X_new =X_test[5]

prediction=model.predict(X_new)
print(prediction)

if(prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[1]
The news is Fake


In [44]:
print(Y_test[5])

1
