In [24]:
#importing dependencies

import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk

In [25]:
 nltk.download("stopwords")
    #Loading news dataset
news_dataset=pd.read_csv(r"C:\aa projects\machine learning\fake_news_dataset.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91920\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:

#*************************Data preprocessing*********************************************

#Loading news dataset
news_dataset=pd.read_csv(r"C:\aa projects\machine learning\fake_news_dataset.csv")
print(news_dataset.shape)
print(news_dataset.isnull().sum())

#filled missing value with null string bcoz the dataset was huge so we don't need any preprocessing
news_dataset=news_dataset.fillna("")

#rechecking if there is still any any null value
print(news_dataset.isnull().sum())

#merging the author and title into content
news_dataset["content"]=news_dataset["author"]+" "+news_dataset["title"]
print(news_dataset.head())

X=news_dataset.drop(columns="label",axis=1)
Y=news_dataset["label"]


(20800, 5)
id           0
title      558
author    1957
text        39
label        0
dtype: int64
id        0
title     0
author    0
text      0
label     0
dtype: int64
   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  \
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1   
1  Ever get the feeling your life circles the rou...      0   
2  Why the Truth Might Get You Fired October 29, ...      1   
3  Videos 15 Civilians Killed In Single US Airstr...      1   
4  Print \nAn Iranian woman has 

In [27]:
#Stemming:
            #The process of reducing word to its root
            #actor,actress,acting-->act

port_stem=PorterStemmer()

def stemming(content):
    stemmed_content=re.sub(r"[^a-zA-Z]"," ",content)        #removing everything except alphabets
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words("english")]    #stemming non stopwords only
    stemmed_content=" ".join(stemmed_content)
    return stemmed_content

news_dataset["content"]=news_dataset["content"].apply(stemming)



In [28]:
print(news_dataset.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  \
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1   
1  Ever get the feeling your life circles the rou...      0   
2  Why the Truth Might Get You Fired October 29, ...      1   
3  Videos 15 Civilians Killed In Single US Airstr...      1   
4  Print \nAn Iranian woman has been sentenced to...      1   

                                             content  
0  darrel lucu hous dem aid even see comey letter...  
1  daniel j flynn flynn hillar

In [29]:
#Seperating the dataset
X=news_dataset["content"].values
Y=news_dataset["label"].values

print(X)
print(Y)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']
[1 0 1 ... 0 1 1]


In [30]:
#Convert the textual data into numerical data
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(X)

print(X)

  (0, 15686)	0.28485063562728646
  (0, 2483)	0.3676519686797209
  (0, 7692)	0.24785219520671603
  (0, 8630)	0.29212514087043684
  (0, 2959)	0.2468450128533713
  (0, 13473)	0.2565896679337957
  (0, 4973)	0.233316966909351
  (0, 267)	0.27010124977708766
  (0, 3792)	0.2705332480845492
  (0, 7005)	0.21874169089359144
  (0, 8909)	0.3635963806326075
  (0, 3600)	0.3598939188262559
  (1, 1894)	0.15521974226349364
  (1, 2223)	0.3827320386859759
  (1, 16799)	0.30071745655510157
  (1, 1497)	0.2939891562094648
  (1, 2813)	0.19094574062359204
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (2, 5389)	0.3866530551182615
  (2, 5968)	0.3474613386728292
  (2, 9620)	0.49351492943649944
  (2, 15611)	0.41544962664721613
  (2, 2943)	0.3179886800654691
  :	:
  (20797, 1287)	0.3353805680413986
  (20797, 13122)	0.24825263521976057
  (20797, 12344)	0.27263457663336677
  (20797, 14967)	0.3115945315488075
  (20797, 12138)	0.24778257724396505
  (20797, 9518)	0.295420

In [32]:
#Splitting the data into rain test and split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=3)
            #stratify=Y bcoz it will split the real and fake news in eqal equal portion in training and testing data

In [35]:
#Training the Model:logistic Regression

model=LogisticRegression()
model.fit(X_train,Y_train)

LogisticRegression()

In [None]:

#Evaluation
#Accuracy

X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)

print("Accuracy on training data:",training_data_accuracy)
X_test_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(X_test_prediction,Y_test)
print("Accuracy on test data:",test_data_accuracy)






Making a predictive system


In [41]:
X_new=X_test[1]
prediction=model.predict(X_new)
if prediction==0:
    print("The news is real")
else:
    print("The news is fake")

The news is real
