In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads later on
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the training set; the relative path is resolved against the working directory
df = pd.read_csv('fake_news_train.csv')

In [4]:
# (rows, columns) of the loaded DataFrame
df.shape

(20800, 5)

In [5]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


* id - Unique id for each news headline/title.
* title - Headline/title of the news.
* author - Author of the news article.
* text - Briefing of the headline/title (Body of the news).
* label - Fake news - 1 ; Valid news - 0

In [6]:
# Count the missing (NaN) entries in every column

df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# Filling the null values with a single-space string (' '), not an empty string

df = df.fillna(' ')

In [8]:
# Re-check the per-column null counts after filling
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

# Merging

In [9]:
# Building 'content' as '<author> <title>': author first, then a single space, then title

df['content'] = df['author'] + ' ' + df['title']

In [10]:
# Preview again to check the newly added 'content' column
df.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


# Separating features and label

In [11]:
# Target: the 'label' column; features: every other column of df
y = df['label']
x = df.drop(columns='label')

In [12]:
# Display the feature frame (every column except 'label')
x

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy Iranian woman jailed for fictio...
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,"Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,Michael J. de la Merced and Rachel Abrams Macy...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...","Alex Ansary NATO, Russia To Hold Parallel Exer..."


In [13]:
# Display the label series
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64

# Stemming

Stemming is the process of reducing a word to its root (stem) by stripping affixes from it; the Porter stemmer used here strips suffixes.

In [14]:
# Getting all the English stopwords shipped with NLTK (the same list stemming() filters against)

stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

`Stopwords` - These are words that are used so commonly that they carry very little useful information and are therefore removed from the text before applying NLP techniques.

In [15]:
# Single PorterStemmer instance, used by stemming() to stem each word
port_stem = PorterStemmer()

In [16]:
# Function for stemming each word in a text

_ENGLISH_STOPWORDS = None  # filled on the first stemming() call, then reused


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loading the list once (instead of once per token) and storing it as a
        # set turns every membership test below into a constant-time lookup.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stemming(x):
    """Normalize a text into a space-separated string of stemmed tokens.

    Steps: replace every character other than a-z / A-Z with a space,
    lower-case the result, split it on whitespace, drop the English
    stopwords and stem the remaining words with ``port_stem``.

    Parameters
    ----------
    x : str
        The text to normalize.

    Returns
    -------
    str
        The stemmed, stopword-free words joined by single spaces
        (an empty string when no word survives).
    """
    stop_words = _english_stopwords()

    # Replacing elements other than letters (a-z, A-Z) with a blank space,
    # then converting all the letters to small letters
    letters_only = re.sub('[^a-zA-Z]', ' ', x).lower()

    # split() without arguments also swallows the runs of spaces created above
    return ' '.join(
        port_stem.stem(word)
        for word in letters_only.split()
        if word not in stop_words
    )

In [17]:
# Applying the function on 'content' column
# NOTE: 'content' is overwritten in place, so re-running this cell stems the
# already-stemmed text again; re-run the cell that builds 'content' first.

df['content'] = df['content'].apply(stemming)

In [18]:
# Inspect the stemmed 'content' column
df['content']

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object

* All the words are converted to their root form and the stopwords are removed.
* The text present in 'content' column only contains the useful words. 

In [19]:
# Again assigning features and labels: only the stemmed 'content' text is the
# feature and 'label' is the target, both taken as arrays via .values

x_new = df['content'].values
y_new = df['label'].values

* Only 'content' is used as the feature, not 'text', because 'text' contains long paragraphs that take much more time to process.

In [20]:
# Display the feature array (one stemmed string per article)
x_new

array(['darrel lucu hous dem aid even see comey letter jason chaffetz tweet',
       'daniel j flynn flynn hillari clinton big woman campu breitbart',
       'consortiumnew com truth might get fire', ...,
       'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time',
       'alex ansari nato russia hold parallel exercis balkan',
       'david swanson keep f aliv'], dtype=object)

In [21]:
# Display the label array
y_new

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

# Vectorizing the textual data

A vectorizer is required to convert text into numbers, as machine-learning models can only work with numeric input.

TfidfVectorizer() - 
* Tf stands for `term frequency`: it counts how many times a particular word occurs in a text and assigns the word a numeric weight reflecting its importance within that text.

* idf stands for `inverse document frequency`: it lowers the weight of words that occur in many of the texts, since such frequently repeated words convey very little information.

In [22]:
# Fit the TF-IDF vectorizer and convert the text to a sparse numeric matrix in
# one step. The text is read from df['content'] (the same values x_new was
# assigned from) instead of from x_new itself, so re-running this cell does not
# feed an already-vectorized matrix back into the vectorizer.
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# train/test split, so the IDF weights also see the test documents; consider
# fitting on the training split only.
vector = TfidfVectorizer()
x_new = vector.fit_transform(df['content'].values)

In [23]:
# Print the vectorized data; each entry shows as (row, column) followed by its TF-IDF weight
print(x_new)

  (0, 15686)	0.28485063562728646
  (0, 13473)	0.2565896679337957
  (0, 8909)	0.3635963806326075
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 7005)	0.21874169089359144
  (0, 4973)	0.233316966909351
  (0, 3792)	0.2705332480845492
  (0, 3600)	0.3598939188262559
  (0, 2959)	0.2468450128533713
  (0, 2483)	0.3676519686797209
  (0, 267)	0.27010124977708766
  (1, 16799)	0.30071745655510157
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (1, 2813)	0.19094574062359204
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 1497)	0.2939891562094648
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 13122)	0.2482526352197606
  (20797, 12344)	0.27263457663336677
  (20797, 12138)	0.24778257724396507
  (20797, 10306)	0.08038079000566466
  (20797, 9588)	0.174553480255222
  (20797, 9518)	0.295420

* We do not need to vectorize 'y_new' as it already contains numeric values.

# Using the data in model

In [24]:
# Hold out 20% of the rows for testing and train on the remaining 80%

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_new,
    y_new,
    test_size=0.2,
    stratify=y_new,   # stratify the split on the labels
    random_state=2,   # fixed seed so the split is reproducible
)

In [25]:
# Shapes of the full, training and testing matrices
print (x_new.shape, x_train.shape, x_test.shape)

(20800, 17128) (16640, 17128) (4160, 17128)


### Using Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
# Training the model: a LogisticRegression built with its default arguments
# is fitted on the training split

log = LogisticRegression()
log.fit(x_train, y_train)

LogisticRegression()

#### Evaluation

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
# Accuracy on the training split (the data the model was fitted on)

train_pred = log.predict(x_train)
acc_train = accuracy_score(y_true=y_train, y_pred=train_pred)
print('Accuracy score for training data:', acc_train)

Accuracy score for training data: 0.9865985576923076


In [30]:
# Accuracy on the held-out testing split (data the model was not fitted on)

test_pred = log.predict(x_test)
acc_test = accuracy_score(y_true=y_test, y_pred=test_pred)
print('Accuracy score for testing data:', acc_test)

Accuracy score for testing data: 0.9790865384615385


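* As the difference between the accuracy scores on the training and testing data is very small, the model is not overfitting; together with the high test accuracy (about 98%), this shows the model is `very accurate`.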

# Predictive system

In [41]:
# Take one already-vectorized row of the test split as the sample to classify
# (this is a row of the TF-IDF matrix, not raw text)
inp = x_test[8]

# Predict its label with the trained model
pred = log.predict(inp)

# Report the verdict: label 1 means fake news, anything else is treated as valid
print('The news is FAKE !' if pred[0] == 1 else 'The news is TRUE.')

The news is FAKE !
