In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news/submit.csv
/kaggle/input/fake-news/train.csv
/kaggle/input/fake-news/test.csv


**Dataset Variables:**

* id - Every article's ID (unique)
* title - Title of the news article
* author - Author of the article
* text - The text of the article (might be incomplete)
* label - Identification of article as Fake or not (Target Variable)

In [2]:
# Importing the dependencies 

import pandas as pd
import numpy as np
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

* Numpy and Pandas are used for handling and processing the csv data as a dataframe.
* re is used for identifying/finding words or parts of words from an extract.
* In NLTK, we use stopwords to identify the unnecessary words and Porter Stemmer performs stemming to reduce a word to its root word.
* In sklearn, we use: TfidVectorizer to compute word occurrence frequency matrix. train_test_split to perform splitting of data. LogisticRegression is the classification model we are going to implement in this case. And accuracy_score is used to obtain the accuracy metrics of our model.


In [3]:
# Download all the stopwords from the nltk library

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# View the English stopwords that will be unnecessary for our analysis

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# Load the data into a dataframe

news_df = pd.read_csv("/kaggle/input/fake-news/train.csv")

In [6]:
# View the dataframe's first few entries

news_df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
# Number of rows and columns in our dataframe (ROWS, COLUMNS)

news_df.shape

(20800, 5)

In [8]:
# View the general info about our dataframe columns

news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


Since there are differences in the number of non null entries in each column, we will now check the number of entries which are empty in each column.

In [9]:
# View number of null values in each column

news_df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [10]:
# Fill our Null values with an empty space

news_df = news_df.fillna('')

In [11]:
# Merging the title and author columns in our dataframe

news_df['article'] = news_df['title'] + news_df['author']

In [12]:
news_df

Unnamed: 0,id,title,author,text,label,article
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You FiredConsortiumnew...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...
...,...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,Rapper T.I.: Trump a ’Poster Child For White S...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,"N.F.L. Playoffs: Schedule, Matchups and Odds -..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,Macy’s Is Said to Receive Takeover Approach by...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"NATO, Russia To Hold Parallel Exercises In Bal..."


In [13]:
news_df.drop(columns=['id'], inplace=True)

In [14]:
news_df

Unnamed: 0,title,author,text,label,article
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You FiredConsortiumnew...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...
...,...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,Rapper T.I.: Trump a ’Poster Child For White S...
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,"N.F.L. Playoffs: Schedule, Matchups and Odds -..."
20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,Macy’s Is Said to Receive Takeover Approach by...
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"NATO, Russia To Hold Parallel Exercises In Bal..."


In [15]:
news_df["author"].value_counts()

                                             1957
Pam Key                                       243
admin                                         193
Jerome Hudson                                 166
Charlie Spiering                              141
                                             ... 
Jeremy R. Hammond                               1
Vic Bishop                                      1
Douglas Martin                                  1
Najim Rahim and Fahim Abed                      1
Michael J. de la Merced and Rachel Abrams       1
Name: author, Length: 4202, dtype: int64

In [16]:
# Separate the Target variable from the dataset

X = news_df.drop(columns='label', axis=1)
Y = news_df['label']

In [17]:
X

Unnamed: 0,title,author,text,article
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You FiredConsortiumnew...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...
...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,Rapper T.I.: Trump a ’Poster Child For White S...
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,"N.F.L. Playoffs: Schedule, Matchups and Odds -..."
20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,Macy’s Is Said to Receive Takeover Approach by...
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal..."


In [18]:
Y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64

**Stemming Process**

In [19]:
p_stemming = PorterStemmer()

In [20]:
# Function to reduce the word to its original

def stemming(content):
    stemmed_word = re.sub('[^a-zA-Z]',' ',content)
    stemmed_word = stemmed_word.lower()
    stemmed_word = stemmed_word.split()
    stemmed_word = [p_stemming.stem(word) for word in stemmed_word if not word in stopwords.words('english')]
    stemmed_word = ' '.join(stemmed_word)
    return stemmed_word

In [21]:
# Apply the above stemming function to the column having the article name and author together. 

news_df['article'] = news_df['article'].apply(stemming)

In [22]:
news_df['article']

0        hous dem aid even see comey letter jason chaff...
1        flynn hillari clinton big woman campu breitbar...
2                   truth might get firedconsortiumnew com
3        civilian kill singl us airstrik identifiedjess...
4        iranian woman jail fiction unpublish stori wom...
                               ...                        
20795    rapper trump poster child white supremaci jero...
20796    n f l playoff schedul matchup odd new york tim...
20797    maci said receiv takeov approach hudson bay ne...
20798    nato russia hold parallel exercis balkansalex ...
20799                            keep f alivedavid swanson
Name: article, Length: 20800, dtype: object

In [23]:
X = news_df['article'].values
X

array(['hous dem aid even see comey letter jason chaffetz tweet itdarrel lucu',
       'flynn hillari clinton big woman campu breitbartdaniel j flynn',
       'truth might get firedconsortiumnew com', ...,
       'maci said receiv takeov approach hudson bay new york timesmichael j de la merc rachel abram',
       'nato russia hold parallel exercis balkansalex ansari',
       'keep f alivedavid swanson'], dtype=object)

In [24]:
Y = news_df['label'].values
Y

array([1, 0, 1, ..., 0, 1, 1])

In [25]:
# Convert the text to numerical data using Vectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [26]:
X

<20800x23585 sparse matrix of type '<class 'numpy.float64'>'
	with 196475 stored elements in Compressed Sparse Row format>

In [27]:
# Splitting of Train and Test data

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 1)

In [28]:
# Performing Logistic Regression

ml_model = LogisticRegression()

In [29]:
# Fit & Train our data

ml_model.fit(X_train, Y_train)

LogisticRegression()

In [30]:
# Check for Accuracy (Train Data)

X_train_predict = ml_model.predict(X_train)
train_data_accuracy = accuracy_score(X_train_predict, Y_train)
percent_tr_accuracy = train_data_accuracy * 100
print("Accuracy for Train data: ", percent_tr_accuracy)

Accuracy for Train data:  98.26322115384616


In [31]:
# Check for Accuracy (Test Data)

X_test_predict = ml_model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_predict, Y_test)
percent_test_accuracy = test_data_accuracy * 100
print("Accuracy for Test data: ", percent_test_accuracy)

Accuracy for Test data:  96.5625


In [32]:
# Build a Simple Predictive System

'''
index = int(input("Enter article number to be verified: "))
^ To get article number as input from user
'''

X_new = X_test[26]
new_predict = ml_model.predict(X_new)
if(new_predict[0]==0):
    print("The News is real")
else:
    print("The News is fake")

The News is fake
