The following program functions as a Fake News detector

In [69]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.stem.porter import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

DATA PREPROCESSING

In [70]:
train_data = pd.read_csv(r'news.csv\train.csv')
print(train_data)
print(train_data.shape)
train_data.info()

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [71]:
train_data.isnull().sum()
check = train_data[train_data.isnull().any(axis=1)]
print(check)
train_data.fillna('')

          id                                              title       author  \
6          6  Life: Life Of Luxury: Elton John’s 6 Favorite ...          NaN   
8          8  Excerpts From a Draft Script for Donald Trump’...          NaN   
20        20  News: Hope For The GOP: A Nude Paul Ryan Has J...          NaN   
23        23  Massachusetts Cop’s Wife Busted for Pinning Fa...          NaN   
31        31  Israel is Becoming Pivotal to China’s Mid-East...          NaN   
...      ...                                                ...          ...   
20745  20745  Thomas Frank Explores Whether Hillary Clinton ...          NaN   
20768  20768  Osama bin Laden’s older brother rents out luxu...          NaN   
20771  20771                                                NaN   Letsbereal   
20772  20772                                                NaN  beersession   
20786  20786  Government Forces Advancing at Damascus-Aleppo...          NaN   

                                       

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [72]:
train_data['content'] =  train_data['author'] + '--' + train_data['title']
train_data.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus--House Dem Aide: We Didn’t Even ...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn--FLYNN: Hillary Clinton, Big W..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com--Why the Truth Might Get Yo...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss--15 Civilians Killed In Single...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy--Iranian woman jailed for ficti...


In [73]:
X = train_data['content']
y = train_data['label']

X.head()
# y.head()

0    Darrell Lucus--House Dem Aide: We Didn’t Even ...
1    Daniel J. Flynn--FLYNN: Hillary Clinton, Big W...
2    Consortiumnews.com--Why the Truth Might Get Yo...
3    Jessica Purkiss--15 Civilians Killed In Single...
4    Howard Portnoy--Iranian woman jailed for ficti...
Name: content, dtype: object

STEMMING

In [74]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
port_stem = PorterStemmer()
lemm = WordNetLemmatizer()
def stemming(content):
    if not isinstance(content, str):
        return ""

    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    return stemmed_content

def lem(content):
    lem_content = []
    for word in content:
        word = lemm.lemmatize(word)
        lem_content.append(word)
    lem_content = ' '.join(lem_content)
    return lem_content

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shaunak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shaunak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [75]:
X = X.apply(stemming)
X = X.apply(lem)
X.head()

0    darrel lucu hous dem aid even see comey letter...
1    daniel j flynn flynn hillari clinton big woman...
2               consortiumnew com truth might get fire
3    jessica purkiss civilian kill singl u airstrik...
4    howard portnoy iranian woman jail fiction unpu...
Name: content, dtype: object

In [76]:
X=X.values
y=y.values
print(X)
print(y)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']
[1 0 1 ... 0 1 1]


In [77]:
v = TfidfVectorizer()
v.fit(X)
X = v.transform(X)

In [78]:
print(X.__len__)

<bound method spmatrix.__len__ of <20800x15920 sparse matrix of type '<class 'numpy.float64'>'
	with 194573 stored elements in Compressed Sparse Row format>>


TRAINING THE MODEL

In [79]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify=y, random_state=2)

In [80]:
ml_mod = LogisticRegression()

In [81]:
ml_mod.fit(X_train, y_train)

EVALUATION

In [82]:
X_train_pred_label = ml_mod.predict(X_train)
acc_train = accuracy_score(X_train_pred_label, y_train)
print(f"Training Accuracy: {acc_train*100:.2f}%")

Training Accuracy: 98.64%


In [83]:
X_test_pred_label = ml_mod.predict(X_test)
print(X_test_pred_label)
acc_test = accuracy_score(X_test_pred_label, y_test)
print(f"Testing Accuracy: {acc_test*100:.2f}%")

[1 0 1 ... 1 1 0]
Testing Accuracy: 98.05%
