In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import re
import string

In [76]:
dfake = pd.read_csv('./dataset/Fake.csv')
dtrue = pd.read_csv('./dataset/True.csv')

In [77]:
dfake['class'] = 0
dtrue['class'] = 1

# Identify the quantity of entries and its columns
dfake.shape, dtrue.shape

((23481, 5), (21417, 5))

In [78]:
# Clean dataset
dfake_manual_testing = dfake.tail(10)
dfake.drop(dfake.tail(10).index, inplace=True)

dtrue_manual_testing = dtrue.tail(10)
dtrue.drop(dtrue.tail(10).index, inplace=True)

dfake.shape, dtrue.shape


((23471, 5), (21407, 5))

In [79]:
dfake_manual_testing['class'] = 0
dtrue_manual_testing['class'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfake_manual_testing['class'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dtrue_manual_testing['class'] = 1


In [80]:
# View first 10 entries
dfake_manual_testing.head(10)

Unnamed: 0,title,text,subject,date,class
23471,Seven Iranians freed in the prisoner swap have...,"21st Century Wire says This week, the historic...",Middle-east,"January 20, 2016",0
23472,#Hashtag Hell & The Fake Left,By Dady Chery and Gilbert MercierAll writers ...,Middle-east,"January 19, 2016",0
23473,Astroturfing: Journalist Reveals Brainwashing ...,Vic Bishop Waking TimesOur reality is carefull...,Middle-east,"January 19, 2016",0
23474,The New American Century: An Era of Fraud,Paul Craig RobertsIn the last years of the 20t...,Middle-east,"January 19, 2016",0
23475,Hillary Clinton: ‘Israel First’ (and no peace ...,Robert Fantina CounterpunchAlthough the United...,Middle-east,"January 18, 2016",0
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0
23480,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",0


In [81]:
# View first 10 entries
dtrue_manual_testing.head(10)

Unnamed: 0,title,text,subject,date,class
21407,"Mata Pires, owner of embattled Brazil builder ...","SAO PAULO (Reuters) - Cesar Mata Pires, the ow...",worldnews,"August 22, 2017",1
21408,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",1
21409,"U.S., North Korea clash at U.N. arms forum on ...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",1
21410,Headless torso could belong to submarine journ...,COPENHAGEN (Reuters) - Danish police said on T...,worldnews,"August 22, 2017",1
21411,North Korea shipments to Syria chemical arms a...,UNITED NATIONS (Reuters) - Two North Korean sh...,worldnews,"August 21, 2017",1
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


In [82]:
# Merge true and fake news dataset
data_merge = pd.concat([dfake, dtrue], axis=0)
data_merge.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [83]:
data_merge.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [84]:
# Remove unnecessary columns
data = data_merge.drop(['title', 'subject', 'date'], axis=1)
data.isnull().sum()


text     0
class    0
dtype: int64

In [85]:
data.sample(frac=1)

Unnamed: 0,text,class
8525,NEW YORK (Reuters) - Democratic presidential n...,1
6867,"FAYETTEVILLE, N.C. (Reuters) - President-elect...",1
1328,(Reuters) - U.S. Food and Drug Administration ...,1
2075,KABUL (Reuters) - A spokesman for the Afghan T...,1
13952,Donald Trump was on a phone interview with CNN...,0
...,...,...
20765,NY Gov. Andrew Cuomo (D) thinks he s the boss ...,0
16941,GENEVA (Reuters) - U.S. President Donald Trump...,1
13933,BANGKOK (Reuters) - Thai Prime Minister Prayut...,1
12605,,0


In [86]:
data.reset_index(inplace=True)
data.drop(['index'], axis=1, inplace=True)
data.columns

Index(['text', 'class'], dtype='object')

In [87]:
data.head()

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [88]:
# Convert to lowercase and remove unnecessary characters. Gives more weight to generate accurate results
def wordformat(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) # removes special characters 
    text = re.sub('https?://\S+|www\.\S+', '', text) # removes links
    text = re.sub('<.*?>+', '', text) # removes html tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # removes punctuation
    text = re.sub('\n', '', text) # removes new lines
    text = re.sub('\w*\d\w*', '', text) # removes words containing numbers
    return text

data['text'] = data['text'].apply(wordformat)

In [89]:
x = data['text']
y = data['class']
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.25)

In [90]:
tf = TfidfVectorizer()

xv_tr = tf.fit_transform(x_tr)
xv_ts = tf.transform(x_ts)


In [91]:
LR = LogisticRegression()

LR.fit(xv_tr, y_tr)

In [93]:
# Check for accuracy
pred_lr = LR.predict(xv_ts)
LR.score(xv_ts, y_ts)

0.9864527629233512

In [94]:
print(classification_report(y_ts, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5935
           1       0.98      0.99      0.99      5285

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220

