In [28]:
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
%matplotlib inline

In [29]:
# dataframe 'data' is created
data = pd.read_csv('fake_or_real_news.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [30]:
# Concise summary about the dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6335 non-null object
dtypes: int64(1), object(3)
memory usage: 198.0+ KB


In [31]:
# Count of real and fake news
data.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [32]:
# Renaming column name 'Unnamed: 0' to 'ID'
data = data.rename({'Unnamed: 0': 'ID'}, axis='columns')
data.head(1)

Unnamed: 0,ID,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE


In [33]:
data = data.set_index('ID')
data.head()

Unnamed: 0_level_0,title,text,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### We will use only the 'text' column to predict news category(FAKE or REAL)

In [34]:
X = data.text
X.head()

ID
8476     Daniel Greenfield, a Shillman Journalism Fello...
10294    Google Pinterest Digg Linkedin Reddit Stumbleu...
3608     U.S. Secretary of State John F. Kerry said Mon...
10142    — Kaydee King (@KaydeeKing) November 9, 2016 T...
875      It's primary day in New York and front-runners...
Name: text, dtype: object

In [35]:
# Target Variable
y = data.label
y.head()

ID
8476     FAKE
10294    FAKE
3608     REAL
10142    FAKE
875      REAL
Name: label, dtype: object

### Train Test Split

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

#### Since, MultinomialNB Classifier is tuned to work with numbers, we will use CountVectorizer function to convert text to a matrix of token counts

In [37]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer(stop_words='english')
count_train = count_vec.fit_transform(X_train)
count_test = count_vec.transform(X_test)

In [38]:
# Count Vectorizer paramaters
print(count_vec.get_params(deep=True))

{'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.int64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 1), 'preprocessor': None, 'stop_words': 'english', 'strip_accents': None, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'vocabulary': None}


In [39]:
# Stop words in the traning set
print(count_vec.get_stop_words())

frozenset({'have', 'ie', 'due', 'else', 'often', 'seemed', 'could', 'fifty', 'hasnt', 'hence', 'here', 'least', 'formerly', 'ten', 'de', 'sometimes', 'may', 'no', 'they', 'beside', 'system', 'before', 'him', 'many', 'the', 'those', 'twenty', 'whereby', 'would', 'your', 'even', 'always', 'sometime', 'empty', 'down', 'yours', 'within', 'their', 'get', 'sixty', 'more', 'noone', 'ltd', 'becomes', 'below', 'became', 'too', 'whence', 'be', 'me', 'however', 'wherein', 'almost', 'an', 'namely', 'wherever', 'etc', 'above', 'none', 'own', 'somewhere', 'whoever', 'thereupon', 'whole', 'anyway', 'everywhere', 'after', 'throughout', 'last', 'toward', 'a', 'amongst', 'while', 'by', 'beyond', 'two', 'only', 'either', 'or', 'eleven', 'next', 'herself', 'one', 'back', 'hereafter', 'also', 'every', 'except', 'mill', 'thereafter', 'nine', 'must', 'ours', 'mostly', 'describe', 'front', 'whom', 'against', 'although', 'this', 'few', 'same', 'much', 'elsewhere', 'together', 'behind', 'for', 'less', 'any', 'c

In [40]:
# First 30 features names mapped from the feature integer
print(count_vec.get_feature_names()[:30])

['00', '000', '0000', '000000031', '00000031', '00006', '0001pt', '0002', '000billion', '000ft', '000x', '001', '002', '003', '004', '005', '006', '00684', '006s', '007', '007s', '008', '008s', '009', '0099', '00am', '00p', '00pm', '01', '011']


In [41]:
# Total number of featrues in count_vectorizer
len(count_vec.get_feature_names())

59647

In [42]:
# Creating dataframe 'count_X_train' with features names along with their count
count_X_train = pd.DataFrame(count_train.A, columns = count_vec.get_feature_names())
count_X_train.head()

Unnamed: 0,00,000,0000,000000031,00000031,00006,0001pt,0002,000billion,000ft,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Building Naive Bayes Classifier model

In [43]:
clf = MultinomialNB() 
clf.fit(count_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Prediction and Accuracy 

In [44]:
pred = clf.predict(count_test)

In [45]:
score = accuracy_score(y_test, pred)
print("accuracy:   %0.4f" % score)

accuracy:   0.8819


#### Accuracy with default MutinomialNB parameter (i.e at alpha = 1)  is 88.19%. Let's try to find best parameter value to get a better accuracy using GridSearchCV function.

### Grid Search

In [46]:
from sklearn.model_selection import GridSearchCV
param_grid={'alpha':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]}
grid_search = GridSearchCV(clf, param_grid, cv = 5)
grid_search.fit(count_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [47]:
print('Best parameters{}'.format(grid_search.best_params_))
print('Best score {:.2f}'.format(grid_search.best_score_))

Best parameters{'alpha': 0.1}
Best score 0.90


### Building classifer with best parameter (alpha = 0.1)

In [48]:
clf = MultinomialNB(alpha=0.1) 
clf.fit(count_train, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [49]:
pred = clf.predict(count_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.4f" % score)

accuracy:   0.8870


#### We can see that accuracy is improved from 88.19% to 88.7%

### Evaluation

In [50]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,pred))

[[653 114]
 [ 65 752]]


In [51]:
print(classification_report(y_test,pred, labels=['FAKE', 'REAL']))

             precision    recall  f1-score   support

       FAKE       0.91      0.85      0.88       767
       REAL       0.87      0.92      0.89       817

avg / total       0.89      0.89      0.89      1584



### The model has precision of 91% for predicting Fake News which seems to be good enough for a spam classifer.

### Top 30 words for Fake News

In [52]:
features = count_vec.get_feature_names()
sorted(zip(clf.coef_[0], features))[:30]

[(-16.18204997023321, '0001pt'),
 (-16.18204997023321, '0002'),
 (-16.18204997023321, '000billion'),
 (-16.18204997023321, '005'),
 (-16.18204997023321, '00684'),
 (-16.18204997023321, '006s'),
 (-16.18204997023321, '007'),
 (-16.18204997023321, '007s'),
 (-16.18204997023321, '008s'),
 (-16.18204997023321, '0099'),
 (-16.18204997023321, '00am'),
 (-16.18204997023321, '00p'),
 (-16.18204997023321, '00pm'),
 (-16.18204997023321, '013c2812c9'),
 (-16.18204997023321, '01am'),
 (-16.18204997023321, '020'),
 (-16.18204997023321, '02714'),
 (-16.18204997023321, '02870'),
 (-16.18204997023321, '02welcome'),
 (-16.18204997023321, '031'),
 (-16.18204997023321, '032'),
 (-16.18204997023321, '033'),
 (-16.18204997023321, '03747'),
 (-16.18204997023321, '039'),
 (-16.18204997023321, '0400'),
 (-16.18204997023321, '049'),
 (-16.18204997023321, '04pm'),
 (-16.18204997023321, '0509245d29'),
 (-16.18204997023321, '052'),
 (-16.18204997023321, '053')]

### Top 30 words for Real News

In [53]:
sorted(zip(clf.coef_[0], features), reverse=True)[:30]

[(-4.437211518097527, 'said'),
 (-4.54327630289812, 'trump'),
 (-4.897393670119733, 'clinton'),
 (-5.43632735165234, 'state'),
 (-5.444111006439407, 'president'),
 (-5.455023141954477, 'people'),
 (-5.466943549766896, 'obama'),
 (-5.52359133474522, 'new'),
 (-5.554376947352951, 'campaign'),
 (-5.674219128812631, 'republican'),
 (-5.775940351281779, 'party'),
 (-5.910072545713812, 'time'),
 (-5.923303475816235, 'states'),
 (-5.937067103593539, 'just'),
 (-5.940626966382516, 'like'),
 (-5.961163788457368, 'sanders'),
 (-5.990717858446995, 'house'),
 (-6.035184959552062, 'percent'),
 (-6.075580758206824, 'political'),
 (-6.119381553344769, 'voters'),
 (-6.120234759317777, 'year'),
 (-6.1245117428701565, 'presidential'),
 (-6.128376730933197, 'democratic'),
 (-6.129668387015645, 'republicans'),
 (-6.17957718101266, 'white'),
 (-6.190505753876798, 'cruz'),
 (-6.224026885049067, 'told'),
 (-6.248955072944423, 'going'),
 (-6.2611646405206605, 'say'),
 (-6.264117253265585, 'years')]