# Detecting is a news is fake or real

In [37]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import random

In [3]:
news_data = pd.read_csv('data/news.zip', compression='zip') 
## downloaded the news from https://drive.google.com/file/d/1er9NJTLUA3qnRuyhfzuN0XUsoIC4a-_q/view

In [4]:
news_data.shape

(6335, 4)

In [5]:
news_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [7]:
Counter(news_data['label']) #it is rather balanced 

Counter({'FAKE': 3164, 'REAL': 3171})

In [9]:
x = news_data['text']
y = news_data['label']
print(x.shape)
print(y.shape)

(6335,)
(6335,)


In [11]:
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
x_tfidf = tfidf.fit_transform(x)
x_tfidf

<6335x67351 sparse matrix of type '<class 'numpy.float64'>'
	with 1665262 stored elements in Compressed Sparse Row format>

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.3, random_state=42)

In [14]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(4434, 67351)
(4434,)
(1901, 67351)
(1901,)


In [17]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

In [19]:
np.unique(y_test)

array([0, 1])

## Build a PassiveAggressiveClassifier

In [27]:
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(x_train, y_train)
preds = pac.predict(x_test)
f1score = f1_score(preds, y_test)
cm = confusion_matrix(preds, y_test)
# if y_test and y_train were not encoded: confusion_matrix(preds, y_test, labels=['FAKE','REAL'])
cr = classification_report(preds, y_test)
print(f'F1 score is:\n {f1score}\n Confusion matix is: \n {cm} \n Classification reposrt is:\n {cr}')

F1 score is:
 0.9342672413793104
 Confusion matix is: 
 [[912  66]
 [ 56 867]] 
 Classification reposrt is:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94       978
           1       0.93      0.94      0.93       923

    accuracy                           0.94      1901
   macro avg       0.94      0.94      0.94      1901
weighted avg       0.94      0.94      0.94      1901



## Fine tune the PassiveAggressiveClassifier's parameters

In [34]:
parameters = {'C':[1, 2, 3], 'max_iter':[100, 300, 600, 900]}
pac = PassiveAggressiveClassifier()
pac_grid = GridSearchCV(pac, parameters, cv=5)
pac_grid.fit(x_train, y_train)


GridSearchCV(cv=5, estimator=PassiveAggressiveClassifier(),
             param_grid={'C': [1, 2, 3], 'max_iter': [100, 300, 600, 900]})

In [35]:
pac_grid.best_params_

{'C': 2, 'max_iter': 900}

In [36]:
pac = PassiveAggressiveClassifier(C=pac_grid.best_params_['C'], max_iter=pac_grid.best_params_['max_iter'])
pac.fit(x_train, y_train)
preds = pac.predict(x_test)
f1score = f1_score(preds, y_test)
cm = confusion_matrix(preds, y_test)
# if y_test and y_train were not encoded: confusion_matrix(preds, y_test, labels=['FAKE','REAL'])
cr = classification_report(preds, y_test)
print(f'F1 score is:\n {f1score}\n Confusion matix is: \n {cm} \n Classification reposrt is:\n {cr}')

F1 score is:
 0.9364224137931034
 Confusion matix is: 
 [[914  64]
 [ 54 869]] 
 Classification reposrt is:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94       978
           1       0.93      0.94      0.94       923

    accuracy                           0.94      1901
   macro avg       0.94      0.94      0.94      1901
weighted avg       0.94      0.94      0.94      1901



## Example

In [55]:
index = random.randint(0,news_data.shape[0])
text = news_data.iloc[index]['text']
pred = pac.predict(x_tfidf[index])
acctual = y[index]
print(f'The text is: \n {text}')
print(f'The model prediction is: {pred}')
print(f'The actual label is: {acctual}')

The text is: 
 Here's everything you need to know about how the labor market fared in March
The model prediction is: [1]
The actual label is: REAL
