In [2]:
#notebook content source and project idea: https://data-flair.training/blogs/advanced-python-project-detecting-fake-news/
#package import
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
#read data. personal note: pathname has to be specified all the way from Users. 
#no changing directory in jupyter notebook apparently
df=pd.read_csv('/Users/stefbp/Desktop/bata/ML_AI/FakeNewsDetection/data/news.csv')

#Get data shape and head
df.shape

(6335, 4)

In [4]:
#first 5 observations
df.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
#define labels series as the label column
label = df.loc[:,'label']
label.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [6]:
#Split the dataset. df's text column as the predictor, and the label column or variable as the dependent variable.
#split is 80-20 train-test.
x_train,x_test,y_train,y_test=train_test_split(df['text'], label, test_size=0.2, random_state=66)

In [7]:
#Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
#stop_words arg: 'english' basically corresponds to a set of stop words. stop words (like 'and') are to be thrown out.
#stop words must be read on later, and must be chosen appropriately according to aim.

#Fit transform train set, transform test set according to tfidf_vectorizer's set parameters.
tfidf_train=tfidf_vectorizer.fit_transform(x_train)
tfidf_test=tfidf_vectorizer.transform(x_test)

In [8]:
tfidf_train.shape

(5068, 61872)

In [9]:
tfidf_test.shape
#kinda begs the question: what is the 61872 for the columns? the number of unique words in the data?

(1267, 61872)

In [10]:
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)#fit the train tfidf_vectorized to the train label.

#Predict with the trained passive aggressive classifier on the test set
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)#calculate accuracy
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.66%


In [11]:
#Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])
#so, 603 true positives, 570 true negatives, 52 false positives, 42 false negatives

array([[607,  38],
       [ 55, 567]])

okay, so the passive aggressive classifier:
good for huge datasets since it works iteratively (get example, update model, throw away example, rinse & repeat).
it's aggressive in that it has regularization to penalize errors, but passive when the example's prediction result is within expectations (in this case, actually being right).
parameters: regularization parameter (how much the penalty is), max iterations (typical), and error tolerance (yeah).

sources: https://www.geeksforgeeks.org/passive-aggressive-classifiers/

so, begs the question: how do we make it better? i think the way to crank up this accuracy:
1. vectorize the data another way (will try this later)
2. play around with the parameters of the passive aggressive classifier
we'll go with no. 2 first.

In [51]:
#general aim is to run the passive aggressive classifier on several different levels of regularization and max iterations.
#will leave the error requirement alone for now.
#start with iterations alone. i'll try between 50-200 on increments of 10 (16 levels) and make a chart of the accuracy for each point.
trial_iter = pd.DataFrame(index=range(16),columns=range(2))
trial_iter.columns=['iterations','accuracy']
trial_iter['iterations'] = np.linspace(50,200,num=16)#the iterations
trial_iter.tail(4)

Unnamed: 0,iterations,accuracy
12,170.0,
13,180.0,
14,190.0,
15,200.0,


In [52]:
for iter in trial_iter['iterations']:
    trial_pac=PassiveAggressiveClassifier(max_iter=iter,tol=1e-10)
    trial_pac.fit(tfidf_train,y_train)#fit the train tfidf_vectorized to the train label.
    trial_y_pred=pac.predict(tfidf_test)
    trial_score=accuracy_score(y_test,trial_y_pred)#calculate accuracy
    
    trial_iter.loc[trial_iter['iterations'] == iter,'accuracy'] = trial_score #allocate the trial_score to its corresponding iteration

In [53]:
trial_iter

Unnamed: 0,iterations,accuracy
0,50.0,0.926598
1,60.0,0.926598
2,70.0,0.926598
3,80.0,0.926598
4,90.0,0.926598
5,100.0,0.926598
6,110.0,0.926598
7,120.0,0.926598
8,130.0,0.926598
9,140.0,0.926598


so clearly adding iterations didnt do jack. what about regularization? i'll try it from 0.5 to 2.0 in increments of 0.1, so 16 levels again to this thing.

In [41]:
trial_reg = pd.DataFrame(index=range(16),columns=range(2))
trial_reg.columns=['regularization','accuracy']
trial_reg['regularization'] = np.linspace(0.5,2.0,num=16)#the iterations
trial_reg.tail(4)

Unnamed: 0,regularization,accuracy
12,1.7,
13,1.8,
14,1.9,
15,2.0,


In [46]:
for iter in trial_reg['regularization']:
    trial_pac=PassiveAggressiveClassifier(max_iter=50,C=iter)
    trial_pac.fit(tfidf_train,y_train)#fit the train tfidf_vectorized to the train label.
    trial_y_pred=pac.predict(tfidf_test)
    trial_score=accuracy_score(y_test,trial_y_pred)#calculate accuracy
    
    trial_reg.loc[trial_reg['regularization'] == iter,'accuracy'] = trial_score #allocate the trial_score to its corresponding iteration

In [47]:
trial_reg

Unnamed: 0,regularization,accuracy
0,0.5,0.926598
1,0.6,0.926598
2,0.7,0.926598
3,0.8,0.926598
4,0.9,0.926598
5,1.0,0.926598
6,1.1,0.926598
7,1.2,0.926598
8,1.3,0.926598
9,1.4,0.926598


separately, i did try to tighten the tolerance level up to 1e-10. same results, even with anticipation that a stricter tolerance would usually imply the need of a higher iteration limit.

thus, i cant think of any way to modify the passive aggressive classifier on tf-idf vectorized data that actually increases the accuracy. will have to modify the methodology completely. thus ends the experiment here.