In [58]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib
from random import randint
from numpy import linalg as LA

In [59]:
df=pd.read_csv('news.csv')
df.shape

(6335, 4)

In [60]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [61]:
df.isnull()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
6330,False,False,False,False
6331,False,False,False,False
6332,False,False,False,False
6333,False,False,False,False


In [62]:
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [63]:
df.label

0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
        ... 
6330    REAL
6331    FAKE
6332    FAKE
6333    REAL
6334    REAL
Name: label, Length: 6335, dtype: object

In [64]:
#checking for imbalance in the dataset
df.label.value_counts()
i=df.label.value_counts()
fig = go.Figure(data=[go.Bar(
            x=['Real','Fake'], y=i,
            text=i,
            textposition='auto',
        )])

fig.show()

In [65]:
X_train,X_test,y_train,y_test=train_test_split(df['text'], df.label, test_size=0.2, random_state=7)

In [66]:
X_train
X_train.shape

(5068,)

In [67]:
y_train
y_train.shape

(5068,)

In [68]:
X_test.shape

(1267,)

In [69]:
y_test.shape

(1267,)

In [70]:
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
tfidf_test=tfidf_vectorizer.transform(X_test)

print(type(tfidf_train))
print(tfidf_train)
print(tfidf_train.toarray())

#Converting scilearn.csr matrix to numpy array for further processing
newX_train = tfidf_train.toarray()
print(len(newX_train))
newX_test = tfidf_test.toarray()

# Converting Categorical values i.e FAKE and REAL to numbers
le = LabelEncoder()
le.fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)
print(y_train_enc)
print(y_test_enc)

<class 'scipy.sparse._csr.csr_matrix'>
  (0, 56381)	0.03622223988286098
  (0, 16314)	0.053492157980948106
  (0, 19620)	0.030351855107005405
  (0, 52607)	0.04266045446208797
  (0, 14900)	0.039165339742818085
  (0, 53749)	0.029756205182552464
  (0, 15211)	0.07772572986248194
  (0, 61154)	0.06726619958695557
  (0, 59042)	0.047893261248723944
  (0, 42972)	0.03152542343098286
  (0, 54232)	0.038673616329284524
  (0, 59249)	0.04106143649018827
  (0, 28891)	0.06514397995138038
  (0, 41708)	0.03983513460128018
  (0, 50192)	0.045331181477256094
  (0, 44691)	0.0318676439567658
  (0, 11820)	0.046381950858248124
  (0, 7682)	0.04137048243377956
  (0, 50343)	0.10196965191544219
  (0, 48095)	0.021092647294770877
  (0, 17916)	0.03674587236023286
  (0, 46027)	0.10236534701241509
  (0, 16993)	0.02775494464904786
  (0, 55006)	0.03368300200002207
  (0, 51389)	0.03397042876291898
  :	:
  (5067, 32909)	0.09429823872256275
  (5067, 59221)	0.11305513144362901
  (5067, 14649)	0.03772971846597005
  (5067, 55827)

In [71]:
class PassiveAggressive():
    def __init__(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train
        self._epochs = 5 #hyperparameter

    def train(self):
        w = np.zeros(self.x_train.shape) #hyperparameter
        print(w)
        for e in range(self._epochs):
            mistake = 0
            for i in range(len(self.x_train)):
                x, y = shuffle(self.x_train, self.y_train)
                # predict
                y_hat = int(np.argmax(np.dot(w, x)))#here it is either 0 or 1 which are indexes for two classes fake and real resp.
                #np.dot(w,x) gives an array with values at only two indexes 0 and 1 and others are zeros since only two classes are present
                #np.argmax(np.dot(w,x)) gives the maximum value index to which it has been predicted.
                #if the value at index 1 which is for real is greater than the value at index 0 which is for fake then the prediction
                #is done on the basis of max value. so the class is predicted.
                # print(y_hat)
                # update for wrong prediction
                if y != y_hat:
                    mistake += 1
                    loss = max(0, 1 - np.dot(w[y, :], x) + np.dot(w[y_hat, :], x)) 
                    tau = loss / (2 * np.power(LA.norm(x), 2))  #Among PA,PA-I,PA-II it is PA so C is not required
                    w[y, :] = w[y, :] + tau * x
                    w[y_hat, :] = w[y_hat, :] - tau * x
        return w, mistake

def shuffle(x,y):
    p = randint(0,len(x)-1)
    return x[p],int(y[p])

In [72]:
def test(test_x, test_y):
    miss = 0
    print(w)
    pred_values = np.empty(0)
    for i in range(len(test_x)):
        res = int(np.argmax(np.dot(w, test_x[i]))) #This is predicted value for y
        pred_values=np.append(pred_values, res)
        if res != test_y[i]: #if wrong prediction
            miss += 1
    return miss,pred_values

In [73]:
#run for creating a new model
pa = PassiveAggressive(newX_train, y_train_enc)
print(pa)
print(np.count_nonzero(pa))
w, mistakes = pa.train()
print(w)
miss, pred_values = test(newX_test, y_test_enc)
print("accuracy:", "{0:.2f}%".format((1 - miss / len(newX_test)) * 100)) #This line and the below line gives same accuracy

<__main__.PassiveAggressive object at 0x7f2d82de58a0>
1
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[ 0.08275474 -0.05225321 -0.02995192 ...  0.          0.
   0.        ]
 [-0.08275474  0.05225321  0.02995192 ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
[[ 0.08275474 -0.05225321 -0.02995192 ...  0.          0.
   0.        ]
 [-0.08275474  0.05225321  0.02995192 ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.  

In [74]:
filename = 'Running_model.sav'
joblib.dump(w, filename)

['Running_model.sav']

In [75]:
#takes about 5min
miss, pred_values = test(newX_test, y_test_enc)
pred_PA = pred_values
score=accuracy_score(y_test_enc,pred_values)
print(f'Accuracy: {round(score*100,2)}%')


[[ 0.08275474 -0.05225321 -0.02995192 ...  0.          0.
   0.        ]
 [-0.08275474  0.05225321  0.02995192 ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
Accuracy: 92.27%


In [76]:
confusion_matrix(y_test_enc,pred_values, labels=[0,1])

array([[593,  45],
       [ 53, 576]])

In [77]:
print('\n clasification report:\n',classification_report(y_test_enc,pred_values))


 clasification report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.92       638
           1       0.93      0.92      0.92       629

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [78]:
loaded_model = joblib.load('Running_model.sav')

#this ii is real news
ii=['"We are mammas on a mission to protect our children. In Florida, the buck stops with the parents," DeSantis said Saturday at the Women of Distinction Awards Program and Gala, which was held at St. Cecilia’s Catholic Church in Fort Myers. She was honored with the Woman of Inspiration Award during the event.']

In [79]:
# This ii is fake news comment above and uncomment below to see different result
# ii=["""Hey Facebook, As some of you may know, I'm Bill Gates. If you click that share link, I will give you $5,000. I always deliver, I mean, I brought you Windows XP, right?"""]
ii=tfidf_vectorizer.transform(ii)
# final_test = ii.toarray()
# print(final_test)

In [80]:
print(ii)
final_test = ii.toarray()
print(final_test)
print(len(final_test))
print(len(final_test[0]))
print(np.count_nonzero(final_test))
print(len(final_test[0]) - np.count_nonzero(final_test))

  (0, 60178)	0.11231204057778854
  (0, 60169)	0.12359428063229991
  (0, 52346)	0.20269444670586415
  (0, 51742)	0.1820122713479666
  (0, 48012)	0.13571806063341774
  (0, 47648)	0.057240553833322126
  (0, 43329)	0.13339742418481543
  (0, 43100)	0.12802669862444538
  (0, 40189)	0.1547808265450421
  (0, 36813)	0.2773621947872829
  (0, 35659)	0.16183975815412024
  (0, 28304)	0.2352066483715118
  (0, 26318)	0.22322032192110336
  (0, 25537)	0.1164242836503344
  (0, 22690)	0.2618038368832844
  (0, 21950)	0.21194822839457272
  (0, 21551)	0.12297146319410722
  (0, 19585)	0.12626657444283793
  (0, 16717)	0.2126523147904698
  (0, 15567)	0.3125217655260105
  (0, 10970)	0.16072205878566845
  (0, 10745)	0.12077057529354389
  (0, 10119)	0.296963407622012
  (0, 9991)	0.1947955674421839
  (0, 8783)	0.23785401744459175
  (0, 5564)	0.22416781776373773
  (0, 5561)	0.18824838789841108
[[0. 0. 0. ... 0. 0. 0.]]
1
61651
27
61624


In [81]:
# For finding the predicted value, executes only once for one vector so only one answer produced
final_pred_values = np.empty(0)
print(len(final_test))
for i in range(len(final_test)):
    print(np.dot(loaded_model, final_test[i]))
    print(np.count_nonzero(np.dot(loaded_model, final_test[i])))
    result = int(np.argmax(np.dot(loaded_model, final_test[i])))#predicted value
    final_pred_values=np.append(final_pred_values, result)

1
[-0.22034205  0.22034205  0.         ...  0.          0.
  0.        ]
2


In [82]:
print(final_pred_values)
if final_pred_values[0] == 0:
    print('The News is fake')
else:
    print('The news is real')
# out 1 corresponds to real and 0 corresponds to fake

[1.]
The news is real


In [83]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=5).fit(tfidf_train,y_train_enc)
X_train_prediction = model.predict(tfidf_train)
training_accuracy = accuracy_score(X_train_prediction, y_train_enc)
print(training_accuracy)

0.8338595106550908



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [84]:
X_test_prediction = model.predict(tfidf_test)
testing_accuracy = accuracy_score(X_test_prediction, y_test_enc)
print(testing_accuracy)

0.8208366219415943


In [85]:
model_LR = LogisticRegression(max_iter=900).fit(tfidf_train,y_train_enc)
pred_LR = model_LR.predict(tfidf_test)
cr_LR = classification_report(y_test_enc, pred_LR)
print(cr_LR)

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       638
           1       0.94      0.89      0.91       629

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [86]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [87]:
model_DT = DecisionTreeClassifier().fit(tfidf_train,y_train_enc)
pred_DT = model_DT.predict(tfidf_test)
cr_DT = classification_report(y_test_enc, pred_DT)
print(cr_DT)

              precision    recall  f1-score   support

           0       0.81      0.82      0.81       638
           1       0.81      0.81      0.81       629

    accuracy                           0.81      1267
   macro avg       0.81      0.81      0.81      1267
weighted avg       0.81      0.81      0.81      1267



In [88]:
model_RF = RandomForestClassifier().fit(tfidf_train,y_train_enc)
pred_RF = model_RF.predict(tfidf_test)
cr_RF = classification_report(y_test_enc, pred_RF)
print(cr_RF)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91       638
           1       0.91      0.91      0.91       629

    accuracy                           0.91      1267
   macro avg       0.91      0.91      0.91      1267
weighted avg       0.91      0.91      0.91      1267



In [89]:
from sklearn.ensemble import AdaBoostClassifier
model_ADA = AdaBoostClassifier().fit(tfidf_train, y_train_enc)
pred_ADA = model_ADA.predict(tfidf_test)
cr_ADA = classification_report(y_test_enc, pred_ADA)
print(cr_ADA)

              precision    recall  f1-score   support

           0       0.88      0.88      0.88       638
           1       0.88      0.88      0.88       629

    accuracy                           0.88      1267
   macro avg       0.88      0.88      0.88      1267
weighted avg       0.88      0.88      0.88      1267



In [90]:
score_1 = accuracy_score(y_test_enc,pred_LR)
score_2 = accuracy_score(y_test_enc,pred_DT)
score_3 = accuracy_score(y_test_enc,pred_RF)
score_4 = accuracy_score(y_test_enc, pred_ADA)
score_5 = accuracy_score(y_test_enc, pred_PA)
results = pd.DataFrame([["Logistic Regression",score_1],["Decision Tree",score_2],["Random Forest",score_3],["ADABoost Classifier", score_4],["PA Classifier", score_5]],columns=["Model","Accuracy"])
results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.917127
1,Decision Tree,0.811365
2,Random Forest,0.909234
3,ADABoost Classifier,0.879242
4,PA Classifier,0.922652


In [91]:
#Comparing with Sklearn model
from sklearn.linear_model import PassiveAggressiveClassifier

model_PA = PassiveAggressiveClassifier(max_iter=100).fit(tfidf_train, y_train_enc)
pred_PA_predefined = model_PA.predict(tfidf_test)
cr_PA = classification_report(y_test_enc, pred_PA_predefined)
print(cr_PA)
accuracy= accuracy_score(y_test_enc, pred_PA_predefined)
print(accuracy)

              precision    recall  f1-score   support

           0       0.93      0.92      0.93       638
           1       0.92      0.93      0.93       629

    accuracy                           0.93      1267
   macro avg       0.93      0.93      0.93      1267
weighted avg       0.93      0.93      0.93      1267

0.9289660615627466
