In [12]:
import pandas as pd
import numpy as np

In [2]:
d = pd.read_csv("fake news train.csv")
d.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [15]:
d.shape

(18285, 5)

In [16]:
d.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [17]:
# d.dropna(inplace=True)
d.reset_index(inplace=True, drop=True)

#### DATA PREPROCESSING 🧹

In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [19]:
messages = d.copy()

In [20]:
messages["title"]

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2                        Why the Truth Might Get You Fired
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
18280    Rapper T.I.: Trump a ’Poster Child For White S...
18281    N.F.L. Playoffs: Schedule, Matchups and Odds -...
18282    Macy’s Is Said to Receive Takeover Approach by...
18283    NATO, Russia To Hold Parallel Exercises In Bal...
18284                            What Keeps the F-35 Alive
Name: title, Length: 18285, dtype: object

In [21]:
len(messages)

18285

In [22]:
corpus = []
for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z]',' ',messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [PorterStemmer().stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [23]:
corpus

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri',
 'jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'beno hamon win french socialist parti presidenti nomin new york time',
 'back channel plan ukrain russia courtesi trump associ new york time',
 'obama organ action partner soro link indivis disrupt trump agenda',
 'bbc comedi sketch real housew isi caus outrag',
 'russian research discov secret nazi militari base treasur hunter arctic photo',
 'us offici see link trump russia',
 'ye paid govern troll social media blog forum websit',
 'major leagu soccer argentin find home success new york time',
 'well fargo chief abruptli step new york time',
 'anonym donor pay million releas everyon arrest dakota access pipelin',
 'fbi close hilla

#### LIBRARIES 📚

In [24]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional

voc_size = 5000

In [25]:
one_hot_represent = [one_hot(words,voc_size)for words in corpus]
one_hot_represent

[[2568, 2131, 93, 3600, 4067, 4884, 1787, 3126, 2869, 3515],
 [4712, 484, 4607, 1425, 603, 2902, 1117],
 [1729, 2624, 959, 4247],
 [1389, 3461, 3450, 2851, 23, 446],
 [285, 603, 995, 1729, 775, 1682, 603, 4183, 3888, 4077],
 [3193,
  2587,
  697,
  1638,
  2540,
  4932,
  695,
  3614,
  316,
  4110,
  163,
  4414,
  1920,
  2119,
  1117],
 [4215, 1873, 1893, 323, 598, 2542, 2058, 2878, 599, 3225, 292],
 [862, 115, 1980, 3938, 99, 1637, 4932, 4045, 599, 3225, 292],
 [2759, 2891, 3896, 3800, 1369, 610, 2646, 4039, 4932, 3696],
 [4352, 1274, 2271, 4970, 360, 173, 1511, 4531],
 [772, 3568, 4239, 3744, 668, 4282, 2606, 4499, 1524, 4807, 935],
 [2851, 4569, 4067, 610, 4932, 99],
 [2642, 4051, 2405, 3713, 3559, 4939, 3828, 1928, 295],
 [4477, 985, 1242, 4894, 2651, 2579, 3105, 599, 3225, 292],
 [3317, 1070, 3169, 3833, 1261, 599, 3225, 292],
 [1952, 1450, 2130, 2595, 3612, 15, 3815, 743, 4729, 3891],
 [1560, 2916, 484],
 [2896, 4473, 2227, 880, 4932, 3363, 2608, 1117],
 [93, 941, 4607, 2693, 

In [26]:
embedded_docs = pad_sequences(one_hot_represent,padding='pre',maxlen=20)
embedded_docs

array([[   0,    0,    0, ..., 3126, 2869, 3515],
       [   0,    0,    0, ...,  603, 2902, 1117],
       [   0,    0,    0, ..., 2624,  959, 4247],
       ...,
       [   0,    0,    0, ...,  599, 3225,  292],
       [   0,    0,    0, ..., 1392,  842, 4227],
       [   0,    0,    0, ..., 4011, 1402, 1855]], dtype=int32)

In [27]:
embedded_docs.shape

(18285, 20)

### Build The Embedding Layer ✈️

In [28]:
max_features = 40
model = Sequential()
model.add(Embedding(voc_size,max_features,input_length=20))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# print(model.summary())



### Create X and Y features to feed the model

In [29]:
x_final = np.array(embedded_docs)
y_final = np.array(d['label'])

In [30]:
x_final.shape,y_final.shape

((18285, 20), (18285,))

In [31]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_final,y_final,test_size=0.30,random_state=42)

In [32]:
model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.7955 - loss: 0.4139 - val_accuracy: 0.9149 - val_loss: 0.1976
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.9519 - loss: 0.1237 - val_accuracy: 0.9151 - val_loss: 0.2022
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.9664 - loss: 0.0894 - val_accuracy: 0.9101 - val_loss: 0.2387
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.9790 - loss: 0.0624 - val_accuracy: 0.9101 - val_loss: 0.2499
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9874 - loss: 0.0421 - val_accuracy: 0.9098 - val_loss: 0.2953
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9913 - loss: 0.0284 - val_accuracy: 0.8966 - val_loss: 0.3515
Epoch 7/10
[1m200/200

<keras.src.callbacks.history.History at 0x11a13eb4530>

In [33]:
model.summary()

In [34]:
##see the embedded vectors
model.layers[0].get_weights()[0]
model.layers[0].get_weights()[0].shape


(5000, 40)

In [35]:
corpus[0],embedded_docs[1]

('hous dem aid even see comey letter jason chaffetz tweet',
 array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 4712,  484, 4607, 1425,  603, 2902, 1117], dtype=int32))

In [36]:
y_pred = model.predict(x_test)

[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [37]:
y_pred

array([[9.9995798e-01],
       [1.0805184e-03],
       [2.9471926e-03],
       ...,
       [9.9991232e-01],
       [7.4319229e-07],
       [9.9921536e-01]], dtype=float32)

In [38]:
y_pred = np.where(y_pred>=0.5,1,0)

In [39]:
y_pred

array([[1],
       [0],
       [0],
       ...,
       [1],
       [0],
       [1]])

#### MODEL PERFORMANCE 🔥

In [40]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
print("CONFUSION MATRIX:")
print(confusion_matrix(y_test,y_pred))
print("\nACCURACY SCORE:")
print(accuracy_score(y_test,y_pred)*100,"%")
print("\nCLASSICATION REPORT")
print(classification_report(y_test,y_pred))

CONFUSION MATRIX:
[[2832  275]
 [ 252 2127]]

ACCURACY SCORE:
90.39372949325556 %

CLASSICATION REPORT
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      3107
           1       0.89      0.89      0.89      2379

    accuracy                           0.90      5486
   macro avg       0.90      0.90      0.90      5486
weighted avg       0.90      0.90      0.90      5486



In [41]:
import pickle
with open("fake_news_model.pkl", "wb") as file:  
    pickle.dump(model, file)
