In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split




In [2]:
data=pd.read_csv('fake_news.csv')
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [3]:
data.dropna(inplace=True)
data.reset_index(inplace=True)

In [4]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import one_hot

In [5]:
X=data.drop('label',axis=1)
X.shape

(18285, 5)

In [6]:
y=data['label']
y.shape

(18285,)

### Text Preprocessing 

In [7]:
import re
corpus=[]

wordnet=WordNetLemmatizer()

for i in range(0,len(data)):
    review=re.sub('[^a-zA-Z]',' ',data['title'][i])
    review=review.lower()
    review=review.split()
    review=[wordnet.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    corpus.append(review)

In [8]:
corpus

['house dem aide even see comey letter jason chaffetz tweeted',
 'flynn hillary clinton big woman campus breitbart',
 'truth might get fired',
 'civilian killed single u airstrike identified',
 'iranian woman jailed fictional unpublished story woman stoned death adultery',
 'jackie mason hollywood would love trump bombed north korea lack trans bathroom exclusive video breitbart',
 'beno hamon win french socialist party presidential nomination new york time',
 'back channel plan ukraine russia courtesy trump associate new york time',
 'obama organizing action partner soros linked indivisible disrupt trump agenda',
 'bbc comedy sketch real housewife isi cause outrage',
 'russian researcher discover secret nazi military base treasure hunter arctic photo',
 'u official see link trump russia',
 'yes paid government troll social medium blog forum website',
 'major league soccer argentine find home success new york time',
 'well fargo chief abruptly step new york time',
 'anonymous donor pay 

### One Hot Representation

In [9]:
voc_size=5000

In [10]:
onehot_rep=[one_hot(word,voc_size)for word in corpus]
print(onehot_rep)

[[3896, 4368, 2954, 638, 3655, 3487, 4772, 2046, 4171, 546], [2328, 318, 2294, 1512, 4115, 32, 569], [1450, 3404, 1097, 2095], [102, 3022, 3411, 1180, 478, 1608], [1751, 4115, 3850, 491, 2331, 861, 4115, 902, 1723, 835], [3220, 3692, 4385, 3425, 3940, 3663, 4915, 934, 995, 3165, 2543, 4717, 3069, 3045, 569], [3998, 116, 2521, 1520, 2960, 2543, 3132, 319, 4981, 1681, 3796], [1272, 1574, 4430, 3887, 1611, 4237, 3663, 1066, 4981, 1681, 3796], [483, 2915, 3617, 3499, 3674, 3044, 2304, 3768, 3663, 4611], [185, 4747, 1892, 4456, 254, 4756, 1509, 4667], [3331, 3720, 2773, 1364, 1662, 3016, 1240, 2558, 1956, 4882, 1745], [1180, 2171, 3655, 2695, 3663, 1611], [1945, 1777, 598, 3218, 2532, 2681, 1543, 1693, 356], [230, 3313, 432, 1363, 578, 4314, 4738, 4981, 1681, 3796], [4189, 2190, 3869, 942, 4987, 4981, 1681, 3796], [1419, 399, 3464, 3530, 437, 2926, 2351, 3727, 439, 115], [3909, 4500, 318], [4935, 53, 3956, 1349, 3663, 956, 2511, 569], [3611, 360, 2294, 89, 4622, 2158, 2477, 2135, 861], [323

### Padding Sequence

In [11]:
sent_length=20
embedded_docs=pad_sequences(onehot_rep,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 2046 4171  546]
 [   0    0    0 ... 4115   32  569]
 [   0    0    0 ... 3404 1097 2095]
 ...
 [   0    0    0 ... 4981 1681 3796]
 [   0    0    0 ... 1494 4567 2615]
 [   0    0    0 ... 1153  462 3944]]


In [12]:
len(embedded_docs)

18285

In [13]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 3896,
       4368, 2954,  638, 3655, 3487, 4772, 2046, 4171,  546])

In [20]:
num_features=20

## LSTM Model

In [21]:
embeding_vector_features=40
model=Sequential()

model.add(Embedding(voc_size,embeding_vector_features,input_length=sent_length))
model.add(LSTM(100,input_shape=(1,num_features)))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_1 (LSTM)               (None, 100)               56400     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256501 (1001.96 KB)
Trainable params: 256501 (1001.96 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [22]:
X_final=np.array(embedded_docs)
y_final=np.array(y)
X_final.shape,y_final.shape

((18285, 20), (18285,))

In [23]:
X_train,X_test,y_train,y_test=train_test_split(X_final,y_final,test_size=0.30,random_state=0)
X_train.shape

(12799, 20)

In [24]:
X_train_reshaped=X_train.reshape(-1,1,num_features)
X_test_reshaped=X_test.reshape(-1,1,num_features)
X_train_reshaped.shape,X_test_reshaped.shape

((12799, 1, 20), (5486, 1, 20))

In [25]:
model.fit(X_train_reshaped,y_train,validation_data=(X_test_reshaped,y_test),epochs=10,batch_size=64)

Epoch 1/10


ValueError: in user code:

    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_1" is incompatible with the layer: expected shape=(None, 20), found shape=(None, 1, 20)


In [19]:
y_pred=model.predict(X_test)



In [21]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test,y_pred))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

## Adding Drop Out Layer

In [44]:
from tensorflow.keras.layers import Dropout
model.add(Embedding(voc_size,embeding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x23922954790>