# Fake News Classifier Using LSTM

## Data Collection

In [1]:
#Download the dataset using  "https://www.kaggle.com/code/ahmedtronic/fake-news-classification/input"

## Data Preprocessing

In [1]:
import pandas as pd

#load dataset
df = pd.read_csv('train.csv')

#remove missing values
df = df.dropna()

df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [2]:
#get Independent Features
X = df.drop('label',axis=1)

#get dependent Features
y = df['label']

print('X Shape:',X.shape)
print('y Shape:',y.shape)

X Shape: (18285, 4)
y Shape: (18285,)


## Text preprocessing

In [3]:
messages = X.copy()
messages.reset_index(inplace=True)

print("Actual messages")
print(messages['title'])

messages['title'] = messages['title'].fillna('')  # Replace NaN with an empty string


import nltk
import re
from nltk.corpus import stopwords

# Download the NLTK stopwords  
nltk.download('stopwords')

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []

print(len(messages),"of messgaes ,takes conversion")
# Process each message
for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z]',' ',messages['title'][i])
    review = review.lower()
    review = review.split()
    
     # Remove stopwords and apply stemming
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

print("After converting :",len(corpus))
print(corpus[:5])
 

Actual messages
0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2                        Why the Truth Might Get You Fired
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
18280    Rapper T.I.: Trump a ’Poster Child For White S...
18281    N.F.L. Playoffs: Schedule, Matchups and Odds -...
18282    Macy’s Is Said to Receive Takeover Approach by...
18283    NATO, Russia To Hold Parallel Exercises In Bal...
18284                            What Keeps the F-35 Alive
Name: title, Length: 18285, dtype: object


[nltk_data] Downloading package stopwords to /home/rgukt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


18285 of messgaes ,takes conversion
After converting : 18285
['hous dem aid even see comey letter jason chaffetz tweet', 'flynn hillari clinton big woman campu breitbart', 'truth might get fire', 'civilian kill singl us airstrik identifi', 'iranian woman jail fiction unpublish stori woman stone death adulteri']


## One hot Encoding

In [4]:
from tensorflow.keras.preprocessing.text import one_hot

# One-hot encode the messages
voc_size = 5000  # Vocabulary size
onehot_repr = [one_hot(words, voc_size) for words in corpus]

print("One hot representation:")
print(onehot_repr[:5])

2025-02-04 21:15:39.567980: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-04 21:15:39.602556: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-04 21:15:39.613179: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-04 21:15:39.638819: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


One hot representation:
[[1536, 1718, 3166, 4543, 4730, 2829, 1483, 3479, 4075, 1344], [1956, 2980, 1493, 1828, 4571, 1744, 409], [3288, 602, 2341, 229], [1082, 2900, 4565, 4052, 4734, 2251], [1182, 4571, 1325, 4271, 2152, 2890, 4571, 2597, 4310, 4841]]


## Padding Sequences

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Embedding Representation
# Define sentence length (maximum number of words per message)
sent_len = 20

# Pad the sequences to make them the same length
embed_docs = pad_sequences(onehot_repr, padding = 'pre', maxlen = sent_len)

print("Embedded docs:")
print(embed_docs[:5])

Embedded docs:
[[   0    0    0    0    0    0    0    0    0    0 1536 1718 3166 4543
  4730 2829 1483 3479 4075 1344]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0 1956
  2980 1493 1828 4571 1744  409]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0 3288  602 2341  229]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
  1082 2900 4565 4052 4734 2251]
 [   0    0    0    0    0    0    0    0    0    0 1182 4571 1325 4271
  2152 2890 4571 2597 4310 4841]]


## Creating the Model

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Model definition
embed_vec_features = 40
model = Sequential()
model.add(Embedding(voc_size, embed_vec_features, input_length = sent_len))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# Model summary
print("Model summary:")
model.summary()


Model summary:




## Model Compilation & Training

In [7]:
import numpy as np

# Convert data into numpy arrays
X_final = np.array(embed_docs)
y_final = np.array(y)

print("X final shape:",X_final.shape,"y final shape:",y_final.shape)


from sklearn.model_selection import train_test_split
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 42)

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)


X final shape: (18285, 20) y final shape: (18285,)
Epoch 1/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 39ms/step - accuracy: 0.7899 - loss: 0.4216 - val_accuracy: 0.9163 - val_loss: 0.1956
Epoch 2/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.9425 - loss: 0.1439 - val_accuracy: 0.9210 - val_loss: 0.1937
Epoch 3/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 36ms/step - accuracy: 0.9681 - loss: 0.0901 - val_accuracy: 0.9173 - val_loss: 0.2024
Epoch 4/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 41ms/step - accuracy: 0.9781 - loss: 0.0658 - val_accuracy: 0.9180 - val_loss: 0.2585
Epoch 5/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 39ms/step - accuracy: 0.9823 - loss: 0.0490 - val_accuracy: 0.9167 - val_loss: 0.3075
Epoch 6/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.9908 - loss: 0.0282 - val_accur

<keras.src.callbacks.history.History at 0x7f19af1d4ac0>

## Adding Dropout

In [8]:
from tensorflow.keras.layers import Dropout

# Create model with dropout
model = Sequential()
model.add(Embedding(voc_size, embed_vec_features, input_length = sent_len))
model.add(Dropout(0.3))  # Dropout layer added
model.add(LSTM(100))
model.add(Dropout(0.3))  # Dropout layer added
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# Model summary
model.summary()


## Model Evaluation

In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Get predictions for the test set
y_pred = model.predict(X_test) > 0.5

# Evaluate model performance
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

print("Accuracy:")
print(acc)


[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step
Confusion Matrix:
[[2430  989]
 [ 406 2210]]
Accuracy:
0.7688483844241922
