In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download NLTK data

In [2]:
# Fetch the NLTK resources used by the preprocessing step further down:
# the English stop-word list and the Punkt models behind nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases look up 'punkt_tab' (instead of the pickled 'punkt'
# models) when tokenizing, so fetch it too; otherwise word_tokenize raises
# LookupError on a fresh environment.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load dataset

In [3]:
# Read the WELFake CSV into a DataFrame.
# NOTE(review): the location is an absolute, machine-specific path; anyone
# else running this notebook has to point it at their own copy of the file.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\musta\OneDrive\Desktop\WELFake_Dataset.csv",
)


# Preprocessing function

In [4]:
# Cache for the stop-word set so it is loaded from NLTK only once, not once
# per row of the DataFrame.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalize one raw text value into a space-joined string of tokens.

    Steps: lowercase, delete every character that is not ``a-z`` or
    whitespace, tokenize with ``nltk.word_tokenize``, then drop English
    stop words.

    Parameters
    ----------
    text : object
        The raw cell value. Anything that is not a ``str`` (e.g. NaN or
        None) is treated as missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``''`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):  # Handle NaN or None values
        return ''

    lowered = text.lower()
    # Characters are deleted, not replaced by a space, so digits and
    # punctuation vanish and e.g. "fake-news" becomes "fakenews".
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    tokens = nltk.word_tokenize(letters_only)
    stop_words = _english_stop_words()
    return ' '.join(token for token in tokens if token not in stop_words)

# Apply preprocessing to title and text columns

In [5]:
# Show the column labels of the loaded frame before picking columns to clean.
print(df.columns)

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')


In [6]:
# Run preprocess_text over each raw text column and store the result in a
# matching 'cleaned_<column>' column.
for _source_column in ('title', 'text'):
    df['cleaned_' + _source_column] = df[_source_column].apply(preprocess_text)

# Combine title and text for final input

In [7]:
# Join the cleaned title and cleaned body with a single space into one
# model input string per row.
df['combined'] = df['cleaned_title'].str.cat(df['cleaned_text'], sep=" ")

# Tokenization and sequence padding


In [8]:
# Build the word index from the combined text, then encode every row as a
# list of integer word ids. num_words caps which ids are emitted.
# NOTE(review): the tokenizer is fitted on all rows, before the train/test
# split made later in the notebook.
tokenizer = Tokenizer(num_words=10_000)
tokenizer.fit_on_texts(texts=df['combined'])
sequences = tokenizer.texts_to_sequences(texts=df['combined'])

# Pad sequences

In [9]:
# Fixed length of every encoded sample fed to the model.
max_sequence_length = 200  # Adjust as needed

# Short sequences are zero-padded at the end; long ones are cut from the
# front (the default truncating mode, spelled out here).
X = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
    truncating='pre',
)

# Target variable

In [10]:
# Labels as a standalone NumPy array (copied out of the DataFrame column).
y = np.array(df['label'], copy=True)

# Train-test split

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Build LSTM model

In [12]:
# Size of the embedding table.
# tokenizer.word_index holds every word seen during fitting, but when the
# tokenizer was built with num_words the encoded sequences only contain ids
# below num_words. Sizing the embedding by the full word_index would allocate
# rows that can never be looked up, so cap it at num_words.
_full_vocab_size = len(tokenizer.word_index) + 1  # +1 for the reserved id 0 (padding)
vocab_size = min(tokenizer.num_words or _full_vocab_size, _full_vocab_size)
embedding_dim = 100

In [13]:
# Assemble the classifier layer by layer.
model = Sequential()
# Map each word id to a dense vector of size embedding_dim.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
# Plain (unidirectional) LSTM that returns only its final output;
# no Bidirectional wrapper is applied here.
model.add(LSTM(128, return_sequences=False))
# Dropout for regularization.
model.add(Dropout(0.3))
# Fully connected hidden layer.
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
# Single sigmoid unit: output layer for binary classification.
model.add(Dense(1, activation='sigmoid'))



# Compile the model

In [14]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train the model


In [15]:
# Train for two epochs in batches of 64, holding back 10% of the training
# data for validation; the returned object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=2,
    validation_split=0.1,
)

Epoch 1/2
[1m812/812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1350s[0m 2s/step - accuracy: 0.7307 - loss: 0.5097 - val_accuracy: 0.8605 - val_loss: 0.3126
Epoch 2/2
[1m812/812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1185s[0m 1s/step - accuracy: 0.8604 - loss: 0.3238 - val_accuracy: 0.8998 - val_loss: 0.2595


# Evaluate the model

In [16]:
# Score the trained model on the held-out test split and report accuracy
# rounded to two decimals.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 42ms/step - accuracy: 0.9004 - loss: 0.2537
Test Accuracy: 0.90
