In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# The URL to the raw WELFake dataset CSV file
url = '/content/drive/My Drive/WELFake_Dataset.csv'

# Load the dataset directly from the URL
# The dataset has no header, so we specify header=None
df = pd.read_csv(url, header=None)

# Let's give the columns meaningful names
df.columns = ['id', 'title', 'text', 'label']

print("Dataset loaded successfully from URL!")
df.head()

Dataset loaded successfully from URL!


Unnamed: 0,id,title,text,label
0,,title,text,label
1,0.0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,1.0,,Did they post their votes for Hillary already?,1
3,2.0,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
4,3.0,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0


In [None]:
# Check for missing values
print("Missing values before cleaning:")
print(df.isnull().sum())

# Drop the 'id' column as it's not needed
df = df.drop('id', axis=1)

# Fill any NaNs in text/title with an empty string
df = df.fillna('')

# Combine title and text for a richer feature
df['content'] = df['title'] + ' ' + df['text']

# Define our features (X) and target (y)
# Ensure the label is numeric
X = df['content']
y = pd.to_numeric(df['label'], errors='coerce')

# Drop any rows where the label could not be converted
y = y.dropna()
X = X[y.index]


print("\nData cleaned and preprocessed.")
print(f"Total samples: {len(df)}")

Missing values before cleaning:
id         1
title    558
text      39
label      0
dtype: int64

Data cleaned and preprocessed.
Total samples: 72135


In [None]:
# Define vocabulary size and max sequence length
vocab_size = 10000
max_len = 250
oov_token = "<OOV>" # Token for words not in the vocabulary

# Initialize and fit the tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X)

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(X)

# Pad the sequences to ensure uniform length
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

print("Text tokenized and padded.")
print("Shape of padded sequences:", padded_sequences.shape)

Text tokenized and padded.
Shape of padded sequences: (72134, 250)


In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (57707, 250)
Testing set shape: (14427, 250)


In [None]:
# Make sure your max_len variable is defined from the padding step
# For example:
max_len = 250
vocab_size = 10000

# 1. Define the entire model in one block
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# 2. ✅ Force the model to build its layers with a specific input shape
# The 'None' stands for the batch size, which can be variable.
model.build(input_shape=(None, max_len))

# 3. Now, print the summary
print("--- Model Summary After Building ---")
model.summary()

--- Model Summary After Building ---


In [None]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("Model compiled successfully.")

Model compiled successfully.


In [None]:
# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.1, # Use 10% of training data for validation
    callbacks=[early_stopping],
    verbose=1
)

print("\n✅ Model training complete!")

Epoch 1/5
[1m812/812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m495s[0m 603ms/step - accuracy: 0.9010 - loss: 0.2292 - val_accuracy: 0.9738 - val_loss: 0.0750
Epoch 2/5
[1m812/812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m495s[0m 595ms/step - accuracy: 0.9873 - loss: 0.0382 - val_accuracy: 0.9768 - val_loss: 0.0733
Epoch 3/5
[1m812/812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m509s[0m 603ms/step - accuracy: 0.9925 - loss: 0.0219 - val_accuracy: 0.9647 - val_loss: 0.1243
Epoch 4/5
[1m812/812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m507s[0m 608ms/step - accuracy: 0.9961 - loss: 0.0119 - val_accuracy: 0.9724 - val_loss: 0.1226
Epoch 5/5
[1m812/812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m484s[0m 587ms/step - accuracy: 0.9980 - loss: 0.0069 - val_accuracy: 0.9698 - val_loss: 0.1428

✅ Model training complete!


In [None]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.4f}")
print(f"Test Loss: {loss:.4f}")
print(classification_report(y_test, y_pred, target_names=['Reliable', 'Unreliable']))


[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 73ms/step - accuracy: 0.9788 - loss: 0.0651

Test Accuracy: 0.9777
Test Loss: 0.0667


NameError: name 'y_pred' is not defined

In [None]:
# Generate predictions
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")

# Print classification report
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=['Reliable', 'Unreliable']))


In [None]:

# Plot training history
plt.figure(figsize=(12, 5))

# Plot Accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Plot Loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


# 1. Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# 2. Create the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm,
            annot=True,         # Show the numbers in the cells
            fmt='d',            # Format as integers
            cmap='Blues',       # Color scheme
            xticklabels=['Predicted Negative (0)', 'Predicted Positive (1)'],
            yticklabels=['Actual Negative (0)', 'Actual Positive (1)'])

plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()