In [1]:
import pandas as pd

In [2]:
# Load the preprocessed WELFake articles into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/WELFake_Dataset_Preprocessed.csv")

In [3]:
# Display (n_rows, n_columns) as a quick sanity check that the CSV loaded.
df.shape

(61937, 2)

In [4]:
# Shuffle the rows. A fixed random_state makes the shuffle repeatable after a
# kernel restart; without it the later seeded train/test split still changes
# from run to run because its input order changes.
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old labels
# directly, so no temporary 'index' column has to be created and dropped.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,text,label
0,new york reuters trump administration combativ...,0
1,dcg comment aint multiculturalism grand daily ...,1
2,moscow reuters speaker russia lower house parl...,0
3,another longtime lover bill clinton come forth...,1
4,show force russian longrange tu bear bomber br...,1


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [6]:
# Article bodies as a plain Python list, in DataFrame row order.
texts = df["text"].tolist()

In [7]:
# Class labels as a plain Python list, in the same row order as the texts.
labels = df["label"].tolist()

In [8]:
from torchtext.data.utils import get_tokenizer

# Preprocessing: replace missing texts with "" instead of dropping them, so that
# `texts` keeps exactly one entry per DataFrame row and stays aligned with `labels`.
texts = df["text"].fillna("").tolist()

# Tokenize
tokenizer = get_tokenizer("basic_english")
tokenized_texts = [tokenizer(text) for text in texts]

# Vocabulary
vocab = set(word for text in tokenized_texts for word in text)
# Assign indices over the *sorted* vocabulary. Iteration order of a set of
# strings is not stable between interpreter sessions (string hashing is
# randomised), so indexing the raw set would give each word a different id on
# every run and make a saved model's embedding rows meaningless after a restart.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}  # Start indexing from 1
word_to_idx["<PAD>"] = 0  # Add padding token
# Report only the size; printing the full mapping floods the notebook output.
print(f"Vocabulary size (including <PAD>): {len(word_to_idx)}")

# Convert text to sequences of indices
max_length = 100  # Max length of a sentence
sequences = [[word_to_idx[word] for word in text if word in word_to_idx] for text in tokenized_texts]
# Truncate to max_length, then right-pad with the <PAD> index (0). For sequences
# already at max_length the multiplier is 0, so nothing is appended.
padded_sequences = [seq[:max_length] + [0] * (max_length - len(seq[:max_length])) for seq in sequences]

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Every sample must have exactly one label and exactly max_length token ids.
assert len(padded_sequences) == len(encoded_labels), "texts and labels are misaligned"
assert all(len(seq) == max_length for seq in padded_sequences)




In [9]:
# Step 2: Load GloVe embeddings
embedding_dim = 100  # Must equal the vector size of the GloVe file opened below
embedding_index = {}

# Each non-empty line is parsed as: first whitespace-separated field = word,
# remaining fields = the vector components.
with open('/content/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        embedding_index[word] = vector

# Fail early with a readable message if the file does not match embedding_dim,
# rather than with a NumPy broadcasting error while filling the matrix.
if embedding_index:
    loaded_dim = len(next(iter(embedding_index.values())))
    if loaded_dim != embedding_dim:
        raise ValueError(
            f"GloVe vectors have dimension {loaded_dim}, but embedding_dim is {embedding_dim}"
        )

# Create the embedding matrix. float32 matches the dtype the vectors are parsed
# as and the dtype the model converts the matrix to, at half the memory of the
# NumPy default (float64).
vocab_size = len(word_to_idx)
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype='float32')

# Words without a pretrained vector keep their all-zero row.
for word, idx in word_to_idx.items():
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[idx] = vector


In [11]:
# Step 3: Prepare DataLoader
class NewsDataset(Dataset):
    """Map-style dataset pairing token-id sequences with their labels.

    Args:
        sequences: Nested sequence of integer token ids; converted to a
            ``torch.long`` tensor.
        labels: Sequence of numeric labels; converted to a ``torch.float32``
            tensor.
    """

    def __init__(self, sequences, labels):
        # Convert once up front so __getitem__ is a plain tensor index.
        self.sequences, self.labels = (
            torch.tensor(sequences, dtype=torch.long),
            torch.tensor(labels, dtype=torch.float32),
        )

    def __len__(self):
        """Return the number of samples (one per label)."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at position ``idx``."""
        token_ids = self.sequences[idx]
        target = self.labels[idx]
        return token_ids, target

# Split into train and test
# Hold out 40% of the samples for testing; the split itself is seeded.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    padded_sequences,
    encoded_labels,
    test_size=0.4,
    random_state=42,
)

train_dataset, test_dataset = NewsDataset(X_train, y_train), NewsDataset(X_test, y_test)

# Only the training loader reshuffles; the test loader keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=2, shuffle=False)

In [14]:
# Step 4: Define the Model
class FakeNewsDetector(nn.Module):
    """Frozen pretrained embeddings -> single LSTM -> linear layer -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector (also the LSTM input size).
        embedding_matrix: Array-like of pretrained vectors; converted to a
            float32 tensor and installed as the (non-trainable) embedding weight.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units of the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_matrix, hidden_dim, output_dim):
        super().__init__()
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Replace the randomly initialised table with the pretrained vectors and
        # exclude it from gradient updates.
        self.embedding.weight = nn.Parameter(pretrained, requires_grad=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid-activated scores for a batch-first tensor of token ids."""
        lstm_result = self.lstm(self.embedding(x))
        # lstm_result is (outputs, (h_n, c_n)); take the last entry of h_n.
        final_hidden = lstm_result[1][0][-1]
        return self.sigmoid(self.fc(final_hidden))

# Hyperparameters for the LSTM and the output layer
hidden_dim, output_dim = 64, 1

model = FakeNewsDetector(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    embedding_matrix=embedding_matrix,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
# BCELoss takes probabilities; the model's forward already ends in a sigmoid.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [15]:
# Step 5: Train the Model
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    # The batch variables are intentionally not called `sequences` / `labels`:
    # those names hold the full preprocessed lists built earlier in the notebook,
    # and rebinding them here would leave them pointing at the final mini-batch.
    for batch_sequences, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_sequences)
        # squeeze(-1) removes only the trailing output axis. A bare squeeze()
        # would also remove the batch axis when a batch holds a single sample,
        # leaving a 0-d tensor whose shape no longer matches batch_labels.
        loss = criterion(outputs.squeeze(-1), batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Note: this is the sum of per-batch losses over the epoch, not the mean.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

Epoch 1/5, Loss: 8436.4166
Epoch 2/5, Loss: 4015.0862
Epoch 3/5, Loss: 2928.7334
Epoch 4/5, Loss: 2337.3904
Epoch 5/5, Loss: 1863.4662


In [18]:
# Step 6: Evaluate the Model

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

model.eval()
correct = 0
total = 0
all_predictions = []
all_labels = []

with torch.no_grad():
    # Distinct batch names keep the notebook-level `sequences` / `labels` lists
    # from being overwritten by the last mini-batch.
    for batch_sequences, batch_labels in test_loader:
        # squeeze(-1) drops only the trailing output axis, so the result is
        # always 1-D with one entry per sample, even for a final batch of size 1
        # (a bare squeeze() would collapse that batch to a 0-d tensor).
        outputs = model(batch_sequences).squeeze(-1)
        predictions = (outputs > 0.5).float()

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

        correct += (predictions == batch_labels).sum().item()
        total += batch_labels.size(0)

# Calculate Accuracy (as a percentage)
accuracy = correct / total * 100

# Calculate Precision, Recall, F1 Score (scikit-learn's default positive class is label 1)
precision = precision_score(all_labels, all_predictions)
recall = recall_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)

# Confusion Matrix (rows: true labels, columns: predicted labels)
conf_matrix = confusion_matrix(all_labels, all_predictions)

# Output the metrics
print(f"Test Accuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)


Test Accuracy: 94.12%
Precision: 0.91
Recall: 0.96
F1 Score: 0.94
Confusion Matrix:
[[12793   982]
 [  476 10524]]


In [20]:
# Step 7: Persist only the learned parameters (state_dict), not the whole module.
# The path is relative, so the file is written to the current working directory.

torch.save(obj=model.state_dict(), f="lstm_model_colab_final.pth")

In [21]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

# Ensure the necessary NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Load your model: rebuild the architecture, then restore the saved parameters.
model = FakeNewsDetector(vocab_size, embedding_dim, embedding_matrix, hidden_dim, output_dim)
# Read the checkpoint from the same relative path it was saved to, so the reload
# does not depend on the working directory being /content.
# map_location="cpu" matches the freshly built model, which is never moved off
# the CPU in this notebook.
model.load_state_dict(torch.load("lstm_model_colab_final.pth", map_location="cpu"))
model.eval()  # Switch the model to evaluation mode


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


FakeNewsDetector(
  (embedding): Embedding(316282, 100)
  (lstm): LSTM(100, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [23]:
import re

def clean_text(text):
    """Normalise raw text before tokenisation.

    Steps, in order: lowercase; strip ``<...>`` tag-like spans; delete every
    character that is not an ASCII letter or whitespace; collapse whitespace
    runs to single spaces and trim; drop English stop words; lemmatize each
    remaining word.

    Args:
        text: The raw input string.

    Returns:
        The cleaned words joined by single spaces.
    """
    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_tags)
    normalized = re.sub(r'\s+', ' ', letters_only).strip()

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Filter and lemmatize in a single pass over the words.
    kept_words = (word for word in normalized.split() if word not in stop_words)
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

def predict_fake_news(text):
    """Classify a raw news text with the trained model.

    The text is cleaned with ``clean_text``, tokenised, converted to vocabulary
    indices, truncated or right-padded with 0 to ``max_length``, and passed
    through ``model`` as a batch of one.

    Args:
        text: The raw article text.

    Returns:
        ``"Fake News"`` if the model's output exceeds 0.5, otherwise
        ``"Real News"``.
    """
    tokenized_text = tokenizer(clean_text(text))

    # Skip out-of-vocabulary words, exactly as the training sequences were built.
    # Mapping them to 0 would insert padding tokens in the middle of the
    # sequence, something the model never saw during training.
    sequence = [word_to_idx[word] for word in tokenized_text if word in word_to_idx][:max_length]
    padded_sequence = sequence + [0] * (max_length - len(sequence))
    sequence_tensor = torch.tensor(padded_sequence, dtype=torch.long).unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        probability = model(sequence_tensor).item()

    # The model output is the score for class 1. In the sample rows displayed
    # after loading the data, Reuters wire stories carry label 0, which indicates
    # label 1 marks the fake articles, so a score above 0.5 is reported as fake.
    # NOTE(review): confirm against the dataset's label definition.
    if probability > 0.5:
        return "Fake News"
    return "Real News"


In [24]:
# Read one article from the user and show the classifier's verdict.
user_input = input("Enter news text: ")
result = predict_fake_news(text=user_input)
print(f"Prediction: {result}")

Enter news text: BRUSSELS (Reuters) - British Prime Minister Theresa May s offer of  settled status  for EU residents is flawed and will leave them with fewer rights after Brexit, the European Parliament s Brexit coordinator said on Tuesday. A family of five could face a bill of 360 pounds to acquire the new status, Guy Verhofstadt told May s Brexit Secretary David Davis in a letter seen by Reuters    a very significant amount for a family on low income . Listing three other concerns for the EU legislature, which must approve any treaty on the March 2019 exit, Verhofstadt told Davis:  Under your proposals, EU citizens will definitely notice a deterioration of their status as a result of Brexit. And the Parliament s aim all along has been that EU citizens, and UK citizens in the EU-27, should notice no difference.  Verhofstadt, a former Belgian prime minister, wrote in response to Davis, who had written to him after Parliament complained last week that there remained  major issues  to b

In [25]:
# Second interactive run: read another article and show the verdict.
user_input = input("Enter news text: ")
result = predict_fake_news(text=user_input)
print(f"Prediction: {result}")

Enter news text: The Atlantic, a publication that wouldn t know unbiased journalism if it bit them in the a$$ published what appears to be a reluctant piece on President Trump s outstanding accomplishments during his first 6 months in office. Most of the people reading this piece by The Atlantic are fans of their writing because they ve bought into the progressive, anti-American sentiment they were fed like crack cocaine in college. Almost like the patch that smokers wear to help them get through the withdrawal of nicotine while going through a cessation program, The Atlantic provides their readers with enough anti-Trump propaganda to keep them in business, while acting like a support group for their readers, who fear every anti-American piece of legislation Barack Obama worked so hard to implement, is all unraveling, thanks to this guy who  foolishly  wants  Make America Great Again . Of course, in the eyes of The Atlantic reader, America was never great to begin with.The parts where 