In [None]:
!pip install optuna --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/386.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/231.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
import matplotlib.pyplot as plt

In [None]:
# Imported in this cell as well: it runs before the notebook's main import
# cell, so without this line a fresh-kernel "Run All" stops with a NameError.
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import kagglehub

# Download the Kaggle dataset; the returned value is used below as the
# directory prefix for the two CSV files.
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")

# Label convention for the rest of the notebook: 1 = fake, 0 = real.
fake = pd.read_csv(path + "/Fake.csv").assign(label=1)
real = pd.read_csv(path + "/True.csv").assign(label=0)

# Stack both sources, shuffle with a fixed seed, and renumber the rows.
combined = pd.concat([fake, real], axis=0)
data = shuffle(combined, random_state=42).reset_index(drop=True)

In [None]:
# Deduplicate in three passes, renumbering the index after each one:
#   1. rows that are identical in every column,
#   2. rows whose 'text' repeats an earlier row,
#   3. rows whose 'title' repeats an earlier row.
data = (
    data
    .drop_duplicates()
    .reset_index(drop=True)
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
    .drop_duplicates(subset=['title'])
    .reset_index(drop=True)
)

In [None]:
# Show the first ten rows of the shuffled, de-duplicated frame.
data.head(10)

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0
5,Paul Ryan Responds To Dem’s Sit-In On Gun Con...,"On Wednesday, Democrats took a powerful stance...",News,"June 22, 2016",1
6,AWESOME! DIAMOND AND SILK Rip Into The Press: ...,President Trump s rally in FL on Saturday was ...,Government News,"Feb 19, 2017",1
7,STAND UP AND CHEER! UKIP Party Leader SLAMS Ge...,He s been Europe s version of the outspoken Te...,left-news,"Mar 8, 2016",1
8,North Korea shows no sign it is serious about ...,WASHINGTON (Reuters) - The State Department sa...,worldnews,"December 13, 2017",0
9,Trump signals willingness to raise U.S. minimu...,(This version of the story corrects the figur...,politicsNews,"May 4, 2016",0


In [None]:
n_per_class = 5000

# Draw the same number of articles from each class (1 = fake, 0 = real),
# using the same fixed seed for both draws.
is_fake = data['label'] == 1
is_real = data['label'] == 0
df_fake = data.loc[is_fake].sample(n=n_per_class, random_state=42)
df_real = data.loc[is_real].sample(n=n_per_class, random_state=42)

# Merge the two samples, shuffle the rows, and renumber the index.
balanced = pd.concat([df_fake, df_real])
data = balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# The model input is the headline followed by the body, separated by a space.
data['text_combined'] = data['title'] + " " + data['text']

In [None]:
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise raw article text into a space-joined string of lemmas.

    The input is lower-cased, every character other than a-z or whitespace is
    deleted, the result is split on whitespace, tokens present in
    ``stop_words`` are discarded, and the remaining tokens are lemmatized.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_lemmas = (
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )
    return ' '.join(kept_lemmas)

In [None]:
# Clean every combined title+text string with preprocess_text.
data['preprocessed'] = data['text_combined'].apply(preprocess_text)

In [None]:
# Hold out 30% of the rows as the test split; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(data['preprocessed'], data['label'], test_size=0.3, random_state=42)

In [None]:
from collections import Counter

def tokenize(text):
    """Split ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

# Token frequencies are counted over the training texts only.
word_counter = Counter(
    token for document in x_train for token in tokenize(document)
)

# Ids 0 and 1 are reserved for the special tokens, so the 20000 most frequent
# words are numbered from 2 upward in descending frequency order.
vocab = {
    word: word_id
    for word_id, (word, _) in enumerate(word_counter.most_common(20000), start=2)
}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def encode(text, vocab, max_len=500):
    """Convert ``text`` into a list of vocabulary ids padded to ``max_len``.

    Only the first ``max_len`` tokens are used. A token missing from ``vocab``
    is mapped to the ``<UNK>`` id, and shorter sequences are padded on the
    right with the ``<PAD>`` id.

    Args:
        text: String to tokenize with ``tokenize``.
        vocab: Mapping from token to id; must contain ``<PAD>`` and ``<UNK>``.
        max_len: Number of tokens to keep and length to pad to.

    Returns:
        A list of integer ids.
    """
    ids = []
    for token in tokenize(text)[:max_len]:
        ids.append(vocab.get(token, vocab['<UNK>']))
    missing = max_len - len(ids)
    return ids + [vocab['<PAD>']] * missing

In [None]:
class FakeNewsDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` tensor pairs.

    Texts are encoded on every ``__getitem__`` call via ``encode``; nothing is
    pre-computed at construction time.
    """

    def __init__(self, texts, labels, vocab, max_len=500):
        # Items are fetched with ``.iloc`` below, so pandas objects are expected
        # for ``texts`` and ``labels``.
        self.texts, self.labels = texts, labels
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        """Number of examples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text and its label at position ``idx`` as long tensors."""
        token_ids = encode(self.texts.iloc[idx], self.vocab, self.max_len)
        features = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return features, target

# Both splits are encoded with the vocabulary built from the training texts.
train_dataset = FakeNewsDataset(texts=x_train, labels=y_train, vocab=vocab)
test_dataset = FakeNewsDataset(texts=x_test, labels=y_test, vocab=vocab)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
class LSTMClassifier(nn.Module):
    """Single-layer LSTM classifier over right-padded token-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        dropout: Dropout probability applied to the final hidden state.
        output_dim: Number of output logits (one per class).
    """

    # Id reserved for padding; it must match the id the vocabulary gives <PAD>.
    PAD_IDX = 0

    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout, output_dim=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=self.PAD_IDX)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            x: Long tensor of token ids, shape ``(batch, seq_len)``, padded on
                the right with ``PAD_IDX``.
        """
        # Count the real tokens in each row. Rows that are entirely padding are
        # treated as length 1 because a packed sequence cannot hold length 0.
        lengths = (x != self.PAD_IDX).sum(dim=1).clamp(min=1)

        embedded = self.embedding(x)

        # Without packing, the LSTM keeps stepping through the trailing padding
        # and `hidden[-1]` reflects the state after the last pad position, not
        # after the last real token. Packing stops each row at its true length.
        # pack_padded_sequence requires the lengths tensor to live on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        # hidden[-1] is the last layer's final state, in original batch order.
        x = self.dropout(hidden[-1])
        return self.fc(x)

In [None]:
def objective(trial):
    """Optuna objective: briefly train a fresh ``LSTMClassifier`` and score it.

    Embedding size, hidden size, dropout and learning rate are sampled from
    ``trial``. The model is trained for three epochs on 80% of
    ``train_dataset`` and scored on the remaining 20%. The test split is
    deliberately not used here: selecting hyperparameters on it would make the
    final test accuracy an optimistic estimate.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Accuracy on the validation subset, as computed by ``accuracy_score``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Fixed-seed split so that every trial is compared on the same validation rows.
    n_val = int(0.2 * len(train_dataset))
    tune_subset, val_subset = torch.utils.data.random_split(
        train_dataset,
        [len(train_dataset) - n_val, n_val],
        generator=torch.Generator().manual_seed(42),
    )
    tune_loader = DataLoader(tune_subset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=32)

    # Sample hyperparameters
    embed_dim = trial.suggest_categorical("embed_dim", [64, 128, 256])
    hidden_dim = trial.suggest_categorical("hidden_dim", [64, 128, 256])
    dropout = trial.suggest_float("dropout", 0.2, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)  # sampled on a log scale

    model = LSTMClassifier(vocab_size=len(vocab), embed_dim=embed_dim, hidden_dim=hidden_dim, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Training
    model.train()
    for epoch in range(3):  # Keep short for tuning
        for inputs, labels in tune_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluation on the validation subset (never on the test split)
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.numpy())

    return accuracy_score(true_labels, predictions)

In [None]:
# Seed PyTorch (weight initialisation, batch shuffling) and the Optuna sampler
# so that re-running this cell proposes the same sequence of trials. GPU
# kernels may still introduce small run-to-run differences.
torch.manual_seed(42)
study = optuna.create_study(
    direction="maximize",
    study_name='LSTM',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Best hyperparameters
print("Best trial:")
print(study.best_trial)

[I 2025-04-30 21:09:01,123] A new study created in memory with name: LSTM
[I 2025-04-30 21:09:19,509] Trial 0 finished with value: 0.8116666666666666 and parameters: {'embed_dim': 128, 'hidden_dim': 256, 'dropout': 0.37319864993014285, 'lr': 0.0016059595262606992}. Best is trial 0 with value: 0.8116666666666666.
[I 2025-04-30 21:09:24,831] Trial 1 finished with value: 0.487 and parameters: {'embed_dim': 64, 'hidden_dim': 128, 'dropout': 0.4237910969036611, 'lr': 0.00023773320343213603}. Best is trial 0 with value: 0.8116666666666666.
[I 2025-04-30 21:09:31,463] Trial 2 finished with value: 0.7203333333333334 and parameters: {'embed_dim': 128, 'hidden_dim': 128, 'dropout': 0.4699614670222218, 'lr': 0.0006437686656644495}. Best is trial 0 with value: 0.8116666666666666.
[I 2025-04-30 21:09:36,516] Trial 3 finished with value: 0.6743333333333333 and parameters: {'embed_dim': 128, 'hidden_dim': 64, 'dropout': 0.26682804180624187, 'lr': 0.004741276192282539}. Best is trial 0 with value: 0.8

Best trial:
FrozenTrial(number=6, state=1, values=[0.923], datetime_start=datetime.datetime(2025, 4, 30, 21, 9, 47, 887000), datetime_complete=datetime.datetime(2025, 4, 30, 21, 9, 55, 823908), params={'embed_dim': 256, 'hidden_dim': 128, 'dropout': 0.3585718875304013, 'lr': 0.0033834789846606626}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'embed_dim': CategoricalDistribution(choices=(64, 128, 256)), 'hidden_dim': CategoricalDistribution(choices=(64, 128, 256)), 'dropout': FloatDistribution(high=0.5, log=False, low=0.2, step=None), 'lr': FloatDistribution(high=0.01, log=True, low=0.0001, step=None)}, trial_id=6, value=None)


In [None]:
best_params = study.best_trial.params

# Rebuild the network from scratch with the best hyperparameters found by the study.
final_model = LSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=best_params["embed_dim"],
    hidden_dim=best_params["hidden_dim"],
    dropout=best_params["dropout"]
).to(device)

optimizer = torch.optim.Adam(final_model.parameters(), lr=best_params["lr"])
criterion = nn.CrossEntropyLoss()

# Train final model
for epoch in range(3):
    final_model.train()
    running_loss = 0.0

    # Use notebook-friendly tqdm here
    loop = tqdm(train_loader, desc=f'Epoch {epoch+1}', leave=True)

    # Batches are counted explicitly: tqdm only brings `loop.n` up to date when
    # it refreshes the bar, so dividing by it gives a wrong running mean.
    for step, (inputs, labels) in enumerate(loop, start=1):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = final_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Mean loss over the batches seen so far in this epoch.
        loop.set_postfix(loss=running_loss / step)

    print(f"Epoch {epoch+1} complete")


Epoch 1:   0%|          | 0/219 [00:00<?, ?it/s]

Epoch 1 complete


Epoch 2:   0%|          | 0/219 [00:00<?, ?it/s]

Epoch 2 complete


Epoch 3:   0%|          | 0/219 [00:00<?, ?it/s]

Epoch 3 complete


In [None]:
from sklearn.metrics import accuracy_score, classification_report
def evaluate(model, dataloader):
    """Score ``model`` on ``dataloader`` and print a classification report.

    The model is switched to eval mode and run without gradient tracking;
    inputs are moved to the notebook-level ``device`` before the forward pass.

    Args:
        model: Module returning per-class logits for a batch of inputs.
        dataloader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        Accuracy over all batches, as computed by ``accuracy_score``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            logits = model(batch_inputs.to(device))
            batch_preds = logits.argmax(dim=1).cpu().numpy()
            predicted.extend(batch_preds)
            expected.extend(batch_labels.numpy())
    report = classification_report(expected, predicted)
    print("Classification Report:\n", report)
    return accuracy_score(expected, predicted)

In [None]:
# Score the final model on the training split (evaluate also prints a report).
train_acc = evaluate(final_model, train_loader)
print(f"Final train accuracy: {train_acc:.4f}")

# Same measurement on the held-out test split.
test_acc = evaluate(final_model, test_loader)
print(f"Final test accuracy: {test_acc:.4f}")

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.76      0.81      3539
           1       0.78      0.88      0.83      3461

    accuracy                           0.82      7000
   macro avg       0.83      0.82      0.82      7000
weighted avg       0.83      0.82      0.82      7000

Final train accuracy: 0.8206
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.71      0.77      1461
           1       0.76      0.88      0.82      1539

    accuracy                           0.80      3000
   macro avg       0.80      0.80      0.80      3000
weighted avg       0.80      0.80      0.80      3000

Final test accuracy: 0.7973


In [None]:
def predict_fake_news(text, model, vocab, max_len=500):
    """Classify one raw article string as ``"Fake"`` or ``"Real"``.

    The text goes through the same ``preprocess_text`` and ``encode`` steps as
    the training data, and is then scored as a batch of one.

    Args:
        text: Raw article text (for example title and body joined by a space).
        model: Trained classifier returning one logit per class.
        vocab: Token-to-id mapping used when the model was trained.
        max_len: Sequence length passed to ``encode``.

    Returns:
        ``"Fake"`` when the highest-scoring class is 1, otherwise ``"Real"``.

    Note:
        The model is left in eval mode after the call.
    """
    model.eval()
    # Use the device the model's weights are on rather than a notebook-level
    # global, so the function also works with a model loaded elsewhere.
    model_device = next(model.parameters()).device
    cleaned = preprocess_text(text)
    encoded = encode(cleaned, vocab, max_len)
    # unsqueeze(0) adds the batch dimension the model expects.
    tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0).to(model_device)
    with torch.no_grad():
        output = model(tensor)
        pred = torch.argmax(output, dim=1).item()
    return "Fake" if pred == 1 else "Real"


In [None]:
# Example usage:
# Build raw inputs the same way as 'text_combined' (title, one space, body).
# sample1 is taken from the real-news sample, sample2 from the fake-news sample.
sample1 = df_real.iloc[657]['title'] + " " + df_real.iloc[657]['text']
sample2 = df_fake.iloc[256]['title'] + " " + df_fake.iloc[256]['text']
# Only sample2 is scored here; sample1 is built but not passed to the model.
print("Prediction:", predict_fake_news(sample2, final_model, vocab))

Prediction: Fake


In [None]:
# Bundle the weights, the vocabulary and the chosen hyperparameters into a
# single checkpoint file.
save_path = 'lstm_fake_news_model.pth'
checkpoint = {
    'model_state_dict': final_model.state_dict(),
    'vocab': vocab,
    'params': best_params,
}
torch.save(checkpoint, save_path)