# Bidirectional LSTM – Fake‑News Detection
Trains on the LIAR dataset (Hugging Face) and saves `models/bi_lstm_fake_news.h5`.

In [2]:
import os, numpy as np, pandas as pd, tensorflow as tf
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
MAX_VOCAB = 25_000
MAX_LEN   = 120
EMB_DIM   = 128
LSTM_UNITS= 64
DROPOUT   = 0.3
BATCH     = 128
EPOCHS    = 5

In [6]:
trust_remote_code=True
ds = load_dataset("liar", split="train")
df = ds.to_pandas()[["statement", "label"]]
df.columns = ["text","target"]
df["target"] = df["target"].apply(lambda x: 0 if x in [0,1,2] else 1)

tok = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tok.fit_on_texts(df.text)
X = pad_sequences(tok.texts_to_sequences(df.text), maxlen=MAX_LEN, padding="post", truncating="post")
y = df.target.values

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print('Train/val shapes:', X_tr.shape, X_te.shape)

Downloading data: 100%|██████████| 1.01M/1.01M [00:00<00:00, 4.74MB/s]
Generating train split: 100%|██████████| 10269/10269 [00:00<00:00, 16007.22 examples/s]
Generating test split: 100%|██████████| 1283/1283 [00:00<00:00, 16745.27 examples/s]
Generating validation split: 100%|██████████| 1284/1284 [00:00<00:00, 17108.51 examples/s]


Train/val shapes: (8215, 120) (2054, 120)


In [7]:
model = Sequential([
    Embedding(MAX_VOCAB, EMB_DIM, mask_zero=True),
    Bidirectional(LSTM(LSTM_UNITS)),
    Dropout(DROPOUT),
    Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

2025-06-16 00:16:31.078886: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [8]:
history = model.fit(
    X_tr, y_tr,
    epochs=EPOCHS,
    batch_size=BATCH,
    validation_split=0.2
)

Epoch 1/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 310ms/step - accuracy: 0.5832 - loss: 0.6822 - val_accuracy: 0.5928 - val_loss: 0.6744
Epoch 2/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 300ms/step - accuracy: 0.6084 - loss: 0.6433 - val_accuracy: 0.5758 - val_loss: 0.7111
Epoch 3/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 306ms/step - accuracy: 0.8103 - loss: 0.4370 - val_accuracy: 0.5344 - val_loss: 0.8538
Epoch 4/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 304ms/step - accuracy: 0.9121 - loss: 0.2403 - val_accuracy: 0.5460 - val_loss: 1.1862
Epoch 5/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 300ms/step - accuracy: 0.9569 - loss: 0.1304 - val_accuracy: 0.5283 - val_loss: 1.5461


In [9]:
loss, acc = model.evaluate(X_te, y_te, verbose=0)
print(f"Test accuracy: {acc:.3f}")

Test accuracy: 0.544


In [10]:
os.makedirs("models", exist_ok=True)
model.save("models/bi_lstm_fake_news.h5")
import pickle
with open("models/tokenizer.pkl", "wb") as f:
    pickle.dump(tok, f)
print("✅  Model and tokenizer saved!")



✅  Model and tokenizer saved!
