# load the data

In [1]:
import sys
import os
os.chdir("/home/rana-helal/PycharmProjects/fake_news_classifier")

from src.data_loader import load_kaggle_data


df_kaggle = load_kaggle_data()


# preprocessing

In [11]:
#!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
from tqdm import tqdm
tqdm.pandas()


In [3]:
from src.preprocessing import remove_duplicates_and_missing, clean_for_ml

# Apply text cleaning for ML models

df_cleaned = remove_duplicates_and_missing(df_kaggle)


In [4]:
from src.preprocessing import clean_for_dl

df_cleaned['clean_text'] = df_cleaned['text'].progress_apply(clean_for_dl)


100%|██████████| 44898/44898 [00:21<00:00, 2134.43it/s]


In [6]:
from src.utils import save_cleaned_data

save_cleaned_data(df_cleaned, "kaggle_clean_dl.csv")

 Saved cleaned data to data/processed/kaggle_clean_dl.csv


# label encoding

In [2]:
import pandas as pd

df = pd.read_csv("data/processed/kaggle_clean_dl.csv")
df.head()


Unnamed: 0,title,text,subject,label,clean_text
0,UNHOLY ALLIANCE: Hillary Clinton’s Saudi Spons...,21st Century Wire says Amid the tossing and t...,Middle-east,fake,NUM st century wire says amid the tossing and ...
1,"BREAKING: SYRIAN REFUGEE KILLS German Woman, I...",This news comes as Obama works to bring even m...,left-news,fake,this news comes as obama works to bring even m...
2,French magazine found guilty over topless phot...,PARIS (Reuters) - A French court ruled on Tues...,worldnews,real,paris reuters a french court ruled on tuesday ...
3,“WOODY” KAINE One Of Six ARRESTED After Peacef...,Watch the local news report as they explain wh...,politics,fake,watch the local news report as they explain wh...
4,At least three dead as Lidia slams Mexico's Lo...,MEXICO CITY (Reuters) - At least three people ...,worldnews,real,mexico city reuters at least three people died...


In [3]:
df['label'] = df['label'].map({'fake': 0, 'real': 1})

In [4]:
df.head()

Unnamed: 0,title,text,subject,label,clean_text
0,UNHOLY ALLIANCE: Hillary Clinton’s Saudi Spons...,21st Century Wire says Amid the tossing and t...,Middle-east,0,NUM st century wire says amid the tossing and ...
1,"BREAKING: SYRIAN REFUGEE KILLS German Woman, I...",This news comes as Obama works to bring even m...,left-news,0,this news comes as obama works to bring even m...
2,French magazine found guilty over topless phot...,PARIS (Reuters) - A French court ruled on Tues...,worldnews,1,paris reuters a french court ruled on tuesday ...
3,“WOODY” KAINE One Of Six ARRESTED After Peacef...,Watch the local news report as they explain wh...,politics,0,watch the local news report as they explain wh...
4,At least three dead as Lidia slams Mexico's Lo...,MEXICO CITY (Reuters) - At least three people ...,worldnews,1,mexico city reuters at least three people died...


In [5]:
df['label'].value_counts()


label
0    22847
1    21207
Name: count, dtype: int64

# Prepare tokenizer and sequences

In [6]:
texts = df["clean_text"].astype(str).tolist()


In [7]:
from src.features import prepare_tokenizer_and_sequences

tokenizer, padded_sequences = prepare_tokenizer_and_sequences(texts, max_vocab=10000, max_len=300)


2025-07-10 20:38:40.101730: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-10 20:38:40.175152: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-10 20:38:40.234213: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752169120.284928  113212 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752169120.300556  113212 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752169120.408005  113212 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

[INFO] Tokenizer and sequences prepared. Vocab size: 227803


In [8]:
from src.utils import save_pickle

# Save the tokenizer
save_pickle(tokenizer, "models/dl_tokenizer.pkl")



tokenizer saved to models/dl_tokenizer.pkl


In [9]:
save_pickle(padded_sequences, "models/padded_sequences.pkl")


tokenizer saved to models/padded_sequences.pkl


# split the data

In [10]:
from src.utils import load_pickle

padded_sequences = load_pickle("models/padded_sequences.pkl")


Loading tokenizer from models/padded_sequences.pkl


In [11]:
from src.utils import split_data


X = df['clean_text']
y = df['label']

X_train, X_val, X_test, y_train, y_val, y_test = split_data(padded_sequences, y)

Data split into train, validation, and test sets:


# train the model

In [12]:
from src.utils import load_pickle

tokenizer = load_pickle("models/dl_tokenizer.pkl")


Loading tokenizer from models/dl_tokenizer.pkl


In [13]:
from src.models.lstm_model import train_lstm_model

model, history = train_lstm_model(X_train, y_train, X_val, y_val, tokenizer)

[INFO] LSTM Model built with vocab size: 227804, embedding dim: 100, lstm units: 64
Epoch 1/5


E0000 00:00:1752169200.375874  113212 cuda_executor.cc:1228] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1752169200.376764  113212 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2025-07-10 20:40:02.349806: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 91121600 exceeds 10% of free system memory.
2025-07-10 20:40:02.391041: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 91121600 exceeds 10% of free system memory.
2025-07-10 20:40:02.533507: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allo

[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 552ms/step - accuracy: 0.9341 - loss: 0.1385 - val_accuracy: 0.9991 - val_loss: 0.0052
Epoch 2/5
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 542ms/step - accuracy: 0.9992 - loss: 0.0051 - val_accuracy: 0.9993 - val_loss: 0.0050
Epoch 3/5
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 532ms/step - accuracy: 0.9996 - loss: 0.0016 - val_accuracy: 0.9991 - val_loss: 0.0056
Epoch 4/5
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 524ms/step - accuracy: 0.9996 - loss: 0.0022 - val_accuracy: 0.9989 - val_loss: 0.0060
Epoch 5/5
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m295s[0m 613ms/step - accuracy: 0.9999 - loss: 8.4287e-04 - val_accuracy: 0.9993 - val_loss: 0.0054
Training curves saved to visualizations/training_curves/lstm_training_curves.png
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step




Confusion matrix saved to visualizations/confusion_matrices/lstm_confusion_matrix.png
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 52ms/step
[INFO] LSTM model trained and saved.


In [14]:
from sklearn.metrics import accuracy_score

loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


Test Accuracy: 0.9983
