# load the data

In [26]:
import sys
import os
from pathlib import Path

# Resolve the project root. Set FAKE_NEWS_PROJECT_ROOT to run this notebook on
# another machine; when it is unset, the original developer path is used.
PROJECT_ROOT = Path(
    os.environ.get(
        "FAKE_NEWS_PROJECT_ROOT",
        "/home/rana-helal/PycharmProjects/fake_news_classifier",
    )
).expanduser()
if not PROJECT_ROOT.is_dir():
    raise FileNotFoundError(
        f"Project root not found: {PROJECT_ROOT}. "
        "Set the FAKE_NEWS_PROJECT_ROOT environment variable to the repository checkout."
    )

# Later cells use paths relative to this directory (data/processed/...,
# models/...), and the `src` package is imported only after this call, so the
# working directory has to be switched first.
os.chdir(PROJECT_ROOT)

from src.data_loader import load_kaggle_data

# Load the Kaggle dataset through the project's loader.
df_kaggle = load_kaggle_data()


# preprocessing

In [11]:
# One-time environment setup: downloads the spaCy `en_core_web_sm` model.
# Left commented out so that re-running the notebook does not reinstall it;
# uncomment on a fresh environment.
# NOTE(review): presumably required by src.preprocessing — confirm.
#!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
from tqdm import tqdm
# Register tqdm's pandas integration; this is what provides the
# `progress_apply` method used on the text column in a later cell.
tqdm.pandas()


In [3]:
from src.preprocessing import (
    remove_duplicates_and_missing,
    clean_for_ml,
)

# Row-level clean-up of the raw Kaggle frame. No text normalisation happens
# in this cell; `clean_for_ml` is imported here but not called.
# NOTE(review): judging by the helper's name it drops duplicate rows and rows
# with missing values — confirm against src.preprocessing.
df_cleaned = remove_duplicates_and_missing(df_kaggle)


In [4]:
from src.preprocessing import clean_for_dl

# Run the deep-learning text cleaner over every article body and store the
# result in a new `clean_text` column. `progress_apply` is only available
# after `tqdm.pandas()` has been called.
text_column = df_cleaned['text']
df_cleaned['clean_text'] = text_column.progress_apply(clean_for_dl)


100%|██████████| 44898/44898 [00:21<00:00, 2134.43it/s]


In [6]:
from src.utils import save_cleaned_data

# Persist the cleaned frame. The logged output shows the helper writing it to
# data/processed/<file name>.
cleaned_filename = "kaggle_clean_dl.csv"
save_cleaned_data(df_cleaned, cleaned_filename)

 Saved cleaned data to data/processed/kaggle_clean_dl.csv


# label encoding

In [27]:
import pandas as pd

# Reload the cleaned dataset from disk. The path is relative, so it depends on
# the working directory having been set to the project root beforehand.
processed_csv = "data/processed/kaggle_clean_dl.csv"
df = pd.read_csv(processed_csv)

# Preview the first rows.
df.head()


Unnamed: 0,title,text,subject,label,clean_text
0,UNHOLY ALLIANCE: Hillary Clinton’s Saudi Spons...,21st Century Wire says Amid the tossing and t...,Middle-east,fake,NUM st century wire says amid the tossing and ...
1,"BREAKING: SYRIAN REFUGEE KILLS German Woman, I...",This news comes as Obama works to bring even m...,left-news,fake,this news comes as obama works to bring even m...
2,French magazine found guilty over topless phot...,PARIS (Reuters) - A French court ruled on Tues...,worldnews,real,paris reuters a french court ruled on tuesday ...
3,“WOODY” KAINE One Of Six ARRESTED After Peacef...,Watch the local news report as they explain wh...,politics,fake,watch the local news report as they explain wh...
4,At least three dead as Lidia slams Mexico's Lo...,MEXICO CITY (Reuters) - At least three people ...,worldnews,real,mexico city reuters at least three people died...


In [28]:
# Encode the string labels as integers: fake -> 0, real -> 1.
# Series.map turns every value that is missing from the mapping into NaN, so
# applying a plain `.map` to an already encoded column (e.g. when this cell is
# re-run) would wipe all labels. The guard below makes the cell safe to
# re-run and fails loudly on label values that are not in the mapping.
label_to_id = {'fake': 0, 'real': 1}
if not df['label'].isin(list(label_to_id.values())).all():
    encoded_labels = df['label'].map(label_to_id)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected label values: {df.loc[unmapped, 'label'].unique().tolist()!r}"
        )
    df['label'] = encoded_labels.astype('int64')

In [29]:
# Preview the first five rows to check the encoded `label` column.
df.head(n=5)

Unnamed: 0,title,text,subject,label,clean_text
0,UNHOLY ALLIANCE: Hillary Clinton’s Saudi Spons...,21st Century Wire says Amid the tossing and t...,Middle-east,0,NUM st century wire says amid the tossing and ...
1,"BREAKING: SYRIAN REFUGEE KILLS German Woman, I...",This news comes as Obama works to bring even m...,left-news,0,this news comes as obama works to bring even m...
2,French magazine found guilty over topless phot...,PARIS (Reuters) - A French court ruled on Tues...,worldnews,1,paris reuters a french court ruled on tuesday ...
3,“WOODY” KAINE One Of Six ARRESTED After Peacef...,Watch the local news report as they explain wh...,politics,0,watch the local news report as they explain wh...
4,At least three dead as Lidia slams Mexico's Lo...,MEXICO CITY (Reuters) - At least three people ...,worldnews,1,mexico city reuters at least three people died...


In [30]:
# Class balance after encoding. dropna=False keeps a NaN row in the result, so
# labels that failed to map are visible instead of being silently left out of
# the counts.
df['label'].value_counts(dropna=False)


label
0    22847
1    21207
Name: count, dtype: int64

# Prepare tokenizer and sequences

In [26]:
# Build the list of strings handed to the tokenizer. Missing values (e.g.
# empty cleaned texts, which come back from the CSV as NaN) are replaced by ""
# first; a bare astype(str) would turn them into the literal word "nan".
texts = df["clean_text"].fillna("").astype(str).tolist()


In [23]:
from src.features import prepare_tokenizer_and_sequences

# Tokenizer settings passed to the project helper.
# NOTE(review): presumably the vocabulary cap and the padded sequence length —
# confirm in src.features. The helper logged "Vocab size: 227803", which is
# larger than max_vocab, so it may be reporting the full word index rather
# than the capped vocabulary.
vocab_limit = 10000
sequence_length = 300

tokenizer, padded_sequences = prepare_tokenizer_and_sequences(
    texts,
    max_vocab=vocab_limit,
    max_len=sequence_length,
)


[INFO] Tokenizer and sequences prepared. Vocab size: 227803


In [25]:
from src.utils import save_pickle

# Persist the tokenizer returned by the preparation step so that it can be
# reloaded later instead of being rebuilt.
tokenizer_path = "models/dl_tokenizer.pkl"
save_pickle(tokenizer, tokenizer_path)



tokenizer saved to models/dl_tokenizer.pkl


# split the data

In [37]:
from src.utils import split_data


# Features are the cleaned texts. Missing entries (NaN after the CSV round
# trip) are replaced by "" so that X contains only strings and downstream
# text processing never receives a float NaN.
X = df['clean_text'].fillna("").astype(str)
# Targets are the values of the `label` column.
y = df['label']

# NOTE(review): split ratios, shuffling and seeding are decided inside
# split_data and are not visible here — confirm the split is reproducible.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)