In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

In [2]:
# Load the processed dataset, discard rows whose "text" is missing, and coerce
# the remaining text values to str (clean_text below calls str methods on them).
df = (
    pd.read_csv("../data/processed/fake_news_full.csv")
    .dropna(subset=["text"])
    .assign(text=lambda frame: frame["text"].astype(str))
)

In [4]:
def clean_text(text):
    '''
    Normalize a raw text string for vectorization.

    The string is lowercased, then three regex substitutions run in order:
      1. every run of non-whitespace characters starting at "http" is removed;
      2. every character that is not a-z or whitespace is deleted (no space is
         inserted, so "well-known" becomes "wellknown");
      3. each run of whitespace is collapsed to a single space.
    Leading and trailing whitespace is stripped from the result.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text; may be empty.
    '''
    # Order matters: URLs must go before punctuation is stripped, otherwise
    # "http://a.b" would first be reduced to letters and survive as a token.
    substitutions = (
        (r"http\S+", ""),
        (r"[^a-z\s]", ""),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [5]:
# Run every article through clean_text; the cleaned string overwrites the raw
# one in the same "text" column.
df["text"] = df["text"].map(clean_text)

## Train/Test split

In [6]:
# Model input is the "text" column; the target is the "label" column.
X, y = df["text"], df["label"]

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Building and training the pipeline

In [7]:
# imblearn's Pipeline (imported above) is used so that a resampling step can
# sit between the vectorizer and the classifier.
# NOTE(review): SMOTE and class_weight="balanced" both address class
# imbalance — confirm that applying both is intended.
pipeline = Pipeline(
    steps=[
        # Unigram + bigram TF-IDF, English stop words removed, vocabulary capped.
        (
            "tfidf",
            TfidfVectorizer(
                stop_words="english",
                ngram_range=(1, 2),
                max_features=50000,
            ),
        ),
        # Seeded oversampler so resampling is repeatable.
        ("smote", SMOTE(random_state=42)),
        # Raised iteration cap for the solver.
        (
            "clf",
            LogisticRegression(
                class_weight="balanced",
                max_iter=1000,
                n_jobs=-1,
            ),
        ),
    ]
)

pipeline.fit(X_train, y_train)

## Evaluating the model

In [9]:
# Predict on the held-out split, then report the F1 score (f1_score called with
# its default settings) followed by the per-class precision/recall table.
y_pred = pipeline.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1:.4f}")
print(classification_report(y_true=y_test, y_pred=y_pred))

F1 Score: 0.8793
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      7504
           1       0.90      0.86      0.88      8673

    accuracy                           0.87     16177
   macro avg       0.87      0.88      0.87     16177
weighted avg       0.88      0.87      0.87     16177



In [None]:
# Persist the fitted pipeline to disk with joblib (currently disabled —
# uncomment the two lines below to write the artifact).
# NOTE(review): the ../artifacts directory is presumably expected to exist
# already — confirm before re-enabling.
# joblib.dump(pipeline, "../artifacts/fake_news_pipeline.pkl")
# print("Pipeline saved!")