# Logistic Regression - Hyperparameter Tuning

In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from scipy.stats import uniform, randint
import joblib


In [3]:
# All three splits live in the same dataset folder, relative to this notebook.
data_dir = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English"
train_path = f"{data_dir}/train.tsv"
val_path = f"{data_dir}/validation.tsv"
test_path = f"{data_dir}/test.tsv"

# Every split is tab-separated. The "date" column is parsed into datetimes;
# dayfirst=False means ambiguous dates are read month-first.
df_train, df_val, df_test = (
    pd.read_csv(split_path, sep='\t', parse_dates=["date"], dayfirst=False)
    for split_path in (train_path, val_path, test_path)
)

# Merge title and text into a single input text
def merge_text(row):
    """Combine a row's title and body into one string for the vectorizer.

    Args:
        row: Mapping-like record (for example a DataFrame row passed by
            ``df.apply(..., axis=1)``) with 'title' and 'text' entries.

    Returns:
        str: The title, a space, a newline, then the text. A missing field
        (None / NaN) contributes an empty string rather than the literal
        word "nan" or "None", which would otherwise be tokenized as if it
        were real article content.
    """
    title, text = (
        "" if pd.isna(value) else value
        for value in (row['title'], row['text'])
    )
    return f"{title} \n{text}"

all_splits = (df_train, df_val, df_test)

# Add the merged title+text column to every split (the frames are modified in place).
for df in all_splits:
    df['input_text'] = df.apply(merge_text, axis=1)

# Features are the merged text, targets the `label` column.
# The test split is kept aside for the final evaluation.
X_train, X_val, X_test = (split['input_text'] for split in all_splits)
y_train, y_val, y_test = (split['label'] for split in all_splits)

In [4]:
# Two-stage text classifier: TF-IDF features feeding a logistic regression.
# The step names 'tfidf' and 'clf' are the prefixes used by the
# 'tfidf__*' / 'clf__*' keys of the hyperparameter search space.
tfidf_step = ('tfidf', TfidfVectorizer())
clf_step = (
    'clf',
    LogisticRegression(solver='saga', max_iter=5000, random_state=42, n_jobs=-1),
)
pipeline = Pipeline([tfidf_step, clf_step])

# Settings sampled the same way in both sub-spaces: the TF-IDF options plus the
# classifier's C and class_weight.
# scipy conventions: randint(low, high) excludes `high`; uniform(loc, scale)
# covers [loc, loc + scale].
shared_space = {
    'tfidf__max_features': randint(5000, 20001),   # integers 5000..20000
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__min_df': randint(1, 6),                # integers 1..5
    'tfidf__max_df': uniform(0.8, 0.2),            # floats in [0.8, 1.0]
    'tfidf__stop_words': [None, 'english'],
    'clf__C': uniform(0.01, 10),                   # floats in [0.01, 10.01]
    'clf__class_weight': [None, 'balanced'],
}

param_distributions = [
    # Plain L2 / L1 penalties: l1_ratio is pinned to None.
    {**shared_space, 'clf__penalty': ['l2', 'l1'], 'clf__l1_ratio': [None]},
    # Elastic net: additionally sample the L1/L2 mixing ratio from [0, 1].
    {**shared_space, 'clf__penalty': ['elasticnet'], 'clf__l1_ratio': uniform(0, 1)},
]

# 3-fold stratified cross-validation; shuffling uses a fixed seed so the fold
# assignment is the same on every run.
cv = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=42,
)

# Size the full halving search relative to an earlier pilot run, aiming for
# roughly 60x the pilot's run time.
# presumably: the pilot tried `n_pilot_cand` candidates on a `pilot_frac`
# share of the training data -- TODO confirm against the pilot notebook.
n_pilot_cand, pilot_frac = 9, 0.3
scale_factor = 1 / pilot_frac  # about 3.33
# n_full_cand ~= 60 * n_pilot_cand / scale_factor
pilot_candidate_budget = 60 * n_pilot_cand
n_full_candidates = int(pilot_candidate_budget / scale_factor)  # int() truncates toward zero
print(f"Configuring full HalvingRandomSearchCV with ~{n_full_candidates} initial candidates")  # for sanity check

n_train_samples = len(X_train)

halving_search_full = HalvingRandomSearchCV(
    # What is tuned, and over which space.
    estimator=pipeline,
    param_distributions=param_distributions,
    n_candidates=n_full_candidates,
    # Successive-halving schedule: the budget is the number of training
    # samples, starting at a tenth of the training set and capped at all of it.
    resource='n_samples',
    min_resources=n_train_samples // 10,
    max_resources=n_train_samples,
    factor=3,
    # How candidates are scored.
    cv=cv,
    scoring='accuracy',
    return_train_score=True,
    error_score='raise',  # fail loudly instead of recording a placeholder score
    # Execution settings.
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

Configuring full HalvingRandomSearchCV with ~162 initial candidates


In [None]:
# Run the full successive-halving search on the training split.
halving_search_full.fit(X_train, y_train)

# Report the winning configuration from the search object that was just
# fitted, so the printed values always belong to this run.
print("Best parameters found:", halving_search_full.best_params_)
print(f"Best cross-validation accuracy: {halving_search_full.best_score_:.4f}")

n_iterations: 3
n_required_iterations: 5
n_possible_iterations: 3
min_resources_: 3000
max_resources_: 30000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 162
n_resources: 3000
Fitting 3 folds for each of 162 candidates, totalling 486 fits


In [None]:
# Take the best pipeline found by the search and score it on the validation split.
best_model = halving_search_full.best_estimator_
val_preds = best_model.predict(X_val)

val_accuracy = accuracy_score(y_val, val_preds)
# NOTE(review): classification_report applies target_names to the labels in
# sorted order, so this assumes the smaller label value means "Real" and the
# larger one "Fake" -- TODO confirm against the dataset's label encoding.
val_report = classification_report(y_val, val_preds, target_names=['Real', 'Fake'])
val_confusion = confusion_matrix(y_val, val_preds)

print("Validation Accuracy:", val_accuracy)
print(val_report)
print("Confusion Matrix:\n", val_confusion)

In [None]:
# Final check: score the selected pipeline on the held-out test split.
test_preds = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, test_preds)
# NOTE(review): classification_report applies target_names to the labels in
# sorted order, so this assumes the smaller label value means "Real" and the
# larger one "Fake" -- TODO confirm against the dataset's label encoding.
test_report = classification_report(y_test, test_preds, target_names=['Real', 'Fake'])
test_confusion = confusion_matrix(y_test, test_preds)

print("Test Accuracy:", test_accuracy)
print(test_report)
print("Confusion Matrix:\n", test_confusion)


In [None]:
# Persist the selected pipeline so it can be reloaded later with joblib.load.
# joblib files are pickle-based: only load them from trusted sources.
model_path = 'best_fake_news_detector_halving_random_search.joblib'
joblib.dump(best_model, model_path)