# Logistische Regression - hyperparameter tuning

In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from scipy.stats import uniform, randint
import joblib


In [3]:
train_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv"
val_path   = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv"
test_path  = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"

# Reading with tab separator and parsing dates
df_train = pd.read_csv(train_path, sep='\t', parse_dates=["date"], dayfirst=False)
df_val   = pd.read_csv(val_path,   sep='\t', parse_dates=["date"], dayfirst=False)
df_test  = pd.read_csv(test_path,  sep='\t', parse_dates=["date"], dayfirst=False)

# Merge title and text into a single input text
def merge_text(row):
    return f"{row['title']} \n{row['text']}"

for df in (df_train, df_val, df_test):
    df['input_text'] = df.apply(merge_text, axis=1)

X_train, y_train = df_train['input_text'], df_train['label']
X_val,   y_val   = df_val['input_text'],   df_val['label']
X_test,  y_test  = df_test['input_text'],  df_test['label']  # For final evaluation

In [4]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='saga', max_iter=5000, random_state=42, n_jobs=-1))
])

param_distributions = [
    {
        'tfidf__max_features': randint(5000, 20001),
        'tfidf__ngram_range': [(1,1), (1,2)],
        'tfidf__min_df': randint(1, 6),
        'tfidf__max_df': uniform(0.8, 0.2),
        'tfidf__stop_words': [None, 'english'],
        'clf__penalty': ['l2', 'l1'],
        'clf__C': uniform(0.01, 10),
        'clf__class_weight': [None, 'balanced'],
        'clf__l1_ratio': [None]
    },
    {
        'tfidf__max_features': randint(5000, 20001),
        'tfidf__ngram_range': [(1,1), (1,2)],
        'tfidf__min_df': randint(1, 6),
        'tfidf__max_df': uniform(0.8, 0.2),
        'tfidf__stop_words': [None, 'english'],
        'clf__penalty': ['elasticnet'],
        'clf__C': uniform(0.01, 10),
        'clf__l1_ratio': uniform(0, 1),
        'clf__class_weight': [None, 'balanced']
    }
]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define halving search (full run) with adjusted number of candidates to fit ~60x pilot time
n_pilot_cand = 9
pilot_frac = 0.3
scale_factor = 1 / pilot_frac  # ≈3.33
n_full_candidates = int(100 * n_pilot_cand / scale_factor)
print(f"Configuring full HalvingRandomSearchCV with ~{n_full_candidates} initial candidates")  # for sanity check

halving_search_full = HalvingRandomSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_candidates=n_full_candidates,
    factor=3,
    resource='n_samples',
    max_resources=len(X_train),
    min_resources=len(X_train) // (300),
    scoring='accuracy',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42,
    return_train_score=True,
    error_score='raise'
)

Configuring full HalvingRandomSearchCV with ~270 initial candidates


In [5]:
halving_search_full.fit(X_train, y_train)

print("Best parameters found:", halving_search_full.best_params_)
print(f"Best cross-validation accuracy: {halving_search_full.best_score_:.4f}")

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 100
max_resources_: 30000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 270
n_resources: 100
Fitting 5 folds for each of 270 candidates, totalling 1350 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 1
n_candidates: 90
n_resources: 300
Fitting 5 folds for each of 90 candidates, totalling 450 fits
----------
iter: 2
n_candidates: 30
n_resources: 900
Fitting 5 folds for each of 30 candidates, totalling 150 fits
----------
iter: 3
n_candidates: 10
n_resources: 2700
Fitting 5 folds for each of 10 candidates, totalling 50 fits
----------
iter: 4
n_candidates: 4
n_resources: 8100
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 5
n_candidates: 2
n_resources: 24300
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters found: {'clf__C': 8.725368061523762, 'clf__class_weight': 'balanced', 'clf__l1_ratio': 0.6760522571867544, 'clf__penalty': 'elasticnet', 'tfidf__max_df': 0.8891916614020643, 'tfidf__max_features': 13954, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': None}
Best cross-validation accuracy: 0.9911


In [6]:
best_model = halving_search_full.best_estimator_
val_preds = best_model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds, target_names=['Real', 'Fake']))
print("Confusion Matrix:\n", confusion_matrix(y_val, val_preds))

Validation Accuracy: 0.993
              precision    recall  f1-score   support

        Real       1.00      0.99      0.99      3089
        Fake       0.99      1.00      0.99      2911

    accuracy                           0.99      6000
   macro avg       0.99      0.99      0.99      6000
weighted avg       0.99      0.99      0.99      6000

Confusion Matrix:
 [[3061   28]
 [  14 2897]]


In [7]:
test_preds = best_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, test_preds))
print(classification_report(y_test, test_preds, target_names=['Real', 'Fake']))
print("Confusion Matrix:\n", confusion_matrix(y_test, test_preds))


Test Accuracy: 0.992500302407161
              precision    recall  f1-score   support

        Real       0.99      0.99      0.99      4284
        Fake       0.99      0.99      0.99      3983

    accuracy                           0.99      8267
   macro avg       0.99      0.99      0.99      8267
weighted avg       0.99      0.99      0.99      8267

Confusion Matrix:
 [[4258   26]
 [  36 3947]]


In [8]:
joblib.dump(best_model, 'best_fake_news_detector_halving_random_search.joblib')

['best_fake_news_detector_halving_random_search.joblib']