# Random Forest - ErfanMoosaviMonazzah

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import uniform, randint
import joblib

In [3]:
# Locations of the three TSV splits, relative to this notebook.
train_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv"
val_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv"
test_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"

# Every split is read the same way: tab-separated, with the "date" column
# handed to pandas' date parser (dayfirst left at False).
read_options = dict(sep='\t', parse_dates=["date"], dayfirst=False)

df_train, df_val, df_test = (
    pd.read_csv(split_path, **read_options)
    for split_path in (train_path, val_path, test_path)
)

def merge_text(row):
    """Join a row's title and body into the single string used as model input.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys ``'title'`` and ``'text'``.

    Returns
    -------
    str
        The title, a space, a newline, then the text. A missing value
        (None / NaN / pd.NA) contributes an empty string, so the literal
        tokens "nan" or "None" never leak into the vectorizer's vocabulary.
    """
    title, text = (
        "" if pd.isna(row[key]) else row[key]
        for key in ('title', 'text')
    )
    return f"{title} \n{text}"

# Attach the merged title+body column to each split (the frames are modified in place).
for df in [df_train, df_val, df_test]:
    df['input_text'] = df.apply(merge_text, axis=1)

# Model inputs come from the merged text column, targets from 'label'.
feature_col, target_col = 'input_text', 'label'
X_train, X_val, X_test = (split[feature_col] for split in (df_train, df_val, df_test))
y_train, y_val, y_test = (split[target_col] for split in (df_train, df_val, df_test))

In [4]:
# Two-step text classifier: TF-IDF features feeding a random forest.
# NOTE(review): the forest itself uses n_jobs=-1; if the surrounding search
# also runs in parallel this may oversubscribe CPU cores — confirm on the
# target machine.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Candidate tree depths are drawn once, up front. The draw is seeded so the
# search space is identical on every kernel restart; an unseeded rvs() call
# would make the whole search irreproducible even though every other
# component is seeded with 42. tolist() yields plain Python ints.
max_depth_choices = [None] + randint(5, 26).rvs(5, random_state=42).tolist()

# Search space for the randomized search. Keys follow the pipeline's
# "<step>__<parameter>" naming; scipy distributions are sampled per candidate,
# lists are sampled uniformly.
param_distributions = {
    # --- TF-IDF vectorizer ---
    'tfidf__max_features': randint(5000, 20001),   # integers 5000..20000
    'tfidf__ngram_range': [(1,1), (1,2)],          # unigrams, or unigrams + bigrams
    'tfidf__min_df': randint(1, 6),                # integers 1..5
    'tfidf__max_df': uniform(0.8, 0.2),            # floats in [0.8, 1.0]
    'tfidf__stop_words': [None, 'english'],

    # --- Random forest ---
    'clf__n_estimators': randint(100, 501),        # integers 100..500
    'clf__max_depth': max_depth_choices,           # unlimited, or one of the seeded draws from 5..25
    'clf__min_samples_split': randint(2, 11),      # integers 2..10
    'clf__min_samples_leaf': randint(1, 11),       # integers 1..10
    'clf__max_features': [None, 'sqrt', 'log2']
}

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
# Size of the initial candidate pool, derived from the pilot settings below.
n_pilot = 9
pilot_frac = 0.3
scale = 1/pilot_frac  # ≈3.33
# round() instead of int(): the quotient is a float, and int() would silently
# truncate a result such as 269.999… to 269 rather than the intended 270.
n_full = round(100 * n_pilot / scale)
print(f"Initial candidates for RF halving: {n_full}")

# Successive-halving randomized search over the pipeline's hyperparameters.
# The resource being increased each iteration is the number of training
# samples; candidates are scored by cross-validated accuracy.
halving_search = HalvingRandomSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_candidates=n_full,
    factor=3,
    resource='n_samples',
    max_resources=len(X_train),
    min_resources=len(X_train)//300,  # ensure 6 iterations
    scoring='accuracy',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42,
    return_train_score=True,
    error_score='raise'  # fail loudly on a bad candidate instead of scoring it as NaN
)


Initial candidates for RF halving: 270


In [6]:
# Run the halving search on the training split, then report the winner.
halving_search.fit(X_train, y_train)

best_params = halving_search.best_params_
best_cv_accuracy = halving_search.best_score_
print("Best RF params:", best_params)
print(f"Best RF CV accuracy: {best_cv_accuracy:.4f}")

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 100
max_resources_: 30000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 270
n_resources: 100
Fitting 5 folds for each of 270 candidates, totalling 1350 fits
----------
iter: 1
n_candidates: 90
n_resources: 300
Fitting 5 folds for each of 90 candidates, totalling 450 fits
----------
iter: 2
n_candidates: 30
n_resources: 900
Fitting 5 folds for each of 30 candidates, totalling 150 fits
----------
iter: 3
n_candidates: 10
n_resources: 2700
Fitting 5 folds for each of 10 candidates, totalling 50 fits
----------
iter: 4
n_candidates: 4
n_resources: 8100
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 5
n_candidates: 2
n_resources: 24300
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best RF params: {'clf__max_depth': 22, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 4, 'clf__n_estimators': 419, 'tfidf__max

In [7]:
# Score the best pipeline found by the search on the validation split.
best_rf = halving_search.best_estimator_
val_preds = best_rf.predict(X_val)

# NOTE(review): target_names are applied to the labels in sorted order, so this
# assumes the lower label value means "Real" and the higher one "Fake" —
# confirm against the dataset's label encoding.
class_names = ['Real','Fake']
val_accuracy = accuracy_score(y_val, val_preds)
val_report = classification_report(y_val, val_preds, target_names=class_names)
val_confusion = confusion_matrix(y_val, val_preds)

print("Val Accuracy:", val_accuracy)
print(val_report)
print("Confusion Matrix:\n", val_confusion)


Val Accuracy: 0.9858333333333333
              precision    recall  f1-score   support

        Real       0.99      0.98      0.99      3089
        Fake       0.98      0.99      0.99      2911

    accuracy                           0.99      6000
   macro avg       0.99      0.99      0.99      6000
weighted avg       0.99      0.99      0.99      6000

Confusion Matrix:
 [[3039   50]
 [  35 2876]]


In [8]:
# Final check of the selected pipeline on the held-out test split.
test_preds = best_rf.predict(X_test)

# NOTE(review): target_names are applied to the labels in sorted order, so this
# assumes the lower label value means "Real" and the higher one "Fake" —
# confirm against the dataset's label encoding.
class_names = ['Real','Fake']
test_accuracy = accuracy_score(y_test, test_preds)
test_report = classification_report(y_test, test_preds, target_names=class_names)
test_confusion = confusion_matrix(y_test, test_preds)

print("Test Accuracy:", test_accuracy)
print(test_report)
print("Confusion Matrix:\n", test_confusion)

Test Accuracy: 0.9852425305431233
              precision    recall  f1-score   support

        Real       0.99      0.98      0.99      4284
        Fake       0.98      0.99      0.98      3983

    accuracy                           0.99      8267
   macro avg       0.99      0.99      0.99      8267
weighted avg       0.99      0.99      0.99      8267

Confusion Matrix:
 [[4210   74]
 [  48 3935]]


In [9]:

# Persist the fitted pipeline (vectorizer + forest) next to the notebook.
# The dump call stays the last expression so the cell still displays the
# list of written file names.
model_path = 'best_fake_news_rf_halving_random_search.joblib'
joblib.dump(best_rf, model_path)

['best_fake_news_rf_halving_random_search.joblib']