In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from scipy.stats import uniform, randint

# Single seed shared by the classifier, the CV splitter and the randomized
# search below, so that a Restart & Run All reproduces the same results.
RANDOM_SEED = 2025

In [2]:
# Read the pre-split train and test CSV files into data frames
train_csv = '../../data/Article-Bias-Prediction/article-bias-detection_train.csv'
test_csv = '../../data/Article-Bias-Prediction/article-bias-detection_test.csv'
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [3]:
# Replace missing titles/bodies with empty strings so the string
# concatenation further down never produces NaN.
for frame in (train_df, test_df):
    for column in ('title', 'content'):
        frame[column] = frame[column].fillna('')

# Map the textual bias labels to integer codes; the encoder is fitted on the
# training labels only and then reused for the test labels.
le = LabelEncoder()
le.fit(train_df['bias_text'])
y_train = le.transform(train_df['bias_text'])
y_test = le.transform(test_df['bias_text'])

# Build one document per article: "<title> <content>"
train_text, test_text = (
    frame['title'] + " " + frame['content']
    for frame in (train_df, test_df)
)

In [4]:
# Define pipeline: TF-IDF features feeding a gradient-boosted tree classifier.
# Keeping the vectorizer inside the pipeline means it is re-fitted on the
# training part of every CV fold, so no vocabulary/IDF statistics leak from
# the validation part.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('xgb', XGBClassifier(
        # No explicit num_class: the scikit-learn wrapper derives the number
        # of classes from y at fit time, so a hard-coded value can only go
        # stale if the label set changes.
        # n_jobs=1 on purpose: parallelism is handled by RandomizedSearchCV
        # below. Using -1 in both places makes every search worker spawn a
        # full set of XGBoost threads and oversubscribes the CPU.
        n_jobs=1,
        random_state=RANDOM_SEED
    ))
])

# Define joint search space (vectorizer and classifier are tuned together).
# Note: scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    # TF-IDF
    'tfidf__max_features': [3000, 5000, 8000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': [1, 2],
    'tfidf__max_df': [0.85, 0.95],
    'tfidf__sublinear_tf': [True, False],

    # XGBoost
    'xgb__n_estimators': randint(100, 800),        # integers in [100, 800)
    'xgb__max_depth': randint(3, 10),              # integers in [3, 10)
    'xgb__learning_rate': uniform(0.01, 0.3),      # [0.01, 0.31]
    'xgb__subsample': uniform(0.6, 0.4),           # [0.6, 1.0]
    'xgb__colsample_bytree': uniform(0.6, 0.4),    # [0.6, 1.0]
    'xgb__gamma': uniform(0, 5),                   # [0, 5]
    'xgb__reg_alpha': uniform(0, 1),               # [0, 1]
    'xgb__reg_lambda': uniform(0, 1),              # [0, 1]
}

# Stratified CV: shuffled 3-fold split that preserves class proportions
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

# Random search: 36 sampled configurations x 3 folds = 108 fits, scored by
# macro-averaged F1. The search is the only level that runs in parallel.
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=36,
    cv=cv,
    scoring='f1_macro',
    verbose=2,
    random_state=RANDOM_SEED,
    n_jobs=-1
)

# Fit (the best configuration is refitted on the full training set afterwards)
search.fit(train_text, y_train)

# Best results
print("Best Parameters:")
print(search.best_params_)
print(f"\nBest CV F1 score: {search.best_score_:.4f}")

# Evaluate the refitted best pipeline on the held-out test set
y_pred = search.predict(test_text)
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END tfidf__max_df=0.85, tfidf__max_features=8000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), tfidf__sublinear_tf=True, xgb__colsample_bytree=0.642221019999886, xgb__gamma=3.237997531452609, xgb__learning_rate=0.2884773944115559, xgb__max_depth=3, xgb__n_estimators=585, xgb__reg_alpha=0.78276849160058, xgb__reg_lambda=0.126408546896565, xgb__subsample=0.9528216594486966; total time= 7.8min
[CV] END tfidf__max_df=0.95, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 2), tfidf__sublinear_tf=False, xgb__colsample_bytree=0.6504303094000228, xgb__gamma=3.0751242532664307, xgb__learning_rate=0.10821432085882737, xgb__max_depth=3, xgb__n_estimators=716, xgb__reg_alpha=0.4996489668070282, xgb__reg_lambda=0.8790444951447868, xgb__subsample=0.6300408701714589; total time=13.0min




[CV] END tfidf__max_df=0.95, tfidf__max_features=8000, tfidf__min_df=2, tfidf__ngram_range=(1, 1), tfidf__sublinear_tf=False, xgb__colsample_bytree=0.6077414414877761, xgb__gamma=2.3186822154881224, xgb__learning_rate=0.20885302257757454, xgb__max_depth=8, xgb__n_estimators=242, xgb__reg_alpha=0.33959424514489256, xgb__reg_lambda=0.08744431464946445, xgb__subsample=0.8146168078693996; total time=12.3min
[CV] END tfidf__max_df=0.85, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 2), tfidf__sublinear_tf=False, xgb__colsample_bytree=0.7957091965943826, xgb__gamma=1.749577829510402, xgb__learning_rate=0.2983216645206764, xgb__max_depth=9, xgb__n_estimators=112, xgb__reg_alpha=0.49874536391231916, xgb__reg_lambda=0.9267631049637567, xgb__subsample=0.8705335607135489; total time=10.9min
[CV] END tfidf__max_df=0.95, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), tfidf__sublinear_tf=False, xgb__colsample_bytree=0.7133426947522157, xgb__gamma=3.53832650