In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Fixed seed for reproducibility; used as the XGBoost classifier's `random_state`.
RANDOM_SEED = 2025

In [3]:
# Read the train and test splits of the article-bias dataset from CSV.
train_file = '../../data/Article-Bias-Prediction/article-bias-detection_train.csv'
test_file = '../../data/Article-Bias-Prediction/article-bias-detection_test.csv'

# Train is read first, then test, each into its own DataFrame.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

# Print each split's column index so the two schemas can be compared by eye.
for df in (train_df, test_df):
    print(df.columns)

Index(['topic', 'source', 'bias', 'url', 'title', 'date', 'authors', 'content',
       'content_original', 'source_url', 'bias_text', 'ID'],
      dtype='object')
Index(['topic', 'source', 'bias', 'url', 'title', 'date', 'authors', 'content',
       'content_original', 'source_url', 'bias_text', 'ID'],
      dtype='object')


In [4]:
# Replace missing values in the text columns with empty strings, in both
# splits, so the 'title' and 'content' columns contain no NaN entries.
for df in (train_df, test_df):
    for text_col in ('title', 'content'):
        df[text_col] = df[text_col].fillna('')

In [5]:
# Turn the string bias labels into integer class ids.
# The encoder is fitted on the training split only; the test split is then
# encoded with that same fitted mapping.
label_column = 'bias_text'
le = LabelEncoder()
y_train = le.fit_transform(train_df[label_column])
y_test = le.transform(test_df[label_column])

In [6]:
# Build TF-IDF features separately for titles and for article bodies.
# Each vectorizer is capped at `max_terms` features and is fitted on the
# training split only; the test split is transformed with the fitted vectorizer.
max_terms = 5000

# Titles
tfidf_title = TfidfVectorizer(max_features=max_terms)
X_title_train = tfidf_title.fit_transform(train_df['title'])
X_title_test = tfidf_title.transform(test_df['title'])

# Article bodies
tfidf_content = TfidfVectorizer(max_features=max_terms)
X_content_train = tfidf_content.fit_transform(train_df['content'])
X_content_test = tfidf_content.transform(test_df['content'])

In [7]:
# Combine features: stack the title and content TF-IDF matrices side by side
# (title columns first, then content columns), keeping one row per article.
# `format='csr'` pins the result to CSR rather than leaving the output format
# to SciPy's default choice, so the combined matrices are row-sliceable
# (useful for subsetting / cross-validation) whatever SciPy version is installed.
X_train = hstack([X_title_train, X_content_train], format='csr')
X_test  = hstack([X_title_test,  X_content_test],  format='csr')

In [8]:
# Train XGBoost multiclass classifier on the combined TF-IDF features.
# The number of classes is taken from the fitted label encoder instead of
# being hard-coded, so it cannot drift out of sync with the labels in the data.
# NOTE(review): xgboost's scikit-learn wrapper may also infer the class count
# from `y_train` itself — confirm for the installed version.
model = XGBClassifier(
    num_class=len(le.classes_),
    n_jobs=-1,
    random_state=RANDOM_SEED,  # fixed seed so repeated runs are comparable
)
model.fit(X_train, y_train)

In [9]:
# Predict & evaluate on the held-out test split.
y_pred = model.predict(X_test)

# Integer ids 0..n-1 in the same order as `le.classes_`. Passing them as
# `labels` keeps `target_names` aligned even if some class is missing from both
# `y_test` and `y_pred`; without it classification_report raises a ValueError
# about the number of classes not matching `target_names`.
class_ids = list(range(len(le.classes_)))

print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}\n')
print('Classification Report:')
print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_))

Accuracy: 0.7463

Classification Report:
              precision    recall  f1-score   support

      center       0.79      0.68      0.73      2155
        left       0.76      0.74      0.75      2607
       right       0.71      0.81      0.75      2751

    accuracy                           0.75      7513
   macro avg       0.75      0.74      0.74      7513
weighted avg       0.75      0.75      0.75      7513

