In [1]:
import os
import json
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Seed handed to the XGBoost classifier as `random_state` in the training cell;
# intended to keep training runs repeatable.
RANDOM_SEED = 2025

In [2]:
def load_article_bias_data_local(split_type="media", repo_root="Article-Bias-Prediction",
                                 data_root="../../data"):
    """
    Load the train/valid/test article frames from a local copy of the dataset.

    Expected structure (relative to ``data_root``):
    <repo_root>/
      jsons/            --> individual article JSON files, named <ID>.json
      splits/
        media/          --> train.tsv, valid.tsv, test.tsv (media-based split)
        random/         --> train.tsv, valid.tsv, test.tsv (random split)

    Each split file is read as a tab-separated table whose first line is
    skipped as a header. Only its first column (the article ID) is used to
    locate JSON files; the second column is read but not used here.

    Parameters:
      split_type (str): name of the sub-directory of ``splits/`` to read,
        e.g. "media" or "random"
      repo_root (str): directory name of the dataset inside ``data_root``
      data_root (str): directory that contains ``repo_root``; defaults to
        "../../data"

    Returns:
      (train_df, valid_df, test_df): one DataFrame per split, with one row per
      successfully loaded JSON file, in split-file order.

    Notes:
      A JSON file that is missing, unreadable, or not valid JSON is reported
      on stdout and skipped, so a returned frame can have fewer rows than its
      split file. A missing split file is not handled here and propagates the
      error raised by ``pd.read_csv``.
    """
    data_dir = os.path.join(data_root, repo_root)
    splits_dir = os.path.join(data_dir, "splits", split_type)
    jsons_dir = os.path.join(data_dir, "jsons")

    def read_tsv(filename):
        """Read one split file into a two-column (ID, bias) DataFrame."""
        return pd.read_csv(
            os.path.join(splits_dir, filename),
            sep="\t", header=None, names=["ID", "bias"], skiprows=1,
            # IDs are file-name stems: keep them as text so an all-numeric
            # column such as "007" is not parsed to the integer 7.
            dtype={"ID": str},
        )

    def load_jsons(ids_df):
        """Load <ID>.json for every ID in ``ids_df`` into one DataFrame."""
        records = []
        for article_id in ids_df["ID"]:
            json_path = os.path.join(jsons_dir, f"{article_id}.json")
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    records.append(json.load(f))
            # OSError: missing/unreadable file. ValueError: malformed JSON
            # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
            # Anything else is a programming error and should surface.
            except (OSError, ValueError) as e:
                print(f"Error loading {json_path}: {e}")
        return pd.DataFrame(records)

    train_ids = read_tsv("train.tsv")
    valid_ids = read_tsv("valid.tsv")
    test_ids  = read_tsv("test.tsv")

    train_df = load_jsons(train_ids)
    valid_df = load_jsons(valid_ids)
    test_df  = load_jsons(test_ids)

    return train_df, valid_df, test_df

# Load the three frames of the "random" split and show their shape/columns.
train_df, valid_df, test_df = load_article_bias_data_local(split_type="random")
for split_label, split_frame in (("Train:", train_df), ("Valid:", valid_df), ("Test: ", test_df)):
    print(split_label, "\n  Shape:", split_frame.shape, "\n  Columns:", split_frame.columns)

# Fold the validation rows into the training set (no separate validation step
# is run in this notebook). ignore_index=True gives the merged frame a fresh
# 0..n-1 index; without it both inputs contribute labels starting at 0, so the
# result carries duplicate row labels.
train_df = pd.concat([train_df, valid_df], ignore_index=True)

Train: 
  Shape: (27978, 12) 
  Columns: Index(['topic', 'source', 'bias', 'url', 'title', 'date', 'authors', 'content',
       'content_original', 'source_url', 'bias_text', 'ID'],
      dtype='object')
Valid: 
  Shape: (6996, 12) 
  Columns: Index(['topic', 'source', 'bias', 'url', 'title', 'date', 'authors', 'content',
       'content_original', 'source_url', 'bias_text', 'ID'],
      dtype='object')
Test:  
  Shape: (1300, 12) 
  Columns: Index(['topic', 'source', 'bias', 'url', 'title', 'date', 'authors', 'content',
       'content_original', 'source_url', 'bias_text', 'ID'],
      dtype='object')


In [3]:
# Build the model input text: missing titles/bodies become empty strings,
# then title and content are joined with a single space into `text`.
for frame in (train_df, test_df):
    for column in ('title', 'content'):
        frame[column] = frame[column].fillna('')
    frame['text'] = frame['title'] + ' ' + frame['content']

In [4]:
# Map the textual bias labels to integer class ids. The encoder is fitted on
# the training labels only and then reused, unchanged, for the test labels.
train_labels, test_labels = train_df['bias_text'], test_df['bias_text']
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_test = label_encoder.transform(test_labels)

In [5]:
# TF-IDF features: the vocabulary (capped at TFIDF_MAX_FEATURES terms) and the
# IDF weights are learned from the training text, then applied to the test text.
TFIDF_MAX_FEATURES = 60000
tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
train_text, test_text = train_df['text'], test_df['text']
X_train = tfidf.fit_transform(train_text)
X_test = tfidf.transform(test_text)

In [6]:
# Train XGBoost classifier on the TF-IDF features.
# The class count is read from the fitted label encoder rather than hard-coded,
# so it cannot drift from the labels actually present in the training data.
model = XGBClassifier(
    num_class=len(label_encoder.classes_),
    objective='multi:softmax',
    eval_metric='mlogloss',
    n_estimators=500,
    n_jobs=-1,                   # -1: use all available cores
    random_state=RANDOM_SEED,
)
model.fit(X_train, y_train)

In [7]:
# Score the held-out test set: overall accuracy plus a per-class
# precision/recall/F1 table labelled with the original class names.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_
)
print(f"\nAccuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(report_text)


Accuracy: 0.7738

Classification Report:
              precision    recall  f1-score   support

      center       0.75      0.91      0.82       299
        left       0.72      0.72      0.72       402
       right       0.83      0.74      0.78       599

    accuracy                           0.77      1300
   macro avg       0.77      0.79      0.78      1300
weighted avg       0.78      0.77      0.77      1300



# Save Best Model, TF-IDF Vectorizer & Label Encoder

In [8]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Persist the three fitted objects needed for inference. Note that
# 'tfidf_xgb_pipeline.joblib' holds only the XGBoost model, not a full
# pipeline; the vectorizer and label encoder are stored in their own files.
artifacts_to_save = (
    (model, 'tfidf_xgb_pipeline.joblib'),        # Trained XGBoost model
    (tfidf, 'tfidf_vectorizer.joblib'),          # TF-IDF vectorizer
    (label_encoder, 'label_encoder.joblib'),     # Label encoder
)
for artifact, artifact_path in artifacts_to_save:
    joblib.dump(artifact, artifact_path)

# Round-trip check: reload everything from disk (rebinding `model` and `tfidf`
# to the reloaded copies) and re-score the test set with the reloaded objects.
model = joblib.load('tfidf_xgb_pipeline.joblib')
le = joblib.load('label_encoder.joblib')
tfidf = joblib.load('tfidf_vectorizer.joblib')

# Rebuild test features and labels using only the reloaded objects
X_test = tfidf.transform(test_df["text"])
y_test = le.transform(test_df['bias_text'])

# Predict
y_pred = model.predict(X_test)

# Report
reloaded_accuracy = accuracy_score(y_test, y_pred)
reloaded_report = classification_report(y_test, y_pred, target_names=le.classes_)
print(f"Test Accuracy: {reloaded_accuracy:.4f}")
print("Classification Report:")
print(reloaded_report)

Test Accuracy: 0.7738
Classification Report:
              precision    recall  f1-score   support

      center       0.75      0.91      0.82       299
        left       0.72      0.72      0.72       402
       right       0.83      0.74      0.78       599

    accuracy                           0.77      1300
   macro avg       0.77      0.79      0.78      1300
weighted avg       0.78      0.77      0.77      1300

