In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, log_loss, cohen_kappa_score, matthews_corrcoef,
    top_k_accuracy_score, classification_report, roc_curve,
    auc, roc_auc_score, f1_score
)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from xgboost.callback import EarlyStopping


In [2]:
categories = None
newsgroups = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=5)
X, y = newsgroups.data, newsgroups.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5, stratify=y)

In [3]:
def evaluate_multiclass_model(
    y_test,
    y_pred,
    y_score,
    class_names,
    top_k=3,
    show_report=False
):
    """
    Evaluate a multi-class classification model and return metrics as a dictionary.

    Parameters:
    - y_test: True labels (integers)
    - y_pred: Predicted labels (integers)
    - y_score: Predicted class probabilities (shape: [n_samples, n_classes])
    - class_names: List of class names (strings)
    - top_k: Value of k for Top-k Accuracy (default: 3)
    - show_report: Whether to return the classification report (default: False)

    Returns:
    - A dictionary of evaluation metrics
    """

    n_classes = len(class_names)
    y_test_bin = label_binarize(y_test, classes=range(n_classes))

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "log_loss": log_loss(y_test, y_score),
        "f1_macro": f1_score(y_test, y_pred, average="macro"),
        "f1_weighted": f1_score(y_test, y_pred, average="weighted"),
        "cohen_kappa": cohen_kappa_score(y_test, y_pred),
        "mcc": matthews_corrcoef(y_test, y_pred),
        f"top_{top_k}_accuracy": top_k_accuracy_score(y_test, y_score, k=top_k),
        "macro_auc": roc_auc_score(y_test_bin, y_score, average="macro"),
    }

    if show_report:
        metrics["classification_report"] = classification_report(
            y_test, y_pred, target_names=class_names, output_dict=True
        )

    return metrics

In [4]:
def plot_multiclass_roc(y_test, y_score, class_names):
    """
    Plot ROC curves for multi-class classification using One-vs-Rest approach.

    Parameters:
    - y_test: True labels (integers)
    - y_score: Predicted probabilities (shape: [n_samples, n_classes])
    - class_names: List of class names
    """
    n_classes = len(class_names)
    y_test_bin = label_binarize(y_test, classes=range(n_classes))

    plt.figure(figsize=(8, 6))
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"{class_names[i]} (AUC = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves (One-vs-Rest)")
    plt.legend(loc="lower right", fontsize='small')
    plt.grid()
    plt.show()


In [5]:
models = {
#    "Naive Bayes": MultinomialNB(),
#    "Logistic Regression": LogisticRegression(max_iter=1000),
#    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
#    "LightGBM": LGBMClassifier(
#        n_estimators=1000,
#        learning_rate=0.05,
#        max_depth=8,
#        num_leaves=64,
#        subsample=0.8,
#        colsample_bytree=0.8,
#        random_state=5,
#        verbose=-1
#    ),
    "XGBoost": XGBClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric="mlogloss",    
        tree_method="hist",  
        random_state=42,
        verbosity=0
    ),
}


In [6]:
all_results = []

for model_name, clf in models.items():
    print(f"Evaluating: {model_name}")
    
    if model_name == "LightGBM":
        # TF-IDF + early stopping
        X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        # TF-IDF
        vectorizer = TfidfVectorizer()
        X_train_vec = vectorizer.fit_transform(X_train_part)
        X_val_vec = vectorizer.transform(X_val_part)
        X_test_vec = vectorizer.transform(X_test)

        clf.fit(
            X_train_vec, y_train_part,
            eval_set=[(X_val_vec, y_val_part)],
            eval_metric="multi_logloss",
            callbacks=[
                early_stopping(stopping_rounds=20),
                log_evaluation(period=0)  # no verbosity
            ]
        )
        y_pred = clf.predict(X_test_vec)
        y_score = clf.predict_proba(X_test_vec)


    elif model_name == "XGBoost":
        X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )
    
        vectorizer = TfidfVectorizer()
        X_train_vec = vectorizer.fit_transform(X_train_part)
        X_val_vec = vectorizer.transform(X_val_part)
        X_test_vec = vectorizer.transform(X_test)
    
        clf.fit(
            X_train_vec,
            y_train_part,
            eval_set=[(X_val_vec, y_val_part)],
            early_stopping_rounds=20,   
            verbose=False
        )
    
        y_pred = clf.predict(X_test_vec)
        y_score = clf.predict_proba(X_test_vec)


    else:
        model = make_pipeline(TfidfVectorizer(), clf)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_score = model.predict_proba(X_test)

    results = evaluate_multiclass_model(
        y_test=y_test,
        y_pred=y_pred,
        y_score=y_score,
        class_names=newsgroups.target_names,
        show_report=False
    )
    results["model"] = model_name
    all_results.append(results)

    plot_multiclass_roc(
        y_test=y_test,
        y_score=y_score,
        class_names=newsgroups.target_names
    )

df_all = pd.DataFrame(all_results).set_index("model").round(4)
display(df_all)


Evaluating: XGBoost


TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
import xgboost
print(xgboost.__version__)


In [11]:
import xgboost; print(xgboost.__version__)
from xgboost import XGBClassifier; print(XGBClassifier.__module__)
import sklearn; print(sklearn.__version__)
import sys; print(sys.version)

3.0.1
xgboost.sklearn
1.6.1
3.10.16 (main, Dec 11 2024, 10:22:29) [Clang 14.0.6 ]


xgboost.sklearn


1.6.1
