# XGBoost vs LightGBM – Baseline Notebook

This notebook implements the baseline pipeline for the Focused Comparative Study:

- Load dataset
- Train XGBoost
- Train LightGBM
- Evaluate performance
- Run a simple ablation study


In [None]:
# Imports
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

## Load Dataset

In [None]:
def load_data(test_size=0.2, random_state=42, stratify=False):
    """Load the breast-cancer dataset and split it into train and test sets.

    Called with no arguments this reproduces the notebook's baseline split
    (20% test, seed 42, no stratification).

    Args:
        test_size: Size of the held-out test set; forwarded unchanged to
            ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
        stratify: If true, split so that the class labels are used as the
            ``stratify`` argument of ``train_test_split``; if false (the
            default) no stratification is requested.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data,
        data.target,
        test_size=test_size,
        random_state=random_state,
        # None tells train_test_split not to stratify (its default).
        stratify=data.target if stratify else None,
    )
    return X_train, X_test, y_train, y_test

# Build the train/test split once at notebook level; the cells below reuse
# these four names.
X_train, X_test, y_train, y_test = load_data()
# Last expression of the cell: the shapes of the two feature matrices.
X_train.shape, X_test.shape

## Define Models

In [None]:
def get_xgboost_model(params=None):
    """Build an unfitted XGBoost classifier.

    Args:
        params: Optional dict of ``XGBClassifier`` keyword arguments. Its
            entries override the baseline settings below; any key it does not
            mention keeps its baseline value. This lets a partial dict (such
            as the ``learning_rate`` / ``n_estimators`` pair built by
            ``run_ablation``) vary only what it names instead of discarding
            the rest of the baseline. The dict itself is not modified.

    Returns:
        A new ``XGBClassifier`` instance.
    """
    settings = {
        'n_estimators': 200,
        'learning_rate': 0.1,
        'max_depth': 6,
        'subsample': 0.9,
        'eval_metric': 'logloss',
    }
    if params is not None:
        # Overlay the caller's overrides on a fresh copy of the baseline.
        settings.update(params)
    return XGBClassifier(**settings)


def get_lightgbm_model(params=None):
    """Build an unfitted LightGBM classifier.

    Args:
        params: Optional dict of ``LGBMClassifier`` keyword arguments. Its
            entries override the baseline settings below; any key it does not
            mention keeps its baseline value, so a partial dict only changes
            what it names. The dict itself is not modified.

    Returns:
        A new ``LGBMClassifier`` instance.
    """
    settings = {
        'n_estimators': 200,
        'learning_rate': 0.1,
        'max_depth': -1,
        'subsample': 0.9,
        # LightGBM ignores `subsample` (bagging_fraction) unless bagging is
        # switched on with a non-zero `subsample_freq` (bagging_freq).
        # Resample every iteration so the 0.9 row subsampling is actually
        # applied, as it is for the XGBoost baseline.
        'subsample_freq': 1,
    }
    if params is not None:
        # Overlay the caller's overrides on a fresh copy of the baseline.
        settings.update(params)
    return LGBMClassifier(**settings)

## Evaluation Function

In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: Object exposing ``predict(X)``; it is called once on
            ``X_test``.
        X_test: Features passed to ``model.predict``.
        y_test: Reference labels the predictions are compared against.

    Returns:
        A dict with two entries: ``'accuracy'`` (from ``accuracy_score``) and
        ``'f1_score'`` (from ``f1_score`` with its default settings).
    """
    predicted = model.predict(X_test)
    return {
        'accuracy': accuracy_score(y_test, predicted),
        'f1_score': f1_score(y_test, predicted),
    }

## Train & Evaluate Baseline Models

In [None]:
# XGBoost baseline: build the model with get_xgboost_model()'s default
# settings, fit it on the training split, then score it on the test split.
xgb_model = get_xgboost_model()
xgb_model.fit(X_train, y_train)
xgb_metrics = evaluate_model(xgb_model, X_test, y_test)

# Last expression of the cell: the accuracy / F1 dict for XGBoost.
xgb_metrics

In [None]:
# LightGBM baseline: build the model with get_lightgbm_model()'s default
# settings, fit it on the training split, then score it on the test split.
lgbm_model = get_lightgbm_model()
lgbm_model.fit(X_train, y_train)
lgbm_metrics = evaluate_model(lgbm_model, X_test, y_test)

# Last expression of the cell: the accuracy / F1 dict for LightGBM.
lgbm_metrics

## Ablation Study

In [None]:
def run_ablation(X_train, X_test, y_train, y_test,
                 learning_rates=(0.01, 0.1), n_estimators_list=(100, 300)):
    """Train and score both libraries over a learning-rate x tree-count grid.

    For every ``(learning_rate, n_estimators)`` combination, one XGBoost and
    one LightGBM model are built from the same two-key ``params`` dict, fitted
    on the training split and scored on the test split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        learning_rates: Learning rates to try. Defaults to the notebook's
            original grid ``(0.01, 0.1)``.
        n_estimators_list: Tree counts to try. Defaults to the notebook's
            original grid ``(100, 300)``.

    Returns:
        A list with one dict per grid point, ordered with the learning rate
        varying slowest. Each dict holds ``learning_rate``, ``n_estimators``,
        ``xgb_accuracy``, ``xgb_f1``, ``lgb_accuracy`` and ``lgb_f1``.
    """
    # Function-scope import keeps this cell self-contained in the notebook.
    from itertools import product

    results = []

    # product() iterates in the same order as the equivalent nested loops:
    # outer over learning rates, inner over tree counts.
    for lr, n_est in product(learning_rates, n_estimators_list):
        params = {'learning_rate': lr, 'n_estimators': n_est}

        # XGBoost
        xgb = get_xgboost_model(params)
        xgb.fit(X_train, y_train)
        xgb_m = evaluate_model(xgb, X_test, y_test)

        # LightGBM
        lgb = get_lightgbm_model(params)
        lgb.fit(X_train, y_train)
        lgb_m = evaluate_model(lgb, X_test, y_test)

        results.append({
            'learning_rate': lr,
            'n_estimators': n_est,
            'xgb_accuracy': xgb_m['accuracy'],
            'xgb_f1': xgb_m['f1_score'],
            'lgb_accuracy': lgb_m['accuracy'],
            'lgb_f1': lgb_m['f1_score'],
        })

    return results

# Run the ablation on the notebook-level train/test split.
ablation_results = run_ablation(X_train, X_test, y_train, y_test)
# Last expression of the cell: the list of per-configuration result dicts.
ablation_results