In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tqdm import tqdm
from xgboost import XGBClassifier

In [2]:
# Load the Adult census dataset from OpenML and binarise the target:
# 1 when the income label is ">50K", 0 otherwise.
X, y = fetch_openml(name="adult", version=2, as_frame=True, return_X_y=True)
y = (y == ">50K").astype(int)

# Route columns by dtype *family* rather than by exact width. ColumnTransformer
# drops every column that is not listed (its default is remainder="drop"), so a
# numeric column stored as e.g. int32/float32, or a string column stored as
# object, would otherwise disappear from the features without any warning.
numerical = X.select_dtypes(include="number").columns.tolist()
categorical = X.select_dtypes(include=["category", "object"]).columns.tolist()

# Fail loudly if some column would still be silently discarded.
unrouted = X.columns.difference(numerical + categorical).tolist()
assert not unrouted, f"columns not handled by the preprocessor: {unrouted}"

# Numeric features: mean imputation only.
# Categorical features: most-frequent imputation followed by one-hot encoding;
# handle_unknown="ignore" keeps categories unseen during fit from raising.
preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy="mean"), numerical),
    ("cat", Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OneHotEncoder(handle_unknown="ignore"))
    ]), categorical)
])

In [3]:
# Two-stage stratified split: train / selection (validation) / test = 72% / 18% / 10%.

# Stage 1: hold out 10% of the full data as the final test set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

# Stage 2: 20% of the remaining 90% (18% overall) becomes the selection set;
# what is left (72% overall) is the training set.
X_train, X_sel, y_train, y_sel = train_test_split(
    X_dev,
    y_dev,
    test_size=0.2,
    random_state=42,
    stratify=y_dev,
)

In [19]:
# Boosting models to compare: three libraries x two base-tree depths.
N_ESTIMATORS = 30  # number of boosting rounds, identical for every model
MODEL_SEED = 0     # random_state shared by every model

# Keyword arguments shared by both XGBoost variants.
# `use_label_encoder` is intentionally no longer passed: it is a deprecated
# XGBoost argument that has no effect in current releases, and the target is
# already encoded as 0/1 integers.
XGB_COMMON = dict(
    n_estimators=N_ESTIMATORS,
    eval_metric="logloss",
    verbosity=0,
    random_state=MODEL_SEED,
    n_jobs=-1,
)

# NOTE(review): max_depth=None does not mean the same thing everywhere.
# For the scikit-learn trees it means "grow until the leaves are pure"
# (unlimited depth); for XGBoost it means "use the library default" (6 per the
# XGBoost parameter docs), so the "deeper" XGB model is depth-limited while the
# other two are not. Confirm this asymmetry is intended.
models = {
    "AdaBoost (árbol débil)": AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=1),  # decision stumps
        n_estimators=N_ESTIMATORS,
        random_state=MODEL_SEED,
    ),
    "AdaBoost (árbol más profundo)": AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=None),
        n_estimators=N_ESTIMATORS,
        random_state=MODEL_SEED,
    ),
    "GB (árbol débil)": GradientBoostingClassifier(
        max_depth=1,
        n_estimators=N_ESTIMATORS,
        random_state=MODEL_SEED,
    ),
    "GB (árbol más profundo)": GradientBoostingClassifier(
        max_depth=None,
        n_estimators=N_ESTIMATORS,
        random_state=MODEL_SEED,
    ),
    "XGB (árbol débil)": XGBClassifier(max_depth=1, **XGB_COMMON),
    "XGB (árbol más profundo)": XGBClassifier(max_depth=None, **XGB_COMMON),
}

In [20]:
# Fit every candidate on the training split and record its accuracy on the
# selection split, keyed by the model's display name.
results = {}

for name, model in tqdm(models.items()):
    # Pipeline stores the objects it is given (it does not copy them), so the
    # shared `preprocessor` and the estimator held in `models` are fitted in
    # place on each iteration.
    pipe = Pipeline(steps=[("preproc", preprocessor), ("clf", model)])
    pipe.fit(X_train, y_train)
    results[name] = accuracy_score(y_sel, pipe.predict(X_sel))

100%|██████████| 6/6 [02:15<00:00, 22.57s/it]


In [21]:
# Report the selection-set accuracy of every model: one line per model, with
# the name left-aligned in a 35-character column and 4 decimal places.
print(
    "Accuracy en conjunto de selección:",
    *(f"{label:35s}: {score:.4f}" for label, score in results.items()),
    sep="\n",
)

Accuracy en conjunto de selección:
AdaBoost (árbol débil)             : 0.8447
AdaBoost (árbol más profundo)      : 0.8215
GB (árbol débil)                   : 0.8292
GB (árbol más profundo)            : 0.8161
XGB (árbol débil)                  : 0.8495
XGB (árbol más profundo)           : 0.8760


In [22]:
# Pick the model with the highest selection-set accuracy and score it once on
# the held-out test split.
best_model_name = max(results, key=lambda candidate: results[candidate])
best_model = models[best_model_name]

# NOTE(review): the winner is refitted on X_train only. Refitting on X_dev
# (train + selection) would use all non-test data for the final model —
# confirm which protocol is intended.
pipe = Pipeline(steps=[("preproc", preprocessor), ("clf", best_model)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"\nMejor modelo en selección: {best_model_name}")
print(f"Accuracy en test: {acc:.4f}")


Mejor modelo en selección: XGB (árbol más profundo)
Accuracy en test: 0.8696
