In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tqdm import tqdm

In [2]:
# Cargar y preprocesar el dataset
X, y = fetch_openml(name="adult", version=2, as_frame=True, return_X_y=True)

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             48842 non-null  int64   
 1   workclass       46043 non-null  category
 2   fnlwgt          48842 non-null  int64   
 3   education       48842 non-null  category
 4   education-num   48842 non-null  int64   
 5   marital-status  48842 non-null  category
 6   occupation      46033 non-null  category
 7   relationship    48842 non-null  category
 8   race            48842 non-null  category
 9   sex             48842 non-null  category
 10  capital-gain    48842 non-null  int64   
 11  capital-loss    48842 non-null  int64   
 12  hours-per-week  48842 non-null  int64   
 13  native-country  47985 non-null  category
dtypes: category(8), int64(6)
memory usage: 2.6 MB


In [4]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States


In [5]:
X.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [6]:
y.value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
<=50K,37155
>50K,11687


In [7]:
# Convertir etiquetas a binario
y = (y == ">50K").astype(int)

In [8]:
# Separar tipos de variables
categorical = X.select_dtypes(include="category").columns.tolist()
numerical = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [9]:
# Preprocesamiento
preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy="mean"), numerical),
    ("cat", Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OneHotEncoder(handle_unknown="ignore"))
    ]), categorical)
])

In [10]:
# Dividir train/sel(o val)/test (72/18/10)

X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

X_train, X_sel, y_train, y_sel = train_test_split(
    X_dev, y_dev, test_size=0.2, stratify=y_dev, random_state=42
)

In [14]:
# Definir los modelos a comparar

models = {
    "Bagging (árbol débil)": BaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=1),
        n_estimators=99,
        random_state=0,
        n_jobs=-1
    ),
    "Bagging (árbol profundo)": BaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=None),
        n_estimators=99,
        random_state=0,
        n_jobs=-1
    ),
    "RF (árbol débil)": RandomForestClassifier(
        max_depth=1,
        n_estimators=99,
        max_features="sqrt",
        random_state=0,
        n_jobs=-1
    ),
    "RF (árbol profundo)": RandomForestClassifier(
        max_depth=None,
        n_estimators=99,
        max_features="sqrt",
        random_state=0,
        n_jobs=-1
    )
}

In [15]:
# Entrenar y evaluar todos

results = {}

for name, model in tqdm(models.items()):
    pipe = Pipeline([
        ("preproc", preprocessor),
        ("clf", model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_sel)
    acc = accuracy_score(y_sel, y_pred)
    results[name] = acc

100%|██████████| 4/4 [02:10<00:00, 32.52s/it]


In [16]:
# Mostrar resultados finales

print("Accuracy en test set:")
for name, acc in results.items():
    print(f"{name:30s}: {acc:.4f}")


Accuracy en test set:
Bagging (árbol débil)         : 0.7607
Bagging (árbol profundo)      : 0.8567
RF (árbol débil)              : 0.7607
RF (árbol profundo)           : 0.8528


In [18]:
# Validamos el mejor modelo con los datos de test
best_model = models["RF (árbol profundo)"]
pipe = Pipeline([
        ("preproc", preprocessor),
        ("clf", best_model)
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)

In [19]:
print(f'Accuracy en test: {acc:.3f}')

Accuracy en test: 0.856
