In [None]:
# === Celda A: RESET + PREPRO ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# --- Dataset / target configuration ---
PATH = "/content/data.csv"
TARGET_COL = "Loan_Status"  # change this if your target column is named differently

df = pd.read_csv(PATH)
assert TARGET_COL in df.columns, f"'{TARGET_COL}' no está en {df.columns.tolist()}"

# Drop obvious identifier columns, if any are present, in a single call.
id_cols = [c for c in ["Loan_ID","ID","Id","id"] if c in df.columns]
df = df.drop(columns=id_cols)

# Rows without a label cannot be used for supervised training or scoring, and
# a NaN label is not a usable class for the stratified split below, so they
# are discarded up front (and reported, so the row count change is visible).
n_unlabeled = int(df[TARGET_COL].isna().sum())
if n_unlabeled:
    print(f"Aviso: se descartan {n_unlabeled} filas sin valor en '{TARGET_COL}'.")
    df = df.dropna(subset=[TARGET_COL])

y = df[TARGET_COL].copy()
X = df.drop(columns=[TARGET_COL]).copy()

# Detect column types from the dtypes pandas inferred.
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Columns that are neither object nor numeric (e.g. bool, category, datetime)
# end up in neither list. The ColumnTransformer built later in this cell only
# receives cat_cols/num_cols, so report such columns instead of losing them
# silently.
other_cols = [c for c in X.columns if c not in cat_cols and c not in num_cols]
if other_cols:
    print("Aviso: columnas que no son ni 'object' ni numéricas (quedan fuera de cat_cols/num_cols):", other_cols)

# Hold out 25% for testing. Stratify on the target only when it has few
# distinct values (treated as a classification target).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42,
    stratify=y if y.nunique() <= 20 else None
)

# Dense one-hot encoder. The keyword that turns off sparse output differs
# between scikit-learn releases, so the newer spelling is tried first and the
# older one is used when the constructor rejects it with a TypeError.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")  # sklearn >= 1.2
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")         # sklearn < 1.2

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", ohe),
])

# Numeric branch: fill gaps with the column median (no scaling step).
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Route each column list to its branch; output names carry no "cat__"/"num__" prefix.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_cols),
        ("num", num_pipe, num_cols),
    ],
    verbose_feature_names_out=False,
)

# Learn the preprocessing on the training split only, then apply it to test.
Xtr_proc = preprocess.fit_transform(X_train)
Xte_proc = preprocess.transform(X_test)
feature_names = preprocess.get_feature_names_out()

# Quick sanity report of the resulting shapes and the first feature names.
print("Xtr_proc:", Xtr_proc.shape, "| Xte_proc:", Xte_proc.shape)
print("y_train:", y_train.shape, "| y_test:", y_test.shape)
print("Ejemplos de features:", feature_names[:10])


In [None]:
# === Celda B: MODELOS + MÉTRICAS ===
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Configure the three tree-based classifiers first; they are fitted below.

# Shallow decision tree, kept small so it stays interpretable.
tree = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=5,
    random_state=42,
)

# Random forest; oob_score=True makes the fitted model expose oob_score_.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    max_features="sqrt",
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

# Gradient boosting with shallow trees.
gb = GradientBoostingClassifier(
    learning_rate=0.08,
    n_estimators=300,
    max_depth=3,
    random_state=42,
)

# fit() returns the estimator itself, so the name -> model registry is built
# while fitting, in the same order: tree, forest, boosting.
models = {
    label: estimator.fit(Xtr_proc, y_train)
    for label, estimator in (
        ("DecisionTree(max_depth=4)", tree),
        ("RandomForest", rf),
        ("GradientBoosting", gb),
    )
}

# (Optional) XGBoost, used only when the package can be imported.
# Import problems and training problems are reported separately: previously a
# single handler printed "no disponible" for both, which hid real training
# errors behind a message suggesting the package was missing.
xgb_ok = False
try:
    import xgboost as xgb
except Exception as e:  # ImportError, or a failure while loading the package
    print("XGBoost no disponible:", e)
else:
    try:
        xgb_clf = xgb.XGBClassifier(
            n_estimators=150, max_depth=3, learning_rate=0.1,
            subsample=0.8, colsample_bytree=0.8,
            reg_lambda=1.0, reg_alpha=0.0,
            objective="binary:logistic", eval_metric="logloss",
            n_jobs=-1, random_state=42
        )
        xgb_clf.fit(Xtr_proc, y_train)
    except Exception as e:
        # Best effort: the notebook keeps going with the scikit-learn models.
        # NOTE(review): recent XGBoost releases appear to require integer-coded
        # labels (0..K-1); a string target such as "Y"/"N" would end up here.
        # Confirm against the installed version.
        print("XGBoost instalado, pero falló al entrenar:", e)
    else:
        # Register the model only after a successful fit.
        models["XGBoost"] = xgb_clf
        xgb_ok = True

# Evaluation: accuracy of every registered model on train and on test.
rows = []
for name, clf in models.items():
    ytr, yte = clf.predict(Xtr_proc), clf.predict(Xte_proc)
    row = dict(
        modelo=name,
        acc_train=accuracy_score(y_train, ytr),
        acc_test=accuracy_score(y_test, yte),
    )
    # Only the random forest entry gets the out-of-bag score column.
    if name == "RandomForest":
        row.update(oob_score=getattr(clf, "oob_score_", None))
    rows.append(row)

# Best test accuracy first; later cells read row 0 as "the best model".
res_df = pd.DataFrame(rows)
res_df = res_df.sort_values("acc_test", ascending=False)
print(res_df)


In [None]:
# === Celda C: Importancia de variables (impureza) ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_importance_impurity(clf, feat_names, title, top=15):
    """Plot and tabulate the largest values of ``clf.feature_importances_``.

    Parameters
    ----------
    clf : object
        Fitted estimator. When it has no ``feature_importances_`` attribute a
        message is printed and nothing is plotted.
    feat_names : sequence
        Feature names, indexed in the same order as the importances.
    title : str
        Title of the bar chart (also used in the "not available" message).
    top : int, default 15
        Maximum number of features to show.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``feature`` and ``importance_impurity``, sorted from most to
        least important; ``None`` when the estimator exposes no importances.
    """
    if not hasattr(clf, "feature_importances_"):
        print(f"{title}: el estimador no expone feature_importances_.")
        return None

    scores = np.asarray(clf.feature_importances_)
    # Indices of the `top` largest importances, in descending order.
    order = np.argsort(scores)[::-1][:top]
    names = np.array(feat_names)[order]
    values = scores[order]

    top_tbl = pd.DataFrame({
        "feature": names,
        "importance_impurity": values
    })

    # Horizontal bars; the y axis is inverted so the top feature is drawn first.
    positions = np.arange(len(order))
    plt.figure(figsize=(6, 5))
    plt.barh(positions, values)
    plt.yticks(positions, names)
    plt.gca().invert_yaxis()
    plt.title(title)
    plt.tight_layout()
    plt.show()
    return top_tbl

# Impurity-based importances for the ensemble models that were trained.
imp_tables = {}
if "RandomForest" in models:
    imp_tables["RF"] = plot_importance_impurity(models["RandomForest"], feature_names, "RandomForest — Importancia (impureza)")
if "GradientBoosting" in models:
    imp_tables["GB"] = plot_importance_impurity(models["GradientBoosting"], feature_names, "GradientBoosting — Importancia (impureza)")

# Show the top 10 rows of every table that was actually produced (one or both).
# Each DataFrame is displayed on its own: handing display() a dict of frames
# yields a single plain-text dict repr instead of rendered tables.
for key, table in imp_tables.items():
    if table is not None:
        print(f"Top 10 — {key}")
        display(table.head(10))


In [None]:
# === Celda D: Permutation Importance en test ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance

# The best model is row 0 of res_df, which is sorted by acc_test (descending).
assert "res_df" in globals(), "No encuentro res_df; ejecuta Celda B primero."
best_name = res_df.iloc[0]["modelo"]
best_clf = models[best_name]
print("Mejor modelo según acc_test:", best_name)

# Permutation importance measured on the held-out (processed) test split.
perm = permutation_importance(
    best_clf, Xte_proc, y_test,
    n_repeats=10, random_state=42, n_jobs=-1,
)

# Keep at most `top` features, ordered by mean importance (largest first).
# Slicing clips automatically when there are fewer than `top` features.
top = 15
idx = np.argsort(perm.importances_mean)[::-1][:top]

top_names = np.array(feature_names)[idx]
top_means = perm.importances_mean[idx]
top_stds = perm.importances_std[idx]

perm_tbl = pd.DataFrame({
    "feature": top_names,
    "perm_importance_mean": top_means,
    "perm_importance_std": top_stds
})
display(perm_tbl)

# Horizontal bar chart with the most important feature at the top.
y_pos = np.arange(len(idx))
plt.figure(figsize=(6, 5))
plt.barh(y_pos, top_means)
plt.yticks(y_pos, top_names)
plt.gca().invert_yaxis()
plt.title(f"Permutation Importance — {best_name} (top {len(idx)})")
plt.tight_layout()
plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# --- Choose the pair of features for the 2D plane ---
# Preference order: the hard-coded pair, then the two top features from the
# permutation-importance table (if that cell was run), then the first two
# columns of the processed feature space.
preferred = ["ApplicantIncome", "Credit_History"]
all_names = list(feature_names)
use_feats = [f for f in preferred if f in all_names]
if len(use_feats) < 2:
    try:
        top2 = perm_tbl["feature"].tolist()[:2]
        use_feats = top2
    except Exception:
        # Fallback: first two columns of the processed space.
        use_feats = all_names[:2]

assert len(use_feats) == 2, f"Se necesitan 2 variables para el plano 2D; hay {len(use_feats)}."
print("Par de variables para el plano 2D:", use_feats)

# Select the columns BY POSITION, in the order of use_feats. A boolean mask
# (np.isin) would return them in feature_names order instead, so column 0 of
# the 2D matrix might not be use_feats[0] and the axis labels below would be
# swapped with respect to the plotted data.
col_idx = [all_names.index(f) for f in use_feats]
Xtr_2d = Xtr_proc[:, col_idx]
Xte_2d = Xte_proc[:, col_idx]

# Encode y_train as integers so class labels can be used directly as colours.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

clf2d = DecisionTreeClassifier(max_depth=3, random_state=42)
clf2d.fit(Xtr_2d, y_train_enc)

# Evaluation grid covering the training range plus a small margin.
x_min, x_max = Xtr_2d[:,0].min()-0.3, Xtr_2d[:,0].max()+0.3
y_min, y_max = Xtr_2d[:,1].min()-0.3, Xtr_2d[:,1].max()+0.3
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 350),
                     np.linspace(y_min, y_max, 350))
grid = np.c_[xx.ravel(), yy.ravel()]

# Class probabilities on the grid, used as the background surface.
proba = clf2d.predict_proba(grid)  # shape: (N, n_classes)
if proba.shape[1] == 2:
    # Binary: plot the probability of the class encoded as 1
    # (the largest label according to the encoder).
    Z = proba[:, 1].reshape(xx.shape)
    levels = np.linspace(0.0, 1.0, 11)
    cmap = "RdBu"
else:
    # Multiclass: plot the index of the most probable class as a categorical surface.
    Z = np.argmax(proba, axis=1).reshape(xx.shape).astype(float)
    # Discrete levels centred on 0..K-1.
    n_classes = proba.shape[1]
    levels = np.arange(-0.5, n_classes + 0.5, 1)
    cmap = None  # keep matplotlib's default colormap

plt.figure(figsize=(5.5, 4.5))
cs = plt.contourf(xx, yy, Z, alpha=0.18, levels=levels, cmap=cmap)

# Training points, coloured by encoded class.
plt.scatter(Xtr_2d[:,0], Xtr_2d[:,1], c=y_train_enc, s=16, edgecolor="k")
plt.xlabel(use_feats[0]); plt.ylabel(use_feats[1])
plt.title(f"Frontera 2D — {use_feats[0]} vs {use_feats[1]} (probabilidades)")
plt.tight_layout()
plt.show()


In [None]:
# === Celda G: GridSearch para RF y GB ===
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# One seeded, shuffled 5-fold stratified splitter shared by both searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Random forest: 2 x 2 x 3 x 2 candidate settings ---
rf_grid = dict(
    n_estimators=[200, 400],
    max_features=["sqrt", 0.5],
    min_samples_leaf=[1, 2, 4],
    max_depth=[None, 8],
)
rf_gs = GridSearchCV(
    RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42),
    rf_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
)
rf_gs.fit(Xtr_proc, y_train)
rf_best = rf_gs.best_estimator_
print("RF best params:", rf_gs.best_params_, "| CV acc:", rf_gs.best_score_)
# best_estimator_ is the winning configuration refitted by the search;
# report its out-of-bag score (if present) and its held-out test accuracy.
print("RF OOB:", getattr(rf_best, "oob_score_", None), "| Test acc:", rf_best.score(Xte_proc, y_test))

# --- Gradient boosting: 2 x 2 x 2 candidate settings ---
gb_grid = dict(
    n_estimators=[200, 400],
    learning_rate=[0.05, 0.1],
    max_depth=[2, 3],
)
gb_gs = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
)
gb_gs.fit(Xtr_proc, y_train)
gb_best = gb_gs.best_estimator_
print("GB best params:", gb_gs.best_params_, "| CV acc:", gb_gs.best_score_)
print("GB Test acc:", gb_best.score(Xte_proc, y_test))


In [None]:
# === Celda H: Reportes del mejor modelo ===
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.utils.multiclass import unique_labels

# Same precondition as the permutation-importance cell: the results table
# from the model-training cell must exist before the best model can be picked.
assert "res_df" in globals(), "No encuentro res_df; ejecuta Celda B primero."
best_name = res_df.iloc[0]["modelo"]
best_clf = models[best_name]
y_pred = best_clf.predict(Xte_proc)

print("Mejor modelo:", best_name)

# Label rows (true class) and columns (predicted class) so the matrix can be
# read without knowing the internal class order. unique_labels gives the
# sorted labels present in y_test / y_pred, which is the order
# confusion_matrix uses when `labels` is not passed, so the counts are the
# same as before.
cm_labels = unique_labels(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"real_{c}" for c in cm_labels],
    columns=[f"pred_{c}" for c in cm_labels],
)
print("Matriz de confusión:")
print(cm_df)
print("\nClasification report:")
print(classification_report(y_test, y_pred))
