## 👥 Autores

Este proyecto fue desarrollado por:

| Nombre Completo                     | Número de Identificación | Carrera Universitaria              |
| ----------------------------------- | -----------------------: | ---------------------------------- |
| **Yorladys Argumedo Lozano**        | `1038824209`             | Ingeniería Industrial Virtual      |
| **Sebastian Gabriel Castro**        | `1029720632`             | Ingeniería Industrial Virtual      |

---


In [14]:
# ==============================================================================
# SOLUCI√ìN FINAL: STACKING + AGGREGATION FEATURES (NIVEL COMPETENCIA)
# ==============================================================================

import pandas as pd
import numpy as np
import os
import gc
import warnings
from itertools import combinations

# Instalaci√≥n
try:
    import lightgbm
    import catboost
    import xgboost
except ImportError:
    !pip install lightgbm catboost xgboost --quiet

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

# 1. CARGA
print("üìÇ Cargando datos...")
if not os.path.exists('train.csv'):
    if not os.path.exists('kaggle.json'):
        from google.colab import files
        uploaded = files.upload()
    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    !kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia
    !unzip -q -o udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
test_ids = df_test['ID']

# 2. INGENIER√çA DE CARACTER√çSTICAS "CONTEXTUAL"
def engineer_features(df):
    """Encode raw survey columns and add group-level context features.

    The frame is modified in place and is also returned.

    Steps, in order:
      A. Ordinal-encode housing stratum and both parents' education level;
         values absent from the mapping (including missing ones) become 1.
         Every column whose name contains ``TIENE`` or ``PAGO`` becomes a
         0/1 ``int8`` flag.
      B. Add group aggregates: mean stratum and mean father's education per
         department; row count and mean stratum per academic program.
         Each pair is added only when its grouping column exists.
      C. Add the internet*computer interaction and the maximum education
         level of the two parents.
      D. Cast every remaining non-numeric column to ``category``, replacing
         the string ``'nan'`` with ``'MISSING'``.

    Args:
        df: pandas DataFrame containing at least ``F_ESTRATOVIVIENDA``,
            ``F_EDUCACIONPADRE``, ``F_EDUCACIONMADRE``, ``F_TIENEINTERNET``
            and ``F_TIENECOMPUTADOR``. ``E_PRGM_DEPARTAMENTO`` and
            ``E_PRGM_ACADEMICO`` are optional.

    Returns:
        Tuple ``(df, cat_cols)`` where ``cat_cols`` lists, in column order,
        the names of the columns converted to ``category``.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # --- A. Base mappings ---
    map_estrato = {'Estrato 1': 1, 'Estrato 2': 2, 'Estrato 3': 3, 'Estrato 4': 4, 'Estrato 5': 5, 'Estrato 6': 6, 'Sin Estrato': 0}
    # NOTE(review): the accented keys below appear in two encodings and at
    # least one looks like mis-decoded UTF-8. Confirm they match the raw
    # CSV values; any label that does not match falls back to 1 via fillna.
    map_edu = {
        'Ninguno': 0, 'No sabe': 0, 'No Aplica': 0, 'Primaria incompleta': 1, 'Primaria completa': 2,
        'Secundaria (Bachillerato) incompleta': 3, 'Secundaria (Bachillerato) completa': 4,
        'T√É¬©cnica o tecnol√É¬≥gica incompleta': 5, 'T√©cnica o tecnol√≥gica incompleta': 5,
        'T√É¬©cnica o tecnol√É¬≥gica completa': 6, 'T√©cnica o tecnol√≥gica completa': 6,
        'Educaci√É¬≥n profesional incompleta': 7, 'Educaci√≥n profesional incompleta': 7,
        'Educaci√É¬≥n profesional completa': 8, 'Educaci√≥n profesional completa': 8, 'Postgrado': 9
    }

    df['F_ESTRATOVIVIENDA'] = df['F_ESTRATOVIVIENDA'].map(map_estrato).fillna(1).astype('int8')
    df['F_EDUCACIONPADRE'] = df['F_EDUCACIONPADRE'].map(map_edu).fillna(1).astype('int8')
    df['F_EDUCACIONMADRE'] = df['F_EDUCACIONMADRE'].map(map_edu).fillna(1).astype('int8')

    # Binary flags. Match whole normalised tokens rather than substrings:
    # a substring search for "S" or "1" would also flag answers such as
    # "Desconocido", "No sabe" or "10" as positive.
    truthy_tokens = {'S', 'SI', 'S\u00cd', '1', '1.0', 'YES'}
    for col in [c for c in df.columns if 'TIENE' in c or 'PAGO' in c]:
        normalized = df[col].astype(str).str.strip().str.upper()
        # Assign a plain array so no index alignment is attempted.
        df[col] = normalized.isin(truthy_tokens).to_numpy().astype('int8')

    # --- B. Group aggregations (context) ---
    # 1. Department context: mean stratum and mean father's education.
    if 'E_PRGM_DEPARTAMENTO' in df.columns:
        df['DEPARTAMENTO_ESTRATO_MEAN'] = df.groupby('E_PRGM_DEPARTAMENTO')['F_ESTRATOVIVIENDA'].transform('mean')
        # Despite the column name, only the father's education is averaged.
        df['DEPARTAMENTO_EDU_PADRES_MEAN'] = df.groupby('E_PRGM_DEPARTAMENTO')['F_EDUCACIONPADRE'].transform('mean')

    # 2. Program context: size of the program and its mean stratum.
    if 'E_PRGM_ACADEMICO' in df.columns:
        df['PROGRAMA_FREQ'] = df.groupby('E_PRGM_ACADEMICO')['E_PRGM_ACADEMICO'].transform('count')
        df['PROGRAMA_NSE_MEAN'] = df.groupby('E_PRGM_ACADEMICO')['F_ESTRATOVIVIENDA'].transform('mean')

    # --- C. Interactions ---
    df['INTERACCION_PC_INTERNET'] = df['F_TIENEINTERNET'] * df['F_TIENECOMPUTADOR']
    df['MAX_EDU_PADRES'] = df[['F_EDUCACIONPADRE', 'F_EDUCACIONMADRE']].max(axis=1)

    # --- D. Categorical clean-up ---
    cat_cols = []
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].astype(str).replace('nan', 'MISSING').astype('category')
            cat_cols.append(col)

    return df, cat_cols

print("‚ö° Aplicando Ingenier√≠a de Contexto...")
# Target: ordinal code for each performance label.
y = df_train['RENDIMIENTO_GLOBAL'].map({'bajo': 0, 'medio-bajo': 1, 'medio-alto': 2, 'alto': 3})
X = df_train.drop(columns=['ID', 'RENDIMIENTO_GLOBAL'], errors='ignore')

# Stack train and test rows so the encodings and group aggregates are
# computed over both sets; n_train marks where to split them again.
n_train = X.shape[0]
test_features = df_test.drop(columns=['ID'], errors='ignore')
X_all = pd.concat([X, test_features], axis=0)

X_all, cat_features = engineer_features(X_all)

# Fully numeric copy for XGBoost: every categorical column is replaced by
# integer codes from a LabelEncoder fitted on train and test together.
X_all_xgb = X_all.copy()
for col in cat_features:
    le = LabelEncoder()
    as_text = X_all_xgb[col].astype(str)
    X_all_xgb[col] = le.fit_transform(as_text)

# Split back into train / test by position.
X_clean, X_test_clean = X_all.iloc[:n_train], X_all.iloc[n_train:]
X_clean_xgb, X_test_clean_xgb = X_all_xgb.iloc[:n_train], X_all_xgb.iloc[n_train:]

# Drop the intermediate frames and reclaim memory before training.
del test_features
del df_train, df_test, X, X_all, X_all_xgb
gc.collect()

# 3. TRAINING WITH OUT-OF-FOLD (OOF) PREDICTIONS (for stacking)
# Each base model is trained on 5 stratified folds.
FOLDS = 5
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# OOF class probabilities (4 classes) for every training row; these become
# the training features of the meta-model.
oof_lgbm = np.zeros((n_train, 4))
oof_cat = np.zeros((n_train, 4))
oof_xgb = np.zeros((n_train, 4))

# Test-set class probabilities, averaged over the folds.
test_lgbm = np.zeros((len(X_test_clean), 4))
test_cat = np.zeros((len(X_test_clean), 4))
test_xgb = np.zeros((len(X_test_clean), 4))

# Positional indices of the categorical columns for CatBoost. They are the
# same for every fold, so they are computed once instead of per iteration.
cat_idx = [X_clean.columns.get_loc(c) for c in cat_features]

print(f"\n‚öîÔ∏è Iniciando STACKING con {FOLDS} Folds...")

# NOTE(review): every model early-stops on the same fold that produces its
# OOF predictions, which can make the OOF probabilities slightly optimistic
# for the meta-model. Confirm this is acceptable.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_clean, y)):
    print(f"\n--- FOLD {fold+1}/{FOLDS} ---")

    # Current fold data (categorical dtype, used by LightGBM and CatBoost)
    X_tr, y_tr = X_clean.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_clean.iloc[val_idx], y.iloc[val_idx]

    # Label-encoded numeric view for XGBoost
    X_tr_xgb, X_val_xgb = X_clean_xgb.iloc[train_idx], X_clean_xgb.iloc[val_idx]

    # 1. LIGHTGBM (CPU)
    model_lgb = LGBMClassifier(
        objective='multiclass', n_estimators=1000, learning_rate=0.04,
        num_leaves=40, n_jobs=-1, verbose=-1
    )
    model_lgb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lightgbm.early_stopping(30, verbose=False)])
    oof_lgbm[val_idx] = model_lgb.predict_proba(X_val)
    test_lgbm += model_lgb.predict_proba(X_test_clean) / FOLDS

    # 2. CATBOOST (GPU)
    train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    val_pool = Pool(X_val, y_val, cat_features=cat_idx)

    model_cat = CatBoostClassifier(
        loss_function='MultiClass', iterations=1200, learning_rate=0.05,
        depth=7, task_type='GPU', devices='0', verbose=0
    )
    model_cat.fit(train_pool, eval_set=val_pool, early_stopping_rounds=30)
    oof_cat[val_idx] = model_cat.predict_proba(X_val)
    test_cat += model_cat.predict_proba(X_test_clean) / FOLDS

    # 3. XGBOOST (GPU)
    # `verbosity` is the logging parameter of the scikit-learn wrapper;
    # `verbose` is not one of its parameters.
    model_xgb = XGBClassifier(
        n_estimators=1000, learning_rate=0.05, max_depth=8,
        tree_method='hist', device='cuda', enable_categorical=True,
        verbosity=0, early_stopping_rounds=30
    )
    model_xgb.fit(X_tr_xgb, y_tr, eval_set=[(X_val_xgb, y_val)], verbose=False)
    oof_xgb[val_idx] = model_xgb.predict_proba(X_val_xgb)
    test_xgb += model_xgb.predict_proba(X_test_clean_xgb) / FOLDS

    # Free per-fold objects before the next iteration
    del X_tr, X_val, X_tr_xgb, X_val_xgb, train_pool, val_pool
    gc.collect()

# 4. META-MODEL (STACKING)
# A logistic regression combines the class probabilities of the 3 base models.
print("\nüß† Entrenando Meta-Modelo (Logistic Regression)...")

# Meta-features: the per-model class probabilities side by side,
# shape (n_samples, 12) = 4 classes * 3 models.
X_meta_train = np.hstack([oof_lgbm, oof_cat, oof_xgb])
X_meta_test = np.hstack([test_lgbm, test_cat, test_xgb])

# `multi_class` is deprecated in recent scikit-learn releases. With the
# default lbfgs solver a multiclass target is already fitted with the
# multinomial loss, so the argument is dropped without changing the model.
meta_model = LogisticRegression(max_iter=1000)
meta_model.fit(X_meta_train, y)

# Final prediction: most probable class per test row.
final_probs = meta_model.predict_proba(X_meta_test)
final_preds_idx = np.argmax(final_probs, axis=1)

# Build the submission CSV, mapping the integer codes back to labels.
inverse_map = {0: 'bajo', 1: 'medio-bajo', 2: 'medio-alto', 3: 'alto'}
submission = pd.DataFrame({
    'ID': test_ids,
    'RENDIMIENTO_GLOBAL': [inverse_map[i] for i in final_preds_idx]
})

submission.to_csv('submission_stacking_master.csv', index=False)
print("üèÜ ¬°Archivo generado: submission_stacking_master.csv!")
print(f"Coeficientes del Meta-Modelo (Pesos aproximados): \n{meta_model.coef_}")

# Best-effort browser download when running inside Google Colab.
try:
    from google.colab import files
    files.download('submission_stacking_master.csv')
except ImportError:
    # Not running in Colab: the CSV is already on local disk.
    pass
except Exception as exc:
    # Stay best-effort, but report the failure instead of hiding it.
    print(f"No se pudo descargar el archivo automaticamente: {exc}")


üìÇ Cargando datos...
‚ö° Aplicando Ingenier√≠a de Contexto...

‚öîÔ∏è Iniciando STACKING con 5 Folds...

--- FOLD 1/5 ---

--- FOLD 2/5 ---

--- FOLD 3/5 ---

--- FOLD 4/5 ---

--- FOLD 5/5 ---

üß† Entrenando Meta-Modelo (Logistic Regression)...
üèÜ ¬°Archivo generado: submission_stacking_master.csv!
Coeficientes del Meta-Modelo (Pesos aproximados): 
[[ 1.28679917  0.19428757 -0.50757926 -1.03521145  0.90724368 -0.28865366
  -0.33487242 -0.34542158  0.69105079 -0.12763533 -0.16063685 -0.4644826 ]
 [ 0.31710585  0.44102988  0.06236007 -0.75391998 -0.21684739  0.70534863
  -0.31420136 -0.10772406  0.1597313   0.21171605  0.00319961 -0.30807118]
 [-0.49965657 -0.11187128  0.42062502  0.28580703 -0.19956743 -0.1322092
   0.6095019  -0.18282107 -0.3332369   0.0568346   0.42655706 -0.05525067]
 [-1.10424845 -0.52344616  0.02459417  1.50332441 -0.49082886 -0.28448578
   0.03957188  0.63596671 -0.51754518 -0.14091532 -0.26911982  0.82780445]]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>