## 👥 Autores

Este proyecto fue desarrollado por:

| Nombre Completo                     | Número de Identificación | Carrera Universitaria              |
| ----------------------------------- | -----------------------: | ---------------------------------- |
| **Yorladys Argumedo Lozano**        | `1038824209`            | Ingeniería Industrial Virtual      |
| **Sebastian Gabriel Castro**        | `1029720632`            | Ingeniería Industrial Virtual      |

---


In [None]:
# ==============================================================================
# SOLUCIÓN FINAL: ENSEMBLE DE POTENCIA PUNTAJE 0.43772
# VERSIÓN ADAPTADA PARA GOOGLE COLAB
# ==============================================================================

import pandas as pd
import numpy as np
import sys
import os
import warnings

# Instalamos librer√≠as si no est√°n (por si acaso es un entorno nuevo)
try:
    import xgboost
except ImportError:
    !pip install xgboost lightgbm --quiet

# Librer√≠as de Machine Learning Avanzado
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Preprocesamiento
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Configuration
# NOTE(review): no category/module filter is given, so this silences every
# warning for the rest of the session, including deprecation and convergence
# warnings from the ML libraries.
warnings.filterwarnings('ignore')
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# ==============================================================================
# 1. DETECCIÓN DE ENTORNO Y CARGA DE DATOS (LÓGICA COLAB)
# ==============================================================================
print("üìÇ Configurando entorno de datos...")

# Verificamos si los archivos YA existen localmente
if os.path.exists('train.csv') and os.path.exists('test.csv'):
    print("‚úÖ Archivos train.csv y test.csv detectados localmente.")
    print("   -> Omitiendo descarga de Kaggle para ahorrar tiempo.")
    base_path = '.'

else:
    print("‚ö†Ô∏è Archivos no encontrados. Se proceder√° a descargarlos desde Kaggle.")

    # Verificamos si tenemos el json, si no, lo pedimos
    if not os.path.exists('kaggle.json'):
        print("Por favor, sube el archivo 'kaggle.json' ahora:")
        from google.colab import files
        uploaded = files.upload()

    print("Configurando API de Kaggle...")
    # Comandos de configuraci√≥n
    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json

    print("Descargando dataset...")
    # Descarga espec√≠fica de la competencia
    !kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

    print("Descomprimiendo...")
    !unzip -q -o udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
    print("¬°Descarga y descompresi√≥n completada!")
    base_path = '.'

# Cargar datos
print(f"\nLeyendo CSVs desde: {base_path}")
df_train = pd.read_csv(os.path.join(base_path, 'train.csv'))
df_test = pd.read_csv(os.path.join(base_path, 'test.csv'))
test_ids = df_test['ID']

print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")

# ==============================================================================
# 2. INGENIERÍA DE CARACTERÍSTICAS (FEATURE ENGINEERING)
# ==============================================================================
def enrich_data(df):
    """Return a copy of *df* with two engineered features added.

    Added columns:
      * ``NUM_NULOS``: number of null cells in each row, counted before any
        other transformation.
      * ``INDICE_RIQUEZA``: per-row sum of the binary "TIENE" flags.

    Every column whose name contains ``TIENE`` (case-insensitive) is replaced
    by a numeric flag: 1 for yes, 0 for no, NaN for anything else. Values are
    first matched exactly against the original lookup table; values that miss
    are retried after trimming whitespace, case-folding and removing accents,
    so variants such as ``' SI '`` or ``'Sí'`` are no longer turned into NaN.

    Args:
        df: input ``pandas.DataFrame``; it is not modified.

    Returns:
        A new ``pandas.DataFrame`` with the extra columns and recoded flags.
    """
    import unicodedata  # local import: only needed by the fallback lookup

    # Lookup tables are loop-invariant, so build them once per call.
    exact_map = {'Si': 1, 'No': 0, 'S√ç': 1, 'NO': 0, 'si': 1, 'no': 0}
    normalized_map = {'si': 1, 'no': 0}

    def _normalize(value):
        """Trim, case-fold and strip accents from strings; pass others through."""
        if not isinstance(value, str):
            return value
        decomposed = unicodedata.normalize('NFKD', value.strip().casefold())
        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    df_eng = df.copy()

    # 1. Null count, taken before the flag recoding below introduces new NaNs.
    df_eng['NUM_NULOS'] = df_eng.isnull().sum(axis=1)

    # 2. Wealth index: how many "TIENE" items each row answers yes to.
    #    str() keeps non-string column labels from raising on .upper().
    cols_tiene = [c for c in df.columns if 'TIENE' in str(c).upper()]
    df_eng['INDICE_RIQUEZA'] = 0
    for col in cols_tiene:
        raw = df_eng[col]
        flags = raw.map(exact_map)

        # Retry only the non-null values that the exact table did not match.
        unmatched = flags.isna() & raw.notna()
        if unmatched.any():
            fallback = raw.map(_normalize).map(normalized_map)
            flags = flags.where(~unmatched, fallback)

        df_eng[col] = flags
        # Unknown or missing answers contribute 0 to the index.
        df_eng['INDICE_RIQUEZA'] += flags.fillna(0)

    return df_eng

print("üõ†Ô∏è Aplicando ingenier√≠a de caracter√≠sticas...")
# Run the same enrichment on both splits so they end up with matching columns.
df_train, df_test = map(enrich_data, (df_train, df_test))

# ==============================================================================
# 3. PREPARACIÓN DE PIPELINES
# ==============================================================================
target_col = 'RENDIMIENTO_GLOBAL'
# Integer encoding of the four performance levels. The inverse table is derived
# from label_map so the two can never drift apart.
label_map = {'bajo': 0, 'medio-bajo': 1, 'medio-alto': 2, 'alto': 3}
inverse_map = {code: label for label, code in label_map.items()}

# Features: everything except the target and the row identifier.
X = df_train.drop(columns=[target_col, 'ID'], errors='ignore')
y = df_train[target_col].map(label_map)
X_kaggle = df_test.drop(columns=['ID'], errors='ignore')

# Series.map turns any label missing from label_map into NaN. Fail fast here
# with a clear message instead of letting the classifiers choke on NaN targets.
if y.isna().any():
    unknown_labels = sorted(
        set(df_train.loc[y.isna(), target_col].dropna().astype(str))
    )
    raise ValueError(
        f"{int(y.isna().sum())} filas de entrenamiento con '{target_col}' "
        f"nulo o no reconocido; etiquetas desconocidas: {unknown_labels}"
    )

# Split the feature columns by dtype.
num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric pipeline: median imputation followed by standardization.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: missing values become the literal 'MISSING' category,
# then each category gets an integer code; categories unseen during fit are
# encoded as -1.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ],
    verbose_feature_names_out=False
)

# ==============================================================================
# 4. DEFINICI√ìN DE MODELOS "STATE OF THE ART"
# ==============================================================================

# MODEL 1: XGBoost hyper-parameters (histogram-based tree construction).
xgb_params = dict(
    n_estimators=800,
    learning_rate=0.02,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='multi:softprob',
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
)

# MODEL 2: LightGBM hyper-parameters (max_depth=-1 leaves depth unbounded;
# tree size is controlled through num_leaves instead).
# NOTE(review): LightGBM's docs say row subsampling only takes effect when
# subsample_freq (bagging_freq) is > 0, and it is not set here, so
# subsample=0.8 may be inert. Left untouched to keep the reported score
# reproducible; confirm before changing.
lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=40,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multiclass',
    random_state=123,
    n_jobs=-1,
    verbose=-1,
)

# One full pipeline per model: shared preprocessing step, then the classifier.
pipe_xgb = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', XGBClassifier(**xgb_params)),
])
pipe_lgbm = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', LGBMClassifier(**lgbm_params)),
])

# ==============================================================================
# 5. ENSAMBLAJE (VOTING CLASSIFIER)
# ==============================================================================
# Soft voting combines the predicted class probabilities of both pipelines;
# the equal weights give XGBoost and LightGBM the same influence.
ensemble_members = [
    ('xgb', pipe_xgb),
    ('lgbm', pipe_lgbm),
]
voting_clf = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    weights=[1, 1],
)

# ==============================================================================
# 6. ENTRENAMIENTO Y GENERACIÓN DE ARCHIVO
# ==============================================================================
print("üèãÔ∏è Iniciando entrenamiento del Ensemble (XGBoost + LightGBM)...")
print("   (Esto puede tomar unos minutos en Colab)...")

# Fit the ensemble on the full training set.
voting_clf.fit(X, y)
print("‚úÖ Entrenamiento completado.")

print("üîÆ Generando predicciones...")
y_pred_indices = voting_clf.predict(X_kaggle)

# Translate the integer class codes back to their text labels.
y_pred_text = [inverse_map[v] for v in y_pred_indices]

# Submission table: one row per test ID with its predicted label.
submission = pd.DataFrame({
    'ID': test_ids,
    'RENDIMIENTO_GLOBAL': y_pred_text
})

# Write the CSV without the DataFrame index column.
filename = 'submission_colab_ensemble.csv'
submission.to_csv(filename, index=False)

print(f"\nüèÜ ¬°Archivo '{filename}' generado!")
print(f"Dimensiones: {submission.shape}")

# Optional convenience: trigger a browser download when running in Colab.
# This stays best-effort, but catching Exception instead of a bare `except:`
# lets KeyboardInterrupt and SystemExit propagate.
try:
    from google.colab import files
    files.download(filename)
except Exception:
    print("Descarga el archivo manualmente desde la carpeta de archivos.")