⚠️ **Aviso:** Este cuaderno NO requiere login ni API. Solo lee archivos locales.

# 02 - preprocesado

In [None]:

import os, pandas as pd

def load_table(base_name):
    """Load ``<base_name>.csv`` or ``<base_name>.xlsx`` from the current directory.

    The CSV file takes precedence when both exist.

    Args:
        base_name: File name without extension.

    Returns:
        A ``(DataFrame, path)`` tuple, where ``path`` is the relative path
        of the file that was actually read.

    Raises:
        FileNotFoundError: If neither file exists.
    """
    # Ordered by priority: the first existing file wins.
    readers = (('.csv', pd.read_csv), ('.xlsx', pd.read_excel))
    for suffix, reader in readers:
        candidate = os.path.join('.', f'{base_name}{suffix}')
        if os.path.exists(candidate):
            return reader(candidate), candidate
    raise FileNotFoundError(f'No se encontró {base_name}.csv ni {base_name}.xlsx en la carpeta del notebook.')

def detect_id(df):
    """Guess which column of ``df`` holds the record identifier.

    Columns are scanned in order and the first one whose lower-cased label
    is exactly ``'id'`` or contains one of the hints (``documento``,
    ``cedul``, ``identific``, ``codigo``, ``código``) is returned. When no
    label matches, the first column is returned.

    Args:
        df: DataFrame whose column labels are inspected.

    Returns:
        The original (unmodified) label of the chosen column.

    Raises:
        IndexError: If ``df`` has no columns at all.
    """
    id_hints = ('documento', 'cedul', 'identific', 'codigo', 'código')
    for col in df.columns:
        # Labels are not guaranteed to be strings (e.g. integer labels from
        # a header-less file); str() keeps the scan from raising on them.
        label = str(col).lower()
        if label == 'id' or any(hint in label for hint in id_hints):
            return col
    # A literal 'ID' column would already have matched `label == 'id'` above,
    # so the only remaining fallback is positional.
    return df.columns[0]

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Load the training table (required) and report which file was used.
train, train_path = load_table('train')
print('Usando:', train_path, '| shape:', train.shape)

# The test table is optional: its absence is reported, not treated as an error.
try:
    test, test_path = load_table('test')
except FileNotFoundError:
    test = None
    print('test no encontrado (opcional).')
else:
    print('Usando:', test_path, '| shape:', test.shape)

ID_COL = detect_id(train)
TARGET = 'RENDIMIENTO_GLOBAL' if 'RENDIMIENTO_GLOBAL' in train.columns else None

if TARGET:
    # Normalise the labels: trim, lower-case, and unify the two spellings
    # of the intermediate classes to their hyphenated form.
    # NOTE(review): astype(str) turns missing targets into the string 'nan';
    # confirm the training data has no missing labels.
    y = (train[TARGET].astype(str).str.strip().str.lower()
         .replace({'medio bajo': 'medio-bajo', 'medio alto': 'medio-alto'}))
    X = train.drop(columns=[TARGET])
else:
    y = None
    X = train.copy()

# NOTE(review): ID_COL is only reported below; the column stays in X and is
# encoded like any other feature -- confirm that this is intended.

# Partition by "numeric vs. everything else" instead of testing only for the
# 'object' dtype, so that category/string/datetime columns are not sent to
# the median imputer. Object and numeric columns are routed exactly as before.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c].dtype)]
_numeric = set(num_cols)
cat_cols = [c for c in X.columns if c not in _numeric]
print(f'ID_COL: {ID_COL} | TARGET: {TARGET} | cat_cols={len(cat_cols)} | num_cols={len(num_cols)}')

# Categorical: fill gaps with the mode, then one-hot encode (categories not
# seen during fit are encoded as all zeros). Numeric: fill gaps with the median.
preprocess = ColumnTransformer([
    ('cat', Pipeline([('imp', SimpleImputer(strategy='most_frequent')),
                      ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), cat_cols),
    ('num', Pipeline([('imp', SimpleImputer(strategy='median'))]), num_cols)
])

# Fit on the training features only; the test table is merely transformed.
X_tr = preprocess.fit_transform(X)
print('X_preprocesado (train):', X_tr.shape)

# Always define X_te so later cells can test `X_te is None` instead of
# hitting a NameError when the test file is absent.
X_te = None
if test is not None:
    X_te = preprocess.transform(test)
    print('X_preprocesado (test):', X_te.shape)
