# Modeling pipeline: Preprocesado, selección de features y entrenamiento
Este notebook realiza un pipeline simple reproducible: carga datos limpios, aplica preprocesado (imputación, codificación y escalado), selección de features y entrena un modelo de referencia (RandomForest). Ejecuta las celdas en orden.

## Sección 1: Imports y rutas

In [21]:
# Sección 1: Imports y rutas
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error
import joblib
import os

# Rutas (ajusta si es necesario)
OUT_PATH_CLEAN = Path('..') / 'src' / 'data'/ 'processed' /  'german_credit_clean.csv'
OUT_DIR = Path('..') / 'src' / 'models'


df = pd.read_csv(OUT_PATH_CLEAN)


## Sección 2: Preprocesado simple
- Codificación: OneHot para categorías con baja cardinalidad, Ordinal para alta cardinalidad.
- Escalado: StandardScaler para numéricos.
Construimos un `ColumnTransformer` y un `Pipeline` sencillo.

In [12]:
df

Unnamed: 0.1,Unnamed: 0,laufkont,laufzeit,moral,verw,hoehe,sparkont,beszeit,rate,famges,...,verm,alter,weitkred,wohn,bishkred,beruf,pers,telef,gastarb,kredit
0,0,1,18,4,2,1049,1,2,4,2,...,2,21,3,2,1,3,2,1,2,1
1,1,1,9,4,0,2799,1,3,2,3,...,1,36,3,2,2,3,2,1,2,1
2,2,2,12,2,6,841,2,4,2,2,...,1,23,3,2,1,3,2,1,2,1
3,3,1,12,4,0,2122,1,3,3,3,...,1,39,3,2,2,3,2,1,2,1
4,4,1,12,4,0,2171,1,3,4,3,...,2,38,3,2,2,3,2,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004,1007,4,6,2,6,1538,1,2,1,2,...,4,56,3,2,1,3,2,1,2,1
1005,1008,2,39,3,6,7865,2,4,2,3,...,4,32,3,2,1,3,2,2,2,1
1006,1010,2,14,2,3,766,3,3,4,3,...,1,64,3,2,1,3,2,1,2,0
1007,1011,2,30,3,3,1919,2,2,4,3,...,4,30,3,2,2,3,2,1,2,0


In [22]:
# Fijar target explícito según el diccionario: 'kredit'
target_col = 'kredit'
# Cast de columnas categóricas conocidas según el diccionario de variables
ordin = ['beszeit', 'rate','wohnzeit','verm','bishkred','beruf']
categorical_manual = ['laufkont','moral','verw','sparkont','famges','buerge','weitkred','wohn','pers','telef','gastarb']
for c in categorical_manual + ordin:
    if c in df.columns:
        try:
            df[c] = df[c].astype('category')
        except Exception:
            pass

# Asegurar que target queda en formato numérico (0/1) si es binario
y = None
# Definir y_raw a partir del dataframe (evita referencia indefinida)
y_raw = df[target_col] if target_col in df.columns else None
if y_raw is not None:
    if pd.api.types.is_numeric_dtype(y_raw):
        y = pd.to_numeric(y_raw, errors='coerce')
        df[target_col] = y
    else:
        uniques = y_raw.dropna().unique()
        if len(uniques) == 2:
            # Mapear a 0/1 conservando el orden original
            mapping = {uniques[0]: 0, uniques[1]: 1}
            df[target_col] = y_raw.map(mapping)
            y = df[target_col]
        else:
            # Intentar coerción numérica como fallback
            y = pd.to_numeric(y_raw, errors='coerce')
            df[target_col] = y

# Separar X/y si se determinó target
X = df.drop(columns=[target_col])

# Columnas numéricas y categóricas (actualizadas)
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
print('Num cols:', num_cols)
print('Cat cols:', cat_cols)
# Decide encoding: OneHot si cardinalidad <= 10, else Ordinal
low_card = [c for c in cat_cols if X[c].nunique(dropna=True) <= 10]
high_card = [c for c in cat_cols if X[c].nunique(dropna=True) > 10]
print('OneHot cols:', low_card)
print('Ordinal cols:', high_card)

# Pipelines
numeric_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
low_card_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder( handle_unknown='ignore'))])
high_card_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('ord', OrdinalEncoder())])

transformers = []
if num_cols:
    transformers.append(('num', numeric_pipeline, num_cols))
if low_card:
    transformers.append(('lowcard', low_card_pipeline, low_card))
if high_card:
    transformers.append(('highcard', high_card_pipeline, high_card))

preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')
print('Preprocessor built with transformers:', [t[0] for t in transformers])

# Si hay target, hacemos split ahora
if y is not None:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y if y.nunique()<=10 else None)
else:
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
    y_train = y_test = None

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)


Num cols: ['Unnamed: 0', 'laufzeit', 'hoehe', 'alter']
Cat cols: ['laufkont', 'moral', 'verw', 'sparkont', 'beszeit', 'rate', 'famges', 'buerge', 'wohnzeit', 'verm', 'weitkred', 'wohn', 'bishkred', 'beruf', 'pers', 'telef', 'gastarb']
OneHot cols: ['laufkont', 'moral', 'verw', 'sparkont', 'beszeit', 'rate', 'famges', 'buerge', 'wohnzeit', 'verm', 'weitkred', 'wohn', 'bishkred', 'beruf', 'pers', 'telef', 'gastarb']
Ordinal cols: []
Preprocessor built with transformers: ['num', 'lowcard']
Train shape: (807, 21) Test shape: (202, 21)


## Sección 3: Selección de features (simple)
Si hay target y es supervisado, usamos SelectKBest (f_classif o f_regression). Si no hay target, se deja el preprocesado tal cual.

In [23]:
# Sección 3: Feature selection sencillo
selector = None
k = None
if y is not None:
    # necesitaremos aplicar preprocessor.fit_transform para obtener n_features_finales
    Xt_sample = preprocessor.fit_transform(X_train)
    try:
        n_feats = Xt_sample.shape[1]
    except Exception:
        n_feats = None
    if n_feats is not None:
        k = min(20, n_feats)
        if y_train.dtype.kind in 'biu' or y_train.nunique() <= 10:
            selector = SelectKBest(score_func=f_classif, k=k)
        else:
            selector = SelectKBest(score_func=f_regression, k=k)
        # fit selector on preprocessed matrix
        selector.fit(Xt_sample, y_train)
        print(f'Select K Best selected k={k} features (of {n_feats})')
    else:
        print('No se pudo determinar número de features tras preprocesado; saltando selección')
else:
    print('No target -> no se aplica SelectKBest')


Select K Best selected k=20 features (of 57)


## Sección 4: Entrenamiento del modelo de referencia
Entrenamos un RandomForest (clasificación o regresión según el target). Mostramos métricas básicas y exportamos el modelo y el pipeline.

### Random Forest

In [32]:
# Sección 4: Entrenamiento
# Construir pipeline final: preprocessor -> (selector) -> model
model_type = 'classification'
if selector is not None:
    from sklearn.pipeline import make_pipeline
    # selector espera matriz, así que lo añadimos después del preprocessor
    # crear pipeline final
    
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    full_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('selector', selector), ('model', model)])

# Fit
full_pipeline.fit(X_train, y_train)
print('Modelo entrenado.')
# Evaluación básica
y_pred = full_pipeline.predict(X_test)
if model_type == 'classification':
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'Accuracy: {acc:.4f}, F1(weighted): {f1:.4f}')
    # ROC AUC si binario y hay predict_proba
    if len(np.unique(y_test.dropna())) == 2 and hasattr(full_pipeline.named_steps['model'], 'predict_proba'):
        proba = full_pipeline.predict_proba(X_test)[:,1]
        try:
            auc = roc_auc_score(y_test, proba)
            print(f'RandomForest - ROC AUC: {auc:.4f}')
        except Exception:
            pass

# Guardar pipeline completo
model_path = OUT_DIR / 'pipeline_RandomForest.joblib'
joblib.dump(full_pipeline, model_path)
print('Pipeline y modelo guardados en', model_path)


Modelo entrenado.
Accuracy: 0.9208, F1(weighted): 0.9186
RandomForest - ROC AUC: 0.9422
Pipeline y modelo guardados en ..\src\models\pipeline_RandomForest.joblib


### Regresión Logistica

In [25]:
# --- Logistic Regression ---
print('Entrenando Regresión Logística...')

logreg = LogisticRegression(max_iter=1000, solver='lbfgs')
pipe_logreg = Pipeline(steps=[('preprocessor', preprocessor), ('model', logreg)])
pipe_logreg.fit(X_train, y_train)
y_pred = pipe_logreg.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'LogReg - Accuracy: {acc:.4f}, F1(weighted): {f1:.4f}')

proba = pipe_logreg.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, proba)
print(f'LogReg - ROC AUC: {auc:.4f}')
        
# Guardar pipeline logreg
model_path_logreg = OUT_DIR / 'pipeline_logreg.joblib'
joblib.dump(pipe_logreg, model_path_logreg)
print('LogReg pipeline guardado en', model_path_logreg)


Entrenando Regresión Logística...
LogReg - Accuracy: 0.9010, F1(weighted): 0.8995
LogReg - ROC AUC: 0.9463
LogReg pipeline guardado en ..\src\models\pipeline_logreg.joblib


### XGBoost

In [31]:
# --- XGBoost Classifier ---
print('Entrenando XGBoost...')

from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import joblib

xgb = XGBClassifier(
    n_estimators=300,    
    learning_rate=0.1,    
    max_depth=6,          
    subsample=0.8,         
    colsample_bytree=0.8,   
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'    
)

# Pipeline con preprocesamiento + modelo
pipe_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb)
])

pipe_xgb.fit(X_train, y_train)

y_pred = pipe_xgb.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'XGBoost - Accuracy: {acc:.4f}, F1(weighted): {f1:.4f}')

proba = pipe_xgb.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
print(f'XGBoost - ROC AUC: {auc:.4f}')

model_path_xgb = OUT_DIR / 'pipeline_xgboost.joblib'
joblib.dump(pipe_xgb, model_path_xgb)
print('XGBoost pipeline guardado en', model_path_xgb)


Entrenando XGBoost...
XGBoost - Accuracy: 0.9257, F1(weighted): 0.9239
XGBoost - ROC AUC: 0.9213
XGBoost pipeline guardado en ..\src\models\pipeline_xgboost.joblib
