In [65]:
## PyCaret installation:
## PyCaret pipeline:

# 1. setup(): define the dataset and parameters
# 2. compare_models(): compare multiple models and rank them
# 3. create_model(): create a specific model
# 4. tune_model(): automatic hyperparameter tuning
# 5. plot_model(): visualize the model's performance
# 6. evaluate_model(): model evaluation
# 7. predict_model(): predictions on new data in the DataFrame
# 8. save_model()/load_model(): save and load the model

In [66]:
## CASE: Predicting whether an applicant is likely to default on a vehicle loan
## Goal: Build an automated classification model with PyCaret to predict default from the applicant's features

In [67]:
import pandas as pd
from pycaret.classification import *

In [68]:
# 1. Load the data from the CSV file into a DataFrame
df = pd.read_csv('ds_credito_vehicular.csv')

In [69]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,edad,ingresos_mensuales,estado_civil,historia_credito,nro_creditos_previos,cuota_vs_ingreso,vehiculo_propio,default
0,59,7668,Divorciado,Buena,2,0.64,0,1
1,49,6279,Casado,Buena,2,0.31,1,1
2,35,2722,Soltero,Mala,4,0.41,1,0
3,63,4314,Divorciado,Regular,4,0.57,0,0
4,28,4157,Soltero,Buena,1,0.34,0,0


In [70]:
# Absolute count of each class in the `default` target column
df['default'].value_counts()

default
0    125
1     25
Name: count, dtype: int64

In [71]:
# Share of each class in `default`, expressed as a percentage
df['default'].value_counts(normalize=True) * 100

default
0    83.333333
1    16.666667
Name: proportion, dtype: float64

In [72]:
# 1. setup(): define the dataset and parameters.
# Configuration for an imbalanced dataset; arguments are grouped by concern.
cls = setup(
    # --- data and target ---
    data=df,                          # DataFrame holding the data
    target='default',                 # target (class) column
    categorical_features=['estado_civil','historia_credito'],  # categorical columns
    # --- preprocessing ---
    numeric_imputation='mean',        # fill-in for missing numeric values
    categorical_imputation='mode',    # fill-in for missing categorical values
    normalize=True,                   # normalize numeric variables
    # --- class imbalance ---
    fix_imbalance=True,               # enable class rebalancing
    fix_imbalance_method='SMOTE',     # method used to generate minority-class samples
    # --- validation and reproducibility ---
    fold_strategy='stratifiedkfold',  # keep class proportions in the validation folds
    session_id=123                    # fixed seed for reproducibility
)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,default
2,Target type,Binary
3,Original data shape,"(150, 8)"
4,Transformed data shape,"(219, 12)"
5,Transformed train set shape,"(174, 12)"
6,Transformed test set shape,"(45, 12)"
7,Numeric features,5
8,Categorical features,2
9,Preprocess,True


In [73]:
# 2. compare_models(): compare multiple models and rank them, sorted by F1
best_model = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.5973,0.5715,0.45,0.1983,0.2688,0.0443,0.0486,2.422
ridge,Ridge Classifier,0.5882,0.5653,0.45,0.1933,0.264,0.0358,0.0396,0.042
lda,Linear Discriminant Analysis,0.5882,0.5653,0.45,0.1933,0.264,0.0358,0.0396,0.041
svm,SVM - Linear Kernel,0.5145,0.5382,0.55,0.1719,0.2563,0.0057,0.0337,0.042
knn,K Neighbors Classifier,0.3973,0.3552,0.4,0.122,0.1811,-0.1075,-0.168,0.043
qda,Quadratic Discriminant Analysis,0.7327,0.4708,0.15,0.1333,0.14,0.009,0.0012,0.053
nb,Naive Bayes,0.5773,0.3972,0.15,0.1,0.1167,-0.121,-0.1468,0.048
dt,Decision Tree Classifier,0.6827,0.45,0.1,0.15,0.1167,-0.0692,-0.071,0.044
lightgbm,Light Gradient Boosting Machine,0.7209,0.4785,0.05,0.05,0.05,-0.0872,-0.0931,0.078
ada,Ada Boost Classifier,0.7164,0.5174,0.05,0.0333,0.04,-0.1017,-0.1041,0.08


In [74]:
# 3. create_model(): create a specific model ('lr' = Logistic Regression)
lr = create_model('lr')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7273,0.7222,0.5,0.3333,0.4,0.2326,0.2406
1,0.6364,0.4444,0.5,0.25,0.3333,0.12,0.1336
2,0.5455,0.6667,0.5,0.2,0.2857,0.0351,0.043
3,0.6364,0.7778,0.5,0.25,0.3333,0.12,0.1336
4,0.7273,0.8333,0.5,0.3333,0.4,0.2326,0.2406
5,0.7,0.8889,1.0,0.25,0.4,0.2857,0.4082
6,0.8,0.4444,0.0,0.0,0.0,-0.1111,-0.1111
7,0.4,0.25,0.5,0.1667,0.25,-0.0714,-0.1021
8,0.3,0.1875,0.0,0.0,0.0,-0.4,-0.5
9,0.5,0.5,0.5,0.2,0.2857,0.0,0.0


In [75]:
# 4. tune_model(): automatic hyperparameter tuning, optimizing F1
tuned_lr = tune_model(lr,optimize='F1')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7273,0.7222,0.5,0.3333,0.4,0.2326,0.2406
1,0.7273,0.4444,0.5,0.3333,0.4,0.2326,0.2406
2,0.5455,0.6667,0.5,0.2,0.2857,0.0351,0.043
3,0.6364,0.7778,0.5,0.25,0.3333,0.12,0.1336
4,0.7273,0.8333,0.5,0.3333,0.4,0.2326,0.2406
5,0.7,0.8889,1.0,0.25,0.4,0.2857,0.4082
6,0.8,0.4444,0.0,0.0,0.0,-0.1111,-0.1111
7,0.4,0.25,0.5,0.1667,0.25,-0.0714,-0.1021
8,0.3,0.1875,0.0,0.0,0.0,-0.4,-0.5
9,0.5,0.5,0.5,0.2,0.2857,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
# 6. evaluate_model(): model evaluation (interactive widget offering several plot types)
evaluate_model(tuned_lr)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin‚Ä¶

In [77]:
# 7. predict_model(): score the model and attach predictions.
# No `data` argument is passed, so PyCaret scores the hold-out split created by
# setup() rather than genuinely new data; the returned frame adds the
# `prediction_label` and `prediction_score` columns.
preds = predict_model(tuned_lr)
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of line-wrapped plain text.
preds.head()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6222,0.6805,0.1429,0.0833,0.1053,-0.1135,-0.1202


     edad  ingresos_mensuales estado_civil historia_credito  \
57     60                4457       Casado            Buena   
98     55                2147       Casado            Buena   
54     34                5659      Soltero          Regular   
92     64                2930       Casado            Buena   
137    33                4643       Casado            Buena   

     nro_creditos_previos  cuota_vs_ingreso  vehiculo_propio  default  \
57                      3              0.32                0        0   
98                      2              0.25                1        0   
54                      3              0.32                0        0   
92                      2              0.31                1        0   
137                     0              0.69                0        1   

     prediction_label  prediction_score  
57                  0            0.7036  
98                  0            0.8340  
54                  0            0.6995  
92            

In [78]:
# 8. save_model()/load_model(): save and load the model.
# Only the save step is performed in this cell.
# NOTE(review): the file name says "precio" (price) although this notebook builds
# a credit-default classifier — confirm the intended name before loading it elsewhere.
save_model(tuned_lr,'modelo_precio_final')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['edad', 'ingresos_mensuales',
                                              'nro_creditos_previos',
                                              'cuota_vs_ingreso',
                                              'vehiculo_propio'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_impu...
                  TransformerWrapper(exclude=None, include=

### 📊 CONCLUSIONES MODELO DE CLASIFICACIÓN

#### 1. Desempeño General

* El modelo seleccionado fue **Regresión Logística**, tras comparar múltiples clasificadores (SVM, KNN, QDA, Naive Bayes, Árboles, Boosting, Random Forest, etc.).
* En promedio sobre los 10 folds de validación cruzada, el modelo ajustado alcanzó una **exactitud (Accuracy) del 60.6%**, con una **AUC de 0.57**, lo cual indica una capacidad **limitada de discriminación** entre clases.

#### 2. Métricas Principales

* **Recall promedio:** 0.45 → el modelo logra identificar correctamente el 45% de los casos positivos.
* **Precisión promedio:** 0.21 → de las predicciones positivas, solo el 21% son correctas.
* **F1-score:** 0.28 → balance moderado entre precisión y exhaustividad.
* **Kappa (0.055) y MCC (0.059):** valores bajos, sugiriendo una correlación limitada entre predicciones y etiquetas reales.
* La **variabilidad entre folds** (std ≈ 0.15 en Accuracy y 0.23 en AUC) refleja cierta **inestabilidad del modelo**, probablemente por tamaño muestral limitado o desbalance de clases.

#### 3. Evaluación Comparativa

* Aunque otros modelos (como QDA, Extra Trees o LightGBM) mostraron mayores *accuracy* en validación cruzada simple, sus métricas de *recall* y *F1* fueron nulas o muy bajas, indicando **overfitting o sesgo hacia la clase mayoritaria**.
* La **Regresión Logística** mantiene un **equilibrio razonable** entre las métricas, mostrando **robustez** frente a la variabilidad de los datos.

#### 4. Interpretación y Aspectos Técnicos

* La **AUC cercana a 0.57** sugiere que el modelo tiene una **capacidad limitada para ordenar correctamente** instancias positivas y negativas.
* El comportamiento de métricas asimétricas (Precision–Recall) podría indicar **desequilibrio de clases** o **valores atípicos** en la variable respuesta.
* Las iteraciones con **recall = 1 y precisión baja (0.25)** muestran que en ciertos folds el modelo prefirió **minimizar falsos negativos** a costa de cometer más falsos positivos.

#### 5. Recomendaciones

* El **reescalado** y el balanceo con **SMOTE** ya se aplican en `setup()`; conviene probar alternativas de **balanceo de clases** (undersampling o ponderación de clases).
* Evaluar **regularización L1 o Elastic Net** para estabilizar coeficientes y reducir varianza.
* Analizar **importancia de variables** mediante los coeficientes del modelo y eliminar predictores irrelevantes.
* Considerar modelos no lineales (árboles o SVM con kernel) solo tras asegurar un adecuado balance de clases.

---

#### 🧩 CONCLUSIÓN GLOBAL

> El modelo de clasificación basado en **Regresión Logística** presenta un **ajuste moderado** y un desempeño **aceptable considerando la complejidad del problema y el desbalance de clases**.
> Si bien la capacidad predictiva (AUC ≈ 0.57) no es alta, el modelo ofrece **interpretabilidad, estabilidad y una base sólida** para mejorar mediante técnicas de regularización y balanceo.

