# 🧹 Preprocesamiento del Dataset - Scoring Crediticio

Este notebook trata los valores nulos y los outliers, crea la variable combinada `TotalPastDue` y prepara los conjuntos de entrenamiento y prueba para entrenar modelos predictivos sobre `SeriousDlqin2yrs`.

In [13]:
# 📦 Librerías necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the raw training CSV; its first column becomes the row index.
df = pd.read_csv(filepath_or_buffer='../data/cs-training.csv', index_col=0)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


## 🔧 Imputación de valores nulos

In [14]:
# Median imputation for the two columns filled in this cell.
# NOTE(review): both medians are computed over the full frame, i.e. before the
# train/test split performed later in this notebook — confirm that using
# test-set rows for the imputation statistics is acceptable.
mediana_ingreso = df['MonthlyIncome'].median()
mediana_dependientes = df['NumberOfDependents'].median()

# Fill both columns in a single pass; every other column is left untouched.
df[['MonthlyIncome', 'NumberOfDependents']] = df[['MonthlyIncome', 'NumberOfDependents']].fillna(
    {'MonthlyIncome': mediana_ingreso, 'NumberOfDependents': mediana_dependientes}
)



## 🚨 Tratamiento de outliers extremos

In [15]:
# Cap the upper tail of the listed columns at their own 99th percentile.
# Only the upper side is clipped; values at or below the threshold are unchanged.
# NOTE(review): each percentile is computed over the full frame, before the
# train/test split performed later in this notebook — confirm this is intended.
cols_outliers = [
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
]
for col in cols_outliers:
    p99 = df[col].quantile(q=0.99)  # per-column cap
    df[col] = df[col].clip(lower=None, upper=p99)

## 🔀 Creación de variable combinada de atrasos

In [16]:
# Collapse the three past-due counters into a single feature, `TotalPastDue`,
# then remove the source columns (their information now lives in the sum).
#
# The cell is guarded so it can be re-run safely: once the source columns have
# been dropped, executing it again is a no-op instead of raising KeyError.
# NOTE(review): a plain sum weights the 30-59, 60-89 and 90+ day buckets
# equally — confirm that is the intended feature definition.
past_due_cols = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
missing_past_due = [name for name in past_due_cols if name not in df.columns]

if not missing_past_due:
    # skipna=False matches `a + b + c`: a NaN in any source column yields NaN.
    df['TotalPastDue'] = df[past_due_cols].sum(axis=1, skipna=False)
    # Re-bind instead of inplace=True so the transformation is explicit.
    df = df.drop(columns=past_due_cols)
elif len(missing_past_due) < len(past_due_cols) or 'TotalPastDue' not in df.columns:
    # Neither the untouched input nor the already-processed frame: fail loudly.
    raise KeyError(f"Cannot build TotalPastDue; missing columns: {missing_past_due}")

In [17]:
# Sanity check: count of missing values per column after preprocessing.
df.isna().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberRealEstateLoansOrLines            0
NumberOfDependents                      0
TotalPastDue                            0
dtype: int64

## 📂 Separación en conjunto de entrenamiento y prueba

In [18]:
# `SeriousDlqin2yrs` is the prediction target; every other column is a feature.
y = df['SeriousDlqin2yrs']
X = df.drop(columns='SeriousDlqin2yrs')

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show the resulting shapes of the feature partitions.
(X_train.shape, X_test.shape)

((120000, 8), (30000, 8))

In [19]:
import joblib

# Persist the four partitions next to the raw data so later notebooks can
# reload them without repeating the preprocessing.
# joblib files are pickle-based: only load them back from trusted locations.
joblib.dump(value=X_train, filename='../data/X_train.pkl')
joblib.dump(value=X_test, filename='../data/X_test.pkl')
joblib.dump(value=y_train, filename='../data/y_train.pkl')
# Last expression of the cell: its return value (the written path) is displayed.
joblib.dump(value=y_test, filename='../data/y_test.pkl')


['../data/y_test.pkl']