In [1]:
import pandas as pd

# Load the dataset
file_path = '../datasets/processed_loan_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

# Display basic information about the dataset
print(data.info())

# Check for missing values
print(data.isnull().sum())

   loan_amnt  int_rate  installment  annual_inc    dti  delinq_2yrs  pub_rec  \
0     2500.0     13.98        85.42     20004.0  19.86          0.0      0.0   
1     5000.0     15.95       175.67     59000.0  19.57          0.0      0.0   
2     7000.0      9.91       225.58     53796.0  10.80          3.0      0.0   
3     2000.0      5.42        60.32     30000.0   3.60          0.0      0.0   
4     3600.0     10.25       116.59    675048.0   1.55          0.0      0.0   

   long_term  employment_verified  employment  housing_instability  \
0          0                    0           1                    0   
1          0                    0           1                    0   
2          0                    0           1                    0   
3          0                    0           1                    0   
4          0                    0           1                    0   

   log_i2p_ratio  abandonment  
0       0.207108            0  
1       0.376878            1  
2 

Comencemos con un modelo de regresion logistica, vamos a comparar que pasa cuando utilizamos o no utilizamos `class_weight='balanced'`

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Define features (X) and target (y)
X = data.drop(columns=['abandonment'])
y = data['abandonment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nROC-AUC Score:")
print(roc_auc_score(y_test, y_pred_proba))

Confusion Matrix:
[[6305    8]
 [ 261  939]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6313
           1       0.99      0.78      0.87      1200

    accuracy                           0.96      7513
   macro avg       0.98      0.89      0.93      7513
weighted avg       0.97      0.96      0.96      7513


ROC-AUC Score:
0.9785536459158352


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Define features (X) and target (y)
X = data.drop(columns=['abandonment'])
y = data['abandonment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nROC-AUC Score:")
print(roc_auc_score(y_test, y_pred_proba))

Confusion Matrix:
[[6285   28]
 [ 114 1086]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6313
           1       0.97      0.91      0.94      1200

    accuracy                           0.98      7513
   macro avg       0.98      0.95      0.96      7513
weighted avg       0.98      0.98      0.98      7513


ROC-AUC Score:
0.9838101272506468


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Al decirle al modelo que nuestras clases no están balanceadas, podemos ver como mejoró de manera significativa el recall de los abandonos (0.91 vs. 0.78). Veamos si podemos encontrar un modelo que tenga un desempeño mejor. Utilicemos RandomForestClassifier y GradientBoostingClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
rf_pred_proba = rf_clf.predict_proba(X_test)[:, 1]

print("Random Forest Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))
print("\nClassification Report:")
print(classification_report(y_test, rf_pred))
print("\nROC-AUC Score:")
print(roc_auc_score(y_test, rf_pred_proba))

# Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)
gb_pred_proba = gb_clf.predict_proba(X_test)[:, 1]

print("\nGradient Boosting Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, gb_pred))
print("\nClassification Report:")
print(classification_report(y_test, gb_pred))
print("\nROC-AUC Score:")
print(roc_auc_score(y_test, gb_pred_proba))

Random Forest Results:
Confusion Matrix:
[[6301   12]
 [  78 1122]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6313
           1       0.99      0.94      0.96      1200

    accuracy                           0.99      7513
   macro avg       0.99      0.97      0.98      7513
weighted avg       0.99      0.99      0.99      7513


ROC-AUC Score:
0.9961695443265219

Gradient Boosting Results:
Confusion Matrix:
[[6307    6]
 [ 103 1097]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6313
           1       0.99      0.91      0.95      1200

    accuracy                           0.99      7513
   macro avg       0.99      0.96      0.97      7513
weighted avg       0.99      0.99      0.99      7513


ROC-AUC Score:
0.9941833650139923


Random Forest es mejor que Gradient Boosting, el F1-Score en la clase de interés es de 0.96. Ahora quiero realizar un gridsearch para ver si encuentro un grupo de hiperparámetros que puedan darme un modelo aún mejor.

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Comenzar el GridSearchCV para el Random Forest
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1
)


grid_search.fit(X_train, y_train)

# Mostrar los mejores parámetros y el mejor score
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC Score:", grid_search.best_score_)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best ROC-AUC Score: 0.9946725188524402


El mejor ROC-AUC score del GridSearch es inferior (ligeramente) al que encontramos al inicio. Entonces nos quedaremos con el modelo que ya establecimos antes. Por curiosidad, quisiera validar mi decisión de generar la columna log_i2p_ratio para capturar que tanto ha pagado un usuario de interés vs. que tanto a pagado del monto principal, entonces voy a revisar feature_importances.

In [11]:
# Print feature importances
feature_importances = rf_clf.feature_importances_
for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance:.4f}")

loan_amnt: 0.0248
int_rate: 0.1224
installment: 0.0338
annual_inc: 0.0322
dti: 0.0316
delinq_2yrs: 0.0041
pub_rec: 0.0026
long_term: 0.0468
employment_verified: 0.0030
employment: 0.0000
housing_instability: 0.0000
log_i2p_ratio: 0.6986
