In [201]:
# Include libraries, etc
# ==============================================================================
import pandas as pd
import numpy as np

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#from sklearn.metrics import plot_confusion_matrix
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Matplotlib configuration
# ==============================================================================
plt.rcParams['image.cmap'] = "bwr"
#plt.rcParams['figure.dpi'] = "100"
plt.rcParams['savefig.bbox'] = "tight"
# One call is enough: `style` (imported from matplotlib) and `plt.style` are the
# same module, and `use()` returns None, so the previous `a or b` form simply
# applied the identical style sheet twice.
plt.style.use('ggplot')

# Warnings configuration
# ==============================================================================
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# any convergence or deprecation warnings raised while fitting the models below.
warnings.filterwarnings('ignore')

In [224]:
# Read the training CSV and discard the Loan_ID column, which is not needed here
dataframe = pd.read_csv("./dataset/train.csv").drop(columns=["Loan_ID"])

In [225]:
# Map the string-valued columns to integer codes.
# NOTE(review): the codes chosen for Gender and Property_Area impose a numeric
# order on unordered categories; confirm that this is intended for the model.
category_codes = {
    "Gender": {"Male":1,"Female":2},
    "Married": {"Yes":1,"No":0},
    "Education": {"Graduate":1,"Not Graduate":0},
    "Self_Employed": {"Yes":1,"No":0},
    "Property_Area": {"Semiurban": 1,"Urban":2,"Rural":3},
    "Dependents": {"0":0,"1":1,"2":2,"3+":3},
    "Loan_Status": {"N":0,"Y":1},
}
for column, codes in category_codes.items():
    # Encode only columns that still hold the raw labels. Re-running this cell on
    # already-encoded data would map every value to NaN, and the fillna(0) below
    # would then silently zero the whole column (including the target).
    if not pd.api.types.is_numeric_dtype(dataframe[column]):
        dataframe[column] = dataframe[column].map(codes)

# Every remaining NaN (missing in the CSV, or a label absent from the mappings
# above) is replaced by 0, in all columns.
# NOTE(review): for columns such as Credit_History or LoanAmount a 0 is a
# meaningful value, so this imputation may bias the fit; kept as-is so that the
# results below do not change.
dataframe = dataframe.fillna(0)

print("Percentages")
print(100 * dataframe['Loan_Status'].value_counts(normalize=True))


Percentages
Loan_Status
1    68.729642
0    31.270358
Name: proportion, dtype: float64


In [204]:
# Inspect the dtype of every column of `dataframe`
dataframe.dtypes

Gender               float64
Married              float64
Dependents           float64
Education              int64
Self_Employed        float64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
Loan_Status            int64
dtype: object

In [144]:
# Pairwise Pearson correlations, in long format, between the columns that remain
# after dropping the encoded categorical features. Self-pairs are removed and the
# rows are sorted from the largest to the smallest coefficient.
corr_matrix = (
    dataframe
    .drop(columns=["Gender", "Married", "Property_Area", "Dependents", "Education", "Self_Employed"])
    .select_dtypes(include=["float64", "int"])
    .corr(method="pearson")
    .stack()
    .reset_index()
    .set_axis(["var_1", "var_2", "r"], axis=1)
    .loc[lambda pairs: pairs["var_1"] != pairs["var_2"], :]
    .sort_values("r", ascending=False)
)
corr_matrix.head(20)



Unnamed: 0,var_1,var_2,r
2,ApplicantIncome,LoanAmount,0.53829
12,LoanAmount,ApplicantIncome,0.53829
34,Loan_Status,Credit_History,0.432616
29,Credit_History,Loan_Status,0.432616
13,LoanAmount,CoapplicantIncome,0.190377
8,CoapplicantIncome,LoanAmount,0.190377
15,LoanAmount,Loan_Amount_Term,0.058519
20,Loan_Amount_Term,LoanAmount,0.058519
22,Loan_Amount_Term,Credit_History,0.050145
27,Credit_History,Loan_Amount_Term,0.050145


In [227]:
# Separate the predictors from the target, then split into training and test sets
X = dataframe.drop(columns=["Loan_Status"])
Y = dataframe["Loan_Status"]

# The target is passed as an (n, 1) column array; 80% of the rows go to training
# and the fixed random_state makes the shuffled split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y.to_numpy().reshape(-1, 1), train_size=0.8, shuffle=True, random_state=1234
)

In [228]:
# Fit a logistic regression on the training split; the constant column added
# first acts as the intercept term.
X_train = sm.add_constant(X_train, prepend=True)
model = sm.Logit(endog=Y_train, exog=X_train).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.527507
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  491
Model:                          Logit   Df Residuals:                      479
Method:                           MLE   Df Model:                           11
Date:                Sun, 07 Jan 2024   Pseudo R-squ.:                  0.1497
Time:                        15:42:31   Log-Likelihood:                -259.01
converged:                       True   LL-Null:                       -304.61
Covariance Type:            nonrobust   LLR p-value:                 9.686e-15
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.4620      0.777      0.595      0.552      -1.060       1.984
Gender  

In [229]:
# Test accuracy of the model
# ==============================================================================
X_test = sm.add_constant(X_test, prepend=True)
predictions = model.predict(exog=X_test)
# Predicted probabilities below 0.50 are labelled 0, everything else 1
classification = np.where(predictions<0.50, 0, 1)
accuracy = accuracy_score(Y_test, classification, normalize=True)
print(f"El accuracy de test es: {100*accuracy}%")

El accuracy de test es: 82.11382113821138%


In [230]:
# Confusion matrix of the test predictions
# ==============================================================================
# Rows are the true labels, columns the predicted labels.
confusion_matrix = pd.crosstab(
    index=Y_test.ravel(),
    columns=classification,
    rownames=['Real'],
    colnames=['Predicción'],
)
confusion_matrix

Predicción,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
0,22,17
1,5,79


In [231]:
# Derive the confusion-matrix cells from the actual test labels and predictions
# instead of copying them by hand from a previous output, so the metrics stay
# correct whenever the data, the split, the model or the threshold change.
y_true = np.asarray(Y_test).ravel()
y_pred = np.asarray(classification).ravel()
tn = int(np.sum((y_true == 0) & (y_pred == 0))) #a
tp = int(np.sum((y_true == 1) & (y_pred == 1))) #d
fn = int(np.sum((y_true == 1) & (y_pred == 0))) #c
fp = int(np.sum((y_true == 0) & (y_pred == 1))) #b

# Each ratio falls back to NaN when its denominator is zero (e.g. no real
# positives in the test set) instead of raising ZeroDivisionError.
recall = tp / (tp+fn) if (tp+fn) else float("nan") # d/(d+c)
specificity = tn / (tn+fp) if (tn+fp) else float("nan") # a / (a+b)
precision = tp / (fp+tp) if (fp+tp) else float("nan") # d / (b+d)
total = tn+tp+fn+fp
accuracy = (tn+tp) / total if total else float("nan") # (a+d) / (a+b+c+d)
print(f"Recall es {100*recall}")
print(f"Specificity es {100*specificity}")
print(f"Precision es {100*precision}")
print(f"Accuracy es {100*accuracy}")

Recall es 94.04761904761905
Specificity es 56.41025641025641
Precision es 82.29166666666666
Accuracy es 82.11382113821138


In [232]:
# Generate the model again by removing irrelevant variables
dropped_columns = ["Gender", "Married", "Dependents", "Education", "Self_Employed","LoanAmount","ApplicantIncome"]
# errors="ignore" keeps this cell re-runnable: X_train and X_test are overwritten
# here, so a second execution would otherwise raise KeyError because the columns
# are already gone.
X_train = X_train.drop(columns=dropped_columns, errors="ignore")
X_test = X_test.drop(columns=dropped_columns, errors="ignore")
model = sm.Logit(endog=Y_train, exog=X_train)
model = model.fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.534443
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  491
Model:                          Logit   Df Residuals:                      486
Method:                           MLE   Df Model:                            4
Date:                Sun, 07 Jan 2024   Pseudo R-squ.:                  0.1385
Time:                        15:44:52   Log-Likelihood:                -262.41
converged:                       True   LL-Null:                       -304.61
Covariance Type:            nonrobust   LLR p-value:                 2.041e-17
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.9320      0.600      1.552      0.121      -0.245       2.109
Coapplic

In [242]:
# Test accuracy of the refitted model
# ==============================================================================
predictions = model.predict(exog=X_test)
# Same 0.50 cut-off as before: probabilities below it become class 0, the rest class 1
classification = np.where(predictions<0.50, 0, 1)
accuracy = accuracy_score(Y_test, classification, normalize=True)
print(f"El accuracy de test es: {100*accuracy}%")

El accuracy de test es: 82.11382113821138%


In [243]:
# Confusion matrix of the test predictions
# ==============================================================================
# True labels index the rows; predicted labels index the columns.
confusion_matrix = pd.crosstab(
    index=Y_test.ravel(),
    columns=classification,
    rownames=['Real'],
    colnames=['Predicción'],
)
confusion_matrix

Predicción,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
0,22,17
1,5,79


In [244]:
# Compute the confusion-matrix cells from the current test labels and predictions.
# The previous hard-coded values were copied from an earlier output and would not
# follow any change in the data, the split, the model or the threshold.
y_true = np.asarray(Y_test).ravel()
y_pred = np.asarray(classification).ravel()
tn = int(np.sum((y_true == 0) & (y_pred == 0))) #a
tp = int(np.sum((y_true == 1) & (y_pred == 1))) #d
fn = int(np.sum((y_true == 1) & (y_pred == 0))) #c
fp = int(np.sum((y_true == 0) & (y_pred == 1))) #b

# A zero denominator yields NaN rather than a ZeroDivisionError.
recall = tp / (tp+fn) if (tp+fn) else float("nan") # d/(d+c)
specificity = tn / (tn+fp) if (tn+fp) else float("nan") # a / (a+b)
precision = tp / (fp+tp) if (fp+tp) else float("nan") # d / (b+d)
total = tn+tp+fn+fp
accuracy = (tn+tp) / total if total else float("nan") # (a+d) / (a+b+c+d)
print(f"Recall es {100*recall}")
print(f"Specificity es {100*specificity}")
print(f"Precision es {100*precision}")

Recall es 94.04761904761905
Specificity es 56.41025641025641
Precision es 82.29166666666666


# Conclusiones

- El modelo parece ser bastante sensible; en otras palabras, tiene una buena capacidad para identificar los casos positivos reales.
- Sin embargo, la especificidad es relativamente baja, lo que indica que hay margen para mejorar la capacidad del modelo para identificar correctamente los casos negativos reales. Esto podría mejorar subiendo bastante más el umbral de clasificación: en pruebas con un umbral del 70 %, esta métrica sube hasta un 66 %, pero la sensibilidad baja al 76 %, lo cual puede considerarse un buen *trade-off* dada la importancia de no aprobar un préstamo cuando **no** se debe aprobar.
- La precisión es aceptable, pero puede ser útil considerar el equilibrio entre precisión, sensibilidad y especificidad según los requisitos específicos del problema.