In [88]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

pd.options.plotting.backend = "plotly"

In [89]:
file_dir = Path("..") / "data" / "credit_risk_dataset_cleaned.csv"
df = pd.read_csv(file_dir)
df.head()


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [90]:
df.shape

(31679, 12)

In [91]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31679 entries, 0 to 31678
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  31679 non-null  int64  
 1   person_income               31679 non-null  int64  
 2   person_home_ownership       31679 non-null  object 
 3   person_emp_length           31679 non-null  float64
 4   loan_intent                 31679 non-null  object 
 5   loan_grade                  31679 non-null  object 
 6   loan_amnt                   31679 non-null  int64  
 7   loan_int_rate               28632 non-null  float64
 8   loan_status                 31679 non-null  int64  
 9   loan_percent_income         31679 non-null  float64
 10  cb_person_default_on_file   31679 non-null  object 
 11  cb_person_cred_hist_length  31679 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 2.9+ MB


In [92]:
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3047
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [93]:
null_columns = df.columns[df.isnull().any()].tolist()
df[null_columns].isnull().sum()

loan_int_rate    3047
dtype: int64

In [94]:
pd.crosstab(df['person_home_ownership'], df['loan_status'], values=df['loan_int_rate'], aggfunc='mean').round(2)

loan_status,0,1
person_home_ownership,Unnamed: 1_level_1,Unnamed: 2_level_1
MORTGAGE,10.09,13.53
OTHER,11.41,13.56
OWN,10.86,12.46
RENT,10.76,13.01


On va remplir les valeurs manquantes de `loan_int_rate` par la moyenne de la colonne.

In [95]:
df['loan_int_rate'] = df['loan_int_rate'].fillna(df['loan_int_rate'].mean())

In [96]:
X = df.drop(columns=['loan_status'])
y = df[['loan_status']]

In [97]:
y.value_counts()

loan_status
0              24854
1               6825
Name: count, dtype: int64

Les deux classes sont  deséquilibrées. Il faut donc les équilibrer. Nous allons utiliser la méthode SMOTE (Synthetic Minority Oversampling TEchnique)

In [98]:
X_numerical = X.select_dtypes(include=["int64", "float64"])

X_categorical = X.select_dtypes(include=["object"])

print(X_numerical.columns.tolist())
print(X_categorical.columns.tolist())

['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']


Il va falloir encoder les variables catégorielles. On va utiliser OrdinalEncoder pour la variable `loan_grade` (pour conserver l'ordre) et OneHotEncoder pour les autres.

In [None]:
ordinal_encoder = OrdinalEncoder(categories=[['A', 'B', 'C', 'D', 'E', 'F', 'G']])

X_categorical_encoded = X_categorical.copy()
X_categorical_encoded['loan_grade'] = ordinal_encoder.fit_transform(X_categorical[['loan_grade']])

X_categorical = X_categorical_encoded.drop(columns=['loan_grade'])

loan_grade_encoded = X_categorical_encoded[['loan_grade']]

In [None]:
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
X_categorical_onehot = pd.DataFrame(
    onehot_encoder.fit_transform(X_categorical),
    columns=onehot_encoder.get_feature_names_out(X_categorical.columns),
    index=X_categorical.index
)

X_encoded = pd.concat([X_numerical, loan_grade_encoded, X_categorical_onehot], axis=1)

In [60]:
smote = SMOTE(random_state=42)

In [61]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [62]:
X_train, y_train = smote.fit_resample(X_train, y_train)

In [63]:
# Model de régression logistique: Baseline
clf_logistic = LogisticRegression()
clf_logistic.fit(X_train, np.ravel(y_train))


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [64]:
clf_logistic.predict_proba(X_test)

array([[0.17910403, 0.82089597],
       [0.81255405, 0.18744595],
       [0.65221496, 0.34778504],
       ...,
       [0.47579122, 0.52420878],
       [0.92256221, 0.07743779],
       [0.40249464, 0.59750536]])

In [65]:
clf_logistic.score(X_test, y_test)

0.7073863636363636

In [66]:
preds = clf_logistic.predict_proba(X_test)
preds_df = pd.DataFrame(preds[:,1], columns = ['probability_of_default'])
preds_df['loan_status'] = preds_df['probability_of_default'].apply(lambda x: 1 if x > 0.5 else 0)

In [67]:
preds_df.sample(10)

Unnamed: 0,probability_of_default,loan_status
5325,0.446738,0
388,0.134696,0
170,0.354358,0
7548,0.750736,1
2811,0.636474,1
8220,0.082386,0
697,0.652944,1
5315,0.758279,1
3069,0.877144,1
6790,0.032686,0


In [68]:
print(classification_report(y_test, preds_df['loan_status'], target_names=['No Default', 'Default']))

              precision    recall  f1-score   support

  No Default       0.90      0.70      0.79      7486
     Default       0.40      0.72      0.51      2018

    accuracy                           0.71      9504
   macro avg       0.65      0.71      0.65      9504
weighted avg       0.80      0.71      0.73      9504



In [69]:
confusion_matrix(y_test, preds_df['loan_status'])

array([[5270, 2216],
       [ 565, 1453]])

In [70]:
# Random Forest Classifier
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [71]:
clf_rf.predict_proba(X_test)

array([[0.  , 1.  ],
       [1.  , 0.  ],
       [0.98, 0.02],
       ...,
       [0.95, 0.05],
       [1.  , 0.  ],
       [0.77, 0.23]])

In [72]:
clf_rf.score(X_test, y_test)

0.9327651515151515

In [73]:
preds = clf_rf.predict_proba(X_test)
preds_df = pd.DataFrame(preds[:, 1], columns=["probability_of_default"])
preds_df["loan_status"] = preds_df["probability_of_default"].apply(
    lambda x: 1 if x > 0.5 else 0
)

preds_df.sample(10)

Unnamed: 0,probability_of_default,loan_status
3801,0.03,0
7052,0.04,0
8157,0.97,1
7847,0.1,0
1521,0.18,0
2515,0.25,0
7472,0.04,0
3483,0.01,0
8315,0.01,0
4440,0.35,0


In [74]:
print(classification_report(
    y_test, preds_df["loan_status"], target_names=["No Default", "Default"]
))

              precision    recall  f1-score   support

  No Default       0.93      0.99      0.96      7486
     Default       0.95      0.72      0.82      2018

    accuracy                           0.93      9504
   macro avg       0.94      0.85      0.89      9504
weighted avg       0.93      0.93      0.93      9504



Le random forest semble proche de la perfection (that's suspicious). Voyons plus en détail.
Faisons la feature importance

In [75]:
clf_rf.feature_importances_

array([0.02424474, 0.11048596, 0.03452873, 0.04876855, 0.08347487,
       0.17639426, 0.01822355, 0.02894773, 0.00058724, 0.00926524,
       0.08246875, 0.02364539, 0.01032922, 0.01811691, 0.02234169,
       0.0100379 , 0.0061852 , 0.01097608, 0.01528057, 0.01628157,
       0.13413539, 0.03503136, 0.00796011, 0.00312443, 0.02464888,
       0.04451568])

In [76]:
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': clf_rf.feature_importances_
}).sort_values('importance', ascending=False)

feature_importance.head(10).plot(x='feature', y='importance', kind='bar',
                                title='Top 10 Feature Importances - Random Forest')
