In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

pd.options.plotting.backend = "plotly"

In [35]:
file_dir = Path("..") / "data" / "credit_risk_dataset_cleaned.csv"
df = pd.read_csv(file_dir)
df.head()


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [36]:
df.shape

(31679, 12)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31679 entries, 0 to 31678
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  31679 non-null  int64  
 1   person_income               31679 non-null  int64  
 2   person_home_ownership       31679 non-null  object 
 3   person_emp_length           31679 non-null  float64
 4   loan_intent                 31679 non-null  object 
 5   loan_grade                  31679 non-null  object 
 6   loan_amnt                   31679 non-null  int64  
 7   loan_int_rate               28632 non-null  float64
 8   loan_status                 31679 non-null  int64  
 9   loan_percent_income         31679 non-null  float64
 10  cb_person_default_on_file   31679 non-null  object 
 11  cb_person_cred_hist_length  31679 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 2.9+ MB


In [38]:
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3047
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [39]:
null_columns = df.columns[df.isnull().any()].tolist()
df[null_columns].isnull().sum()

loan_int_rate    3047
dtype: int64

In [40]:
pd.crosstab(df['person_home_ownership'], df['loan_status'], values=df['loan_int_rate'], aggfunc='mean').round(2)

loan_status,0,1
person_home_ownership,Unnamed: 1_level_1,Unnamed: 2_level_1
MORTGAGE,10.09,13.53
OTHER,11.41,13.56
OWN,10.86,12.46
RENT,10.76,13.01


On va remplir les valeurs manquantes de `loan_int_rate` par la moyenne de la colonne.

In [41]:
df['loan_int_rate'] = df['loan_int_rate'].fillna(df['loan_int_rate'].mean())

In [42]:
X = df.drop(columns=['loan_status'])
y = df[['loan_status']]

In [43]:
X_numerical = X.select_dtypes(include=["int64", "float64"])

X_categorical = X.select_dtypes(include=["object"])

print(X_numerical.columns.tolist())
print(X_categorical.columns.tolist())


['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']


Il va falloir encoder les variables catégorielles. On va utiliser OneHotEncoder pour cela.

Nous allons utiliser `pipeline` de `sklearn` pour simplifier le prétraitement des données.

In [44]:
X_categorical_onehot =pd.get_dummies(X_categorical)

In [45]:
X = pd.concat([X_numerical, X_categorical_onehot], axis=1)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [47]:
clf_logistic = LogisticRegression()
clf_logistic.fit(X_train, np.ravel(y_train))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
clf_logistic.predict_proba(X_test)

array([[0.50768096, 0.49231904],
       [0.94638767, 0.05361233],
       [0.91750467, 0.08249533],
       ...,
       [0.76663527, 0.23336473],
       [0.97142281, 0.02857719],
       [0.69625458, 0.30374542]])

In [49]:
clf_logistic.score(X_test, y_test)

0.8202861952861953

In [50]:
preds = clf_logistic.predict_proba(X_test)
preds_df = pd.DataFrame(preds[:,1], columns = ['probability_of_default'])
preds_df['loan_status'] = preds_df['probability_of_default'].apply(lambda x: 1 if x > 0.5 else 0)

In [54]:
preds_df.sample(10)

Unnamed: 0,probability_of_default,loan_status
8884,0.015702,0
2381,0.146278,0
3866,0.533549,1
3372,0.091381,0
8870,0.647748,1
7648,0.259917,0
1583,0.073593,0
8426,0.217693,0
227,0.207546,0
2788,0.358385,0


In [56]:
from sklearn.metrics import classification_report

print(classification_report(y_test, preds_df['loan_status'], target_names=['No Default', 'Default']))

              precision    recall  f1-score   support

  No Default       0.84      0.95      0.89      7486
     Default       0.65      0.33      0.44      2018

    accuracy                           0.82      9504
   macro avg       0.75      0.64      0.67      9504
weighted avg       0.80      0.82      0.80      9504

