In [116]:
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Render DataFrames without truncating columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Route the pandas .plot accessor through plotly.
pd.options.plotting.backend = "plotly"

In [103]:
# Load the cleaned credit-risk CSV from the data/ folder one level up,
# then preview its first rows.
file_dir = Path("..", "data", "credit_risk_dataset_cleaned.csv")
df = pd.read_csv(file_dir)
df.head()


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [104]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(31679, 12)

In [105]:
# Per-column dtype and non-null count, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31679 entries, 0 to 31678
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  31679 non-null  int64  
 1   person_income               31679 non-null  int64  
 2   person_home_ownership       31679 non-null  object 
 3   person_emp_length           31679 non-null  float64
 4   loan_intent                 31679 non-null  object 
 5   loan_grade                  31679 non-null  object 
 6   loan_amnt                   31679 non-null  int64  
 7   loan_int_rate               28632 non-null  float64
 8   loan_status                 31679 non-null  int64  
 9   loan_percent_income         31679 non-null  float64
 10  cb_person_default_on_file   31679 non-null  object 
 11  cb_person_cred_hist_length  31679 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 2.9+ MB


In [106]:
# Number of missing values in each column.
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3047
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [107]:
# Keep only the columns that contain at least one missing value,
# then show how many values are missing in each of them.
null_columns = [name for name in df.columns if df[name].isnull().any()]
df[null_columns].isnull().sum()

loan_int_rate    3047
dtype: int64

In [108]:
# Mean interest rate for every (home ownership, loan status) pair,
# displayed rounded to two decimals.
mean_rate_by_group = pd.crosstab(
    index=df['person_home_ownership'],
    columns=df['loan_status'],
    values=df['loan_int_rate'],
    aggfunc='mean',
)
mean_rate_by_group.round(2)

loan_status,0,1
person_home_ownership,Unnamed: 1_level_1,Unnamed: 2_level_1
MORTGAGE,10.09,13.53
OTHER,11.41,13.56
OWN,10.86,12.46
RENT,10.76,13.01


On va remplir les valeurs manquantes de `loan_int_rate` par la moyenne de la colonne.

In [109]:
# Replace missing interest rates with the mean of the column.
# NOTE(review): the mean is computed on the whole dataset, i.e. before the
# train/test split done later in the notebook, so test rows contribute to the
# imputed value. Consider fitting the imputation on the training split only.
loan_int_rate_mean = df['loan_int_rate'].mean()
df['loan_int_rate'] = df['loan_int_rate'].fillna(loan_int_rate_mean)

In [110]:
# Target first, then the predictors: y stays a one-column DataFrame and
# X is every remaining column.
y = df.loc[:, ['loan_status']]
X = df.drop('loan_status', axis=1)

In [111]:
# Row count per target class, to check how balanced the two classes are.
y.value_counts()

loan_status
0              24854
1               6825
Name: count, dtype: int64

Les deux classes sont déséquilibrées. Il faut donc les équilibrer. Nous allons utiliser la méthode SMOTE (Synthetic Minority Oversampling TEchnique).

In [112]:
# Split the features by dtype. Both results are DataFrames (sub-frames of X),
# not lists of names.
numerical_cols = X.select_dtypes(include=["int64", "float64"])
categorical_cols = X.select_dtypes(include=["object"])

# loan_grade is handled as ordinal; every other categorical column is nominal.
ordinal_cols = ['loan_grade']
nominal_cols = list(categorical_cols.drop(columns=ordinal_cols, errors="ignore").columns)

for label, cols in (("Ordinal columns:", ordinal_cols), ("Nominal columns:", nominal_cols)):
    print(label, cols)

Ordinal columns: ['loan_grade']
Nominal columns: ['person_home_ownership', 'loan_intent', 'cb_person_default_on_file']


Il va falloir encoder les variables catégorielles. On va utiliser OrdinalEncoder pour la variable `loan_grade` (pour conserver l'ordre) et OneHotEncoder pour les autres.

In [113]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# ratio identical in both splits; the fixed seed makes the split reproducible.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [114]:
# Row count per target class within the training split.
y_train.value_counts()

loan_status
0              17398
1               4777
Name: count, dtype: int64

In [118]:
# One transformer per feature family.
numerical_pipeline = StandardScaler()

# Grades are listed explicitly so the encoder maps A -> 0, B -> 1, ..., G -> 6.
ordinal_pipeline = OrdinalEncoder(categories=[list("ABCDEFG")])

# Categories unseen during fit are encoded as an all-zero row instead of raising.
nominal_pipeline = OneHotEncoder(handle_unknown="ignore")

# (name, transformer, columns) triples routed by the ColumnTransformer.
feature_transformers = [
    ("num", numerical_pipeline, list(numerical_cols.columns)),
    ("ord", ordinal_pipeline, ordinal_cols),
    ("nom", nominal_pipeline, nominal_cols),
]

# Any column not listed above is passed through unchanged.
preprocessor = ColumnTransformer(transformers=feature_transformers, remainder="passthrough")

In [119]:
# Two candidate classifiers: a logistic regression with default settings as
# the baseline, and a random forest with a fixed seed for reproducibility.
clf_logistic = LogisticRegression() # baseline
clf_rf = RandomForestClassifier(random_state=42)

In [120]:
# Baseline pipeline: preprocessing -> SMOTE oversampling -> logistic regression.
# The imblearn pipeline runs the sampler only while fitting, so predictions
# are made on the original (non-resampled) rows.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor. A
# pipeline does not copy its steps on fit, so passing `preprocessor` directly
# would make every pipeline built from it share, and refit, one single object.
pipeline_baseline = ImbPipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("smote", SMOTE(random_state=42)),
    ("classifier", clf_logistic)
])

# y_train is a one-column DataFrame; flatten it to the 1-D array expected by fit.
pipeline_baseline.fit(X_train, y_train.values.ravel())

In [122]:
# Per-class precision / recall / F1 of the baseline on the held-out test split.
y_pred_baseline = pipeline_baseline.predict(X_test)
print(classification_report(y_test, y_pred_baseline))

              precision    recall  f1-score   support

           0       0.93      0.79      0.85      7456
           1       0.51      0.78      0.61      2048

    accuracy                           0.79      9504
   macro avg       0.72      0.79      0.73      9504
weighted avg       0.84      0.79      0.80      9504



In [123]:
# Random-forest pipeline: preprocessing -> SMOTE oversampling -> random forest.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor. A
# pipeline does not copy its steps on fit, so passing `preprocessor` directly
# would make this fit overwrite the fitted state of any other pipeline that
# holds the same object.
pipeline_rf = ImbPipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("smote", SMOTE(random_state=42)),
    ("classifier", clf_rf)
])

# y_train is a one-column DataFrame; flatten it to the 1-D array expected by fit.
pipeline_rf.fit(X_train, y_train.values.ravel())

In [124]:
# Per-class precision / recall / F1 of the random forest on the held-out test split.
y_pred_rf = pipeline_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96      7456
           1       0.95      0.73      0.82      2048

    accuracy                           0.93      9504
   macro avg       0.94      0.86      0.89      9504
weighted avg       0.93      0.93      0.93      9504



In [126]:
from sklearn.metrics import precision_recall_fscore_support

# Predicted class probabilities on the test split; column 1 is kept as the
# probability of class 1.
y_proba = pipeline_rf.predict_proba(X_test)
proba_default = y_proba[:, 1]

# Decision thresholds to compare, from 0.50 down to 0.30.
thresholds = [0.50, 0.45, 0.40, 0.35, 0.30]
results = []

metric_names = ("Precision", "Recall", "F1-Score")
for t in thresholds:
    # A row is predicted as class 1 when its probability reaches the threshold.
    y_pred_new = (proba_default >= t).astype(int)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred_new, average="binary", pos_label=1
    )

    # One record per threshold, metrics rounded to two decimals.
    row = {"Threshold": t}
    row.update(zip(metric_names, (round(score, 2) for score in (precision, recall, f1))))
    results.append(row)


threshold_df = pd.DataFrame(results)
threshold_df

Unnamed: 0,Threshold,Precision,Recall,F1-Score
0,0.5,0.94,0.73,0.82
1,0.45,0.91,0.74,0.82
2,0.4,0.85,0.76,0.8
3,0.35,0.78,0.78,0.78
4,0.3,0.71,0.81,0.75


In [127]:
# Order the rows by increasing threshold (ascending is the default).
threshold_df_sorted = threshold_df.sort_values("Threshold")

In [128]:
import plotly.express as px
import plotly.graph_objects as go

fig = go.Figure()

# One curve per metric, added in this order: (column, legend label, line style).
curve_specs = [
    ("Precision", "Précision", dict(color="red")),
    ("Recall", "Rappel", dict(color="green")),
    ("F1-Score", "F1-Score", dict(color="blue", dash="dash")),
]

for column, label, line_style in curve_specs:
    fig.add_trace(
        go.Scatter(
            x=threshold_df_sorted["Threshold"],
            y=threshold_df_sorted[column],
            mode="lines+markers",
            name=label,
            line=line_style,
        )
    )

# Layout: titles, axis formatting and hover behaviour.
layout_options = dict(
    title="Compromis Précision-Rappel en fonction du Seuil de Décision",
    xaxis_title="Seuil de Classification",
    yaxis_title="Score (0 à 1)",
    xaxis=dict(tickformat=".2f"),
    yaxis=dict(range=[0, 1.05]),
    hovermode="x unified",  # for a clear interactive display
)
fig.update_layout(**layout_options)

fig.show()