In [1]:
%reload_kedro

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.dummy import DummyClassifier
import lightgbm as lgb

import matplotlib.pyplot as plt

import mlflow

# Métrique d'erreur business

On propose de découper le problème posé en deux questions qui peuvent être résolues séparément : 

1) la question de l'évaluation de la probabilité de défaut des clients,
2) la question de l'attribution d'un crédit sur la base de cette probabilité de défaut estimée.

L'évaluation de la probabilité de défaut des clients peut être réalisée à l'aide d'un algorithme de classification. Ensuite il nous faut trouver une règle de décision opérationnelle sur la base de cette probabilité.

Soit $P(D) = p$, la probabilité de défaut du client, $M$ le montant du crédit demandé et $i$ le taux d'intérêt appliqué par l'entreprise. On a $\Pi_D(i) = -M $ le profit réalisé en cas de défaut du client, et $\Pi_\bar{D}(i) = iM$ le profit réalisé en cas de remboursement du prêt. Le profit espéré de la firme en fonction de $p$ et $i$ peut donc s'écrire : $$E(\Pi|i,p) =(1-p)iM - piM $$

On constate que cette fonction est croissante de $i$, décroissante de $p$ et qu'elle s'annule lorsque $$p=\frac{i}{1+i}$$

Cette relation nous permet de définir un seuil $\bar{p}(i)$ sur la probabilité de défaut du client pour l'attribution d'un prêt à un taux d'intérêt $i$ donné. 

Par exemple, si le taux d'intérêt est de 10%, $i=0.1$, $\bar{p}(0.1) = 0.09$. On proposera l'octroi du prêt si la probabilité de défaut prédite par le modèle est inférieure à 0.09, soit 9%. 

On pourrait également renverser la relation et proposer d'appliquer un taux d'intérêt plus élevé pour prendre en compte le risque du client : $$\bar{i}(p) = \frac{p}{1-p}$$

In [3]:
train_df = catalog.load("full_df_train")
#test_df = catalog.load("full_df_test")

In [4]:
def split_data(df, train_size=0.8, test_size=0.2, random_state=42):
    features = [f for f in df.columns if f not in ["SK_ID_CURR", "TARGET"]]
    X = df[features] # Features
    y = df["TARGET"]  # Target variable
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train, model):
    model.fit(X_train, y_train)
    return model

def evaluate_model(X_test, y_test, model):
    y_pred = model.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(y_test, y_pred)
    return roc_auc

In [5]:
def train_and_evaluate_model(X, y, model, model_name, params):
    if mlflow.active_run():
        mlflow.end_run()
    # Start an MLflow run
    with mlflow.start_run(run_name=model_name):
        # Fitting model
        model = model.fit(X, y)
        y_probs = model.predict_proba(X)
        y_scores = y_probs[:, 1]  # get the probability of class 1
        # Compute ROC-AUC
        print("Calculating ROC-AUC score...")
        roc_auc = roc_auc_score(y, y_scores)
        
        # Log parameters, metrics, and model
        print("Logging parameters, metrics, and model with MLflow...")
        mlflow.log_params(params)
        mlflow.log_metric("ROC_AUC", roc_auc)
        mlflow.sklearn.log_model(model, "model")

        # Plot ROC curve and save as an artifact
        print("Plotting ROC curve...")
        fpr, tpr, _ = roc_curve(y, y_scores)
        fig = plt.figure(figsize=(6,6))
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        roc_curve_path = "roc_curve.png"
        plt.savefig(roc_curve_path)
        plt.close(fig)
        print("Saving ROC curve as an artifact...")
        mlflow.log_artifact(roc_curve_path)

        print("Training and evaluation completed.")
        return roc_auc


In [6]:
mlflow.set_tracking_uri("mlruns")


In [29]:
mlflow.set_experiment("OCP7_modelisation_030624_1")

# Define your dataset
X_train, X_test, y_train, y_test = split_data(train_df.sample(frac=0.5))

# Dummy classifier

params_dummy = {'strategy': 'stratified'}
model_dummy = DummyClassifier(**params_dummy)
train_and_evaluate_model(X_train, y_train, model_dummy, "Dummy Classifier", params_dummy)

# Random Forest
params_rf = {'n_estimators': 50, 'max_depth': 10}
model_rf = RandomForestClassifier(**params_rf)
train_and_evaluate_model(X_train, y_train, model_rf, "Random Forest", params_rf)

# Logistic Regression
params_lr = {'C': 0.1}
model_lr = LogisticRegression(**params_lr)
train_and_evaluate_model(X_train, y_train, model_lr, "Logistic Regression", params_lr)

# LightGBM
params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

model_lgb = lgb.LGBMClassifier(**params_lgb)
train_and_evaluate_model(X=X_train, y=y_train, model=model_lgb, model_name="LightGBM", params=params_lgb)

Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
Training and evaluation completed.
Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
Training and evaluation completed.
Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
Training and evaluation completed.
[LightGBM] [Info] Number of positive: 9920, number of negative: 113083
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.122015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44376
[LightGBM] [Info] Number of data points in the train set: 123003, number of used features: 454
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080648 -> initscore=-2.433569
[LightGBM] [Info] Start training

[1;36m0.8242152620270637[0m

On trouve que le modèle LightGBM est le plus performant. On va donc essayer de l'améliorer en utilisant une validation croisée.

In [7]:
# LightGBM
mlflow.set_experiment("OCP7_modelisation_030624_1")

# Define your dataset
X_train, X_test, y_train, y_test = split_data(train_df.sample(frac=0.5))
params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

model_lgb = lgb.LGBMClassifier(**params_lgb)
train_and_evaluate_model(X=X_train, y=y_train, model=model_lgb, model_name="LightGBM", params=params_lgb)

2024/06/03 09:00:45 INFO mlflow.tracking.fluent: Experiment with name 'OCP7_modelisation_030624_1' does not exist. Creating a new experiment.


[LightGBM] [Info] Number of positive: 9967, number of negative: 113036
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.097384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44426
[LightGBM] [Info] Number of data points in the train set: 123003, number of used features: 452
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081031 -> initscore=-2.428427
[LightGBM] [Info] Start training from score -2.428427
Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
Training and evaluation completed.


[1;36m0.823514893816781[0m

In [7]:
# Create a training and testing dataset
X_train, X_test, y_train, y_test = split_data(train_df, train_size=0.3, test_size=0.7)

In [10]:
params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'n_estimators':10000,
    'verbose': -1
}

param_grid = {
    'num_leaves': [34, 65], 
    'learning_rate': [0.02, 0.05],  
    'max_depth': [8, 12]
}

model_lgb = lgb.LGBMClassifier(**params_lgb)

In [17]:
grid_search = GridSearchCV(estimator=model_lgb, param_grid=param_grid, cv=2, scoring='roc_auc', verbose=10)

In [18]:
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2; 1/8] START learning_rate=0.02, max_depth=8, num_leaves=34..............
[CV 1/2; 1/8] END learning_rate=0.02, max_depth=8, num_leaves=34;, score=0.733 total time= 1.5min
[CV 2/2; 1/8] START learning_rate=0.02, max_depth=8, num_leaves=34..............
[CV 2/2; 1/8] END learning_rate=0.02, max_depth=8, num_leaves=34;, score=0.734 total time= 1.5min
[CV 1/2; 2/8] START learning_rate=0.02, max_depth=8, num_leaves=65..............
[CV 1/2; 2/8] END learning_rate=0.02, max_depth=8, num_leaves=65;, score=0.726 total time= 2.2min
[CV 2/2; 2/8] START learning_rate=0.02, max_depth=8, num_leaves=65..............
[CV 2/2; 2/8] END learning_rate=0.02, max_depth=8, num_leaves=65;, score=0.730 total time= 2.1min
[CV 1/2; 3/8] START learning_rate=0.02, max_depth=12, num_leaves=34.............
[CV 1/2; 3/8] END learning_rate=0.02, max_depth=12, num_leaves=34;, score=0.739 total time= 1.4min
[CV 2/2; 3/8] START learning_rate=0.02, max_

In [56]:
X_train, X_test, y_train, y_test = split_data(train_df)
train_set = lgb.Dataset(X_train, y_train)
val_set = lgb.Dataset(X_test, y_test)

In [61]:
# Get default hyperparameters
model = lgb.LGBMClassifier()
default_params = model.get_params()

In [62]:
# Cross validation with early stopping
cv_results = lgb.cv(default_params, train_set, metrics='auc', 
                    nfold=5, seed=42, callbacks=[lgb.early_stopping(stopping_rounds=100)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.146458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45280
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 456
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.167373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45280
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 456
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.163618 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45280
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 456
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.154080 seconds.
You can set `force_c

In [65]:
cv_results


[1m{[0m
    [32m'valid auc-mean'[0m: [1m[[0m
        [1;36m0.7030554191648362[0m,
        [1;36m0.7103788606383553[0m,
        [1;36m0.7139725044585108[0m,
        [1;36m0.7170087269043448[0m,
        [1;36m0.7202398504680867[0m,
        [1;36m0.7226871865193297[0m,
        [1;36m0.7242932003846297[0m,
        [1;36m0.7263610369208934[0m,
        [1;36m0.7281280035664908[0m,
        [1;36m0.7302422187738682[0m,
        [1;36m0.7319578035366889[0m,
        [1;36m0.7339613560261296[0m,
        [1;36m0.7355943872884947[0m,
        [1;36m0.7374468396920524[0m,
        [1;36m0.7388030922900846[0m,
        [1;36m0.740776291332296[0m,
        [1;36m0.7422195243827643[0m,
        [1;36m0.7439724479172949[0m,
        [1;36m0.745209770740302[0m,
        [1;36m0.7470515605987046[0m,
        [1;36m0.7482211934467908[0m,
        [1;36m0.7494348413166397[0m,
        [1;36m0.7511464164749384[0m,
        [1;36m0.7520760963390046[0m,
        [1;36

In [66]:
print('The maximum validation ROC AUC was: {:.5f} with a standard deviation of {:.5f}.'.format(cv_results['valid auc-mean'][-1], cv_results['valid auc-stdv'][-1]))
print('The optimal number of boosting rounds (estimators) was {}.'.format(len(cv_results['valid auc-mean'])))

The maximum validation ROC AUC was: 0.76980 with a standard deviation of 0.00325.
The optimal number of boosting rounds (estimators) was 100.


In [67]:
# Optimal number of esimators found in cv
model.n_estimators = len(cv_results['valid auc-mean'])

# Train and make predicions with model
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)[:, 1]
baseline_auc = roc_auc_score(y_test, preds)

print('The baseline model scores {:.5f} ROC AUC on the test set.'.format(baseline_auc))

[LightGBM] [Info] Number of positive: 19804, number of negative: 226201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.178898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45280
[LightGBM] [Info] Number of data points in the train set: 246005, number of used features: 456
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080502 -> initscore=-2.435540
[LightGBM] [Info] Start training from score -2.435540
The baseline model scores 0.77386 ROC AUC on the test set.


In [45]:
# test_df = catalog.load("full_df_test")
# pd.DataFrame({"SK_ID_CURR": test_df["SK_ID_CURR"], "TARGET": y_pred_test}).to_csv("../data/07_model_output/kaggle_output_df_200524.csv", index=False)