In [10]:
%reload_kedro

In [14]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, cross_val_predict

import matplotlib.pyplot as plt

import mlflow

In [11]:
train_df = catalog.load("full_df_train")
#test_df = catalog.load("full_df_test")

In [16]:
def split_data(df, train_size=0.8, test_size=0.2, random_state=42):
    features = [f for f in df.columns if f not in ["SK_ID_CURR", "TARGET"]]
    X = df[features] # Features
    y = df["TARGET"]  # Target variable
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train, model):
    model.fit(X_train, y_train)
    return model

def evaluate_model(X_test, y_test, model):
    y_pred = model.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(y_test, y_pred)
    return roc_auc

In [27]:
import mlflow
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

def train_and_evaluate_model(X, y, model, model_name, params):
    if mlflow.active_run():
        mlflow.end_run()
    # Start an MLflow run
    with mlflow.start_run(run_name=model_name):
        # Fitting model
        model = model.fit(X, y)
        y_probs = model.predict_proba(X)
        y_scores = y_probs[:, 1]  # get the probability of class 1
        # Compute ROC-AUC
        print("Calculating ROC-AUC score...")
        roc_auc = roc_auc_score(y, y_scores)
        
        # Log parameters, metrics, and model
        print("Logging parameters, metrics, and model with MLflow...")
        mlflow.log_params(params)
        mlflow.log_metric("ROC_AUC", roc_auc)
        mlflow.sklearn.log_model(model, "model")

        # Plot ROC curve and save as an artifact
        print("Plotting ROC curve...")
        fpr, tpr, _ = roc_curve(y, y_scores)
        fig = plt.figure(figsize=(6,6))
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        roc_curve_path = "roc_curve.png"
        plt.savefig(roc_curve_path)
        plt.close(fig)
        print("Saving ROC curve as an artifact...")
        mlflow.log_artifact(roc_curve_path)

        print("Training and evaluation completed.")
        return roc_auc


In [23]:
mlflow.set_tracking_uri("mlruns")


2024/05/10 16:51:11 INFO mlflow.tracking.fluent: Experiment with name 'OCP7_modelisation_090524_sample5' does not exist. Creating a new experiment.


[1m<[0m[1;95mExperiment:[0m[39m [0m[33martifact_location[0m[39m=[0m[32m'file:///C:/Users/9509298u/Documents/GitHub/OC_Projects/ocp7-scoring-model-cloud/notebooks/mlruns/218322275285935902'[0m[39m, [0m[33mcreation_time[0m[39m=[0m[1;36m1715352671324[0m[39m, [0m[33mexperiment_id[0m[39m=[0m[32m'218322275285935902'[0m[39m, [0m[33mlast_update_time[0m[39m=[0m[1;36m1715352671324[0m[39m, [0m[33mlifecycle_stage[0m[39m=[0m[32m'active'[0m[39m, [0m[33mname[0m[39m=[0m[32m'OCP7_modelisation_090524_sample5'[0m[39m, [0m[33mtags[0m[39m=[0m[1;39m{[0m[1;39m}[0m[1m>[0m

In [29]:
mlflow.set_experiment("OCP7_modelisation_090524_sample6")
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# Define your dataset
X_train, X_test, y_train, y_test = split_data(train_df.sample(frac=0.5))

# Dummy classifier

params_dummy = {'strategy': 'stratified'}
model_dummy = DummyClassifier(**params_dummy)
train_and_evaluate_model(X_train, y_train, model_dummy, "Dummy Classifier", params_dummy)

# Random Forest
params_rf = {'n_estimators': 50, 'max_depth': 10}
model_rf = RandomForestClassifier(**params_rf)
train_and_evaluate_model(X_train, y_train, model_rf, "Random Forest", params_rf)

# Logistic Regression
params_lr = {'C': 0.1}
model_lr = LogisticRegression(**params_lr)
train_and_evaluate_model(X_train, y_train, model_lr, "Logistic Regression", params_lr)

# LightGBM

params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

model_lgb = lgb.LGBMClassifier(**params_lgb)
train_and_evaluate_model(X=X_train, y=y_train, model=model_lgb, model_name="LightGBM", params=params_lgb)

Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
Training and evaluation completed.
Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
Training and evaluation completed.
Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
Training and evaluation completed.
[LightGBM] [Info] Number of positive: 9920, number of negative: 113083
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.122015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44376
[LightGBM] [Info] Number of data points in the train set: 123003, number of used features: 454
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080648 -> initscore=-2.433569
[LightGBM] [Info] Start training

[1;36m0.8242152620270637[0m

On trouve que le modèle LightGBM est le plus performant. On va donc essayer de l'améliorer en utilisant une validation croisée.

In [38]:
from sklearn.model_selection import GridSearchCV

In [48]:
# Create a training and testing dataset
X_train, X_test, y_train, y_test = split_data(train_df, train_size=0.2, test_size=0.8)

In [49]:
params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'n_estimators':10000,
    'verbose': 1
}

param_grid = {
    'num_leaves': [31, 63, 127],  # Reduced from finer increments
    'learning_rate': [0.01, 0.05, 0.1],  # Focus on commonly effective rates
    'min_child_samples': [20, 40],  # Only two options based on prior knowledge
    'reg_alpha': [0, 0.1],  # Limited to very few options
    'reg_lambda': [0, 0.1]  # Limited to very few options
}

model_lgb = lgb.LGBMClassifier(**params_lgb)

In [50]:
grid_search = GridSearchCV(estimator=model_lgb, param_grid=param_grid, cv=5, scoring='roc_auc', verbose=1)

In [51]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[LightGBM] [Info] Number of positive: 3987, number of negative: 45213
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41875
[LightGBM] [Info] Number of data points in the train set: 49200, number of used features: 441
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081037 -> initscore=-2.428346
[LightGBM] [Info] Start training from score -2.428346
[LightGBM] [Info] Number of positive: 3988, number of negative: 45213
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41919
[LightGBM] [Info] Number of data points in the train set: 49201, number of used features: 442
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081055 -> initscore=-2.428095
[L

In [None]:
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

In [None]:
best_model = grid_search.best_estimator_
validation_roc_auc = best_model.score(X_test, y_test)
print("Validation accuracy: {:.2f}".format(validation_roc_auc))

In [52]:
# Perform random search instead
from sklearn.model_selection import RandomizedSearchCV

In [53]:
param_dist = {
    'num_leaves': [31, 62, 127, 255],
    'max_depth': [-1, 5, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [200, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0, 0.1]
}


In [54]:
random_search = RandomizedSearchCV(estimator=model_lgb, param_distributions=param_grid, n_iter=10, cv=5, scoring='roc_auc', verbose=1)

In [55]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 3987, number of negative: 45213
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41875
[LightGBM] [Info] Number of data points in the train set: 49200, number of used features: 441
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081037 -> initscore=-2.428346
[LightGBM] [Info] Start training from score -2.428346
[LightGBM] [Info] Number of positive: 3988, number of negative: 45213
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41919
[LightGBM] [Info] Number of data points in the train set: 49201, number of used features: 442
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081055 -> initscore=-2.428095
[Li

Exception ignored on calling ctypes callback function: <function _log_callback at 0x000001D7838F8AF0>
Traceback (most recent call last):
  File "C:\Users\9509298u\AppData\Local\miniconda3\envs\ocp7-scoring-model-cloud\lib\site-packages\lightgbm\basic.py", line 224, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 3987, number of negative: 45214
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063361 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41591
[LightGBM] [Info] Number of data points in the train set: 49201, number of used features: 442
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081035 -> initscore=-2.428368
[LightGBM] [Info] Start training from score -2.428368
[LightGBM] [Info] Number of positive: 3987, number of negative: 45213
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064301 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41846
[LightGBM] [Info] Number of data points in the train set: 49200, number of used features: 436
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081037 -> initscore=-2.428346
[LightGBM]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000001D7A7BA81F0>>
Traceback (most recent call last):
  File "C:\Users\9509298u\AppData\Local\miniconda3\envs\ocp7-scoring-model-cloud\lib\site-packages\ipykernel\ipkernel.py", line 788, in _clean_thread_parent_frames
    if phase != "start":
KeyboardInterrupt: 


In [None]:
print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score: {:.2f}".format(random_search.best_score_))
best_model = random_search.best_estimator_
validation_roc_auc = best_model.score(X_test, y_test)
print("Validation accuracy: {:.2f}".format(validation_roc_auc))

In [None]:
clf = lgb.LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

In [30]:
# Create a training and testing dataset
X_train, X_test, y_train, y_test = split_data(train_df)
train_set = lgb.Dataset(data = X_train, label = y_train)
test_set = lgb.Dataset(data = X_test, label = y_test)

In [31]:
# Get default hyperparameters
model = lgb.LGBMClassifier()
default_params = model.get_params()

In [32]:
default_params


[1m{[0m
    [32m'boosting_type'[0m: [32m'gbdt'[0m,
    [32m'class_weight'[0m: [3;35mNone[0m,
    [32m'colsample_bytree'[0m: [1;36m1.0[0m,
    [32m'importance_type'[0m: [32m'split'[0m,
    [32m'learning_rate'[0m: [1;36m0.1[0m,
    [32m'max_depth'[0m: [1;36m-1[0m,
    [32m'min_child_samples'[0m: [1;36m20[0m,
    [32m'min_child_weight'[0m: [1;36m0.001[0m,
    [32m'min_split_gain'[0m: [1;36m0.0[0m,
    [32m'n_estimators'[0m: [1;36m100[0m,
    [32m'n_jobs'[0m: [3;35mNone[0m,
    [32m'num_leaves'[0m: [1;36m31[0m,
    [32m'objective'[0m: [3;35mNone[0m,
    [32m'random_state'[0m: [3;35mNone[0m,
    [32m'reg_alpha'[0m: [1;36m0.0[0m,
    [32m'reg_lambda'[0m: [1;36m0.0[0m,
    [32m'subsample'[0m: [1;36m1.0[0m,
    [32m'subsample_for_bin'[0m: [1;36m200000[0m,
    [32m'subsample_freq'[0m: [1;36m0[0m
[1m}[0m

In [33]:
# Remove the number of estimators because we set this to 10000 in the cv call
del default_params['n_estimators']

In [36]:
# Cross validation with early stopping
N_FOLDS = 5
MAX_EVALS = 5
cv_results = lgb.cv(default_params, train_set, num_boost_round = 10000, 
                    metrics = 'auc', nfold = N_FOLDS, seed = 42)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.206111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45280
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 456
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.176241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45280
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 456
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.209469 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45280
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 456
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.219248 seconds.
You can set `force_c

In [None]:
def kfold_lightgbm(df, num_folds):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    # Cross validation model
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = lgb.LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    return feature_importance_df