In [1]:
from src_draft.utils import LOW_IMP_FEATURES, ZERO_IMP_FEATURES
import shared.ml_config_core as ml_config_core
import pandas as pd
from shared.ml_config_core import ModelConfigsCollection
from shared.ml_config_runner import run_tuning_for_configs_collection

from Draft import feature_builder_v2

import importlib

VERBOSE = True
importlib.reload(feature_builder_v2)
features_matrix = feature_builder_v2.load_datasets_and_prepare_features(
    drop_meta_data=True, ds_type=feature_builder_v2.DatasetType.BASE
)
display("Distribution of Samples")
value_counts = features_matrix["TARGET"].value_counts()
df_value_counts = pd.DataFrame(
    {"Total Rows": value_counts, "Proportion": value_counts / len(features_matrix)}
)
df_value_counts.round(2)
transformers = [
    # No feature engineering tuning for for rejected/accepted df
    # ml_config_core.AcceptedRejectedDropAllButEmployment,  # Only used for testing
]

baseline_only_credit_ratings_config = {
    # Only uses  ExtScore1/2/3
    "Baseline_Only_CreditRatings": ml_config_core.ModelPipelineConfig(
        model_config=ml_config_core.LGBMTuneAUC(),
        transformer_config=ml_config_core.PipelineTransformerConfig(transformers),
        data_loader_params={
            "drop_meta_data": True,
            "ds_type": feature_builder_v2.DatasetType.BASE_ONLY_CREDIT_RATINGS,
        },
    )
}

baseline_only_base_feats = {
    # Only includes base table (i.e. no aggregations from client history)
    "LGBM_AUC_Base_Features": ml_config_core.ModelPipelineConfig(
        model_config=ml_config_core.LGBMTuneAUC(),
        transformer_config=ml_config_core.PipelineTransformerConfig(transformers),
        data_loader_params={
            "drop_meta_data": True,
            "ds_type": feature_builder_v2.DatasetType.BASE,
        },
    ),
}

weighted_log_loss_config = {
    # Tuned on score combined from logloss and F1 score
    "LGBM_Weighted_LogLoss": ml_config_core.ModelPipelineConfig(
        model_config=ml_config_core.LGBMTuneWeightedLogLossF1(),
        transformer_config=ml_config_core.PipelineTransformerConfig(transformers),
        data_loader_params={
            "drop_meta_data": True,
            "drop_cols_post_proc": ZERO_IMP_FEATURES + LOW_IMP_FEATURES,
        },
    ),
}

tune_auc_config: ModelConfigsCollection = {
    "LGBM_AUC": ml_config_core.ModelPipelineConfig(
        model_config=ml_config_core.LGBMTuneAUC(),
        transformer_config=ml_config_core.PipelineTransformerConfig(transformers),
        data_loader_params={
            "drop_meta_data": True,
            "ds_type": feature_builder_v2.DatasetType.FULL,
            "drop_cols_post_proc": ZERO_IMP_FEATURES + LOW_IMP_FEATURES,
        },
    ),
}

dart_tune_auc_config: ModelConfigsCollection = {
    "LGBM_Dart_AUC_NEW": ml_config_core.ModelPipelineConfig(
        model_config=ml_config_core.LGBMDartTuneAUC(),
        transformer_config=ml_config_core.PipelineTransformerConfig(transformers),
        data_loader_params={
            "drop_meta_data": True,
            "ds_type": feature_builder_v2.DatasetType.FULL,
            "drop_cols_post_proc": ZERO_IMP_FEATURES + LOW_IMP_FEATURES,
        },
    ),
}

tune_auc_all_features_config: ModelConfigsCollection = {
    "LGBM_AUC_All_Features": ml_config_core.ModelPipelineConfig(
        model_config=ml_config_core.LGBMTuneAUC(),
        transformer_config=ml_config_core.PipelineTransformerConfig(transformers),
        data_loader_params={
            "drop_meta_data": True,
            "ds_type": feature_builder_v2.DatasetType.FULL,
            # "drop_cols_post_proc": ZERO_IMP_FEATURES + LOW_IMP_FEATURES
        },
    ),
}

configs: ModelConfigsCollection = {
    # **baseline_only_credit_ratings_config,
    # **baseline_only_base_feats,
    # **weighted_log_loss_config,
    # **tune_auc_config,
    **dart_tune_auc_config,
    # **tune_auc_all_features_config
}

Full DS size: 307511


'Distribution of Samples'

In [None]:
tuning_results = run_tuning_for_configs_collection(
    configs, feature_builder_v2.load_datasets_and_prepare_features
)

In [13]:
for t, v in tuning_results.items():
    display(t)
    display(v.hyper_param_all_cv_results)

'LGBM_Dart_AUC_NEW'

Unnamed: 0,Trial Number,Mean Test Score (AUC),mean_test_score,mean_train_score,std_test_score,macro_f1,micro_f1,f1_target1,precision_target1,recall_target1,log_loss,pr_auc,mean_fold_time,Parameters
113,113,0.776541,0.776541,0.854625,0.001461,0.568715,0.744595,0.293298,0.188829,0.656516,0.521725,0.266816,14.861349,"{'model__boosting_type': 'dart', 'model__objec..."
59,59,0.776169,0.776169,0.863464,0.001774,0.571076,0.748920,0.294886,0.190670,0.650352,0.515669,0.265697,17.182891,"{'model__boosting_type': 'dart', 'model__objec..."
115,115,0.776133,0.776133,0.837253,0.001353,0.563986,0.735886,0.290214,0.185310,0.668842,0.533418,0.265379,18.331637,"{'model__boosting_type': 'dart', 'model__objec..."
103,103,0.776098,0.776098,0.850141,0.001724,0.567537,0.742464,0.292491,0.187923,0.659416,0.524208,0.266197,35.797750,"{'model__boosting_type': 'dart', 'model__objec..."
119,119,0.776070,0.776070,0.833643,0.001436,0.562364,0.733788,0.288463,0.183916,0.668439,0.535529,0.265608,18.596235,"{'model__boosting_type': 'dart', 'model__objec..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,68,0.728068,0.728068,0.738853,0.002483,0.522548,0.677790,0.250297,0.154093,0.666264,0.614381,0.201428,1.399441,"{'model__boosting_type': 'rf', 'model__objecti..."
2,2,0.726644,0.726644,0.735570,0.002767,0.478969,0.919271,0.000000,0.000000,0.000000,0.260398,0.204315,1.246094,"{'model__boosting_type': 'rf', 'model__objecti..."
45,45,0.725722,0.725722,0.737205,0.001131,0.520557,0.675839,0.247704,0.152405,0.661067,0.613808,0.194422,1.432226,"{'model__boosting_type': 'rf', 'model__objecti..."
9,9,0.725698,0.725698,0.735554,0.002000,0.525812,0.914384,0.096562,0.325921,0.056677,0.260996,0.197951,2.336899,"{'model__boosting_type': 'rf', 'model__objecti..."


In [5]:
tuning_results.keys()

dict_keys(['LGBM_Dart_AUC_NEW'])

In [6]:
# Results:

# Only main dataset:
# - top res      : 0.753
# - full + gbdt  : 0.765
# - full + dart  : 0.766

In [None]:
from Draft.feature_builder_v2 import DatasetType

importlib.reload(feature_builder_v2)
features_matrix_full = feature_builder_v2.load_datasets_and_prepare_features(
    drop_meta_data=True, ds_type=DatasetType.FULL
)

In [8]:
features_matrix_full

Unnamed: 0,TARGET,NameContractType,CodeGender,FlagOwnCar,FlagOwnRealty,CntChildren,AmtIncomeTotal,AmtCredit,AmtAnnuity,AmtGoodsPrice,...,PrevCodeRejectReasonHcCount,PrevCodeRejectReasonLimitCount,PrevCodeRejectReasonScoCount,PrevCodeRejectReasonScofrCount,PrevCodeRejectReasonSystemCount,PrevCodeRejectReasonVerifCount,PrevCodeRejectReasonXapCount,PrevCodeRejectReasonXnaCount,PrevLastLoanNflagInsuredOnApproval,PrevAvgNflagInsuredOnApproval
0,1,Cash loans,M,False,True,0,202500.0,406597.5,24700.5,351000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
1,0,Cash loans,F,False,False,0,270000.0,1293502.5,35698.5,1129500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.666667
2,0,Revolving loans,M,True,True,0,67500.0,135000.0,6750.0,135000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
3,0,Cash loans,F,False,True,0,135000.0,312682.5,29686.5,297000.0,...,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,,0.000000
4,0,Cash loans,M,False,True,0,121500.0,513000.0,21865.5,513000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,Cash loans,M,False,False,0,157500.0,254700.0,27558.0,225000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
307507,0,Cash loans,F,False,True,0,72000.0,269550.0,12001.5,225000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.000000
307508,0,Cash loans,F,False,True,0,153000.0,677664.0,29979.0,585000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.500000
307509,1,Cash loans,F,False,True,0,171000.0,370107.0,20205.0,319500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.500000


In [None]:
from Draft.feature_builder_v2 import DatasetType

importlib.reload(feature_builder_v2)
features_matrix_full = feature_builder_v2.load_datasets_and_prepare_features(
    drop_meta_data=True, ds_type=DatasetType.FULL
)
features_matrix_base = feature_builder_v2.load_datasets_and_prepare_features(
    drop_meta_data=True, ds_type=DatasetType.BASE
)

In [10]:
features_matrix_full

Unnamed: 0,TARGET,NameContractType,CodeGender,FlagOwnCar,FlagOwnRealty,CntChildren,AmtIncomeTotal,AmtCredit,AmtAnnuity,AmtGoodsPrice,...,PrevCodeRejectReasonHcCount,PrevCodeRejectReasonLimitCount,PrevCodeRejectReasonScoCount,PrevCodeRejectReasonScofrCount,PrevCodeRejectReasonSystemCount,PrevCodeRejectReasonVerifCount,PrevCodeRejectReasonXapCount,PrevCodeRejectReasonXnaCount,PrevLastLoanNflagInsuredOnApproval,PrevAvgNflagInsuredOnApproval
0,1,Cash loans,M,False,True,0,202500.0,406597.5,24700.5,351000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
1,0,Cash loans,F,False,False,0,270000.0,1293502.5,35698.5,1129500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.666667
2,0,Revolving loans,M,True,True,0,67500.0,135000.0,6750.0,135000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
3,0,Cash loans,F,False,True,0,135000.0,312682.5,29686.5,297000.0,...,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,,0.000000
4,0,Cash loans,M,False,True,0,121500.0,513000.0,21865.5,513000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,Cash loans,M,False,False,0,157500.0,254700.0,27558.0,225000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
307507,0,Cash loans,F,False,True,0,72000.0,269550.0,12001.5,225000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.000000
307508,0,Cash loans,F,False,True,0,153000.0,677664.0,29979.0,585000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.500000
307509,1,Cash loans,F,False,True,0,171000.0,370107.0,20205.0,319500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.500000
