In [160]:
import pandas as pd
from pycaret.classification import ClassificationExperiment
from sklearn.metrics import log_loss

In [161]:
df = pd.read_csv('Train.csv').drop('uniqueid', axis=1)
df.shape

(23524, 12)

In [233]:
s = ClassificationExperiment()
s.setup(
    df,
    target="bank_account",
    session_id=42,
    preprocess=True,
    fix_imbalance=False,
    fix_imbalance_method="smote",
    normalize_method="minmax",
    n_jobs=-1,
    train_size=.8,
    fold=10,
    transformation=False,
)

s.add_metric(
    id="logloss",
    name="Log Loss",
    score_func=log_loss,
    target="pred_proba",
    greater_is_better=False,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,bank_account
2,Target type,Binary
3,Target mapping,"No: 0, Yes: 1"
4,Original data shape,"(23524, 12)"
5,Transformed data shape,"(23524, 38)"
6,Transformed train set shape,"(18819, 38)"
7,Transformed test set shape,"(4705, 38)"
8,Numeric features,3
9,Categorical features,8


Name                                                          Log Loss
Display Name                                                  Log Loss
Score Function       <pycaret.internal.metrics.EncodedDecodedLabels...
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [234]:
s.train_transformed.head()

Unnamed: 0,country_Kenya,country_Rwanda,country_Uganda,country_Tanzania,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,...,job_type_Farming and Fishing,job_type_Self employed,job_type_Remittance Dependent,job_type_No Income,job_type_Formally employed Private,job_type_Formally employed Government,job_type_Other Income,job_type_Government Dependent,job_type_Dont Know/Refuse to answer,bank_account
3535,1.0,0.0,0.0,0.0,2018.0,0.0,1.0,4.0,28.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10836,0.0,1.0,0.0,0.0,2016.0,0.0,0.0,6.0,63.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
23078,0.0,0.0,1.0,0.0,2018.0,0.0,1.0,5.0,34.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10569,0.0,1.0,0.0,0.0,2016.0,0.0,0.0,7.0,56.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9135,0.0,1.0,0.0,0.0,2016.0,0.0,1.0,7.0,35.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [235]:
# model training and selection
top_5_best = s.compare_models(exclude=['ridge', 'svm'], n_select=5, sort='Accuracy', budget_time=15)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8877,0.8556,0.8877,0.874,0.872,0.4193,0.4481,0.287,99.068
gbc,Gradient Boosting Classifier,0.8867,0.8623,0.8867,0.8729,0.8677,0.3925,0.4317,0.2831,0.301
xgboost,Extreme Gradient Boosting,0.8842,0.8466,0.8842,0.8693,0.8703,0.4164,0.4376,0.301,0.197
lr,Logistic Regression,0.8841,0.8481,0.8841,0.8688,0.8669,0.3934,0.4242,0.2928,0.748
ada,Ada Boost Classifier,0.8838,0.8489,0.8838,0.8683,0.8666,0.3924,0.4229,0.6691,0.15
lda,Linear Discriminant Analysis,0.8792,0.8398,0.8792,0.8642,0.8674,0.4106,0.4245,0.3239,0.069
knn,K Neighbors Classifier,0.8664,0.7331,0.8664,0.8415,0.8445,0.2869,0.3146,1.74,0.117
rf,Random Forest Classifier,0.8639,0.8023,0.8639,0.8493,0.8546,0.365,0.371,0.7298,0.231
nb,Naive Bayes,0.862,0.8277,0.862,0.8627,0.8623,0.4321,0.4323,1.3593,0.057
et,Extra Trees Classifier,0.8503,0.7568,0.8503,0.839,0.8439,0.3302,0.3325,1.8561,0.248


In [183]:
s.pull().sort_values(by='Log Loss')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Std,0.0053,0.0128,0.0053,0.0079,0.0069,0.0338,0.0332,0.0095
5,0.898,0.8859,0.898,0.8888,0.8826,0.4644,0.501,0.2642
8,0.8899,0.8849,0.8899,0.8767,0.8748,0.432,0.4602,0.27
7,0.895,0.8642,0.895,0.8851,0.878,0.4407,0.4811,0.2754
1,0.8895,0.8752,0.8895,0.8769,0.8716,0.4125,0.4505,0.2784
6,0.889,0.8601,0.889,0.8761,0.8708,0.4071,0.4456,0.2812
Mean,0.8884,0.8652,0.8884,0.8749,0.8704,0.4061,0.4429,0.2812
3,0.8827,0.8677,0.8827,0.8665,0.862,0.3637,0.4036,0.2838
0,0.8891,0.8524,0.8891,0.8769,0.8698,0.4015,0.4444,0.286
4,0.8856,0.8569,0.8856,0.8707,0.8675,0.3929,0.4277,0.286


In [230]:

best_model = s.create_model('lr', fold=10)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8827,0.8394,0.8827,0.8664,0.864,0.3783,0.4121,0.2959
1,0.8844,0.8596,0.8844,0.8689,0.8668,0.3932,0.425,0.2858
2,0.878,0.8347,0.878,0.8592,0.8592,0.3565,0.386,0.308
3,0.8848,0.8479,0.8848,0.8695,0.8662,0.3864,0.4217,0.295
4,0.8827,0.8413,0.8827,0.8662,0.8642,0.3781,0.411,0.297
5,0.8929,0.8773,0.8929,0.8809,0.8783,0.4483,0.477,0.2703
6,0.8835,0.8467,0.8835,0.8675,0.866,0.3886,0.4191,0.2903
7,0.8916,0.8535,0.8916,0.8799,0.8741,0.4226,0.4613,0.2857
8,0.889,0.871,0.889,0.8758,0.8764,0.4451,0.4653,0.2804
9,0.8793,0.8456,0.8793,0.8615,0.8623,0.3743,0.3999,0.2977


In [232]:
# lr
tuned_best, tuner = s.tune_model(
    best_model,
    optimize="logloss",
    n_iter=15,
    fold=10,
    search_library="scikit-optimize",
    search_algorithm="bayesian",
    return_tuner=True,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8836,0.8394,0.8836,0.8677,0.8653,0.3848,0.4181,0.2957
1,0.8861,0.8599,0.8861,0.8714,0.8691,0.4041,0.4356,0.2856
2,0.8776,0.835,0.8776,0.8587,0.8592,0.3573,0.3854,0.308
3,0.8848,0.848,0.8848,0.8695,0.8662,0.3864,0.4217,0.2949
4,0.8844,0.8417,0.8844,0.8688,0.8664,0.3891,0.4219,0.2967
5,0.8929,0.8774,0.8929,0.8809,0.8783,0.4483,0.477,0.2702
6,0.8839,0.8472,0.8839,0.8681,0.8667,0.3918,0.4221,0.29
7,0.8907,0.8539,0.8907,0.8785,0.8734,0.42,0.4572,0.2855
8,0.8882,0.8714,0.8882,0.8746,0.8752,0.4392,0.4598,0.2802
9,0.8784,0.8455,0.8784,0.8604,0.8616,0.3719,0.3965,0.298


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [225]:
# lda
tuned_best, tuner = s.tune_model(
    best_model,
    optimize="logloss",
    n_iter=50,
    fold=10,
    search_library="scikit-optimize",
    search_algorithm="bayesian",
    return_tuner=True,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8844,0.8278,0.8844,0.8695,0.8638,0.3732,0.4151,0.3038
1,0.8857,0.8506,0.8857,0.8709,0.8679,0.397,0.4307,0.2927
2,0.8814,0.819,0.8814,0.8643,0.8613,0.3621,0.3986,0.3198
3,0.8822,0.8348,0.8822,0.8659,0.861,0.3582,0.3994,0.3072
4,0.8848,0.8259,0.8848,0.8697,0.865,0.3783,0.4178,0.3064
5,0.8903,0.8669,0.8903,0.8777,0.8733,0.4205,0.4559,0.2767
6,0.8822,0.8337,0.8822,0.8656,0.8626,0.3687,0.4047,0.2995
7,0.8924,0.8447,0.8924,0.8814,0.8748,0.4253,0.4655,0.291
8,0.8852,0.8707,0.8852,0.87,0.8688,0.4032,0.4319,0.283
9,0.8805,0.8457,0.8805,0.863,0.8618,0.3679,0.3994,0.301


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

In [222]:
# gbc
tuned_best, tuner = s.tune_model(
    best_model,
    optimize="logloss",
    n_iter=50,
    fold=10,
    search_library="scikit-optimize",
    search_algorithm="bayesian",
    return_tuner=True,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8904,0.8535,0.8904,0.879,0.8711,0.4074,0.4517,0.2852
1,0.8891,0.8743,0.8891,0.8767,0.8701,0.4035,0.4452,0.2774
2,0.8801,0.8451,0.8801,0.8623,0.859,0.3499,0.3882,0.2983
3,0.8831,0.8662,0.8831,0.8674,0.8617,0.3607,0.4036,0.2843
4,0.8865,0.8573,0.8865,0.8722,0.8676,0.3915,0.4298,0.2863
5,0.898,0.8871,0.898,0.8888,0.8826,0.4644,0.501,0.263
6,0.8916,0.8619,0.8916,0.8799,0.8741,0.4226,0.4613,0.2802
7,0.8954,0.8637,0.8954,0.8857,0.8787,0.4439,0.4839,0.276
8,0.8882,0.8829,0.8882,0.8743,0.8732,0.4251,0.4518,0.2713
9,0.8852,0.8569,0.8852,0.87,0.8683,0.3994,0.4298,0.2887


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

In [None]:
# define search space
import numpy as np
# xgboost

# params = {"max_depth": [3,4,5],
#           "n_estimators": [210,220,230,240],
#           "min_samples_leaf": [3,4,5]}

tuned_best, tuner = s.tune_model(
    best_model,
    optimize="logloss",
    n_iter=50,
    fold=10,
    search_library="scikit-optimize",
    search_algorithm="bayesian",
    return_tuner=True,
)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8865,0.8515,0.8865,0.8721,0.8692,0.4034,0.4366,0.2879
1,0.8865,0.8768,0.8865,0.8721,0.8713,0.4184,0.4445,0.2749
2,0.8776,0.8472,0.8776,0.8586,0.8589,0.3552,0.3842,0.2985
3,0.8827,0.8647,0.8827,0.8662,0.8647,0.3821,0.4132,0.2829
4,0.8827,0.8597,0.8827,0.8664,0.8665,0.3938,0.4199,0.286
5,0.8954,0.8903,0.8954,0.8843,0.8822,0.4681,0.4941,0.2591
6,0.8941,0.8622,0.8941,0.8825,0.8806,0.4608,0.4868,0.2777
7,0.8967,0.8654,0.8967,0.8865,0.8821,0.4638,0.4961,0.2748
8,0.8912,0.8784,0.8912,0.8787,0.8791,0.4583,0.4779,0.273
9,0.8818,0.8624,0.8818,0.8657,0.8671,0.4008,0.4222,0.2869


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

In [210]:
tuned_best.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.04264031156015375,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': 0.7588282465072769,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.00962114482009193,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 229,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 0.8981042826479537,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [218]:
best_model

In [None]:
tuner