# Preprocessing for sales recommendations

## Loading the data

In [1]:
import numpy as np
from mba.data import (
    get_ffp_train_df,
    get_ffp_rollout_df,
    get_reviews_train_df_fpath,
    get_reviews_rollout_df_fpath,
)

In [2]:
raw_tdf = get_ffp_train_df()

In [3]:
raw_tdf

Unnamed: 0,ID,OTHER_SITE_VALUE,STATUS_PANTINUM,STATUS_GOLD,STATUS_SILVER,NUM_DEAL,LAST_DEAL,ADVANCE_PURCHASE,FARE_L_Y1,FARE_L_Y2,...,POINTS_L_Y1,POINTS_L_Y2,POINTS_L_Y3,POINTS_L_Y4,POINTS_L_Y5,SERVICE_FLAG,CANCEL_FLAG,CREDIT_FLAG,RECSYS_FLAG,BUYER_FLAG
0,1,13.140434,0,0,0,0,0,16,36.8,37.0,...,34.6,47.1,34.7,26.0,42.6,0,0,0,0,0
1,2,9.091326,0,0,0,3,16,18,45.2,47.2,...,50.7,42.9,41.1,50.9,30.4,0,0,0,0,0
2,3,6.742492,0,0,0,5,5,17,29.0,24.8,...,33.9,40.0,37.5,38.5,24.5,0,0,0,0,0
3,4,11.829185,0,0,0,3,14,19,47.8,47.0,...,47.6,57.4,51.5,30.0,31.6,0,0,0,0,0
4,5,7.464712,0,1,0,3,28,26,81.8,81.2,...,81.3,85.2,77.8,82.2,95.4,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,39996,7.281724,0,0,0,5,8,22,40.0,37.5,...,20.6,26.3,31.0,24.8,31.9,0,0,0,0,0
39996,39997,3.450719,0,0,0,11,0,14,28.0,36.5,...,42.7,37.0,28.4,27.4,36.8,0,1,0,0,0
39997,39998,7.282398,0,0,0,6,8,12,46.5,44.8,...,38.5,40.2,48.9,48.3,59.8,0,0,0,0,0
39998,39999,7.934329,0,0,1,1,55,30,52.5,56.0,...,71.2,70.8,82.8,60.3,68.7,0,0,0,0,0


In [4]:
raw_tdf.columns

Index(['ID', 'OTHER_SITE_VALUE', 'STATUS_PANTINUM', 'STATUS_GOLD',
       'STATUS_SILVER', 'NUM_DEAL', 'LAST_DEAL', 'ADVANCE_PURCHASE',
       'FARE_L_Y1', 'FARE_L_Y2', 'FARE_L_Y3', 'FARE_L_Y4', 'FARE_L_Y5',
       'POINTS_L_Y1', 'POINTS_L_Y2', 'POINTS_L_Y3', 'POINTS_L_Y4',
       'POINTS_L_Y5', 'SERVICE_FLAG', 'CANCEL_FLAG', 'CREDIT_FLAG',
       'RECSYS_FLAG', 'BUYER_FLAG'],
      dtype='object')

In [5]:
raw_tdf.columns[0]

'ID'

## Building the pipeline

In [6]:
from mba.shared import (
    Column,
    FeatureGroup,
    FEATURE_GROUPS,
    ContextKey,
)

In [7]:
from mba.pipeline import build_pipeline

In [8]:
pline = build_pipeline()

Starting to build the preprocessing pipeline...
Building the sentiment predictor...
Transformation Pipeline and Model Successfully Loaded
Done.
Building pipeline stages...
Done. Returning pipeline.


In [9]:
pline

A pdpipe pipeline:
[ 0]  Apply dataframe method set_index with kwargs {'keys': 'ID'}
[ 1]  Add the sentiment columns to input dataframes

In [10]:
# Fit the preprocessing pipeline on the raw training frame and transform it
# in the same call. The path of the training reviews file is handed to the
# pipeline through the `context` mapping under ContextKey.REVIEWS_FPATH
# (presumably consumed by the sentiment stage — confirm in mba.pipeline).
tdf = pline.fit_transform(
    X=raw_tdf,
    verbose=True,  # print one progress line per pipeline stage
    context={
        ContextKey.REVIEWS_FPATH: get_reviews_train_df_fpath(),
    },
)

- set_index: Apply dataframe method set_index with kwargs {'keys': 'ID'}
- Add the sentiment columns to input dataframes
  - 1994 id intersection between input & reviewes.
  - None-NA sentiment features adde to 1994 rows.


In [11]:
tdf

Unnamed: 0_level_0,OTHER_SITE_VALUE,STATUS_PANTINUM,STATUS_GOLD,STATUS_SILVER,NUM_DEAL,LAST_DEAL,ADVANCE_PURCHASE,FARE_L_Y1,FARE_L_Y2,FARE_L_Y3,FARE_L_Y4,FARE_L_Y5,POINTS_L_Y1,POINTS_L_Y2,POINTS_L_Y3,POINTS_L_Y4,POINTS_L_Y5,SERVICE_FLAG,CANCEL_FLAG,CREDIT_FLAG,RECSYS_FLAG,BUYER_FLAG,SENTIMENT_0,SENTIMENT_1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,13.140434,0,0,0,0,0,16,36.8,37.0,37.8,35.0,34.8,34.6,47.1,34.7,26.0,42.6,0,0,0,0,0,0.0,0.0
2,9.091326,0,0,0,3,16,18,45.2,47.2,45.5,44.2,42.8,50.7,42.9,41.1,50.9,30.4,0,0,0,0,0,0.0,0.0
3,6.742492,0,0,0,5,5,17,29.0,24.8,30.8,31.5,29.8,33.9,40.0,37.5,38.5,24.5,0,0,0,0,0,0.0,0.0
4,11.829185,0,0,0,3,14,19,47.8,47.0,45.2,41.0,35.8,47.6,57.4,51.5,30.0,31.6,0,0,0,0,0,0.0,0.0
5,7.464712,0,1,0,3,28,26,81.8,81.2,80.5,82.2,83.5,81.3,85.2,77.8,82.2,95.4,0,1,0,1,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39996,7.281724,0,0,0,5,8,22,40.0,37.5,29.5,27.2,31.2,20.6,26.3,31.0,24.8,31.9,0,0,0,0,0,0.0,0.0
39997,3.450719,0,0,0,11,0,14,28.0,36.5,32.0,31.2,32.2,42.7,37.0,28.4,27.4,36.8,0,1,0,0,0,0.0,0.0
39998,7.282398,0,0,0,6,8,12,46.5,44.8,45.8,45.2,44.5,38.5,40.2,48.9,48.3,59.8,0,0,0,0,0,0.0,0.0
39999,7.934329,0,0,1,1,55,30,52.5,56.0,54.5,55.2,55.0,71.2,70.8,82.8,60.3,68.7,0,0,0,0,0,0.0,0.0


## Model selection with PyCaret

In [12]:
from pycaret.classification import (
    setup,
    compare_models,
    create_model,
    tune_model,
    blend_models,
    predict_model,
    finalize_model,
    save_model,
    load_model,
    get_metrics,
    add_metric,
)
from sklearn.metrics import (
    accuracy_score, roc_auc_score, recall_score, precision_score, f1_score,
    confusion_matrix,
)

In [13]:
# Every column of the preprocessed frame except the target label; this list
# is what gets declared as numeric features when configuring PyCaret.
num_columns = [col for col in tdf.columns if col != Column.BUYER_FLAG]

In [38]:
from imblearn.over_sampling import RandomOverSampler

In [39]:
# Single seed shared by the PyCaret session and the over-sampler. Without an
# explicit `random_state`, RandomOverSampler draws from NumPy's global random
# state, so the resampled training set is not pinned to the session seed.
SEED = 42

# Configure the PyCaret classification experiment on the preprocessed frame:
# 80/20 stratified train/hold-out split, z-score style normalisation, grouped
# aggregate features built from FEATURE_GROUPS, and random over-sampling of
# the minority class to counter the class imbalance of the target.
clf_handle = setup(
    data=tdf,
    target=Column.BUYER_FLAG,
    train_size=0.8,
    session_id=SEED,
    numeric_features=num_columns,
    # One list of member columns per feature group, with matching group names.
    group_features=[FEATURE_GROUPS[k] for k in FEATURE_GROUPS],
    group_names=list(FEATURE_GROUPS),
    normalize=True,
    remove_perfect_collinearity=True,
    data_split_stratify=True,
    silent=True,  # skip the interactive dtype confirmation prompt
    fix_imbalance=True,
    fix_imbalance_method=RandomOverSampler(random_state=SEED),
)

IntProgress(value=0, description='Processing: ', max=3)

KeyboardInterrupt: 

In [15]:
# Per-prediction monetary weights used by the custom metrics revenue_score
# and opportunity_cost (currency/unit not stated here — TODO confirm).
TP_REVENUE = 32.7  # added for every true positive
FP_REVENUE = -6.05  # added for every false positive (negative, i.e. a loss)
FN_COST = -32.7  # added for every false negative; only used by opportunity_cost


def p_count(y_true, y_pred):
    """Count the actual positives (true label == 1).

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: Predicted labels; unused, present only so the function has
            the ``(y_true, y_pred)`` signature expected of a score function.

    Returns:
        int: number of entries of ``y_true`` equal to 1.
    """
    # np.asarray lets plain lists work too; count_nonzero counts in C instead
    # of iterating element by element with the builtin sum().
    return int(np.count_nonzero(np.asarray(y_true) == 1))

def n_count(y_true, y_pred):
    """Count the actual negatives (true label == 0).

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: Predicted labels; unused, present only so the function has
            the ``(y_true, y_pred)`` signature expected of a score function.

    Returns:
        int: number of entries of ``y_true`` equal to 0.
    """
    # Same vectorised form as p_count; also accepts plain lists.
    return int(np.count_nonzero(np.asarray(y_true) == 0))

def tp(y_true, y_pred):
    """Count true positives: predicted 1 and actually 1.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted labels, matched to ``y_true``
            by position.

    Returns:
        int: number of positions where both labels equal 1.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Vectorised count; avoids the Python-level loop of builtin sum() and
    # works for plain lists as well as arrays.
    return int(np.count_nonzero((y_pred == 1) & (y_true == 1)))

def fp(y_true, y_pred):
    """Count false positives: predicted 1 but actually 0.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted labels, matched to ``y_true``
            by position.

    Returns:
        int: number of positions where the prediction is 1 and the truth is 0.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Vectorised count; avoids the Python-level loop of builtin sum() and
    # works for plain lists as well as arrays.
    return int(np.count_nonzero((y_pred == 1) & (y_true == 0)))

def tn(y_true, y_pred):
    """Count true negatives: predicted 0 and actually 0.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted labels, matched to ``y_true``
            by position.

    Returns:
        int: number of positions where both labels equal 0.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Vectorised count; avoids the Python-level loop of builtin sum() and
    # works for plain lists as well as arrays.
    return int(np.count_nonzero((y_pred == 0) & (y_true == 0)))

def fn(y_true, y_pred):
    """Count false negatives: predicted 0 but actually 1.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted labels, matched to ``y_true``
            by position.

    Returns:
        int: number of positions where the prediction is 0 and the truth is 1.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Vectorised count; avoids the Python-level loop of builtin sum() and
    # works for plain lists as well as arrays.
    return int(np.count_nonzero((y_pred == 0) & (y_true == 1)))

def revenue_score(y_true, y_pred):
    """Net revenue of a set of predictions.

    Each true positive contributes TP_REVENUE and each false positive
    contributes FP_REVENUE (a negative amount); other outcomes contribute
    nothing.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        The summed contribution of true and false positives.
    """
    earned = tp(y_true, y_pred) * TP_REVENUE
    wasted = fp(y_true, y_pred) * FP_REVENUE
    return earned + wasted

def opportunity_cost(y_true, y_pred):
    """Net revenue of the predictions, additionally charged for missed buyers.

    Same terms as ``revenue_score`` (true positives weighted by TP_REVENUE,
    false positives by FP_REVENUE) plus FN_COST, a negative amount, for every
    false negative.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        The summed contribution of true positives, false positives and
        false negatives.
    """
    earned = tp(y_true, y_pred) * TP_REVENUE
    wasted = fp(y_true, y_pred) * FP_REVENUE
    missed = fn(y_true, y_pred) * FN_COST
    return earned + wasted + missed

In [16]:
# One row per custom metric: (id, display name, score function,
# greater_is_better). The order here is the registration order.
CUSTOM_METRICS = [
    ('p_count', 'P', p_count, True),
    ('n_count', 'N', n_count, True),
    ('revenue_score', 'Total Revenue', revenue_score, True),
    ('opportunity_cost', 'Opportunity Cost', opportunity_cost, True),
    ('tp', 'TP', tp, True),
    ('fp', 'FP', fp, False),
    ('tn', 'TN', tn, True),
    ('fn', 'FN', fn, False),
]


def _register_metric(metric_id, display_name, score_func, greater_is_better):
    """Register one custom binary-classification metric with PyCaret.

    Args:
        metric_id: identifier of the metric (usable as ``sort``/``optimize``).
        display_name: column title shown in PyCaret's score tables.
        score_func: callable taking ``(y_true, y_pred)``.
        greater_is_better: whether a larger value means a better model.

    Returns:
        Whatever ``add_metric`` returns for the newly registered metric.
    """
    return add_metric(
        id=metric_id,
        name=display_name,
        score_func=score_func,
        target='pred',  # score on predicted labels
        greater_is_better=greater_is_better,
        multiclass=False,
    )


registered_metrics = [_register_metric(*spec) for spec in CUSTOM_METRICS]
# Keep the cell's displayed value: the record of the last registered metric.
registered_metrics[-1]

Name                                                       FN
Display Name                                               FN
Score Function                   <function fn at 0x14e7260d0>
Scorer               make_scorer(fn, greater_is_better=False)
Target                                                   pred
Args                                                       {}
Greater is Better                                       False
Multiclass                                              False
Custom                                                   True
Name: fn, dtype: object

In [17]:
top3 = compare_models(n_select=3, sort='revenue_score')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN,TT (Sec)
ada,Ada Boost Classifier,0.831,0.7278,0.5442,0.2829,0.3719,0.2855,0.3061,293.8,2906.2,2766.38,159.9,407.0,2499.2,133.9,1.9
gbc,Gradient Boosting Classifier,0.8664,0.7278,0.4564,0.3342,0.3855,0.3127,0.3176,293.8,2906.2,2764.88,134.1,267.8,2638.4,159.7,8.513
et,Extra Trees Classifier,0.8745,0.7285,0.3904,0.3407,0.3636,0.2944,0.2954,293.8,2906.2,2404.565,114.7,222.5,2683.7,179.1,1.837
rf,Random Forest Classifier,0.8738,0.7265,0.3884,0.3378,0.361,0.2914,0.2924,293.8,2906.2,2374.055,114.1,224.3,2681.9,179.7,4.911
lightgbm,Light Gradient Boosting Machine,0.8954,0.7338,0.2447,0.3914,0.3001,0.247,0.2554,293.8,2906.2,1669.295,71.9,112.7,2793.5,221.9,1.278
lr,Logistic Regression,0.7632,0.7094,0.5623,0.208,0.3037,0.1959,0.2312,293.8,2906.2,1594.775,165.2,629.3,2276.9,128.6,0.812
ridge,Ridge Classifier,0.764,0.0,0.5568,0.2075,0.3023,0.1946,0.2291,293.8,2906.2,1567.865,163.6,625.1,2281.1,130.2,0.14
lda,Linear Discriminant Analysis,0.7639,0.7076,0.5565,0.2074,0.3022,0.1944,0.2288,293.8,2906.2,1564.595,163.5,625.1,2281.1,130.3,0.21
svm,SVM - Linear Kernel,0.7618,0.0,0.5197,0.1974,0.2861,0.1765,0.2064,293.8,2906.2,1236.24,152.7,621.0,2285.2,141.1,0.288
xgboost,Extreme Gradient Boosting,0.8981,0.7235,0.1736,0.3808,0.2382,0.1917,0.209,293.8,2906.2,1164.34,51.0,83.2,2823.0,242.8,5.606


In [18]:
def _sanity_check_all_ones_revenue(P, N):
    """Revenue obtained if every sample were predicted positive.

    With an all-ones prediction every actual positive is a true positive and
    every actual negative is a false positive.

    Args:
        P: number of actual positives.
        N: number of actual negatives.

    Returns:
        ``P`` weighted by TP_REVENUE plus ``N`` weighted by FP_REVENUE.
    """
    from_positives = P * TP_REVENUE
    from_negatives = N * FP_REVENUE
    return from_positives + from_negatives

In [19]:
_sanity_check_all_ones_revenue(P = 136 + 157, N=268+2639)

-8006.249999999998

In [20]:
gbc = create_model('gbc')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,0.8728,0.7401,0.471,0.3538,0.4041,0.3345,0.3388,293.0,2907.0,2988.0,138.0,252.0,2655.0,155.0
1,0.8625,0.7008,0.4027,0.3081,0.3491,0.2738,0.2768,293.0,2907.0,2255.35,118.0,265.0,2642.0,175.0
2,0.8722,0.7428,0.466,0.3522,0.4012,0.3312,0.3352,294.0,2906.0,2955.3,137.0,252.0,2654.0,157.0
3,0.8738,0.7259,0.4626,0.356,0.4024,0.3331,0.3367,294.0,2906.0,2958.9,136.0,246.0,2660.0,158.0
4,0.8666,0.7357,0.483,0.3405,0.3994,0.3269,0.3332,294.0,2906.0,2979.65,142.0,275.0,2631.0,152.0
5,0.8575,0.7384,0.4354,0.3062,0.3596,0.2821,0.2876,294.0,2906.0,2431.1,128.0,290.0,2616.0,166.0
6,0.8581,0.7286,0.4898,0.3214,0.3881,0.3118,0.3206,294.0,2906.0,2869.6,144.0,304.0,2602.0,150.0
7,0.8656,0.7148,0.4252,0.3238,0.3676,0.294,0.2974,294.0,2906.0,2508.45,125.0,261.0,2645.0,169.0
8,0.875,0.7195,0.449,0.3568,0.3976,0.3289,0.3316,294.0,2906.0,2876.5,132.0,238.0,2668.0,162.0
9,0.86,0.7318,0.4796,0.3234,0.3863,0.3106,0.3183,294.0,2906.0,2825.95,141.0,295.0,2611.0,153.0


In [21]:
gbc_res = predict_model(gbc)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,Gradient Boosting Classifier,0.8718,0.7397,0.4455,0.3457,0.3893,0.3189,0.3221,734,7266,6947.95,327,619,6647,407


In [22]:
# Tune the baseline gradient-boosting model, choosing hyper-parameters by the
# custom 'revenue_score' metric rather than a default classification metric.
sktuned_gbc = tune_model(
    gbc,
    fold=8,  # 8-fold cross-validation per candidate
    n_iter=10,  # number of search iterations
    optimize='revenue_score',
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,0.842,0.7289,0.5504,0.3019,0.39,0.308,0.3263,367.0,3633.0,3780.05,202.0,467.0,3166.0,165.0
1,0.8518,0.7448,0.5695,0.3245,0.4135,0.3358,0.3532,367.0,3633.0,4202.55,209.0,435.0,3198.0,158.0
2,0.8565,0.7297,0.5313,0.3266,0.4046,0.3282,0.3408,367.0,3633.0,3944.4,195.0,402.0,3231.0,172.0
3,0.8538,0.7392,0.5886,0.3323,0.4248,0.3484,0.3671,367.0,3633.0,4437.5,216.0,434.0,3199.0,151.0
4,0.832,0.7157,0.5504,0.2849,0.3755,0.2896,0.3106,367.0,3633.0,3538.05,202.0,507.0,3126.0,165.0
5,0.8355,0.7337,0.5313,0.2863,0.3721,0.2871,0.3053,367.0,3633.0,3436.2,195.0,486.0,3147.0,172.0
6,0.8512,0.7114,0.5353,0.3172,0.3984,0.3198,0.3341,368.0,3632.0,3876.7,197.0,424.0,3208.0,171.0
7,0.8425,0.7452,0.5734,0.3085,0.4011,0.3198,0.3402,368.0,3632.0,4038.05,211.0,473.0,3159.0,157.0
Mean,0.8457,0.7311,0.5538,0.3103,0.3975,0.3171,0.3347,367.25,3632.75,3906.6875,203.375,453.5,3179.25,163.875
SD,0.0084,0.0117,0.02,0.0169,0.0168,0.02,0.0193,0.433,0.433,308.659,7.3644,33.0265,32.9915,7.3559


In [23]:
sktuned_gbc

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.0001, loss='deviance', max_depth=9,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.3, min_impurity_split=None,
                           min_samples_leaf=5, min_samples_split=5,
                           min_weight_fraction_leaf=0.0, n_estimators=260,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=0.35, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [24]:
sktuned_gbc_res = predict_model(sktuned_gbc)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,Gradient Boosting Classifier,0.8478,0.7393,0.5341,0.3091,0.3916,0.3116,0.3268,734,7266,7518.6,392,876,6390,342


## AdaBoost

In [28]:
ada = create_model('ada')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,0.8312,0.7401,0.5324,0.2791,0.3662,0.2796,0.2991,293.0,2907.0,2663.05,156.0,403.0,2504.0,137.0
1,0.8288,0.709,0.5051,0.2686,0.3507,0.2625,0.28,293.0,2907.0,2401.45,148.0,403.0,2504.0,145.0
2,0.8422,0.7542,0.5918,0.3113,0.408,0.3269,0.3494,294.0,2906.0,3360.55,174.0,385.0,2521.0,120.0
3,0.8431,0.73,0.5544,0.3052,0.3937,0.3122,0.3306,294.0,2906.0,3085.55,163.0,371.0,2535.0,131.0
4,0.83,0.7292,0.5442,0.2807,0.3704,0.2835,0.3043,294.0,2906.0,2751.5,160.0,410.0,2496.0,134.0
5,0.8116,0.7279,0.5408,0.2536,0.3453,0.2517,0.2764,294.0,2906.0,2367.9,159.0,468.0,2438.0,135.0
6,0.8216,0.7331,0.5646,0.2726,0.3677,0.2782,0.3033,294.0,2906.0,2748.05,166.0,443.0,2463.0,128.0
7,0.8391,0.6987,0.5068,0.2871,0.3665,0.2824,0.2974,294.0,2906.0,2633.8,149.0,370.0,2536.0,145.0
8,0.8428,0.7349,0.5306,0.2994,0.3828,0.3007,0.3169,294.0,2906.0,2892.95,156.0,365.0,2541.0,138.0
9,0.8194,0.7205,0.5714,0.271,0.3676,0.2776,0.3039,294.0,2906.0,2759.0,168.0,452.0,2454.0,126.0


In [29]:
ada_res = predict_model(ada)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,Ada Boost Classifier,0.8331,0.7315,0.5463,0.2858,0.3753,0.2897,0.31,734,7266,7050.6,401,1002,6264,333


In [30]:
# Tune the baseline AdaBoost model, choosing hyper-parameters by the custom
# 'revenue_score' metric.
sktuned_ada = tune_model(
    ada,
    fold=10,  # 10-fold cross-validation per candidate
    n_iter=15,  # number of search iterations
    optimize='revenue_score',
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,0.8241,0.7367,0.5768,0.278,0.3751,0.287,0.313,293.0,2907.0,2870.35,169.0,439.0,2468.0,124.0
1,0.8194,0.6996,0.5324,0.2613,0.3506,0.2596,0.2819,293.0,2907.0,2433.15,156.0,441.0,2466.0,137.0
2,0.8369,0.7599,0.6327,0.31,0.4161,0.334,0.3628,294.0,2906.0,3577.5,186.0,414.0,2492.0,108.0
3,0.8359,0.733,0.5884,0.2998,0.3972,0.3137,0.3377,294.0,2906.0,3212.9,173.0,404.0,2502.0,121.0
4,0.8162,0.7472,0.6122,0.2752,0.3797,0.2897,0.3217,294.0,2906.0,3018.3,180.0,474.0,2432.0,114.0
5,0.8119,0.7293,0.5748,0.2616,0.3596,0.267,0.2955,294.0,2906.0,2640.45,169.0,477.0,2429.0,125.0
6,0.8072,0.7291,0.5816,0.2571,0.3566,0.2627,0.293,294.0,2906.0,2603.0,171.0,494.0,2412.0,123.0
7,0.8297,0.7088,0.5442,0.2802,0.3699,0.283,0.3039,294.0,2906.0,2745.45,160.0,411.0,2495.0,134.0
8,0.8259,0.7331,0.5646,0.279,0.3735,0.2856,0.3096,294.0,2906.0,2832.75,166.0,429.0,2477.0,128.0
9,0.8109,0.7348,0.5952,0.2648,0.3665,0.2742,0.3054,294.0,2906.0,2782.2,175.0,486.0,2420.0,119.0


In [31]:
sktuned_ada

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.15,
                   n_estimators=110, random_state=42)

In [32]:
sktuned_ada_res = predict_model(sktuned_ada)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,Ada Boost Classifier,0.8264,0.7461,0.5654,0.2795,0.374,0.2864,0.3104,734,7266,7097.0,415,1070,6196,319


  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


## LightGBM

In [34]:
lgbm = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,0.8978,0.7389,0.2253,0.3976,0.2876,0.2371,0.2482,293.0,2907.0,1553.2,66.0,100.0,2807.0,227.0
1,0.8909,0.6973,0.2799,0.3727,0.3197,0.2617,0.2649,293.0,2907.0,1846.5,82.0,138.0,2769.0,211.0
2,0.8928,0.7649,0.2347,0.369,0.2869,0.232,0.239,294.0,2906.0,1542.4,69.0,118.0,2788.0,225.0
3,0.8959,0.7275,0.2211,0.3846,0.2808,0.2291,0.2393,294.0,2906.0,1496.3,65.0,104.0,2802.0,229.0
4,0.9022,0.7593,0.2653,0.4457,0.3326,0.2835,0.2946,294.0,2906.0,1963.75,78.0,97.0,2809.0,216.0
5,0.8988,0.7258,0.2279,0.4085,0.2926,0.2428,0.2548,294.0,2906.0,1604.05,67.0,97.0,2809.0,227.0
6,0.8894,0.7393,0.2789,0.3661,0.3166,0.2576,0.2604,294.0,2906.0,1822.3,82.0,142.0,2764.0,212.0
7,0.8938,0.7208,0.2415,0.3777,0.2946,0.2401,0.2472,294.0,2906.0,1613.85,71.0,117.0,2789.0,223.0
8,0.9,0.734,0.2143,0.4145,0.2825,0.2346,0.2494,294.0,2906.0,1521.65,63.0,89.0,2817.0,231.0
9,0.8928,0.7303,0.2585,0.3781,0.3071,0.2512,0.2565,294.0,2906.0,1728.95,76.0,125.0,2781.0,218.0


In [35]:
lgbm_res = predict_model(lgbm)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,Light Gradient Boosting Machine,0.8996,0.738,0.2684,0.4255,0.3292,0.2779,0.2865,734,7266,4832.6,197,266,7000,537


In [36]:
# Tune the baseline LightGBM model, choosing hyper-parameters by the custom
# 'revenue_score' metric.
sktuned_lgbm = tune_model(
    lgbm,
    fold=10,  # 10-fold cross-validation per candidate
    n_iter=10,  # number of search iterations
    optimize='revenue_score',
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,0.8325,0.7452,0.5666,0.2887,0.3825,0.2972,0.3199,293.0,2907.0,2953.75,166.0,409.0,2498.0,127.0
1,0.8288,0.6768,0.529,0.2743,0.3613,0.2737,0.2935,293.0,2907.0,2588.0,155.0,410.0,2497.0,138.0
2,0.8466,0.743,0.6054,0.3219,0.4203,0.3413,0.364,294.0,2906.0,3551.85,178.0,375.0,2531.0,116.0
3,0.8472,0.7279,0.5816,0.3184,0.4116,0.3323,0.3522,294.0,2906.0,3377.4,171.0,366.0,2540.0,123.0
4,0.8394,0.7534,0.5884,0.3057,0.4023,0.3201,0.3431,294.0,2906.0,3279.45,173.0,393.0,2513.0,121.0
5,0.8281,0.7183,0.5544,0.2801,0.3721,0.2848,0.3072,294.0,2906.0,2795.15,163.0,419.0,2487.0,131.0
6,0.8281,0.7124,0.5714,0.2838,0.3792,0.2923,0.3165,294.0,2906.0,2928.4,168.0,424.0,2482.0,126.0
7,0.8403,0.7152,0.5272,0.2941,0.3776,0.2944,0.3109,294.0,2906.0,2817.9,155.0,372.0,2534.0,139.0
8,0.8466,0.7281,0.5646,0.3138,0.4034,0.3235,0.3419,294.0,2906.0,3232.05,166.0,363.0,2543.0,128.0
9,0.8297,0.7303,0.585,0.2891,0.387,0.301,0.3263,294.0,2906.0,3065.25,172.0,423.0,2483.0,122.0


In [37]:
sktuned_lgbm_res = predict_model(sktuned_lgbm)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,Light Gradient Boosting Machine,0.8371,0.726,0.5668,0.2969,0.3897,0.3061,0.3275,734,7266,7643.95,416,985,6281,318










## Blending models

In [25]:
blender_top3 = blend_models(top3)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,0.8725,0.7492,0.4232,0.3416,0.378,0.3079,0.3101,293.0,2907.0,2608.85,124.0,239.0,2668.0,169.0
1,0.8634,0.702,0.3993,0.3095,0.3487,0.2738,0.2766,293.0,2907.0,2246.85,117.0,261.0,2646.0,176.0
2,0.8738,0.7564,0.4762,0.359,0.4094,0.3402,0.3445,294.0,2906.0,3065.5,140.0,250.0,2656.0,154.0
3,0.8803,0.7245,0.4524,0.3746,0.4099,0.3439,0.3458,294.0,2906.0,3006.0,133.0,222.0,2684.0,161.0
4,0.8716,0.7496,0.4694,0.3511,0.4017,0.3315,0.3359,294.0,2906.0,2969.85,138.0,255.0,2651.0,156.0
5,0.8644,0.7243,0.449,0.3267,0.3782,0.3042,0.3091,294.0,2906.0,2670.8,132.0,272.0,2634.0,162.0
6,0.8659,0.7442,0.466,0.335,0.3898,0.3167,0.3222,294.0,2906.0,2834.3,137.0,272.0,2634.0,157.0
7,0.8697,0.7098,0.4252,0.3351,0.3748,0.3032,0.3059,294.0,2906.0,2587.1,125.0,248.0,2658.0,169.0
8,0.8781,0.7249,0.4388,0.3644,0.3981,0.331,0.3328,294.0,2906.0,2857.05,129.0,225.0,2681.0,165.0
9,0.8628,0.7345,0.4694,0.3278,0.386,0.3115,0.3179,294.0,2906.0,2800.45,138.0,283.0,2623.0,156.0


In [26]:
blender_res = predict_model(blender_top3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,P,N,Total Revenue,TP,FP,TN,FN
0,Voting Classifier,0.8778,0.7297,0.4537,0.3659,0.4051,0.3379,0.3403,734,7266,7398.25,333,577,6689,401


In [27]:
blender_res

Unnamed: 0,OTHER_SITE_VALUE,STATUS_PANTINUM,STATUS_GOLD,STATUS_SILVER,NUM_DEAL,LAST_DEAL,ADVANCE_PURCHASE,FARE_L_Y1,FARE_L_Y2,FARE_L_Y3,...,FARE_Std,POINTS_Min,POINTS_Max,POINTS_Mean,POINTS_Median,POINTS_Mode,POINTS_Std,BUYER_FLAG,Label,Score
0,0.810761,-0.091728,-0.218411,-0.539830,-0.500711,-0.083284,-0.145404,-0.821989,-1.067224,-0.822561,...,0.586183,-1.406468,-0.857471,-1.317963,-1.308019,-1.416403,1.077742,0,0,0.6996
1,0.478291,-0.091728,-0.218411,-0.539830,-0.999831,0.256552,-0.145404,-0.728102,-0.824952,-1.064752,...,0.208382,-1.063434,-0.782743,-0.949042,-0.859792,-1.074804,0.552486,0,0,0.7045
2,0.655939,-0.091728,-0.218411,1.852436,-0.999831,1.955734,0.278997,1.860490,1.961184,2.029913,...,-0.653417,1.437596,1.415516,1.390862,1.116196,1.415766,0.215431,0,0,0.6866
3,-1.223001,-0.091728,-0.218411,-0.539830,-0.500711,-0.338161,-0.569805,-0.151369,-0.528841,-0.640917,...,0.435963,-0.658030,-0.645741,-0.661819,-0.582017,-0.671095,0.025699,0,0,0.6771
4,-0.158672,-0.091728,-0.218411,-0.539830,-0.001591,-0.508079,-0.782005,-0.661040,-0.609598,-0.741830,...,-0.388346,-1.113329,-0.820107,-0.926064,-0.739844,-1.124491,1.029263,0,0,0.7447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-1.188774,-0.091728,-0.218411,-0.539830,-1.498951,1.530939,-0.782005,-0.821989,-0.878790,-0.889836,...,-0.529951,-1.007301,-0.925972,-1.039676,-1.175445,-1.018906,0.021550,0,0,0.7196
7996,-2.140656,-0.091728,-0.218411,1.852436,-0.500711,0.086634,-0.569805,0.029698,0.117220,-0.015257,...,-0.662961,0.202674,0.917327,0.462815,0.263933,0.186008,2.114522,0,0,0.7154
7997,0.421767,-0.091728,-0.218411,-0.539830,-0.001591,-0.593038,-0.357604,-1.090236,-0.946088,-0.943656,...,-0.678356,-1.150751,-0.807652,-1.089461,-1.232262,-1.161756,0.591372,0,0,0.7381
7998,-1.315843,-0.091728,-0.218411,-0.539830,1.994890,-0.762956,-0.145404,0.116879,0.231626,0.099111,...,-0.689853,-0.221441,-0.402873,-0.392469,-0.506260,-0.236333,-0.406596,0,0,0.5450


In [None]:
# Tune the best model returned by compare_models (top3 is sorted by
# 'revenue_score', so top3[0] is the highest-revenue candidate).
sktuned_m0 = tune_model(
    top3[0],
    fold=8,  # 8-fold cross-validation per candidate
    n_iter=10,  # number of search iterations
    optimize='revenue_score',
)

In [None]:
# Alternative tuning of the baseline gradient-boosting model: same folds,
# iteration budget and target metric as the default-search run, but using the
# Optuna search backend with Hyperband early stopping.
tuned_gbc = tune_model(
    gbc,
    fold=8,  # 8-fold cross-validation per candidate
    n_iter=10,  # number of search iterations
    optimize='revenue_score',
    search_library='optuna',
    early_stopping='Hyperband',
)

In [None]:
tuned_gbc

In [None]:
tuned_gbc_res = predict_model(tuned_gbc)