# Experiment - Ensemble and Stacking
* StellarAlgo Data Science
* February 17, 2023
* Grant Donst, Peter Morrison

In [1]:
import pandas as pd
import os

from data_sci_toolkit.aws_tools import redshift_tools
from pycaret.classification import *

In [2]:
df = redshift_tools.get_retention_dataset(
    cluster= "prod-app",
    database= "stlrlagalaxy",
    lkupclientid= 6,
    start_year= 2015,
    end_year= 2021
)
df.shape

Attempting to automatically open the SSO authorization page in your default browser.
If the browser does not open or you wish to use a different device to authorize this request, open the following URL:

https://device.sso.us-east-1.amazonaws.com/

Then enter the code:

GJMJ-MFZD
Successfully logged into Start URL: https://stellaralgo.awsapps.com/start#/
Authorized as AROASQ4JELIXYLYV6P4UV:gdonst@stellaralgo.com


(108833, 24)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108833 entries, 0 to 108832
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   lkupclientid         108833 non-null  int64         
 1   clientcode           108833 non-null  object        
 2   dimcustomermasterid  108833 non-null  int64         
 3   year                 108833 non-null  int64         
 4   productgrouping      108833 non-null  object        
 5   totalspent           108833 non-null  float64       
 6   recentdate           108833 non-null  datetime64[ns]
 7   attendancepercent    103614 non-null  float64       
 8   renewedbeforedays    108793 non-null  float64       
 9   source_tenure        108793 non-null  object        
 10  tenure               108793 non-null  float64       
 11  disttovenue          108833 non-null  float64       
 12  recency              108833 non-null  int64         
 13  missed_games_1

In [4]:
# copy original dataframe
df_dataset = df
features = [
    "attendancepercent",
    "disttovenue",
    "inperson_contact",
    "missed_games_1",
    "missed_games_2",
    "missed_games_over_2",
    "recency",
    "tenure",
    "totalspent",
    "isnextyear_buyer"
]

# create training and eval datasets
df_train = df_dataset.sample(frac=0.85, random_state=786)
df_train = df_train.reset_index(drop=True)

df_eval = df_dataset.drop(df_train.index)
df_eval = df_eval.reset_index(drop=True)

# print out the number of records for training and eval
print('Data for Training: ' + str(df_train.shape))
print('Data for Evaluation: ' + str(df_eval.shape), end="\n")

Data for Training: (92508, 24)
Data for Evaluation: (16325, 24)


In [5]:
setup(
    data = df_train, 
    target = 'isnextyear_buyer', 
    train_size = 0.85,
    data_split_shuffle = True,
    numeric_features = [
        "attendancepercent",
        "disttovenue",
        "inperson_contact",
        "missed_games_1",
        "missed_games_2",
        "missed_games_over_2",
        "recency",
        "tenure",
        "totalspent"
    ],
    silent = True,
    verbose = False
);

In [65]:
model_matrix = compare_models(
    fold=10,
    include= ["lightgbm"]
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8738,0.9056,0.5458,0.7308,0.6247,0.5507,0.5594,3.674


In [67]:
best_model = create_model(model_matrix, fold=10);
final_model = finalize_model(best_model);

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8732,0.9075,0.5588,0.72,0.6292,0.5542,0.5607
1,0.8714,0.9001,0.537,0.724,0.6166,0.5414,0.5501
2,0.8746,0.903,0.5499,0.7318,0.6279,0.5543,0.5625
3,0.8783,0.908,0.544,0.755,0.6323,0.5617,0.5726
4,0.877,0.9043,0.5208,0.765,0.6197,0.5495,0.564
5,0.8735,0.905,0.5459,0.7284,0.6241,0.5499,0.5582
6,0.8729,0.9082,0.5466,0.7254,0.6234,0.5488,0.5568
7,0.8672,0.9018,0.5288,0.7073,0.6051,0.5273,0.5354
8,0.8714,0.9052,0.5446,0.719,0.6198,0.5442,0.5518
9,0.8785,0.9125,0.5816,0.7321,0.6483,0.576,0.5816


### Ensemble - bagged

In [68]:
algo = create_model('lightgbm');
bagged_algo = ensemble_model(algo)
print(bagged_algo)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8747,0.9077,0.5548,0.7298,0.6304,0.5567,0.5642
1,0.8717,0.9016,0.5337,0.7273,0.6156,0.5408,0.5501
2,0.8742,0.9032,0.5459,0.7323,0.6255,0.5519,0.5605
3,0.8764,0.908,0.5387,0.7484,0.6264,0.5547,0.5655
4,0.8785,0.9039,0.5261,0.7698,0.625,0.5556,0.57
5,0.8756,0.9058,0.5506,0.7365,0.6301,0.5572,0.5657
6,0.8752,0.908,0.5525,0.7333,0.6302,0.557,0.565
7,0.8689,0.9022,0.5294,0.7152,0.6084,0.5318,0.5405
8,0.8728,0.906,0.544,0.7264,0.6221,0.5475,0.5558
9,0.8799,0.913,0.5823,0.7385,0.6511,0.5799,0.5858


BaggingClassifier(base_estimator=LGBMClassifier(boosting_type='gbdt',
                                                class_weight=None,
                                                colsample_bytree=1.0,
                                                importance_type='split',
                                                learning_rate=0.1, max_depth=-1,
                                                min_child_samples=20,
                                                min_child_weight=0.001,
                                                min_split_gain=0.0,
                                                n_estimators=100, n_jobs=-1,
                                                num_leaves=31, objective=None,
                                                random_state=1287,
                                                reg_alpha=0.0, reg_lambda=0.0,
                                                silent='warn', subsample=1.0,
                                                s

In [69]:
bagged_algo2 = ensemble_model(algo, n_estimators=50)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8742,0.9079,0.5522,0.7289,0.6283,0.5544,0.5621
1,0.8721,0.9013,0.5324,0.7301,0.6157,0.5412,0.551
2,0.8746,0.9038,0.5446,0.7351,0.6257,0.5523,0.5613
3,0.8763,0.9083,0.536,0.7495,0.625,0.5534,0.5645
4,0.8782,0.9045,0.5228,0.7702,0.6228,0.5533,0.5681
5,0.8749,0.9058,0.5459,0.7355,0.6267,0.5535,0.5624
6,0.8745,0.9079,0.5473,0.7327,0.6266,0.553,0.5615
7,0.8695,0.9028,0.5301,0.718,0.6099,0.5337,0.5425
8,0.8723,0.9062,0.5413,0.7254,0.62,0.5452,0.5536
9,0.8792,0.913,0.5763,0.7384,0.6474,0.5758,0.5822


### Enemble - Boosting

In [70]:
lightgbm = create_model('lightgbm');
boosted_dt = ensemble_model(lightgbm, method = 'Boosting')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8719,0.9022,0.5661,0.71,0.6299,0.5537,0.5589
1,0.8651,0.8976,0.5443,0.6895,0.6083,0.5282,0.5336
2,0.8677,0.8988,0.5545,0.6963,0.6174,0.5387,0.5438
3,0.8735,0.9045,0.5658,0.7169,0.6324,0.5573,0.563
4,0.8708,0.9003,0.5321,0.7233,0.6131,0.5377,0.5468
5,0.8681,0.901,0.5578,0.6964,0.6194,0.5409,0.5457
6,0.8735,0.9029,0.5691,0.7151,0.6338,0.5585,0.5638
7,0.8671,0.8957,0.5459,0.6976,0.6125,0.5338,0.5396
8,0.8699,0.9027,0.5631,0.7018,0.6249,0.5473,0.5522
9,0.8766,0.907,0.5869,0.7202,0.6468,0.573,0.5774


### Ensemble - Blending

In [71]:
# train individual models to blend
lightgbm = create_model('lightgbm', verbose = False)
dt = create_model('dt', verbose = False)
lr = create_model('lr', verbose = False)

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp /

In [72]:
# blend individual models
blend_soft = blend_models(estimator_list = [lightgbm, dt, lr], method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8592,0.8872,0.5119,0.678,0.5834,0.5007,0.5078
1,0.8523,0.8773,0.4974,0.6531,0.5647,0.4777,0.4842
2,0.8565,0.8823,0.499,0.6711,0.5724,0.4884,0.4962
3,0.8642,0.8854,0.5195,0.6974,0.5955,0.5159,0.524
4,0.8548,0.8824,0.4792,0.6719,0.5594,0.4754,0.4851
5,0.854,0.8827,0.497,0.6602,0.5671,0.4814,0.4884
6,0.8536,0.8829,0.4871,0.6628,0.5615,0.4761,0.4843
7,0.8498,0.8744,0.4712,0.6517,0.547,0.4597,0.4684
8,0.8541,0.8821,0.4865,0.6655,0.562,0.4771,0.4855
9,0.8548,0.8883,0.5063,0.6598,0.5729,0.4872,0.4935


In [73]:
# blend individual models
blend_hard = blend_models(estimator_list = [lightgbm, dt, lr], method = 'hard')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8667,0.0,0.4439,0.7654,0.5619,0.4898,0.5151
1,0.8612,0.0,0.4247,0.7451,0.541,0.4664,0.492
2,0.8629,0.0,0.4256,0.755,0.5444,0.471,0.4978
3,0.8703,0.0,0.4362,0.7981,0.5641,0.4955,0.5267
4,0.861,0.0,0.3966,0.7692,0.5233,0.4515,0.4856
5,0.8607,0.0,0.421,0.7442,0.5378,0.4631,0.4892
6,0.8635,0.0,0.4256,0.7594,0.5455,0.4726,0.5001
7,0.8563,0.0,0.4038,0.7282,0.5196,0.4431,0.4698
8,0.8592,0.0,0.4118,0.7417,0.5295,0.4546,0.4819
9,0.8625,0.0,0.4415,0.7389,0.5528,0.4776,0.4997


In [6]:
# blend top3 models from compare_models
top3 = compare_models(n_select = 3)
blender_top3 = blend_models(top3)
print(blender_top3.estimators_)

In [None]:
# blend top4 models from compare_models
top4 = compare_models(n_select = 4)
blender_top4 = blend_models(top4)
print(blender_top4.estimators_)

## Stacking

In [16]:
lightgbm = create_model('lightgbm');
dt = create_model('dt');
lr = create_model('lr');

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8079,0.3904,0.0,0.0,0.0,0.0,0.0
1,0.808,0.3695,0.0,0.0,0.0,0.0,0.0
2,0.808,0.379,0.0,0.0,0.0,0.0,0.0
3,0.808,0.3955,0.0,0.0,0.0,0.0,0.0
4,0.808,0.3913,0.0,0.0,0.0,0.0,0.0
5,0.808,0.3745,0.0,0.0,0.0,0.0,0.0
6,0.808,0.3934,0.0,0.0,0.0,0.0,0.0
7,0.808,0.394,0.0,0.0,0.0,0.0,0.0
8,0.808,0.3877,0.0,0.0,0.0,0.0,0.0
9,0.808,0.3919,0.0,0.0,0.0,0.0,0.0


In [17]:
stack_soft = stack_models([lightgbm, dt, lr])

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8079,0.3904,0.0,0.0,0.0,0.0,0.0
1,0.808,0.3695,0.0,0.0,0.0,0.0,0.0
2,0.808,0.379,0.0,0.0,0.0,0.0,0.0
3,0.808,0.3955,0.0,0.0,0.0,0.0,0.0
4,0.808,0.3913,0.0,0.0,0.0,0.0,0.0
5,0.808,0.3745,0.0,0.0,0.0,0.0,0.0
6,0.808,0.3934,0.0,0.0,0.0,0.0,0.0
7,0.808,0.394,0.0,0.0,0.0,0.0,0.0
8,0.808,0.3877,0.0,0.0,0.0,0.0,0.0
9,0.808,0.3919,0.0,0.0,0.0,0.0,0.0


In [18]:
xgboost = create_model('xgboost')
stack_soft2 = stack_models([lightgbm, dt, lr], meta_model=xgboost)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.873,0.9064,0.5533,0.7207,0.626,0.5511,0.558
1,0.8719,0.8993,0.5384,0.724,0.6175,0.5426,0.5512
2,0.8769,0.909,0.5563,0.7381,0.6344,0.5622,0.5703
3,0.8765,0.9098,0.5596,0.7341,0.6351,0.5624,0.5699
4,0.8747,0.9054,0.5662,0.7215,0.6345,0.5602,0.5662
5,0.8752,0.9076,0.5675,0.7232,0.636,0.562,0.568
6,0.8797,0.9096,0.5523,0.7554,0.6381,0.568,0.5781
7,0.8689,0.8991,0.547,0.7042,0.6157,0.5382,0.5444
8,0.8694,0.8994,0.5272,0.7178,0.6079,0.5317,0.5408
9,0.869,0.9046,0.5205,0.7198,0.6042,0.5281,0.538


In [18]:
stack_soft3 = stack_models(top3, meta_model=xgboost, restack=False)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8758,0.9468,0.7592,0.803,0.7805,0.6939,0.6945
1,0.8747,0.9432,0.7561,0.8018,0.7783,0.691,0.6916
2,0.8658,0.9414,0.7461,0.7826,0.7639,0.6703,0.6706
3,0.8759,0.9472,0.7554,0.806,0.7799,0.6936,0.6943
4,0.8758,0.9427,0.7616,0.8014,0.781,0.6944,0.6948
5,0.8754,0.9458,0.7727,0.7934,0.7829,0.6955,0.6956
6,0.877,0.9465,0.7671,0.8014,0.7839,0.698,0.6983
7,0.8752,0.9479,0.7585,0.8018,0.7796,0.6926,0.6932
8,0.868,0.9382,0.7511,0.7857,0.768,0.6758,0.6761
9,0.8747,0.9478,0.7474,0.8075,0.7763,0.6894,0.6904


## League model stacking