## Importing libraries

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import time
from datetime import date, timedelta, datetime
import pandas_gbq
from google.cloud import bigquery
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import math

from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix
import gc
from alibi.explainers import ALE
warnings.filterwarnings('ignore')
collected_objects = gc.collect()

  "Since version 1.0, "


In [2]:
# The BigQuery client is constructed with the proxy variables set; they are
# removed again right afterwards so later cells run without them.
_PROXY_URL = "http://geoproxy.kohls.com:3128"
_PROXY_ENV_KEYS = ("http_proxy", "HTTP_PROXY", "https_proxy", "HTTPS_PROXY")

for _proxy_key in _PROXY_ENV_KEYS:
    os.environ[_proxy_key] = _PROXY_URL

try:
    client = bigquery.Client(location="US")
    print("Client creating using default project: {}".format(client.project))
finally:
    # Always undo the proxy settings, even when client creation fails.
    # pop(..., None) rather than del: where environment variable names are
    # case-insensitive, the lower- and upper-case keys are the same variable
    # and a second `del` would raise KeyError.
    for _proxy_key in _PROXY_ENV_KEYS:
        os.environ.pop(_proxy_key, None)

# NOTE(review): absolute, machine-specific working directory; relative output
# paths used later (e.g. the feature-importance CSVs) resolve against it.
os.chdir('/home/jupyter/SalesForecasting/GmWorkingData/Unsubscribe_email')

Client creating using default project: kohls-bda-mkt-lle


## Important functions

In [3]:
def customize_split(number):
    """
    Map a percentile bucket number to its customised reporting group.

    Input
    number: int
        Percentile bucket number, expected to lie in 1..100.
    Output
    grp_num: int
        Upper bound of the customised group the bucket belongs to, taken from
        the customised percentile list [1,3,5,10,20,100]:
        1 -> 1, 2-3 -> 3, 4-5 -> 5, 6-10 -> 10, 11-20 -> 20, 21-100 -> 100.
        Values below 1 are not rejected; they land in group 3.
    Raises
    ValueError
        If number is greater than 100 (or not comparable, e.g. NaN), i.e. it
        does not fall into any group.
    """
    # Group 1 is an exact match, every other group is an inclusive upper bound.
    if number == 1:
        return 1
    for upper_bound in (3, 5, 10, 20, 100):
        if number <= upper_bound:
            return upper_bound
    raise ValueError(
        f"number must be <= 100 to fall into a customised group, got {number!r}"
    )

In [4]:
def calc_lorenze_acc(X: pd.DataFrame, y: "pd.Series | pd.DataFrame", actual: str, pred_value: str, m, method='gini', num_split=100, cust_split_list = [1,3,5,10,20,100]):
    """
    Score X with a fitted classifier and summarise how well the scores rank the positives.

    X : dataframe of testing data (passed to m.predict_proba)
    y : target data (Series or dataframe); it is copied, never modified
    actual : Target col name (the name the target has once y is a dataframe)
    pred_value : prediction col name (column created to hold the scores)
    m : trained model name; must expose predict_proba, column 1 is used as the score
    method : default to gini (currently unused, kept so existing calls keep working)
    num_split : number for splitting the data into buckets.
    cust_split_list : customize splitting list (only iterated, never mutated).
    If changing the customized percentage list then change the customize_split function accordingly.

    Returns
    gini : ratio of the area term of the score-ordered cumulative-positives curve
           to the same term for the curve ordered by the actual label.
    lorenz : one row per customised group with per-group and cumulative means,
             counts and unsubscribe rates.
    data : y plus the score, sorted by score (descending), with the columns
           'rank', 'perc_grp' and 'grp'.
    """

    scored = pd.DataFrame(y.copy())
    scored[pred_value] = m.predict_proba(X)[:, 1]
    n_rows = len(y)

    # Highest score first; rank 0 is the most likely positive.
    data = scored.sort_values(pred_value, ascending=False)
    data['rank'] = range(len(data))
    # Equal-sized rank buckets numbered 1..num_split.
    data['perc_grp'] = np.floor(num_split * data['rank'] / n_rows).astype(int) + 1
    # Only the distinct buckets go through customize_split, not every row.
    grp_by_perc = {perc: customize_split(perc) for perc in data['perc_grp'].unique()}
    data['grp'] = data['perc_grp'].map(grp_by_perc)

    by_grp = data.groupby('grp')
    lorenz = by_grp[[pred_value, actual]].mean()
    lorenz['num_mail_id'] = by_grp.size()
    lorenz['actual_not_unsubscribed'] = by_grp[actual].agg(lambda s: s.eq(0).sum())
    lorenz['actual_unsubscribed'] = by_grp[actual].agg(lambda s: s.eq(1).sum())
    lorenz["per_unsubscribed"] = lorenz['actual_unsubscribed'] / lorenz['num_mail_id']

    # Cumulative statistics: everything ranked at or above each cut-off.
    cum_rows = []
    for per in cust_split_list:
        top = data[data["perc_grp"] <= per]
        n_top = top.shape[0]
        n_unsubscribed = int(top[actual].eq(1).sum())
        n_not_unsubscribed = int(top[actual].eq(0).sum())
        cum_rows.append({
            "grp" : per,
            "y_pred_xgbc_oot_cum" : top[pred_value].mean(),
            "unsubscribe_or_not_cum" : top[actual].mean(),
            "num_mail_id_cum" : n_top,
            "actual_not_unsubscribed_cum" : n_not_unsubscribed,
            "actual_unsubscribed_cum" : n_unsubscribed,
            "per_unsubscribed_cum" : n_unsubscribed / n_top,
        })
    cum_df = pd.DataFrame(cum_rows)

    lorenz = pd.merge(lorenz, cum_df, on="grp")

    total = data[actual].sum()
    # y1: cumulative share of positives when ordered by the model score.
    # y2: the same when ordered by the actual label (all positives first).
    # The sorts are kept as in the original so tie ordering is unchanged.
    y1 = data[[actual, pred_value]].sort_values(pred_value, ascending=False)[actual].cumsum().values/total
    y2 = data[[actual, pred_value]].sort_values(actual, ascending=False)[actual].cumsum().values/total
    x1 = ((2*y1.sum()-y1[-1])/scored.shape[0])-1
    x2 = ((2*y2.sum()-y2[-1])/scored.shape[0])-1
    gini = x1/x2

    return gini, lorenz, data

In [5]:
def model_testing(X_test_y, y_test_y, xgbc_y, actual="unsubscribe_or_not", pred_value="y_pred_xgbc_y_oos"):
    """
    Print classification metrics for a fitted model and return its ranking summary.

    X_test_y : feature dataframe to evaluate on
    y_test_y : true labels for X_test_y
    xgbc_y : fitted classifier exposing predict and predict_proba
    actual : target column name forwarded to calc_lorenze_acc
    pred_value : name of the score column created by calc_lorenze_acc

    Prints accuracy, precision, recall, f1 and the confusion matrix of the
    hard predictions from xgbc_y.predict, then returns the
    (gini, lorenz, data) tuple produced by calc_lorenze_acc.
    """
    y_pred_xgbc_y = xgbc_y.predict(X_test_y)

    print(f"Accuracy score : {accuracy_score(y_test_y,y_pred_xgbc_y)}, Precision score : {precision_score(y_test_y,y_pred_xgbc_y)}, Recall score : {recall_score(y_test_y,y_pred_xgbc_y)}, f1 score : {f1_score(y_test_y,y_pred_xgbc_y)}")
    print(f"Confusion matrix : {confusion_matrix(y_test_y,y_pred_xgbc_y)}")

    return calc_lorenze_acc(X_test_y, y_test_y, actual, pred_value, xgbc_y, method='gini', num_split=100, cust_split_list = [1,3,5,10,20,100])

## Data Ingestion

In [6]:
# TRAINING DATA
# Pull the whole training table into memory and report how long the read took.
t1 = time.time()

# SELECT * brings back every column; identifier columns are dropped in a later cell.
qr1 = '''SELECT * FROM kohls-bda-mkt-prd.dp_marketing_sandbox.TKA1JB1_email_unsubscribe_for_jan_23_monthly_training_ver1_train_OOS_1'''
training_data = pandas_gbq.read_gbq(qr1,project_id='kohls-bda-mkt-lle',use_bqstorage_api=True)

# Elapsed wall-clock time, in minutes.
print(f"Total time taken to read the data : {(time.time()-t1)/60} mins")

Downloading: 100%|[32m██████████[0m|
Total time taken to read the data : 4.1728089650472 mins


## Data Preparation

In [7]:
training_data.shape

(19173594, 78)

In [8]:
pd.set_option('display.max_columns', None)
training_data.head(2)

Unnamed: 0,email_addr,vantage_date,month_num,unsubscribe_or_not,ttl_num_of_cls_in_pst_mth,ttl_num_of_opens_in_pst_mth,ttl_num_of_mails_sent_in_pst_mth,ttl_num_of_cls_in_lst_90_days_bf_pst_mth,ttl_num_of_op_in_lst_90_days_bf_pst_mth,ttl_num_of_mails_snt_in_lst_90_days_bf_pst_mth,cust_id,ttl_opt_out_num,rec_opt_in_days,mail_cnt_asn_pr_cust,total_trip_cnt,months_to_last_trans,cust_age,med_incm,kc_holder,dist_to_store,lst_qrt_trp_cnt,lst_qrt_pur_qnts,lst_qrt_spt_amt,lst_qrt_dis_qnt,lst_qrt_dis_spt_amt,lst_qrt_pur_qnts_wt_kc_card,lst_qrt_spt_amt_wt_kc_card,ACTIVE_trans_pur_amt_for_lst_qrt,BEAUTY_trans_pur_amt_for_lst_qrt,CHILDRENS_trans_pur_amt_for_lst_qrt,HOME_trans_pur_amt_for_lst_qrt,MENS_trans_pur_amt_for_lst_qrt,WOMENS_trans_pur_amt_for_lst_qrt,YOUNG_WOMENS_trans_pur_amt_for_lst_qrt,sec_lst_qrt_trp_cnt,sec_lst_qrt_pur_qnts,sec_lst_qrt_spt_amt,sec_lst_qrt_dis_qnt,sec_lst_qrt_dis_spt_amt,sec_lst_qrt_pur_qnts_wt_kc_card,sec_lst_qrt_spt_amt_wt_kc_card,ACTIVE_trans_pur_amt_for_sec_lst_qrt,BEAUTY_trans_pur_amt_for_sec_lst_qrt,CHILDRENS_trans_pur_amt_for_sec_lst_qrt,HOME_trans_pur_amt_for_sec_lst_qrt,MENS_trans_pur_amt_for_sec_lst_qrt,WOMENS_trans_pur_amt_for_sec_lst_qrt,YOUNG_WOMENS_trans_pur_amt_for_sec_lst_qrt,trd_lst_qrt_trp_cnt,trd_lst_qrt_pur_qnts,trd_lst_qrt_spt_amt,trd_lst_qrt_dis_qnt,trd_lst_qrt_dis_spt_amt,trd_lst_qrt_pur_qnts_wt_kc_card,trd_lst_qrt_spt_amt_wt_kc_card,ACTIVE_trans_pur_amt_for_trd_lst_qrt,BEAUTY_trans_pur_amt_for_trd_lst_qrt,CHILDRENS_trans_pur_amt_for_trd_lst_qrt,HOME_trans_pur_amt_for_trd_lst_qrt,MENS_trans_pur_amt_for_trd_lst_qrt,WOMENS_trans_pur_amt_for_trd_lst_qrt,YOUNG_WOMENS_trans_pur_amt_for_trd_lst_qrt,frt_lst_qrt_trp_cnt,frt_lst_qrt_pur_qnts,frt_lst_qrt_spt_amt,frt_lst_qrt_dis_qnt,frt_lst_qrt_dis_spt_amt,frt_lst_qrt_pur_qnts_wt_kc_card,frt_lst_qrt_spt_amt_wt_kc_card,ACTIVE_trans_pur_amt_for_frt_lst_qrt,BEAUTY_trans_pur_amt_for_frt_lst_qrt,CHILDRENS_trans_pur_amt_for_frt_lst_qrt,HOME_trans_pur_amt_for_frt_lst_qrt,MENS_trans_pur_amt_for_fr
t_lst_qrt,WOMENS_trans_pur_amt_for_frt_lst_qrt,YOUNG_WOMENS_trans_pur_amt_for_frt_lst_qrt,llty_ind,eml_supp_tst_grps
0,0zN9xUGVjp@00f92.1w6,2023-01-01,1,0,0,76,72,0,165,155,89746407,,1479,5,112,1,67,162500,1,2.235791139,5,65,731.8,55,405.35,65,731.8,326.45,0.0,0.0,98.9,0.0,66.0,0.0,8,125,1255.8,120,1189.4,105,995.9,337.75,0.0,0.0,31.15,0.0,467.2,0.0,3,50,575.45,10,192.95,15,417.95,350.0,0.0,0.0,0.0,0.0,67.95,0.0,6,120,3678.0,65,2348.15,120,3678.0,1348.15,0.0,0.0,439.85,540.0,30.0,0.0,1,
1,3LKKDvYwiny@Mgadn.v9d,2023-01-01,1,0,0,5,72,0,24,157,1114092581,4.0,970,1,122,0,32,32500,1,2.027137674,6,34,988.96,15,285.07,33,962.24,371.41,122.0,0.0,55.98,0.0,61.72,0.0,4,12,535.08,4,399.44,11,520.69,0.0,0.0,0.0,366.69,0.0,0.75,0.0,6,22,912.85,19,826.57,20,852.87,2.33,0.0,0.0,649.99,0.0,0.0,0.0,5,29,839.46,27,804.79,29,839.46,18.78,0.0,144.96,399.99,59.87,0.0,0.0,1,


In [9]:
# Columns that are cast to float before modelling: the store distance plus,
# for each quarter prefix (presumably last / second-, third-, fourth-last
# quarter), three spend totals followed by the per-department purchase amounts.
col_to_convert_float_ls = ['dist_to_store'] + [
    col_name
    for qrt in ('lst', 'sec_lst', 'trd_lst', 'frt_lst')
    for col_name in (
        [
            f'{qrt}_qrt_spt_amt',
            f'{qrt}_qrt_dis_spt_amt',
            f'{qrt}_qrt_spt_amt_wt_kc_card',
        ]
        + [
            f'{dept}_trans_pur_amt_for_{qrt}_qrt'
            for dept in ('ACTIVE', 'BEAUTY', 'CHILDRENS', 'HOME', 'MENS', 'WOMENS', 'YOUNG_WOMENS')
        ]
    )
]

In [10]:
# `le` is fitted here on the training values and reused by later cells
# (le.transform) so the test/OOT data get the same integer codes.
le = LabelEncoder()
# Cast the listed columns to float, one column at a time, in place.
for col in col_to_convert_float_ls:
    training_data[col] = training_data[col].astype(float)
# Replace the group labels with integer codes.
# NOTE(review): a value that first appears in later data would presumably make
# le.transform fail — confirm the group set is stable across months.
training_data["eml_supp_tst_grps"] = le.fit_transform(training_data["eml_supp_tst_grps"])

In [11]:
# Target as integers; features are every column except the identifiers,
# the vantage date and the target itself.
y_train_y = training_data['unsubscribe_or_not'].astype(int)
X_train_y = training_data.drop(
    columns=["email_addr", "vantage_date", "cust_id", "unsubscribe_or_not"]
)
# The raw frame is no longer needed once it has been split.
del training_data

In [12]:
collected_objects = gc.collect()

In [13]:
X_train_y.shape, y_train_y.shape

((19173594, 74), (19173594,))

## Model Training

### XGBC

In [14]:
# TRAINING
# Baseline model: XGBClassifier with library defaults (no class weighting).
t1 = time.time()

xgbc_y  =XGBClassifier()
xgbc_y.fit(X_train_y,y_train_y)

# Elapsed wall-clock training time, in minutes.
print(f"Total time taken to train the model : {(time.time()-t1)/60} mins")

Total time taken to train the model : 6.90701463619868 mins


### XGBC with scale_pos_weight (spw)

In [15]:
round(len(y_train_y[y_train_y==0])/len(y_train_y[y_train_y==1]))

108

In [17]:
# TRAINING
# Weight for the positive class = (# negative rows) / (# positive rows),
# derived from the training target instead of a hard-coded number so it stays
# correct when the training table changes.
spw_y = round(len(y_train_y[y_train_y==0])/len(y_train_y[y_train_y==1]))

t1 = time.time()

xgbc1_y  =XGBClassifier(scale_pos_weight=spw_y)
xgbc1_y.fit(X_train_y,y_train_y)

# Elapsed wall-clock training time, in minutes.
print(f"Total time taken to train the model : {(time.time()-t1)/60} mins")

Total time taken to train the model : 6.791117783387502 mins


### Feature importance

In [18]:
# Feature importances of the baseline model, one row per training column,
# written to a CSV in the current working directory.
# NOTE(review): the column Index is wrapped in a list, which presumably yields
# a one-level MultiIndex rather than a plain Index — confirm this is intended.
feature_df_y = pd.DataFrame(index = [X_train_y.columns], data = xgbc_y.feature_importances_)
feature_df_y.to_csv("features_m.csv")

In [19]:
# Feature importances of the scale_pos_weight model, one row per training
# column, written to a CSV in the current working directory.
# NOTE(review): the column Index is wrapped in a list, which presumably yields
# a one-level MultiIndex rather than a plain Index — confirm this is intended.
feature_df_y1 = pd.DataFrame(index = [X_train_y.columns], data = xgbc1_y.feature_importances_)
feature_df_y1.to_csv("features_m1.csv")

In [20]:
del X_train_y
del y_train_y

### Testing on OOS data

In [21]:
# TESTING DATA OOS
# Pull the whole out-of-sample test table into memory and time the read.
t1 = time.time()

qr2 = '''SELECT * FROM kohls-bda-mkt-prd.dp_marketing_sandbox.TKA1JB1_email_unsubscribe_for_jan_23_monthly_training_ver1_test_OOS_1'''
testing_data = pandas_gbq.read_gbq(qr2,project_id='kohls-bda-mkt-lle',use_bqstorage_api=True)

# Elapsed wall-clock time, in minutes.
print(f"Total time taken to read the data : {(time.time()-t1)/60} mins")

Downloading: 100%|[32m██████████[0m|
Total time taken to read the data : 1.5022883574167887 mins


In [22]:
# Same preparation as for the training data: cast the listed columns to float ...
for col in col_to_convert_float_ls:
    testing_data[col] = testing_data[col].astype(float)
# ... and encode the group labels with the encoder fitted on the training data
# (transform only, no re-fit, so the codes match what the models saw).
testing_data["eml_supp_tst_grps"] = le.transform(testing_data["eml_supp_tst_grps"])

In [23]:
# Target as integers; features are every column except the identifiers,
# the vantage date and the target itself.
y_test_y = testing_data['unsubscribe_or_not'].astype(int)
X_test_y = testing_data.drop(
    columns=["email_addr", "vantage_date", "cust_id", "unsubscribe_or_not"]
)
# Drop the raw frame and ask the garbage collector to reclaim it.
del testing_data

collected_objects = gc.collect()

In [24]:
X_test_y.shape, y_test_y.shape

((6391197, 74), (6391197,))

#### Testing on XGBC

In [29]:
gini_oos_y, lorenz_oos_y, data_oos_y = model_testing(X_test_y, y_test_y, xgbc_y)

Accuracy score : 0.9909013913981998, Precision score : 0.31527093596059114, Recall score : 0.0011020042702665474, f1 score : 0.002196331440141389
Confusion matrix : [[6332982     139]
 [  58012      64]]


In [33]:
gini_oos_y

0.5437013830253001

In [31]:
lorenz_oos_y

Unnamed: 0,grp,y_pred_xgbc_y_oos,unsubscribe_or_not,num_mail_id,actual_not_unsubscribed,actual_unsubscribed,per_unsubscribed,y_pred_xgbc_oot_cum,unsubscribe_or_not_cum,num_mail_id_cum,actual_not_unsubscribed_cum,actual_unsubscribed_cum,per_unsubscribed_cum
0,1,0.109323,0.103783,63912,57279,6633,0.103783,0.109323,0.103783,63912,57279,6633,0.103783
1,3,0.051958,0.051962,127824,121182,6642,0.051962,0.071079,0.069236,191736,178461,13275,0.069236
2,5,0.033127,0.034117,127824,123463,4361,0.034117,0.055899,0.055188,319560,301924,17636,0.055188
3,10,0.02299,0.02331,319560,312111,7449,0.02331,0.039444,0.039249,639120,614035,25085,0.039249
4,20,0.014029,0.013806,639120,630296,8824,0.013806,0.026737,0.026528,1278240,1244331,33909,0.026528
5,100,0.004735,0.004727,5112957,5088790,24167,0.004727,0.009135,0.009087,6391197,6333121,58076,0.009087


In [32]:
data_oos_y

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
5003249,0,0.856128,0,1,1
3584140,0,0.783057,1,1,1
2862376,1,0.778760,2,1,1
4891643,1,0.772370,3,1,1
2725295,1,0.769923,4,1,1
...,...,...,...,...,...
2621624,0,0.000032,6391192,100,100
2830979,0,0.000032,6391193,100,100
1536813,0,0.000032,6391194,100,100
2239393,0,0.000032,6391195,100,100


In [34]:
data_oos_y[data_oos_y["perc_grp"] <= 1]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
5003249,0,0.856128,0,1,1
3584140,0,0.783057,1,1,1
2862376,1,0.778760,2,1,1
4891643,1,0.772370,3,1,1
2725295,1,0.769923,4,1,1
...,...,...,...,...,...
5395135,0,0.071664,63907,1,1
5245604,0,0.071663,63908,1,1
4722305,0,0.071663,63909,1,1
5596983,0,0.071662,63910,1,1


In [35]:
# Rows in percentile bucket <= 1 that are scored >= 0.5 and did unsubscribe.
# One combined mask replaces the triple chained filter, which recomputed the
# bucket filter four times and the score filter twice.
data_oos_y[
    (data_oos_y["perc_grp"] <= 1)
    & (data_oos_y["y_pred_xgbc_y_oos"] >= 0.5)
    & (data_oos_y["unsubscribe_or_not"] == 1)
]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
2862376,1,0.778760,2,1,1
4891643,1,0.772370,3,1,1
2725295,1,0.769923,4,1,1
6258501,1,0.731420,7,1,1
3360186,1,0.721641,8,1,1
...,...,...,...,...,...
4635059,1,0.515701,184,1,1
5597296,1,0.510249,193,1,1
4894858,1,0.509622,195,1,1
3121575,1,0.507194,198,1,1


In [224]:
# Rows in percentile buckets <= 2 that are scored >= 0.5 and did unsubscribe.
# One combined mask replaces the triple chained filter, which recomputed the
# bucket filter four times and the score filter twice.
data_oos_y[
    (data_oos_y["perc_grp"] <= 2)
    & (data_oos_y["y_pred_xgbc_y_oos"] >= 0.5)
    & (data_oos_y["unsubscribe_or_not"] == 1)
]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
2837430,1,0.977919,0,1,1
956660,1,0.977919,1,1,1
1669113,1,0.977919,2,1,1
2169189,1,0.977919,3,1,1
2937545,1,0.977919,4,1,1
...,...,...,...,...,...
5184420,1,0.515116,1422,1,1
3059682,1,0.513041,1424,1,1
5033785,1,0.506301,1432,1,1
650944,1,0.501669,1435,1,1


In [60]:
del data_oos_y
del lorenz_oos_y
del gini_oos_y

#### Testing on XGBC with spw

In [36]:
gini_oos1_y, lorenz_oos1_y, data_oos1_y = model_testing(X_test_y, y_test_y, xgbc1_y)

Accuracy score : 0.7356576553656538, Precision score : 0.022349097041669593, Recall score : 0.6571733590467663, f1 score : 0.04322809656381598
Confusion matrix : [[4663567 1669554]
 [  19910   38166]]


In [37]:
gini_oos1_y

0.5429015319752979

In [38]:
lorenz_oos1_y

Unnamed: 0,grp,y_pred_xgbc_y_oos,unsubscribe_or_not,num_mail_id,actual_not_unsubscribed,actual_unsubscribed,per_unsubscribed,y_pred_xgbc_oot_cum,unsubscribe_or_not_cum,num_mail_id_cum,actual_not_unsubscribed_cum,actual_unsubscribed_cum,per_unsubscribed_cum
0,1,0.916507,0.103455,63912,57300,6612,0.103455,0.916507,0.103455,63912,57300,6612,0.103455
1,3,0.850798,0.051696,127824,121216,6608,0.051696,0.872701,0.068949,191736,178516,13220,0.068949
2,5,0.784764,0.033914,127824,123489,4335,0.033914,0.837526,0.054935,319560,302005,17555,0.054935
3,10,0.716192,0.02347,319560,312060,7500,0.02347,0.776859,0.039202,639120,614065,25055,0.039202
4,20,0.600759,0.013728,639120,630346,8774,0.013728,0.688809,0.026465,1278240,1244411,33829,0.026465
5,100,0.302287,0.004742,5112957,5088710,24247,0.004742,0.379592,0.009087,6391197,6333121,58076,0.009087


In [39]:
data_oos1_y

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
6180640,0,0.996644,0,1,1
4147337,0,0.996009,1,1,1
4526439,1,0.995880,2,1,1
4800548,1,0.995501,3,1,1
5471573,0,0.995190,4,1,1
...,...,...,...,...,...
824635,0,0.000746,6391192,100,100
1516915,0,0.000476,6391193,100,100
402277,0,0.000436,6391194,100,100
277938,0,0.000270,6391195,100,100


In [51]:
data_oos1_y[data_oos1_y["perc_grp"] == 100]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
1504712,0,0.001458,6327286,100,100
1504707,0,0.001458,6327287,100,100
1649071,0,0.001458,6327288,100,100
2600189,0,0.001458,6327289,100,100
896295,0,0.001458,6327290,100,100
...,...,...,...,...,...
824635,0,0.000746,6391192,100,100
1516915,0,0.000476,6391193,100,100
402277,0,0.000436,6391194,100,100
277938,0,0.000270,6391195,100,100


In [57]:
# Highest percentile bucket to include (100 = the whole scored set).
num = 100
# Rows up to that bucket that are scored >= 0.5 and did unsubscribe.
# One combined mask replaces the triple chained filter, which recomputed the
# bucket filter four times and the score filter twice.
data_oos1_y[
    (data_oos1_y["perc_grp"] <= num)
    & (data_oos1_y["y_pred_xgbc_y_oos"] >= 0.5)
    & (data_oos1_y["unsubscribe_or_not"] == 1)
]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
4526439,1,0.995880,2,1,1
4800548,1,0.995501,3,1,1
2696448,1,0.995052,6,1,1
3578234,1,0.994626,10,1,1
2725295,1,0.994601,11,1,1
...,...,...,...,...,...
2977595,1,0.500070,1707015,27,100
1576414,1,0.500055,1707171,27,100
4815646,1,0.500021,1707540,27,100
1767329,1,0.500018,1707558,27,100


### ALE curve experiment (code currently commented out)

In [None]:
# xgbc1_y, X_test_y, y_test_y

In [63]:
# X_test_y.columns

In [83]:
# feature_names = ['month_num', 'ttl_num_of_cls_in_pst_mth', 'ttl_num_of_opens_in_pst_mth',
#        'ttl_num_of_mails_sent_in_pst_mth',
#        'ttl_num_of_cls_in_lst_90_days_bf_pst_mth',
#        'ttl_num_of_op_in_lst_90_days_bf_pst_mth',
#        'ttl_num_of_mails_snt_in_lst_90_days_bf_pst_mth', 'ttl_opt_out_num',
#        'rec_opt_in_days', 'mail_cnt_asn_pr_cust', 'total_trip_cnt',
#        'months_to_last_trans', 
#        'dist_to_store', 'lst_qrt_trp_cnt', 'lst_qrt_pur_qnts',
#        'lst_qrt_spt_amt', 'lst_qrt_dis_qnt', 'lst_qrt_dis_spt_amt',
#        'lst_qrt_pur_qnts_wt_kc_card', 'lst_qrt_spt_amt_wt_kc_card',
#        'ACTIVE_trans_pur_amt_for_lst_qrt', 'BEAUTY_trans_pur_amt_for_lst_qrt',
#        'CHILDRENS_trans_pur_amt_for_lst_qrt', 'HOME_trans_pur_amt_for_lst_qrt',
#        'MENS_trans_pur_amt_for_lst_qrt', 'WOMENS_trans_pur_amt_for_lst_qrt',
#        'YOUNG_WOMENS_trans_pur_amt_for_lst_qrt', 'sec_lst_qrt_trp_cnt',
#        'sec_lst_qrt_pur_qnts', 'sec_lst_qrt_spt_amt', 'sec_lst_qrt_dis_qnt',
#        'sec_lst_qrt_dis_spt_amt', 'sec_lst_qrt_pur_qnts_wt_kc_card',
#        'sec_lst_qrt_spt_amt_wt_kc_card',
#        'ACTIVE_trans_pur_amt_for_sec_lst_qrt',
#        'BEAUTY_trans_pur_amt_for_sec_lst_qrt',
#        'CHILDRENS_trans_pur_amt_for_sec_lst_qrt',
#        'HOME_trans_pur_amt_for_sec_lst_qrt',
#        'MENS_trans_pur_amt_for_sec_lst_qrt',
#        'WOMENS_trans_pur_amt_for_sec_lst_qrt',
#        'YOUNG_WOMENS_trans_pur_amt_for_sec_lst_qrt', 'trd_lst_qrt_trp_cnt',
#        'trd_lst_qrt_pur_qnts', 'trd_lst_qrt_spt_amt', 'trd_lst_qrt_dis_qnt',
#        'trd_lst_qrt_dis_spt_amt', 'trd_lst_qrt_pur_qnts_wt_kc_card',
#        'trd_lst_qrt_spt_amt_wt_kc_card',
#        'ACTIVE_trans_pur_amt_for_trd_lst_qrt',
#        'BEAUTY_trans_pur_amt_for_trd_lst_qrt',
#        'CHILDRENS_trans_pur_amt_for_trd_lst_qrt',
#        'HOME_trans_pur_amt_for_trd_lst_qrt',
#        'MENS_trans_pur_amt_for_trd_lst_qrt',
#        'WOMENS_trans_pur_amt_for_trd_lst_qrt',
#        'YOUNG_WOMENS_trans_pur_amt_for_trd_lst_qrt', 'frt_lst_qrt_trp_cnt',
#        'frt_lst_qrt_pur_qnts', 'frt_lst_qrt_spt_amt', 'frt_lst_qrt_dis_qnt',
#        'frt_lst_qrt_dis_spt_amt', 'frt_lst_qrt_pur_qnts_wt_kc_card',
#        'frt_lst_qrt_spt_amt_wt_kc_card',
#        'ACTIVE_trans_pur_amt_for_frt_lst_qrt',
#        'BEAUTY_trans_pur_amt_for_frt_lst_qrt',
#        'CHILDRENS_trans_pur_amt_for_frt_lst_qrt',
#        'HOME_trans_pur_amt_for_frt_lst_qrt',
#        'MENS_trans_pur_amt_for_frt_lst_qrt',
#        'WOMENS_trans_pur_amt_for_frt_lst_qrt',
#        'YOUNG_WOMENS_trans_pur_amt_for_frt_lst_qrt']

In [84]:
# categ_vars = ['cust_age', 'med_incm', 'kc_holder', 'llty_ind', 'eml_supp_tst_grps']

In [85]:
# X_test_y = X_test_y.fillna(X_test_y.median())

In [90]:
# ale_eff = ALE(xgbc1_y.predict_proba,
#               feature_names = list(X_test_y.columns),
#               target_names = "unsubscribe_or_not")

In [98]:
# indep_index_li = []
# # # create an empty dict
# grid_points = {}

In [91]:
# for indep in feature_names:
#     indep_index_num = list(X_test_y.columns).index(indep)
#     indep_index_li.append(indep_index_num)
#     grid_points[indep_index_num] = np.unique(X_test_y.dropna().iloc[:,indep_index_num:indep_index_num+1].describe(percentiles=np.arange(0.05,0.95,0.05)).values[4:-1])

# xgb_exp = ale_eff.explain(X_test_y.values,features=indep_index_li,grid_points=grid_points)

In [80]:
# def one_way_ales(num, x_inp, dep_var, var_list, categ_list, xgb_model,dsname):
    
#     # Declaring all the inputs to the function
#     model_id = num
#     xgb_model = xgb_model
#     x_inp = x_inp.copy()
#     dep_var = dep_var
#     dsn = dsname
#     categ_vars = categ_list.copy()
    
#     # Get the features list for ALEs
#     feature_names = var_list.copy()
    
#     # All the features in the model
#     x_vars_list = list(x_inp.columns)
    
#     # Set your working directory and use this path to store the results
#     dir_str = "4.Outputs/" + "Model " + str(model_id) +"/ALEs"
#     if not os.path.exists(dir_str):
#         os.makedirs(dir_str)
#         print("Created Directory : ", dir_str,"\n")
#     else:
#         print("Directory already existed : ", dir_str,"\n")

#     # ALEs for the validation dataset declaration
#     ale_eff = ALE(xgb_model.predict_proba,
#                   feature_names = x_vars_list,
#                   target_names = dep_var)
    
#     # create index list for independent vars
#     indep_index_li = []
#     # create an empty dict
#     grid_points = {}
    
#     for indep in feature_names:
#         indep_index_num = x_vars_list.index(indep)
#         indep_index_li.append(indep_index_num)
#         grid_points[indep_index_num] = np.unique(x_test.dropna().iloc[:,indep_index_num:indep_index_num+1].describe(percentiles=np.arange(0.05,0.95,0.05)).values[4:-1])
    
#     xgb_exp = ale_eff.explain(x_inp.values,features=indep_index_li,grid_points=grid_points)

#     # Store the ALE results now
#     for i in range(0,len(feature_names)):
        
#         if str(feature_names[i]) in categ_list:
#             z1 = pd.DataFrame(xgb_exp.feature_values[i],columns=[str(feature_names[i])])
#             z2 = pd.DataFrame(xgb_exp.ale_values[i][:,1]+xgb_exp.constant_value,columns=dep_var)
            
#         else:
#             z1 = pd.DataFrame(xgb_exp.feature_values[i][1:-1],columns=[str(feature_names[i])])
#             z2 = pd.DataFrame(xgb_exp.ale_values[i][1:-1,1]+xgb_exp.constant_value,columns=dep_var)
        
#     # Merge the feature values and corresponding ales
#         z1_z2 = pd.merge(z1,z2,left_index=True,right_index=True).reset_index(drop=True)
#         ale_df = z1_z2.copy().drop_duplicates()            
#         ale_str = dir_str + "/"+str(model_id) + ".mid_ale_"+str(dsn)+"_"+ str(feature_names[i]) +".csv"
#         ale_df.to_csv(ale_str,index=False)
        
#     print(15*"*"," One Way ALEs Stored : ",(15*"*"),"\n")
    
#     return xgb_exp

In [58]:
del data_oos1_y
del lorenz_oos1_y
del gini_oos1_y

In [59]:
del X_test_y
del y_test_y

### Testing on OOT data

#### Feb 23 DATA

In [61]:
# Pull the whole Feb-23 out-of-time table into memory and time the read.
t1 = time.time()
qr = '''SELECT * FROM kohls-bda-mkt-prd.dp_marketing_sandbox.TKA1JB1_email_unsubscribe_year_data_for_feb_r3_OTT'''
data_OOT= pandas_gbq.read_gbq(qr,project_id='kohls-bda-mkt-lle',use_bqstorage_api=True)
# Elapsed wall-clock time, in minutes.
print(f"Total time taken to read the data : {(time.time()-t1)/60} mins")

Downloading: 100%|[32m██████████[0m|
Total time taken to read the data : 1.4169249057769775 mins


In [62]:
data_OOT.shape

(5830696, 78)

In [63]:
# Same preparation as for the training data: cast the listed columns to float ...
for col in col_to_convert_float_ls:
    data_OOT[col] = data_OOT[col].astype(float)
# ... and encode the group labels with the encoder fitted on the training data
# (transform only, no re-fit, so the codes match what the models saw).
data_OOT["eml_supp_tst_grps"] = le.transform(data_OOT["eml_supp_tst_grps"])

In [64]:
# Target as integers; features are every column except the identifiers,
# the vantage date and the target itself.
y_test_data_OOT = data_OOT['unsubscribe_or_not'].astype(int)
X_test_data_OOT = data_OOT.drop(
    columns=["email_addr", "vantage_date", "cust_id", "unsubscribe_or_not"]
)
# The raw frame is no longer needed once it has been split.
del data_OOT

In [65]:
collected_objects = gc.collect()

In [66]:
X_test_data_OOT.shape, y_test_data_OOT.shape

((5830696, 74), (5830696,))

##### Testing on XGBC

In [67]:
gini_oot_y, lorenz_oot_y, data_oot_y = model_testing(X_test_data_OOT, y_test_data_OOT, xgbc_y)

Accuracy score : 0.9914030846403242, Precision score : 0.4745762711864407, Recall score : 0.0005586257805797738, f1 score : 0.0011159379857319358
Confusion matrix : [[5780542      31]
 [  50095      28]]


In [68]:
gini_oot_y

0.3963245691706832

In [69]:
lorenz_oot_y

Unnamed: 0,grp,y_pred_xgbc_y_oos,unsubscribe_or_not,num_mail_id,actual_not_unsubscribed,actual_unsubscribed,per_unsubscribed,y_pred_xgbc_oot_cum,unsubscribe_or_not_cum,num_mail_id_cum,actual_not_unsubscribed_cum,actual_unsubscribed_cum,per_unsubscribed_cum
0,1,0.097696,0.056854,58307,54992,3315,0.056854,0.097696,0.056854,58307,54992,3315,0.056854
1,3,0.066839,0.023642,116614,113857,2757,0.023642,0.077125,0.034713,174921,168849,6072,0.034713
2,5,0.041455,0.031008,116614,112998,3616,0.031008,0.062857,0.033231,291535,281847,9688,0.033231
3,10,0.025487,0.022944,291535,284846,6689,0.022944,0.044172,0.028088,583070,566693,16377,0.028088
4,20,0.013794,0.012928,583070,575532,7538,0.012928,0.028983,0.020508,1166140,1142225,23915,0.020508
5,100,0.004105,0.005619,4664556,4638348,26208,0.005619,0.009081,0.008596,5830696,5780573,50123,0.008596


In [70]:
data_oot_y[data_oot_y["perc_grp"] <= 1]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
2671730,1,0.751325,0,1,1
4341762,0,0.748109,1,1,1
348882,0,0.734960,2,1,1
4270005,0,0.716628,3,1,1
3384583,0,0.692013,4,1,1
...,...,...,...,...,...
540567,0,0.074078,58302,1,1
1814691,0,0.074078,58303,1,1
2474772,0,0.074078,58304,1,1
1820692,0,0.074078,58305,1,1


In [73]:
# Rows in percentile bucket <= 1 that are scored >= 0.5 and did unsubscribe.
# One combined mask replaces the triple chained filter, which recomputed the
# bucket filter four times and the score filter twice.
data_oot_y[
    (data_oot_y["perc_grp"] <= 1)
    & (data_oot_y["y_pred_xgbc_y_oos"] >= 0.5)
    & (data_oot_y["unsubscribe_or_not"] == 1)
]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
2671730,1,0.751325,0,1,1
5418792,1,0.691076,5,1,1
4167189,1,0.68563,6,1,1
2370580,1,0.652738,9,1,1
5504681,1,0.631237,11,1,1
2087072,1,0.630917,12,1,1
5066905,1,0.62749,13,1,1
4065564,1,0.617052,17,1,1
4646306,1,0.611083,20,1,1
4936270,1,0.610144,21,1,1


##### Testing on XGBC with spw

In [74]:
gini_oot1_y, lorenz_oot1_y, data_oot1_y = model_testing(X_test_data_OOT, y_test_data_OOT, xgbc1_y)

Accuracy score : 0.8109870588348287, Precision score : 0.02005719416451626, Recall score : 0.4385411886758574, f1 score : 0.03835995110114054
Confusion matrix : [[4706638 1073935]
 [  28142   21981]]


In [75]:
gini_oot1_y

0.330985368135167

In [76]:
lorenz_oot1_y

Unnamed: 0,grp,y_pred_xgbc_y_oos,unsubscribe_or_not,num_mail_id,actual_not_unsubscribed,actual_unsubscribed,per_unsubscribed,y_pred_xgbc_oot_cum,unsubscribe_or_not_cum,num_mail_id_cum,actual_not_unsubscribed_cum,actual_unsubscribed_cum,per_unsubscribed_cum
0,1,0.90882,0.055911,58307,55047,3260,0.055911,0.90882,0.055911,58307,55047,3260,0.055911
1,3,0.877319,0.025108,116614,113686,2928,0.025108,0.887819,0.035376,174921,168733,6188,0.035376
2,5,0.81699,0.029362,116614,113190,3424,0.029362,0.859488,0.03297,291535,281923,9612,0.03297
3,10,0.73404,0.021771,291535,285188,6347,0.021771,0.796764,0.027371,583070,567111,15959,0.027371
4,20,0.567759,0.01133,583070,576464,6606,0.01133,0.682261,0.01935,1166140,1143575,22565,0.01935
5,100,0.208325,0.005908,4664556,4636998,27558,0.005908,0.303112,0.008596,5830696,5780573,50123,0.008596


In [77]:
data_oot1_y

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
5066905,1,0.994382,0,1,1
4847420,0,0.994021,1,1,1
2671730,1,0.993487,2,1,1
4686623,0,0.992694,3,1,1
5391176,1,0.992230,4,1,1
...,...,...,...,...,...
27699,0,0.000395,5830691,100,100
5045687,0,0.000393,5830692,100,100
3919193,0,0.000328,5830693,100,100
138416,0,0.000292,5830694,100,100


In [89]:
data_oot1_y[data_oot1_y["perc_grp"] == 100]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
1229549,0,0.006396,5772390,100,100
2743015,0,0.006396,5772391,100,100
1067550,0,0.006396,5772392,100,100
1595677,0,0.006396,5772393,100,100
1507364,0,0.006396,5772394,100,100
...,...,...,...,...,...
27699,0,0.000395,5830691,100,100
5045687,0,0.000393,5830692,100,100
3919193,0,0.000328,5830693,100,100
138416,0,0.000292,5830694,100,100


In [94]:
# Highest percentile bucket to include.
num = 20
# Rows up to that bucket that are scored >= 0.5 and did unsubscribe.
# One combined mask replaces the triple chained filter, which recomputed the
# bucket filter four times and the score filter twice.
data_oot1_y[
    (data_oot1_y["perc_grp"] <= num)
    & (data_oot1_y["y_pred_xgbc_y_oos"] >= 0.5)
    & (data_oot1_y["unsubscribe_or_not"] == 1)
]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
5066905,1,0.994382,0,1,1
2671730,1,0.993487,2,1,1
5391176,1,0.992230,4,1,1
5199677,1,0.992224,5,1,1
226470,1,0.991709,7,1,1
...,...,...,...,...,...
3844206,1,0.500249,1095074,19,20
4497360,1,0.500220,1095171,19,20
2084411,1,0.500214,1095201,19,20
3436564,1,0.500089,1095627,19,20


In [97]:
# Free the Feb-23 OOT evaluation results and inputs before the next month is
# loaded. The OOS results were already deleted by earlier cells, so deleting
# them again here would raise NameError on a top-to-bottom run; the objects
# that are still alive at this point are the *_oot* ones.
del data_oot_y
del lorenz_oot_y
del gini_oot_y

del data_oot1_y
del lorenz_oot1_y
del gini_oot1_y

del X_test_data_OOT
del y_test_data_OOT

#### Mar 23 DATA

In [98]:
# Pull the whole Mar-23 out-of-time table into memory and time the read.
t1 = time.time()
qr = '''SELECT * FROM kohls-bda-mkt-prd.dp_marketing_sandbox.TKA1JB1_email_unsubscribe_year_data_for_mar_r3_OTT'''
data_OOT= pandas_gbq.read_gbq(qr,project_id='kohls-bda-mkt-lle',use_bqstorage_api=True)
# Elapsed wall-clock time, in minutes.
print(f"Total time taken to read the data : {(time.time()-t1)/60} mins")

Downloading: 100%|[32m██████████[0m|
Total time taken to read the data : 1.151473383108775 mins


In [99]:
data_OOT.shape

(5699684, 78)

In [100]:
# Same preparation as for the training data: cast the listed columns to float ...
for col in col_to_convert_float_ls:
    data_OOT[col] = data_OOT[col].astype(float)
# ... and encode the group labels with the encoder fitted on the training data
# (transform only, no re-fit, so the codes match what the models saw).
data_OOT["eml_supp_tst_grps"] = le.transform(data_OOT["eml_supp_tst_grps"])

In [101]:
data_OOT.head()

Unnamed: 0,email_addr,vantage_date,month_num,unsubscribe_or_not,ttl_num_of_cls_in_pst_mth,ttl_num_of_opens_in_pst_mth,ttl_num_of_mails_sent_in_pst_mth,ttl_num_of_cls_in_lst_90_days_bf_pst_mth,ttl_num_of_op_in_lst_90_days_bf_pst_mth,ttl_num_of_mails_snt_in_lst_90_days_bf_pst_mth,cust_id,ttl_opt_out_num,rec_opt_in_days,mail_cnt_asn_pr_cust,total_trip_cnt,months_to_last_trans,cust_age,med_incm,kc_holder,dist_to_store,lst_qrt_trp_cnt,lst_qrt_pur_qnts,lst_qrt_spt_amt,lst_qrt_dis_qnt,lst_qrt_dis_spt_amt,lst_qrt_pur_qnts_wt_kc_card,lst_qrt_spt_amt_wt_kc_card,ACTIVE_trans_pur_amt_for_lst_qrt,BEAUTY_trans_pur_amt_for_lst_qrt,CHILDRENS_trans_pur_amt_for_lst_qrt,HOME_trans_pur_amt_for_lst_qrt,MENS_trans_pur_amt_for_lst_qrt,WOMENS_trans_pur_amt_for_lst_qrt,YOUNG_WOMENS_trans_pur_amt_for_lst_qrt,sec_lst_qrt_trp_cnt,sec_lst_qrt_pur_qnts,sec_lst_qrt_spt_amt,sec_lst_qrt_dis_qnt,sec_lst_qrt_dis_spt_amt,sec_lst_qrt_pur_qnts_wt_kc_card,sec_lst_qrt_spt_amt_wt_kc_card,ACTIVE_trans_pur_amt_for_sec_lst_qrt,BEAUTY_trans_pur_amt_for_sec_lst_qrt,CHILDRENS_trans_pur_amt_for_sec_lst_qrt,HOME_trans_pur_amt_for_sec_lst_qrt,MENS_trans_pur_amt_for_sec_lst_qrt,WOMENS_trans_pur_amt_for_sec_lst_qrt,YOUNG_WOMENS_trans_pur_amt_for_sec_lst_qrt,trd_lst_qrt_trp_cnt,trd_lst_qrt_pur_qnts,trd_lst_qrt_spt_amt,trd_lst_qrt_dis_qnt,trd_lst_qrt_dis_spt_amt,trd_lst_qrt_pur_qnts_wt_kc_card,trd_lst_qrt_spt_amt_wt_kc_card,ACTIVE_trans_pur_amt_for_trd_lst_qrt,BEAUTY_trans_pur_amt_for_trd_lst_qrt,CHILDRENS_trans_pur_amt_for_trd_lst_qrt,HOME_trans_pur_amt_for_trd_lst_qrt,MENS_trans_pur_amt_for_trd_lst_qrt,WOMENS_trans_pur_amt_for_trd_lst_qrt,YOUNG_WOMENS_trans_pur_amt_for_trd_lst_qrt,frt_lst_qrt_trp_cnt,frt_lst_qrt_pur_qnts,frt_lst_qrt_spt_amt,frt_lst_qrt_dis_qnt,frt_lst_qrt_dis_spt_amt,frt_lst_qrt_pur_qnts_wt_kc_card,frt_lst_qrt_spt_amt_wt_kc_card,ACTIVE_trans_pur_amt_for_frt_lst_qrt,BEAUTY_trans_pur_amt_for_frt_lst_qrt,CHILDRENS_trans_pur_amt_for_frt_lst_qrt,HOME_trans_pur_amt_for_frt_lst_qrt,MENS_trans_pur_amt_for_fr
t_lst_qrt,WOMENS_trans_pur_amt_for_frt_lst_qrt,YOUNG_WOMENS_trans_pur_amt_for_frt_lst_qrt,llty_ind,eml_supp_tst_grps
0,SNGep77Yc@karC2.ifJ,2023-03-01,3,0,0,0,33,1,1,123,81957853,,271,3,133,2,43,70000,1,8.065645,4,10,549.32,6,274.35,10,549.32,41.99,0.0,0.0,357.38,0.0,70.77,0.0,1,10,122.94,10,122.94,10,122.94,0.0,0.0,0.0,0.0,0.0,79.76,0.0,7,25,499.98,18,306.71,24,484.99,114.7,0.0,0.0,0.0,0.0,79.97,0.0,2,11,236.32,7,121.19,6,111.12,167.2,0.0,0.0,0.0,69.12,0.0,0.0,0,2
1,M4hdIsbds@kaOj4.Xah,2023-03-01,3,0,0,0,33,0,3,146,436276514,3.0,519,1,29,2,52,125000,1,0.977633,3,22,289.82,22,289.82,22,289.82,0.0,0.0,0.0,46.74,0.0,243.08,0.0,1,5,47.29,3,34.95,5,47.29,0.0,0.0,0.0,12.34,0.0,34.95,0.0,3,36,421.68,29,376.95,36,421.68,0.0,0.0,0.0,8.51,0.0,379.94,0.0,1,10,69.95,10,69.95,10,69.95,0.0,0.0,0.0,3.5,0.0,53.87,0.0,0,2
2,DwYSo@xYs38.dYT,2023-03-01,3,0,0,0,32,0,0,57,143675516,,96,2,363,0,63,62500,1,7.210976,6,30,542.51,27,458.16,28,515.57,10.0,0.0,0.0,0.0,0.0,382.83,0.0,7,25,415.36,24,399.31,25,415.36,0.0,0.0,0.0,93.96,0.0,187.9,0.0,7,42,614.64,42,614.64,42,614.64,7.65,0.0,0.0,0.0,0.0,555.02,0.0,7,23,420.98,21,395.0,23,420.98,0.0,0.0,0.0,136.06,0.0,87.59,0.0,1,2
3,Ota6lxKy@wNU7zxx.gZN,2023-03-01,3,0,0,0,32,0,0,32,1193286860,,80,2,94,1,74,52500,1,2.817602,8,94,988.02,94,988.02,94,988.02,111.88,0.0,0.0,174.98,0.0,160.16,0.0,6,40,224.78,28,155.08,40,224.78,18.22,0.0,0.0,117.48,0.0,0.0,0.0,9,92,628.84,62,506.86,92,628.84,0.0,0.0,0.0,75.0,0.0,0.0,0.0,2,12,62.3,12,62.3,12,62.3,0.0,0.0,0.0,6.78,0.0,0.0,0.0,1,2
4,pqEtX3Dc1uO@fBj8k.QLv,2023-03-01,3,0,0,0,33,0,5,150,712455142,,1011,2,94,3,51,27500,1,1.663906,2,15,304.49,7,83.21,15,304.49,0.0,0.0,0.0,91.54,0.0,0.0,0.0,2,5,110.2,4,108.94,5,110.2,59.47,0.0,0.0,1.26,0.0,0.0,0.0,3,38,576.21,30,390.5,38,576.21,224.49,0.0,0.0,20.57,0.0,138.16,0.0,1,5,135.17,4,125.18,5,135.17,89.98,0.0,0.0,45.19,0.0,0.0,0.0,1,2


In [102]:
# Build the out-of-time (OOT) evaluation inputs from `data_OOT`:
#   * y: the `unsubscribe_or_not` label, cast to int
#   * X: every remaining column except the label and the
#        email / date / customer-id columns, which are not model features
y_test_data_OOT = data_OOT["unsubscribe_or_not"].astype(int)
X_test_data_OOT = data_OOT.drop(
    columns=["email_addr", "vantage_date", "cust_id", "unsubscribe_or_not"]
)
# Only the split objects are used from here on; release the name of the full frame.
# NOTE: this makes the cell non-idempotent — re-running it without reloading
# `data_OOT` raises NameError.
del data_OOT

In [103]:
# Force a full garbage-collection pass (run right after `data_OOT` was deleted).
# gc.collect() returns the number of unreachable objects it found; the count is
# stored so the cell does not echo it as output.
collected_objects = gc.collect()

In [104]:
# Sanity check: the feature matrix and the label vector must have the same row count.
X_test_data_OOT.shape, y_test_data_OOT.shape

((5699684, 74), (5699684,))

##### Testing on XGBC

In [105]:
# Evaluate the `xgbc_y` model on the OOT features/labels.
# `model_testing` is defined elsewhere in this notebook; judging by this cell's
# output it prints accuracy / precision / recall / F1 and the confusion matrix.
# Its three return values are unpacked as: a Gini value, a per-group summary
# table, and a row-level frame of actuals and predicted scores.
gini_oot_y, lorenz_oot_y, data_oot_y = model_testing(X_test_data_OOT, y_test_data_OOT, xgbc_y)

Accuracy score : 0.9966282341266639, Precision score : 0.030973451327433628, Recall score : 0.0003683047458697254, f1 score : 0.0007279534109816973
Confusion matrix : [[5680459     219]
 [  18999       7]]


In [106]:
# Gini value returned by `model_testing` for `xgbc_y` on the OOT data.
gini_oot_y

0.47340674487239354

In [107]:
# Per-group summary for `xgbc_y` on the OOT data (groups 1/3/5/10/20/100),
# with per-group and cumulative counts and unsubscribe rates.
lorenz_oot_y

Unnamed: 0,grp,y_pred_xgbc_y_oos,unsubscribe_or_not,num_mail_id,actual_not_unsubscribed,actual_unsubscribed,per_unsubscribed,y_pred_xgbc_oot_cum,unsubscribe_or_not_cum,num_mail_id_cum,actual_not_unsubscribed_cum,actual_unsubscribed_cum,per_unsubscribed_cum
0,1,0.115163,0.016334,56997,56066,931,0.016334,0.115163,0.016334,56997,56066,931,0.016334
1,3,0.072911,0.00457,113994,113473,521,0.00457,0.086995,0.008492,170991,169539,1452,0.008492
2,5,0.044645,0.015299,113994,112250,1744,0.015299,0.070055,0.011215,284985,281789,3196,0.011215
3,10,0.024366,0.015415,284984,280591,4393,0.015415,0.047211,0.013315,569969,562380,7589,0.013315
4,20,0.013917,0.004914,569968,567167,2801,0.004914,0.030564,0.009115,1139937,1129547,10390,0.009115
5,100,0.004523,0.00189,4559747,4551131,8616,0.00189,0.009731,0.003335,5699684,5680678,19006,0.003335


In [108]:
# Row-level results for `xgbc_y`: actual label, predicted score, rank, and the
# percentile (`perc_grp`) / custom group (`grp`) each row was assigned to.
data_oot_y

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
3666464,0,0.810112,0,1,1
3100684,0,0.772439,1,1,1
2944767,0,0.756344,2,1,1
2892457,0,0.747259,3,1,1
3967282,0,0.742990,4,1,1
...,...,...,...,...,...
1475533,0,0.000039,5699679,100,100
1407051,0,0.000039,5699680,100,100
1109764,0,0.000039,5699681,100,100
1220045,0,0.000039,5699682,100,100


In [109]:
# Rows assigned to percentile group 1 (the highest predicted scores in the displayed output).
data_oot_y[data_oot_y["perc_grp"] == 1]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
3666464,0,0.810112,0,1,1
3100684,0,0.772439,1,1,1
2944767,0,0.756344,2,1,1
2892457,0,0.747259,3,1,1
3967282,0,0.742990,4,1,1
...,...,...,...,...,...
1051050,0,0.074078,56992,1,1
1050870,0,0.074078,56993,1,1
1050863,0,0.074078,56994,1,1
1050864,0,0.074078,56995,1,1


In [110]:
# True positives at the 0.5 cut-off inside the top percentile group:
# rows with perc_grp <= 1, predicted score >= 0.5, and an actual unsubscribe.
# A single combined boolean mask selects the same rows (in the same order) as
# the earlier triple-nested chained indexing, but evaluates each condition once
# instead of rebuilding the intermediate sub-frames several times.
data_oot_y.loc[
    (data_oot_y["perc_grp"] <= 1)
    & (data_oot_y["y_pred_xgbc_y_oos"] >= 0.5)
    & (data_oot_y["unsubscribe_or_not"] == 1)
]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
3196526,1,0.641249,38,1,1
4342105,1,0.615943,62,1,1
5687706,1,0.584996,90,1,1
5315050,1,0.583274,91,1,1
3132146,1,0.573309,103,1,1
3991031,1,0.563548,112,1,1
3272195,1,0.530209,166,1,1


##### Testing on XGBC with spw (scale_pos_weight)

In [111]:
# Evaluate the `xgbc1_y` model (the "with spw" variant per the section heading)
# on the same OOT features/labels.
# `model_testing` is defined elsewhere in this notebook; judging by this cell's
# output it prints accuracy / precision / recall / F1 and the confusion matrix.
# Its three return values are unpacked as: a Gini value, a per-group summary
# table, and a row-level frame of actuals and predicted scores.
gini_oot1_y, lorenz_oot1_y, data_oot1_y = model_testing(X_test_data_OOT, y_test_data_OOT, xgbc1_y)

Accuracy score : 0.7684429522759507, Precision score : 0.008114174734960152, Recall score : 0.5645059454908976, f1 score : 0.015998389573982676
Confusion matrix : [[4369153 1311525]
 [   8277   10729]]


In [112]:
# Gini value returned by `model_testing` for `xgbc1_y` on the OOT data.
gini_oot1_y

0.4411449958684791

In [113]:
# Per-group summary for `xgbc1_y` on the OOT data (groups 1/3/5/10/20/100),
# with per-group and cumulative counts and unsubscribe rates.
lorenz_oot1_y

Unnamed: 0,grp,y_pred_xgbc_y_oos,unsubscribe_or_not,num_mail_id,actual_not_unsubscribed,actual_unsubscribed,per_unsubscribed,y_pred_xgbc_oot_cum,unsubscribe_or_not_cum,num_mail_id_cum,actual_not_unsubscribed_cum,actual_unsubscribed_cum,per_unsubscribed_cum
0,1,0.914735,0.016229,56997,56072,925,0.016229,0.914735,0.016229,56997,56072,925,0.016229
1,3,0.889419,0.005299,113994,113390,604,0.005299,0.897858,0.008942,170991,169462,1529,0.008942
2,5,0.82569,0.014755,113994,112312,1682,0.014755,0.868991,0.011267,284985,281774,3211,0.011267
3,10,0.720269,0.015068,284984,280690,4294,0.015068,0.79463,0.013167,569969,562464,7505,0.013167
4,20,0.585361,0.004562,569968,567368,2600,0.004562,0.689996,0.008865,1139937,1129832,10105,0.008865
5,100,0.264205,0.001952,4559747,4550846,8901,0.001952,0.349363,0.003335,5699684,5680678,19006,0.003335


In [114]:
# Row-level results for `xgbc1_y`: actual label, predicted score, rank, and the
# percentile (`perc_grp`) / custom group (`grp`) each row was assigned to.
data_oot1_y

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
2892457,0,0.996509,0,1,1
4362192,0,0.993868,1,1,1
4698805,0,0.993809,2,1,1
4385468,0,0.993647,3,1,1
1976253,0,0.993410,4,1,1
...,...,...,...,...,...
1198660,0,0.000476,5699679,100,100
4834234,0,0.000475,5699680,100,100
5290145,0,0.000427,5699681,100,100
188768,0,0.000357,5699682,100,100


In [125]:
# Rows assigned to percentile group 100 (the lowest predicted scores in the displayed output).
data_oot1_y[data_oot1_y["perc_grp"] == 100]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
763498,0,0.002292,5642688,100,100
2130060,0,0.002292,5642689,100,100
810549,0,0.002292,5642690,100,100
763503,0,0.002292,5642691,100,100
2670176,0,0.002292,5642692,100,100
...,...,...,...,...,...
1198660,0,0.000476,5699679,100,100
4834234,0,0.000475,5699680,100,100
5290145,0,0.000427,5699681,100,100
188768,0,0.000357,5699682,100,100


In [131]:
# Upper bound on the percentile group to inspect. The displayed `perc_grp`
# values run from 1 to 100, so 100 keeps every row; lower it to focus on the
# top-scored percentiles.
num = 100
# True positives at the 0.5 cut-off within the selected percentile groups:
# rows with perc_grp <= num, predicted score >= 0.5, and an actual unsubscribe.
# A single combined boolean mask selects the same rows (in the same order) as
# the earlier triple-nested chained indexing, but evaluates each condition once
# instead of rebuilding the intermediate sub-frames several times.
data_oot1_y.loc[
    (data_oot1_y["perc_grp"] <= num)
    & (data_oot1_y["y_pred_xgbc_y_oos"] >= 0.5)
    & (data_oot1_y["unsubscribe_or_not"] == 1)
]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
4342105,1,0.991138,26,1,1
5315050,1,0.990556,33,1,1
3196526,1,0.989109,87,1,1
551071,1,0.988947,92,1,1
3132146,1,0.988678,106,1,1
...,...,...,...,...,...
4121668,1,0.500115,1321616,24,100
1334992,1,0.500115,1321633,24,100
2890871,1,0.500105,1321696,24,100
2982453,1,0.500096,1321729,24,100


In [315]:
# NOTE(review): disabled cleanup cell. The statements below would delete the
# `*_oos_y` / `*_oos1_y` result objects (presumably the out-of-sample
# counterparts of the OOT results — confirm). Either re-enable them to free
# memory or remove this cell; as written it executes nothing.
# del data_oos_y
# del lorenz_oos_y
# del gini_oos_y

# del data_oos1_y
# del lorenz_oos1_y
# del gini_oos1_y

In [316]:
# Rows assigned to percentile group 1 for the `xgbc1_y` results.
# NOTE(review): this cell's saved output (execution count 316) does not match
# the earlier views of `data_oot1_y` (execution counts 114-131): the top-ranked
# index differs, and ranks here exceed the 56,997 rows reported for group 1.
# Presumably `data_oot1_y` was reassigned by a cell run in between — confirm
# with Restart Kernel -> Run All before relying on this table.
data_oot1_y[data_oot1_y["perc_grp"] == 1]

Unnamed: 0,unsubscribe_or_not,y_pred_xgbc_y_oos,rank,perc_grp,grp
5184170,0,0.995602,0,1,1
113253,0,0.993240,1,1,1
5828341,0,0.993141,2,1,1
4599839,0,0.992200,3,1,1
5510684,0,0.990804,4,1,1
...,...,...,...,...,...
1690267,0,0.879992,58284,1,1
1441965,0,0.879992,58285,1,1
2783814,0,0.879992,58286,1,1
2783818,0,0.879992,58287,1,1


In [320]:
# X_test_data_OOT.columns