In [None]:
#import lib
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import StratifiedKFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Show the directory the kernel is currently running in
import os
working_dir = os.getcwd()
print(working_dir)


In [None]:
# Switch to the directory that the relative 'train.csv' / 'test.csv' reads resolve against.
# The location can be overridden with the BANK_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original path is kept as the default.
DATA_DIR = os.environ.get('BANK_DATA_DIR', 'C:\\Users\\saite\\Documents\\Bank data AV hack')
os.chdir(DATA_DIR)

In [None]:
# Read the train and test CSV files (relative to the working directory) with pandas
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Give test a placeholder 'Responders' column filled with NaN
test['Responders'] = np.nan

In [None]:
# Stack train and test so the feature engineering below is applied to both at once.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# default row labels of both CSV frames are kept, so labels repeat and any later
# label-based alignment or assignment on `combin` becomes ambiguous.
combin = pd.concat([train, test], ignore_index=True)
# Encode every cell equal to 'Y' / 'N' anywhere in the frame as 1 / 0
combin.replace({'Y': 1, 'N': 0}, inplace=True)

In [None]:
# Replace all NaNs in the time-series variables with zero.
# The time-series block is addressed by position: 39 * 6 columns starting at column 11.
# The result is assigned back by column name: calling fillna(..., inplace=True) on an
# .iloc slice operates on a temporary object and is not guaranteed to modify `combin`.
ts_cols = combin.columns[11:11 + 39 * 6]
combin[ts_cols] = combin[ts_cols].fillna(0)

In [None]:
# Base names of the columns at positions 11..49: each column name with its last character removed
d_c_cols = [name[:-1] for name in combin.columns[11:50]]

In [None]:
# Create new features from the last 3 months: mean, max, min and standard deviation.
# For each base name in d_c_cols the monthly columns are selected by exact name
# (base + '1', base + '2', base + '3'). A substring test (`col in column`) would pull in
# every month, plus any other column whose name merely contains the base name
# (including aggregates created by earlier iterations of this loop).
# NOTE(review): assumes the suffixes 1..3 denote the three most recent months -- confirm.
N_RECENT_MONTHS = 3
for col in d_c_cols:
    candidates = [col + str(month) for month in range(1, N_RECENT_MONTHS + 1)]
    col2 = [name for name in candidates if name in combin.columns]
    if not col2:
        # Fail loudly rather than silently emitting all-NaN features
        raise KeyError('No monthly columns found for base name ' + repr(col))
    recent = combin[col2]
    combin['Last_3_month_average_' + str(col)] = recent.mean(axis=1)
    combin['Last_3_month_max_' + str(col)] = recent.max(axis=1)
    combin['Last_3_month_min_' + str(col)] = recent.min(axis=1)
    combin['Last_3_month_std_' + str(col)] = recent.std(axis=1)

In [None]:
# Month-over-month change in customer-initiated debit amount, scaled by (1 + BAL_prev<i>)
for month in range(1, 6):
    this_month = str(month)
    next_suffix = str(month + 1)
    debit_delta = combin['custinit_DR_amt_prev' + next_suffix] - combin['custinit_DR_amt_prev' + this_month]
    scale = 1 + combin['BAL_prev' + this_month]
    combin['custinit_DR_amt_Drop_Build_' + this_month] = debit_delta / scale

In [None]:
# For each of the 6 months: (EOP_prev<i> - BAL_prev<i>) / BAL_prev<i>
# NOTE(review): unlike the debit-drop features, the denominator has no +1 guard, so a
# BAL_prev value of 0 would yield inf/NaN here -- confirm whether that is intended.
for month in range(1, 7):
    suffix = str(month)
    avg_balance = combin['BAL_prev' + suffix]
    end_balance = combin['EOP_prev' + suffix]
    combin['EOP_BAL_DIFF_prev' + suffix] = (end_balance - avg_balance) / avg_balance

In [None]:
# Row-wise standard deviation over every column whose name contains 'EOP_BAL_DIFF'
# (columns picked with a conditional list comprehension)
eop_bal_cols = [name for name in combin.columns if 'EOP_BAL_DIFF' in name]
eop_bal_diffs = combin[eop_bal_cols]
combin['STD_EOP_BAL_DIFF_prev'] = eop_bal_diffs.std(axis=1)

In [None]:
# Summary measures over the CR_AMB_Drop_Build_* columns.
# Mean / std / min are taken over the first three columns only.
cr_amb_drop_cols = ['CR_AMB_Drop_Build_1', 'CR_AMB_Drop_Build_2', 'CR_AMB_Drop_Build_3']
cr_amb_drop = combin[cr_amb_drop_cols]
combin['Mean_CR_AMB_Drop_Build'] = cr_amb_drop.mean(axis=1)
combin['STD_CR_AMB_Drop_Build'] = cr_amb_drop.std(axis=1)
combin['MIN_CR_AMB_Drop_Build'] = cr_amb_drop.min(axis=1)

# Weighted count of negative values across all five columns, with weights falling from
# 1.0 (column 1) to 0.2 (column 5). Written as a sum over (column, weight) pairs: the
# previous multi-line expression ended lines with a dangling operator outside any
# parentheses, which is a SyntaxError.
cr_amb_drop_weights = [
    ('CR_AMB_Drop_Build_1', 1),
    ('CR_AMB_Drop_Build_2', 0.8),
    ('CR_AMB_Drop_Build_3', 0.6),
    ('CR_AMB_Drop_Build_4', 0.4),
    ('CR_AMB_Drop_Build_5', 0.2),
]
combin['Weighted_Sum_CR_AMB_Drop_Build'] = sum(
    weight * (combin[name] < 0) for name, weight in cr_amb_drop_weights
)

# Column pairs for the two previous quarters, averaged later in the notebook.
# i_aqb_prev previously listed 'I_AQB_PrevQ1' twice, which made the "mean" just Q1;
# it now pairs Q1 with Q2 like i_cnr_prev does.
i_cnr_prev = ['I_CNR_PrevQ1', 'I_CNR_PrevQ2']
i_aqb_prev = ['I_AQB_PrevQ1', 'I_AQB_PrevQ2']

In [None]:
# Quarter-over-quarter change (PrevQ1 minus PrevQ2) for the I_AQB and I_NRV columns
for metric in ('AQB', 'NRV'):
    latest_quarter = combin['I_{}_PrevQ1'.format(metric)]
    earlier_quarter = combin['I_{}_PrevQ2'.format(metric)]
    combin['I_{}_CHANGE'.format(metric)] = latest_quarter - earlier_quarter

In [None]:
# Row-wise mean over the quarterly CNR columns and over the quarterly AQB columns
for feature_name, quarter_cols in (('Mean_I_CNR_PrevQ', i_cnr_prev),
                                   ('Mean_I_AQB_PrevQ', i_aqb_prev)):
    combin[feature_name] = combin[quarter_cols].mean(axis=1)

In [None]:
# Total transaction count: the averaged credit count plus the averaged debit count
avg_credit_count = combin['Last_3_month_average_count_C_prev']
avg_debit_count = combin['Last_3_month_average_count_D_prev']
combin['Total_Count_txn'] = avg_credit_count + avg_debit_count

In [None]:
# Share of transactions per channel: summed averaged counts for the channel divided by
# Total_Count_txn. The phone/mobile share combines the IB and MB columns.
# NOTE(review): rows where Total_Count_txn is 0 produce NaN/inf -- confirm downstream handling.
channel_sources = [
    ('percent_txn_branch', ['Last_3_month_average_COUNT_BRANCH_C_prev',
                            'Last_3_month_average_COUNT_BRANCH_D_prev']),
    ('percent_txn_atm', ['Last_3_month_average_COUNT_ATM_C_prev',
                         'Last_3_month_average_COUNT_ATM_D_prev']),
    ('percent_txn_phn_mob', ['Last_3_month_average_COUNT_IB_C_prev',
                             'Last_3_month_average_COUNT_IB_D_prev',
                             'Last_3_month_average_COUNT_MB_C_prev',
                             'Last_3_month_average_COUNT_MB_D_prev']),
]
for share_name, source_cols in channel_sources:
    # Plain '+' accumulation so NaN propagates, as with a chained a + b + ... expression
    channel_total = combin[source_cols[0]]
    for extra_col in source_cols[1:]:
        channel_total = channel_total + combin[extra_col]
    combin[share_name] = channel_total / combin['Total_Count_txn']

# Columns combined into a median in the next cell
percent_change_cols = [
    'Percent_Change_in_Credits',
    'Percent_Change_in_FT_Bank',
    'Percent_Change_in_FT_outside',
    'Percent_Change_in_Self_Txn',
    'Percent_Change_in_Big_Expenses',
]

In [None]:
# Row-wise median across the Percent_Change_in_* columns listed in percent_change_cols
percent_changes = combin[percent_change_cols]
combin['Median_Percent_change'] = percent_changes.median(axis=1)

In [None]:
# Label-encode selected categorical columns: define the value mappings first,
# then apply each mapping to its column(s) in a single loop.
dict_hnw = {'3_Classic': 3, '2_Preferred': 2, '1_Imperia': 1}
dict_fw = {'HIGH': 3, 'MEDIUM': 2, 'LOW': 1}
dict_eng = {'HIGH': 3, 'MEDIUM': 2, 'LOW': 1, 'NO': -1}
dict_rbi_ca = {'METROPOLITAN': 3, 'URBAN': 2, 'SEMI-URBAN': 1, 'RURAL': 0}
dict_billpay = {'A_MISSING': -9999, 'B_1': 1, 'C_2': 2, 'D_3': 3}
dict_gender = {'Male': 1, 'Female': 0, 'Missin': -1}

# (column, mapping) pairs; dict_billpay is shared by four columns
column_encodings = [
    ("HNW_CATEGORY", dict_hnw),
    ("FINAL_WORTH_prev1", dict_fw),
    ('ENGAGEMENT_TAG_prev1', dict_eng),
    ('RBI_Class_Audit', dict_rbi_ca),
    ('Billpay_Active_PrevQ1_N', dict_billpay),
    ('Billpay_Reg_ason_Prev1_N', dict_billpay),
    ('Charges_cnt_PrevQ1_N', dict_billpay),
    ('FRX_PrevQ1_N', dict_billpay),
    ('gender_bin', dict_gender),
]
for column_name, value_map in column_encodings:
    combin.replace({column_name: value_map}, inplace=True)

In [None]:
# Convert the city variable into a size tier based on how many rows share that city:
#   >= 9000 -> 3,  3500..8999 -> 2,  1000..3499 -> 1,  < 1000 -> 0
# Rows whose city is missing stay NaN (value_counts ignores NaN, so they have no count).
# The tier is computed in one vectorised step instead of chained assignments of the form
# `combin.new_city[mask] = ...`, which write through an intermediate object and leave the
# column as a mix of ints and raw city values. The top tier is lower-inclusive like the
# others, so a city with exactly 9000 rows no longer falls between tiers.
counts = combin.city.value_counts()
city_size = combin['city'].map(counts)
combin['new_city'] = np.select(
    [city_size >= 9000, city_size >= 3500, city_size >= 1000, city_size < 1000],
    [3, 2, 1, 0],
    default=np.nan,
)
# Cast zip to object dtype and take the first three characters via the .str accessor
# NOTE(review): if zip holds numeric values rather than strings, .str[:3] may not give the
# intended 3-digit prefix -- confirm the column's dtype.
zip_as_object = combin['zip'].astype('object')
combin['zip'] = zip_as_object
combin['zip_first_3'] = combin['zip'].str[:3]


In [None]:
# One-hot encode OCCUP_ALL_NEW and append the indicator columns to the frame
occupation_dummies = pd.get_dummies(combin['OCCUP_ALL_NEW'], prefix='OCCUP_ALL_NEW', prefix_sep='_')
combin = pd.concat([combin, occupation_dummies], axis=1)

In [None]:
# Derived loan features.
# Row-wise sum over every column whose name contains 'PREM_CLOSED'
prem_cols = list(combin.filter(like='PREM_CLOSED').columns)
combin['any_prematurely_closed_loans'] = combin[prem_cols].sum(axis=1)

# Row-wise max over an explicit list of "closed" columns
loan_closed_cols = [
    'AGRI_Closed_PrevQ1', 'AL_CNC_Closed_PrevQ1', 'AL_Closed_PrevQ1',
    'BL_Closed_PrevQ1', 'CC_CLOSED_PREVQ1', 'CE_Closed_PrevQ1',
    'CV_Closed_PrevQ1', 'EDU_Closed_PrevQ1', 'GL_Closed_PrevQ1',
    'OTHER_LOANS_Closed_PrevQ1', 'PL_Closed_PrevQ1', 'RD_CLOSED_PREVQ1',
    'FD_CLOSED_PREVQ1', 'TL_Closed_PrevQ1', 'TWL_Closed_PrevQ1',
    'DEMAT_CLOSED_PREV1YR', 'SEC_ACC_CLOSED_PREV1YR',
]
combin['any_closed'] = combin[loan_closed_cols].max(axis=1)

# Row-wise max over every column whose name contains 'LIVE'
live_loan_cols = list(combin.filter(like='LIVE').columns)
combin['any_live_loans'] = combin[live_loan_cols].max(axis=1)

In [None]:
# Replace all remaining missing values with a large negative sentinel (-9999).
# Order matters: anything that can create NaN runs BEFORE the fill, otherwise NaNs
# come back after the "fill everything" step.

# Cells that are exactly '>' are mapped to the sentinel
combin.replace('>', -9999, inplace=True)

# Force these columns to numeric; unparseable entries become NaN and are filled below
dt_cols = ['Req_Resolved_PrevQ1', 'Query_Resolved_PrevQ1', 'Complaint_Resolved_PrevQ1']
for dt_col in dt_cols:
    combin[dt_col] = pd.to_numeric(combin[dt_col], errors='coerce')

# The ratio features built earlier divide by columns that can be zero, which yields
# +/-inf; treat those as missing too, since fillna alone does not touch them
combin.replace([np.inf, -np.inf], np.nan, inplace=True)
combin.fillna(-9999, inplace=True)

# Drop the target, the id and the raw columns that have been re-encoded above
combin.drop(['Responders','UCIC_ID','zip','city','OCCUP_ALL_NEW'], axis=1, inplace=True)

In [None]:
# Split the combined frame back into train and test by position.
# train rows were concatenated first, so the boundary is len(train); deriving it from
# the data avoids a hard-coded row count that silently mis-splits any other file size.
n_train = len(train)
train_new = combin.iloc[:n_train, :]
test_new = combin.iloc[n_train:, :]
# Target vector taken from the original train frame
y_all = train.Responders

In [None]:
# Drop features whose standard deviation on the training rows equals the threshold (0),
# then align test to the surviving training columns
threshold = 0
train_std = train_new.std()
constant_cols = train_std[train_std == threshold].index.values
train_new = train_new.drop(constant_cols, axis=1)
test_new = test_new[train_new.columns]

In [None]:
# LightGBM classifier used to rank features
bst_lgb_2 = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.1,
    n_estimators=300,
    max_depth=9,
    num_leaves=255,
    colsample_bytree=0.8,
    n_jobs=8,
    seed=99,
)
bst_lgb_2.fit(train_new, y_all)

# Keep the features selected by SelectFromModel with a 'median' threshold
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(bst_lgb_2, threshold='median')
sfm.fit(train_new, y_all)

# Boolean support mask -> names of the retained features
z = sfm.get_support(indices=False)
zz = pd.DataFrame({'Feature': train_new.columns.values, 'ret': z})
kept_rows = zz['ret'] == 1
cols = np.array(zz.loc[kept_rows, 'Feature'])

# Rebuild both frames from the transformed arrays, labelled with the retained names
train_new, test_new = (
    pd.DataFrame(sfm.transform(frame), columns=cols)
    for frame in (train_new, test_new)
)

In [None]:
# Shared settings and containers for the cross-validated models below
kfold = 10
nrounds = 5000

# Plain arrays for the learners, plus the feature names for selecting test columns
X, y = train_new.values, y_all.values
features = train_new.columns

# Accumulators for test-set predictions
probs = 0
y_test_pred_xgb = 0
y_test_pred_lgb = 0

# Frames holding UCIC_ID plus one zero-initialised prediction column per model
y_valid_pred_xgb = train['UCIC_ID'].to_frame().assign(xgb=0)
y_valid_pred_lgb = train['UCIC_ID'].to_frame().assign(lgb=0)

In [None]:
# XGBoost: train one model per stratified fold and average their test predictions
# in logit space.
sub_xgb = test['UCIC_ID'].to_frame()
sub_xgb['Responders'] = 0

params = {
    'objective': 'binary:logistic',
    'eta': 0.01,
    'booster': 'gbtree',
    'silent': True,
    'max_depth': 6,          # overridden per fold below
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'gpu_hist',
    'eval_metric': 'auc',
    'gamma': 0.05,
    'min_child_weight': 7,
}

# Take average of different xgboost models for better generalization
skf = StratifiedKFold(n_splits=kfold, random_state=0, shuffle=True)
np.random.seed(0)

# The test matrix is the same for every fold, so build it once
d_test = xgb.DMatrix(test_new[features].values)

# Reset the accumulator here so re-running this cell does not add to an old total
y_test_pred_xgb = 0
# Keeps the logit finite if a model outputs exactly 0 or 1
LOGIT_EPS = 1e-7

# Using varied models for better generalization: tree depth changes with the fold index
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    if i <= 3:
        params['max_depth'] = 6
    elif i <= 6:
        params['max_depth'] = 7
    else:
        params['max_depth'] = 8
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100,
                          maximize=True, verbose_eval=100)

    # Generate validation predictions for this fold
    pred = xgb_model.predict(xgb.DMatrix(X_valid))
    y_valid_pred_xgb.iloc[test_index, 1] = pred

    # Test predictions for this fold.
    # NOTE(review): uses 50 trees beyond the early-stopping optimum -- confirm this is intended.
    probs = xgb_model.predict(d_test, ntree_limit=xgb_model.best_ntree_limit+50)
    probs = np.clip(np.asarray(probs, dtype=np.float64), LOGIT_EPS, 1 - LOGIT_EPS)
    # Accumulate log-odds only; averaging and the sigmoid happen once, after the loop
    y_test_pred_xgb += np.log(probs / (1 - probs))

# Average the summed log-odds over the folds, then map back to probabilities.
# Doing this inside the loop would rescale and squash the running total on every fold.
y_test_pred_xgb /= kfold
y_test_pred_xgb = 1 / (1 + np.exp(-y_test_pred_xgb))

sub_xgb['Responders'] = y_test_pred_xgb
sub_xgb.to_csv('xgb_tuned_averaged.csv', index=False, float_format='%.5f')

In [None]:
# LightGBM: one model per stratified fold; test predictions are averaged over the folds
kfold = 10
sub_lgb = test['UCIC_ID'].to_frame().assign(Responders=0)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'max_depth': 9,
    'num_leaves': 255,
    'max_bin': 20,
    'min_data': 500,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 10,
}

skf = StratifiedKFold(n_splits=kfold, random_state=5, shuffle=True)
for fold, (fit_idx, eval_idx) in enumerate(skf.split(X, y)):
    # Using varied models for better generalization:
    # folds 0-3, 4-6 and 7+ each get their own (num_leaves, max_depth) pair
    if fold > 6:
        leaves, depth = 255, 9
    elif fold > 3:
        leaves, depth = 90, 0
    else:
        leaves, depth = 127, 0
    params.update(num_leaves=leaves, max_depth=depth)

    print(' lgb kfold: {}  of  {} : '.format(fold + 1, kfold))
    fit_set = lgb.Dataset(X[fit_idx], label=y[fit_idx])
    eval_set = lgb.Dataset(X[eval_idx], label=y[eval_idx])
    lgb_model = lgb.train(params, fit_set, nrounds, eval_set,
                          verbose_eval=100, early_stopping_rounds=100)

    # Add this fold's share (1 / kfold) of the averaged test prediction
    fold_pred = lgb_model.predict(test_new[features].values,
                                  num_iteration=lgb_model.best_iteration)
    sub_lgb['Responders'] += fold_pred / (kfold)

sub_lgb.to_csv('lgb_tuned_averaged.csv', index=False, float_format='%.5f')

In [None]:
# Blend the two saved submissions on normalised rank instead of raw score
def _rank_fraction(scores):
    """Return each score's ascending rank divided by the number of scores."""
    return scores.rank(ascending=True) / scores.shape[0]


xgb_pred = pd.read_csv('xgb_tuned_averaged.csv')
lgb_pred = pd.read_csv('lgb_tuned_averaged.csv')
xgb_pred['rank/length'] = _rank_fraction(xgb_pred['Responders'])
lgb_pred['rank/length'] = _rank_fraction(lgb_pred['Responders'])

# Weighted blend: 0.45 for the XGBoost ranks, 0.55 for the LightGBM ranks
blended_rank = 0.45 * xgb_pred['rank/length'] + 0.55 * lgb_pred['rank/length']
sub = pd.DataFrame({'UCIC_ID': test.UCIC_ID, 'Responders': blended_rank})
sub.to_csv('sub_ens_rank_norm_xgb_lgb.csv', index=False, float_format='%.6f')