# Ensemble Prediction Model

This notebook shows how to build an ensemble (soft-voting) classifier in Python.

The data preprocessing is taken from the [AMEX LightGBM Quickstart](https://www.kaggle.com/code/ambrosm/amex-lightgbm-quickstart) notebook.
Much of the modelling code is adapted from [A Data Science Framework: To Achieve 99% Accuracy](https://www.kaggle.com/code/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
import seaborn as sns
from cycler import cycler
from IPython.display import display
import datetime
import scipy.stats
import warnings
from colorama import Fore, Back, Style
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibrationDisplay
from lightgbm import LGBMClassifier, log_evaluation

# Plot styling: blue axes background, white text, and a colour cycle whose
# first entry is replaced by yellow (the remaining default colours are kept).
default_cycle_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
plt.rcParams.update({
    'axes.facecolor': '#0057b8',  # blue
    'axes.prop_cycle': cycler(color=['#ffd700'] + default_cycle_colors[1:]),
    'text.color': 'w',
})

# When False, the data-loading cell reads only the training file and `test` stays None.
INFERENCE = True # set to False if you only want to cross-validate


In [None]:
def amex_metric(y_true, y_pred, return_components=False) -> float:
    """Competition metric: mean of normalized weighted Gini and top-4 % capture.

    Rows are ranked by descending prediction. Every row whose target equals 0
    carries weight 20, every other row weight 1. Two components are computed
    on that ranking:

    * ``g`` -- weighted Gini of the predictions divided by the weighted Gini
      of a perfect ranking (predictions equal to the targets);
    * ``d`` -- share of all positives found among the top-ranked rows whose
      cumulative weight does not exceed 4 % of the total weight.

    Parameters
    ----------
    y_true, y_pred : array-like
        Labels and predicted scores of equal length. Anything accepted by
        ``np.asarray`` works (ndarray, list, pandas Series); inputs are
        flattened, so column vectors are fine.
    return_components : bool, default False
        If True, return ``(g, d, 0.5 * (g + d))`` instead of the mean only.

    Returns
    -------
    float or tuple of float
        ``0.5 * (g + d)``, or the three-tuple described above.
    """
    def _row_weights(target):
        # Vectorised form of "20 if target == 0 else 1".
        return pd.Series(np.where(target == 0, 20, 1), index=target.index)

    def top_four_percent_captured(df) -> float:
        """Corresponds to the recall for a threshold of 4 %"""
        weight = _row_weights(df['target'])
        four_pct_cutoff = int(0.04 * weight.sum())
        within_cutoff = weight.cumsum() <= four_pct_cutoff
        is_positive = df['target'] == 1
        return (is_positive & within_cutoff).sum() / is_positive.sum()

    def weighted_gini(df) -> float:
        """Weighted area between the Lorentz curve of `df`'s ordering and the diagonal."""
        weight = _row_weights(df['target'])
        random = (weight / weight.sum()).cumsum()
        weighted_pos = df['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    def normalized_weighted_gini(df) -> float:
        """Corresponds to 2 * AUC - 1"""
        # Reference ordering: rank the rows by the true target itself.
        perfect = pd.DataFrame({'target': df['target'], 'prediction': df['target']})
        perfect = perfect.sort_values('prediction', ascending=False)
        return weighted_gini(df) / weighted_gini(perfect)

    # np.asarray accepts lists and Series as well as ndarrays, and avoids
    # calling the deprecated Series.ravel.
    df = pd.DataFrame({'target': np.asarray(y_true).ravel(),
                       'prediction': np.asarray(y_pred).ravel()})
    df = df.sort_values('prediction', ascending=False)
    g = normalized_weighted_gini(df)
    d = top_four_percent_captured(df)

    if return_components:
        return g, d, 0.5 * (g + d)
    return 0.5 * (g + d)

def lgb_amex_metric(y_true, y_pred):
    """Wrap `amex_metric` in the tuple shape of a LightGBM custom eval function.

    Returns a three-tuple: the metric name ``'amex'``, the score computed by
    ``amex_metric(y_true, y_pred)``, and ``True`` (presumably LightGBM's
    "higher is better" flag -- confirm against the LightGBM docs).
    """
    score = amex_metric(y_true, y_pred)
    higher_is_better = True
    return 'amex', score, higher_is_better

# Reading and preprocessing the data

We read the data from @munumbutt's [AMEX-Feather-Dataset](https://www.kaggle.com/datasets/munumbutt/amexfeather). Then we create two groups of features:
- Selected features averaged over all statements of a customer
- Selected features taken from the last statement of a customer

The code has been optimized for memory efficiency rather than readability. In particular, `.iloc[mask_array, columns]` needs much less RAM than the groupby construction used in the previous version of the notebook.


In [None]:
%%time
# Columns that are averaged over all statements of a customer; the loading loop
# below renames each one to "<name>_avg".
features_avg = ['B_1', 'B_11', 'B_16', 'B_17', 'B_18', 'B_2', 'B_20',
                'B_28', 'B_3', 'B_4', 'B_5', 'B_7', 'B_9', 'D_112',
                'D_121', 'D_141', 'D_39', 'D_41', 'D_42', 'D_43',
                'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 
                'D_50', 'D_51', 'D_53', 'D_54', 'D_56', 'D_58', 
                'D_59', 'D_60', 'D_91', 'P_2', 'P_3', 'R_1', 'R_2', 
                'R_27', 'R_3', 'R_7', 'S_11', 'S_26', 'S_3', 'S_5']
# Columns taken from the last statement of a customer; the loading loop below
# renames each one to "<name>_last".
features_last = ['B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_15', 'B_16',
                 'B_17', 'B_18', 'B_19', 'B_2', 'B_20', 'B_22', 'B_23',
                 'B_24', 'B_25', 'B_26', 'B_27', 'B_28', 'B_29', 'B_3',
                 'B_32', 'B_33', 'B_36', 'B_38', 'B_39', 'B_4', 'B_40',
                 'B_41', 'B_42', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9',
                 'D_102', 'D_103', 'D_105', 'D_106', 'D_107', 'D_109',
                 'D_112', 'D_115', 'D_117', 'D_118', 'D_119', 'D_120',
                 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 
                 'D_129', 'D_132', 'D_133', 'D_135', 'D_136', 'D_137', 
                 'D_140', 'D_141', 'D_143', 'D_145', 'D_39', 'D_41',
                 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48',
                 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55',
                 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_63',
                 'D_64', 'D_66', 'D_70', 'D_72', 'D_73', 'D_74', 'D_75',
                 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_82', 'D_83',
                 'D_84', 'D_86', 'D_91', 'D_92', 'D_93', 'D_94', 'D_96',
                 'P_2', 'P_3', 'R_1', 'R_10', 'R_11', 'R_12', 'R_13',
                 'R_14', 'R_15', 'R_17', 'R_18', 'R_19', 'R_2', 'R_20', 
                 'R_21', 'R_22', 'R_24', 'R_25', 'R_26', 'R_27', 'R_3',
                 'R_4', 'R_5', 'R_7', 'R_8', 'R_9', 'S_11', 'S_12',
                 'S_13', 'S_15', 'S_17', 'S_20', 'S_22', 'S_23', 
                 'S_24', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_6',
                 'S_7', 'S_8', 'S_9']

# Build one row per customer for train and (optionally) test:
# last-statement features plus per-customer averages.
train_test = [None, None] # first element is train, second element is test
# With INFERENCE set, test (index 1) is processed before train (index 0);
# otherwise only train is read and the test slot stays None.
for i in [1, 0] if INFERENCE else [0]:
    train_test[i] = pd.read_feather(['../input/amexfeather/train_data.ftr',
                                     '../input/amexfeather/test_data.ftr'][i])
    # pop() removes customer_ID from the frame so it is not carried along as a feature
    cid = pd.Categorical(train_test[i].pop('customer_ID'), ordered=True)
    # True where the next row belongs to a different customer. np.roll wraps, so the
    # final row is compared with the first row's ID.
    # NOTE(review): assumes statements are grouped by customer and ordered in time -- confirm.
    last = (cid != np.roll(cid, -1)) # mask for last statement of every customer
    if i == 0: # train
        # one label per customer, taken from the customer's last statement row
        target = train_test[0].loc[last, 'target']
    gc.collect()
    print('Read', i)
    # per-customer mean of the selected columns, renamed to "<name>_avg"
    df_avg = (train_test[i][features_avg]
              .groupby(cid)
              .mean()
              .rename(columns={f: f"{f}_avg" for f in features_avg})
             )
    gc.collect()
    print('Computed avg', i)
    # Overwrite the raw statement-level frame with the last-statement rows only,
    # renamed to "<name>_last" and indexed by customer ID.
    train_test[i] = (train_test[i].loc[last, features_last]
                     .rename(columns={f: f"{f}_last" for f in features_last})
                     .set_index(np.asarray(cid[last]))
                    )
    gc.collect()
    print('Computed last', i)
    # column-wise concat: rows are matched on the customer-ID index
    train_test[i] = pd.concat([train_test[i], df_avg], axis=1)
    # drop the per-iteration temporaries before the next (large) file is read
    del df_avg, cid, last

train, test = tuple(train_test)
del train_test
# `test` is None when INFERENCE is False, hence the guard
if INFERENCE: print('Shapes:', train.shape, target.shape, test.shape)


# Preprocessing



In [None]:
# float16 columns are treated as numeric features; every column that is
# neither float16 nor int64 is treated as categorical.
train_numericcols = list(train.select_dtypes(include='float16').columns)
train_catcols = list(train.select_dtypes(exclude=['float16', 'int64']).columns)

## Correlation
Need to understand the correlation between the numerical variables and drop ones with correlation greater than 0.9

In [None]:
import numpy as np

# Features whose absolute correlation with an earlier feature exceeds this are dropped
CORR_THRESHOLD = 0.9

# Absolute pairwise correlation between the numeric features
corr_matrix = train[train_numericcols].corr().abs()
# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair is
# looked at once. The builtin `bool` replaces `np.bool`, which was removed from
# NumPy and raises AttributeError on current releases.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
# A column is dropped when it correlates above the threshold with any column before it
to_drop = [column for column in upper.columns if (upper[column] > CORR_THRESHOLD).any()]

# Drop the redundant features from the training frame
train = train.drop(columns=to_drop)

In [None]:
# Report how many correlated columns were removed
print(f'{len(to_drop)} Columns were dropped')

In [None]:
# `target` was selected with the same last-statement mask as the rows of `train`.
# to_list() discards its index, so the labels are attached by position rather
# than aligned on the index.
train['target'] = target.to_list()

# Missing Values

While some algorithms handle missing values automatically, most do not. For missing values we therefore follow a simple rule:

* Numeric columns: replace missing values with the column median.
* Categorical columns: encode each category by its weight of evidence (WOE); missing values are treated as a category of their own.

## Numeric

In [None]:
# Numeric columns that survived the correlation filter
train_numericcols2 = set(train_numericcols) - set(to_drop)

# Replace missing values with the training median.
# - The result is assigned back to the column: the chained
#   `frame[col].fillna(..., inplace=True)` form may act on a temporary copy
#   and is not guaranteed to write through to the frame.
# - The test frame is filled with the training median, so both frames go
#   through the same transformation.
# - `test` is None when INFERENCE is False, hence the guard.
for col in train_numericcols2:
    col_median = train[col].median()
    train[col] = train[col].fillna(col_median)
    if test is not None:
        test[col] = test[col].fillna(col_median)

## Categorical

In [None]:
# WOE transformation of categorical variables 

def get_bivar_tables(train_data, col_names, dep_var):
    """Build a weight-of-evidence (WOE) / information-value table per categorical column.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame holding the categorical columns and the binary dependent variable.
        It is modified in place: missing values in each listed column become the
        category "Missing", and a helper column ``'bins'`` is (over)written.
    col_names : iterable of str
        Names of categorical-dtype columns to tabulate.
    dep_var : str
        Name of the 0/1 target column.

    Returns
    -------
    dict
        Maps each column name to a DataFrame with one row per bin and the columns
        ``bins``, ``<dep_var>`` (row count), ``events``, ``non_events``,
        ``event_rate`` (percent), ``event_capture``, ``non_event_capture``,
        ``woe`` and ``IV``. Undefined or infinite WOE/IV values are set to 0.
    """
    # Local result dict: the function no longer depends on a module-level
    # `bivar_tables` existing before it is called.
    tables = {}
    non_finite = [np.nan, np.inf, -np.inf]
    for col_name in col_names:
        print(col_name)
        column = train_data[col_name]
        # add_categories raises if the category already exists (e.g. on a re-run)
        if "Missing" not in column.cat.categories:
            column = column.cat.add_categories("Missing")
        train_data[col_name] = column.fillna("Missing")
        train_data['bins'] = train_data[col_name].replace(np.nan,'Missing').replace('','Missing')

        # observed=False keeps categories with zero rows in the table
        grouped = train_data.groupby('bins', observed=False)[dep_var]
        binned_df = grouped.count().reset_index()
        binned_df['events'] = grouped.sum().to_numpy()
        binned_df['non_events'] = binned_df[dep_var] - binned_df['events']
        # event rate in percent; empty bins have no mean and get 0
        binned_df['event_rate'] = (grouped.mean() * 100).fillna(0).to_numpy()
        binned_df['event_capture'] = np.round(binned_df['events'] / binned_df['events'].sum(), 8)
        binned_df['non_event_capture'] = np.round(binned_df['non_events'] / binned_df['non_events'].sum(), 8)

        # log(0) yields -inf and x/0 yields +inf or NaN; all three are mapped to a
        # neutral 0 (the original left -inf in the table).
        with np.errstate(divide='ignore', invalid='ignore'):
            raw_woe = np.log(binned_df['non_event_capture'] / binned_df['event_capture'])
        binned_df['woe'] = raw_woe.replace(non_finite, 0)
        binned_df['IV'] = (
            (binned_df['non_event_capture'] - binned_df['event_capture']) * binned_df['woe']
        ).replace(non_finite, 0)
        tables[col_name] = binned_df
    return tables

# Build the per-column WOE/IV tables from the training frame, using 'target' as
# the dependent variable. The call also modifies `train` in place (missing
# categories become "Missing" and a helper column 'bins' is added).
bivar_tables = {}
bivar_tables= get_bivar_tables(train, train_catcols, 'target')

In [None]:
def get_woe(train_data, val_data, bivar_tables,vars_used2):
    """Replace categorical columns by their weight-of-evidence (WOE) value.

    Parameters
    ----------
    train_data, val_data : pandas.DataFrame
        Frames to encode. Both are modified in place: each listed column is
        overwritten with float WOE values and a helper column ``'bins'`` is
        (over)written.
    bivar_tables : dict
        Maps a column name to a table with at least the columns ``'bins'`` and
        ``'woe'`` (as produced by ``get_bivar_tables``).
    vars_used2 : iterable of str
        Names of the columns to encode.

    Returns
    -------
    tuple
        ``(train_data, val_data)`` -- the same objects that were passed in.

    Notes
    -----
    Missing values are mapped to the bin "Missing". A category that has no row
    in the WOE table (e.g. one that only occurs in ``val_data``) receives a WOE
    of 0, i.e. no evidence either way, instead of NaN.
    """
    for col_name in vars_used2:
        print(col_name)
        table = bivar_tables[col_name]
        woe_by_bin = dict(zip(table['bins'], table['woe']))

        for frame in (train_data, val_data):
            column = frame[col_name]
            if isinstance(column.dtype, pd.CategoricalDtype):
                # add_categories raises if "Missing" is already a category
                if "Missing" not in column.cat.categories:
                    column = column.cat.add_categories("Missing")
                frame[col_name] = column.fillna("Missing")
            frame['bins'] = frame[col_name].replace(np.nan,'Missing').replace('','Missing')

            # Look up through object dtype so the result is a plain float column
            # rather than a re-labelled categorical.
            woe = frame['bins'].astype(object).map(woe_by_bin)
            frame[col_name] = woe.fillna(0.0).astype(float).to_numpy()

    return train_data, val_data




## Create final datasets

In [None]:
# WOE-encode copies of the frames so `train` and `test` keep their categorical
# columns; get_woe returns the two frames it was given.
train_df1, val_df1 = get_woe(train.copy(), test.copy(), bivar_tables, train_catcols)

In [None]:
# Final model features: the numeric columns that survived the correlation
# filter, followed by the WOE-encoded categorical columns.
# The numeric names are taken in the order of `train_numericcols`; the order of
# `list(some_set)` can differ between kernel sessions, which would change the
# column order and with it the result of seeded tree models.
final_cols = [col for col in train_numericcols if col in train_numericcols2] + train_catcols

# Split Train and Test Data

The test file provided by the competition has no labels; it is only used to create the submission. To estimate how well the model generalizes, we therefore hold out part of the training data, using sklearn's `train_test_split` with its default 75/25 split. This guards against overfitting, i.e. a model so specific to the rows it was trained on that it cannot generalize to another subset of the same dataset. The algorithm must not see the held-out subset during training, so that it cannot "cheat" by memorizing the answers. In later sections we also use sklearn's cross-validation functions, which repeatedly split the data into train and test parts when comparing models.

In [None]:
from sklearn import model_selection  

# Hold-out split with the default proportions; random_state fixes the shuffle
train_1, val_1 = model_selection.train_test_split(train_df1, random_state=0)

print(f"Train1 Shape: {train_1.shape}")
print(f"Test1 Shape: {val_1.shape}")

# Model
## Define Models

In [None]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# (name, estimator) pairs for the voting ensemble.
# The order matters: the tuning cell pairs this list with `grid_param` by position.
vote_est = [
    # Ensemble methods: http://scikit-learn.org/stable/modules/ensemble.html
    ('etc', ensemble.ExtraTreesClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),

    # Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
    ('bnb', naive_bayes.BernoulliNB()),
    ('gnb', naive_bayes.GaussianNB()),

    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    ('xgb', XGBClassifier()),
]

## Defining Hyperparameters

In [None]:
# Hyperparameter search spaces.
# GridSearchCV reference: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
grid_n_estimator = [10, 50, 100, 300]
grid_ratio = [.1, .25, .5, .75, 1.0]
grid_learn = [.01, .03, .05, .1, .25]
grid_max_depth = [2, 4, 6, 8, 10]
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]

# One search space per estimator, in the same order as `vote_est`
# (the tuning loop pairs the two lists by position).
grid_param = [
    # ExtraTreesClassifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    {
        'n_estimators': grid_n_estimator,
        'criterion': grid_criterion,
        'max_depth': grid_max_depth,
        'random_state': grid_seed,
    },
    # RandomForestClassifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    # oob_score is pinned to True to reduce runtime (author note, 12/31/17: best
    # parameters found were {'criterion': 'entropy', 'max_depth': 6,
    # 'n_estimators': 100, 'oob_score': True, 'random_state': 0}, runtime 146.35 s).
    {
        'n_estimators': grid_n_estimator,
        'criterion': grid_criterion,
        'max_depth': grid_max_depth,
        'oob_score': [True],
        'random_state': grid_seed,
    },
    # BernoulliNB
    # http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html
    {
        'alpha': grid_ratio,
    },
    # GaussianNB - nothing to tune
    {},
    # XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
    {
        'learning_rate': grid_learn,
        'max_depth': [1, 2, 4, 6, 8, 10],
        'n_estimators': grid_n_estimator,
        'seed': grid_seed,
    },
]




## Tune Params

For tuning we use a random sample of the training rows; if you have the resources, you can run the search on the entire training set instead.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of parameter settings sampled per estimator by the randomized search
param_comb = 25

# Tune on a subsample to keep the search affordable. The sample is seeded so a
# re-run tunes on the same rows, and capped at the available row count so a
# smaller training frame does not raise.
train_samp = train_1.sample(n=min(100000, len(train_1)), random_state=1001)

# `vote_est` and `grid_param` are paired by position
for clf, param in zip(vote_est, grid_param):
    estimator = clf[1]
    best_search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param,
        n_iter=param_comb,
        cv=5,
        scoring='roc_auc',
        verbose=3,
        n_jobs=-1,  # use the cores that are available instead of a hard-coded 125 workers
        random_state=1001,
    )
    best_search.fit(train_samp[final_cols], train_samp['target'])
    best_param = best_search.best_params_

    print('The best parameter for {} is {}'.format(estimator.__class__.__name__, best_param))
    # Write the tuned values into the estimator object held in `vote_est`
    estimator.set_params(**best_param)
    print('-'*10)

# Fit the tuned models
For now we only fit the models on the training set; in the next iteration we will compare statistics on the train and test splits to check for overfitting.

In [None]:
# Soft-voting ensemble over the tuned estimators, fitted on the complete
# WOE-encoded training frame.
X_full, y_full = train_df1[final_cols], train_df1['target']
vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft', n_jobs=-1)
vote_soft.fit(X_full, y_full)

In [None]:
# Class probabilities for the encoded test frame; column 1 is kept as the prediction
val_proba = vote_soft.predict_proba(val_df1[final_cols])
y_pred = val_proba[:, 1]

# Submission



In [None]:
# Submission file: one row per test customer with its predicted probability
submission_columns = {'customer_ID': test.index, 'prediction': y_pred}
sub = pd.DataFrame(submission_columns)
sub.to_csv('submission.csv', index=False)
display(sub)