In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import pickle
import gc
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold

In [None]:
def load_pickle_obj(filename):
    """Read one pickled Python object from ``filename`` and return it.

    The file is opened in binary mode and closed again before returning.
    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

In [None]:
%%time
# Load the pickled training feature DataFrame from the ../input dataset directory.
# NOTE: pd.read_pickle unpickles the file, so only point it at trusted data.
train_df = pd.read_pickle("../input/amex-train-aggregation-dataset/train_feat_df.pkl")
# Last expression of the cell: display the first rows as a quick sanity check.
train_df.head()

In [None]:
# Model features = every column of train_df except the customer identifier
# and the two target columns listed below.
feat_cols = [
    col
    for col in train_df.columns
    if col not in ('customer_ID', 'target', 'last3_target')
]
print(len(feat_cols))

In [None]:
def get_numeric_feature_threshholds(num_featcols, df=None):
    """Derive generous clipping bounds for each requested numeric column.

    For every column, missing values and exact zeros are dropped, the 1st and
    99th percentiles of the remaining values are taken, and each percentile is
    pushed outwards by twice its own magnitude::

        vmin = q01 - 2 * |q01|
        vmax = q99 + 2 * |q99|

    Parameters
    ----------
    num_featcols : iterable of str
        Names of the numeric columns to process. (Previously this argument
        was ignored and the global ``feat_cols`` was used instead.)
    df : pandas.DataFrame, optional
        Frame that holds the columns. When omitted, the notebook-level
        ``train_df`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    dict
        ``{colname: {'vmin': ..., 'vmax': ...}}``. A column with no usable
        values (all NaN and/or zero) gets ``(-inf, +inf)`` so that clipping
        with these bounds is a no-op instead of failing on an empty quantile.
    """
    if df is None:
        # Fall back to the notebook-level frame for backward compatibility.
        df = train_df

    feat_threshholds = {}
    for colname in num_featcols:
        s = df[colname]
        # Ignore missing values and exact zeros when estimating the spread.
        s = s[s.notna() & (s != 0)]

        if s.empty:
            feat_threshholds[colname] = {'vmin': -np.inf, 'vmax': np.inf}
            continue

        v_01, v_99 = np.quantile(s, [0.01, 0.99])
        feat_threshholds[colname] = {
            'vmin': v_01 - 2 * np.abs(v_01),
            'vmax': v_99 + 2 * np.abs(v_99),
        }

    return feat_threshholds

In [None]:
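# Optional outlier clipping -- currently disabled.
# The loop below would clip each feature to the bounds returned by
# get_numeric_feature_threshholds. Note that the np.clip line is commented out
# a second time and the loop ends with `break`, so re-enabling the clipping
# requires un-commenting that line and removing the `break` as well.
#feat_threshholds = get_numeric_feature_threshholds(feat_cols)
#for colname in feat_cols:
#    vmin = feat_threshholds[colname]['vmin']
#    vmax = feat_threshholds[colname]['vmax']
    #train_df[colname] = np.clip(train_df[colname], vmin, vmax) 
#    break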

In [None]:
# Booster configuration passed to xgb.train for every fold.
xgb_params = {
    # --- tree growth ---
    'eta': 0.05,                # learning rate (shrinkage applied per boosting step)
    'max_depth': 4,             # maximum depth of each tree
    # --- stochastic regularisation ---
    'subsample': 0.6,           # row sampling ratio
    'colsample_bytree': 0.6,    # column sampling ratio per tree
    # --- algorithm / objective ---
    'tree_method': 'hist',      # histogram-based tree construction
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',   # metric reported on the eval sets during training
    # --- reproducibility ---
    'seed': 44,
}

In [None]:
def top_4percent(pred_df):
    """Share of all positives captured in the top 4% (by weight) of predictions.

    ``pred_df`` needs a ``target`` column and a ``pred`` column. Rows are
    ranked by ``pred`` from highest to lowest; rows with ``target == 0`` carry
    weight 20 and all other rows weight 1. The rows whose running weight stays
    within 4% of the total weight form the "top" group, and the function
    returns sum(target in that group) / sum(target overall).

    The input frame is not modified.
    """
    ranked = pred_df.sort_values('pred', ascending=False)
    weights = pd.Series(np.where(ranked['target'] == 0, 20, 1), index=ranked.index)

    weight_budget = 0.04 * weights.sum()
    in_top_group = weights.cumsum() <= weight_budget

    captured = ranked['target'][in_top_group].sum()
    return captured / ranked['target'].sum()

def weighted_gini(pred_df):
    """Un-normalised weighted Gini of ``pred`` against ``target``.

    ``pred_df`` needs a ``target`` column and a ``pred`` column. Rows are
    ranked by ``pred`` from highest to lowest; rows with ``target == 0`` carry
    weight 20 and all other rows weight 1. The result is the weighted sum of
    the gap between the cumulative share of weighted positives found and the
    cumulative share of weight seen.

    The input frame is not modified.
    """
    ranked = pred_df.sort_values('pred', ascending=False)
    target = ranked['target']
    weight = pd.Series(np.where(target == 0, 20, 1), index=ranked.index)

    # Cumulative share of total weight seen so far.
    random_curve = (weight / weight.sum()).cumsum()

    # Cumulative share of weighted positives recovered so far.
    weighted_pos = target * weight
    total_pos = weighted_pos.sum()
    lorentz_curve = weighted_pos.cumsum() / total_pos

    return ((lorentz_curve - random_curve) * weight).sum()


def normalized_gini(df):
    """Weighted Gini of the predictions divided by that of a perfect ranking.

    ``df`` needs a ``target`` column and a ``pred`` column. The reference
    ("perfect") frame reuses ``target`` as its own prediction, so the ratio
    compares the model's ranking with ranking by the true labels.
    """
    perfect_df = df[['target']].assign(pred=df['target'])

    model_gini = weighted_gini(df)
    perfect_gini = weighted_gini(perfect_df)
    return model_gini / perfect_gini

In [None]:
# 5-fold stratified cross-validation: train one XGBoost booster per fold,
# save the model and its validation predictions, and report the evaluation
# metric (mean of normalized Gini and top-4% capture rate) on the held-out fold.
skf = StratifiedKFold(n_splits=5, random_state=33, shuffle=True)
fold_scores = []  # (G + D) / 2 per fold, summarised after the loop
for foldnum, (train_index, test_index) in enumerate(skf.split(train_df.customer_ID, train_df.target)):
    print("Training Fold:", foldnum)
    fold_train_df = train_df.iloc[train_index]
    fold_val_df = train_df.iloc[test_index]

    dtrain = xgb.DMatrix(fold_train_df[feat_cols], label=fold_train_df.target)
    deval = xgb.DMatrix(fold_val_df[feat_cols], label=fold_val_df.target)

    # Up to 1000 rounds; stop once the last eval set ('eval') has not
    # improved for 20 consecutive rounds.
    bst_model = xgb.train(xgb_params, dtrain,
                          1000,
                          early_stopping_rounds=20,
                          evals=[(dtrain, 'train'), (deval, 'eval')],
                          verbose_eval=50
                          )

    bst_model.save_model("bst_model_{}".format(foldnum))

    # xgb.train returns the booster from the LAST round, not the best one, so
    # restrict prediction to the trees up to the best iteration explicitly.
    # Fall back to a plain predict if the attribute is unavailable.
    best_iteration = getattr(bst_model, 'best_iteration', None)
    if best_iteration is None:
        preds = bst_model.predict(deval)
    else:
        preds = bst_model.predict(deval, iteration_range=(0, best_iteration + 1))

    fold_val_pred = fold_val_df[['target']].copy()
    fold_val_pred['pred'] = preds
    fold_val_pred[['target', 'pred']].to_csv("preds_fold_{}.csv".format(foldnum))

    print()
    print()
    print("evaluation metrics")
    G = normalized_gini(fold_val_pred[['target', 'pred']])
    D = top_4percent(fold_val_pred[['target', 'pred']])
    fold_scores.append((G + D) / 2)

    print("Gini:{:.4f}".format(G))
    print("Default Rate:{:.4f}".format(D))
    print("Evaluation Metric:{:.4f}".format((G + D) / 2))

    print()
    print()

    xgb.plot_importance(bst_model, max_num_features=10)
    plt.show()
    print()
    print()

    # Release per-fold objects before the next fold is built.
    del dtrain
    del deval
    del fold_train_df
    del fold_val_df
    del fold_val_pred
    del bst_model
    gc.collect()

# Cross-validation summary over all folds.
if fold_scores:
    print("CV Evaluation Metric: mean={:.4f} std={:.4f}".format(np.mean(fold_scores), np.std(fold_scores)))

In [None]:
# Remove the training frame from the namespace and run the garbage collector
# so its memory can be reclaimed. Any later cell that needs train_df must
# reload it first.
del train_df
gc.collect()