In [1]:
import pandas as pd
import numpy as np
import random
import copy

from lightgbm import LGBMClassifier
import lightgbm as lgbm
from tqdm import tqdm_notebook as tqdm

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from utils.utils import *

pd.options.display.max_columns = None
%matplotlib inline

In [2]:
all_data = pd.read_pickle('edit/fixed_data/all_data.pkl')
buro = pd.read_pickle('edit/fixed_data/buro.pkl')
cc_bal = pd.read_pickle('edit/fixed_data/cc_bal.pkl')
inst = pd.read_pickle('edit/fixed_data/inst.pkl')
pos = pd.read_pickle('edit/fixed_data/pos.pkl')
prev = pd.read_pickle('edit/fixed_data/prev.pkl')

In [3]:
def run(model=None, submit=None):
    merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
    FoldSubmit(merged, index_cols, model, submit)

In [4]:
# base model
clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            random_state=seed,
            silent=-1,
            verbose=-1, )

In [5]:
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
clf_set =  FoldSubmit(merged, index_cols, clf, return_clf=True)
feat_cols = merged.drop(index_cols,axis=1).columns

100%|██████████| 6/6 [00:23<00:00,  4.45s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.7572	valid_1's auc: 0.750343
[200]	training's auc: 0.781176	valid_1's auc: 0.767766
[300]	training's auc: 0.799615	valid_1's auc: 0.779381
[400]	training's auc: 0.812122	valid_1's auc: 0.784964
[500]	training's auc: 0.822272	valid_1's auc: 0.787902
[600]	training's auc: 0.830675	valid_1's auc: 0.789539
[700]	training's auc: 0.838401	valid_1's auc: 0.790743
[800]	training's auc: 0.845329	valid_1's auc: 0.791582
[900]	training's auc: 0.851975	valid_1's auc: 0.792091
[1000]	training's auc: 0.857867	valid_1's auc: 0.792588
[1100]	training's auc: 0.863606	valid_1's auc: 0.792994
[1200]	training's auc: 0.869159	valid_1's auc: 0.792978
Early stopping, best iteration is:
[1098]	training's auc: 0.863471	valid_1's auc: 0.793004
Fold  1 AUC : 0.793004
Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.756751	valid_1's auc: 0.739912
[200]	training's auc: 0.781332	valid_1's auc

In [6]:
all_data = pd.read_pickle('edit/tmp_data/all_dataX2.pkl')
run(model = clf, submit='180720-5.csv')

100%|██████████| 6/6 [00:23<00:00,  4.67s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.754741	valid_1's auc: 0.748164
[200]	training's auc: 0.782478	valid_1's auc: 0.768629
[300]	training's auc: 0.801063	valid_1's auc: 0.780631
[400]	training's auc: 0.813384	valid_1's auc: 0.785729
[500]	training's auc: 0.823198	valid_1's auc: 0.788746
[600]	training's auc: 0.8313	valid_1's auc: 0.790238
[700]	training's auc: 0.838955	valid_1's auc: 0.791386
[800]	training's auc: 0.846012	valid_1's auc: 0.79213
[900]	training's auc: 0.852679	valid_1's auc: 0.792512
[1000]	training's auc: 0.858569	valid_1's auc: 0.792874
[1100]	training's auc: 0.864228	valid_1's auc: 0.793243
[1200]	training's auc: 0.869666	valid_1's auc: 0.793308
[1300]	training's auc: 0.874806	valid_1's auc: 0.793343
[1400]	training's auc: 0.87941	valid_1's auc: 0.793346
[1500]	training's auc: 0.884148	valid_1's auc: 0.793396
[1600]	training's auc: 0.888924	valid_1's auc: 0.793504
[1700]	training's auc: 0.893234	valid_1's auc: 0.79338

In [66]:
def FoldSubmit_base(all_data, index_cols, model = None, submit=None, return_clf = False):
    np.random.seed(seed)
    folds = KFold(n_splits=4, shuffle=True, random_state=seed)
    data = all_data[all_data.TEST==0]
    y = data.TARGET
    data = data.drop(index_cols,axis=1)
    test = all_data[all_data.TEST==1].drop(index_cols,axis=1)
    total_score=[]
    clf_set=[]

    sub_preds = np.zeros(test.shape[0])

    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):
        trn_x, trn_y = data.iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = data.iloc[val_idx], y.iloc[val_idx]

        if model == None:
            clf = LGBMClassifier(
                n_estimators=300,
                learning_rate=0.03,
                num_leaves=30,
                colsample_bytree=.8,
                subsample=.9,
                max_depth= 7,
                reg_alpha=.1,
                reg_lambda=.1,
                min_split_gain=.01,
                min_child_weight=2,
                random_state=seed,
                silent=True,
                verbose=-1,
            )
        else:
            clf = model
        
        clf.fit(trn_x, trn_y,
                eval_set= [(trn_x, trn_y), (val_x, val_y)],
                eval_class_weight = [{0: np.sum(trn_y) , 1: len(trn_y)-np.sum(trn_y)}, {0:np.sum(val_y), 1:len(val_y)-np.sum(val_y)}], 
                eval_metric='auc', 
                verbose=100, 
                early_stopping_rounds=150
               )


        oof_preds = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        auc_score = roc_auc_score(val_y, oof_preds)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_score))

        total_score.append(auc_score)
        if return_clf: clf_set.append(copy.deepcopy(clf))

    print('Mean AUC : %.6f' % (np.mean(total_score)))
    print('Std AUC : %.6f' % (np.std(total_score)))

    if submit!=None:
        submit_data = pd.DataFrame(np.c_[all_data[all_data.TEST==1]['SK_ID_CURR'].values.astype('int32'),sub_preds],
             columns=['SK_ID_CURR','TARGET'])
        submit_data['SK_ID_CURR'] = submit_data['SK_ID_CURR'].astype('int')
        submit_data['TARGET'] = submit_data['TARGET'].astype('float32')
        submit_data.to_csv('csv/'+submit, index=None, float_format='%.8f')

    if return_clf: return clf_set

    return

In [35]:
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')

100%|██████████| 6/6 [00:33<00:00,  6.68s/it]


In [None]:
clf_set = FoldSubmit_base(merged, index_cols, clf, return_clf=True)

Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.754741	valid_1's auc: 0.744837
[200]	training's auc: 0.782478	valid_1's auc: 0.765954
[300]	training's auc: 0.801063	valid_1's auc: 0.778692
[400]	training's auc: 0.813384	valid_1's auc: 0.784409
[500]	training's auc: 0.823198	valid_1's auc: 0.787804
[600]	training's auc: 0.8313	valid_1's auc: 0.789468
[700]	training's auc: 0.838955	valid_1's auc: 0.790598
[800]	training's auc: 0.846012	valid_1's auc: 0.791458
[900]	training's auc: 0.852679	valid_1's auc: 0.792076
[1000]	training's auc: 0.858569	valid_1's auc: 0.792447
[1100]	training's auc: 0.864228	valid_1's auc: 0.792707
[1200]	training's auc: 0.869666	valid_1's auc: 0.79294
[1300]	training's auc: 0.874806	valid_1's auc: 0.793136
[1400]	training's auc: 0.87941	valid_1's auc: 0.793119
[1500]	training's auc: 0.884148	valid_1's auc: 0.793277
[1600]	training's auc: 0.888924	valid_1's auc: 0.793452
[1700]	training's auc: 0.893234	valid_1's auc: 0.79331

In [None]:
feat_cols = merged.drop(index_cols,axis=1).columns

In [None]:
all_data = pd.read_pickle('edit/tmp_data/all_dataX2.pkl')
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
clf_set2 = FoldSubmit_base(merged, index_cols, clf, return_clf=True)

In [43]:
len(feat_cols)

620

In [60]:
for i in range(4):
    if i == 0:
        data = pd.DataFrame([list(feat_cols),clf_set[i].feature_importances_.tolist()], index=['feature','score_'+str(i)]).T
    else:
        data =data.merge(pd.DataFrame([list(feat_cols),clf_set[i].feature_importances_.tolist()], index=['feature','score_'+str(i)]).T, on='feature')

In [61]:
data

Unnamed: 0,feature,score_0,score_1,score_2,score_3
0,AMT_ANNUITY,595,595,595,595
1,AMT_CREDIT,326,326,326,326
2,AMT_GOODS_PRICE,334,334,334,334
3,AMT_INCOME_TOTAL,185,185,185,185
4,AMT_REQ_CREDIT_BUREAU_DAY,1,1,1,1
5,AMT_REQ_CREDIT_BUREAU_HOUR,0,0,0,0
6,AMT_REQ_CREDIT_BUREAU_MON,22,22,22,22
7,AMT_REQ_CREDIT_BUREAU_QRT,62,62,62,62
8,AMT_REQ_CREDIT_BUREAU_WEEK,12,12,12,12
9,AMT_REQ_CREDIT_BUREAU_YEAR,109,109,109,109


In [None]:
# イテレーション回す用のやつ
# 特徴量選択・うまくいったら残す
# columns, picked(チェックしてよかった), used(現在の全部), drop(不要)
# でそれぞれto_pickle

In [14]:
clf_set

[LGBMClassifier(boosting_type='gbdt', class_weight=None,
         colsample_bytree=0.9497036, learning_rate=0.02, max_depth=8,
         min_child_samples=20, min_child_weight=40,
         min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1, nthread=4,
         num_leaves=32, objective=None, random_state=1001, reg_alpha=0.04,
         reg_lambda=0.073, silent=-1, subsample=0.8715623,
         subsample_for_bin=200000, subsample_freq=1, verbose=-1),
 LGBMClassifier(boosting_type='gbdt', class_weight=None,
         colsample_bytree=0.9497036, learning_rate=0.02, max_depth=8,
         min_child_samples=20, min_child_weight=40,
         min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1, nthread=4,
         num_leaves=32, objective=None, random_state=1001, reg_alpha=0.04,
         reg_lambda=0.073, silent=-1, subsample=0.8715623,
         subsample_for_bin=200000, subsample_freq=1, verbose=-1),
 LGBMClassifier(boosting_type='gbdt', class_weight=None,
         colsample_bytree=0.949

In [16]:
all_data = pd.read_pickle('edit/tmp_data/all_dataX.pkl')
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
SK_ID = pd.read_pickle('./edit/tmp_data/choice_SK_ID.pkl')
merged = SK_ID.to_frame().merge(merged, how='left', on='SK_ID_CURR')
FoldSubmit(merged, index_cols, clf, submit='180720-2.csv')

100%|██████████| 6/6 [00:25<00:00,  4.83s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.755755	valid_1's auc: 0.750502
[200]	training's auc: 0.783795	valid_1's auc: 0.771823
[300]	training's auc: 0.802442	valid_1's auc: 0.784341
[400]	training's auc: 0.81501	valid_1's auc: 0.790659
[500]	training's auc: 0.824775	valid_1's auc: 0.79401
[600]	training's auc: 0.833719	valid_1's auc: 0.795995
[700]	training's auc: 0.842022	valid_1's auc: 0.797109
[800]	training's auc: 0.849101	valid_1's auc: 0.797917
[900]	training's auc: 0.855878	valid_1's auc: 0.798719
[1000]	training's auc: 0.862289	valid_1's auc: 0.799189
[1100]	training's auc: 0.868479	valid_1's auc: 0.799598
[1200]	training's auc: 0.874247	valid_1's auc: 0.799893
[1300]	training's auc: 0.879881	valid_1's auc: 0.80001
[1400]	training's auc: 0.884689	valid_1's auc: 0.80004
[1500]	training's auc: 0.889665	valid_1's auc: 0.800286
[1600]	training's auc: 0.894166	valid_1's auc: 0.80027
[1700]	training's auc: 0.898721	valid_1's auc: 0.800199

In [18]:
merged = pd.read_pickle('edit/tmp_data/merged.pkl')
FoldSubmit(merged, index_cols, clf, submit='180720-3.csv')

Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.756731	valid_1's auc: 0.750528
[200]	training's auc: 0.779158	valid_1's auc: 0.766346
[300]	training's auc: 0.796917	valid_1's auc: 0.776907
[400]	training's auc: 0.809141	valid_1's auc: 0.78233
[500]	training's auc: 0.818611	valid_1's auc: 0.785081
[600]	training's auc: 0.826685	valid_1's auc: 0.786618
[700]	training's auc: 0.834283	valid_1's auc: 0.787514
[800]	training's auc: 0.841002	valid_1's auc: 0.788027
[900]	training's auc: 0.847294	valid_1's auc: 0.788515
[1000]	training's auc: 0.85328	valid_1's auc: 0.788644
[1100]	training's auc: 0.858882	valid_1's auc: 0.788974
[1200]	training's auc: 0.864376	valid_1's auc: 0.789133
[1300]	training's auc: 0.869455	valid_1's auc: 0.789251
[1400]	training's auc: 0.874202	valid_1's auc: 0.789233
[1500]	training's auc: 0.879163	valid_1's auc: 0.789269
[1600]	training's auc: 0.883742	valid_1's auc: 0.789167
Early stopping, best iteration is:
[1455]	training's

In [24]:
def FoldSubmit_base(all_data, index_cols, model = None, submit=None, return_clf = False):
    np.random.seed(seed)
    folds = KFold(n_splits=4, shuffle=True, random_state=seed)
    data = all_data[all_data.TEST==0]
    y = data.TARGET
    data = data.drop(index_cols,axis=1)
    test = all_data[all_data.TEST==1].drop(index_cols,axis=1)
    total_score=0
    clf_set=[]

    sub_preds = np.zeros(test.shape[0])

    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):
        trn_x, trn_y = data.iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = data.iloc[val_idx], y.iloc[val_idx]

        if model == None:
            clf = LGBMClassifier(
                n_estimators=300,
                learning_rate=0.03,
                num_leaves=30,
                colsample_bytree=.8,
                subsample=.9,
                max_depth= 7,
                reg_alpha=.1,
                reg_lambda=.1,
                min_split_gain=.01,
                min_child_weight=2,
                random_state=seed,
                silent=True,
                verbose=-1,
            )
        else:
            clf = model

        clf.fit(trn_x, trn_y,
                eval_set= [(trn_x, trn_y), (val_x, val_y)],
                eval_class_weight = [np.sum(val_y), len(val_y) - np.sum(val_y)] # inverse rate, 0 : 1
                eval_metric='auc', 
                verbose=100, 
                early_stopping_rounds=150
               )


        oof_preds = clf.predict_proba(val_x.iloc[arrange_inds], num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        auc_score = roc_auc_score(val_y.iloc[arrange_inds], oof_preds)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_score))

        total_score += auc_score
        if return_clf: clf_set.append(clf)

    print('Total AUC : %.6f' % (total_score/folds.n_splits))

    if submit!=None:
        submit_data = pd.DataFrame(np.c_[all_data[all_data.TEST==1]['SK_ID_CURR'].values.astype('int32'),sub_preds],
             columns=['SK_ID_CURR','TARGET'])
        submit_data['SK_ID_CURR'] = submit_data['SK_ID_CURR'].astype('int')
        submit_data['TARGET'] = submit_data['TARGET'].astype('float32')
        submit_data.to_csv('csv/'+submit, index=None, float_format='%.8f')

    if return_clf: return clf_set

    return

In [22]:
# fold - 3

FoldSubmit(merged, index_cols, clf, submit='180720-3.csv')

Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.757115	valid_1's auc: 0.745918
[200]	training's auc: 0.780941	valid_1's auc: 0.762897
[300]	training's auc: 0.799216	valid_1's auc: 0.773952
[400]	training's auc: 0.811926	valid_1's auc: 0.779598
[500]	training's auc: 0.822103	valid_1's auc: 0.782612
[600]	training's auc: 0.831156	valid_1's auc: 0.784426


KeyboardInterrupt: 

In [6]:
run(model = clf, submit='180705.csv')

100%|██████████| 6/6 [00:28<00:00,  5.53s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.7572	valid_1's auc: 0.750343
[200]	training's auc: 0.781176	valid_1's auc: 0.767766
[300]	training's auc: 0.799615	valid_1's auc: 0.779381
[400]	training's auc: 0.812122	valid_1's auc: 0.784964
[500]	training's auc: 0.822272	valid_1's auc: 0.787902
[600]	training's auc: 0.830675	valid_1's auc: 0.789539
[700]	training's auc: 0.838401	valid_1's auc: 0.790743
[800]	training's auc: 0.845329	valid_1's auc: 0.791582
[900]	training's auc: 0.851975	valid_1's auc: 0.792091
[1000]	training's auc: 0.857867	valid_1's auc: 0.792588
[1100]	training's auc: 0.863606	valid_1's auc: 0.792994
[1200]	training's auc: 0.869159	valid_1's auc: 0.792978
Early stopping, best iteration is:
[1098]	training's auc: 0.863471	valid_1's auc: 0.793004
Fold  1 AUC : 0.793004
Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.756751	valid_1's auc: 0.739912
[200]	training's auc: 0.781332	valid_1's auc

In [5]:
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
FoldSubmitwithTargetEnc(merged, index_cols, clf)

100%|██████████| 6/6 [00:26<00:00,  5.05s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  trn_x['TARGET'] = trn_y
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  trn_x[col+'_mean_enc'] = trn_x[col].map(means)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  val_x[col+'_mean_enc'] = val_x[col].map(means)


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.758688	valid_1's auc: 0.750668
[200]	training's auc: 0.782002	valid_1's auc: 0.767065
[300]	training's auc: 0.800605	valid_1's auc: 0.779063
[400]	training's auc: 0.813242	valid_1's auc: 0.785076
[500]	training's auc: 0.823162	valid_1's auc: 0.788364
[600]	training's auc: 0.831556	valid_1's auc: 0.790137
[700]	training's auc: 0.839102	valid_1's auc: 0.791356
[800]	training's auc: 0.845928	valid_1's auc: 0.79222
[900]	training's auc: 0.85244	valid_1's auc: 0.792703
[1000]	training's auc: 0.858385	valid_1's auc: 0.792997
[1100]	training's auc: 0.864115	valid_1's auc: 0.793345
[1200]	training's auc: 0.869448	valid_1's auc: 0.793569
[1300]	training's auc: 0.874656	valid_1's auc: 0.793545
Early stopping, best iteration is:
[1180]	training's auc: 0.868347	valid_1's auc: 0.793656
Fold  1 AUC : 0.793656


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  trn_x['TARGET'] = trn_y
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  trn_x[col+'_mean_enc'] = trn_x[col].map(means)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  val_x[col+'_mean_enc'] = val_x[col].map(means)


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.757776	valid_1's auc: 0.739696
[200]	training's auc: 0.781637	valid_1's auc: 0.758051
[300]	training's auc: 0.800743	valid_1's auc: 0.770341
[400]	training's auc: 0.813547	valid_1's auc: 0.775978
[500]	training's auc: 0.823779	valid_1's auc: 0.779161
[600]	training's auc: 0.832156	valid_1's auc: 0.781275
[700]	training's auc: 0.840203	valid_1's auc: 0.782643
[800]	training's auc: 0.847398	valid_1's auc: 0.783651
[900]	training's auc: 0.853884	valid_1's auc: 0.784303
[1000]	training's auc: 0.860041	valid_1's auc: 0.784677
[1100]	training's auc: 0.865518	valid_1's auc: 0.785154
[1200]	training's auc: 0.870647	valid_1's auc: 0.785276
[1300]	training's auc: 0.876046	valid_1's auc: 0.785723
[1400]	training's auc: 0.880787	valid_1's auc: 0.785838
[1500]	training's auc: 0.885388	valid_1's auc: 0.785878
[1600]	training's auc: 0.889985	valid_1's auc: 0.785816
Early stopping, best iteration is:
[1547]	training

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  trn_x['TARGET'] = trn_y
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  trn_x[col+'_mean_enc'] = trn_x[col].map(means)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  val_x[col+'_mean_enc'] = val_x[col].map(means)


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.758859	valid_1's auc: 0.742727
[200]	training's auc: 0.782918	valid_1's auc: 0.758741
[300]	training's auc: 0.801961	valid_1's auc: 0.770624
[400]	training's auc: 0.814469	valid_1's auc: 0.776238
[500]	training's auc: 0.82426	valid_1's auc: 0.779548
[600]	training's auc: 0.832962	valid_1's auc: 0.781646
[700]	training's auc: 0.840702	valid_1's auc: 0.783215
[800]	training's auc: 0.847782	valid_1's auc: 0.784346
[900]	training's auc: 0.854148	valid_1's auc: 0.784876
[1000]	training's auc: 0.860236	valid_1's auc: 0.785454
[1100]	training's auc: 0.865798	valid_1's auc: 0.785914
[1200]	training's auc: 0.87113	valid_1's auc: 0.786275
[1300]	training's auc: 0.876203	valid_1's auc: 0.786364
[1400]	training's auc: 0.880983	valid_1's auc: 0.786541
[1500]	training's auc: 0.885557	valid_1's auc: 0.786699
[1600]	training's auc: 0.889943	valid_1's auc: 0.787076
[1700]	training's auc: 0.894119	valid_1's auc: 0.787

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  trn_x['TARGET'] = trn_y
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  trn_x[col+'_mean_enc'] = trn_x[col].map(means)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  val_x[col+'_mean_enc'] = val_x[col].map(means)


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.759395	valid_1's auc: 0.747473
[200]	training's auc: 0.782523	valid_1's auc: 0.763307
[300]	training's auc: 0.80132	valid_1's auc: 0.775269
[400]	training's auc: 0.813966	valid_1's auc: 0.780912
[500]	training's auc: 0.823718	valid_1's auc: 0.783741
[600]	training's auc: 0.832717	valid_1's auc: 0.785916
[700]	training's auc: 0.840482	valid_1's auc: 0.787339
[800]	training's auc: 0.847286	valid_1's auc: 0.788262
[900]	training's auc: 0.853666	valid_1's auc: 0.788896
[1000]	training's auc: 0.860074	valid_1's auc: 0.788995
[1100]	training's auc: 0.865617	valid_1's auc: 0.789389
[1200]	training's auc: 0.870743	valid_1's auc: 0.789574
[1300]	training's auc: 0.875602	valid_1's auc: 0.789592
[1400]	training's auc: 0.880501	valid_1's auc: 0.789589
Early stopping, best iteration is:
[1257]	training's auc: 0.873776	valid_1's auc: 0.789701
Fold  4 AUC : 0.789701
Total AUC : 0.789279


In [21]:
#merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
merged.columns[400:500]

Index(['POS_NAME_CONTRACT_STATUS_nan_MEAN', 'POS_COUNT',
       'PREV_AMT_ANNUITY_MAX', 'PREV_AMT_ANNUITY_MEAN',
       'PREV_AMT_APPLICATION_MEAN', 'PREV_AMT_CREDIT_MAX',
       'PREV_AMT_CREDIT_MEAN', 'PREV_APP_CREDIT_PERC_MAX',
       'PREV_APP_CREDIT_PERC_MEAN', 'PREV_AMT_DOWN_PAYMENT_MAX',
       'PREV_AMT_DOWN_PAYMENT_MEAN', 'PREV_AMT_GOODS_PRICE_MAX',
       'PREV_AMT_GOODS_PRICE_MEAN', 'PREV_HOUR_APPR_PROCESS_START_MAX',
       'PREV_HOUR_APPR_PROCESS_START_MEAN', 'PREV_RATE_DOWN_PAYMENT_MIN',
       'PREV_RATE_DOWN_PAYMENT_MAX', 'PREV_RATE_DOWN_PAYMENT_MEAN',
       'PREV_DAYS_DECISION_MIN', 'PREV_DAYS_DECISION_MAX',
       'PREV_DAYS_DECISION_MEAN', 'PREV_CNT_PAYMENT_MEAN',
       'PREV_CNT_PAYMENT_SUM', 'PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN',
       'PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN',
       'PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN',
       'PREV_NAME_CONTRACT_TYPE_XNA_MEAN', 'PREV_NAME_CONTRACT_TYPE_nan_MEAN',
       'PREV_WEEKDAY_APPR_PROCESS_START_FRIDA

In [18]:
pd.set_option('display.max_columns', 1000)

In [40]:
np.where(np.array([1,2,1,3]) > 1)[0]

array([1, 3])

In [26]:
# 0.789586
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
merged['AMT_ANNUITY_RATE_MEAN'] = merged['AMT_ANNUITY'] /merged['PREV_AMT_ANNUITY_MEAN'] 
#merged['AMT_CREDIT_rate'] = merged['AMT_CREDIT'] /merged['PREV_AMT_CREDIT'] 
#merged['AMT_GOODS_PRICE_rate'] = merged['AMT_GOODS_PRICE'] /merged['PREV_AMT_GOODS_PRICE'] 

FoldSubmit(merged, index_cols, model = clf,submit='180705-2.csv')

100%|██████████| 6/6 [00:30<00:00,  6.00s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.757491	valid_1's auc: 0.757965
[200]	training's auc: 0.781399	valid_1's auc: 0.773663
[300]	training's auc: 0.799791	valid_1's auc: 0.784517
[400]	training's auc: 0.812248	valid_1's auc: 0.78971
[500]	training's auc: 0.822112	valid_1's auc: 0.792777
[600]	training's auc: 0.830738	valid_1's auc: 0.794307
[700]	training's auc: 0.838495	valid_1's auc: 0.795656
[800]	training's auc: 0.84547	valid_1's auc: 0.796459
[900]	training's auc: 0.851893	valid_1's auc: 0.797162
[1000]	training's auc: 0.857895	valid_1's auc: 0.797628
[1100]	training's auc: 0.863702	valid_1's auc: 0.797964
[1200]	training's auc: 0.86912	valid_1's auc: 0.798254
[1300]	training's auc: 0.874235	valid_1's auc: 0.798509
[1400]	training's auc: 0.879228	valid_1's auc: 0.798604
[1500]	training's auc: 0.883778	valid_1's auc: 0.79856
Early stopping, best iteration is:
[1389]	training's auc: 0.878664	valid_1's auc: 0.798623
Fold  1 AUC : 0.798

In [30]:
# 0.789586
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
merged['AMT_ANNUITY_RATE_MAX'] = merged['AMT_ANNUITY'] /merged['PREV_AMT_ANNUITY_MAX'] 
merged['AMT_ANNUITY_RATE_MEAN'] = merged['AMT_ANNUITY'] /merged['PREV_AMT_ANNUITY_MEAN'] 
merged['AMT_CREDIT_RATE_MEAN'] = merged['AMT_CREDIT'] /merged['PREV_AMT_CREDIT_MEAN'] 
merged['AMT_CREDIT_RATE_MAX'] = merged['AMT_CREDIT'] /merged['PREV_AMT_CREDIT_MAX'] 
merged['AMT_GOODS_RATE_MEAN'] = merged['AMT_GOODS_PRICE'] /merged['PREV_AMT_GOODS_PRICE_MEAN'] 
merged['AMT_GOODS_RATE_MAX'] = merged['AMT_GOODS_PRICE'] /merged['PREV_AMT_GOODS_PRICE_MAX'] 

FoldSubmit(merged, index_cols, model = clf, submit='180705-3.csv')

100%|██████████| 6/6 [00:27<00:00,  5.43s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.757152	valid_1's auc: 0.757207
[200]	training's auc: 0.781416	valid_1's auc: 0.773754
[300]	training's auc: 0.7999	valid_1's auc: 0.78441
[400]	training's auc: 0.812248	valid_1's auc: 0.789698
[500]	training's auc: 0.822216	valid_1's auc: 0.792817
[600]	training's auc: 0.830718	valid_1's auc: 0.794903
[700]	training's auc: 0.838486	valid_1's auc: 0.796482
[800]	training's auc: 0.84541	valid_1's auc: 0.797277
[900]	training's auc: 0.851854	valid_1's auc: 0.797949
[1000]	training's auc: 0.858145	valid_1's auc: 0.798422
[1100]	training's auc: 0.86413	valid_1's auc: 0.798786
[1200]	training's auc: 0.869358	valid_1's auc: 0.798897
[1300]	training's auc: 0.874427	valid_1's auc: 0.799014
[1400]	training's auc: 0.879536	valid_1's auc: 0.799122
[1500]	training's auc: 0.884173	valid_1's auc: 0.799102
[1600]	training's auc: 0.888599	valid_1's auc: 0.799301
[1700]	training's auc: 0.892788	valid_1's auc: 0.799584

In [31]:
prev = pd.read_pickle('edit/tmp_data/prev2.pkl')
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
FoldSubmit(merged, index_cols, model = clf, submit='180705-4.csv')

100%|██████████| 6/6 [00:29<00:00,  4.31s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.757197	valid_1's auc: 0.757438
[200]	training's auc: 0.780769	valid_1's auc: 0.772804
[300]	training's auc: 0.798928	valid_1's auc: 0.783692
[400]	training's auc: 0.811131	valid_1's auc: 0.788723
[500]	training's auc: 0.821087	valid_1's auc: 0.791773
[600]	training's auc: 0.829432	valid_1's auc: 0.793818
[700]	training's auc: 0.836796	valid_1's auc: 0.795145
[800]	training's auc: 0.843559	valid_1's auc: 0.795994
[900]	training's auc: 0.849939	valid_1's auc: 0.79683
[1000]	training's auc: 0.855949	valid_1's auc: 0.797132
[1100]	training's auc: 0.861521	valid_1's auc: 0.797651
[1200]	training's auc: 0.8669	valid_1's auc: 0.798003
[1300]	training's auc: 0.871726	valid_1's auc: 0.798226
[1400]	training's auc: 0.876587	valid_1's auc: 0.798306
[1500]	training's auc: 0.881077	valid_1's auc: 0.798281
Early stopping, best iteration is:
[1361]	training's auc: 0.874663	valid_1's auc: 0.798417
Fold  1 AUC : 0.79

In [35]:
# 0.789586
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
merged['AMT_ANNUITY_RATE_MAX'] = merged['AMT_ANNUITY'] /merged['PREV_AMT_ANNUITY_MAX'] 
#merged['AMT_ANNUITY_RATE_MEAN'] = merged['AMT_ANNUITY'] /merged['PREV_AMT_ANNUITY_MEAN'] 

FoldSubmit(merged, index_cols, model = clf)

100%|██████████| 6/6 [00:21<00:00,  3.07s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.756957	valid_1's auc: 0.756891
[200]	training's auc: 0.780862	valid_1's auc: 0.772729
[300]	training's auc: 0.799066	valid_1's auc: 0.783649
[400]	training's auc: 0.811558	valid_1's auc: 0.788937
[500]	training's auc: 0.821336	valid_1's auc: 0.792092
[600]	training's auc: 0.8296	valid_1's auc: 0.794144
[700]	training's auc: 0.837105	valid_1's auc: 0.795297
[800]	training's auc: 0.843982	valid_1's auc: 0.796121
[900]	training's auc: 0.850252	valid_1's auc: 0.796821
[1000]	training's auc: 0.856238	valid_1's auc: 0.797495
[1100]	training's auc: 0.861854	valid_1's auc: 0.797817
[1200]	training's auc: 0.867038	valid_1's auc: 0.798043
[1300]	training's auc: 0.871992	valid_1's auc: 0.798343
[1400]	training's auc: 0.876621	valid_1's auc: 0.7985
[1500]	training's auc: 0.881303	valid_1's auc: 0.798573
[1600]	training's auc: 0.885856	valid_1's auc: 0.798694
[1700]	training's auc: 0.890012	valid_1's auc: 0.79894

In [36]:
# 0.789586
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
#merged['AMT_CREDIT_RATE_MEAN'] = merged['AMT_CREDIT'] /merged['PREV_AMT_CREDIT_MEAN'] 
merged['AMT_CREDIT_RATE_MAX'] = merged['AMT_CREDIT'] /merged['PREV_AMT_CREDIT_MAX'] 

FoldSubmit(merged, index_cols, model = clf)

100%|██████████| 6/6 [00:20<00:00,  2.99s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.75664	valid_1's auc: 0.756782
[200]	training's auc: 0.780676	valid_1's auc: 0.772554
[300]	training's auc: 0.79881	valid_1's auc: 0.783433
[400]	training's auc: 0.811337	valid_1's auc: 0.788605
[500]	training's auc: 0.821168	valid_1's auc: 0.791696
[600]	training's auc: 0.829399	valid_1's auc: 0.79378
[700]	training's auc: 0.836833	valid_1's auc: 0.795124
[800]	training's auc: 0.843668	valid_1's auc: 0.796164
[900]	training's auc: 0.850002	valid_1's auc: 0.796759
[1000]	training's auc: 0.855899	valid_1's auc: 0.797201
[1100]	training's auc: 0.861648	valid_1's auc: 0.797734
[1200]	training's auc: 0.866949	valid_1's auc: 0.797969
[1300]	training's auc: 0.87179	valid_1's auc: 0.79814
[1400]	training's auc: 0.876614	valid_1's auc: 0.798221
[1500]	training's auc: 0.881332	valid_1's auc: 0.798375
[1600]	training's auc: 0.885771	valid_1's auc: 0.79842
[1700]	training's auc: 0.88993	valid_1's auc: 0.798444
[

In [37]:
# 0.789586
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
#merged['AMT_GOODS_RATE_MEAN'] = merged['AMT_GOODS_PRICE'] /merged['PREV_AMT_GOODS_PRICE_MEAN'] 
merged['AMT_GOODS_RATE_MAX'] = merged['AMT_GOODS_PRICE'] /merged['PREV_AMT_GOODS_PRICE_MAX'] 

FoldSubmit(merged, index_cols, model = clf)

100%|██████████| 6/6 [00:10<00:00,  1.85s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.756585	valid_1's auc: 0.756926
[200]	training's auc: 0.780831	valid_1's auc: 0.772937
[300]	training's auc: 0.79911	valid_1's auc: 0.783877
[400]	training's auc: 0.811403	valid_1's auc: 0.78909
[500]	training's auc: 0.821222	valid_1's auc: 0.792154
[600]	training's auc: 0.829451	valid_1's auc: 0.794023
[700]	training's auc: 0.836726	valid_1's auc: 0.795094
[800]	training's auc: 0.843724	valid_1's auc: 0.795815
[900]	training's auc: 0.849999	valid_1's auc: 0.79655
[1000]	training's auc: 0.855921	valid_1's auc: 0.797101
[1100]	training's auc: 0.861511	valid_1's auc: 0.797679
[1200]	training's auc: 0.866845	valid_1's auc: 0.797777
[1300]	training's auc: 0.871599	valid_1's auc: 0.797998
[1400]	training's auc: 0.87642	valid_1's auc: 0.798179
[1500]	training's auc: 0.881189	valid_1's auc: 0.798371
[1600]	training's auc: 0.885698	valid_1's auc: 0.798708
[1700]	training's auc: 0.889885	valid_1's auc: 0.79882

In [38]:
# 0.789586
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
#merged['AMT_CREDIT_RATE_MEAN'] = merged['AMT_CREDIT'] /merged['PREV_AMT_CREDIT_MEAN'] 
merged['AMT_CREDIT_RATE_MAX'] = np.log1p(merged['AMT_CREDIT']) /np.log1p(merged['PREV_AMT_CREDIT_MAX'])

FoldSubmit(merged, index_cols, model = clf)

100%|██████████| 6/6 [00:18<00:00,  2.65s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.756699	valid_1's auc: 0.757058
[200]	training's auc: 0.780709	valid_1's auc: 0.77265
[300]	training's auc: 0.799052	valid_1's auc: 0.783585
[400]	training's auc: 0.811342	valid_1's auc: 0.788645
[500]	training's auc: 0.821216	valid_1's auc: 0.791802
[600]	training's auc: 0.829501	valid_1's auc: 0.793683
[700]	training's auc: 0.837101	valid_1's auc: 0.795012
[800]	training's auc: 0.844134	valid_1's auc: 0.795872
[900]	training's auc: 0.850381	valid_1's auc: 0.796596
[1000]	training's auc: 0.856235	valid_1's auc: 0.797135
[1100]	training's auc: 0.861771	valid_1's auc: 0.797493
[1200]	training's auc: 0.867014	valid_1's auc: 0.797715
[1300]	training's auc: 0.871938	valid_1's auc: 0.797887
[1400]	training's auc: 0.876835	valid_1's auc: 0.798184
[1500]	training's auc: 0.88138	valid_1's auc: 0.798195
[1600]	training's auc: 0.88599	valid_1's auc: 0.798397
[1700]	training's auc: 0.890251	valid_1's auc: 0.7984

In [None]:
# 0.789586
merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
#merged['AMT_ANNUITY_RATE_MEAN'] = merged['AMT_ANNUITY'] /merged['PREV_AMT_ANNUITY_MAX'] 
#merged['AMT_CREDIT_rate'] = merged['AMT_CREDIT'] /merged['PREV_AMT_CREDIT'] 
merged['AMT_GOODS_PRICE_rate'] = merged['AMT_GOODS_PRICE'] /merged['PREV_AMT_GOODS_PRICE_MAX'] 

FoldSubmit(merged, index_cols)

In [18]:
# 0.789586
index_cols = ['SK_ID_CURR','TEST','TARGET']
buro = pd.read_pickle('edit/tmp_data/buro.pkl')

merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
merged['AMT_ANNUITY_rate'] = merged['AMT_ANNUITY_x'] /merged['AMT_ANNUITY_y'] 
merged['AMT_CREDIT_rate'] = merged['AMT_CREDIT_x'] /merged['AMT_CREDIT_y'] 
merged['AMT_GOODS_PRICE_rate'] = merged['AMT_GOODS_PRICE_x'] /merged['AMT_GOODS_PRICE_y'] 

clf_set = FoldSubmit(merged, index_cols,return_clf=True)

100%|██████████| 6/6 [00:08<00:00,  1.44s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.764246	valid_1's auc: 0.762301
[200]	training's auc: 0.795982	valid_1's auc: 0.783074
[300]	training's auc: 0.815063	valid_1's auc: 0.790463
Did not meet early stopping. Best iteration is:
[300]	training's auc: 0.815063	valid_1's auc: 0.790463
Fold  1 AUC : 0.790463
Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.767546	valid_1's auc: 0.745799
[200]	training's auc: 0.798402	valid_1's auc: 0.768883
[300]	training's auc: 0.817315	valid_1's auc: 0.777389
Did not meet early stopping. Best iteration is:
[300]	training's auc: 0.817315	valid_1's auc: 0.777389
Fold  2 AUC : 0.777389
Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.765021	valid_1's auc: 0.745166
[200]	training's auc: 0.797105	valid_1's auc: 0.766359
[300]	training's auc: 0.815552	valid_1's auc: 0.773943
Did not meet early stopping. Best iteration is:
[300]	training's

In [12]:
# 0.789586
index_cols = ['SK_ID_CURR','TEST','TARGET']
cc_bal = pd.read_pickle('edit/tmp_data/cc_bal.pkl')
pos = pd.read_pickle('edit/tmp_data/pos.pkl')
inst = pd.read_pickle('edit/tmp_data/inst.pkl')
all_data = pd.read_pickle('edit/tmp_data/all_data.pkl')

merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
merged['AMT_ANNUITY_rate'] = merged['AMT_ANNUITY_x'] /merged['AMT_ANNUITY_y'] 
merged['AMT_CREDIT_rate'] = merged['AMT_CREDIT_x'] /merged['AMT_CREDIT_y'] 
merged['AMT_GOODS_PRICE_rate'] = merged['AMT_GOODS_PRICE_x'] /merged['AMT_GOODS_PRICE_y'] 

clf_set = FoldSubmit(merged, index_cols, model=clf, submit='180618.csv',return_clf=True)

100%|██████████| 6/6 [00:12<00:00,  1.95s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.75954	valid_1's auc: 0.74731
[200]	training's auc: 0.786311	valid_1's auc: 0.76243
[300]	training's auc: 0.806563	valid_1's auc: 0.77286
[400]	training's auc: 0.821499	valid_1's auc: 0.777849
[500]	training's auc: 0.83449	valid_1's auc: 0.78013
[600]	training's auc: 0.845498	valid_1's auc: 0.781316
[700]	training's auc: 0.85544	valid_1's auc: 0.78239
[800]	training's auc: 0.864609	valid_1's auc: 0.78329
[900]	training's auc: 0.873177	valid_1's auc: 0.783865
[1000]	training's auc: 0.880971	valid_1's auc: 0.784062
[1100]	training's auc: 0.88831	valid_1's auc: 0.784061
[1200]	training's auc: 0.894978	valid_1's auc: 0.784203
[1300]	training's auc: 0.901189	valid_1's auc: 0.784264
Early stopping, best iteration is:
[1242]	training's auc: 0.897831	valid_1's auc: 0.784364
Fold  1 AUC : 0.784364
Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.761212	valid_1's auc: 0.744

In [8]:
# 0.789586
index_cols = ['SK_ID_CURR','TEST','TARGET']
cc_bal = pd.read_pickle('edit/tmp_data/cc_bal.pkl')
pos = pd.read_pickle('edit/tmp_data/pos.pkl')
inst = pd.read_pickle('edit/tmp_data/inst.pkl')
all_data = pd.read_pickle('edit/tmp_data/all_data.pkl')

merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
merged['AMT_ANNUITY_rate'] = merged['AMT_ANNUITY_x'] /merged['AMT_ANNUITY_y'] 
merged['AMT_CREDIT_rate'] = merged['AMT_CREDIT_x'] /merged['AMT_CREDIT_y'] 
merged['AMT_GOODS_PRICE_rate'] = merged['AMT_GOODS_PRICE_x'] /merged['AMT_GOODS_PRICE_y'] 

clf_set = FoldSubmit(merged, index_cols, model=clf, submit='180618.csv',return_clf=True)

100%|██████████| 6/6 [00:12<00:00,  1.93s/it]


Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.757972	valid_1's auc: 0.748203
[200]	training's auc: 0.782353	valid_1's auc: 0.762715
[300]	training's auc: 0.801223	valid_1's auc: 0.773366
[400]	training's auc: 0.81374	valid_1's auc: 0.778345
[500]	training's auc: 0.823949	valid_1's auc: 0.781248
[600]	training's auc: 0.8326	valid_1's auc: 0.783197
[700]	training's auc: 0.840151	valid_1's auc: 0.784301
[800]	training's auc: 0.846925	valid_1's auc: 0.784888
[900]	training's auc: 0.853499	valid_1's auc: 0.78562
[1000]	training's auc: 0.859743	valid_1's auc: 0.785689
[1100]	training's auc: 0.865634	valid_1's auc: 0.785988
[1200]	training's auc: 0.871095	valid_1's auc: 0.786509
[1300]	training's auc: 0.87647	valid_1's auc: 0.786764
[1400]	training's auc: 0.881555	valid_1's auc: 0.786882
[1500]	training's auc: 0.886324	valid_1's auc: 0.786888
[1600]	training's auc: 0.891015	valid_1's auc: 0.78689
Early stopping, best iteration is:
[1528]	training's auc

In [None]:
# 0.789586
index_cols = ['SK_ID_CURR','TEST','TARGET']
cc_bal = pd.read_pickle('edit/tmp_data/cc_bal.pkl')
pos = pd.read_pickle('edit/tmp_data/pos.pkl')
inst = pd.read_pickle('edit/tmp_data/inst.pkl')
bburo = pd.read_pickle('edit/tmp_data/buroX.pkl')
all_data = pd.read_pickle('edit/tmp_data/all_data.pkl')

merged = DataMerger([all_data, buro, cc_bal, inst, pos, prev], 'SK_ID_CURR')
merged['AMT_ANNUITY_rate'] = merged['AMT_ANNUITY_x'] /merged['AMT_ANNUITY_y'] 
merged['AMT_CREDIT_rate'] = merged['AMT_CREDIT_x'] /merged['AMT_CREDIT_y'] 
merged['AMT_GOODS_PRICE_rate'] = merged['AMT_GOODS_PRICE_x'] /merged['AMT_GOODS_PRICE_y'] 

clf_set = FoldSubmit(merged, index_cols, model=clf, submit='180614.csv',return_clf=True)

In [20]:
fdf = pd.DataFrame(np.c_[np.array(merged.drop(index_cols,axis=1).columns),clf_set[2].feature_importances_.tolist()], columns=['feature','score']).sort_values('score',ascending=False)

In [22]:
fdf.head()

Unnamed: 0,feature,score
37,EXT_SOURCE_2,1136
38,EXT_SOURCE_3,1066
110,ANNUITY LENGTH,1009
22,DAYS_BIRTH,777
374,CNT_INSTALMENT_FUTURE,759


In [27]:
fdf[fdf.score>200]['feature'].values

array(['EXT_SOURCE_2', 'EXT_SOURCE_3', 'ANNUITY LENGTH', 'DAYS_BIRTH',
       'CNT_INSTALMENT_FUTURE', 'EXT_SOURCE_1p2', 'EXT_SOURCE_1',
       'max_buro_DAYS_CREDIT', 'AMT_ANNUITY_x', 'SK_DPD_DEF_y',
       'AMT_PAYMENT', 'DAYS_ID_PUBLISH', 'AMT_ANNUITY_rate',
       'SK_ID_CURR_CNT_POS_CASH', 'CNT_PAYMENT', 'AMT_GOODS_PRICE_x',
       'max_buro_DAYS_CREDIT_ENDDATE', 'AMT_CREDIT_x',
       'DAYS_LAST_DUE_1ST_VERSION', 'ANNUITY_INCOME_RATIO',
       'WORKING_LIFE_RATIO', 'max_buro_DAYS_ENDDATE_FACT',
       'DAYS_REGISTRATION', 'AMT_DOWN_PAYMENT', 'DAYS_EMPLOYED',
       'med_buro_AMT_CREDIT_SUM', 'med_buro_DAYS_CREDIT_ENDDATE',
       'NAME_CONTRACT_STATUS_Refused', 'OWN_CAR_AGE', 'EXT_SOURCE_12',
       'CODE_GENDER', 'CNT_DRAWINGS_CURRENT', 'DAYS_LAST_PHONE_CHANGE',
       'CNT_INSTALMENT', 'avg_buro_AMT_CREDIT_SUM_DEBT',
       'ORGANIZATION_TYPE', 'SK_ID_PREV_y', 'REGION_POPULATION_RELATIVE',
       'HOUR_APPR_PROCESS_START_y', 'min_buro_AMT_CREDIT_SUM',
       'RATE_DOWN_PAYMENT'

In [33]:
# base model
clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.03,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

In [34]:
FoldSubmit(merged, index_cols, model=clf, submit='1806142.csv',return_clf=True)

Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.744824	valid_1's auc: 0.747261
[200]	training's auc: 0.75573	valid_1's auc: 0.756175
[300]	training's auc: 0.768898	valid_1's auc: 0.76486
[400]	training's auc: 0.781248	valid_1's auc: 0.773629
[500]	training's auc: 0.794105	valid_1's auc: 0.781848
[600]	training's auc: 0.800675	valid_1's auc: 0.785305
[700]	training's auc: 0.806689	valid_1's auc: 0.787553
[800]	training's auc: 0.811965	valid_1's auc: 0.789465
[900]	training's auc: 0.817324	valid_1's auc: 0.791181
[1000]	training's auc: 0.821766	valid_1's auc: 0.792205
[1100]	training's auc: 0.826491	valid_1's auc: 0.793373
[1200]	training's auc: 0.829627	valid_1's auc: 0.794012
[1300]	training's auc: 0.833501	valid_1's auc: 0.794861
[1400]	training's auc: 0.837567	valid_1's auc: 0.795459
[1500]	training's auc: 0.840797	valid_1's auc: 0.795942
[1600]	training's auc: 0.844071	valid_1's auc: 0.796286
[1700]	training's auc: 0.847801	valid_1's auc: 0.796

[LGBMClassifier(boosting_type='dart', class_weight=None,
         colsample_bytree=0.9497036, learning_rate=0.03, max_depth=8,
         min_child_samples=20, min_child_weight=39.3259775,
         min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1, nthread=4,
         num_leaves=34, objective=None, random_state=None,
         reg_alpha=0.041545473, reg_lambda=0.0735294, silent=-1,
         subsample=0.8715623, subsample_for_bin=200000, subsample_freq=1,
         verbose=-1), LGBMClassifier(boosting_type='dart', class_weight=None,
         colsample_bytree=0.9497036, learning_rate=0.03, max_depth=8,
         min_child_samples=20, min_child_weight=39.3259775,
         min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1, nthread=4,
         num_leaves=34, objective=None, random_state=None,
         reg_alpha=0.041545473, reg_lambda=0.0735294, silent=-1,
         subsample=0.8715623, subsample_for_bin=200000, subsample_freq=1,
         verbose=-1), LGBMClassifier(boosting_type='dar