In [1]:
import os
import gc
import numpy as np 
import pandas as pd

In [2]:
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

In [3]:
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,  KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [4]:
import lightgbm as lgb

In [5]:
train = pd.read_csv('application_train.csv', index_col='SK_ID_CURR')
test = pd.read_csv('application_test.csv', index_col='SK_ID_CURR')
test['TARGET'] = 2
traintest = pd.concat([train, test], sort=False).sort_index()

In [6]:
traintest.head().T

SK_ID_CURR,100001,100002,100003,100004,100005
TARGET,2,1,0,0,2
NAME_CONTRACT_TYPE,Cash loans,Cash loans,Cash loans,Revolving loans,Cash loans
CODE_GENDER,F,M,F,M,M
FLAG_OWN_CAR,N,N,N,Y,N
FLAG_OWN_REALTY,Y,Y,N,Y,Y
CNT_CHILDREN,0,0,0,0,0
AMT_INCOME_TOTAL,135000,202500,270000,67500,99000
AMT_CREDIT,568800,406598,1.2935e+06,135000,222768
AMT_ANNUITY,20560.5,24700.5,35698.5,6750,17370
AMT_GOODS_PRICE,450000,351000,1.1295e+06,135000,180000


In [7]:
test.dtypes
numcols = test.select_dtypes(exclude='object').columns.tolist()
numcols.remove('TARGET')
for n in numcols:
    min_ = np.amin(test[n])
    max_ = np.amax(test[n])
    train[n] = train[n].clip(min_, max_)
        
train = train[train.CODE_GENDER != 'XNA']
train = train[train.NAME_INCOME_TYPE != 'Maternity leave']
train = train[train.NAME_FAMILY_STATUS != 'Unknown']
    
traintest = pd.concat([train, test], sort=False).sort_index()
traintest['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
traintest['TT_NULLCOUNT'] = traintest.isnull().sum(axis=1)
del train; del test
gc.collect()
traintest.head().T

SK_ID_CURR,100001,100002,100003,100004,100005
TARGET,2,1,0,0,2
NAME_CONTRACT_TYPE,Cash loans,Cash loans,Cash loans,Revolving loans,Cash loans
CODE_GENDER,F,M,F,M,M
FLAG_OWN_CAR,N,N,N,Y,N
FLAG_OWN_REALTY,Y,Y,N,Y,Y
CNT_CHILDREN,0,0,0,0,0
AMT_INCOME_TOTAL,135000,202500,270000,67500,99000
AMT_CREDIT,568800,406598,1.2935e+06,135000,222768
AMT_ANNUITY,20560.5,24700.5,35698.5,6750,17370
AMT_GOODS_PRICE,450000,351000,1.1295e+06,135000,180000


In [8]:
traintest['HOME_OWNER_TYPE'] = traintest['FLAG_OWN_REALTY'] + traintest['NAME_HOUSING_TYPE']
traintest['OCC_TYPE_GENDER'] = traintest['CODE_GENDER'] + traintest['OCCUPATION_TYPE']
traintest['HOME_OWNER_GENDER'] = traintest['CODE_GENDER'] + traintest['FLAG_OWN_REALTY']

In [9]:
traintest['ext_sources_mean'] = traintest[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
traintest['ext_sources_std'] = traintest[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)

traintest['annuity_income_percentage'] = traintest['AMT_ANNUITY'] / traintest['AMT_INCOME_TOTAL']
traintest['car_to_birth_ratio'] = traintest['OWN_CAR_AGE'] / traintest['DAYS_BIRTH']
traintest['car_to_employ_ratio'] = traintest['OWN_CAR_AGE'] / traintest['DAYS_EMPLOYED']
traintest['children_ratio'] = traintest['CNT_CHILDREN'] / traintest['CNT_FAM_MEMBERS']
traintest['credit_to_annuity_ratio'] = traintest['AMT_CREDIT'] / traintest['AMT_ANNUITY']
traintest['credit_to_goods_ratio'] = traintest['AMT_CREDIT'] / traintest['AMT_GOODS_PRICE']
traintest['days_employed_percentage'] = traintest['DAYS_EMPLOYED'] / traintest['DAYS_BIRTH']
traintest['income_per_child'] = traintest['AMT_INCOME_TOTAL'] / (1 + traintest['CNT_CHILDREN'])
traintest['income_per_person'] = traintest['AMT_INCOME_TOTAL'] / traintest['CNT_FAM_MEMBERS']
traintest['payment_rate'] = traintest['AMT_ANNUITY'] / traintest['AMT_CREDIT']
traintest['phone_to_birth_ratio'] = traintest['DAYS_LAST_PHONE_CHANGE'] / traintest['DAYS_BIRTH']
traintest['phone_to_employ_ratio'] = traintest['DAYS_LAST_PHONE_CHANGE'] / traintest['DAYS_EMPLOYED']

In [10]:
traintest['ANNUITY_GROUPED'] = traintest.groupby(['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'])['EXT_SOURCE_1'].transform('mean')

In [11]:
NUMERICAL_COLUMNS = ['AMT_ANNUITY',
                        'AMT_CREDIT',
                        'AMT_INCOME_TOTAL',
                        'AMT_REQ_CREDIT_BUREAU_YEAR',
                        'DAYS_BIRTH',
                        'EXT_SOURCE_1', 
                        'EXT_SOURCE_2',
                        'EXT_SOURCE_3',
                        'AMT_REQ_CREDIT_BUREAU_YEAR'
                        ]

In [12]:
CAT_COLUMNS =  [['OCCUPATION_TYPE'],
                    ['CODE_GENDER', 'NAME_EDUCATION_TYPE'],
                    ['FLAG_OWN_REALTY', 'NAME_HOUSING_TYPE'],
                    ['CODE_GENDER', 'ORGANIZATION_TYPE'],
                    ['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'],
                    ['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY']]

In [13]:
for agg in ['mean', 'max', 'min']:
    for numcol in NUMERICAL_COLUMNS:
        for catgroup in CAT_COLUMNS:
            traintest[numcol+'_'.join(catgroup)+agg] = traintest.groupby(catgroup)[numcol].transform(agg)

In [14]:
traintest['previous_employment'] = (traintest['DAYS_EMPLOYED'] > -2000).astype(int)
traintest['retirement_age'] = (traintest['DAYS_BIRTH'] > -14000).astype(int)

In [15]:
uselesses = ['FLAG_DOCUMENT_10',
                        'FLAG_DOCUMENT_12',
                        'FLAG_DOCUMENT_13',
                        'FLAG_DOCUMENT_14',
                        'FLAG_DOCUMENT_15',
                        'FLAG_DOCUMENT_16',
                        'FLAG_DOCUMENT_17',
                        'FLAG_DOCUMENT_19',
                        'FLAG_DOCUMENT_2',
                        'FLAG_DOCUMENT_20',
                        'FLAG_DOCUMENT_21',
                        'FLAG_DOCUMENT_4',
                        'FLAG_DOCUMENT_7',
                        'FLAG_DOCUMENT_9']

In [16]:
traintest = traintest.drop(uselesses, axis=1)

In [17]:
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df,new_columns

In [18]:
bureau = pd.read_csv("bureau.csv")
bb = pd.read_csv("bureau_balance.csv")
prev = pd.read_csv('previous_application.csv')
pos = pd.read_csv('POS_CASH_balance.csv')
ins = pd.read_csv('installments_payments.csv')
cc = pd.read_csv('credit_card_balance.csv')

In [19]:
_, bb_cat = one_hot_encoder(bb, nan_as_category= True)

In [20]:
bb.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [21]:
bb_cat

['STATUS_0',
 'STATUS_1',
 'STATUS_2',
 'STATUS_3',
 'STATUS_4',
 'STATUS_5',
 'STATUS_C',
 'STATUS_X',
 'STATUS_nan']

In [29]:
    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size'],
          }
    
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)

In [36]:
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [38]:
_, bureau_cat = one_hot_encoder(bureau, nan_as_category= True)

In [39]:
bureau_cat

['CREDIT_ACTIVE_Active',
 'CREDIT_ACTIVE_Bad debt',
 'CREDIT_ACTIVE_Closed',
 'CREDIT_ACTIVE_Sold',
 'CREDIT_ACTIVE_nan',
 'CREDIT_CURRENCY_currency 1',
 'CREDIT_CURRENCY_currency 2',
 'CREDIT_CURRENCY_currency 3',
 'CREDIT_CURRENCY_currency 4',
 'CREDIT_CURRENCY_nan',
 'CREDIT_TYPE_Another type of loan',
 'CREDIT_TYPE_Car loan',
 'CREDIT_TYPE_Cash loan (non-earmarked)',
 'CREDIT_TYPE_Consumer credit',
 'CREDIT_TYPE_Credit card',
 'CREDIT_TYPE_Interbank credit',
 'CREDIT_TYPE_Loan for business development',
 'CREDIT_TYPE_Loan for purchase of shares (margin lending)',
 'CREDIT_TYPE_Loan for the purchase of equipment',
 'CREDIT_TYPE_Loan for working capital replenishment',
 'CREDIT_TYPE_Microloan',
 'CREDIT_TYPE_Mobile operator loan',
 'CREDIT_TYPE_Mortgage',
 'CREDIT_TYPE_Real estate loan',
 'CREDIT_TYPE_Unknown type of loan',
 'CREDIT_TYPE_nan']

In [40]:
def bureau_and_balance(num_rows = None, nan_as_category = True):
    bb = pd.read_csv("bureau_balance.csv")
    bureau = pd.read_csv("bureau.csv")
    _, bb_cat = one_hot_encoder(bb, nan_as_category= True)
    _, bureau_cat = one_hot_encoder(bureau, nan_as_category= True)
        
    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size'],
          }
    
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()
        
    num_aggregations = {
            'DAYS_CREDIT': [ 'mean', 'var'],
            'DAYS_CREDIT_ENDDATE': [ 'mean'],
            'DAYS_CREDIT_UPDATE': ['mean'],
            'CREDIT_DAY_OVERDUE': ['mean'],
            'AMT_CREDIT_MAX_OVERDUE': ['mean'],
            'AMT_CREDIT_SUM': [ 'mean', 'sum'],
            'AMT_CREDIT_SUM_DEBT': [ 'mean', 'sum'],
            'AMT_CREDIT_SUM_OVERDUE': ['mean'],
            'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
            'AMT_ANNUITY': ['max', 'mean'],
            'CNT_CREDIT_PROLONG': ['sum'],
        }


    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
        
    bureau_agg = bureau.groupby('SK_ID_CURR').agg(num_aggregations ,cat_aggregations)
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    #active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    #active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    #active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    #bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
   # del active, active_agg
    #gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    #closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    #closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    #closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
   # bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    #del closed, closed_agg, bureau
    #gc.collect()
    return bureau_agg


In [41]:
def previous_applications(num_rows = None, nan_as_category = True):
    prev = pd.read_csv('previous_application.csv')
    prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
            'AMT_ANNUITY': [ 'max', 'mean'],
            'AMT_APPLICATION': ['min', 'mean'],
            'AMT_CREDIT': ['min', 'max', 'mean'],
            'APP_CREDIT_PERC': ['min', 'max', 'mean'],
            'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
            'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
            'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
            'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
            'DAYS_DECISION': ['min', 'max', 'mean'],
            'CNT_PAYMENT': ['mean', 'sum'],
        }
    
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']
        
    prev_agg = prev.groupby('SK_ID_CURR').agg(num_aggregations, cat_aggregations)
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
        
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
        
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg


In [42]:
    def pos_cash(num_rows = None, nan_as_category = True):
        pos = pd.read_csv('POS_CASH_balance.csv')
        pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)
        
        # Features
        aggregations = {
            'MONTHS_BALANCE': ['max', 'mean', 'size'],
            'SK_DPD': ['max', 'mean'],
            'SK_DPD_DEF': ['max', 'mean']
        }
        for cat in cat_cols:
            aggregations[cat] = ['mean']
        
        pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
        pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
        
        # Count pos cash accounts
        pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
        del pos
        gc.collect()
        return pos_agg

In [43]:
    def installments_payments(num_rows = None, nan_as_category = True):
        ins = pd.read_csv('installments_payments.csv')
        ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
        
        # Percentage and difference paid in each installment (amount paid and installment value)
        ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
        ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
        
        # Days past due and days before due (no negative values)
        ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
        ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
        ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
        ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
        
        # Features: Perform aggregations
        aggregations = {
            'NUM_INSTALMENT_VERSION': ['nunique'],
            'DPD': ['max', 'mean', 'sum'],
            'DBD': ['max', 'mean', 'sum'],
            'PAYMENT_PERC': [ 'mean', 'sum', 'var'],
            'PAYMENT_DIFF': [ 'mean', 'sum', 'var'],
            'AMT_INSTALMENT': ['max', 'mean', 'sum'],
            'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
            'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
        }
        for cat in cat_cols:
            aggregations[cat] = ['mean']
        ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
        ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
        
        # Count installments accounts
        ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
        del ins
        gc.collect()
        return ins_agg


In [44]:
    def credit_card_balance(num_rows = None, nan_as_category = True):
        cc = pd.read_csv('credit_card_balance.csv')
        cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)
        
        # General aggregations
        cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)
        cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
        cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
        
        # Count credit card lines
        cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
        del cc
        gc.collect()
        return cc_agg

In [45]:
    bureau = bureau_and_balance()
    traintest = traintest.join(bureau)
    del bureau
    print('bureau segment done')

bureau segment done


In [46]:
    prev = previous_applications()
    traintest = traintest.join(prev)
    del prev
    print('prev segment done')

    pos = pos_cash()
    traintest = traintest.join(pos)
    del pos
    print('pos segment done')

    ins = installments_payments()
    traintest = traintest.join(ins)
    del ins
    print('ins segment done')

    cc = credit_card_balance()
    traintest = traintest.join(cc)
    del cc
    print('cc segment done')

traintest.shape


prev segment done
pos segment done
ins segment done
cc segment done


(356244, 552)

In [47]:
    objcols = traintest.select_dtypes('object')
    for c in objcols.columns:
        traintest[c] = traintest[c].astype('str')
        traintest[c] = LabelEncoder().fit_transform(traintest[c])
        if traintest[c].nunique() < 50:
            traintest[c] = traintest[c].astype('category')

    train = traintest[traintest.TARGET != 2]
    test = traintest[traintest.TARGET == 2]

    train.dtypes[train.dtypes=='category'] ####
    train.select_dtypes('category').dtypes ####
    print(train.shape, test.shape)

    X = train.drop('TARGET', axis=1)
    y = train['TARGET']
    X_test = test.drop('TARGET', axis=1)

    del(traintest)
    gc.collect()

train.head().T


((307500, 552), (48744, 552))


SK_ID_CURR,100002,100003,100004,100006,100007
TARGET,1,0,0,0,0
NAME_CONTRACT_TYPE,0,0,1,0,0
CODE_GENDER,1,0,1,0,1
FLAG_OWN_CAR,0,0,1,0,0
FLAG_OWN_REALTY,1,0,1,1,1
CNT_CHILDREN,0,0,0,0,0
AMT_INCOME_TOTAL,202500,270000,67500,135000,121500
AMT_CREDIT,406598,1.2935e+06,135000,312682,513000
AMT_ANNUITY,24700.5,35698.5,6750,29686.5,21865.5
AMT_GOODS_PRICE,351000,1.1295e+06,135000,297000,513000


In [48]:
    params = {'objective':'binary',
            'metric':'auc',
            'boosting':'gbdt', 
            'num_leaves':40,  
            'max_depth':8,  #6
            'learning_rate':0.02, 
            'subsample_for_bin':200000, 
            'class_weight':None, 
            'min_child_samples':60, #40
            'subsample':0.9, 
            'reg_lambda':10.0,  #10
            'reg_alpha':10.0,   ###
            'min_data_in_leaf':1000,    ###
            'predict_contrib':True, 
            'subsample_freq':0, 
            'colsample_bytree':0.75,  #0.85
            'num_threads':3}

    sub_preds = np.zeros_like(test.TARGET, dtype=float)
    oof_preds = np.zeros_like(y, dtype=float)
    cv = KFold(n_splits=4, shuffle=True, random_state=42)
    for trn_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]
    
        trainDS = lgb.Dataset(X_train, label=y_train.values)
        valDS = lgb.Dataset(X_val, label=y_val.values, reference=trainDS)
        
        evalnums = {}
        lmod = lgb.train(params, trainDS, num_boost_round=1400, early_stopping_rounds=50,
            valid_sets=[trainDS, valDS], evals_result=evalnums, verbose_eval=20)
        
        oof_preds[val_idx] = lmod.predict(X_val)
        sub_preds += lmod.predict(X_test.values)/cv.n_splits 
    roc_auc_score(y, oof_preds)   




Training until validation scores don't improve for 50 rounds.
[20]	training's auc: 0.757419	valid_1's auc: 0.743745
[40]	training's auc: 0.764103	valid_1's auc: 0.747303
[60]	training's auc: 0.769642	valid_1's auc: 0.750529
[80]	training's auc: 0.774697	valid_1's auc: 0.753746
[100]	training's auc: 0.779529	valid_1's auc: 0.756764
[120]	training's auc: 0.784136	valid_1's auc: 0.759835
[140]	training's auc: 0.788549	valid_1's auc: 0.762587
[160]	training's auc: 0.792513	valid_1's auc: 0.765065
[180]	training's auc: 0.796147	valid_1's auc: 0.767111
[200]	training's auc: 0.799234	valid_1's auc: 0.768707
[220]	training's auc: 0.802152	valid_1's auc: 0.770142
[240]	training's auc: 0.804678	valid_1's auc: 0.771262
[260]	training's auc: 0.807139	valid_1's auc: 0.772344
[280]	training's auc: 0.809425	valid_1's auc: 0.773278
[300]	training's auc: 0.811529	valid_1's auc: 0.773979
[320]	training's auc: 0.813539	valid_1's auc: 0.774793
[340]	training's auc: 0.815521	valid_1's auc: 0.775459
[360]	t

[140]	training's auc: 0.786827	valid_1's auc: 0.767033
[160]	training's auc: 0.790891	valid_1's auc: 0.769717
[180]	training's auc: 0.794387	valid_1's auc: 0.772052
[200]	training's auc: 0.797481	valid_1's auc: 0.773914
[220]	training's auc: 0.800349	valid_1's auc: 0.775267
[240]	training's auc: 0.802964	valid_1's auc: 0.77661
[260]	training's auc: 0.805338	valid_1's auc: 0.777743
[280]	training's auc: 0.807594	valid_1's auc: 0.778667
[300]	training's auc: 0.809633	valid_1's auc: 0.779408
[320]	training's auc: 0.811597	valid_1's auc: 0.780058
[340]	training's auc: 0.813437	valid_1's auc: 0.780719
[360]	training's auc: 0.815265	valid_1's auc: 0.781241
[380]	training's auc: 0.817025	valid_1's auc: 0.781704
[400]	training's auc: 0.818799	valid_1's auc: 0.782275
[420]	training's auc: 0.820474	valid_1's auc: 0.782656
[440]	training's auc: 0.822103	valid_1's auc: 0.783072
[460]	training's auc: 0.823652	valid_1's auc: 0.783416
[480]	training's auc: 0.825208	valid_1's auc: 0.783711
[500]	train

0.7876058250469338

In [49]:
sub = pd.read_csv('sample_submission.csv')
sub['TARGET'] = np.around(sub_preds, 4)
sub.reset_index()
sub.to_csv('sub.csv', index=False)

In [50]:
sub.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.0347
1,100005,0.1316
2,100013,0.0328
3,100028,0.0427
4,100038,0.1307
