In [17]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
import lightgbm as lgb
import gc
import hashlib
import tqdm

In [18]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [19]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [20]:
import pickle
import gc
def get_type_feature_all(sample, train_df, key, on, type_c, mark):
    filename = "_".join([mark + "_%s_features" % type_c, "_".join(key), on, str(len(sample))]) + ".pkl"
    try:
        with open("feature_all/"+filename, "rb") as fp:
            print("load {} {} feature_all from pickle file: key: {}, on: {}...".format(mark, type_c, "_".join(key), on))
            col = pickle.load(fp)
        for c in col.columns:
            sample[c] = col[c]
        gc.collect()
    except:
        print('get {} {} feature_all, key: {}, on: {}'.format(mark, type_c, "_".join(key), on))
        if type_c == "count":
            tmp = pd.DataFrame(train_df[key + [on]].groupby(key)[on].count()).reset_index()
        if type_c == "mean":
            tmp = pd.DataFrame(train_df[key + [on]].groupby(key)[on].mean()).reset_index()
        if type_c == "nunique":
            tmp = pd.DataFrame(train_df[key + [on]].groupby(key)[on].nunique()).reset_index()
        if type_c == "max":
            tmp = pd.DataFrame(train_df[key + [on]].groupby(key)[on].max()).reset_index()
        if type_c == "min":
            tmp = pd.DataFrame(train_df[key + [on]].groupby(key)[on].min()).reset_index()
        if type_c == "sum":
            tmp = pd.DataFrame(train_df[key + [on]].groupby(key)[on].sum()).reset_index()
        if type_c == "std":
            tmp = pd.DataFrame(train_df[key + [on]].groupby(key)[on].std()).reset_index()
        if type_c == "median":
            tmp = pd.DataFrame(train_df[key + [on]].groupby(key)[on].median()).reset_index()
        tmp.columns = key + [mark + "_" + "_".join(key) + '_%s_' % type_c + on]
        tmp[mark + "_" + "_".join(key) + '_%s_' % type_c + on] = tmp[
            mark + "_" + "_".join(key) + '_%s_' % type_c + on].astype('float32')
        sample = sample.merge(tmp, on=key, how='left')
        with open("feature_all/"+filename, "wb") as fp:
            col = sample[[mark + "_" + "_".join(key) + '_%s_' % type_c + on]]
            pickle.dump(col, fp)
        del tmp
    del col, train_df
    gc.collect()
    return sample, mark + "_" + "_".join(key) + '_%s_' % type_c + on

In [21]:
from sklearn.model_selection import KFold,GroupKFold
from sklearn.metrics import roc_auc_score
EPOCHS = 6
kf = GroupKFold(n_splits=EPOCHS)

In [22]:
def train_lgbm_model(X_train,X_test,features,lgb_params,TEST_F=True,num=0):
    oof_preds = np.zeros(X_test.shape[0])
    oof_train = np.zeros(X_train.shape[0])
    feature_importance = pd.DataFrame()

    split_groups = X_train['DT_M']
    y_train=X_train['isFraud']
    
    
    EPOCHS = 5
    score = 0
    folds = KFold(n_splits=EPOCHS,shuffle=False)
    splits = folds.split(X_train, y_train)
    for i, (tr_idx, val_idx) in enumerate(splits):
#         if i >1:
#             continue

#    for i, (tr_idx, val_idx) in enumerate(kf.split(X_train, y_train, groups=split_groups)):
#         train=X_train.iloc[tr_idx, :].copy()    
#         adata=train[train["isFraud"]==0].copy()
#         bdata=train[train["isFraud"]==1].copy()
#         print(bdata.shape)
#         adata=adata.sample(frac=0.1,random_state=i)
   
    
#         X_data=pd.concat([adata,bdata])
    
#         from sklearn.utils import shuffle
#         X_data = shuffle(X_data)
#         print(X_data.shape)
    
#         X_tr=X_data[features]
#         y_tr=X_data["isFraud"]

#         _, X_vl = X_train[features].iloc[tr_idx, :], X_train[features].iloc[val_idx, :]
#         _, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
        X_tr, X_vl = X_train[features].iloc[tr_idx, :], X_train[features].iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        trn_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_vl, label=y_vl)
        if TEST_F==True:
            lgb_clf = lgb.train(lgb_params,trn_data,100,valid_sets = [trn_data,val_data],early_stopping_rounds=10,verbose_eval = num)
        else:
            lgb_clf = lgb.train(lgb_params,trn_data,10000,valid_sets = [trn_data,val_data],early_stopping_rounds=200,verbose_eval = 200)
            
        y_pred_train = lgb_clf.predict(X_vl)
        oof_train[val_idx] = y_pred_train
        
        print(f"Fold {i + 1} | AUC: {roc_auc_score(y_vl, y_pred_train)}")

        score += roc_auc_score(y_vl, y_pred_train) / EPOCHS
        
        oof_preds+= lgb_clf.predict(X_test[features]) / EPOCHS
        
        
    
        fold_importance = pd.DataFrame()
        fold_importance["feature"] = features
        fold_importance["importance"] = lgb_clf.feature_importance()
        fold_importance["fold"] = i + 1
        feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
    auc=roc_auc_score(y_train, oof_train)
    print(f"\nMean AUC = {score}")
    print(f"Out of folds AUC = {auc}")
    return oof_train,oof_preds,score,feature_importance

#oof_train,oof_preds,auc,feature_importance=train_lgbm_model(X_train,X_test,features,lgb_params,TEST_F=False)

In [23]:
train_transaction = pd.read_csv('../input/train_transaction.csv')
test_transaction = pd.read_csv('../input/test_transaction.csv')

train_identity = pd.read_csv('../input/train_identity.csv')
test_identity = pd.read_csv('../input/test_identity.csv')

train = train_transaction.merge(train_identity, how='left',on="TransactionID")
test = test_transaction.merge(test_identity, how='left',on="TransactionID")

In [24]:

    
#V1 ~ V11
#V12 ~ V34
v_count_fea = []
for  i  in  range(1,12):

    if  i==1:
        train['v1-v11']=train['V%d'%i].astype(str)
        test['v1-v11']=test['V%d'%i].astype(str)
       
    else:
        train['v1-v11']+="_"+train['V%d'%i].astype(str)
        test['v1-v11']+="_"+test['V%d'%i].astype(str)
        
v_count_fea.append('v1-v11')
        
    
for  i  in  range(12,35):

    if  i==12:
        train['v12-v35']=train['V%d'%i].astype(str)
        test['v12-v35']=test['V%d'%i].astype(str)
       
    else:
        train['v12-v35']+="_"+train['V%d'%i].astype(str)
        test['v12-v35']+="_"+test['V%d'%i].astype(str)
        
v_count_fea.append('v12-v35')
        
    


for  i  in  range(35,53):


    if  i==35:
        train['v35-v52']=train['V%d'%i].astype(str)
        test['v35-v52']=test['V%d'%i].astype(str)
       
    else:
        train['v35-v52']+="_"+train['V%d'%i].astype(str)
        test['v35-v52']+="_"+test['V%d'%i].astype(str)
        
v_count_fea.append('v35-v52')
        
        
for  i  in  range(53,75):

    if  i==53:
        train['v53-v74']=train['V%d'%i].astype(str)
        test['v53-v74']=test['V%d'%i].astype(str)
       
    else:
        train['v53-v74']+="_"+train['V%d'%i].astype(str)
        test['v53-v74']+="_"+test['V%d'%i].astype(str)
        
v_count_fea.append('v53-v74')
        
        
#V75 ~ V94
for  i  in  range(75,95):

    if  i==75:
        train['v75-v95']=train['V%d'%i].astype(str)
        test['v75-v95']=test['V%d'%i].astype(str)
       
    else:
        train['v75-v95']+="_"+train['V%d'%i].astype(str)
        test['v75-v95']+="_"+test['V%d'%i].astype(str)
        
v_count_fea.append('v75-v95')
        
        

# #V95 ~ V137

for  i  in  range(95,138):

    if  i==95:
        train['v95-v138']=train['V%d'%i].astype(str)
        test['v95-v138']=test['V%d'%i].astype(str)
       
    else:
        train['v95-v138']+="_"+train['V%d'%i].astype(str)
        test['v95-v138']+="_"+test['V%d'%i].astype(str)
        
v_count_fea.append('v95-v138')
        
# #V138 ~ V166 

for  i  in  range(138,167):

    if  i==138:
        train['v138-v167']=train['V%d'%i].astype(str)
        test['v138-v167']=test['V%d'%i].astype(str)
       
    else:
        train['v138-v167']+="_"+train['V%d'%i].astype(str)
        test['v138-v167']+="_"+test['V%d'%i].astype(str)
        
v_count_fea.append('v138-v167')
        
    
    

In [25]:
df_data=pd.concat([train,test],sort=False)
del train_transaction,test_transaction,train_identity,test_identity,train,test
gc.collect()

171

In [26]:
i_cols = ['v1-v11','v12-v35','v35-v52','v53-v74','v75-v95','v95-v138','v138-v167'
         ]

for col in i_cols:
    fq_encode = df_data[col].value_counts(dropna=False).to_dict()   
    df_data[col+'_fq_enc'] = df_data[col].map(fq_encode)

In [27]:
import os, sys, gc, warnings, random, datetime
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar


dates_range = pd.date_range(start='2017-10-01', end='2019-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())

#START_DATE = datetime.datetime.strptime('2017-12-01', '%Y-%m-%d')
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
for df in [df_data]:
    # Temporary
   # df['DT'] = df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x-8*3600)))
    df['DT'] = df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
    df['DT_M'] = (df['DT'].dt.year-2017)*12 + df['DT'].dt.month
    df['DT_W'] = (df['DT'].dt.year-2017)*52 + df['DT'].dt.weekofyear
    df['DT_D'] = (df['DT'].dt.year-2017)*365 + df['DT'].dt.dayofyear
    
    df['DT_hour'] = df['DT'].dt.hour
    df['DT_day_week'] = df['DT'].dt.dayofweek
    df['DT_day'] = df['DT'].dt.day
    

    df['is_holiday'] = (df['DT'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)

    
    df['D9_all'] = (df['TransactionDT']%(3600*24)/3600//1)/24.0
    
    df['D9_not_na'] = np.where(df['D9'].isna(),0,1)
    df['D8_not_same_day'] = np.where(df['D8']>=1,1,0)
    df['D8_D9_decimal_dist'] = df['D8'].fillna(0)-df['D8'].fillna(0).astype(int)
    df['D8_D9_decimal_dist'] = ((df['D8_D9_decimal_dist']-df['D9'])**2)**0.5

    
for col in ['DT_M','DT_W','DT_D']:
    fq_encode = df_data[col].value_counts().to_dict()           
    df_data[col+'_total'] = df_data[col].map(fq_encode)

In [28]:
df_data['card_id'] = df_data['card1'].astype(str)+'_'+df_data['card2'].astype(str)+'_'+df_data['card3'].astype(str)+'_'+df_data['card4'].astype(str)+'_'+df_data['card5'].astype(str)+'_'+df_data['card6'].astype(str)
df_data['addr_id'] =df_data['addr1'].astype(str)+'_'+df_data['addr2'].astype(str)
df_data['dist_id'] =df_data['dist1'].astype(str)+'_'+df_data['dist1'].astype(str)
df_data['P_R']=df_data['P_emaildomain'].astype(str)+'_'+df_data['R_emaildomain'].astype(str)
# df_data['pay_id'] =df_data['id_30'].astype(str)+'_'+df_data['id_31'].astype(str)
# df_data['Device_id']=df_data['DeviceType'].astype(str)+'_'+df_data['DeviceInfo'].astype(str)


df_data['a_P']=df_data['addr_id'].astype(str)+'_'+df_data['P_emaildomain'].astype(str)
df_data['c_a_P']=df_data['card1'].astype(str)+'_'+df_data['a_P'].astype(str)

df_data["a_c"]=df_data['card_id'].astype(str)+'_'+df_data['addr1'].astype(str)
df_data["a_d"]=df_data['card_id'].astype(str)+'_'+df_data['P_R'].astype(str)
df_data["a_f"]=df_data['addr_id'].astype(str)+'_'+df_data['P_R'].astype(str)
df_data["a_e"]=df_data['card_id'].astype(str)+'_'+df_data['a_f'].astype(str)

df_data['product_type'] = df_data['ProductCD'].astype(str)+'_'+df_data['TransactionAmt'].astype(str)
df_data['product_type_d'] = df_data['DT_D'].astype(str)+'_'+df_data['product_type'].astype(str)
df_data['product_type_w'] = df_data['DT_W'].astype(str)+'_'+df_data['product_type'].astype(str)

df_data['product_card1'] = df_data['ProductCD'].astype(str)+'_'+df_data['card1'].astype(str)
df_data['product_card2'] = df_data['ProductCD'].astype(str)+'_'+df_data['card2'].astype(str)
df_data['product_D1'] = df_data['ProductCD'].astype(str)+'_'+df_data['D1'].astype(str)



df_data['DT_Product_a'] =  df_data['DT_D'].astype(str)+'_'+df_data['ProductCD'].astype(str)+'_'+df_data['a_c'].astype(str)
df_data['DT_Product_b'] =  df_data['DT_D'].astype(str)+'_'+df_data['ProductCD'].astype(str)+'_'+df_data['a_d'].astype(str)
df_data['DT_Product_c'] =  df_data['DT_D'].astype(str)+'_'+df_data['ProductCD'].astype(str)+'_'+df_data['a_e'].astype(str)
df_data['DT_Product_d'] =  df_data['DT_D'].astype(str)+'_'+df_data['ProductCD'].astype(str)+'_'+df_data['a_f'].astype(str)

In [29]:
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
us_emails = ['gmail', 'net', 'edu']
#https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
for c in ['P_emaildomain', 'R_emaildomain']:
    df_data[c + '_bin'] = df_data[c].map(emails)
    df_data[c + '_suffix'] = df_data[c].map(lambda x: str(x).split('.')[-1]) 
    df_data[c + '_suffix'] = df_data[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')


In [30]:
df_data['D1_D2'] = df_data['D1']-df_data['D2']
df_data['D1_DT_user_t1']=df_data['D1_D2'].astype(str)+'_'+df_data['a_c'].astype(str)
df_data['D1_DT_user_t2']=df_data['D1_D2'].astype(str)+'_'+df_data['a_d'].astype(str)
df_data['D1_DT_user_t3']=df_data['D1_D2'].astype(str)+'_'+df_data['a_e'].astype(str)
df_data['D1_DT_user_t4']=df_data['D1_D2'].astype(str)+'_'+df_data['a_f'].astype(str)


df_data['D1_DT'] = df_data['D1']-df_data['DT_D']

df_data['uid'] = df_data['card1'].astype(str)+'_'+df_data['card2'].astype(str)
df_data['D1_DT_a']=df_data['D1_DT'].astype(str)+'_'+df_data['card1'].astype(str)
df_data['D1_DT_b']=df_data['D1_DT'].astype(str)+'_'+df_data['card2'].astype(str)
df_data['D1_DT_c']=df_data['D1_DT'].astype(str)+'_'+df_data['uid'].astype(str)



df_data['D1_DT_user_1']=df_data['D1_DT'].astype(str)+'_'+df_data['a_c'].astype(str)
df_data['D1_DT_user_2']=df_data['D1_DT'].astype(str)+'_'+df_data['a_d'].astype(str)
df_data['D1_DT_user_3']=df_data['D1_DT'].astype(str)+'_'+df_data['a_e'].astype(str)
df_data['D1_DT_user_4']=df_data['D1_DT'].astype(str)+'_'+df_data['a_f'].astype(str)

In [31]:

by = ['card_id','addr1','C3','C5','C7','C9','C11']

for idx,i in enumerate(by):
    if idx==0:
        df_data['card_id-C1-C11'] = df_data[i].astype(str)
    else:
        df_data['card_id-C1-C11'] += "f_"+df_data[i].astype(str)
    

by = ['card_id','addr1','C4','C6','C8','C10','C12']

for idx,i in enumerate(by):
    if idx==0:
        df_data['card_id-C4-C12'] = df_data[i].astype(str)
    else:
        df_data['card_id-C4-C12'] += "_"+df_data[i].astype(str)
        
        
        

df_data['card_id-C1-C11-C4-C12'] = df_data['card_id-C1-C11'].astype(str)+"_"+df_data['card_id-C4-C12'].astype(str)


c_card_unique_fea = ['card_id-C1-C11','card_id-C4-C12','card_id-C1-C11-C4-C12']



for col in c_card_unique_fea:
    fq_encode = df_data[col].value_counts(dropna=False).to_dict()   
    df_data[col+'_fq_enc'] = df_data[col].map(fq_encode)
    


In [32]:
#c with dt_d-d1

c_add_d_fea = []

for d in ['card1','card2','card3','card5',
 'DeviceType', 'DeviceInfo',
  'card_id', 'addr_id', 'dist_id',
  'P_R', 'uid', 'a_P', 'c_a_P', 'a_c', 'a_d', 'a_f', 'a_e', 
  'product_type', 'product_type_d', 'product_type_w', 
  'DT_Product_a', 'DT_Product_b', 'DT_Product_c', 'DT_Product_d',
  'D1_DT_user_t1', 'D1_DT_user_t2', 'D1_DT_user_t3', 'D1_DT_user_t4',
  'D1_DT_a', 'D1_DT_b', 'D1_DT_c', 'D1_DT_user_1', 
  'D1_DT_user_2', 'D1_DT_user_3', 'D1_DT_user_4',
  'product_card1','product_card2','product_D1'
 ]:
    
    
    for c in  c_card_unique_fea+['V307']:
        
        df_data[d+"_"+c] = df_data[d].astype(str)+"_"+df_data[c].astype(str)
        c_add_d_fea.append(d+"_"+c)
        col = d+"_"+c
        fq_encode = df_data[col].value_counts(dropna=False).to_dict()   
        df_data[col+'_fq_enc'] = df_data[col].map(fq_encode)


In [33]:
df_data['card1'] = df_data['card1'].fillna(0)
df_data['card2'] = df_data['card2'].fillna(0)
df_data['card3'] = df_data['card3'].fillna(0)
df_data['card5'] = df_data['card5'].fillna(0)
df_data['card4'] = df_data['card4'].fillna('nan')
df_data['card6'] = df_data['card6'].fillna('nan')



def card_info_hash(x):
    s = (str(int(x['card1']))+
         str(int(x['card2']))+
         str(int(x['card3']))+
         str(x['card4'])+
         str(int(x['card5']))+
         str(x['card6']))
    h = hashlib.sha256(s.encode('utf-8')).hexdigest()[0:15]
    return h


def device_hash(x):
    s =  str(x['id_30'])+str(x['id_31'])+str(x['id_32'])+str(x['id_33'])+str( x['DeviceType'])+ str(x['DeviceInfo'])
    h = hashlib.sha256(s.encode('utf-8')).hexdigest()[0:15]
    return h




df_data['card_hash'] = df_data.apply(lambda x: card_info_hash(x), axis=1   )
df_data['device_hash'] = df_data.apply(lambda x: device_hash(x), axis=1   )


def get_data_by_card_hash( data, card_hash):
    mask = data['card_hash']==card_hash
    return data.loc[mask,:].copy()


def get_data_by_device_hash( data, device_hash):
    mask = data['device_hash']==device_hash
    return data.loc[mask,:].copy()


def get_data_by_card_and_device_hash( data, card_hash, device_hash):
    mask = (data['card_hash']==card_hash) &(data['device_hash']==device_hash)
    return data.loc[mask,:].copy()




df_data['card_hash-device_hash-v307'] = df_data['card_hash'].astype(str)+"_"+df_data['device_hash'].astype(str)+df_data['V307'].astype(str)

df_data['card_hash-device'] = df_data['card_hash'].astype(str)+"_"+df_data['device_hash'].astype(str)

for col in ['card_hash-device_hash-v307']:
    fq_encode = df_data[col].value_counts(dropna=False).to_dict()   
    df_data[col+'_fq_enc'] = df_data[col].map(fq_encode)
    



In [34]:
#self cv target encoding

train = df_data[df_data.isFraud.notna()]
test = df_data[df_data.isFraud.isna()]

def get_data_ctr_fea(tj_data,self_data,features,target):

    items=features
    
    tj_drop_columns=[i for i in  tj_data  if "_ctr" in i]
    if len(tj_drop_columns)>0:
        tj_data=tj_data.drop(columns=tj_drop_columns)
    
    self_drop_columns=[i for i in  self_data  if "_ctr" in i]
    if len(self_drop_columns)>0:
        print(self_drop_columns)
        self_data.drop(columns=self_drop_columns,inplace=True)
        
    
    for item in tqdm.tqdm(items):
        if type(item)==list:
            pr_name="_".join(item)
            merge_columns=item+[pr_name+'_ctr']
        else:
            pr_name=item
            merge_columns=[item,pr_name+'_ctr']
        temp = tj_data.groupby(item, as_index = False)[target].agg({pr_name+'_click':'sum',pr_name+'_count':'count'})
        temp[pr_name+'_ctr'] =1000000* (temp[pr_name+'_click']+0.01)/(temp[pr_name+'_count']+0.01)
        
        self_data = pd.merge(self_data, temp[merge_columns], on=item, how='left')
        
#     items = items[:5]
#     for i in range(len(items)):
#         for j in range(i+1, len(items)):
#             item_g = [items[i], items[j]]
#             merge_columns=item_g+['_'.join(item_g)+'_ctr']
#             temp = tj_data.groupby(item_g, as_index=False)[target].agg({'_'.join(item_g)+'_click': 'sum','_'.join(item_g)+'count':'count'})
#             temp['_'.join(item_g)+'_ctr'] =100* (temp['_'.join(item_g)+'_click']+0.01)/(temp['_'.join(item_g)+'count']+0.01)
#     、
#     self_data = pd.merge(self_data, temp[merge_columns], on=item_g, how='left')
            
    return self_data


ctr_fea = ['card1','card2','card3','card5',
 'DeviceType', 'DeviceInfo',
  'card_id', 'addr_id', 'dist_id',
  'P_R', 'uid', 'a_P', 'c_a_P', 'a_c', 'a_d', 'a_f', 'a_e', 
  'product_type', 'product_type_d', 'product_type_w', 
  'DT_Product_a', 'DT_Product_b', 'DT_Product_c', 'DT_Product_d',
  'D1_DT_user_t1', 'D1_DT_user_t2', 'D1_DT_user_t3', 'D1_DT_user_t4',
  'D1_DT_a', 'D1_DT_b', 'D1_DT_c', 'D1_DT_user_1', 
  'D1_DT_user_2', 'D1_DT_user_3', 'D1_DT_user_4',
  'product_card1','product_card2','product_D1'
 ]+c_card_unique_fea+['card_hash-device_hash-v307','card_hash-device']+v_count_fea+c_add_d_fea

            
test = get_data_ctr_fea(train,test,ctr_fea,'isFraud')


skf = KFold(n_splits=5,shuffle=False)



def  get_train_ctr_fea(train_data,skf,features,target):
    train_data['index'] = [i for i in range(len(train_data))]
    for k, (train_in, test_in) in enumerate(skf.split(train_data)):
        train_df=train_data.iloc[train_in]
        val_df=train_data.iloc[test_in]
        val_df=get_data_ctr_fea(train_df,val_df,features,target)
        if k==0:
            new_train_df=val_df
        else:
            new_train_df=pd.concat([new_train_df,val_df])

    train_data=new_train_df
    train_data.sort_values(by='index',inplace=True)
    del train_data['index']
    train_data.reset_index(drop=True,inplace=True)
    
    return train_data

train = get_train_ctr_fea(train,skf,ctr_fea,'isFraud')


df_data = pd.concat([train,test])

del train,test
gc.collect()
    

100%|██████████| 202/202 [50:53<00:00, 12.82s/it]  
100%|██████████| 202/202 [10:39<00:00,  2.65s/it] 
100%|██████████| 202/202 [10:45<00:00,  2.63s/it] 
100%|██████████| 202/202 [10:47<00:00,  2.65s/it] 
100%|██████████| 202/202 [10:51<00:00,  2.63s/it] 
100%|██████████| 202/202 [10:44<00:00,  2.68s/it] 


84

In [35]:
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.base import clone
# from sklearn.model_selection import check_cv

# from category_encoders import CatBoostEncoder

# import pandas as pd
# import numpy as np


# class TargetEncoderCV(BaseEstimator, TransformerMixin):

#     def __init__(self, cv, **cbe_params):
#         self.cv = cv
#         self.cbe_params = cbe_params

#     @property
#     def _n_splits(self):
#         return check_cv(self.cv).n_splits

#     def fit_transform(self, X: pd.DataFrame, y) -> pd.DataFrame:
#         self.cbe_ = []
#         cv = check_cv(self.cv)

#         cbe = CatBoostEncoder(
#             cols=X.columns.tolist(),
#             return_df=False,
#             **self.cbe_params
#         )

#         X_transformed = np.zeros_like(X, dtype=np.float64)
#         for train_idx, valid_idx in cv.split(X, y):
#             self.cbe_.append(
#                 clone(cbe).fit(X.loc[train_idx], y[train_idx])
#             )
#             X_transformed[valid_idx] = self.cbe_[-1].transform(
#                 X.loc[valid_idx]
#             )

#         return pd.DataFrame(X_transformed, columns=[i+"_tc" for i in X.columns])

#     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
#         X_transformed = np.zeros_like(X, dtype=np.float64)
#         for cbe in self.cbe_:
#             X_transformed += cbe.transform(X) / self._n_splits
#         return pd.DataFrame(X_transformed, columns=[i+"_tc" for i in X.columns])
    
    
# train = df_data[df_data.isFraud.notna()]
# test = df_data[df_data.isFraud.isna()] 

# del df_data
# gc.collect()

# tc_columns = ['card1','card2','card3','card5',
#  'DeviceType', 'DeviceInfo',
#   'card_id', 'addr_id', 'dist_id',
#   'P_R', 'uid', 'a_P', 'c_a_P', 'a_c', 'a_d', 'a_f', 'a_e', 
#   'product_type', 'product_type_d', 'product_type_w', 
#   'DT_Product_a', 'DT_Product_b', 'DT_Product_c', 'DT_Product_d',
#   'D1_DT_user_t1', 'D1_DT_user_t2', 'D1_DT_user_t3', 'D1_DT_user_t4',
#   'D1_DT_a', 'D1_DT_b', 'D1_DT_c', 'D1_DT_user_1', 
#   'D1_DT_user_2', 'D1_DT_user_3', 'D1_DT_user_4',
#   'product_card1','product_card2','product_D1'
#  ]+c_card_unique_fea+['card_hash-device_hash-v307']+v_count_fea+c_add_d_fea

# X_train = train[tc_columns]
# X_test = test[tc_columns] 
# y_train = train['isFraud']
# te_cv = TargetEncoderCV(KFold(n_splits=5,shuffle=False))
# X_train_encoded = te_cv.fit_transform(X_train, y_train)
# X_test_encoded = te_cv.transform(X_test)



# train = pd.concat([train,X_train_encoded],axis=1)
# test = pd.concat([test,X_test_encoded],axis=1)


# df_data = pd.concat([train,test])

# del train,test,X_train,X_test,X_train_encoded,X_test_encoded
# gc.collect()
    

In [36]:
df_data["cents"]=df_data["TransactionAmt"]-df_data["TransactionAmt"].astype(int)
df_data['product_cents'] = df_data['ProductCD'].astype(str)+'_'+df_data["cents"].astype(str)

In [37]:
i_cols = ['card1','card2', 'D1_DT_a']
for col in i_cols:
    for agg_type in ['nunique']:
        new_col_name = col+'_nunique_'+agg_type
        temp_df = df_data.groupby(["product_type_d"])[col].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})
        temp_df.index = list(temp_df["product_type_d"])
        temp_df = temp_df[new_col_name].to_dict()
        df_data[new_col_name] = df_data["product_type_d"].map(temp_df)

In [38]:
i_cols = ['card1','card2','card3','card5',
         'DeviceType', 'DeviceInfo',
          'card_id', 'addr_id', 'dist_id',
          'P_R', 'uid', 'a_P', 'c_a_P', 'a_c', 'a_d', 'a_f', 'a_e', 
          'product_type', 'product_type_d', 'product_type_w', 
          'DT_Product_a', 'DT_Product_b', 'DT_Product_c', 'DT_Product_d',
          'D1_DT_user_t1', 'D1_DT_user_t2', 'D1_DT_user_t3', 'D1_DT_user_t4',
          'D1_DT_a', 'D1_DT_b', 'D1_DT_c', 'D1_DT_user_1', 
          'D1_DT_user_2', 'D1_DT_user_3', 'D1_DT_user_4',
          'product_card1','product_card2','product_D1'
         ]

for col in i_cols:
    fq_encode = df_data[col].value_counts(dropna=False).to_dict()   
    df_data[col+'_fq_enc'] = df_data[col].map(fq_encode)

In [39]:
v = ['v1-v11','v12-v35','v35-v52','v53-v74','v75-v95','v95-v138','v138-v167'
         ]

v_card_fea = []
for fea in v:
    df_data[fea+"_card_id"] = df_data[fea]+"_"+df_data['card_id']
    v_card_fea.append(fea+"_card_id")
    df_data[fea+'D1_DT_user_1'] = df_data[fea]+"_"+df_data['D1_DT_user_1']
    v_card_fea.append(fea+'D1_DT_user_1')
        
    
    
    
    
for col in v_card_fea:
    fq_encode = df_data[col].value_counts(dropna=False).to_dict()   
    df_data[col+'_fq_enc'] = df_data[col].map(fq_encode)


In [40]:
# i_cols = ['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
#          'D1','D2','D3','D4','D5','D6','D7','D8']
# for feature in i_cols:
#     temp=df_data[feature].value_counts(dropna=False).to_dict()
#     #temp = df_data[feature].value_counts(dropna=False).to_dict()
#     df_data[feature + '_fq_enc'] = df_data[feature].map(temp)

i_cols = ['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
         'D1','D2','D3','D4','D5','D6','D7','D8']
for feature in i_cols:
    df_data[feature+"_CD"]=df_data[feature].astype(str)+'_'+df_data['ProductCD'].astype(str)
    temp=df_data[feature+"_CD"].value_counts(dropna=False).to_dict()
    df_data[feature + '_fq_enc'] = df_data[feature+"_CD"].map(temp)
    del df_data[feature+"_CD"]

In [41]:
i_cols = ['card1','card2','card3','card5',
         "card_id","dist_id","P_R", 'D1_DT_a', 'D1_DT_b', 'D1_DT_c',
            'a_c',
           'a_d',
           'a_f',
           'a_e',
         'D1_DT_user_1',
          'product_card1','product_card2','product_D1'
          
         ]+v_card_fea

# for col in i_cols:
#     for agg_type in ['mean','std','max']:
#         new_col_name = col+'_TransactionAmt_'+agg_type
#         temp_df = df_data.groupby([col])['TransactionAmt'].agg([agg_type]).reset_index().rename(
#                                                 columns={agg_type: new_col_name})
        
#         temp_df.index = list(temp_df[col])
#         temp_df = temp_df[new_col_name].to_dict()   
    
#         df_data[new_col_name] = df_data[col].map(temp_df)
 
for col in i_cols:
    for agg_type in ['mean','std','max']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        df_data[col+"_CD"]=df_data[col].astype(str)+'_'+df_data['ProductCD'].astype(str)
        temp_df = df_data.groupby([col+"_CD"])['TransactionAmt'].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})
        
        temp_df.index = list(temp_df[col+"_CD"])
        temp_df = temp_df[new_col_name].to_dict()   
    
        df_data[new_col_name] = df_data[col+"_CD"].map(temp_df)
        del df_data[col+"_CD"]    

In [42]:
periods = ['DT_D']
i_cols = [ 
          'card1','card2','card3','card5',
    'addr1','addr2',
    #'addr_id',
    'P_emaildomain','R_emaildomain',
         "card_id","dist_id","P_R",'D1_DT_a', 'D1_DT_b', 'D1_DT_c',
        'D1_DT_user_1',  'D1_DT_user_2', 'D1_DT_user_3', 'D1_DT_user_4',
      'D1_DT_user_t1', 'D1_DT_user_t2',
    'product_card1','product_card2','product_D1'
         ]


for period in periods:
    for col in i_cols:
        new_column = col + '_' + period
        df_data[new_column] = df_data[col].astype(str) + '_' + (df_data[period]).astype(str)+'_'+df_data['ProductCD'].astype(str)
        fq_encode = df_data[new_column].value_counts().to_dict()
        
        df_data[new_column] = df_data[new_column].map(fq_encode)
        df_data[new_column] /= df_data[period+'_total']

In [43]:
# periods = ['DT_D']
# i_cols = [ 
#           'card1','card2','card3','card5',
#     'addr_id',
#     'P_emaildomain','R_emaildomain',
#          "card_id","dist_id","P_R",'D1_DT_a'
#          ]

# for period in periods:
#     for col in i_cols:
#         new_column = col + '_' + period+"_hour"
#         df_data[new_column] = df_data[col].astype(str) + '_' + (df_data[period]).astype(str)+ '_' + (df_data["DT_hour"]).astype(str)
#         fq_encode = df_data[new_column].value_counts().to_dict()
        
#         df_data[new_column] = df_data[new_column].map(fq_encode)
#         df_data[new_column] /= df_data[period+'_total']

In [44]:
#https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda
########################### card3/card5 most common hour 
# card3 or card5 is a bank country?
# can we find:
# - the most popular Transaction Hour
# - the most popular Week Day
# and then find distance from it

# Prepare bank type feature
for df in [df_data]:
    df['bank_type'] = df['card3'].astype(str) +'_'+ df['card5'].astype(str)

encoding_mean = {
    1: ['DT_D','DT_hour','_hour_dist','DT_hour_mean'],
    2: ['DT_W','DT_day_week','_week_day_dist','DT_day_week_mean'],
   # 3: ['DT_M','DT_day_month','_month_day_dist','DT_day_month_mean'],
    }

encoding_best = {
    1: ['DT_D','DT_hour','_hour_dist_best','DT_hour_best'],
    2: ['DT_W','DT_day_week','_week_day_dist_best','DT_day_week_best'],
  #  3: ['DT_M','DT_day_month','_month_day_dist_best','DT_day_month_best'],   
    }

# Some ugly code here (even worse than in other parts)
for col in ['card3','card5','bank_type']:
    for df in [df_data]:
        for encode in encoding_mean:
            encode = encoding_mean[encode].copy()
            new_col = col + '_' + encode[0] + encode[2]
            df[new_col] = df[col].astype(str) +'_'+ df[encode[0]].astype(str)

            temp_dict = df.groupby([new_col])[encode[1]].agg(['mean']).reset_index().rename(
                                                                    columns={'mean': encode[3]})
            temp_dict.index = temp_dict[new_col].values
            temp_dict = temp_dict[encode[3]].to_dict()
            df[new_col] = df[encode[1]] - df[new_col].map(temp_dict)

        for encode in encoding_best:
            encode = encoding_best[encode].copy()
            new_col = col + '_' + encode[0] + encode[2]
            df[new_col] = df[col].astype(str) +'_'+ df[encode[0]].astype(str)
            temp_dict = df.groupby([col,encode[0],encode[1]])[encode[1]].agg(['count']).reset_index().rename(
                                                                    columns={'count': encode[3]})

            temp_dict.sort_values(by=[col,encode[0],encode[3]], inplace=True)
            temp_dict = temp_dict.drop_duplicates(subset=[col,encode[0]], keep='last')
            temp_dict[new_col] = temp_dict[col].astype(str) +'_'+ temp_dict[encode[0]].astype(str)
            temp_dict.index = temp_dict[new_col].values
            temp_dict = temp_dict[encode[1]].to_dict()
            df[new_col] = df[encode[1]] - df[new_col].map(temp_dict)
del df_data["bank_type"]

In [45]:
i_cols = ['uid','a_c','D1_DT_a','D1_DT_user_1','D1_DT_user_2','product_card1','product_card2']
d_cols=['D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15']

# for col in i_cols:
#     for d in d_cols:
#         for agg_type in ['mean','std']:
#             new_col_name = col+'_'+d+"_"+agg_type
#             temp_df = df_data.groupby([col])[d].agg([agg_type]).reset_index().rename(
#                                                 columns={agg_type: new_col_name})
#             temp_df.index = list(temp_df[col])
#             temp_df = temp_df[new_col_name].to_dict()       
#             df_data[new_col_name] = df_data[col].map(temp_df)
# print("make D mean std")

for col in i_cols:
    for d in d_cols:
        for agg_type in ['mean','std']:
            new_col_name = col+'_'+d+"_"+agg_type
            df_data[col+"_CD"]=df_data[col].astype(str)+'_'+df_data['ProductCD'].astype(str)
            temp_df = df_data.groupby([col+"_CD"])[d].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})
            temp_df.index = list(temp_df[col+"_CD"])
            temp_df = temp_df[new_col_name].to_dict()       
            df_data[new_col_name] = df_data[col+"_CD"].map(temp_df)
            del df_data[col+"_CD"]
print("make D mean std")

make D mean std


In [46]:
for col in "card1,card2,card5,addr1,addr2".split(","):
    col_count1 = df_data[df_data['C5'] == 0].groupby(col)['C5'].count()
    col_count2 = df_data[df_data['C5'] != 0].groupby(col)['C5'].count()
    df_data[col+'_C5count'] = df_data[col].map(col_count2) / (df_data[col].map(col_count1) + 0.01)

In [47]:
for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:

    f1, f2 = feature.split('__')
    df_data[feature] = df_data[f1].astype(str) + '_' + df_data[f2].astype(str)

    le = preprocessing.LabelEncoder()
    le.fit(list(df_data[feature].astype(str).values))
    df_data[feature] = le.transform(list(df_data[feature].astype(str).values))

In [48]:
train=df_data[~df_data["isFraud"].isnull()].copy()
test=df_data[df_data["isFraud"].isnull()].copy()

train['TransactionAmt_check'] = np.where(train['TransactionAmt'].isin(test['TransactionAmt']), 1, 0)
test['TransactionAmt_check']  = np.where(test['TransactionAmt'].isin(train['TransactionAmt']), 1, 0)

i_cols = ['card1','card2','card3','card5',
         'DeviceType', 'DeviceInfo',
          'card_id', 'addr_id', 'dist_id',
          'P_R', 'uid', 'a_P', 'c_a_P', 'a_c', 'a_d', 'a_f', 'a_e', 
          'product_type', 'product_type_d', 'product_type_w', 
          'DT_Product_a', 'DT_Product_b', 'DT_Product_c', 'DT_Product_d',
          'D1_DT_user_t1', 'D1_DT_user_t2', 'D1_DT_user_t3', 'D1_DT_user_t4',
          'D1_DT_a', 'D1_DT_b', 'D1_DT_c', 'D1_DT_user_1', 
          'D1_DT_user_2', 'D1_DT_user_3', 'D1_DT_user_4',
         'product_D1',
         ]

for col in i_cols:   
    fq_encode_a = train[col].value_counts(dropna=False).to_dict()   
    fq_encode_b = test[col].value_counts(dropna=False).to_dict()   
    
    train[col+'_fq_enc_dist'] = train[col].map(fq_encode_a)
    test[col+'_fq_enc_dist'] = test[col].map(fq_encode_a)
    
#     train[col+'_fq_enc_diff'] = train[col].map(fq_encode_a)
#     test[col+'_fq_enc_diff'] = test[col].map(fq_encode_b)


for col in ['card1']: 
    valid_card = pd.concat([train[[col]], test[[col]]])
    valid_card = valid_card[col].value_counts()
    valid_card_std = valid_card.values.std()

    invalid_cards = valid_card[valid_card<=2]
    print('Rare cards',len(invalid_cards))

    valid_card = valid_card[valid_card>2]
    valid_card = list(valid_card.index)

    print('No intersection in Train', len(train[~train[col].isin(test[col])]))
    print('Intersection in Train', len(train[train[col].isin(test[col])]))
    
    train[col] = np.where(train[col].isin(test[col]), train[col], np.nan)
    test[col]  = np.where(test[col].isin(train[col]), test[col], np.nan)

    train[col] = np.where(train[col].isin(valid_card), train[col], np.nan)
    test[col]  = np.where(test[col].isin(valid_card), test[col], np.nan)
    print('#'*20)
    
for col in ['card2','card3','card4','card5','card6']: 
    print('No intersection in Train', col, len(train[~train[col].isin(test[col])]))
    print('Intersection in Train', col, len(train[train[col].isin(test[col])]))
    
    train[col] = np.where(train[col].isin(test[col]), train[col], np.nan)
    test[col]  = np.where(test[col].isin(train[col]), test[col], np.nan)
    print('#'*20)

for col in ['D1_DT',"D1_D2",
            'D1_DT_user_t1', 'D1_DT_user_t2', 'D1_DT_user_t3', 'D1_DT_user_t4',
            'D1_DT_a', 'D1_DT_b', 'D1_DT_c', 
            'D1_DT_user_1', 'D1_DT_user_2', 'D1_DT_user_3', 'D1_DT_user_4', 'product_D1']: 
    print('No intersection in Train', col, len(train[~train[col].isin(test[col])]))
    print('Intersection in Train', col, len(train[train[col].isin(test[col])]))
    
    train[col] = np.where(train[col].isin(test[col]), train[col], np.nan)
    test[col]  = np.where(test[col].isin(train[col]), test[col], np.nan)
    print('#'*20)

df_data=pd.concat([train,test],sort=False)
del train,test

Rare cards 5993
No intersection in Train 10396
Intersection in Train 580144
####################
No intersection in Train card2 5012
Intersection in Train card2 585528
####################
No intersection in Train card3 47
Intersection in Train card3 590493
####################
No intersection in Train card4 0
Intersection in Train card4 590540
####################
No intersection in Train card5 7279
Intersection in Train card5 583261
####################
No intersection in Train card6 30
Intersection in Train card6 590510
####################
No intersection in Train D1_DT 27880
Intersection in Train D1_DT 562660
####################
No intersection in Train D1_D2 298
Intersection in Train D1_D2 590242
####################
No intersection in Train D1_DT_user_t1 90079
Intersection in Train D1_DT_user_t1 500461
####################
No intersection in Train D1_DT_user_t2 114551
Intersection in Train D1_DT_user_t2 475989
####################
No intersection in Train D1_DT_user_t3 181992
I

In [49]:
i_cols = ['uid','a_c','D1_DT_a','D1_DT_user_1','D1_DT_user_2', 'product_D1']
d_cols=['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']

# for col in i_cols:
#     for d in d_cols:
#         for agg_type in ['mean','std','max']:
#             new_col_name = col+'_'+d+"_"+agg_type
#             temp_df = df_data.groupby([col])[d].agg([agg_type]).reset_index().rename(
#                                                 columns={agg_type: new_col_name})
#             temp_df.index = list(temp_df[col])
#             temp_df = temp_df[new_col_name].to_dict()       
#             df_data[new_col_name] = df_data[col].map(temp_df)
# print("make C mean std")
for col in i_cols:
    for d in d_cols:
        for agg_type in ['mean','std','max']:
            new_col_name = col+'_'+d+"_"+agg_type
            #df_data[col]=df_data[col].fillna("unKnow")
            df_data[col+"_CD"]=df_data[col].astype(str)+'_'+df_data['ProductCD'].astype(str)
            temp_df = df_data.groupby([col+"_CD"])[d].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})
            temp_df.index = list(temp_df[col+"_CD"])
            temp_df = temp_df[new_col_name].to_dict()       
            df_data[new_col_name] = df_data[col+"_CD"].map(temp_df)
            del df_data[col+"_CD"]
print("make C mean std")


make C mean std


In [50]:
del temp_df
gc.collect()

104

In [51]:
df_data.reset_index(drop=True,inplace=True)

In [52]:
# i_cols = ['uid','a_c','D1_DT_a','D1_DT_user_1','D1_DT_user_2']
# d_cols=['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']

# for col in i_cols:
#     for d in d_cols:
#         for agg_type in ['var','nunique']:
#             new_col_name = col+'_'+d+"_"+agg_type
#             temp_df = df_data.groupby([col])[d].agg([agg_type]).reset_index().rename(
#                                                 columns={agg_type: new_col_name})
#             temp_df.index = list(temp_df[col])
#             temp_df = temp_df[new_col_name].to_dict()       
#             df_data[new_col_name] = df_data[col].map(temp_df)
# print("make C mean std")

In [53]:
# Label Encoding
import tqdm

ob_feature=[]
for f in tqdm.tqdm(df_data.columns):
    if df_data[f].dtype=='object': 
        ob_feature.append(f)
        df_data[f].fillna("nan",inplace=True)
        df_data[f] = df_data[f].map(
        dict(zip(df_data[f].unique(), range(0, df_data[f].nunique()))))
#         df_data[f] = pd.factorize(df_data[f])[0]
#         lbl = preprocessing.LabelEncoder()
#         df_data[f]=lbl.fit_transform(list(df_data[f].values))
print(ob_feature)

100%|██████████| 1714/1714 [34:55<00:00,  1.22s/it] 

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'v1-v11', 'v12-v35', 'v35-v52', 'v53-v74', 'v75-v95', 'v95-v138', 'v138-v167', 'card_id', 'addr_id', 'dist_id', 'P_R', 'a_P', 'c_a_P', 'a_c', 'a_d', 'a_f', 'a_e', 'product_type', 'product_type_d', 'product_type_w', 'product_card1', 'product_card2', 'product_D1', 'DT_Product_a', 'DT_Product_b', 'DT_Product_c', 'DT_Product_d', 'P_emaildomain_bin', 'P_emaildomain_suffix', 'R_emaildomain_bin', 'R_emaildomain_suffix', 'D1_DT_user_t1', 'D1_DT_user_t2', 'D1_DT_user_t3', 'D1_DT_user_t4', 'uid', 'D1_DT_a', 'D1_DT_b', 'D1_DT_c', 'D1_DT_user_1', 'D1_DT_user_2', 'D1_DT_user_3', 'D1_DT_user_4', 'card_id-C1-C11', 'card_id-C4-C12', 'card_id-C1-C11-C4-C12', 'card1_card_id-C1-C11', 'card1_card_id-C4-C12', 'card1_card_id-C1-C




In [54]:
# def values_normalization(dt_df, periods, columns):
#     for period in periods:
#         for col in columns:
#             new_col = col +'_'+ period
#             dt_df[col] = dt_df[col].astype(float)  

#             temp_min = dt_df.groupby([period])[col].agg(['min']).reset_index()
#             temp_min.index = temp_min[period].values
#             temp_min = temp_min['min'].to_dict()

#             temp_max = dt_df.groupby([period])[col].agg(['max']).reset_index()
#             temp_max.index = temp_max[period].values
#             temp_max = temp_max['max'].to_dict()

#             temp_mean = dt_df.groupby([period])[col].agg(['mean']).reset_index()
#             temp_mean.index = temp_mean[period].values
#             temp_mean = temp_mean['mean'].to_dict()

#             temp_std = dt_df.groupby([period])[col].agg(['std']).reset_index()
#             temp_std.index = temp_std[period].values
#             temp_std = temp_std['std'].to_dict()

#             dt_df['temp_min'] = dt_df[period].map(temp_min)
#             dt_df.loc[dt_df['temp_min']<0,'temp_min']=0
            
#             dt_df['temp_max'] = dt_df[period].map(temp_max)
#             dt_df['temp_mean'] = dt_df[period].map(temp_mean)
#             dt_df['temp_std'] = dt_df[period].map(temp_std)

#             dt_df[new_col+'_min_max'] = (dt_df[col]-dt_df['temp_min'])/(dt_df['temp_max']-dt_df['temp_min'])
#             dt_df[new_col+'_std_score'] = (dt_df[col]-dt_df['temp_mean'])/(dt_df['temp_std'])
#             del dt_df['temp_min'],dt_df['temp_max'],dt_df['temp_mean'],dt_df['temp_std']
#     return dt_df


# # TransactionAmt Normalization
# periods = ['DT_D','DT_W','DT_M']
# df_data = values_normalization(df_data, periods, ['TransactionAmt'])
# df_data = values_normalization(df_data, ['DT_D','DT_W'], [ 'D3', 'D4', 'D5', 'D6', 'D7', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15'])
# print("TransactionAmt Normalization")

In [55]:
def values_normalization(dt_df, periods, columns):
    for period in periods:
        for col in columns:
            new_col = col +'_'+ period+"_ProductCD"
            dt_df[col] = dt_df[col].astype(float)  
            dt_df[period+"_ProductCD"]=dt_df[period].astype(str)+"_"+dt_df["ProductCD"].astype(str)
            
            temp_min = dt_df.groupby([period+"_ProductCD"])[col].agg(['min']).reset_index()
            temp_min.index = temp_min[period+"_ProductCD"].values
            temp_min = temp_min['min'].to_dict()

            temp_max = dt_df.groupby([period+"_ProductCD"])[col].agg(['max']).reset_index()
            temp_max.index = temp_max[period+"_ProductCD"].values
            temp_max = temp_max['max'].to_dict()

            temp_mean = dt_df.groupby([period+"_ProductCD"])[col].agg(['mean']).reset_index()
            temp_mean.index = temp_mean[period+"_ProductCD"].values
            temp_mean = temp_mean['mean'].to_dict()

            temp_std = dt_df.groupby([period+"_ProductCD"])[col].agg(['std']).reset_index()
            temp_std.index = temp_std[period+"_ProductCD"].values
            temp_std = temp_std['std'].to_dict()

            dt_df['temp_min'] = dt_df[period+"_ProductCD"].map(temp_min)
            dt_df.loc[dt_df['temp_min']<0,'temp_min']=0
            
            dt_df['temp_max'] = dt_df[period+"_ProductCD"].map(temp_max)
            dt_df['temp_mean'] = dt_df[period+"_ProductCD"].map(temp_mean)
            dt_df['temp_std'] = dt_df[period+"_ProductCD"].map(temp_std)

            dt_df[new_col+'_min_max'] = (dt_df[col]-dt_df['temp_min'])/(dt_df['temp_max']-dt_df['temp_min'])
            dt_df[new_col+'_std_score'] = (dt_df[col]-dt_df['temp_mean'])/(dt_df['temp_std'])
            del dt_df['temp_min'],dt_df['temp_max'],dt_df['temp_mean'],dt_df['temp_std'],dt_df[period+"_ProductCD"]
    return dt_df


# TransactionAmt Normalization
periods = ['DT_D','DT_W','DT_M']
df_data = values_normalization(df_data, periods, ['TransactionAmt'])
df_data = values_normalization(df_data, ['DT_D','DT_W'], [ 'D3', 'D4', 'D5', 'D6', 'D7', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15'])
print("TransactionAmt Normalization")

TransactionAmt Normalization


In [56]:
V_AMT=[]
def make_featrue(df,num1,num2):
    features = [f'V{i}' for i in range(num1,num2+1)]
    V_AMT.extend(features)
    df["V"+str(num1)+"_"+str(num2)+"_mean"]=df[features].mean(1)
    df["V"+str(num1)+"_"+str(num2)+"_std"]=df[features].std(1)
    return df
df_data=make_featrue(df_data,126,137)
df_data=make_featrue(df_data,159,166)
df_data=make_featrue(df_data,202,216)
df_data=make_featrue(df_data,263,278)
df_data=make_featrue(df_data,306,321)
df_data=make_featrue(df_data,331,339)

In [57]:
nans_groups=[['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11'],
['V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34'],
['V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52'],
['V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74'],
['V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94'],
['V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125'],
['V138', 'V139', 'V140', 'V141', 'V142', 'V146', 'V147', 'V148', 'V149', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158'],
['V143', 'V144', 'V145', 'V150', 'V151', 'V152'],
['V167', 'V168', 'V172', 'V173', 'V176', 'V177', 'V178', 'V179', 'V181', 'V182', 'V183', 'V186', 'V187', 'V190', 'V191', 'V192', 'V193', 'V196', 'V199'],
['V169', 'V170', 'V171', 'V174', 'V175', 'V180', 'V184', 'V185', 'V188', 'V189', 'V194', 'V195', 'V197', 'V198', 'V200', 'V201'],
['V217', 'V218', 'V219', 'V223', 'V224', 'V225', 'V226', 'V228', 'V229', 'V230', 'V231', 'V232', 'V233', 'V235', 'V236', 'V237', 'V240', 'V241', 'V242', 'V243', 'V244', 'V246', 'V247', 'V248', 'V249', 'V252', 'V253', 'V254', 'V257', 'V258', 'V260', 'V261', 'V262'],
['V220', 'V221', 'V222', 'V227', 'V234', 'V238', 'V239', 'V245', 'V250', 'V251', 'V255', 'V256', 'V259'],
['V279', 'V280', 'V284', 'V285', 'V286', 'V287', 'V290', 'V291', 'V292', 'V293', 'V294', 'V295', 'V297', 'V298', 'V299', 'V302', 'V303', 'V304', 'V305'],
['V281', 'V282', 'V283', 'V288', 'V289', 'V296', 'V300', 'V301'],
['V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330']]

for n_group,n_members in enumerate(nans_groups):
    tmp = n_members
    df_data["VVV_"+str(n_group)+"_"+str(len(n_members))+"_mean"]=df_data[tmp].mean(1)
    df_data["VVV_"+str(n_group)+"_"+str(len(n_members))+"_sum"]=df_data[tmp].sum(1)
    df_data["VVV_"+str(n_group)+"_"+str(len(n_members))+"_std"]=df_data[tmp].std(1)

In [58]:
# C_cols = ['C'+str(i) for i in range(1,15)]
# for i,d in enumerate(C_cols):
#     for b in C_cols[i+1:]:
#         df_data[d+"_"+b]=df_data[d]-df_data[b]
# D_cols = ['D'+str(i) for i in range(1,16)]
# for i,d in enumerate(D_cols):
#     for b in D_cols[i+1:]:
#         df_data[d+"_"+b]=df_data[d]-df_data[b]

# 0.5279121757308655 C1_C2
# 0.5632723386618723 C1_C5
# 0.5526784513436878 C1_C6
# 0.5779849819834738 C1_C9
# 0.49823769760697056 C1_C10

# 0.6457199063995556 C14_C2
# 0.5255646052395444 C14_C5
# 0.6635897435834723 C14_C6
# 0.6546711373897884 C14_C8
# 0.4886515283104664 C14_C9
# 0.668867450031352 C14_C11
# df_data["C1_C2"] = df_data["C1"]-df_data["C2"]
# df_data["C1_C5"] = df_data["C1"]-df_data["C5"]
# df_data["C1_C6"] = df_data["C1"]-df_data["C6"]
# df_data["C1_C9"] = df_data["C1"]-df_data["C9"]
# #df_data["C1_C10"] = df_data["C1"]-df_data["C10"]

df_data["C14_C2"] = df_data["C14"]-df_data["C2"]
# df_data["C14_C5"] = df_data["C14"]-df_data["C5"]
df_data["C14_C6"] = df_data["C14"]-df_data["C6"]
df_data["C14_C8"] = df_data["C14"]-df_data["C8"]
# #df_data["C14_C9"] = df_data["C14"]-df_data["C9"]
df_data["C14_C11"] = df_data["C14"]-df_data["C11"]
df_data["C2_C13"] = df_data["C2"]-df_data["C13"]#0.7024700359840995 C2_C13
df_data["C4_C5"] = df_data["C4"]-df_data["C5"]#0.6836050754587126 C4_C5
# 0.6695011408607345 C4_C13
# 0.6508774658213878 C4_C14
# 0.6490321350709809 C5_C7
df_data["C5_C8"] = df_data["C5"]-df_data["C8"]# 0.6835032319363643 C5_C8
df_data["C5_C10"] = df_data["C5"]-df_data["C10"]# 0.6873015843349426 C5_C10
# 0.6421059903104682 C5_C11
# 0.6580717633801028 C5_C12
# 0.6662308268862508 C6_C13
# 0.6645635722197241 C6_C14

In [59]:
X_train=df_data[~df_data["isFraud"].isnull()].copy()
X_test=df_data[df_data["isFraud"].isnull()].copy()
print(X_train.shape,X_test.shape)
y_train = X_train['isFraud'].copy()
print(X_train.shape,X_test.shape)

(590540, 1829) (506691, 1829)
(590540, 1829) (506691, 1829)


In [60]:
gc.collect()

494

In [61]:
# name="TransactionDT"
# t=1
# for data in [X_train,X_test]:
#     data.sort_values('card1',inplace=True)
#     data=data.reset_index(drop=True)
#     data["%s_shift_card1_%s"%(name,t)]=data[name].shift(t)
#     data["%s_shift_card1_dif_%s" % (name, t)]=data[name]-data["%s_shift_card1_%s"%(name,t)]
#     data.sort_values('TransactionDT',inplace=True)
#     del data["%s_shift_card1_%s"%(name,t)]
#     print(data["%s_shift_card1_dif_%s" % (name, t)].head())

In [62]:
# name="TransactionDT"
# for data in [X_train,X_test]:
#     data.sort_values('D1_DT_a',inplace=True)
#     data=data.reset_index(drop=True)
#     data["%s_shift_D1_DT_a_%s"%(name,t)]=data[name].shift(t)
#     data["%s_shift_D1_DT_a_dif_%s" % (name, t)]=data[name]-data["%s_shift_D1_DT_a_%s"%(name,t)]
#     data.sort_values('TransactionDT',inplace=True)
#     del data["%s_shift_D1_DT_a_%s"%(name,t)]

In [63]:
print(X_train.shape,X_test.shape)

(590540, 1829) (506691, 1829)


In [64]:
# X_train = reduce_mem_usage(X_train)
# X_test = reduce_mem_usage(X_test)

In [65]:
dr_feature=[
    'DT','DT_M','DT_W','DT_D','DT_day_week','DT_day',
    'DT_hour', 'DT_M_total',
    'product_type_w',
    "card3_DT_M_month_day_dist",
    "D1_DT",
    #'card1_nunique_nunique', 'card2_nunique_nunique','D1_DT_a_nunique_nunique',
     'D1_DT_user_t1_D3_mean', 'D1_DT_user_t1_D3_std', 'D1_DT_user_t1_D4_mean', 'D1_DT_user_t1_D4_std', 'D1_DT_user_t1_D5_mean', 'D1_DT_user_t1_D5_std', 'D1_DT_user_t1_D6_mean', 'D1_DT_user_t1_D6_std', 'D1_DT_user_t1_D7_mean', 'D1_DT_user_t1_D7_std', 'D1_DT_user_t1_D8_mean', 'D1_DT_user_t1_D8_std', 'D1_DT_user_t1_D10_mean', 'D1_DT_user_t1_D10_std', 'D1_DT_user_t1_D11_mean', 'D1_DT_user_t1_D11_std', 'D1_DT_user_t1_D12_mean', 'D1_DT_user_t1_D12_std', 'D1_DT_user_t1_D13_mean', 'D1_DT_user_t1_D13_std', 'D1_DT_user_t1_D14_mean', 'D1_DT_user_t1_D14_std', 'D1_DT_user_t1_D15_mean', 'D1_DT_user_t1_D15_std',
     'D1_DT_user_t1_C1_mean', 'D1_DT_user_t1_C1_std', 'D1_DT_user_t1_C2_mean', 'D1_DT_user_t1_C2_std', 'D1_DT_user_t1_C3_mean', 'D1_DT_user_t1_C3_std', 'D1_DT_user_t1_C4_mean', 'D1_DT_user_t1_C4_std', 'D1_DT_user_t1_C5_mean', 'D1_DT_user_t1_C5_std', 'D1_DT_user_t1_C6_mean', 'D1_DT_user_t1_C6_std', 'D1_DT_user_t1_C7_mean', 'D1_DT_user_t1_C7_std', 'D1_DT_user_t1_C8_mean', 'D1_DT_user_t1_C8_std', 'D1_DT_user_t1_C9_mean', 'D1_DT_user_t1_C9_std', 'D1_DT_user_t1_C10_mean', 'D1_DT_user_t1_C10_std', 'D1_DT_user_t1_C11_mean', 'D1_DT_user_t1_C11_std', 'D1_DT_user_t1_C12_mean', 'D1_DT_user_t1_C12_std', 'D1_DT_user_t1_C13_mean', 'D1_DT_user_t1_C13_std', 'D1_DT_user_t1_C14_mean', 'D1_DT_user_t1_C14_std',
     'D1_DT_user_t1_C1_var', 'D1_DT_user_t1_C1_nunique', 'D1_DT_user_t1_C2_var', 'D1_DT_user_t1_C2_nunique', 'D1_DT_user_t1_C3_var', 'D1_DT_user_t1_C3_nunique', 'D1_DT_user_t1_C4_var', 'D1_DT_user_t1_C4_nunique', 'D1_DT_user_t1_C5_var', 'D1_DT_user_t1_C5_nunique', 'D1_DT_user_t1_C6_var', 'D1_DT_user_t1_C6_nunique', 'D1_DT_user_t1_C7_var', 'D1_DT_user_t1_C7_nunique', 'D1_DT_user_t1_C8_var', 'D1_DT_user_t1_C8_nunique', 'D1_DT_user_t1_C9_var', 'D1_DT_user_t1_C9_nunique', 'D1_DT_user_t1_C10_var', 'D1_DT_user_t1_C10_nunique', 'D1_DT_user_t1_C11_var', 'D1_DT_user_t1_C11_nunique', 'D1_DT_user_t1_C12_var', 'D1_DT_user_t1_C12_nunique', 'D1_DT_user_t1_C13_var', 'D1_DT_user_t1_C13_nunique', 'D1_DT_user_t1_C14_var', 'D1_DT_user_t1_C14_nunique',
'D1_D3', 'D1_D4', 'D1_D5', 'D1_D6', 'D1_D7', 'D1_D8', 'D1_D9', 'D1_D10', 'D1_D11', 'D1_D12', 'D1_D13', 'D1_D14', 'D1_D15', 'D2_D3', 'D2_D4', 'D2_D5', 'D2_D6', 'D2_D7', 'D2_D8', 'D2_D9', 'D2_D10', 'D2_D11', 'D2_D12', 'D2_D13', 'D2_D14', 'D2_D15', 'D3_D4', 'D3_D5', 'D3_D6', 'D3_D7', 'D3_D8', 'D3_D9', 'D3_D10', 'D3_D11', 'D3_D12', 'D3_D13', 'D3_D14', 'D3_D15', 'D4_D5', 'D4_D6', 'D4_D7', 'D4_D8', 'D4_D9', 'D4_D10', 'D4_D11', 'D4_D12', 'D4_D13', 'D4_D14', 'D4_D15', 'D5_D6', 'D5_D7', 'D5_D8', 'D5_D9', 'D5_D10', 'D5_D11', 'D5_D12', 'D5_D13', 'D5_D14', 'D5_D15', 'D6_D7', 'D6_D8', 'D6_D9', 'D6_D10', 'D6_D11', 'D6_D12', 'D6_D13', 'D6_D14', 'D6_D15', 'D7_D8', 'D7_D9', 'D7_D10', 'D7_D11', 'D7_D12', 'D7_D13', 'D7_D14', 'D7_D15', 'D8_D9', 'D8_D10', 'D8_D11', 'D8_D12', 'D8_D13', 'D8_D14', 'D8_D15', 'D9_D10', 'D9_D11', 'D9_D12', 'D9_D13', 'D9_D14', 'D9_D15', 'D10_D11', 'D10_D12', 'D10_D13', 'D10_D14', 'D10_D15', 'D11_D12', 'D11_D13', 'D11_D14', 'D11_D15', 'D12_D13', 'D12_D14', 'D12_D15', 'D13_D14', 'D13_D15', 'D14_D15',
 'C1_C2', 'C1_C3', 'C1_C4', 'C1_C5', 'C1_C6', 'C1_C7', 'C1_C8', 'C1_C9', 'C1_C10', 'C1_C11', 'C1_C12', 'C1_C13', 'C1_C14', 'C2_C3', 'C2_C4', 'C2_C5', 'C2_C6', 'C2_C7', 'C2_C8', 'C2_C9', 'C2_C10', 'C2_C11', 'C2_C12', 'C2_C13', 'C2_C14', 'C3_C4', 'C3_C5', 'C3_C6', 'C3_C7', 'C3_C8', 'C3_C9', 'C3_C10', 'C3_C11', 'C3_C12', 'C3_C13', 'C3_C14', 'C4_C5', 'C4_C6', 'C4_C7', 'C4_C8', 'C4_C9', 'C4_C10', 'C4_C11', 'C4_C12', 'C4_C13', 'C4_C14', 'C5_C6', 'C5_C7', 'C5_C8', 'C5_C9', 'C5_C10', 'C5_C11', 'C5_C12', 'C5_C13', 'C5_C14', 'C6_C7', 'C6_C8', 'C6_C9', 'C6_C10', 'C6_C11', 'C6_C12', 'C6_C13', 'C6_C14', 'C7_C8', 'C7_C9', 'C7_C10', 'C7_C11', 'C7_C12', 'C7_C13', 'C7_C14', 'C8_C9', 'C8_C10', 'C8_C11', 'C8_C12', 'C8_C13', 'C8_C14', 'C9_C10', 'C9_C11', 'C9_C12', 'C9_C13', 'C9_C14', 'C10_C11', 'C10_C12', 'C10_C13', 'C10_C14', 'C11_C12', 'C11_C13', 'C11_C14', 'C12_C13', 'C12_C14', 'C13_C14',
    'C14_C2', 'C14_C6', 'C14_C8', 'C14_C11',
    
    #'DT_W_total', 'DT_D_total',
    #'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V218', 'V219', 'V220', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V230', 'V231', 'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V238', 'V239', 'V240', 'V241', 'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V248', 'V249', 'V250', 'V251', 'V252', 'V253', 'V254', 'V255', 'V256', 'V257', 'V258', 'V259', 'V260', 'V261', 'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V269', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276', 'V277', 'V278', 'V279', 'V280', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 'V300', 'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320', 'V321', 'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339',
           ]
features=[x for x in X_train.columns if x not in ["isFraud","TransactionID","TransactionDT","Date"]+dr_feature]
print(len(features),features)

1808 ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 

In [66]:
lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':100,
                    'learning_rate':0.01,
                    'num_leaves': 256,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.4,
                    'subsample_freq':1,
                    'subsample':0.7,
                   # 'n_estimators':10000,
                    'max_bin':255,
                    'verbose':0,
#                    "min_data_in_leaf":80,
#                    "lambda_l1" : 0.5,
#                    "lambda_l2" : 0.5,
                    'seed': 1993,
                   # 'early_stopping_rounds':200, 
                } 
oof_train,oof_preds,auc,feature_importance=train_lgbm_model(X_train,X_test,features,lgb_params,TEST_F=False)
print(auc)

Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.991843	valid_1's auc: 0.928454
[400]	training's auc: 0.999052	valid_1's auc: 0.933907
[600]	training's auc: 0.999857	valid_1's auc: 0.937848
[800]	training's auc: 0.999984	valid_1's auc: 0.940774
[1000]	training's auc: 0.999999	valid_1's auc: 0.942594
[1200]	training's auc: 1	valid_1's auc: 0.944075
[1400]	training's auc: 1	valid_1's auc: 0.945299
[1600]	training's auc: 1	valid_1's auc: 0.945945
[1800]	training's auc: 1	valid_1's auc: 0.94683
[2000]	training's auc: 1	valid_1's auc: 0.947379
[2200]	training's auc: 1	valid_1's auc: 0.948014
[2400]	training's auc: 1	valid_1's auc: 0.948257
[2600]	training's auc: 1	valid_1's auc: 0.948591
[2800]	training's auc: 1	valid_1's auc: 0.948648
[3000]	training's auc: 1	valid_1's auc: 0.948894
[3200]	training's auc: 1	valid_1's auc: 0.948991
[3400]	training's auc: 1	valid_1's auc: 0.949222
[3600]	training's auc: 1	valid_1's auc: 0.949139
Early stopping, best itera

In [67]:
#lb==9586
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.983422	valid_1's auc: 0.911477
# [400]	training's auc: 0.997036	valid_1's auc: 0.926262
# [600]	training's auc: 0.999502	valid_1's auc: 0.934039
# [800]	training's auc: 0.9999	valid_1's auc: 0.937924
# [1000]	training's auc: 0.999985	valid_1's auc: 0.940752
# [1200]	training's auc: 0.999999	valid_1's auc: 0.942284
# [1400]	training's auc: 1	valid_1's auc: 0.94304
# [1600]	training's auc: 1	valid_1's auc: 0.94371
# [1800]	training's auc: 1	valid_1's auc: 0.943943
# Early stopping, best iteration is:
# [1660]	training's auc: 1	valid_1's auc: 0.943807
# Fold 1 | AUC: 0.9438074882635257
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.982736	valid_1's auc: 0.936973
# [400]	training's auc: 0.997564	valid_1's auc: 0.949215
# [600]	training's auc: 0.999636	valid_1's auc: 0.953824
# [800]	training's auc: 0.999933	valid_1's auc: 0.95573
# [1000]	training's auc: 0.999993	valid_1's auc: 0.956745
# [1200]	training's auc: 1	valid_1's auc: 0.957111
# [1400]	training's auc: 1	valid_1's auc: 0.957413
# [1600]	training's auc: 1	valid_1's auc: 0.957661
# [1800]	training's auc: 1	valid_1's auc: 0.957667
# Early stopping, best iteration is:
# [1637]	training's auc: 1	valid_1's auc: 0.957653
# Fold 2 | AUC: 0.9576527681051226
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.983816	valid_1's auc: 0.940045
# [400]	training's auc: 0.997563	valid_1's auc: 0.949533
# [600]	training's auc: 0.9996	valid_1's auc: 0.95281
# [800]	training's auc: 0.999925	valid_1's auc: 0.953694
# [1000]	training's auc: 0.999991	valid_1's auc: 0.954128
# [1200]	training's auc: 0.999999	valid_1's auc: 0.954173
# [1400]	training's auc: 1	valid_1's auc: 0.954257
# [1600]	training's auc: 1	valid_1's auc: 0.953953
# Early stopping, best iteration is:
# [1439]	training's auc: 1	valid_1's auc: 0.954309
# Fold 3 | AUC: 0.9543089651229442
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.982737	valid_1's auc: 0.949503
# [400]	training's auc: 0.997588	valid_1's auc: 0.961121
# [600]	training's auc: 0.999623	valid_1's auc: 0.963974
# [800]	training's auc: 0.999931	valid_1's auc: 0.965064
# [1000]	training's auc: 0.99999	valid_1's auc: 0.965592
# [1200]	training's auc: 1	valid_1's auc: 0.965671
# Early stopping, best iteration is:
# [1095]	training's auc: 0.999998	valid_1's auc: 0.965742
# Fold 4 | AUC: 0.9657418662845815
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.984095	valid_1's auc: 0.929459
# [400]	training's auc: 0.9976	valid_1's auc: 0.941123
# [600]	training's auc: 0.999616	valid_1's auc: 0.945407
# [800]	training's auc: 0.999922	valid_1's auc: 0.946986
# [1000]	training's auc: 0.999987	valid_1's auc: 0.94755
# [1200]	training's auc: 0.999999	valid_1's auc: 0.947734
# [1400]	training's auc: 1	valid_1's auc: 0.947742
# Early stopping, best iteration is:
# [1231]	training's auc: 1	valid_1's auc: 0.947863
# Fold 5 | AUC: 0.9478626515059665

# Mean AUC = 0.953874747856428
# Out of folds AUC = 0.9529824799704689
# 0.9529824799704689

In [68]:
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.983401	valid_1's auc: 0.911194
# [400]	training's auc: 0.997088	valid_1's auc: 0.92519
# [600]	training's auc: 0.999508	valid_1's auc: 0.932734
# [800]	training's auc: 0.999901	valid_1's auc: 0.937062
# [1000]	training's auc: 0.999985	valid_1's auc: 0.939541
# [1200]	training's auc: 0.999999	valid_1's auc: 0.941461
# [1400]	training's auc: 1	valid_1's auc: 0.942308
# [1600]	training's auc: 1	valid_1's auc: 0.942992
# [1800]	training's auc: 1	valid_1's auc: 0.943604
# Early stopping, best iteration is:
# [1708]	training's auc: 1	valid_1's auc: 0.943397
# Fold 1 | AUC: 0.9433973582464014
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.982788	valid_1's auc: 0.937477
# [400]	training's auc: 0.997681	valid_1's auc: 0.950095
# [600]	training's auc: 0.999637	valid_1's auc: 0.953879
# [800]	training's auc: 0.999935	valid_1's auc: 0.955795
# [1000]	training's auc: 0.999993	valid_1's auc: 0.956487
# [1200]	training's auc: 1	valid_1's auc: 0.957009
# [1400]	training's auc: 1	valid_1's auc: 0.957159
# [1600]	training's auc: 1	valid_1's auc: 0.957191
# Early stopping, best iteration is:
# [1505]	training's auc: 1	valid_1's auc: 0.957261
# Fold 2 | AUC: 0.9572608969617087
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.983634	valid_1's auc: 0.940106
# [400]	training's auc: 0.997566	valid_1's auc: 0.949114
# [600]	training's auc: 0.999621	valid_1's auc: 0.952391
# [800]	training's auc: 0.999928	valid_1's auc: 0.953298
# [1000]	training's auc: 0.999991	valid_1's auc: 0.953762
# [1200]	training's auc: 0.999999	valid_1's auc: 0.953723
# [1400]	training's auc: 1	valid_1's auc: 0.953688
# Early stopping, best iteration is:
# [1292]	training's auc: 1	valid_1's auc: 0.953945
# Fold 3 | AUC: 0.9539445998453435
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.983576	valid_1's auc: 0.950552
# [400]	training's auc: 0.997682	valid_1's auc: 0.96135
# [600]	training's auc: 0.999635	valid_1's auc: 0.964468
# [800]	training's auc: 0.999932	valid_1's auc: 0.965532
# [1000]	training's auc: 0.99999	valid_1's auc: 0.965868
# [1200]	training's auc: 0.999999	valid_1's auc: 0.965988
# [1400]	training's auc: 1	valid_1's auc: 0.966013
# Early stopping, best iteration is:
# [1269]	training's auc: 1	valid_1's auc: 0.966078
# Fold 4 | AUC: 0.9660782819877667
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.984377	valid_1's auc: 0.928965
# [400]	training's auc: 0.997626	valid_1's auc: 0.941173
# [600]	training's auc: 0.999617	valid_1's auc: 0.945279
# [800]	training's auc: 0.999924	valid_1's auc: 0.946804
# [1000]	training's auc: 0.999987	valid_1's auc: 0.94775
# [1200]	training's auc: 0.999999	valid_1's auc: 0.948141
# [1400]	training's auc: 1	valid_1's auc: 0.948308
# [1600]	training's auc: 1	valid_1's auc: 0.948364
# Early stopping, best iteration is:
# [1567]	training's auc: 1	valid_1's auc: 0.948399
# Fold 5 | AUC: 0.9483993408608421

# Mean AUC = 0.9538160955804126
# Out of folds AUC = 0.9540371786876838
# 0.9540371786876838

In [69]:
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.965935	valid_1's auc: 0.903067
# [400]	training's auc: 0.986152	valid_1's auc: 0.920814
# [600]	training's auc: 0.994046	valid_1's auc: 0.929895
# [800]	training's auc: 0.997469	valid_1's auc: 0.935288
# [1000]	training's auc: 0.99898	valid_1's auc: 0.93831
# [1200]	training's auc: 0.999556	valid_1's auc: 0.940146
# [1400]	training's auc: 0.999798	valid_1's auc: 0.941289
# [1600]	training's auc: 0.999909	valid_1's auc: 0.942307
# [1800]	training's auc: 0.99996	valid_1's auc: 0.943029
# [2000]	training's auc: 0.999984	valid_1's auc: 0.94349
# [2200]	training's auc: 0.999994	valid_1's auc: 0.943492
# [2400]	training's auc: 0.999998	valid_1's auc: 0.943831
# [2600]	training's auc: 0.999999	valid_1's auc: 0.943887
# [2800]	training's auc: 1	valid_1's auc: 0.94454
# [3000]	training's auc: 1	valid_1's auc: 0.944615
# Early stopping, best iteration is:
# [2962]	training's auc: 1	valid_1's auc: 0.944692
# Fold 1 | AUC: 0.9446917410425781
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.965307	valid_1's auc: 0.930026
# [400]	training's auc: 0.986764	valid_1's auc: 0.94463
# [600]	training's auc: 0.99487	valid_1's auc: 0.950725
# [800]	training's auc: 0.997981	valid_1's auc: 0.953631
# [1000]	training's auc: 0.999199	valid_1's auc: 0.95511
# [1200]	training's auc: 0.999666	valid_1's auc: 0.956126
# [1400]	training's auc: 0.999855	valid_1's auc: 0.956653
# [1600]	training's auc: 0.999938	valid_1's auc: 0.957241
# [1800]	training's auc: 0.999975	valid_1's auc: 0.957534
# [2000]	training's auc: 0.999991	valid_1's auc: 0.957693
# [2200]	training's auc: 0.999997	valid_1's auc: 0.957968
# [2400]	training's auc: 0.999999	valid_1's auc: 0.958131
# [2600]	training's auc: 1	valid_1's auc: 0.958141
# Early stopping, best iteration is:
# [2575]	training's auc: 1	valid_1's auc: 0.958205
# Fold 2 | AUC: 0.9582047152284849
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.965404	valid_1's auc: 0.931224
# [400]	training's auc: 0.986913	valid_1's auc: 0.943692
# [600]	training's auc: 0.994929	valid_1's auc: 0.949615
# [800]	training's auc: 0.998028	valid_1's auc: 0.951682
# [1000]	training's auc: 0.999181	valid_1's auc: 0.952617
# [1200]	training's auc: 0.999639	valid_1's auc: 0.953305
# [1400]	training's auc: 0.999837	valid_1's auc: 0.953316
# [1600]	training's auc: 0.999928	valid_1's auc: 0.953426
# Early stopping, best iteration is:
# [1515]	training's auc: 0.999898	valid_1's auc: 0.953506
# Fold 3 | AUC: 0.9535063217848269
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.964179	valid_1's auc: 0.94299
# [400]	training's auc: 0.986356	valid_1's auc: 0.956512
# [600]	training's auc: 0.994798	valid_1's auc: 0.961378
# [800]	training's auc: 0.998058	valid_1's auc: 0.963411
# [1000]	training's auc: 0.999213	valid_1's auc: 0.964511
# [1200]	training's auc: 0.99967	valid_1's auc: 0.965078
# [1400]	training's auc: 0.999853	valid_1's auc: 0.965429
# [1600]	training's auc: 0.999935	valid_1's auc: 0.965623
# [1800]	training's auc: 0.999973	valid_1's auc: 0.965814
# [2000]	training's auc: 0.999991	valid_1's auc: 0.965879
# Early stopping, best iteration is:
# [1981]	training's auc: 0.999989	valid_1's auc: 0.965911
# Fold 4 | AUC: 0.9659114843236632
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.96617	valid_1's auc: 0.918653
# [400]	training's auc: 0.986896	valid_1's auc: 0.933561
# [600]	training's auc: 0.99488	valid_1's auc: 0.94059
# [800]	training's auc: 0.998009	valid_1's auc: 0.94349
# [1000]	training's auc: 0.999196	valid_1's auc: 0.945086
# [1200]	training's auc: 0.99967	valid_1's auc: 0.945915
# [1400]	training's auc: 0.999851	valid_1's auc: 0.946285
# [1600]	training's auc: 0.999933	valid_1's auc: 0.946546
# [1800]	training's auc: 0.999972	valid_1's auc: 0.946568
# Early stopping, best iteration is:
# [1703]	training's auc: 0.999957	valid_1's auc: 0.946702
# Fold 5 | AUC: 0.9467015743957923

# Mean AUC = 0.9538031673550691
# Out of folds AUC = 0.9527253292059217
# 0.9527253292059217


In [70]:
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.965804	valid_1's auc: 0.903285
# [400]	training's auc: 0.986151	valid_1's auc: 0.921071
# [600]	training's auc: 0.994168	valid_1's auc: 0.92953
# [800]	training's auc: 0.997543	valid_1's auc: 0.934384
# [1000]	training's auc: 0.998986	valid_1's auc: 0.937491
# [1200]	training's auc: 0.999555	valid_1's auc: 0.939469
# [1400]	training's auc: 0.999798	valid_1's auc: 0.940978
# [1600]	training's auc: 0.999908	valid_1's auc: 0.942347
# [1800]	training's auc: 0.99996	valid_1's auc: 0.942982
# [2000]	training's auc: 0.999984	valid_1's auc: 0.943429
# [2200]	training's auc: 0.999994	valid_1's auc: 0.943643
# [2400]	training's auc: 0.999998	valid_1's auc: 0.944031
# [2600]	training's auc: 0.999999	valid_1's auc: 0.944154
# [2800]	training's auc: 1	valid_1's auc: 0.944385
# [3000]	training's auc: 1	valid_1's auc: 0.944389
# [3200]	training's auc: 1	valid_1's auc: 0.944619
# [3400]	training's auc: 1	valid_1's auc: 0.944669
# Early stopping, best iteration is:
# [3295]	training's auc: 1	valid_1's auc: 0.944736
# Fold 1 | AUC: 0.9447358005139384
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.965307	valid_1's auc: 0.930026
# [400]	training's auc: 0.986764	valid_1's auc: 0.94463
# [600]	training's auc: 0.99487	valid_1's auc: 0.950725
# [800]	training's auc: 0.997981	valid_1's auc: 0.953631
# [1000]	training's auc: 0.999199	valid_1's auc: 0.95511
# [1200]	training's auc: 0.999665	valid_1's auc: 0.956128
# [1400]	training's auc: 0.999855	valid_1's auc: 0.956782
# [1600]	training's auc: 0.999939	valid_1's auc: 0.957375
# [1800]	training's auc: 0.999976	valid_1's auc: 0.95768
# [2000]	training's auc: 0.999991	valid_1's auc: 0.957786
# [2200]	training's auc: 0.999997	valid_1's auc: 0.957977
# [2400]	training's auc: 0.999999	valid_1's auc: 0.957963
# Early stopping, best iteration is:
# [2253]	training's auc: 0.999998	valid_1's auc: 0.958107
# Fold 2 | AUC: 0.9581071523856525
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.965404	valid_1's auc: 0.931224
# [400]	training's auc: 0.986916	valid_1's auc: 0.943688
# [600]	training's auc: 0.994886	valid_1's auc: 0.949463
# [800]	training's auc: 0.998036	valid_1's auc: 0.95159
# [1000]	training's auc: 0.99917	valid_1's auc: 0.952548
# [1200]	training's auc: 0.999641	valid_1's auc: 0.952956
# [1400]	training's auc: 0.999837	valid_1's auc: 0.953221
# [1600]	training's auc: 0.999929	valid_1's auc: 0.953292
# Early stopping, best iteration is:
# [1489]	training's auc: 0.999886	valid_1's auc: 0.953414
# Fold 3 | AUC: 0.9534140608713925
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.964147	valid_1's auc: 0.943108
# [400]	training's auc: 0.986501	valid_1's auc: 0.956482
# [600]	training's auc: 0.994746	valid_1's auc: 0.961489
# [800]	training's auc: 0.998021	valid_1's auc: 0.963434
# [1000]	training's auc: 0.999207	valid_1's auc: 0.964647
# [1200]	training's auc: 0.999668	valid_1's auc: 0.965145
# [1400]	training's auc: 0.999851	valid_1's auc: 0.965514
# [1600]	training's auc: 0.999935	valid_1's auc: 0.965697
# [1800]	training's auc: 0.999974	valid_1's auc: 0.965752
# [2000]	training's auc: 0.99999	valid_1's auc: 0.966031
# [2200]	training's auc: 0.999997	valid_1's auc: 0.966049
# Early stopping, best iteration is:
# [2187]	training's auc: 0.999997	valid_1's auc: 0.966077
# Fold 4 | AUC: 0.9660774546013401
# Training until validation scores don't improve for 200 rounds.
# [200]	training's auc: 0.96617	valid_1's auc: 0.918653
# [400]	training's auc: 0.986896	valid_1's auc: 0.933561
# [600]	training's auc: 0.99488	valid_1's auc: 0.94059
# [800]	training's auc: 0.997996	valid_1's auc: 0.94363
# [1000]	training's auc: 0.999188	valid_1's auc: 0.945059
# [1200]	training's auc: 0.999662	valid_1's auc: 0.945821
# [1400]	training's auc: 0.999846	valid_1's auc: 0.946003
# [1600]	training's auc: 0.999932	valid_1's auc: 0.946329
# [1800]	training's auc: 0.999972	valid_1's auc: 0.94637
# [2000]	training's auc: 0.999989	valid_1's auc: 0.946625
# [2200]	training's auc: 0.999997	valid_1's auc: 0.946543
# [2400]	training's auc: 0.999999	valid_1's auc: 0.946548
# Early stopping, best iteration is:
# [2285]	training's auc: 0.999998	valid_1's auc: 0.946712
# Fold 5 | AUC: 0.9467120366686762

# Mean AUC = 0.9538093010081998
# Out of folds AUC = 0.9524919483133036
# 0.9524919483133036

In [71]:
sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='TransactionID')
sample_submission['isFraud'] = oof_preds
sample_submission.to_csv('simple_lgb_oof_'+str(auc)+'.csv')

In [None]:
oof_train_df = pd.DataFrame(oof_train)
oof_train_df.columns = ['class_1']
oof_train_df.to_csv("oof/oof_train_cv_%f.csv"%auc,index=False)
oof_test_df = pd.DataFrame(oof_preds)
oof_test_df.columns = ['class_1']
oof_test_df.to_csv("oof/oof_test_cv_%f.csv"%auc,index=False)

In [None]:
feature_importance["importance"] /= EPOCHS
cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:1000].index
best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
plt.figure(figsize=(12, 150));
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
plt.title('LGB Features (avg over folds)');