In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from utils.schemas import *
from utils.functions import *

In [2]:
# Directory (relative to the notebook) that holds the input CSV files.
data_folder = "input"

In [3]:
# Load the feature-engineered train/test sets using the shared dtype schema,
# plus the feature-importance table (presumably produced by an earlier run).
train = pd.read_csv(f'{data_folder}/train_ft_eng_0.csv', dtype=schema_generated_0)
test = pd.read_csv(f'{data_folder}/test_ft_eng_0.csv', dtype=schema_generated_0)
df_imp = pd.read_csv('docs/ft_importances_20190811.csv')

In [4]:
# Preview the feature-importance table loaded in the previous cell.
# (The file was previously re-read here; the duplicate load is unnecessary and
# an assignment displays nothing, whereas the saved output is a 5-row table.)
df_imp.head()

Unnamed: 0,feature,importance
0,TransactionAmt,3.0
1,card1,2.204488
2,C13,2.070567
3,N1,1.881225
4,C1,1.691782


In [6]:
# Feature selection: keep only the features whose saved importance exceeds 0.1.
X_cols = df_imp.loc[df_imp['importance'] > 0.1, 'feature']

In [19]:
# Down-sample to 120k rows (fixed seed) so the hyper-parameter sweep stays
# fast, then fill missing values with per-column medians of the full frame.
# numeric_only=True makes the median explicit about skipping non-numeric
# columns: recent pandas raises on them instead of silently dropping them.
mini_train = (
    train
    .sample(120000, random_state=42)
    .fillna(train.median(numeric_only=True))
)

In [20]:
# Order the sample chronologically once (it was previously sorted twice with
# the same key) and derive both the feature matrix and the target from it so
# their rows stay aligned.
mini_train_sorted = mini_train.sort_values('TransactionDT')
X = mini_train_sorted[X_cols]
y = mini_train_sorted.isFraud

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [22]:
# Chronological hold-out instead of a random split: X and y are ordered by
# TransactionDT, so the first 80% of rows train the model and the last 20%
# validate it.
split_at = int(X.shape[0] * 0.8)
X_train, X_test = X.iloc[:split_at, :], X.iloc[split_at:, :]
y_train, y_test = y[:split_at], y[split_at:]

In [23]:
# Base LightGBM settings for the max_depth sweep; max_depth itself is left
# out here and supplied per run.
params = dict(
    num_leaves=491,
    metric=['AUC'],
    first_metric_only=True,
    n_estimators=20000,       # upper bound; the fits below pass early_stopping_rounds
    learning_rate=0.008,
    colsample_bytree=0.85,
    objective='xentropy',
    n_jobs=-1,
    bagging_fraction=0.8,
    bagging_freq=5,
    lambda_l1=0,
    lambda_l2=0,
    # Every seed is pinned to the same value.
    bagging_seed=42,
    seed=42,
    feature_fraction_seed=42,
    drop_seed=42,
    data_random_seed=42,
)

In [None]:
# Scratch cell: an evenly spaced grid from 0.001 up to (not including) 0.05.
# NOTE(review): not referenced by any later cell shown here — presumably a
# candidate learning-rate grid; confirm or remove.
np.arange(start=0.001, stop=0.05, step=0.001)

In [26]:
# Sweep max_depth and record the hold-out AUC obtained for each value.
#
# The accumulators are initialised here so the cell runs on a fresh kernel;
# previously they only existed as leftovers from an earlier execution.
# NOTE(review): the saved results table appears to also contain entries from
# an earlier sweep over range(7, 21) (its row indices start well above 0);
# widen the range below to reproduce that full table.
depth_list = []
aucs_list = []
max_depth_list = list(range(21, 30))

for max_depth in max_depth_list:
    print('Max_Depth:', max_depth)
    # Use a per-run copy so the shared `params` dict is not mutated by the sweep.
    depth_params = {**params, 'max_depth': max_depth}
    lgb_model = lgb.LGBMClassifier(**depth_params)
    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],   # validation set monitored for early stopping
        verbose=100,                   # log the validation metric every 100 rounds
        early_stopping_rounds=40,
    )
    depth_list.append(max_depth)
    aucs_list.append(roc_auc_score(y_test, lgb_model.predict_proba(X_test)[:, 1]))
    print('\n')

# Rank the tried depths by validation AUC, best first.
df_depth = pd.DataFrame({'depth': depth_list, 'auc': aucs_list}).sort_values('auc', ascending=False)
df_depth.head()

Max_Depth: 21
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.892142
[200]	valid_0's auc: 0.895234
[300]	valid_0's auc: 0.89865
Early stopping, best iteration is:
[328]	valid_0's auc: 0.900333


Max_Depth: 22
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.8933
[200]	valid_0's auc: 0.897393
[300]	valid_0's auc: 0.901701
Early stopping, best iteration is:
[311]	valid_0's auc: 0.902474


Max_Depth: 23
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.890424
[200]	valid_0's auc: 0.897478
[300]	valid_0's auc: 0.898031
Early stopping, best iteration is:
[265]	valid_0's auc: 0.898698


Max_Depth: 24
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.894404
[200]	valid_0's auc: 0.897921
Early stopping, best iteration is:
[161]	valid_0's auc: 0.898709


Max_Depth: 25
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.89

Unnamed: 0,depth,auc
18,25,0.903463
22,29,0.903404
19,26,0.902797
15,22,0.902474
21,28,0.902125


In [28]:
# Same LightGBM settings as used for the sweep, now with max_depth fixed at 25
# (the top row of the sweep's results table).
params = dict(
    num_leaves=491,
    max_depth=25,
    metric=['AUC'],
    first_metric_only=True,
    n_estimators=20000,       # upper bound; the next fit passes early_stopping_rounds
    learning_rate=0.008,
    colsample_bytree=0.85,
    objective='xentropy',
    n_jobs=-1,
    bagging_fraction=0.8,
    bagging_freq=5,
    lambda_l1=0,
    lambda_l2=0,
    # Every seed is pinned to the same value.
    bagging_seed=42,
    seed=42,
    feature_fraction_seed=42,
    drop_seed=42,
    data_random_seed=42,
)

In [27]:
# Scratch cell: an evenly spaced grid from 0.001 up to (not including) 0.05.
# NOTE(review): the result is only displayed, never stored or used by the
# cells shown here — presumably a candidate learning-rate grid; confirm or remove.
np.arange(start=0.001, stop=0.05, step=0.001)

array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
       0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018,
       0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027,
       0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036,
       0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045,
       0.046, 0.047, 0.048, 0.049])

In [29]:
# Fit with the current `params` on the chronological training slice and
# monitor the hold-out slice so early stopping can pick the number of rounds.
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],   # validation set used for early stopping
    verbose=100,                   # log the validation metric every 100 rounds
    early_stopping_rounds=40,      # stop after 40 rounds without improvement
)

Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.880895
[200]	valid_0's auc: 0.889505
[300]	valid_0's auc: 0.894199
[400]	valid_0's auc: 0.897251
[500]	valid_0's auc: 0.899565
[600]	valid_0's auc: 0.901131
Early stopping, best iteration is:
[645]	valid_0's auc: 0.901972


LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.85,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.008, max_depth=25, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=20000, n_jobs=-1, num_leaves=491,
               objective='xentropy', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
               subsample_for_bin=200000, ...)

In [39]:
# Show the boosting round chosen by early stopping for the model fitted above.
# NOTE(review): no output was saved for this cell; the training log above
# reports 645 as the best iteration — confirm.
lgb_model.best_iteration_

In [52]:
# Display the number of boosting rounds currently configured.
# NOTE(review): in file order this cell comes before the cell that assigns
# int(967*1.3), so a top-to-bottom run would show 20000 here rather than the
# saved 1257 — the cells were evidently executed out of order.
params['n_estimators']

1257

In [50]:
# Fix the round count for the final full-data fit: a base round count scaled
# up by 30% (evaluates to 1257).
# NOTE(review): the hard-coded 967 does not match the best iteration (645)
# in the saved training log — presumably taken from a different run; confirm,
# or derive it from lgb_model.best_iteration_.
params.update(n_estimators=int(967 * 1.3))

In [53]:
# Final model: refit on the complete training set with the fixed round count
# (no validation set, hence no early stopping).
# NOTE(review): unlike the tuning sample, `train` is not median-imputed here —
# confirm that this difference is intended.
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(X=train[X_cols], y=train['isFraud'])

LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.85,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.008, max_depth=25, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1257, n_jobs=-1, num_leaves=491,
               objective='xentropy', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
               subsample_for_bin=200000, ...)

In [54]:
# Select, from the competition test set, the same feature columns the model
# was trained on. Note: this rebinds X_test, which previously held the
# validation slice.
X_test = test.loc[:, X.columns]

In [55]:
# Class probabilities for every test row; the positive-class column
# (index 1) is what is written to the submission.
y_preds = lgb_model.predict_proba(X=X_test)

In [56]:
# Load the sample submission file to use as the template for the output.
df_sub = pd.read_csv(f'{data_folder}/sample_submission.csv')

In [57]:
# Put the predicted positive-class probability (column 1) into the
# submission's isFraud column.
df_sub = df_sub.assign(isFraud=y_preds[:, 1])

In [58]:
# Write the submission as a comma-separated file with a header row and no
# index column (separator and header are the to_csv defaults).
df_sub.to_csv(
    'submissions/benchmark_ft_selection_ft_eng_0_1257_rounds.csv',
    index=False,
)