In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from utils.schemas import *
from utils.functions import *

In [2]:
df_imp = pd.read_csv('docs/ft_importances_20190811.csv')

In [3]:
X_cols = df_imp[df_imp.importance > 0.1].feature.to_list()

In [4]:
data_folder = 'input'

In [5]:
train = pd.read_csv(data_folder+'/train_ft_eng_0.csv', dtype = schema_generated_0, usecols=X_cols+['isFraud', 'TransactionDT'])
test = pd.read_csv(data_folder+'/test_ft_eng_0.csv', dtype = schema_generated_0, usecols=X_cols)

In [11]:
mini_train = train.sample(120000, random_state = 42)#.fillna(train.median())

In [6]:
X = train.sort_values('TransactionDT')[X_cols]#.reset_index(drop=True)
y = train.sort_values('TransactionDT').isFraud

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [7]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X.iloc[:int(X.shape[0]*0.8), :]
X_test = X.iloc[int(X.shape[0]*0.8):, :]
y_train = y[:int(X.shape[0]*0.8)]
y_test = y[int(X.shape[0]*0.8):]

# LightGBM

In [23]:
# Optimize max_depth
params = {
    'num_leaves': 491,
    'metric': ['AUC'],
    'first_metric_only': True,
    'n_estimators': 20000,
    'learning_rate': 0.008,
    'colsample_bytree': 0.85,
    'objective': 'xentropy',
    'n_jobs': -1,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'bagging_seed': 42,
    'seed': 42,
    'feature_fraction_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
}

In [None]:
np.arange(0.001, 0.05, 0.001)

In [26]:
#max_depth_list = list(range(7, 21))
# aucs_list = list()
# depth_list = list()
max_depth_list = list(range(21, 30))
for max_depth in max_depth_list:
    print('Max_Depth:', max_depth)
    params['max_depth'] = max_depth
    lgb_model = lgb.LGBMClassifier(**params)
    lgb_model.fit(X_train,
                   y_train,
                   eval_set=[(X_test, y_test)],
                   verbose=100,
                   early_stopping_rounds=40,
#                    eval_metric=custom_loss
                  )
    depth_list.append(max_depth)
    aucs_list.append(roc_auc_score(y_test, lgb_model.predict_proba(X_test)[:,1]))
    print('\n')
    
df_depth = pd.DataFrame({'depth': depth_list, 'auc': aucs_list}).sort_values('auc', ascending = False)
df_depth.head()

Max_Depth: 21
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.892142
[200]	valid_0's auc: 0.895234
[300]	valid_0's auc: 0.89865
Early stopping, best iteration is:
[328]	valid_0's auc: 0.900333


Max_Depth: 22
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.8933
[200]	valid_0's auc: 0.897393
[300]	valid_0's auc: 0.901701
Early stopping, best iteration is:
[311]	valid_0's auc: 0.902474


Max_Depth: 23
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.890424
[200]	valid_0's auc: 0.897478
[300]	valid_0's auc: 0.898031
Early stopping, best iteration is:
[265]	valid_0's auc: 0.898698


Max_Depth: 24
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.894404
[200]	valid_0's auc: 0.897921
Early stopping, best iteration is:
[161]	valid_0's auc: 0.898709


Max_Depth: 25
Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.89

Unnamed: 0,depth,auc
18,25,0.903463
22,29,0.903404
19,26,0.902797
15,22,0.902474
21,28,0.902125


In [28]:
# Optimize max_depth
params = {
    'num_leaves': 491,
    'max_depth': 25,
    'metric': ['AUC'],
    'first_metric_only': True,
    'n_estimators': 20000,
    'learning_rate': 0.008,
    'colsample_bytree': 0.85,
    'objective': 'xentropy',
    'n_jobs': -1,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'bagging_seed': 42,
    'seed': 42,
    'feature_fraction_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
}

In [27]:
np.arange(0.001, 0.05, 0.001)

array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
       0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018,
       0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027,
       0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036,
       0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045,
       0.046, 0.047, 0.048, 0.049])

In [29]:
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(X_train,
               y_train,
               eval_set=[(X_test, y_test)],
               verbose=100,
               early_stopping_rounds=40,
              )

Training until validation scores don't improve for 40 rounds.
[100]	valid_0's auc: 0.880895
[200]	valid_0's auc: 0.889505
[300]	valid_0's auc: 0.894199
[400]	valid_0's auc: 0.897251
[500]	valid_0's auc: 0.899565
[600]	valid_0's auc: 0.901131
Early stopping, best iteration is:
[645]	valid_0's auc: 0.901972


LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.85,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.008, max_depth=25, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=20000, n_jobs=-1, num_leaves=491,
               objective='xentropy', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
               subsample_for_bin=200000, ...)

In [39]:
lgb_model.best_iteration_

In [52]:
params['n_estimators']

1257

In [50]:
params['n_estimators'] = int(967*1.3)

In [53]:
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(train[X_cols], train.isFraud)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.85,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.008, max_depth=25, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1257, n_jobs=-1, num_leaves=491,
               objective='xentropy', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
               subsample_for_bin=200000, ...)

In [54]:
X_test = test[X.columns]

In [55]:
y_preds = lgb_model.predict_proba(X_test)

In [56]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [57]:
df_sub['isFraud'] = y_preds[:,1]

In [58]:
df_sub.to_csv('submissions/benchmark_ft_selection_ft_eng_0_1257_rounds.csv', sep=',', header=True, index=None)

# CatBoost

In [10]:
from catboost import CatBoostClassifier

In [11]:
cat_ft_id = list()
n = 0
for c in X_cols:
    if c in cat_ft:
        cat_ft_id.append(n)
    n += 1

In [12]:
params = {'depth':11,
          'iterations':20000,
          'eval_metric':'AUC',
          'random_seed':42,
          'logging_level':'Verbose',
          'allow_writing_files':False,
          'early_stopping_rounds':20,
          'learning_rate':0.01,
          'thread_count':8,
          'boosting_type':'Plain',
          'bootstrap_type':'Bernoulli',
          'rsm':0.6}

In [13]:
model_cb = CatBoostClassifier(**params)

In [14]:
gc.collect()
model_cb.fit(X_train,
             y_train,
             cat_features=cat_ft_id,
             eval_set=(X_test, y_test),
             verbose=50
             )

0:	test: 0.7419593	best: 0.7419593 (0)	total: 5.13s	remaining: 1d 4h 28m 23s
50:	test: 0.8405590	best: 0.8405590 (50)	total: 3m 43s	remaining: 1d 16m 48s
100:	test: 0.8566219	best: 0.8566219 (100)	total: 8m 19s	remaining: 1d 3h 20m 55s
150:	test: 0.8785868	best: 0.8785868 (150)	total: 13m 45s	remaining: 1d 6h 9m 17s
200:	test: 0.8945481	best: 0.8945481 (200)	total: 20m 19s	remaining: 1d 9h 21m 55s
250:	test: 0.8979413	best: 0.8979606 (249)	total: 27m 7s	remaining: 1d 11h 34m 29s
300:	test: 0.9006662	best: 0.9006662 (300)	total: 33m 5s	remaining: 1d 12h 6m 4s
350:	test: 0.9033150	best: 0.9033150 (350)	total: 38m 52s	remaining: 1d 12h 15m 46s
400:	test: 0.9055165	best: 0.9055165 (400)	total: 44m 21s	remaining: 1d 12h 7m 46s
450:	test: 0.9077800	best: 0.9077800 (450)	total: 49m 53s	remaining: 1d 12h 2m 54s
500:	test: 0.9093310	best: 0.9093310 (500)	total: 55m 25s	remaining: 1d 11h 56m 59s
550:	test: 0.9110644	best: 0.9110644 (550)	total: 1h 1m 5s	remaining: 1d 11h 56m 40s
600:	test: 0.912

<catboost.core.CatBoostClassifier at 0x7fb84edadcc0>

In [15]:
params['iterations'] = int(model_cb.best_iteration_*1.1)

In [16]:
model_cb = CatBoostClassifier(**params)
model_cb.fit(train[X_cols], train.isFraud, verbose = 200)

0:	total: 744ms	remaining: 33m 52s
200:	total: 2m 34s	remaining: 32m 25s
400:	total: 5m 8s	remaining: 29m 56s
600:	total: 7m 43s	remaining: 27m 23s
800:	total: 10m 17s	remaining: 24m 50s
1000:	total: 12m 52s	remaining: 22m 16s
1200:	total: 15m 26s	remaining: 19m 42s
1400:	total: 18m 1s	remaining: 17m 8s
1600:	total: 20m 35s	remaining: 14m 34s
1800:	total: 23m 10s	remaining: 12m
2000:	total: 25m 45s	remaining: 9m 25s
2200:	total: 28m 19s	remaining: 6m 51s
2400:	total: 30m 53s	remaining: 4m 17s
2600:	total: 33m 27s	remaining: 1m 42s
2733:	total: 35m 10s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fb84e784c88>

In [17]:
X_test = test[X.columns]

In [18]:
y_preds = model_cb.predict_proba(X_test)

In [19]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [20]:
df_sub['isFraud'] = y_preds[:,1]

In [21]:
df_sub.to_csv('submissions/benchmark_ft_selection_ft_eng_0_catboost.csv', sep=',', header=True, index=None)