# LightGBM

### Libraries

In [22]:
import warnings
warnings.filterwarnings('ignore')

# import pandas as pd
import modin.pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import logging
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GroupKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
import datetime

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
LOG_NAME = 'logs/LightGBM.log'
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
logging.warning("")
logging.warning("##### New LightGBM Model #####")

### Data

In [4]:
df_imp = pd.read_csv('docs/20190903_PermitationImportance_Rf4.csv')

In [24]:
drop_cols = ['D1','D10','D11','D12','D13','D14','D15','D2','D3','D4','D5','D6',
             'D7','D8','D9','C1','C10','C11','C12','C13','C14','C2','C3','C4','C5','C6','C7','C8','C9']

In [5]:
X_cols = df_imp[df_imp.Importance > 0].feature.to_list()
# X_cols = df_imp.feature[:400].to_list()

In [26]:
len(X_cols)

268

In [7]:
X_cols[:9]

['R1', 'V258', 'V201', 'V257', 'R71', 'V200', 'R43', 'V199', 'V189']

In [4]:
data_folder = 'input'

In [8]:
id_cols = ['addr1', 'addr2', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'P_emaildomain']

In [6]:
# train = pd.read_csv(data_folder+'/train_syn_ft_eng_0.zip', dtype = schema_synthetic_ft_eng_0, usecols=X_cols+['isFraud']+id_cols+drop_cols)
# test = pd.read_csv(data_folder+'/test_syn_ft_eng_0.zip', dtype = schema_synthetic_ft_eng_0, usecols=X_cols+id_cols+drop_cols)
train = pd.read_csv(data_folder+'/train_syn_ft_eng_0.zip', dtype = schema_synthetic_ft_eng_0)
test = pd.read_csv(data_folder+'/test_syn_ft_eng_0.zip', dtype = schema_synthetic_ft_eng_0)

In [9]:
groups = train.groupby(id_cols).grouper.group_info[0]

In [10]:
# X_cols.remove('addr1')
# X_cols.remove('addr2')
# X_cols.remove('card1')

In [96]:
imp_dict = dict()
for c in X_cols:
    if c not in cat_ft:
        imp_dict[c] = train[c].median()

In [13]:
X_cols = [x for x in train.columns if x not in ['isFraud', 'TransactionDT', 'Transaction_day_of_week', 'Transaction_hour']]

In [15]:
X = train[X_cols]#.fillna(imp_dict)
y = train.isFraud

In [16]:
X_test = test[X_cols]
y_test = test.isFraud

### Model

In [37]:
logging.warning("Used columns: {}".format(X_cols))
k = 3
logging.warning("Folds number: {}".format(k))

In [38]:
group_kfold = GroupKFold(n_splits=k)

In [39]:
train_ids = X.index
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [40]:
tscv = TimeSeriesSplit(n_splits=k)

#### LightGBM

In [41]:
params = {
    'num_leaves': 111,
    'max_depth': -1,
    'metric': ['AUC'],
    'first_metric_only': True,
    'n_estimators': 20000,
    'num_threads': 64,
    'learning_rate': 0.01,
    'colsample_bytree': 0.5,
    'objective': 'xentropy',
    'n_jobs': -1,
    'bagging_fraction': 0.7,
    'bagging_freq': 2,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'bagging_seed': 42,
    'random_state':42,
    'seed': 42,
    'feature_fraction_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
}

### Best max_depth

In [22]:
logging.warning("Searching best max_depth")

In [None]:
for train_index, test_index in group_kfold.split(X, y, groups):
    X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

scores_dict = dict()
for i in range(5, 20):
    print('Max depth = {}'.format(i))
    logging.warning('Max depth = {}'.format(i))
    params['max_depth'] = i
    lgb_model = lgb.LGBMClassifier(**params)
    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=40)
    scores_dict[i] = lgb_model.best_score_['valid_0']['auc']

Max depth = 5


In [91]:
b = pd.DataFrame({'Depth': [x for x in scores_dict.keys()], 'AUC': [x for x in scores_dict.values()]})
md = b[b.AUC == b.AUC.max()].Depth
md.values[0]

17

In [None]:
params['max_depth'] = md.values[0]

In [23]:
lgb_model = lgb.LGBMClassifier(**params)

In [24]:
logging.warning("Params: {}".format(str(lgb_model.get_params())))

In [29]:
X_test.shape, y_test.shape

((221908, 712), (221908,))

In [42]:
counter = 1
auc_score = 0
iterat = 0
list_iter = list()
for train_index, test_index in skf.split(train_ids, y):
# for train_index, test_index in group_kfold.split(X, y, groups):
# for train_index, test_index in tscv.split(X=X, y=y):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=200,
                  early_stopping_rounds=40)
    
    logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
    logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
    auc_score += lgb_model.best_score_['valid_0']['auc']
    print('AUC test score: {}'.format(roc_auc_score(y_test, lgb_model.predict_proba(X_test[X.columns])[:,1])))
    it = lgb_model.best_iteration_
    iterat += it
    list_iter.append(it)

    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()
    
    counter += 1
    
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

Training until validation scores don't improve for 40 rounds.
[200]	training's auc: 0.959987	valid_0's auc: 0.924216
[400]	training's auc: 0.990751	valid_0's auc: 0.944575
[600]	training's auc: 0.997483	valid_0's auc: 0.952096
[800]	training's auc: 0.999295	valid_0's auc: 0.955946
[1000]	training's auc: 0.999771	valid_0's auc: 0.957985
[1200]	training's auc: 0.999928	valid_0's auc: 0.959514
[1400]	training's auc: 0.999981	valid_0's auc: 0.960553
[1600]	training's auc: 0.999996	valid_0's auc: 0.961286
[1800]	training's auc: 0.999999	valid_0's auc: 0.96183
[2000]	training's auc: 1	valid_0's auc: 0.962213
Early stopping, best iteration is:
[2158]	training's auc: 1	valid_0's auc: 0.962469
AUC test score: 0.8861013374743754
Fold 2

Training until validation scores don't improve for 40 rounds.
[200]	training's auc: 0.958945	valid_0's auc: 0.928088
[400]	training's auc: 0.990271	valid_0's auc: 0.948327
[600]	training's auc: 0.997528	valid_0's auc: 0.956223
[800]	training's auc: 0.9992

In [44]:
counter = 1
auc_score = 0
iterat = 0
list_iter = list()
# for train_index, test_index in skf.split(train_ids, y):
# for train_index, test_index in group_kfold.split(X, y, groups):
for train_index, test_index in tscv.split(X=X, y=y):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=200)
    
    logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
    logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
    auc_score += lgb_model.best_score_['valid_0']['auc']
    print('AUC test score: {}'.format(roc_auc_score(y_test, lgb_model.predict_proba(X_test[X.columns])[:,1])))
    it = lgb_model.best_iteration_
    iterat += it
    list_iter.append(it)

    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()
    
    counter += 1
    
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.967128	valid_0's auc: 0.892336
[200]	training's auc: 0.993069	valid_0's auc: 0.9012
[300]	training's auc: 0.999273	valid_0's auc: 0.904383
[400]	training's auc: 0.999925	valid_0's auc: 0.906741
[500]	training's auc: 0.999991	valid_0's auc: 0.908083
[600]	training's auc: 0.999999	valid_0's auc: 0.90866
[700]	training's auc: 1	valid_0's auc: 0.909145
Early stopping, best iteration is:
[690]	training's auc: 1	valid_0's auc: 0.908917
AUC test score: 0.8672260532455773
Fold 2

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.946903	valid_0's auc: 0.876507
[200]	training's auc: 0.976229	valid_0's auc: 0.886164
[300]	training's auc: 0.992189	valid_0's auc: 0.895813
[400]	training's auc: 0.997899	valid_0's auc: 0.90281
[500]	training's auc: 0.999403	valid_0's auc: 0.907363
[600]	training's auc: 0.99981	valid_0's auc: 0.909443
[700]	training's auc: 0.999932	valid_

In [45]:
counter = 1
auc_score = 0
iterat = 0
list_iter = list()
# for train_index, test_index in skf.split(train_ids, y):
for train_index, test_index in group_kfold.split(X, y, groups):
# for train_index, test_index in tscv.split(X=X, y=y):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=200)
    
    logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
    logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
    auc_score += lgb_model.best_score_['valid_0']['auc']
    print('AUC test score: {}'.format(roc_auc_score(y_test, lgb_model.predict_proba(X_test[X.columns])[:,1])))
    it = lgb_model.best_iteration_
    iterat += it
    list_iter.append(it)

    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()
    
    counter += 1
    
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.928305	valid_0's auc: 0.888832
[200]	training's auc: 0.964239	valid_0's auc: 0.898152
[300]	training's auc: 0.982017	valid_0's auc: 0.903248
[400]	training's auc: 0.990634	valid_0's auc: 0.906894
[500]	training's auc: 0.995258	valid_0's auc: 0.908936
[600]	training's auc: 0.99749	valid_0's auc: 0.91027
[700]	training's auc: 0.998663	valid_0's auc: 0.910604
[800]	training's auc: 0.999298	valid_0's auc: 0.910388
[900]	training's auc: 0.999639	valid_0's auc: 0.910408
Early stopping, best iteration is:
[708]	training's auc: 0.998737	valid_0's auc: 0.910678
AUC test score: 0.8871782651772717
Fold 2

Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.931735	valid_0's auc: 0.875267
[200]	training's auc: 0.968147	valid_0's auc: 0.885825
[300]	training's auc: 0.985178	valid_0's auc: 0.892866
[400]	training's auc: 0.993111	valid_0's auc: 0.896827
[500]	training's auc

In [38]:
mean_iterat

2624.6

In [39]:
params['n_estimators'] = int(mean_iterat)
params['n_estimators']

2624

In [43]:
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(train[X_cols+drop_cols], train.isFraud)

LGBMClassifier(bagging_fraction=0.7, bagging_freq=2, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.01, max_depth=-1, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2624, n_jobs=-1, num_leaves=311, num_threads=64,
               objective='xentropy', random_state=42, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0, ...)

In [44]:
y_preds = lgb_model.predict_proba(test[X.columns])

In [45]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [46]:
df_sub['isFraud'] = y_preds[:,1]

In [47]:
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000113
1,3663550,0.000302
2,3663551,0.000472
3,3663552,0.000826
4,3663553,0.000374


In [48]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

In [49]:
submission_name = '{0}_LightGBM_{1}'.format(D, mean_auc_score)

In [50]:
submission_name

'20190903_LightGBM_0.9732650305281695'

In [51]:
logging.warning("Submission name: {}".format(submission_name))

In [52]:
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=None)

In [53]:
logging.warning("End")

In [54]:
df_sub[df_sub.isFraud>0.9].shape

(5084, 2)