# LightGBM

### Libraries

In [12]:
import warnings
warnings.filterwarnings('ignore')

# import pandas as pd
import modin.pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import logging
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GroupKFold, TimeSeriesSplit
import datetime

from utils.schemas import *
from utils.functions import *

### Logging

In [3]:
# Route every logging call of this notebook to one log file and write a
# two-line banner (blank line + title) marking the start of a new run.
LOG_NAME = 'logs/LightGBM.log'
logging.basicConfig(
    filename=LOG_NAME,
    level=logging.WARNING,
    format='%(asctime)s %(message)s',
)
for banner_line in ("", "##### New LightGBM Model #####"):
    logging.warning(banner_line)

### Data

In [4]:
# Feature-importance table; its `feature` and `Importance` columns are used
# below to choose the model's input columns (X_cols).
df_imp = pd.read_csv('docs/20190903_PermitationImportance_Rf4.csv')

In [24]:
# Raw D1..D15 and C1..C14 columns, each family in lexicographic order
# ('D1', 'D10', ..., 'D15', 'D2', ...). Despite the name, these columns are
# loaded and appended to the feature matrix by the cells below.
drop_cols = (
    sorted('D{}'.format(n) for n in range(1, 16))
    + sorted('C{}'.format(n) for n in range(1, 15))
)

In [5]:
# Keep only the features whose importance score is strictly positive.
# Alternative kept for reference: df_imp.feature[:400].to_list()
X_cols = df_imp.loc[df_imp['Importance'] > 0, 'feature'].to_list()

In [26]:
len(X_cols)

268

In [7]:
X_cols[:9]

['R1', 'V258', 'V201', 'V257', 'R71', 'V200', 'R43', 'V199', 'V189']

In [8]:
# Directory holding the train/test feature CSVs and sample_submission.csv.
data_folder = 'input'

In [9]:
# Columns whose value combination defines a group (see `groups`); they are
# loaded from both CSVs in addition to the feature columns.
id_cols = ['addr1', 'addr2', 'card1', 'P_emaildomain']

In [25]:
# Load only the columns the notebook needs. The training file additionally
# carries the 'isFraud' target; dtypes come from schema_ft_eng_2.
train_path = data_folder + '/train_ft_eng_2.csv'
test_path = data_folder + '/test_ft_eng_2.csv'

train = pd.read_csv(
    train_path,
    dtype=schema_ft_eng_2,
    usecols=X_cols + ['isFraud'] + id_cols + drop_cols,
)
test = pd.read_csv(
    test_path,
    dtype=schema_ft_eng_2,
    usecols=X_cols + id_cols + drop_cols,
)

In [11]:
# Per-row group labels derived from the id_cols combination; passed as
# `groups` to group_kfold.split in the max_depth search.
# NOTE(review): presumably an integer code per row; `.grouper.group_info` is
# not part of the documented groupby API — confirm it exists in the installed
# pandas/modin version and how rows with missing id values are labelled.
groups = train.groupby(id_cols).grouper.group_info[0]

In [92]:
# X_cols.remove('addr1')
# X_cols.remove('addr2')
# X_cols.remove('card1')

In [96]:
# Median of every selected feature that is not listed in cat_ft, keyed by
# column name. cat_ft is not defined in this notebook — presumably it comes
# from the utils star-imports. The fillna that would consume this dict is
# currently commented out where X is built.
imp_dict = {
    col: train[col].median()
    for col in X_cols
    if col not in cat_ft
}

In [27]:
# Feature matrix: the selected features followed by the raw D*/C* columns.
# Median imputation (.fillna(imp_dict)) is currently disabled.
X = train.loc[:, X_cols + drop_cols]
# Target column.
y = train['isFraud']

### Model

In [28]:
# Number of folds used by every splitter defined below.
k = 5

# Record the run configuration in the log file.
logging.warning("Used columns: {}".format(X_cols))
logging.warning("Folds number: {}".format(k))

In [29]:
# Group-aware splitter; used together with `groups` in the max_depth search.
group_kfold = GroupKFold(n_splits=k)

In [30]:
# Row index of X; handed to skf.split as its first argument in the CV loop.
train_ids = X.index
# Shuffled, seeded stratified splitter — the one the CV loop actually iterates.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [31]:
# Time-series splitter; only referenced from a commented-out line in the CV loop.
tscv = TimeSeriesSplit(n_splits=k)

#### LightGBM

In [32]:
# Base LightGBM settings, unpacked into lgb.LGBMClassifier(**params) by the
# cells below. 'max_depth' and 'n_estimators' are overwritten later in the
# notebook (depth search result and mean best iteration respectively).
params = {
    'num_leaves': 311,
    'max_depth': -1,
    'metric': ['AUC'],
    'first_metric_only': True,
    'n_estimators': 20000,  # upper bound only: the CV fits pass early_stopping_rounds=100
    # NOTE(review): 'num_threads' and 'n_jobs' (below) look like aliases of the
    # same LightGBM setting with conflicting values (64 vs -1) — confirm which wins.
    'num_threads': 64,
    'learning_rate': 0.01,
    'colsample_bytree': 0.3,
    'objective': 'xentropy',
    'n_jobs': -1,
    'bagging_fraction': 0.7,
    'bagging_freq': 2,
    'lambda_l1': 0,
    'lambda_l2': 0,
    # All seeds pinned to 42.
    # NOTE(review): 'random_state' and 'seed' appear to be aliases as well — confirm.
    'bagging_seed': 42,
    'random_state':42,
    'seed': 42,
    'feature_fraction_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
}

### Best max_depth

In [22]:
logging.warning("Searching best max_depth")

In [None]:
# Hold-out split for the depth search: the LAST fold produced by GroupKFold.
# Only the index arrays are materialised for the earlier folds; the original
# loop rebuilt the (large) fit/validation frames for every fold just to
# discard all but the last one.
train_index, test_index = list(group_kfold.split(X, y, groups))[-1]
X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

# Best validation AUC reached for each candidate depth, keyed by depth.
scores_dict = dict()
for i in range(5, 20):
    print('Max depth = {}'.format(i))
    logging.warning('Max depth = {}'.format(i))
    # Try the depth on a copy so the shared `params` dict is not left at the
    # last candidate (19) if the cell that applies the best depth is skipped.
    depth_params = dict(params, max_depth=i)
    lgb_model = lgb.LGBMClassifier(**depth_params)
    # The first eval_set entry is the validation data, reported as 'valid_0';
    # the training data is included only so its AUC is printed alongside.
    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=100)
    scores_dict[i] = lgb_model.best_score_['valid_0']['auc']

Max depth = 5


In [91]:
# Tabulate the depth search and keep the depth(s) whose AUC equals the maximum.
# `md` stays a Series because the next cell reads md.values[0].
b = pd.DataFrame({
    'Depth': list(scores_dict.keys()),
    'AUC': list(scores_dict.values()),
})
md = b.loc[b['AUC'] == b['AUC'].max(), 'Depth']
md.values[0]

17

In [None]:
# Apply the best depth from the search (first entry of `md`) to the shared params.
params['max_depth'] = md.values[0]

In [35]:
# Estimator built from the current params; it is refit on every fold of the CV loop.
lgb_model = lgb.LGBMClassifier(**params)

In [36]:
logging.warning("Params: {}".format(str(lgb_model.get_params())))

In [37]:
# Stratified k-fold cross-validation with early stopping. For each fold the
# best validation AUC and best iteration are logged and collected; their means
# over the k folds are logged at the end.
# Alternative splitters kept for reference:
#   group_kfold.split(X, y, groups)
#   tscv.split(X=X, y=y)
fold_aucs = []
list_iter = []
counter = 1
for train_index, test_index in skf.split(train_ids, y):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, y_fit = X.iloc[train_index, :], y.iloc[train_index]
    X_val, y_val = X.iloc[test_index, :], y.iloc[test_index]

    # Validation data comes first in eval_set, so it is reported as 'valid_0'.
    lgb_model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_val, y_val), (X_fit, y_fit)],
        verbose=100,
        early_stopping_rounds=100,
    )

    fold_auc = lgb_model.best_score_['valid_0']['auc']
    it = lgb_model.best_iteration_
    logging.warning("Best AUC in this fold: {}".format(fold_auc))
    logging.warning("Best iteration in this fold: {}".format(it))
    fold_aucs.append(fold_auc)
    list_iter.append(it)

    # Release the fold's frames before the next split is materialised.
    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()

    counter += 1

# Totals are summed in fold order, starting from 0.
auc_score = sum(fold_aucs)
iterat = sum(list_iter)
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.936755	valid_0's auc: 0.918009
[200]	training's auc: 0.962995	valid_0's auc: 0.93626
[300]	training's auc: 0.978778	valid_0's auc: 0.947616
[400]	training's auc: 0.988072	valid_0's auc: 0.955333
[500]	training's auc: 0.993126	valid_0's auc: 0.959785
[600]	training's auc: 0.995964	valid_0's auc: 0.962954
[700]	training's auc: 0.997647	valid_0's auc: 0.964918
[800]	training's auc: 0.998563	valid_0's auc: 0.966301
[900]	training's auc: 0.999135	valid_0's auc: 0.96743
[1000]	training's auc: 0.99949	valid_0's auc: 0.96831
[1100]	training's auc: 0.999698	valid_0's auc: 0.968976
[1200]	training's auc: 0.999827	valid_0's auc: 0.969497
[1300]	training's auc: 0.999898	valid_0's auc: 0.970057
[1400]	training's auc: 0.999942	valid_0's auc: 0.970406
[1500]	training's auc: 0.999967	valid_0's auc: 0.970786
[1600]	training's auc: 0.999981	valid_0's auc: 0.9711
[1700]	training's auc: 0.99999	valid_0's auc: 0.

In [38]:
mean_iterat

2624.6

In [39]:
# Fix the tree count for the final fit at the mean best iteration across the
# CV folds, truncated to an int; the final fit has no validation set to stop on.
params['n_estimators'] = int(mean_iterat)
params['n_estimators']

2624

In [43]:
# Final model: a fresh estimator with the updated params, trained on every
# training row (no eval_set, hence no early stopping).
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(train[X_cols+drop_cols], train.isFraud)

LGBMClassifier(bagging_fraction=0.7, bagging_freq=2, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.01, max_depth=-1, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2624, n_jobs=-1, num_leaves=311, num_threads=64,
               objective='xentropy', random_state=42, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0, ...)

In [44]:
# Class probabilities for the test set, with columns selected in the same order as X.
y_preds = lgb_model.predict_proba(test[X.columns])

In [45]:
# Submission template; its isFraud column is overwritten in the next cell.
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [46]:
# Column 1 of predict_proba is used as the fraud probability.
# NOTE(review): assumes the classes are ordered [0, 1] and that the test rows
# are in the same order as the template rows — confirm.
df_sub['isFraud'] = y_preds[:,1]

In [47]:
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000113
1,3663550,0.000302
2,3663551,0.000472
3,3663552,0.000826
4,3663553,0.000374


In [48]:
# Run date as a compact YYYYMMDD stamp, used as the submission-name prefix.
today = datetime.date.today()
D = '{:%Y%m%d}'.format(today)

In [49]:
# File stem: run date followed by the mean CV AUC at full float precision.
submission_name = '{0}_LightGBM_{1}'.format(D, mean_auc_score)

In [50]:
submission_name

'20190903_LightGBM_0.9732650305281695'

In [51]:
logging.warning("Submission name: {}".format(submission_name))

In [52]:
# Write the submission file under submissions/.
# NOTE(review): index=None is presumably treated like index=False (no index column) — confirm.
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=None)

In [53]:
logging.warning("End")

In [54]:
df_sub[df_sub.isFraud>0.9].shape

(5084, 2)