# LightGBM

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import datetime
import gc
import logging
import os
from collections import Counter, defaultdict

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
# Log file for this notebook; every message below goes through the root logger.
LOG_NAME = 'logs/LightGBM.log'

# basicConfig opens the log file right away and fails with FileNotFoundError
# when the parent directory is missing (e.g. on a fresh checkout), so make
# sure it exists before configuring logging.
os.makedirs(os.path.dirname(LOG_NAME) or '.', exist_ok=True)

# NOTE(review): all run messages in this notebook are emitted at WARNING level,
# which is why the threshold is WARNING rather than INFO.
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')

# Empty line + banner mark the start of a new run in the log file.
logging.warning("")
logging.warning("##### New LightGBM Model #####")

### Data

In [3]:
# Feature-importance table (dated 2019-08-31 per its filename); its `feature`
# column is what the column selection in the next cell reads.
importances_path = 'docs/ft_importances_20190831.csv'
df_imp = pd.read_csv(importances_path)

In [4]:
# Use the first 50 entries of the `feature` column as the model inputs.
# NOTE(review): presumably the file is already sorted by descending importance,
# so these are the top-50 features -- confirm against how the CSV was written.
X_cols = df_imp['feature'].head(50).tolist()

In [5]:
# Sanity check: number of feature columns selected above.
len(X_cols)

50

In [7]:
# Directory that holds the train/test feature CSVs and the sample submission.
data_folder = 'input'

In [8]:
# Load only the columns that are needed, with dtypes taken from `schema_ft_eng_1`
# (presumably provided by the `utils.schemas` star import -- verify).
# The training frame additionally carries the target and TransactionDT.
# NOTE(review): TransactionDT is not referenced by the other cells visible here
# unless it is also one of X_cols.
train_usecols = X_cols + ['isFraud', 'TransactionDT']
train = pd.read_csv(
    data_folder + '/train_ft_eng_1.csv',
    usecols=train_usecols,
    dtype=schema_ft_eng_1,
)
test = pd.read_csv(
    data_folder + '/test_ft_eng_1.csv',
    usecols=X_cols,
    dtype=schema_ft_eng_1,
)

In [9]:
# Target series and feature matrix used by the cross-validation below.
y = train['isFraud']
X = train.loc[:, X_cols]

### Model

In [10]:
# Number of cross-validation folds.
k = 5

# Record the run configuration in the log: feature list first, then fold count.
for template, value in (("Used columns: {}", X_cols), ("Folds number: {}", k)):
    logging.warning(template.format(value))

In [11]:
# Shuffled, seeded stratified splitter with k folds.
cv_settings = {'n_splits': k, 'shuffle': True, 'random_state': 42}
skf = StratifiedKFold(**cv_settings)

# Row labels of the feature matrix; the splitter is run over these.
train_ids = X.index

#### LightGBM

In [12]:
# Hyper-parameters passed to lgb.LGBMClassifier.
# Model / boosting / sampling settings come first ...
params = dict(
    num_leaves=91,
    max_depth=11,
    metric=['AUC'],
    first_metric_only=True,
    n_estimators=20000,
    learning_rate=0.01,
    colsample_bytree=0.3,
    objective='xentropy',
    n_jobs=-1,
    bagging_fraction=0.8,
    bagging_freq=5,
    lambda_l1=0,
    lambda_l2=0,
)

# ... followed by every seed-style key, all pinned to the same value and
# appended in a fixed order so the resulting dict keeps a stable key order.
params.update(dict.fromkeys(
    [
        'bagging_seed',
        'random_state',
        'seed',
        'feature_fraction_seed',
        'drop_seed',
        'data_random_seed',
    ],
    42,
))

In [13]:
# Classifier built from the params dict; this instance is refit on every CV fold below.
lgb_model = lgb.LGBMClassifier(**params)

In [14]:
# Log the full parameter set as reported by the estimator itself.
model_params = lgb_model.get_params()
logging.warning("Params: {}".format(model_params))

In [15]:
# Stratified k-fold cross-validation.
# `lgb_model` is refit on each fold with early stopping on the held-out part;
# the best validation AUC and the iteration that produced it are summed so
# their means can be reported and reused further down.
auc_score = 0
iterat = 0
for counter, (train_index, test_index) in enumerate(skf.split(train_ids, y), start=1):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

    # The validation pair is listed first, so its results are stored under
    # 'valid_0' (the key read back below); the training pair is only monitored.
    # NOTE(review): LightGBM 4.x removed the `verbose` / `early_stopping_rounds`
    # fit arguments in favour of callbacks -- confirm the installed version
    # before upgrading.
    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=50)

    # Read the fold results once, then log and accumulate them.
    fold_auc = lgb_model.best_score_['valid_0']['auc']
    fold_best_iteration = lgb_model.best_iteration_
    logging.warning("Best AUC in this fold: {}".format(fold_auc))
    logging.warning("Best iteration in this fold: {}".format(fold_best_iteration))
    auc_score += fold_auc
    iterat += fold_best_iteration

    # Drop the per-fold slices before the next fold builds its own.
    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()

# Averages over the k folds; both are reused by later cells.
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.908197	valid_0's auc: 0.899146
[200]	training's auc: 0.924045	valid_0's auc: 0.912624
[300]	training's auc: 0.935257	valid_0's auc: 0.921827
[400]	training's auc: 0.944456	valid_0's auc: 0.928706
[500]	training's auc: 0.951277	valid_0's auc: 0.93333
[600]	training's auc: 0.956934	valid_0's auc: 0.937216
[700]	training's auc: 0.961556	valid_0's auc: 0.939998
[800]	training's auc: 0.965618	valid_0's auc: 0.942628
[900]	training's auc: 0.968932	valid_0's auc: 0.94467
[1000]	training's auc: 0.971986	valid_0's auc: 0.946523
[1100]	training's auc: 0.974641	valid_0's auc: 0.948195
[1200]	training's auc: 0.977048	valid_0's auc: 0.949511
[1300]	training's auc: 0.979119	valid_0's auc: 0.950664
[1400]	training's auc: 0.98101	valid_0's auc: 0.951758
[1500]	training's auc: 0.982793	valid_0's auc: 0.952794
[1600]	training's auc: 0.984385	valid_0's auc: 0.95383
[1700]	training's auc: 0.985903	valid_0's auc: 

[1600]	training's auc: 0.984321	valid_0's auc: 0.956182
[1700]	training's auc: 0.985744	valid_0's auc: 0.956876
[1800]	training's auc: 0.986965	valid_0's auc: 0.957474
[1900]	training's auc: 0.988075	valid_0's auc: 0.958191
[2000]	training's auc: 0.989064	valid_0's auc: 0.958761
[2100]	training's auc: 0.989981	valid_0's auc: 0.959349
[2200]	training's auc: 0.990758	valid_0's auc: 0.959892
[2300]	training's auc: 0.991595	valid_0's auc: 0.960441
[2400]	training's auc: 0.992305	valid_0's auc: 0.960945
[2500]	training's auc: 0.992908	valid_0's auc: 0.961352
[2600]	training's auc: 0.993486	valid_0's auc: 0.961803
[2700]	training's auc: 0.99402	valid_0's auc: 0.962228
[2800]	training's auc: 0.994493	valid_0's auc: 0.962569
[2900]	training's auc: 0.994922	valid_0's auc: 0.962945
[3000]	training's auc: 0.995326	valid_0's auc: 0.963202
[3100]	training's auc: 0.995683	valid_0's auc: 0.963545
[3200]	training's auc: 0.996048	valid_0's auc: 0.963812
[3300]	training's auc: 0.996392	valid_0's auc: 0.

[1000]	training's auc: 0.972178	valid_0's auc: 0.94877
[1100]	training's auc: 0.974854	valid_0's auc: 0.950261
[1200]	training's auc: 0.977109	valid_0's auc: 0.951568
[1300]	training's auc: 0.979282	valid_0's auc: 0.953009
[1400]	training's auc: 0.981238	valid_0's auc: 0.954212
[1500]	training's auc: 0.983	valid_0's auc: 0.95536
[1600]	training's auc: 0.984512	valid_0's auc: 0.956181
[1700]	training's auc: 0.98596	valid_0's auc: 0.956895
[1800]	training's auc: 0.987214	valid_0's auc: 0.9576
[1900]	training's auc: 0.988349	valid_0's auc: 0.958302
[2000]	training's auc: 0.989323	valid_0's auc: 0.958915
[2100]	training's auc: 0.99028	valid_0's auc: 0.959502
[2200]	training's auc: 0.991101	valid_0's auc: 0.96001
[2300]	training's auc: 0.99191	valid_0's auc: 0.96058
[2400]	training's auc: 0.992595	valid_0's auc: 0.961155
[2500]	training's auc: 0.99324	valid_0's auc: 0.961641
[2600]	training's auc: 0.993822	valid_0's auc: 0.961964
[2700]	training's auc: 0.994363	valid_0's auc: 0.962333
[2800

In [19]:
# Replace the 20000-round cap with the mean best iteration from cross-validation,
# truncated to an integer.
# NOTE: this changes the shared `params` dict in place, so re-creating the CV
# classifier from `params` after this cell has run would pick up the reduced value.
params.update(n_estimators=int(mean_iterat))
params['n_estimators']

6893

In [20]:
# Final model: a fresh classifier built from the current `params`, fitted on
# the whole training set. No eval set or early stopping is passed here, so it
# trains for exactly n_estimators rounds.
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(train.loc[:, X_cols], train['isFraud'])

LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.01, max_depth=11, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=6893, n_jobs=-1, num_leaves=91,
               objective='xentropy', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
               subsample_for_bin=200000, ...)

In [21]:
# Class probabilities for the test set, using the same columns (and column
# order) as the training feature matrix.
test_features = test.loc[:, X.columns]
y_preds = lgb_model.predict_proba(test_features)

In [22]:
# Submission template; its isFraud column is overwritten with predictions below.
sample_submission_path = data_folder + '/sample_submission.csv'
df_sub = pd.read_csv(sample_submission_path)

In [23]:
# Column 1 of predict_proba holds the probability of the second class.
# NOTE(review): the assignment is positional -- it assumes the test CSV rows are
# in the same order as the sample submission; confirm, since the test frame was
# loaded without an ID column to check against.
fraud_proba = y_preds[:, 1]
df_sub['isFraud'] = fraud_proba

In [24]:
# Preview the first rows of the submission frame.
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,7.8e-05
1,3663550,0.001336
2,3663551,0.000121
3,3663552,0.001579
4,3663553,0.000823


In [25]:
# Date stamp (YYYYMMDD) for the day the notebook is run; used in the submission name.
today = datetime.date.today()
D = format(today, '%Y%m%d')

In [26]:
# File stem for the submission: run date, model tag, then the mean CV AUC
# (written with its full default float representation).
name_template = '{0}_LightGBM_{1}'
submission_name = name_template.format(D, mean_auc_score)

In [27]:
# Record the submission name in the run log.
logging.warning("Submission name: {}".format(submission_name))

In [28]:
# Write the submission CSV without the DataFrame index.
submission_path = 'submissions/{}.csv'.format(submission_name)

# to_csv does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout) instead of failing after the long training run.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# `index` is documented as a bool; False states the intent that None only
# achieved by being falsy.
df_sub.to_csv(submission_path, sep=',', header=True, index=False)

In [29]:
# Marks the end of this run in the log file.
logging.warning("End")

In [37]:
# Shape (rows, columns) of the submission rows whose predicted fraud
# probability exceeds 0.99.
df_sub.loc[df_sub['isFraud'] > 0.99].shape

(910, 2)