# LightGBM

### Libraries

In [30]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import logging
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import datetime

from utils.schemas import *
from utils.functions import *

### Logging

In [14]:
LOG_NAME = 'logs/LightGBM.log'
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
logging.warning("")
logging.warning("##### New LightGBM Model #####")

### Data

In [15]:
df_imp = pd.read_csv('docs/ft_importances_20190831.csv')

In [16]:
X_cols = df_imp[df_imp.importance > 0.08].feature.to_list()

In [17]:
len(X_cols)

227

In [18]:
data_folder = 'input'

In [19]:
train = pd.read_csv(data_folder+'/train_ft_eng_1.csv', dtype = schema_ft_eng_1, usecols=X_cols+['isFraud', 'TransactionDT'])
test = pd.read_csv(data_folder+'/test_ft_eng_1.csv', dtype = schema_ft_eng_1, usecols=X_cols)

In [20]:
X = train[X_cols]
y = train.isFraud

### Model

In [21]:
logging.warning("Used columns: {}".format(X_cols))
k = 5
logging.warning("Folds number: {}".format(k))

In [23]:
train_ids = X.index
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

#### LightGBM

In [24]:
params = {
    'num_leaves': 351,
    'metric': ['AUC'],
    'first_metric_only': True,
    'n_estimators': 20000,
    'learning_rate': 0.01,
    'colsample_bytree': 0.5,
    'objective': 'xentropy',
    'n_jobs': -1,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'bagging_seed': 42,
    'seed': 42,
    'feature_fraction_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
}

In [25]:
lgb_model = lgb.LGBMClassifier(**params)

In [28]:
logging.warning("Params: {}".format(str(lgb_model.get_params())))

In [37]:
counter = 1
auc_score = 0
iterat = 0
for train_index, test_index in skf.split(train_ids, y):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=50)
    
    logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
    logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
    auc_score += lgb_model.best_score_['valid_0']['auc']
    iterat += lgb_model.best_iteration_

    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()
    
    counter += 1
    
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.941268	valid_0's auc: 0.919339
[200]	training's auc: 0.966362	valid_0's auc: 0.935667
[300]	training's auc: 0.98354	valid_0's auc: 0.948619
[400]	training's auc: 0.991911	valid_0's auc: 0.955736
[500]	training's auc: 0.996108	valid_0's auc: 0.960154
[600]	training's auc: 0.998089	valid_0's auc: 0.962689
[700]	training's auc: 0.999035	valid_0's auc: 0.964604
[800]	training's auc: 0.999511	valid_0's auc: 0.965947
[900]	training's auc: 0.999736	valid_0's auc: 0.967118
[1000]	training's auc: 0.999855	valid_0's auc: 0.968205
[1100]	training's auc: 0.999921	valid_0's auc: 0.968861
[1200]	training's auc: 0.999953	valid_0's auc: 0.969519
[1300]	training's auc: 0.99997	valid_0's auc: 0.970023
[1400]	training's auc: 0.999978	valid_0's auc: 0.970542
[1500]	training's auc: 0.999981	valid_0's auc: 0.970929
[1600]	training's auc: 0.999983	valid_0's auc: 0.97132
[1700]	training's auc: 0.999983	valid_0's auc:

In [38]:
params['n_estimators'] = int(mean_iterat*1.1)
params['n_estimators']

2052

In [39]:
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(train[X_cols], train.isFraud)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.01, max_depth=-1, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2052, n_jobs=-1, num_leaves=351,
               objective='xentropy', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
               subsample_for_bin=200000, ...)

In [40]:
y_preds = lgb_model.predict_proba(test[X.columns])

In [41]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [42]:
df_sub['isFraud'] = y_preds[:,1]

In [43]:
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000243
1,3663550,0.001092
2,3663551,0.000159
3,3663552,0.001502
4,3663553,0.000712


In [44]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

In [45]:
submission_name = '{0}_LightGBM_{1}'.format(D, mean_auc_score)

In [46]:
logging.warning("Submission name: {}".format(submission_name))

In [47]:
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=None)

In [48]:
logging.warning("End")