# LightGBM

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import datetime
import gc
import logging
import os
from collections import Counter, defaultdict

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
# File that collects the parameters and scores logged by this notebook.
LOG_NAME = 'logs/LightGBM.log'

# logging.basicConfig opens the log file right away and raises
# FileNotFoundError when its parent directory is missing (e.g. on a fresh
# checkout), so make sure the directory exists first.
os.makedirs(os.path.dirname(LOG_NAME), exist_ok=True)

# Every message in this notebook is emitted with logging.warning, so WARNING
# is the lowest level that has to reach the file.
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')

# An empty line followed by a banner marks the start of a new run in the log.
logging.warning("")
logging.warning("##### New LightGBM Model #####")

### Data

In [3]:
# Feature ranking used to pick the model inputs. Judging by the file name it
# holds permutation importances from an earlier model — TODO confirm.
df_imp = pd.read_csv('docs/20190901_PermitationImportance_Rf1.csv')

In [4]:
# Keep the first 50 entries of the `feature` column as model inputs.
# NOTE(review): this assumes the file is already sorted by importance — confirm.
X_cols = df_imp['feature'].head(50).tolist()

In [5]:
# Sanity check: number of selected feature columns.
len(X_cols)

50

In [28]:
# Display-only cell listing the selected features. Its execution count (28)
# shows it was run after the rest of the notebook, not in reading order.
X_cols

['R1',
 'C1',
 'C13',
 'C14',
 'R12',
 'C10',
 'N3',
 'card2_fe2',
 'TransactionAmt',
 'C2',
 'N5',
 'V90',
 'card6_fe1',
 'card2',
 'addr1',
 'V283',
 'addr1_fe1',
 'R3',
 'R4',
 'V45',
 'card1_fe2',
 'R11',
 'id_31',
 'card1',
 'C11',
 'id_31_fe1',
 'card1_fe1',
 'C5',
 'V282',
 'addr2',
 'N1',
 'V308',
 'card2_fe1',
 'V310',
 'D1',
 'V91',
 'R_emaildomain_0_fe2',
 'R20',
 'addr2_fe1',
 'card3',
 'N2',
 'id_17_fe1',
 'V233',
 'N4',
 'id_31_fe2',
 'V201',
 'R2',
 'V258',
 'M4_fe2',
 'N6']

In [6]:
# Directory (relative to the notebook) that holds the train/test archives and
# the sample submission read below.
data_folder = 'input'

In [8]:
# Load only the columns that are needed to keep memory usage down.
# `schema_ft_eng_1` is presumably provided by the `utils.schemas` star import.
# NOTE(review): 'TransactionDT' is loaded for train but is not referenced by
# any other cell visible in this notebook.
train_usecols = X_cols + ['isFraud', 'TransactionDT']
train = pd.read_csv(
    '{}/train_ft_eng_1.zip'.format(data_folder),
    usecols=train_usecols,
    dtype=schema_ft_eng_1,
)
test = pd.read_csv(
    '{}/test_ft_eng_1.zip'.format(data_folder),
    usecols=X_cols,
    dtype=schema_ft_eng_1,
)

In [9]:
# Target vector and feature matrix used for cross-validation.
y = train['isFraud']
X = train.loc[:, X_cols]

### Model

In [10]:
# Number of cross-validation folds.
k = 5

# Record the experiment set-up (inputs and fold count) in the run log.
logging.warning("Used columns: %s", X_cols)
logging.warning("Folds number: %s", k)

In [11]:
# Shuffled, seeded stratified splitter. The splits are generated over the row
# index of X, stratified on y, in the cross-validation loop.
skf = StratifiedKFold(
    n_splits=k,
    shuffle=True,
    random_state=42,
)
train_ids = X.index

#### LightGBM

In [12]:
# Hyper-parameters handed to lgb.LGBMClassifier. The key order is kept as-is
# because it determines the order in which the parameters appear in the log.
params = dict(
    # Tree shape
    num_leaves=91,
    max_depth=11,
    # Evaluation metric
    metric=['AUC'],
    first_metric_only=True,
    # Upper bound on boosting rounds; early stopping in the CV loop picks the
    # effective number, which later overwrites this value.
    n_estimators=20000,
    learning_rate=0.01,
    colsample_bytree=0.3,
    objective='xentropy',
    n_jobs=-1,
    # Row sub-sampling
    bagging_fraction=0.8,
    bagging_freq=5,
    # Regularisation (disabled)
    lambda_l1=0,
    lambda_l2=0,
    # Every seed is pinned to the same value for reproducibility.
    bagging_seed=42,
    random_state=42,
    seed=42,
    feature_fraction_seed=42,
    drop_seed=42,
    data_random_seed=42,
)

In [13]:
# Estimator that the cross-validation loop re-fits on every fold.
lgb_model = lgb.LGBMClassifier(**params)

In [14]:
# Log the full parameter set (explicit values plus estimator defaults).
logging.warning("Params: %s", str(lgb_model.get_params()))

In [15]:
# Stratified k-fold cross-validation.
# For every fold the model is fitted with early stopping on the held-out part;
# the best validation AUC and best iteration are logged and collected so that
# their means can be reported (and reused by later cells) afterwards.
fold_aucs = []
fold_best_iters = []

for counter, (fit_idx, val_idx) in enumerate(skf.split(train_ids, y), start=1):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold %s", counter)

    X_fit, y_fit = X.iloc[fit_idx, :], y.iloc[fit_idx]
    X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]

    # The validation set is listed first, so its results are stored under
    # 'valid_0'; the training set is included only to monitor overfitting.
    lgb_model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_val, y_val), (X_fit, y_fit)],
        verbose=100,
        early_stopping_rounds=50,
    )

    fold_auc = lgb_model.best_score_['valid_0']['auc']
    fold_best_iter = lgb_model.best_iteration_
    logging.warning("Best AUC in this fold: %s", fold_auc)
    logging.warning("Best iteration in this fold: %s", fold_best_iter)
    fold_aucs.append(fold_auc)
    fold_best_iters.append(fold_best_iter)

    # Drop the fold slices before the next iteration to limit peak memory.
    del X_fit, X_val, y_fit, y_val, fit_idx, val_idx
    gc.collect()

# Averages over the k folds; both are consumed by later cells.
mean_auc_score = sum(fold_aucs) / k
mean_iterat = sum(fold_best_iters) / k

logging.warning("Mean AUC in %s folds: %s", k, mean_auc_score)
logging.warning("Mean iterations in %s folds: %s", k, mean_iterat)

Fold 1

Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.901311	valid_0's auc: 0.894143
[200]	training's auc: 0.91718	valid_0's auc: 0.90679
[300]	training's auc: 0.930636	valid_0's auc: 0.917193
[400]	training's auc: 0.94079	valid_0's auc: 0.924574
[500]	training's auc: 0.948169	valid_0's auc: 0.930591
[600]	training's auc: 0.954032	valid_0's auc: 0.934998
[700]	training's auc: 0.958442	valid_0's auc: 0.938292
[800]	training's auc: 0.962137	valid_0's auc: 0.940951
[900]	training's auc: 0.965408	valid_0's auc: 0.943348
[1000]	training's auc: 0.968203	valid_0's auc: 0.944965
[1100]	training's auc: 0.970906	valid_0's auc: 0.946892
[1200]	training's auc: 0.97327	valid_0's auc: 0.948341
[1300]	training's auc: 0.97548	valid_0's auc: 0.949865
[1400]	training's auc: 0.977409	valid_0's auc: 0.951181
[1500]	training's auc: 0.979172	valid_0's auc: 0.952413
[1600]	training's auc: 0.980528	valid_0's auc: 0.95332
[1700]	training's auc: 0.98191	valid_0's auc: 0.9

[400]	training's auc: 0.94044	valid_0's auc: 0.927364
[500]	training's auc: 0.947911	valid_0's auc: 0.933344
[600]	training's auc: 0.953779	valid_0's auc: 0.937895
[700]	training's auc: 0.958095	valid_0's auc: 0.940974
[800]	training's auc: 0.96211	valid_0's auc: 0.94383
[900]	training's auc: 0.965425	valid_0's auc: 0.945984
[1000]	training's auc: 0.968215	valid_0's auc: 0.947823
[1100]	training's auc: 0.970657	valid_0's auc: 0.94937
[1200]	training's auc: 0.973073	valid_0's auc: 0.950841
[1300]	training's auc: 0.975108	valid_0's auc: 0.952103
[1400]	training's auc: 0.97695	valid_0's auc: 0.953264
[1500]	training's auc: 0.978674	valid_0's auc: 0.954478
[1600]	training's auc: 0.980135	valid_0's auc: 0.955331
[1700]	training's auc: 0.981532	valid_0's auc: 0.956317
[1800]	training's auc: 0.982965	valid_0's auc: 0.957052
[1900]	training's auc: 0.984148	valid_0's auc: 0.957779
[2000]	training's auc: 0.985282	valid_0's auc: 0.958529
[2100]	training's auc: 0.986332	valid_0's auc: 0.959201
[22

[1100]	training's auc: 0.970534	valid_0's auc: 0.951109
[1200]	training's auc: 0.972998	valid_0's auc: 0.952718
[1300]	training's auc: 0.975056	valid_0's auc: 0.954017
[1400]	training's auc: 0.977073	valid_0's auc: 0.955061
[1500]	training's auc: 0.97875	valid_0's auc: 0.956146
[1600]	training's auc: 0.980407	valid_0's auc: 0.957084
[1700]	training's auc: 0.981819	valid_0's auc: 0.957973
[1800]	training's auc: 0.983037	valid_0's auc: 0.958723
[1900]	training's auc: 0.984329	valid_0's auc: 0.959472
[2000]	training's auc: 0.985344	valid_0's auc: 0.960191
[2100]	training's auc: 0.986409	valid_0's auc: 0.960828
[2200]	training's auc: 0.987421	valid_0's auc: 0.961391
[2300]	training's auc: 0.988245	valid_0's auc: 0.961874
[2400]	training's auc: 0.989057	valid_0's auc: 0.962427
[2500]	training's auc: 0.989787	valid_0's auc: 0.962961
[2600]	training's auc: 0.990407	valid_0's auc: 0.963374
[2700]	training's auc: 0.991045	valid_0's auc: 0.963767
[2800]	training's auc: 0.991653	valid_0's auc: 0.

In [16]:
# Replace the round limit with the (truncated) mean best iteration found by
# cross-validation; the final model is trained without early stopping.
params.update(n_estimators=int(mean_iterat))
params['n_estimators']

6838

In [17]:
# Final model: same hyper-parameters, fixed number of rounds, trained on the
# whole training set (no validation split, hence no early stopping).
lgb_model = lgb.LGBMClassifier(**params).fit(train.loc[:, X_cols], train['isFraud'])
lgb_model

LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.01, max_depth=11, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=6838, n_jobs=-1, num_leaves=91,
               objective='xentropy', random_state=42, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
               subsample_for_bin=200000, ...)

In [18]:
# Class probabilities for the test set, using the same feature columns (and
# column order) as the training matrix X.
test_features = test[X.columns]
y_preds = lgb_model.predict_proba(test_features)

In [19]:
# Submission template; its `isFraud` column is overwritten with predictions.
df_sub = pd.read_csv('{}/sample_submission.csv'.format(data_folder))

In [20]:
# Second column of predict_proba is written as the `isFraud` score.
# NOTE(review): assumes the template rows are in the same order as the rows
# of `test` (no ID column is loaded to check this) — confirm.
df_sub = df_sub.assign(isFraud=y_preds[:, 1])

In [21]:
# Quick visual check of the filled-in submission frame.
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,1.4e-05
1,3663550,0.001307
2,3663551,0.000176
3,3663552,0.002968
4,3663553,0.000423


In [22]:
# Date stamp (YYYYMMDD) used as the prefix of the submission file name.
today = datetime.date.today()
D = '{:%Y%m%d}'.format(today)

In [23]:
# File name pattern: <date>_LightGBM_<mean CV AUC>.
submission_name = '_'.join([D, 'LightGBM', str(mean_auc_score)])

In [24]:
# Tie the log entry of this run to the submission file it produced.
logging.warning("Submission name: %s", submission_name)

In [25]:
# Write the submission file. DataFrame.to_csv does not create missing parent
# directories, so make sure the output folder exists first (e.g. on a fresh
# checkout); otherwise the whole run is lost at its very last step.
os.makedirs('submissions', exist_ok=True)

# The row index is not written to the file.
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=False)

In [26]:
# Marks the end of this run in the log file.
logging.warning("End")

In [27]:
# Shape of the subset of submission rows scored above 0.99, i.e. how many
# transactions the model flags as near-certain fraud.
high_score_mask = df_sub['isFraud'] > 0.99
df_sub.loc[high_score_mask].shape

(1659, 2)