# LightGBM

### Libraries

In [42]:
import warnings
warnings.filterwarnings('ignore')

import datetime
import gc
import logging
import os
from collections import Counter, defaultdict

# import pandas as pd
import modin.pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GroupKFold

from utils.schemas import *
from utils.functions import *

### Logging

In [5]:
# File that receives every logging.warning(...) call made in this notebook.
LOG_NAME = 'logs/LightGBM.log'
# basicConfig opens the log file right away and raises if its folder is missing,
# so make sure the folder exists before configuring logging.
os.makedirs(os.path.dirname(LOG_NAME), exist_ok=True)
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
# An empty record followed by a banner visually separates this run from earlier ones.
logging.warning("")
logging.warning("##### New LightGBM Model #####")

### Data

In [6]:
# Load the per-feature importance table used below to choose model columns.
# NOTE(review): the file name suggests permutation importances from an earlier model run -- confirm.
df_imp = pd.read_csv('docs/20190902_PermitationImportance_Rf3.csv')

In [10]:
# Keep only the features whose recorded importance is strictly positive.
X_cols = df_imp.loc[df_imp['Importance'] > 0, 'feature'].to_list()

In [11]:
# Number of feature columns selected for the model.
len(X_cols)

267

In [12]:
# Show the first nine selected feature names.
X_cols[:9]

['R1', 'C1', 'C13', 'V258', 'C14', 'V201', 'C8', 'V199', 'V274']

In [13]:
# Folder from which the train, test and sample-submission CSV files are read.
data_folder = 'input'

In [14]:
# Read only the columns that are needed: the selected features for both sets,
# plus the target and the transaction timestamp for the training set.
train_usecols = X_cols + ['isFraud', 'TransactionDT']
train = pd.read_csv('{}/train_ft_eng_1.csv'.format(data_folder),
                    dtype=schema_ft_eng_1,
                    usecols=train_usecols)
test = pd.read_csv('{}/test_ft_eng_1.csv'.format(data_folder),
                   dtype=schema_ft_eng_1,
                   usecols=X_cols)

In [15]:
# Feature matrix and binary target taken from the training frame.
X, y = train[X_cols], train['isFraud']

In [25]:
# Columns whose combined values define one group for the group-aware CV below.
# NOTE(review): `train` is read with usecols=X_cols + ['isFraud', 'TransactionDT'],
# so these four columns must all be part of X_cols -- confirm.
id_cols = ['addr1', 'addr2', 'card1', 'P_emaildomain']

# One group label per training row, read from the groupby object's internal grouper.
# NOTE(review): `.grouper.group_info` is not part of the documented public API --
# confirm it behaves as expected with the installed pandas/modin version, and
# check how rows with missing key values are labelled.
groups = train.groupby(id_cols).grouper.group_info[0]

### Model

In [54]:
# Number of cross-validation folds.
k = 3
# Record the feature list and the fold count for this run.
logging.warning("Used columns: {}".format(X_cols))
logging.warning("Folds number: {}".format(k))

In [56]:
# K-fold splitter with k folds; it is later called with `groups` to produce the splits.
group_kfold = GroupKFold(n_splits=k)

In [57]:
# Alternative stratified splitter and the row ids it would split.
# NOTE: in this notebook both names are referenced only by a commented-out line
# in the cross-validation cell; the active loops use `group_kfold`.
train_ids = X.index
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

#### LightGBM

In [58]:
# Keyword arguments passed to lgb.LGBMClassifier throughout the notebook.
params = dict(
    # Tree shape: max_depth is overwritten later by the depth search.
    num_leaves=256,
    max_depth=-1,
    # Evaluation metric reported for the eval sets during fitting.
    metric=['AUC'],
    first_metric_only=True,
    # Upper bound on boosting rounds; replaced after cross-validation.
    n_estimators=20000,
    learning_rate=0.01,
    colsample_bytree=0.4,
    objective='xentropy',
    n_jobs=-1,
    # Row subsampling settings.
    bagging_fraction=0.8,
    bagging_freq=5,
    # Regularisation terms, both set to zero.
    lambda_l1=0,
    lambda_l2=0,
    # Every seed-like option is pinned to the same value.
    bagging_seed=42,
    random_state=42,
    seed=42,
    feature_fraction_seed=42,
    drop_seed=42,
    data_random_seed=42,
)

### Best max_depth

In [None]:
# Mark the start of the max_depth search in the log file.
logging.warning("Searching best max_depth")

In [80]:
# Single hold-out split for the depth search: the last (train, validation) pair
# produced by GroupKFold. The original looped over every split, re-slicing X and
# y each time, only to keep this last pair; slicing once is equivalent.
train_index, test_index = list(group_kfold.split(X, y, groups))[-1]
X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

# Best validation AUC reached for every candidate max_depth.
scores_dict = dict()
for i in range(5, 20):
    print('Max depth = {}'.format(i))
    logging.warning('Max depth = {}'.format(i))
    # Override max_depth on a copy so the shared `params` dict is not left
    # holding the last candidate (19) once the search finishes or is interrupted.
    lgb_model = lgb.LGBMClassifier(**dict(params, max_depth=i))
    # The validation set is listed first, so its scores are stored under 'valid_0'.
    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=100)
    scores_dict[i] = lgb_model.best_score_['valid_0']['auc']

Max depth = 5
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.876001	valid_0's auc: 0.868347
[200]	training's auc: 0.889415	valid_0's auc: 0.875198
[300]	training's auc: 0.898851	valid_0's auc: 0.880354
[400]	training's auc: 0.906328	valid_0's auc: 0.884393
Did not meet early stopping. Best iteration is:
[448]	training's auc: 0.908602	valid_0's auc: 0.885995
Max depth = 6
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.884189	valid_0's auc: 0.873093
[200]	training's auc: 0.901483	valid_0's auc: 0.880355
[300]	training's auc: 0.913456	valid_0's auc: 0.885806
[400]	training's auc: 0.921941	valid_0's auc: 0.890356
Did not meet early stopping. Best iteration is:
[448]	training's auc: 0.924902	valid_0's auc: 0.892198
Max depth = 7
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.894282	valid_0's auc: 0.876881
[200]	training's auc: 0.915056	valid_0's auc: 0.884701
[300]	trainin

In [91]:
# Tabulate the search results and pick the depth(s) with the highest validation AUC.
b = pd.DataFrame({'Depth': list(scores_dict.keys()), 'AUC': list(scores_dict.values())})
md = b.loc[b['AUC'] == b['AUC'].max(), 'Depth']
# First depth that reaches the maximum AUC.
md.values[0]

17

In [None]:
# Use the depth that achieved the highest validation AUC in the search for all later models.
params['max_depth'] = md.values[0]

In [59]:
# Classifier built from the current contents of `params`; it is fitted in the cross-validation loop.
lgb_model = lgb.LGBMClassifier(**params)

In [60]:
# Record the classifier's full parameter set, as reported by get_params(), in the log.
logging.warning("Params: {}".format(str(lgb_model.get_params())))

In [64]:
# Group-aware cross-validation: fit on each fold, remember the best validation
# AUC and the boosting round that produced it, then average both over the folds.
fold_aucs = []
fold_iterations = []
counter = 1
# Alternative (stratified) split, kept for reference:
# for train_index, test_index in skf.split(train_ids, y):
for train_index, test_index in group_kfold.split(X, y, groups):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, y_fit = X.iloc[train_index, :], y.iloc[train_index]
    X_val, y_val = X.iloc[test_index, :], y.iloc[test_index]

    # The validation set is listed first, so its scores are stored under 'valid_0'.
    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=100)

    fold_auc = lgb_model.best_score_['valid_0']['auc']
    fold_best_iteration = lgb_model.best_iteration_
    logging.warning("Best AUC in this fold: {}".format(fold_auc))
    logging.warning("Best iteration in this fold: {}".format(fold_best_iteration))
    fold_aucs.append(fold_auc)
    fold_iterations.append(fold_best_iteration)

    # Drop the fold's slices before the next split is materialised.
    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()

    counter += 1

# Totals over all folds, then the per-fold means.
auc_score = sum(fold_aucs)
iterat = sum(fold_iterations)
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.934737	valid_0's auc: 0.877548
[200]	training's auc: 0.960015	valid_0's auc: 0.88549
[300]	training's auc: 0.975274	valid_0's auc: 0.886796
Early stopping, best iteration is:
[249]	training's auc: 0.968586	valid_0's auc: 0.887004
Fold 2

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.934875	valid_0's auc: 0.886468
[200]	training's auc: 0.957847	valid_0's auc: 0.893972
[300]	training's auc: 0.97504	valid_0's auc: 0.898208
[400]	training's auc: 0.984539	valid_0's auc: 0.901038
[500]	training's auc: 0.990353	valid_0's auc: 0.901948
Did not meet early stopping. Best iteration is:
[548]	training's auc: 0.992245	valid_0's auc: 0.902299
Fold 3

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.931893	valid_0's auc: 0.888802
[200]	training's auc: 0.95858	valid_0's auc: 0.897061
[300]	training's auc: 0.974551	valid_0's auc: 0

In [65]:
# Fix the number of trees for the final model to the mean best iteration over the
# folds (int() truncates the fractional part), then display the value.
params['n_estimators'] = int(mean_iterat)
params['n_estimators']

448

In [66]:
# Final model: rebuilt from `params` and fitted on every training row.
# No eval_set or early stopping is passed here, so all n_estimators rounds are requested.
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(train[X_cols], train.isFraud)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.4,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.01, max_depth=-1, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=448, n_jobs=-1, num_leaves=256,
               objective='xentropy', random_state=42, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
               subsample_for_bin=200000, ...)

In [67]:
# Predicted class probabilities for the test rows, using the same columns (and order) as X.
y_preds = lgb_model.predict_proba(test[X.columns])

In [68]:
# Load the sample submission; its isFraud column is overwritten with the model's predictions.
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [69]:
# Write the second predict_proba column into the submission
# (presumably the probability of the positive/fraud class -- confirm via lgb_model.classes_).
# NOTE(review): assumes the rows of `test` are in the same order as the sample submission -- confirm.
df_sub['isFraud'] = y_preds[:,1]

In [70]:
# Preview the first rows of the filled-in submission.
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.003384
1,3663550,0.005506
2,3663551,0.004588
3,3663552,0.006006
4,3663553,0.005722


In [71]:
# Today's date and its compact YYYYMMDD form, used to stamp the submission file name.
today = datetime.date.today()
D = '{:%Y%m%d}'.format(today)

In [72]:
# File stem for the submission: date stamp, model name and the mean cross-validation AUC.
submission_name = '{0}_LightGBM_{1}'.format(D, mean_auc_score)

In [73]:
# Record the submission file stem in the log so the run can be matched to its output file.
logging.warning("Submission name: {}".format(submission_name))

In [74]:
# Create the output folder on first use; writing into a missing folder raises an error.
os.makedirs('submissions', exist_ok=True)
# pandas documents `index` as a boolean, so pass False explicitly (instead of None)
# to leave the row index out of the file.
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=False)

In [75]:
# Mark the end of this run in the log file.
logging.warning("End")

In [78]:
# Shape of the subset of submission rows with a predicted fraud score above 0.9.
df_sub.loc[df_sub['isFraud'] > 0.9].shape

(2947, 2)