# LightGBM

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
# import modin.pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import logging
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GroupKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import datetime

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
LOG_NAME = 'logs/LightGBM.log'
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
logging.warning("")
logging.warning("##### New LightGBM Model #####")

### Data

In [3]:
df_imp = pd.read_csv('docs/20190906_PermitationImportance_Rf5.csv')

In [3]:
drop_cols = ['D1','D10','D11','D12','D13','D14','D15','D2','D3','D4','D5','D6',
             'D7','D8','D9','C1','C10','C11','C12','C13','C14','C2','C3','C4','C5','C6','C7','C8','C9']

In [5]:
X_cols = df_imp[df_imp.Importance > 0].feature.to_list()
# X_cols = df_imp.feature[:100].to_list()

In [4]:
len(X_cols)

NameError: name 'X_cols' is not defined

In [7]:
X_cols[:9]

['R1', 'V258', 'C7', 'V45', 'C1', 'C12', 'C8', 'C4', 'V257']

In [3]:
data_folder = 'input'

In [4]:
id1 = ['addr1', 'addr2', 'card1', 'day']

In [5]:
id2 = ['card1', 'card2', 'card3', 'card5']

In [6]:
id_cols = ['addr1', 'addr2', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6']

In [7]:
group_cols = id2

In [8]:
# train = pd.read_csv(data_folder+'/train_syn_ft_eng_1.csv', dtype = schema_synthetic_ft_eng_1, usecols=list(set(X_cols+['isFraud']+id_cols)))
# test = pd.read_csv(data_folder+'/test_syn_ft_eng_1.csv', dtype = schema_synthetic_ft_eng_1, usecols=list(set(X_cols+['isFraud']+id_cols)))
train = pd.read_csv(data_folder+'/train_ft_eng_3.csv', dtype = schema_ft_eng_3)
test = pd.read_csv(data_folder+'/test_ft_eng_3.csv', dtype = schema_ft_eng_3)

In [121]:
# train_1 = train[train.isFraud == 1]
# train_1.shape

# train_0 = train[train.isFraud == 0].sample(train_1.shape[0]*3, random_state=42)

# mini_train = pd.concat([train_1, train_0], axis=0).reset_index(drop=True)

(9025, 677)

In [18]:
# train['day'] = np.floor((train['TransactionDT'] / (3600 * 24) - 1))
# test['day'] = np.floor((test['TransactionDT'] / (3600 * 24) - 1))

In [9]:
groups = train.groupby(['month']).grouper.group_info[0]

In [96]:
imp_dict = dict()
for c in X_cols:
    if c not in cat_ft:
        imp_dict[c] = train[c].median()

In [12]:
dc = ['C1','C10','C11','C12','C13','C14','C2','C3','C4','C5','C6','C7',
      'C8','C9','D1','D10','D11','D12','D13','D14','D15','D2','D3','D4','D5','D6','D7','D8',]

In [48]:
drop_cols = ['date',
 'day','isFraud',
 'month',
 'year','date_fe1','date_fe2','TransactionDT','addr1',
 'addr2',
 'card_0',
 'card_1',
 'card_2',
 'card_3',
 'card_4',
 'card_5',
 'card_6',
 'card_7',
 'card_8',
 'card_9',
 'card_10',
 'card_11',
 'card_12',
 'card_13',
 'card_14',
 'card_15',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6','dist1','card1_fe1',
 'card2_fe1',
 'card3_fe1',
 'card4_fe1',
 'card5_fe1',
 'card6_fe1',
 'addr1_fe1',
 'addr2_fe1','card_fe1','card1_fe2',
 'card2_fe2',
 'card3_fe2',
 'card4_fe2',
 'card5_fe2',
 'card6_fe2',
 'addr1_fe2',
 'addr2_fe2','card_fe2','addr',] + dc

In [49]:
X_cols = [x for x in train.columns if x not in drop_cols]

In [50]:
# X_cols = [x for x in X_cols if x not in ['day']+id_cols]

In [51]:
# X_cols = [x for x in X_cols if x not in ['P_emaildomain_0', 'device_version_fe1', 'N17', 'N9', 'R_emaildomain_0', 'device_name_fe1', 'N7', 'R_emaildomain_1', 'N22', 'N10', 'N12', 'R_emaildomain_0_fe1', 'P_emaildomain_0_fe1', 'proc_id_30_0_fe1', 'N13', 'R_emaildomain_1_fe1', 'R_emaildomain_1_fe2', 'P_emaildomain_0_fe2', 'N21', 'proc_id_30_1_fe1', 'proc_id_31_0_fe1', 'proc_id_31_0_fe2', 'device_name', 'device_version']]

In [32]:
new_cols = [x for x in train.columns if 'M' in x or 'D' in x or 'C' in x]

In [33]:
for c in new_cols:
    train[c] = train[c] / train.groupby(['month'])[c].transform('mean')

In [61]:
X = train[X_cols]#.fillna(imp_dict)
y = train.isFraud

In [62]:
X_test = test[X_cols]
y_test = test.isFraud

In [63]:
gc.collect()

180

### Model

In [37]:
logging.warning("Used columns: {}".format(X_cols))
k = 6
logging.warning("Folds number: {}".format(k))

In [38]:
group_kfold = GroupKFold(n_splits=k)

In [39]:
train_ids = X.index
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [40]:
tscv = TimeSeriesSplit(n_splits=k)

#### LightGBM

In [55]:
params = {
    'num_leaves': 256,
    'max_depth': 15,
    'max_leaf_nodes': 45,
    'min_sample_leaf': 20,
    'metric': ['AUC'],
    'first_metric_only': True,
    'n_estimators': 50000,
    'num_threads': 56,
    'learning_rate': 0.01,
    'colsample_bytree': 0.4,
    'objective': 'xentropy',
#     'n_jobs': -1,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'importance_type': 'gain',
#     'lambda_l1': 0.05,
#     'lambda_l2': 0.05,
    'bagging_seed': 42,
    'random_state':42,
    'seed': 42,
    'feature_fraction_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
}

### Best max_depth

In [165]:
# logging.warning("Searching best max_depth")

# for train_index, test_index in group_kfold.split(X, y, groups):
#     X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
#     y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

# scores_dict = dict()
# for i in range(5, 20):
#     print('Max depth = {}'.format(i))
#     logging.warning('Max depth = {}'.format(i))
#     params['max_depth'] = i
#     lgb_model = lgb.LGBMClassifier(**params)
#     lgb_model.fit(X_fit,
#                   y_fit,
#                   eval_set=[(X_val, y_val), (X_fit, y_fit)],
#                   verbose=100,
#                   early_stopping_rounds=40)
#     scores_dict[i] = lgb_model.best_score_['valid_0']['auc']

# b = pd.DataFrame({'Depth': [x for x in scores_dict.keys()], 'AUC': [x for x in scores_dict.values()]})
# md = b[b.AUC == b.AUC.max()].Depth
# md.values[0]

# params['max_depth'] = md.values[0]

In [23]:
# lgb_model = lgb.LGBMClassifier(**params)

In [24]:
# logging.warning("Params: {}".format(str(lgb_model.get_params())))

In [34]:
groups_test = test.groupby(group_cols).grouper.group_info[0]

In [64]:
fold_strategy_skf = skf.split(train_ids, y)
fold_strategy_gkf = group_kfold.split(X, y, train.month)
fold_strategy_tss = tscv.split(X=X, y=y)

# fold_strategy_test = group_kfold.split(X_test, y_test, groups_test)

In [65]:
scaler = MinMaxScaler()

In [66]:
gc.collect()

0

In [67]:
def make_predictions(fold_strategy):
    lgb_model = lgb.LGBMClassifier(**params)
    logging.warning("Params: {}".format(str(lgb_model.get_params())))
    counter = 1
    auc_score = 0
    iterat = 0
    list_iter = list()
    y_preds = np.zeros(X_test.shape[0])
    importances = np.zeros(X_test.shape[1])

    for train_index, test_index in fold_strategy:
        print('Fold {}'.format(counter))
        logging.warning("Training fold {}".format(counter))

        X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
        y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

        lgb_model.fit(X_fit,
                      y_fit,
    #                   eval_set=[(X_val, y_val), (X_fit, y_fit)],
                      eval_set=[(X_val, y_val)],
                      verbose=200,
                      early_stopping_rounds=40)

        logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
        logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
        auc_score += lgb_model.best_score_['valid_0']['auc']
        it = lgb_model.best_iteration_
        iterat += it
        list_iter.append(it)
        importances += lgb_model.feature_importances_/k
        predictions = lgb_model.predict_proba(X_test)[:,1]
        predictions = np.array([x[0] for x in scaler.fit_transform(predictions.reshape(-1, 1))])
        
#         print(predictions)
        y_preds += predictions/k


        del X_fit
        del X_val
        del y_fit
        del y_val
        del train_index
        del test_index
        gc.collect()

        counter += 1


    mean_auc_score = auc_score/k
    mean_iterat = iterat/k

    logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
    logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))
    
    return y_preds, importances, mean_auc_score

In [68]:
print('Training with GoupKFold')
y_preds_gkf, importances_gkf, mean_auc_score_gkf = make_predictions(fold_strategy_gkf)
# print('')
# print('Training with TimeSeriesSplit')
# y_preds_tss, importances_tss, mean_auc_score_tss = make_predictions(fold_strategy_tss)

Training with GoupKFold
Fold 1
Training until validation scores don't improve for 40 rounds.
[200]	valid_0's auc: 0.865219
[400]	valid_0's auc: 0.87659
[600]	valid_0's auc: 0.882139
[800]	valid_0's auc: 0.885737
Early stopping, best iteration is:
[937]	valid_0's auc: 0.887073
Fold 2
Training until validation scores don't improve for 40 rounds.
[200]	valid_0's auc: 0.898656
[400]	valid_0's auc: 0.912186
[600]	valid_0's auc: 0.916989
[800]	valid_0's auc: 0.919176
[1000]	valid_0's auc: 0.920438
Early stopping, best iteration is:
[1117]	valid_0's auc: 0.920673
Fold 3
Training until validation scores don't improve for 40 rounds.
[200]	valid_0's auc: 0.895843
[400]	valid_0's auc: 0.907124
[600]	valid_0's auc: 0.913914
[800]	valid_0's auc: 0.916652
[1000]	valid_0's auc: 0.9184
[1200]	valid_0's auc: 0.919486
[1400]	valid_0's auc: 0.92022
Early stopping, best iteration is:
[1431]	valid_0's auc: 0.920444
Fold 4
Training until validation scores don't improve for 40 rounds.
[200]	valid_0's auc: 0.

In [75]:
y_preds_gkf[230:235]

array([0.00129607, 0.70607657, 0.00683997, 0.00655756, 0.00107914])

In [77]:
df_importance = pd.DataFrame({'feature': X.columns, 'importance': importances_gkf})\
.sort_values('importance', ascending = False)
df_importance.head(10)

Unnamed: 0,feature,importance
613,R1,377514.795445
194,V258,212999.74583
193,V257,93634.97909
609,N3,78360.733913
234,V294,71469.572079
644,R29,66807.979053
620,R4,65705.00227
16,TransactionAmt,57894.502839
132,V201,56233.524995
664,R50,51395.703869


In [52]:
y_preds_final = (y_preds_gkf + y_preds_tss) / 2

In [79]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [80]:
df_sub['isFraud'] = y_preds_gkf

In [81]:
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000911
1,3663550,0.003058
2,3663551,0.001825
3,3663552,0.003406
4,3663553,0.003128


In [82]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

In [83]:
submission_name = '{0}_LightGBM_{1}'.format(D, round(mean_auc_score_gkf, 6))

In [84]:
submission_name

'20190910_LightGBM_0.929139'

In [85]:
logging.warning("Submission name: {}".format(submission_name))

In [86]:
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=None)

In [87]:
df_importance.to_csv('docs/ft_importances_{}.csv'.format(D), index=None, header=True)

In [88]:
logging.warning("End")

In [29]:
counter = 1
auc_score = 0
iterat = 0
list_iter = list()
y_preds = np.zeros(X_test.shape[0])
importances = np.zeros(X_test.shape[1])

for train_index, test_index in fold_strategy:
    print('Fold {}'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
#                   eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  eval_set=[(X_val, y_val)],
                  verbose=200,
                  early_stopping_rounds=40)
    
    logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
    logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
    auc_score += lgb_model.best_score_['valid_0']['auc']
#     lgb_model.predict_proba(test[X.columns])
#     preds = lgb_model.predict_proba(X_test[X.columns])[:,1]
#     preds = lgb_model.predict_proba(X_fit_test[X.columns])[:,1]
#     print('AUC test score: {}\n'.format(roc_auc_score(y_fit_test, preds)))
#     print('AUC 20% test score: {}'.format(roc_auc_score(y_test[:int(len(y_test)*0.2)],
#                                                         preds[:int(len(y_test)*0.2)])))
#     print('AUC 80% test score: {}'.format(roc_auc_score(y_test[int(len(y_test)*0.2):],
#                                                         preds[int(len(y_test)*0.2):])))
    it = lgb_model.best_iteration_
    iterat += it
    list_iter.append(it)
    importances += lgb_model.feature_importances_/k
    y_preds += lgb_model.predict_proba(X_test)[:,1]/k


    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()
    
    counter += 1

    
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1
Training until validation scores don't improve for 40 rounds.
[200]	valid_0's auc: 0.952958
[400]	valid_0's auc: 0.958528
Early stopping, best iteration is:
[530]	valid_0's auc: 0.959233
Fold 2
Training until validation scores don't improve for 40 rounds.
[200]	valid_0's auc: 0.947061
[400]	valid_0's auc: 0.952863
[600]	valid_0's auc: 0.953964
Early stopping, best iteration is:
[637]	valid_0's auc: 0.954343
Fold 3
Training until validation scores don't improve for 40 rounds.
[200]	valid_0's auc: 0.954049
[400]	valid_0's auc: 0.958746
[600]	valid_0's auc: 0.959474
Early stopping, best iteration is:
[578]	valid_0's auc: 0.959689
Fold 4
Training until validation scores don't improve for 40 rounds.
[200]	valid_0's auc: 0.951073
[400]	valid_0's auc: 0.955019
[600]	valid_0's auc: 0.956798
Early stopping, best iteration is:
[604]	valid_0's auc: 0.956889
Fold 5
Training until validation scores don't improve for 40 rounds.
[200]	valid_0's auc: 0.948491
[400]	valid_0's auc: 0.953397
[600]

In [31]:
df_importaces = pd.DataFrame({'feature': X.columns, 'importance': importances})\
.sort_values('importance', ascending=False).reset_index(drop=True)
# df_importaces['cs'] = df_importaces.importance.cumsum()
# # df_importaces.cs = df_importaces.cs/df_importaces.cs.max()
# df_importaces.importance = df_importaces.importance/df_imp.cs
df_importaces.head(20)

Unnamed: 0,feature,importance
0,R1,91065.925002
1,V257,50031.108368
2,N3,26082.882709
3,R12,24254.089073
4,R4,20582.440879
5,N6,20300.030602
6,R11,18503.751742
7,card2_fe1,17764.991037
8,TransactionAmt,17752.117065
9,V294,17534.175283


In [38]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [39]:
df_sub['isFraud'] = y_preds

In [40]:
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000247
1,3663550,0.002492
2,3663551,0.000582
3,3663552,0.000902
4,3663553,0.000977


In [43]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

In [46]:
submission_name = '{0}_LightGBM_{1}'.format(D, round(mean_auc_score, 6))

In [47]:
submission_name

'20190908_LightGBM_0.958018'

In [48]:
logging.warning("Submission name: {}".format(submission_name))

In [49]:
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=None)

In [50]:
logging.warning("End")

In [83]:
new_groups = new_X.groupby(group_cols).grouper.group_info[0]

In [84]:
counter = 1
auc_score = 0
iterat = 0
list_iter = list()
# for train_index, test_index in skf.split(train_ids, y):
for train_index, test_index in group_kfold.split(new_X, new_y, new_groups):
# for train_index, test_index in tscv.split(X=X, y=y):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, X_val = new_X.iloc[train_index, :], new_X.iloc[test_index, :]
    y_fit, y_val = new_y.iloc[train_index], new_y.iloc[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=100)
    
    logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
    logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
    auc_score += lgb_model.best_score_['valid_0']['auc']
    preds = lgb_model.predict_proba(X_test[X.columns])[:,1]
    print('AUC test score: {}'.format(roc_auc_score(y_test, preds)))
    print('AUC 20% test score: {}'.format(roc_auc_score(y_test[:int(len(y_test)*0.2)],
                                                        preds[:int(len(y_test)*0.2)])))
    print('AUC 80% test score: {}'.format(roc_auc_score(y_test[int(len(y_test)*0.2):],
                                                        preds[int(len(y_test)*0.2):])))
    it = lgb_model.best_iteration_
    iterat += it
    list_iter.append(it)

    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()
    
    counter += 1
    
    break
    
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.895268	valid_0's auc: 0.880426
[200]	training's auc: 0.930704	valid_0's auc: 0.882666
Early stopping, best iteration is:
[173]	training's auc: 0.923412	valid_0's auc: 0.883409
AUC test score: 0.864554651319306
AUC 20% test score: 0.8795648576345201
AUC 80% test score: 0.8593868240892053


In [38]:
mean_iterat

2624.6

In [39]:
params['n_estimators'] = int(mean_iterat)
params['n_estimators']

2624

In [43]:
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(train[X_cols+drop_cols], train.isFraud)

LGBMClassifier(bagging_fraction=0.7, bagging_freq=2, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.01, max_depth=-1, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2624, n_jobs=-1, num_leaves=311, num_threads=64,
               objective='xentropy', random_state=42, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0, ...)

In [44]:
y_preds = lgb_model.predict_proba(test[X.columns])

In [45]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [46]:
df_sub['isFraud'] = y_preds[:,1]

In [47]:
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000113
1,3663550,0.000302
2,3663551,0.000472
3,3663552,0.000826
4,3663553,0.000374


In [48]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

In [49]:
submission_name = '{0}_LightGBM_{1}'.format(D, mean_auc_score)

In [50]:
submission_name

'20190903_LightGBM_0.9732650305281695'

In [51]:
logging.warning("Submission name: {}".format(submission_name))

In [52]:
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=None)

In [53]:
logging.warning("End")

In [54]:
df_sub[df_sub.isFraud>0.9].shape

(5084, 2)