# LightGBM

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
# import modin.pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import logging
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GroupKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import datetime

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
LOG_NAME = 'logs/LightGBM.log'
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
logging.warning("")
logging.warning("##### New LightGBM Model #####")

### Data

In [3]:
df_imp = pd.read_csv('docs/20190923_FeatureImportance_LGB.csv')

In [4]:
# drop_cols = ['D1','D10','D11','D12','D13','D14','D15','D2','D3','D4','D5','D6',
#              'D7','D8','D9','C1','C10','C11','C12','C13','C14','C2','C3','C4','C5','C6','C7','C8','C9']

In [5]:
# X_cols = df_imp[df_imp.Importance >= 0].feature.to_list()
X_cols = df_imp.feature[:500].to_list()

In [6]:
len(X_cols)

500

In [7]:
X_cols[:9]

['PCA_27',
 'C1_div_C14',
 'C11_div_C13',
 'C13_div_C8',
 'C1_div_C4',
 'C1',
 'C14_div_C4',
 'C13_div_C2',
 'V317']

In [8]:
data_folder = 'input'

In [10]:
# train = pd.read_csv(data_folder+'/train_syn_ft_eng_1.csv', dtype = schema_synthetic_ft_eng_1, usecols=list(set(X_cols+['isFraud']+id_cols)))
# test = pd.read_csv(data_folder+'/test_syn_ft_eng_1.csv', dtype = schema_synthetic_ft_eng_1, usecols=list(set(X_cols+['isFraud']+id_cols)))
# train = pd.read_csv(data_folder+'/train_ft_eng_5.csv', usecols=X_cols+['isFraud', 'month'])
# test = pd.read_csv(data_folder+'/test_ft_eng_5.csv', usecols=X_cols)

chunksize = 10 ** 5

print('Readig train...')
# train = pd.DataFrame()
train = pd.read_csv(data_folder+'/train_ft_eng_7.csv.gz', usecols=X_cols+['isFraud','month'])
# train = pd.concat([chunk, train], axis=1)
dict_dtypes = reduce_memory2(train)
train = train.astype(dict_dtypes)
    
print('Readig test...')
# test = pd.DataFrame()
test = pd.read_csv(data_folder+'/test_ft_eng_7.csv.gz', usecols=X_cols)
#     test = pd.concat([chunk, test], axis=1)
dict_dtypes = reduce_memory2(test)
test = test.astype(dict_dtypes)


gc.collect()

Readig train...
Reduce_memory...
Readig test...
Reduce_memory...


7

In [11]:
# train_1 = train[train.isFraud == 1]
# train_1.shape

# train_0 = train[train.isFraud == 0].sample(train_1.shape[0]*3, random_state=42)

# mini_train = pd.concat([train_1, train_0], axis=0).reset_index(drop=True)

In [12]:
# train['day'] = np.floor((train['TransactionDT'] / (3600 * 24) - 1))
# test['day'] = np.floor((test['TransactionDT'] / (3600 * 24) - 1))

In [10]:
groups = train.groupby(['month']).grouper.group_info[0]

In [None]:
imp_dict = dict()
for c in X_cols:
    if c not in cat_ft:
        imp_dict[c] = train[c].median()

In [13]:
dc = ['C1','C10','C11','C12','C13','C14','C2','C3','C4','C5','C6','C7',
      'C8','C9','D1','D10','D11','D12','D13','D14','D15','D2','D3','D4','D5','D6','D7','D8',]

In [14]:
drop_cols = ['date',
 'day','isFraud',
 'month',
 'year','date_fe1','date_fe2','TransactionDT','addr1',
 'addr2',
 'card_0',
 'card_1',
 'card_2',
 'card_3',
 'card_4',
 'card_5',
 'card_6',
 'card_7',
 'card_8',
 'card_9',
 'card_10',
 'card_11',
 'card_12',
 'card_13',
 'card_14',
 'card_15',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6','dist1','card1_fe1',
 'card2_fe1',
 'card3_fe1',
 'card4_fe1',
 'card5_fe1',
 'card6_fe1',
 'addr1_fe1',
 'addr2_fe1','card_fe1','card1_fe2',
 'card2_fe2',
 'card3_fe2',
 'card4_fe2',
 'card5_fe2',
 'card6_fe2',
 'addr1_fe2',
 'addr2_fe2','card_fe2','addr',] + dc

In [None]:
X_cols = [x for x in train.columns if x not in drop_cols]

In [None]:
# X_cols = [x for x in X_cols if x not in ['day']+id_cols]

In [None]:
# X_cols = [x for x in X_cols if x not in ['P_emaildomain_0', 'device_version_fe1', 'N17', 'N9', 'R_emaildomain_0', 'device_name_fe1', 'N7', 'R_emaildomain_1', 'N22', 'N10', 'N12', 'R_emaildomain_0_fe1', 'P_emaildomain_0_fe1', 'proc_id_30_0_fe1', 'N13', 'R_emaildomain_1_fe1', 'R_emaildomain_1_fe2', 'P_emaildomain_0_fe2', 'N21', 'proc_id_30_1_fe1', 'proc_id_31_0_fe1', 'proc_id_31_0_fe2', 'device_name', 'device_version']]

In [None]:
new_cols = [x for x in train.columns if 'M' in x or 'D' in x or 'C' in x]

In [None]:
for c in new_cols:
    train[c] = train[c] / train.groupby(['month'])[c].transform('mean')

In [15]:
X = train[[x for x in train.columns if x not in ['isFraud','month']]]#.fillna(imp_dict)
y = train.isFraud

In [16]:
X_test = test[[x for x in train.columns if x not in ['isFraud','month']]]
# y_test = test.isFraud

### Model

In [17]:
cut_train_1 = train[(train.month < 4) | (train.month == 12)].shape[0]
cut_train_2 = train[(train.month > 4) & (train.month != 12)].index[0]
cut_train_1, cut_train_2

(417559, 501214)

In [18]:
del train
del test

In [19]:
gc.collect()

14

Sarching best number rounds

In [20]:
X_fit = X[:cut_train_1]
y_fit = y[:cut_train_1]

X_val = X[cut_train_2:]
y_val = y[cut_train_2:]
gc.collect()

0

In [18]:
logging.warning("Used columns: {}".format(X_cols))
k = 6
logging.warning("Folds number: {}".format(k))

In [19]:
group_kfold = GroupKFold(n_splits=k)

In [20]:
train_ids = X.index
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [21]:
tscv = TimeSeriesSplit(n_splits=k)

#### LightGBM

In [21]:
params = {
    'num_leaves': 191,
    'max_depth': 15,
    'max_leaf_nodes': 45,
    'min_sample_leaf': 20,
    'metric': ['AUC'],
    'first_metric_only': True,
    'n_estimators': 50000,
    'num_threads': -1,
    'learning_rate': 0.01,
    'colsample_bytree': 0.6,
    'objective': 'xentropy',
#     'n_jobs': -1,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'importance_type': 'gain',
#     'lambda_l1': 0.05,
#     'lambda_l2': 0.05,
    'bagging_seed': 42,
    'random_state':42,
    'seed': 42,
    'feature_fraction_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
}

In [22]:
lgb_model = lgb.LGBMClassifier(**params)

In [23]:
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=200,
                  early_stopping_rounds=100)


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.891964
[400]	valid_0's auc: 0.903942
[600]	valid_0's auc: 0.908431
[800]	valid_0's auc: 0.90962
[1000]	valid_0's auc: 0.910258
Early stopping, best iteration is:
[932]	valid_0's auc: 0.910661


LGBMClassifier(bagging_fraction=0.7, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.6,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='gain',
               learning_rate=0.01, max_depth=15, max_leaf_nodes=45,
               metric=['AUC'], min_child_samples=20, min_child_weight=0.001,
               min_sample_leaf=20, min_split_gain=0.0, n_estimators=50000,
               n_jobs=-1, num_leaves=191, num_threads=-1, objective='xentropy',
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, seed=42,
               silent=True, subsample=1.0, ...)

In [24]:
best_i = lgb_model.best_iteration_

In [25]:
best_i

932

In [26]:
params['n_estimators']

50000

In [27]:
# best_i = params['n_estimators']

In [28]:
params['n_estimators'] = int(best_i*(5/3))

In [29]:
lgb_model = lgb.LGBMClassifier(**params)

In [30]:
lgb_model.fit(X, y)

LGBMClassifier(bagging_fraction=0.7, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.6,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='gain',
               learning_rate=0.01, max_depth=15, max_leaf_nodes=45,
               metric=['AUC'], min_child_samples=20, min_child_weight=0.001,
               min_sample_leaf=20, min_split_gain=0.0, n_estimators=1553,
               n_jobs=-1, num_leaves=191, num_threads=-1, objective='xentropy',
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, seed=42,
               silent=True, subsample=1.0, ...)

In [31]:
predictions = lgb_model.predict_proba(X_test)[:,1]

In [32]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [33]:
df_sub['isFraud'] = predictions

In [34]:
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000424
1,3663550,0.001645
2,3663551,0.001192
3,3663552,0.002099
4,3663553,0.003253


In [35]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

In [36]:
# submission_name = '{0}_LightGBM_{1}'.format(D, round(mean_auc_score_gkf, 6))

In [36]:
submission_name = '{0}_LightGBM_best_i_5_3_ft_eng_7_500'.format(D)

In [37]:
submission_name

'20190923_LightGBM_best_i_5_3_ft_eng_7_500'

In [38]:
df_sub.to_csv('submissions/{}.csv.gz'.format(submission_name), sep=',', header=True, index=None, compression="gzip")

### Best max_depth

In [None]:
# logging.warning("Searching best max_depth")

# for train_index, test_index in group_kfold.split(X, y, groups):
#     X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
#     y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

# scores_dict = dict()
# for i in range(5, 20):
#     print('Max depth = {}'.format(i))
#     logging.warning('Max depth = {}'.format(i))
#     params['max_depth'] = i
#     lgb_model = lgb.LGBMClassifier(**params)
#     lgb_model.fit(X_fit,
#                   y_fit,
#                   eval_set=[(X_val, y_val), (X_fit, y_fit)],
#                   verbose=100,
#                   early_stopping_rounds=40)
#     scores_dict[i] = lgb_model.best_score_['valid_0']['auc']

# b = pd.DataFrame({'Depth': [x for x in scores_dict.keys()], 'AUC': [x for x in scores_dict.values()]})
# md = b[b.AUC == b.AUC.max()].Depth
# md.values[0]

# params['max_depth'] = md.values[0]

In [None]:
# lgb_model = lgb.LGBMClassifier(**params)

In [None]:
# logging.warning("Params: {}".format(str(lgb_model.get_params())))

In [None]:
# groups_test = test.groupby(group_cols).grouper.group_info[0]

In [None]:
fold_strategy_skf = skf.split(train_ids, y)
fold_strategy_gkf = group_kfold.split(X, y, train.month)
fold_strategy_tss = tscv.split(X=X, y=y)

# fold_strategy_test = group_kfold.split(X_test, y_test, groups_test)

In [None]:
scaler = MinMaxScaler()

In [None]:
gc.collect()

In [None]:
def make_predictions(fold_strategy):
    lgb_model = lgb.LGBMClassifier(**params)
    logging.warning("Params: {}".format(str(lgb_model.get_params())))
    counter = 1
    auc_score = 0
    iterat = 0
    list_iter = list()
    y_preds = np.zeros(X_test.shape[0])
    importances = np.zeros(X_test.shape[1])
    
    y_preds_normal = np.zeros(X_test.shape[0])

    for train_index, test_index in fold_strategy:
        print('Fold {}'.format(counter))
        logging.warning("Training fold {}".format(counter))

        X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
        y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

        lgb_model.fit(X_fit,
                      y_fit,
    #                   eval_set=[(X_val, y_val), (X_fit, y_fit)],
                      eval_set=[(X_val, y_val)],
                      verbose=200,
                      early_stopping_rounds=40)

        logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
        logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
        auc_score += lgb_model.best_score_['valid_0']['auc']
        it = lgb_model.best_iteration_
        iterat += it
        list_iter.append(it)
        importances += lgb_model.feature_importances_/k
        predictions = lgb_model.predict_proba(X_test)[:,1]
        y_preds_normal += predictions
        predictions = np.array([x[0] for x in scaler.fit_transform(predictions.reshape(-1, 1))])
        
#         print(predictions)
        y_preds += predictions/k


        del X_fit
        del X_val
        del y_fit
        del y_val
        del train_index
        del test_index
        gc.collect()

        counter += 1


    mean_auc_score = auc_score/k
    mean_iterat = iterat/k

    logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
    logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))
    
    return y_preds, y_preds_normal, importances, mean_auc_score

In [None]:
print('Training with GoupKFold')
y_preds_gkf, y_preds_normal_gkf, importances_gkf, mean_auc_score_gkf = make_predictions(fold_strategy_gkf)
print('')
print('Training with StratifiedFold')
y_preds_skf, y_preds_normal_skf, importances_skf, mean_auc_score_skf = make_predictions(fold_strategy_skf)
print('')
# print('Training with TimeSeriesSplit')
# y_preds_tss, importances_tss, mean_auc_score_tss = make_predictions(fold_strategy_tss)

In [None]:
y_preds_gkf[230:235]

In [None]:
df_importance = pd.DataFrame({'feature': X.columns, 'importance': importances_gkf})\
.sort_values('importance', ascending = False)
df_importance.head(10)

In [None]:
y_preds_final = (y_preds_gkf + y_preds_tss) / 2

In [None]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [None]:
df_sub['isFraud'] = y_preds_gkf

In [None]:
df_sub.head()

In [None]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

In [None]:
submission_name = '{0}_LightGBM_{1}'.format(D, round(mean_auc_score_gkf, 6))

In [None]:
submission_name

In [None]:
logging.warning("Submission name: {}".format(submission_name))

In [None]:
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=None)

In [None]:
df_importance.to_csv('docs/ft_importances_{}.csv'.format(D), index=None, header=True)

In [None]:
logging.warning("End")

In [None]:
counter = 1
auc_score = 0
iterat = 0
list_iter = list()
y_preds = np.zeros(X_test.shape[0])
importances = np.zeros(X_test.shape[1])

for train_index, test_index in fold_strategy:
    print('Fold {}'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fit, y_val = y.iloc[train_index], y.iloc[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
#                   eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  eval_set=[(X_val, y_val)],
                  verbose=200,
                  early_stopping_rounds=40)
    
    logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
    logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
    auc_score += lgb_model.best_score_['valid_0']['auc']
#     lgb_model.predict_proba(test[X.columns])
#     preds = lgb_model.predict_proba(X_test[X.columns])[:,1]
#     preds = lgb_model.predict_proba(X_fit_test[X.columns])[:,1]
#     print('AUC test score: {}\n'.format(roc_auc_score(y_fit_test, preds)))
#     print('AUC 20% test score: {}'.format(roc_auc_score(y_test[:int(len(y_test)*0.2)],
#                                                         preds[:int(len(y_test)*0.2)])))
#     print('AUC 80% test score: {}'.format(roc_auc_score(y_test[int(len(y_test)*0.2):],
#                                                         preds[int(len(y_test)*0.2):])))
    it = lgb_model.best_iteration_
    iterat += it
    list_iter.append(it)
    importances += lgb_model.feature_importances_/k
    y_preds += lgb_model.predict_proba(X_test)[:,1]/k


    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()
    
    counter += 1

    
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

In [None]:
df_importaces = pd.DataFrame({'feature': X.columns, 'importance': importances})\
.sort_values('importance', ascending=False).reset_index(drop=True)
# df_importaces['cs'] = df_importaces.importance.cumsum()
# # df_importaces.cs = df_importaces.cs/df_importaces.cs.max()
# df_importaces.importance = df_importaces.importance/df_imp.cs
df_importaces.head(20)

In [None]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [None]:
df_sub['isFraud'] = y_preds

In [None]:
df_sub.head()

In [None]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

In [None]:
submission_name = '{0}_LightGBM_{1}'.format(D, round(mean_auc_score, 6))

In [None]:
submission_name

In [None]:
logging.warning("Submission name: {}".format(submission_name))

In [None]:
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=None)

In [None]:
logging.warning("End")

In [None]:
new_groups = new_X.groupby(group_cols).grouper.group_info[0]

In [None]:
counter = 1
auc_score = 0
iterat = 0
list_iter = list()
# for train_index, test_index in skf.split(train_ids, y):
for train_index, test_index in group_kfold.split(new_X, new_y, new_groups):
# for train_index, test_index in tscv.split(X=X, y=y):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    X_fit, X_val = new_X.iloc[train_index, :], new_X.iloc[test_index, :]
    y_fit, y_val = new_y.iloc[train_index], new_y.iloc[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val), (X_fit, y_fit)],
                  verbose=100,
                  early_stopping_rounds=100)
    
    logging.warning("Best AUC in this fold: {}".format(lgb_model.best_score_['valid_0']['auc']))
    logging.warning("Best iteration in this fold: {}".format(lgb_model.best_iteration_))
    auc_score += lgb_model.best_score_['valid_0']['auc']
    preds = lgb_model.predict_proba(X_test[X.columns])[:,1]
    print('AUC test score: {}'.format(roc_auc_score(y_test, preds)))
    print('AUC 20% test score: {}'.format(roc_auc_score(y_test[:int(len(y_test)*0.2)],
                                                        preds[:int(len(y_test)*0.2)])))
    print('AUC 80% test score: {}'.format(roc_auc_score(y_test[int(len(y_test)*0.2):],
                                                        preds[int(len(y_test)*0.2):])))
    it = lgb_model.best_iteration_
    iterat += it
    list_iter.append(it)

    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()
    
    counter += 1
    
    break
    
mean_auc_score = auc_score/k
mean_iterat = iterat/k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

In [None]:
mean_iterat

In [None]:
params['n_estimators'] = int(mean_iterat)
params['n_estimators']

In [None]:
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(train[X_cols+drop_cols], train.isFraud)

In [None]:
y_preds = lgb_model.predict_proba(test[X.columns])

In [None]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [None]:
df_sub['isFraud'] = y_preds[:,1]

In [None]:
df_sub.head()

In [None]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

In [None]:
submission_name = '{0}_LightGBM_{1}'.format(D, mean_auc_score)

In [None]:
submission_name

In [None]:
logging.warning("Submission name: {}".format(submission_name))

In [None]:
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=None)

In [None]:
logging.warning("End")

In [None]:
df_sub[df_sub.isFraud>0.9].shape