In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import lightgbm as lgb
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer

import warnings
warnings.filterwarnings("ignore")

from scipy import stats
from tqdm import tqdm, tqdm_notebook
import gc, os, pickle

In [2]:
class CFG:
    """Notebook-wide configuration: column names, metric, LightGBM parameters and CV splitter.

    The class is used purely as a namespace and is never instantiated. The two
    metric functions are therefore deliberately left as plain functions (no
    ``self``, no ``@staticmethod``) so that they can be called as
    ``CFG.amex_metric(...)`` and also referenced by bare name further down in
    the class body (see ``amex_scorer``).
    """

    # Names of the raw categorical columns.
    cat_keys = [
        "B_30","B_38","D_114", "D_116", "D_117", "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"
    ]

    # Names of the raw numerical columns.
    num_keys = [
        'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5',
        'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3',
        'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5',
        'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19',
        'B_20', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73',
        'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16',
        'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84',
        'R_16', 'B_29', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21',
        'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S_24', 'S_25',
        'S_26', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'B_36', 'B_37', 'R_26', 'R_27', 'D_108', 'D_109', 'D_110', 'D_111',
        'B_39', 'D_112', 'B_40', 'S_27', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127',
        'D_128', 'D_129', 'B_41', 'B_42', 'D_130', 'D_131', 'D_132', 'D_133', 'R_28', 'D_134', 'D_135', 'D_136', 'D_137',
        'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145'
    ]

    id_key = 'customer_ID'   # column used as the row index of the feature matrix
    date_key = 'S_2'
    max_group_size=13

    def amex_metric(y_true, y_pred):
        """Return the mean of the normalised weighted Gini and the top-4% capture rate.

        Parameters
        ----------
        y_true : array-like of 0/1 labels.
        y_pred : array-like of scores, higher meaning "more likely positive".

        Returns
        -------
        float
            ``0.5 * (gini_of_predictions / gini_of_perfect_ordering + top_four)``.
        """
        # Column 0 = label, column 1 = prediction; rows sorted by prediction, descending.
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, 1].argsort()[::-1]]
        # Negatives (label 0) carry weight 20, positives weight 1.
        weights = np.where(labels[:,0]==0, 20, 1)
        # Rows whose cumulative weight stays within 4% of the total weight.
        cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
        # Share of all positives that falls inside that top slice.
        top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
        gini = [0,0]
        # i == 1 orders rows by prediction, i == 0 orders them by the label
        # itself; the latter is used as the normaliser in the return value.
        for i in [1,0]:
            labels = np.transpose(np.array([y_true, y_pred]))
            labels = labels[labels[:, i].argsort()[::-1]]
            weight = np.where(labels[:,0]==0, 20, 1)
            weight_random = np.cumsum(weight / np.sum(weight))
            total_pos = np.sum(labels[:, 0] *  weight)
            cum_pos_found = np.cumsum(labels[:, 0] * weight)
            lorentz = cum_pos_found / total_pos
            gini[i] = np.sum((lorentz - weight_random) * weight)
        return 0.5 * (gini[1]/gini[0] + top_four)

    def lgb_amex_metric(y_pred, y_true):
        """Adapter exposing ``amex_metric`` as a LightGBM ``feval`` callback.

        Parameters
        ----------
        y_pred : array of predictions supplied by LightGBM.
        y_true : object with a ``get_label()`` method (despite the name, this is
            the dataset wrapper LightGBM passes in, not a label array).

        Returns
        -------
        tuple
            ``('amex_metric', value, True)``; the final ``True`` marks the
            metric as higher-is-better.
        """
        # Class-body names are not visible from inside a function defined in
        # that body, so the shared implementation is reached through the class
        # instead of keeping a second pasted copy of the metric here.
        return 'amex_metric', CFG.amex_metric(y_true.get_label(), y_pred), True

    # Parameters handed to lgb.train. The built-in metric is switched off
    # ('None') because evaluation goes through the custom feval above.
    params = {
        'objective': 'binary',
        'metric': 'None', #CFG.metric,
        #'boosting': 'dart',
        'n_jobs':-1,
        'seed': 42,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'lambda_l2': 2,
        'min_data_in_leaf': 40,
        'first_metric_only':True
    }

    path2data = 'data'
    seed = 42
    # Shared 5-fold stratified splitter; the fixed seed keeps folds identical across calls.
    skf = StratifiedKFold(5, shuffle = True, random_state=seed)
    # sklearn scorer wrapping the metric (used for permutation importance).
    amex_scorer = make_scorer(amex_metric)
    target_key = 'target'
    # Misspelled name kept as an alias because existing cells read `CFG.target_keuy`.
    target_keuy = target_key
    pi_iter=3

In [3]:
# all_cat_features = [
#         "B_30","B_38","D_114", "D_116", "D_117", "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"
#     ]
# all_cat_features = [f'{k}_last' for k in all_cat_features]+[f'{k}_first' for k in all_cat_features]    

In [4]:
# Load the engineered feature matrix, index it by customer id and remove the target column.
# `columns=` is used instead of a positional `axis=1`, which pandas 2.0 no longer accepts.
X = pd.read_parquet("train_fe_v3.parquet").set_index(CFG.id_key).drop(columns=CFG.target_keuy)
# The labels are stored separately as a pickled object.
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by a trusted run.
with open('y.pickle', 'rb') as f:
    y = pickle.load(f)

In [5]:
# Features and labels must refer to the same rows in the same order. Index.equals
# also copes with a length mismatch, where an elementwise `==` raises ValueError.
assert X.index.equals(y.index), "X and y are not aligned on the same index"

In [9]:
# X = X.iloc[:1000]
# y = y.loc[X.index]
# gc.collect()

64

In [6]:
def split_features_into_groups_v1(X):
    """Partition the columns of ``X`` into ten groups by keyword in the column name.

    Parameters
    ----------
    X : pandas.DataFrame whose column labels are strings.

    Returns
    -------
    tuple
        ``(fs1, ..., fs10)`` where

        * ``fs1``  - list of columns containing ``'last'`` but neither ``'sub'`` nor ``'div'``;
        * ``fs2`` .. ``fs9`` - lists of columns containing ``'first'``, ``'mean'``,
          ``'min'``, ``'max'``, ``'std'``, ``'sub'``, ``'div'`` and ``'diff'`` respectively;
        * ``fs10`` - ``pandas.Index`` of every column not captured by any group above.
    """
    cols = X.columns

    def named(token):
        # Boolean mask over the columns whose name contains `token`.
        return cols.str.contains(token)

    fs1 = cols[named('last') & ~named('sub') & ~named('div')].tolist()
    keyword_groups = [
        cols[named(token)].tolist()
        for token in ('first', 'mean', 'min', 'max', 'std', 'sub', 'div', 'diff')
    ]
    taken = fs1 + [name for group in keyword_groups for name in group]
    # `columns=` instead of the positional axis argument, which pandas 2.0 rejects.
    fs10 = X.drop(columns=taken).columns
    return (fs1, *keyword_groups, fs10)

def calc_cv(X, y, num_boost_round):
    """Cross-validate a LightGBM model on ``X``/``y`` with the shared ``CFG.skf`` folds.

    Parameters
    ----------
    X : pandas.DataFrame of features.
    y : pandas object with the labels, positionally aligned with ``X``.
    num_boost_round : int, upper bound on boosting rounds per fold.

    Returns
    -------
    tuple
        ``(mean validation amex_metric over the folds,
        per-feature importance averaged over the folds)``.

    Notes
    -----
    Early stopping and logging are configured through callbacks because the
    ``early_stopping_rounds`` / ``verbose_eval`` / ``evals_result`` keyword
    arguments were removed from ``lgb.train`` in LightGBM 4.0. The callbacks
    used here need LightGBM >= 3.3.
    """
    scores, importances = [], []
    for tr_idx, val_idx in tqdm_notebook(CFG.skf.split(X, y), total=CFG.skf.n_splits):
        x_train, x_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]
        lgb_tr = lgb.Dataset(x_train, y_train, params={'verbose': -1})
        lgb_val = lgb.Dataset(x_val, y_val, params={'verbose': -1})
        model = lgb.train(
            params=CFG.params,
            train_set=lgb_tr,
            valid_sets=[lgb_tr, lgb_val],
            num_boost_round=num_boost_round,
            feval=CFG.lgb_amex_metric,
            callbacks=[
                lgb.early_stopping(
                    stopping_rounds=100,
                    first_metric_only=CFG.params.get('first_metric_only', False),
                ),
                lgb.log_evaluation(period=100),
            ],
        )
        # 'valid_1' is the default name of the second entry of valid_sets.
        scores.append(model.best_score['valid_1']['amex_metric'])
        importances.append(model.feature_importance())
        # Release the fold copies before the next iteration allocates new ones.
        del x_train, x_val, y_train, y_val, lgb_tr, lgb_val
        gc.collect()
    return np.mean(scores), np.mean(importances, axis=0)

def calc_feature_rank(importances, feature_names):
    """Build a table of features ordered by ascending importance.

    Returns a DataFrame indexed by feature name with two columns: ``'imp'``
    (the importance value) and ``'rank'`` (1 for the smallest importance,
    counting upwards).
    """
    by_name = {name: value for name, value in zip(feature_names, importances)}
    ranked = pd.Series(by_name).sort_values().to_frame('imp')
    n_features = len(ranked)
    ranked['rank'] = np.arange(1, n_features + 1)
    return ranked

def sel_feat_with_pi_and_cv(X, y, num_boost_round, n_repeat, seed):
    """Count, per feature, the CV folds in which its permutation importance is positive.

    For every fold of ``CFG.skf`` a LightGBM model is trained on the training
    part, permutation importances are computed on the validation part with
    ``CFG.amex_scorer``, and every feature whose mean importance is ``> 0`` is
    recorded.

    Parameters
    ----------
    X : pandas.DataFrame of features.
    y : pandas object with the labels, positionally aligned with ``X``.
    num_boost_round : int, upper bound on boosting rounds per fold.
    n_repeat : int, number of single-shuffle permutation runs averaged per fold.
    seed : int, offset added to the permutation random state.

    Returns
    -------
    pandas.Series
        Feature name -> number of folds in which the feature was kept, as
        produced by ``value_counts()``. Features never kept are absent.

    Notes
    -----
    Early stopping and logging are configured through callbacks because the
    ``early_stopping_rounds`` / ``verbose_eval`` / ``evals_result`` keyword
    arguments were removed from ``lgb.train`` in LightGBM 4.0. The callbacks
    used here need LightGBM >= 3.3.
    """

    def fit_model(x_train, x_val, y_train, y_val, num_boost_round):
        # Train one booster on a single fold and return it.
        print('\tevaluating ....')
        lgb_tr = lgb.Dataset(x_train, y_train, params={'verbose': -1})
        lgb_val = lgb.Dataset(x_val, y_val, params={'verbose': -1})
        model = lgb.train(
            params=CFG.params,
            train_set=lgb_tr,
            valid_sets=[lgb_tr, lgb_val],
            num_boost_round=num_boost_round,
            feval=CFG.lgb_amex_metric,
            callbacks=[
                lgb.early_stopping(
                    stopping_rounds=100,
                    first_metric_only=CFG.params.get('first_metric_only', False),
                ),
                lgb.log_evaluation(period=100),
            ],
        )
        del lgb_tr, lgb_val
        gc.collect()
        return model

    def calc_pi(model, x_val, y_val, n_repeat, seed):
        # Average `n_repeat` single-shuffle permutation-importance runs, each
        # with its own random state.
        print('\tcalculating permutation importances ...')
        l_pi = []
        for i in tqdm_notebook(range(n_repeat)):
            l_pi.append(
                permutation_importance(
                    model,
                    x_val, y_val,
                    n_repeats=1,
                    scoring=CFG.amex_scorer,
                    random_state=CFG.seed + i + seed,
                    n_jobs=-1,
                ).importances_mean
            )
        return np.mean(l_pi, axis=0)

    ###############################################################################################################

    all_best_features = []
    for tr_idx, val_idx in tqdm_notebook(CFG.skf.split(X, y), total=CFG.skf.n_splits):

        x_train, x_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        model = fit_model(x_train, x_val, y_train, y_val, num_boost_round)
        pi_mean = calc_pi(model, x_val, y_val, n_repeat, seed)
        # Keep only the features whose shuffling lowered the validation score.
        best_features = x_train.columns[pi_mean > 0].tolist()
        all_best_features.extend(best_features)

        del x_train, x_val, y_train, y_val, model, best_features, pi_mean
        gc.collect()

    return pd.Series(all_best_features).value_counts()

def select_features(X, y, num_boost_round, n_repeat, seed):
    """Pick the feature subset of ``X`` with the best cross-validated score.

    The baseline is the CV score on all columns. Candidate subsets are built
    from the per-feature fold counts returned by ``sel_feat_with_pi_and_cv``:
    for every distinct count, the features with at least that count are
    scored. A candidate replaces the current choice only when it is strictly
    better, so ``X.columns`` is returned when nothing beats the baseline.
    """
    chosen = X.columns
    top_score = calc_cv(X, y, num_boost_round=num_boost_round)[0]
    fold_counts = sel_feat_with_pi_and_cv(
        X, y, num_boost_round=num_boost_round, n_repeat=n_repeat, seed=seed
    )
    for threshold in tqdm_notebook(np.unique(fold_counts.values)):
        candidate = fold_counts[fold_counts >= threshold].index
        candidate_score = calc_cv(X[candidate], y, num_boost_round=num_boost_round)[0]
        if candidate_score > top_score:
            chosen, top_score = candidate, candidate_score
    return chosen

def forward_selection(X, y, split_func, num_boost_round, n_repeat, seed):
    """Select features group by group, then merge the groups best-first.

    1. ``split_func(X)`` partitions the columns into groups.
    2. ``select_features`` is run inside every non-empty group.
    3. The reduced groups are scored with ``calc_cv`` and ordered by score,
       best first.
    4. Groups are merged cumulatively in that order; after each merge
       ``select_features`` is run again, and features it rejects are excluded
       from all later merges.

    Parameters
    ----------
    X : pandas.DataFrame of features.
    y : pandas object with the labels, positionally aligned with ``X``.
    split_func : callable taking ``X`` and returning an iterable of column collections.
    num_boost_round, n_repeat, seed : forwarded to ``select_features`` / ``calc_cv``.

    Returns
    -------
    pandas.Index
        Features surviving the last merge step (or the best features of the
        only group when fewer than two non-empty groups exist).

    Raises
    ------
    ValueError
        If ``split_func`` yields no non-empty group.
    """
    groups = list(split_func(X))
    print('feature subset size: {}'.format([len(fs) for fs in groups]))

    # A group without columns cannot be modelled; skip it instead of handing
    # an empty frame to LightGBM.
    groups = [fs for fs in groups if len(fs) > 0]
    if not groups:
        raise ValueError('split_func returned no non-empty feature group')

    # select features in every subset
    all_best_fs = []
    for fs in tqdm_notebook(groups):
        bf = select_features(X[fs], y, num_boost_round=num_boost_round, n_repeat=n_repeat, seed=seed)
        all_best_fs.append(list(bf))

    # score selected subsets
    all_best_scores = np.array([calc_cv(X[fs], y, num_boost_round=num_boost_round)[0] for fs in all_best_fs])
    # rank scores, best group first
    fs_rank = np.argsort(all_best_scores)[::-1]

    # forward selection
    # Fallback result for the single-group case, where the loop below does not run.
    best_fs = pd.Index(all_best_fs[fs_rank[0]])
    drop_fs = set()
    for i in tqdm_notebook(range(2, len(fs_rank) + 1)):
        new_fs = [name for j in fs_rank[:i] for name in all_best_fs[j]]
        # Same columns, same order as dropping `drop_fs` from X[new_fs], without
        # relying on the positional axis argument that pandas 2.0 rejects.
        candidates = [name for name in new_fs if name not in drop_fs]
        best_fs = select_features(X[candidates], y, num_boost_round=num_boost_round, n_repeat=n_repeat, seed=seed)
        drop_fs.update(set(new_fs) - set(best_fs))

    return best_fs

def union_selected(X, y, best_fs_100, best_fs_300, split_func, num_boost_round, n_repeat, seed):
    """Run forward selection on the union of two previously selected feature sets.

    Parameters
    ----------
    X : pandas.DataFrame of features.
    y : pandas object with the labels, positionally aligned with ``X``.
    best_fs_100, best_fs_300 : collections of column names to merge.
    split_func : callable that partitions columns into groups; forwarded to
        ``forward_selection``.
    num_boost_round, n_repeat, seed : forwarded to ``forward_selection``.

    Returns
    -------
    Whatever ``forward_selection`` returns for the merged feature set.
    """
    # Merge in order of first appearance and drop repeated names, so that
    # X[fs_union] never contains the same column twice.
    fs_union = list(dict.fromkeys(np.append(best_fs_100, best_fs_300).tolist()))
    return forward_selection(
        X[fs_union], y,
        split_func=split_func,  # honour the caller's choice instead of a hard-coded splitter
        num_boost_round=num_boost_round,
        n_repeat=n_repeat,
        seed=seed
    )

In [8]:
# Display the dimensions of the loaded feature matrix as (rows, columns).
X.shape

(458913, 1637)

In [9]:
# # dropped features
# drop_fs_100 = X.drop(best_fs_100, 1).columns

# # forward selection dropped features with 300 iterations
# best_fs_300 = forward_selection(
#     X[drop_fs_100], y,
#     split_func=split_features_into_groups_v1,
#     num_boost_round=300,
#     n_repeat=1,
#     seed=0
# )

# # forward selection union of selected features
# best_fs_union = union_selected(
#     X, y, 
#     best_fs_100, best_fs_300, 
#     split_func=split_features_into_groups_v1, 
#     num_boost_round=300, 
#     n_repeat=1, 
#     seed=1
# )

# # all scores
# score_ABC = [calc_cv(X[fs], y, num_boost_round=300)[0]  for fs in [best_fs_100, best_fs_300, best_fs_union]]

In [10]:
# forward selection with 100 iterations and feature subsets
best_fs_100 = forward_selection(
    X, y,
    split_func=split_features_into_groups_v1,
    num_boost_round=100,
    n_repeat=1,
    seed=0
)
with open('best_fs_100.pickle', 'wb') as f:
    pickle.dump(best_fs_100, f)

feature subset size: [188, 188, 177, 177, 177, 177, 177, 177, 177, 22]


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25594
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.765315	valid_1's amex_metric: 0.76348
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.765315	valid_1's amex_metric: 0.76348
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25611
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	traini

  0%|          | 0/5 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25594
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.765315	valid_1's amex_metric: 0.76348
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.765315	valid_1's amex_metric: 0.76348
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25611
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.766343	valid_1's amex_metric: 0.757369
Did not meet early stopping. Best iteration is:
[99]	training's amex_metric: 0.766434	valid_1's amex_metric: 0.757302
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25613
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.76669	valid_1's amex_metric: 0.758689
Did not meet early stopping. Best iteration is:
[99]	training's amex_metric: 0.766692	valid_1's amex_metric: 0.758714
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95063, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25602
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.76718	valid_1's amex_metric: 0.753646
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.76718	valid_1's amex_metric: 0.753646
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95063, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25611
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.765771	valid_1's amex_metric: 0.760683
Did not meet early stopping. Best iteration is:
[99]	training's amex_metric: 0.765812	valid_1's amex_metric: 0.76105
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25259
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 166
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.766851	valid_1's amex_metric: 0.76578
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.766851	valid_1's amex_metric: 0.76578
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25274
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 166
Training until validation scores don't improve for 100 rounds
[100]	traini

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22851
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 147
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.765867	valid_1's amex_metric: 0.763257
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.765867	valid_1's amex_metric: 0.763257
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22860
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 147
Training until validation scores don't improve for 100 rounds
[100]	trai

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20024
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 117
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.764506	valid_1's amex_metric: 0.762883
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.764506	valid_1's amex_metric: 0.762883
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20032
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 117
Training until validation scores don't improve for 100 rounds
[100]	trai

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12267
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 65
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.762468	valid_1's amex_metric: 0.760852
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.762468	valid_1's amex_metric: 0.760852
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12272
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 65
Training until validation scores don't improve for 100 rounds
[100]	traini

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7311
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 36
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.763503	valid_1's amex_metric: 0.763073
Did not meet early stopping. Best iteration is:
[95]	training's amex_metric: 0.763937	valid_1's amex_metric: 0.764287
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7317
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 36
Training until validation scores don't improve for 100 rounds
[100]	training'

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25372
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.632345	valid_1's amex_metric: 0.630636
Did not meet early stopping. Best iteration is:
[97]	training's amex_metric: 0.63244	valid_1's amex_metric: 0.630725
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25386
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	traini

  0%|          | 0/5 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25372
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.632345	valid_1's amex_metric: 0.630636
Did not meet early stopping. Best iteration is:
[97]	training's amex_metric: 0.63244	valid_1's amex_metric: 0.630725
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25386
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.634729	valid_1's amex_metric: 0.62451
Did not meet early stopping. Best iteration is:
[98]	training's amex_metric: 0.634842	valid_1's amex_metric: 0.625064
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25380
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.633394	valid_1's amex_metric: 0.623347
Did not meet early stopping. Best iteration is:
[97]	training's amex_metric: 0.633758	valid_1's amex_metric: 0.62399
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95063, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25383
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.634283	valid_1's amex_metric: 0.626006
Did not meet early stopping. Best iteration is:
[97]	training's amex_metric: 0.634525	valid_1's amex_metric: 0.62597
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95063, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25380
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 187
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.634935	valid_1's amex_metric: 0.620939
Did not meet early stopping. Best iteration is:
[97]	training's amex_metric: 0.635235	valid_1's amex_metric: 0.621412
Evaluated only: amex_metric
	calculating permutation importances ...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24540
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 166
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.633091	valid_1's amex_metric: 0.629697
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.633091	valid_1's amex_metric: 0.629697
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24553
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 166
Training until validation scores don't improve for 100 rounds
[100]	trai

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24014
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 160
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.628983	valid_1's amex_metric: 0.625095
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.628983	valid_1's amex_metric: 0.625095
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24027
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 160
Training until validation scores don't improve for 100 rounds
[100]	trai

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22094
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 140
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.623531	valid_1's amex_metric: 0.621375
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.623531	valid_1's amex_metric: 0.621375
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22101
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 140
Training until validation scores don't improve for 100 rounds
[100]	trai

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16568
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 96
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.632307	valid_1's amex_metric: 0.629496
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.632307	valid_1's amex_metric: 0.629496
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16573
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 96
Training until validation scores don't improve for 100 rounds
[100]	traini

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9419
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 44
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.633961	valid_1's amex_metric: 0.633331
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.633961	valid_1's amex_metric: 0.633331
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9420
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 44
Training until validation scores don't improve for 100 rounds
[100]	training

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 35632
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 177
Training until validation scores don't improve for 100 rounds
[100]	training's amex_metric: 0.731132	valid_1's amex_metric: 0.727775
Did not meet early stopping. Best iteration is:
[100]	training's amex_metric: 0.731132	valid_1's amex_metric: 0.727775
Evaluated only: amex_metric
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 35682
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 177
Training until validation scores don't improve for 100 rounds
[100]	trai

  0%|          | 0/5 [00:00<?, ?it/s]

	evaluating ....
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 35632
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 177
Training until validation scores don't improve for 100 rounds


In [15]:
# NOTE(review): in the visible notebook `score_ABC` is only assigned inside a
# cell that is currently commented out, so this cell raises NameError on a
# fresh "Restart & Run All" - confirm and restore that cell if so.
score_ABC

[0.7741016590366844, 0.7547454657085837, 0.7907211060440213]

In [16]:
# Number of features kept by each selection run.
# NOTE(review): in the visible notebook `best_fs_300` and `best_fs_union` are
# only assigned inside a commented-out cell, so this cell depends on leftover
# kernel state - confirm.
len(best_fs_100), len(best_fs_300), len(best_fs_union)

(43, 46, 37)