In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [11]:
# Load the pre-built training table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files you produced yourself.
train = pd.read_pickle('train_feats738_cv646_lb679.pickle')

In [18]:
# The `target` column of `train` as a plain array (later passed to score_feature_selection).
target = train.target.values

In [13]:
# Columns of `train` that are never fed to the model as features.
exclude_feats = [
    'first_active_month',
    'card_id',
    'target',
    'anchor_date',
    'ref_date',
]

In [14]:
# Model features: every column of `train` that is not listed in `exclude_feats`,
# kept in the table's column order.
feats = [col for col in train.columns if col not in exclude_feats]

In [15]:
# Columns handed to LightGBM through its `categorical_feature` argument.
cat_feats = ['feature_1', 'feature_2']

In [16]:
# LightGBM parameters shared by every lgb.cv / lgb.train call in this notebook.
param = dict(
    objective='regression',
    learning_rate=0.01,
    num_leaves=31,
    min_data_in_leaf=20,
    # Row and column subsampling, re-drawn every iteration (bagging_freq=1).
    bagging_fraction=0.9,
    bagging_freq=1,
    feature_fraction=0.9,
    lambda_l1=9.6,
    metric='rmse',
    # NOTE(review): hard-coded thread count - presumably sized for the original machine.
    num_threads=48,
)

In [17]:
# Baseline cross-validation on the full feature list.
tr_data = lgb.Dataset(
    train[feats],
    label=train.target,
    categorical_feature=cat_feats,
)
cv_scores = lgb.cv(
    param,
    tr_data,
    num_boost_round=3000,
    stratified=False,
    early_stopping_rounds=600,
    verbose_eval=200,
)
# Report the last entry of the CV history and how many rounds it contains.
print(
    f'Best cv scores {cv_scores["rmse-mean"][-1]} + {cv_scores["rmse-stdv"][-1]}\n'
    f'Num of boost: {len(cv_scores["rmse-mean"])}'
)

[200]	cv_agg's rmse: 3.67828 + 0.0311249
[400]	cv_agg's rmse: 3.65681 + 0.0312714
[600]	cv_agg's rmse: 3.64955 + 0.0315071
[800]	cv_agg's rmse: 3.64667 + 0.0314958
[1000]	cv_agg's rmse: 3.64514 + 0.0315281
[1200]	cv_agg's rmse: 3.64472 + 0.0314058
[1400]	cv_agg's rmse: 3.6444 + 0.0312227
[1600]	cv_agg's rmse: 3.64442 + 0.0311327
[1800]	cv_agg's rmse: 3.64467 + 0.0311163
[2000]	cv_agg's rmse: 3.64494 + 0.030836
Best cv scores 3.644193959468102 + 0.03124571224721622
Num of boost: 1530


In [22]:
def get_feature_importances(data, shuffle, seed=None, num_boost_round=1530):
    """Fit one LightGBM model and return the importance of every feature.

    Uses the module-level `feats` (feature columns), `param` (LightGBM
    parameters) and `cat_feats` (categorical columns).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column in `feats` plus a `target` column.
    shuffle : bool
        When True the target is randomly permuted before fitting, which
        breaks its link with the features (a "null importance" run).
    seed : int or None
        Random state for the permutation. None (the default) leaves the
        shuffle unseeded, so repeated calls give different permutations.
    num_boost_round : int
        Number of boosting rounds. The default, 1530, is the value this
        notebook previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns `feature`, `importance_gain`,
        `importance_split` and `trn_score` (mean squared error on the
        training rows, repeated on every row).
    """
    train_features = feats

    # `.values` drops the index, so the (possibly permuted) order is exactly
    # what gets paired row-by-row with the features and the predictions.
    if shuffle:
        y = data['target'].sample(frac=1.0, random_state=seed).values
    else:
        y = data['target'].values.copy()

    dtrain = lgb.Dataset(data[train_features], y, free_raw_data=False, silent=True)

    # Fit the model with the shared notebook parameters
    clf = lgb.train(param, dtrain, num_boost_round=num_boost_round, categorical_feature=cat_feats)

    # Collect both importance flavours plus the in-sample error of this fit
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    imp_df['trn_score'] = mean_squared_error(y, clf.predict(data[train_features]))

    return imp_df

In [23]:
# Importances from a fit on the real (unshuffled) target.
actual_imp_df = get_feature_importances(data=train, shuffle=False)

New categorical_feature is ['feature_1', 'feature_2']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [38]:
# Persist the real-target importances to disk.
actual_imp_df.to_pickle('actual_imp_df.pkl')

In [27]:
# Null-importance distribution: refit on a shuffled target `nb_runs` times.
nb_runs = 200

# Collect every run in a list and concatenate once at the end; concatenating
# inside the loop re-copies all previously gathered rows on each iteration.
null_imp_runs = []
for i in range(nb_runs):
    # Get current run importances
    imp_df = get_feature_importances(data=train, shuffle=True)
    imp_df['run'] = i + 1
    null_imp_runs.append(imp_df)
    # Display current run
    print(f'Done with {i+1} of {nb_runs}')

# pd.concat raises on an empty list, so fall back to an empty frame.
null_imp_df = pd.concat(null_imp_runs, axis=0) if null_imp_runs else pd.DataFrame()

New categorical_feature is ['feature_1', 'feature_2']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Done with 1 of 2


New categorical_feature is ['feature_1', 'feature_2']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Done with 2 of 2


In [39]:
# Persist the shuffled-target (null) importances to disk.
null_imp_df.to_pickle('null_imp_df.pkl')

In [28]:
def _log_importance_ratio(actual_value, null_values):
    """Log of the actual importance over (1 + 75th percentile of the null importances)."""
    # The +1 avoids a divide by zero; the 1e-10 keeps the log finite when the actual importance is 0.
    return np.log(1e-10 + actual_value / (1 + np.percentile(null_values, 75)))


feature_scores = []
for _f in actual_imp_df['feature'].unique():
    # Rows belonging to this feature in the null runs and in the real fit
    null_rows = null_imp_df.loc[null_imp_df['feature'] == _f]
    actual_rows = actual_imp_df.loc[actual_imp_df['feature'] == _f]

    gain_score = _log_importance_ratio(
        actual_rows['importance_gain'].mean(),
        null_rows['importance_gain'].values,
    )
    split_score = _log_importance_ratio(
        actual_rows['importance_split'].mean(),
        null_rows['importance_split'].values,
    )
    feature_scores.append((_f, split_score, gain_score))

scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])

In [40]:
# Save the log-ratio feature scores without the index column.
scores_df.to_csv('fe_feature_scores.csv', index=False)

In [30]:
def _pct_null_below_actual(actual_values, null_values):
    """Percentage of null importances below the 25th percentile of the actual importances."""
    return 100 * (null_values < np.percentile(actual_values, 25)).sum() / null_values.size


correlation_scores = []
for _f in actual_imp_df['feature'].unique():
    # Rows belonging to this feature in the null runs and in the real fit
    null_rows = null_imp_df.loc[null_imp_df['feature'] == _f]
    actual_rows = actual_imp_df.loc[actual_imp_df['feature'] == _f]

    gain_score = _pct_null_below_actual(
        actual_rows['importance_gain'].values,
        null_rows['importance_gain'].values,
    )
    split_score = _pct_null_below_actual(
        actual_rows['importance_split'].values,
        null_rows['importance_split'].values,
    )
    correlation_scores.append((_f, split_score, gain_score))

corr_scores_df = pd.DataFrame(correlation_scores, columns=['feature', 'split_score', 'gain_score'])

In [41]:
# Save the percentage-based scores without the index column.
# NOTE(review): the '.cvs' extension looks like a typo for '.csv'; the cell that
# reloads this file uses the same name, so rename both together if you fix it.
corr_scores_df.to_csv('fe_corr_scores.cvs', index=False)

In [2]:
# Reload the real-target importances from 'actual_imp_df.pkl'
# (presumably so a fresh kernel can skip the fit - confirm).
actual_imp_df = pd.read_pickle('actual_imp_df.pkl')

In [3]:
# Reload the null importances from 'null_imp_df.pkl'
# (presumably so a fresh kernel can skip the shuffled runs - confirm).
null_imp_df = pd.read_pickle('null_imp_df.pkl')

In [4]:
# Reload the log-ratio feature scores from CSV.
scores_df = pd.read_csv('fe_feature_scores.csv')

In [5]:
# Reload the percentage-based scores from CSV.
# NOTE(review): '.cvs' matches the name used when the file was written; it looks
# like a typo for '.csv' - rename both places together if you fix it.
corr_scores_df = pd.read_csv('fe_corr_scores.cvs')

In [None]:
# Display the log-ratio scores ordered by split_score, lowest first.
scores_df.sort_values('split_score', ascending=True)

In [10]:
# Display the percentage-based scores ordered by split_score, lowest first.
corr_scores_df.sort_values('split_score', ascending=True)

Unnamed: 0,feature,split_score,gain_score
366,new_state_id_nunique_mean,0.0,0.0
341,new_installments_std_std,0.0,0.0
350,new_merchant_id_nunique,0.0,0.0
352,new_city_id_nunique,0.0,12.0
731,new_city_id_auth_sum_median_std,0.0,0.0
367,new_city_id_nunique_mean,0.0,0.0
369,new_subsector_id_nunique_std,0.0,0.0
370,new_merchant_id_nunique_std,0.0,0.0
444,new_merchant_category_id_auth_sum_std,0.0,12.5
448,new_subsector_id_auth_mean_mean,0.0,97.5


In [17]:
def score_feature_selection(df=None, train_features=None, cat_feats=None, target=None):
    """Cross-validate LightGBM on a subset of features.

    Uses the module-level `param` dict for the LightGBM parameters.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding at least the columns in `train_features`.
    train_features : list of str
        Columns of `df` to train on.
    cat_feats : iterable of str, 'auto' or None
        Categorical columns. Any iterable (e.g. a set) is converted to a
        list before being handed to LightGBM; None and strings are passed
        through unchanged.
    target : array-like
        Labels, one per row of `df`.

    Returns
    -------
    tuple of (float, float)
        The last `rmse-mean` and `rmse-stdv` entries of the CV history.
    """
    # LightGBM documents `categorical_feature` as a list (or 'auto'), while
    # callers in this notebook build it with set operations.
    if cat_feats is not None and not isinstance(cat_feats, str):
        cat_feats = list(cat_feats)

    dtrain = lgb.Dataset(df[train_features], target, categorical_feature=cat_feats, free_raw_data=False, silent=True)
    hist = lgb.cv(param, dtrain, 3000, stratified=False, early_stopping_rounds=400, verbose_eval=200)
    return hist['rmse-mean'][-1], hist['rmse-stdv'][-1]

In [19]:
# Re-score the model with two features held out of the full feature list.
held_out_feats = ['ref_anchor_diff', 'hist_year_nunique']
selected_feats = [name for name in feats if name not in held_out_feats]
score_feature_selection(
    df=train,
    train_features=selected_feats,
    cat_feats=cat_feats,
    target=target,
)



[200]	cv_agg's rmse: 3.74051 + 0.027141
[400]	cv_agg's rmse: 3.71685 + 0.0262385
[600]	cv_agg's rmse: 3.7088 + 0.0259058
[800]	cv_agg's rmse: 3.70608 + 0.025759
[1000]	cv_agg's rmse: 3.70479 + 0.0252582
[1200]	cv_agg's rmse: 3.70416 + 0.0251366
[1400]	cv_agg's rmse: 3.70381 + 0.0249277
[1600]	cv_agg's rmse: 3.70395 + 0.0249116
[1800]	cv_agg's rmse: 3.70393 + 0.024882


(3.703705541039728, 0.025050051937817747)

In [26]:
# Sweep a score threshold: keep only features whose score is strictly above it
# and cross-validate the model on what remains, once per score flavour.
split_results_lst = []
gain_results_lst = []
for threshold in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99]:
    split_feats = corr_scores_df.loc[corr_scores_df.split_score > threshold, 'feature'].tolist()
    # Ordered list (not a set) of the categorical columns that survived the cut.
    split_cat_feats = [f for f in cat_feats if f in split_feats]

    gain_feats = corr_scores_df.loc[corr_scores_df.gain_score > threshold, 'feature'].tolist()
    gain_cat_feats = [f for f in cat_feats if f in gain_feats]

    print(f'Results for threshold {threshold}')
    # A threshold can leave no features at all; LightGBM cannot train on an
    # empty table, so record NaN and keep the result lists aligned with the thresholds.
    if split_feats:
        split_results = score_feature_selection(train, split_feats, split_cat_feats, target)
    else:
        split_results = (np.nan, np.nan)
    print(f'\t SPLIT: {split_results[0]}, {split_results[1]}')

    if gain_feats:
        gain_results = score_feature_selection(train, gain_feats, gain_cat_feats, target)
    else:
        gain_results = (np.nan, np.nan)
    print(f'\t gain: {gain_results[0]}, {gain_results[1]}')

    split_results_lst.append(split_results)
    gain_results_lst.append(gain_results)

Results for threshold 0




[200]	cv_agg's rmse: 3.67814 + 0.0310549
[400]	cv_agg's rmse: 3.6571 + 0.0301737
[600]	cv_agg's rmse: 3.65003 + 0.0298079
[800]	cv_agg's rmse: 3.64715 + 0.0295953
[1000]	cv_agg's rmse: 3.64593 + 0.0294436
[1200]	cv_agg's rmse: 3.64544 + 0.0295014
[1400]	cv_agg's rmse: 3.64541 + 0.0291928
[1600]	cv_agg's rmse: 3.6454 + 0.0291844
	 SPLIT: 3.6452617640209986, 0.02933959176242044
[200]	cv_agg's rmse: 3.67887 + 0.0301237
[400]	cv_agg's rmse: 3.65708 + 0.0297254
[600]	cv_agg's rmse: 3.65025 + 0.0297586
[800]	cv_agg's rmse: 3.64756 + 0.0295571
[1000]	cv_agg's rmse: 3.64634 + 0.0295508
[1200]	cv_agg's rmse: 3.64563 + 0.0296515
[1400]	cv_agg's rmse: 3.64558 + 0.0296638
[1600]	cv_agg's rmse: 3.64585 + 0.0294428
[1800]	cv_agg's rmse: 3.64595 + 0.029366
	 gain: 3.645510236414553, 0.02964837638358948
Results for threshold 10
[200]	cv_agg's rmse: 3.67761 + 0.0318197
[400]	cv_agg's rmse: 3.65625 + 0.031603
[600]	cv_agg's rmse: 3.6489 + 0.0310936
[800]	cv_agg's rmse: 3.6455 + 0.0310799
[1000]	cv_agg's

	 SPLIT: 3.6431540070242816, 0.027560609540072813
[200]	cv_agg's rmse: 3.67828 + 0.0315448
[400]	cv_agg's rmse: 3.65675 + 0.0307225
[600]	cv_agg's rmse: 3.64904 + 0.0305539
[800]	cv_agg's rmse: 3.64635 + 0.0300093
[1000]	cv_agg's rmse: 3.64495 + 0.0299949
[1200]	cv_agg's rmse: 3.64471 + 0.0299662
[1400]	cv_agg's rmse: 3.64462 + 0.0296098
[1600]	cv_agg's rmse: 3.64441 + 0.029794
[1800]	cv_agg's rmse: 3.64449 + 0.0296406
	 gain: 3.644310870992796, 0.029709359051243868
Results for threshold 95
[200]	cv_agg's rmse: 3.6761 + 0.0314028
[400]	cv_agg's rmse: 3.65551 + 0.0307413
[600]	cv_agg's rmse: 3.64871 + 0.0302713
[800]	cv_agg's rmse: 3.64601 + 0.0300499
[1000]	cv_agg's rmse: 3.64478 + 0.0298754
[1200]	cv_agg's rmse: 3.64428 + 0.0297237
[1400]	cv_agg's rmse: 3.64404 + 0.0299656
[1600]	cv_agg's rmse: 3.64359 + 0.0297778
[1800]	cv_agg's rmse: 3.64346 + 0.0295627
[2000]	cv_agg's rmse: 3.64352 + 0.0291541
[2200]	cv_agg's rmse: 3.64365 + 0.0290314
	 SPLIT: 3.6434012629855728, 0.0294078053022882

In [20]:
# Peek at the first rows of the percentage-based score table.
corr_scores_df.head()

Unnamed: 0,feature,split_score,gain_score
0,feature_1,100.0,100.0
1,feature_2,100.0,100.0
2,feature_3,95.0,99.0
3,ref_anchor_diff,100.0,100.0
4,active_ref_diff,100.0,100.0
