# Permitation Importance

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import logging
from sklearn.ensemble import RandomForestClassifier
from eli5.sklearn import PermutationImportance
import lightgbm as lgb
import eli5
import datetime

### Logging

In [2]:
LOG_NAME = 'logs/Permutation_importances.log'
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
logging.warning("")
logging.warning("##### New Permutation Importance Study #####")

In [3]:
df_impt_first = pd.read_csv('docs/20190921_FeatureImportance_LGB.csv')

In [4]:
X_cols=df_impt_first.feature[:900].tolist()

In [5]:
def reduce_memory2(df):
    print("Reduce_memory...");
    dict_types = dict()
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    dict_types[col] = 'int8'
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    dict_types[col] = 'int16'
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    dict_types[col] = 'int32'
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    dict_types[col] = 'int64'
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    dict_types[col] = 'float16'
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    dict_types[col] = 'float32'
                else:
                    dict_types[col] = 'float64'
    return dict_types

### Data

In [6]:
logging.warning("Studying synthetic most important features")

In [7]:
# train = pd.read_csv('input/train_ft_eng_6.csv.gz')

In [8]:
train = pd.DataFrame()
chunksize = 10 ** 5
n = 0
for chunk in pd.read_csv('input/train_ft_eng_6.csv.gz', chunksize=chunksize, usecols=X_cols+['isFraud','month']):
    print('Reading chunk no: {}'.format(n))
    chunk = chunk[chunk.month != 4]
    train = pd.concat([train, chunk], axis=0)
    dtyp = reduce_memory2(train)
    train = train.astype(dtyp)
    n+=1

Reading chunk no: 0
Reduce_memory...
Reading chunk no: 1
Reduce_memory...
Reading chunk no: 2
Reduce_memory...
Reading chunk no: 3
Reduce_memory...
Reading chunk no: 4
Reduce_memory...
Reading chunk no: 5
Reduce_memory...


In [9]:
# train.to_csv('input/ft_eng_6_perm_imp.csv', header=True, index=None)

In [10]:
# dict_types = reduce_memory2(train)
# train = train.astype(dict_types)

In [11]:
gc.collect()

0

In [12]:
first_drop_col = ['D9','id_24','id_25','id_07','id_08','id_21','id_26','id_27','id_23','id_22','dist2']

In [13]:
drop_cols_new = ['TransactionDT', 'date', 'month','isFraud', 'date_fe1',
#                  'addr1',
#                  'addr2',
#                  'card1',
#                  'card2',
#                  'card3',
#                  'card4',
#                  'card5',
#                  'card6',
                 'card1_fe1',
                 'card2_fe1',
                 'card3_fe1',
                 'card4_fe1',
                 'card5_fe1',
                 'card6_fe1',
                 'addr1_fe1',
                 'addr2_fe1',
                 'M1_fe1',
                 'M2_fe1',
                 'M3_fe1',
                 'M4_fe1',
                 'M5_fe1',
                 'M6_fe1',
                 'M7_fe1',
                 'M8_fe1',
                 'M9_fe1'
                ] + first_drop_col

In [14]:
select_cols = [x for x in train.columns if x not in drop_cols_new]

In [15]:
# X = train[select_cols].replace(np.inf, np.nan)
# y = train.isFraud
# gc.collect()

In [16]:
# cut_train_1 = train[(train.month < 4) | (train.month == 12)].shape[0]
# cut_train_2 = train[(train.month > 4) & (train.month != 12)].index[0]
# cut_train_1, cut_train_2

In [17]:
X_train = train[(train.month < 4) | (train.month == 12)][select_cols].replace(np.inf, np.nan)
y_train = train[(train.month < 4) | (train.month == 12)]['isFraud']

X_test = train[(train.month > 4) & (train.month != 12)][select_cols].replace(np.inf, np.nan)
y_test = train[(train.month > 4) & (train.month != 12)]['isFraud']
gc.collect()

0

In [18]:
X_train.shape, X_test.shape

((417559, 900), (89326, 900))

In [19]:
del train
gc.collect()

0

### Model

#### LightGBM

In [20]:
logging.warning("Model used to PermitationImportance: {}".format('LightGBM'))

In [21]:
params = {
    'num_leaves': 191,
    'max_depth': 12,
    'max_leaf_nodes': 45,
    'min_sample_leaf': 20,
    'metric': ['AUC'],
    'first_metric_only': True,
    'n_estimators': 800,
    'num_threads': -1,
    'learning_rate': 0.01,
    'colsample_bytree': 0.4,
    'objective': 'xentropy',
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'importance_type': 'gain',
    'bagging_seed': 42,
    'random_state':42,
    'seed': 42,
    'feature_fraction_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
}

In [22]:
lgb_model = lgb.LGBMClassifier(**params)

In [23]:
lgb_model.fit(X_train.fillna(-1), y_train)

LGBMClassifier(bagging_fraction=0.7, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.4,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='gain',
               learning_rate=0.01, max_depth=12, max_leaf_nodes=45,
               metric=['AUC'], min_child_samples=20, min_child_weight=0.001,
               min_sample_leaf=20, min_split_gain=0.0, n_estimators=800,
               n_jobs=-1, num_leaves=191, num_threads=-1, objective='xentropy',
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, seed=42,
               silent=True, subsample=1.0, ...)

In [24]:
logging.warning("Params: {}".format(str(lgb_model.get_params())))

In [25]:
df_perm = pd.DataFrame({'feature': X_train.columns, 'Importance': lgb_model.feature_importances_}).sort_values('Importance', ascending=False)
df_perm.head(20)

Unnamed: 0,feature,Importance
25,PCA_27,357286.566106
273,C13_div_C2,133802.421958
237,C1_div_C10,108821.923609
240,C1_div_C13,86346.992113
241,C1_div_C14,81697.283666
280,C14_div_C2,65022.896727
26,C1,60518.989244
21,PCA_23,38846.463498
126,V294,36622.421739
141,V317,35946.754977


In [26]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

file_name = 'docs/{0}_FeatureImportance_LGB_v2.csv'.format(D)
df_perm.to_csv(file_name, header=True, index=None)

#### RandomForest

In [23]:
# logging.warning("Used columns: {}".format(X_cols))

In [24]:
logging.warning("Model used to PermitationImportance: {}".format('Random Forest'))

In [25]:
model_rf = RandomForestClassifier(n_estimators=200, oob_score=True, n_jobs=-1, random_state=42, max_depth=7)

In [26]:
fill_num = dict()
for c in X_train.columns:
    fill_num[c] = X_train[c].median()

In [27]:
model_rf.fit(X_train.fillna(fill_num), y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [28]:
logging.warning("Params: {}".format(str(model_rf.get_params())))

### Permitation Importance

In [None]:
perm_rf = PermutationImportance(lgb_model).fit(X_test.fillna(-1), y_test)

In [None]:
# eli5.show_weights(perm_rf,feature_names=X_train.columns.tolist(),
#                   top=500)

In [None]:
logging.warning("Importances: {}".format(str(perm_rf.feature_importances_)))

In [None]:
df_perm = pd.DataFrame({'feature': X_train.columns, 'Importance': perm_rf.feature_importances_}).sort_values('Importance', ascending=False)
df_perm.head(20)

In [None]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

file_name = 'docs/{0}_PermitationImportance_LGB.csv'.format(D)

In [None]:
df_perm.to_csv(file_name, header=True, index=None)

In [None]:
logging.warning("End")