# Permutation Importance

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import datetime
import gc
import logging
import os
from collections import Counter, defaultdict

import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
# Log file for this study.
LOG_NAME = 'logs/Permutation_importances.log'

# Create the log directory up front: logging.basicConfig opens the file
# immediately and raises FileNotFoundError when the directory is missing
# (e.g. on a fresh checkout), which would break Restart & Run All.
log_dir = os.path.dirname(LOG_NAME)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# The notebook logs everything through logging.warning(...), so WARNING is the
# threshold that lets those records reach the file.
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
logging.warning("")  # empty record acts as a visual separator between runs
logging.warning("##### New Permutation Importance Study #####")

### Data

In [3]:
# Load the feature-importance table from docs/; its row count and `feature`
# column are read in the following cells.
df_imp = pd.read_csv('docs/ft_importances_20190831.csv')

In [4]:
# Number of rows in the importance table, i.e. how many features are listed.
n = len(df_imp)
# Record the size of the feature set under study in the log file.
logging.warning(
    "Studying first {0} most important features".format(n)
)

In [3]:
# Log marker for the synthetic-feature variant of the study (the *_syn_* files
# are the ones loaded further down).
logging.warning("Studying synthetic most important features")

In [5]:
# Names of the first `n` features listed in the importance table.
# NOTE(review): X_cols is rebuilt from train.columns in a later cell.
X_cols = df_imp['feature'].iloc[:n].tolist()

In [6]:
# Sanity check: number of feature names taken from the importance table.
len(X_cols)

583

In [5]:
# Directory (relative to the notebook) that holds the train/test archives.
data_folder = 'input'

In [8]:
# Alternative input, currently disabled: the ft_eng_1 files restricted to
# X_cols (+ isFraud / TransactionDT for train).
# train = pd.read_csv(data_folder+'/train_ft_eng_1.zip', dtype = schema_ft_eng_1, usecols=X_cols+['isFraud', 'TransactionDT'])
# test = pd.read_csv(data_folder+'/test_ft_eng_1.zip', dtype = schema_ft_eng_1, usecols=X_cols)
# Active input: the synthetic ft_eng_0 files, read in full with the dtype
# mapping schema_synthetic_ft_eng_0 (presumably provided by utils.schemas — verify).
train = pd.read_csv(data_folder+'/train_syn_ft_eng_0.zip', dtype = schema_synthetic_ft_eng_0)
test = pd.read_csv(data_folder+'/test_syn_ft_eng_0.zip', dtype = schema_synthetic_ft_eng_0)

In [9]:
# Quick inspection: every training column whose name contains an uppercase 'C'.
[x for x in train.columns if 'C' in x]

['ProductCD',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'ProductCD_fe1',
 'ProductCD_fe2']

In [10]:
# Wider exclusion list: isFraud, TransactionDT, the D1-D15 and C1-C14 columns,
# plus addr1/addr2/card1.
# NOTE(review): the next cell reassigns drop_cols, so in top-to-bottom order
# this list is overwritten before X_cols is built.
drop_cols = ['isFraud', 'TransactionDT', 'D1','D10','D11','D12','D13','D14','D15','D2','D3','D4','D5','D6',
             'D7','D8','D9','C1','C10','C11','C12','C13','C14','C2','C3','C4','C5','C6','C7','C8','C9',
            'addr1', 'addr2', 'card1']

In [18]:
# Columns kept out of the model inputs: the isFraud label (used as y below)
# and three Transaction* columns that are time-related by name.
drop_cols = ['isFraud', 'TransactionDT', 'Transaction_day_of_week', 'Transaction_hour']

In [19]:
# Model inputs: every training column not listed in drop_cols, in the
# original column order.
X_cols = train.columns[~train.columns.isin(drop_cols)].tolist()

In [20]:
# Sanity check: number of input columns left after removing drop_cols.
len(X_cols)

712

In [29]:
# Row position that separates the first 80% of `train` from the rest
# (truncated to an integer).
train_split = int(0.8 * train.shape[0])
train_split

472432

In [30]:
# Positional hold-out split of `train`: rows before train_split are used for
# fitting, rows from train_split onward for evaluation.
# NOTE(review): the following cell reassigns all four names — confirm which
# of the two splits is the intended one.
X_train = train[X_cols].iloc[:train_split]
y_train = train['isFraud'].iloc[:train_split]

X_test = train[X_cols].iloc[train_split:]
y_test = train['isFraud'].iloc[train_split:]

In [22]:
# Fit on the whole training file and evaluate on the separate test file.
# Both frames must contain every column in X_cols and an `isFraud` column.
X_train = train.loc[:, X_cols]
y_train = train['isFraud']

X_test = test.loc[:, X_cols]
y_test = test['isFraud']

### Model

#### RandomForest

In [24]:
# Record the exact list of input columns used for this run.
logging.warning("Used columns: {}".format(X_cols))

In [25]:
# Record which estimator the permutation importance is computed for.
# NOTE(review): "Permitation" in the message is a misspelling of "Permutation";
# it is left untouched here because it is part of the emitted log text.
logging.warning("Model used to PermitationImportance: {}".format('Random Forest'))

In [26]:
# Random forest whose features are ranked by permutation importance below.
# The seed is fixed so the fitted forest is reproducible.
model_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

In [27]:
# Per-column fill values for missing data: the training median of every
# column that is not listed in cat_ft. Columns in cat_ft get no entry.
fill_num = {
    col: X_train[col].median()
    for col in X_train.columns
    if col not in cat_ft
}
fill_num

{'TransactionAmt': 68.94,
 'dist1': 9.0,
 'dist2': 30.0,
 'C1': 1.0,
 'C2': 1.0,
 'C3': 0.0,
 'C4': 0.0,
 'C5': 0.0,
 'C6': 1.0,
 'C7': 0.0,
 'C8': 0.0,
 'C9': 1.0,
 'C10': 0.0,
 'C11': 1.0,
 'C12': 0.0,
 'C13': 2.0,
 'C14': 1.0,
 'D1': 0.0,
 'D2': 99.0,
 'D3': 8.0,
 'D4': 20.0,
 'D5': 9.0,
 'D6': 0.0,
 'D7': 0.0,
 'D8': 47.12,
 'D10': 11.0,
 'D11': 26.0,
 'D12': 0.0,
 'D13': 0.0,
 'D14': 0.0,
 'D15': 44.0,
 'V1': 1.0,
 'V2': 1.0,
 'V3': 1.0,
 'V4': 1.0,
 'V5': 1.0,
 'V6': 1.0,
 'V7': 1.0,
 'V8': 1.0,
 'V9': 1.0,
 'V10': 0.0,
 'V11': 0.0,
 'V12': 1.0,
 'V13': 1.0,
 'V14': 1.0,
 'V15': 0.0,
 'V16': 0.0,
 'V17': 0.0,
 'V18': 0.0,
 'V19': 1.0,
 'V20': 1.0,
 'V21': 0.0,
 'V22': 0.0,
 'V23': 1.0,
 'V24': 1.0,
 'V25': 1.0,
 'V26': 1.0,
 'V27': 0.0,
 'V28': 0.0,
 'V29': 0.0,
 'V30': 0.0,
 'V31': 0.0,
 'V32': 0.0,
 'V33': 0.0,
 'V34': 0.0,
 'V35': 0.0,
 'V36': 0.0,
 'V37': 1.0,
 'V38': 1.0,
 'V39': 0.0,
 'V40': 0.0,
 'V41': 1.0,
 'V42': 0.0,
 'V43': 0.0,
 'V44': 1.0,
 'V45': 1.0,
 'V46': 1.0,


In [28]:
# Fit the forest on the training features, with missing values replaced per
# column from fill_num. Columns without an entry in fill_num (those in cat_ft)
# are left unchanged by fillna.
model_rf.fit(X_train.fillna(fill_num), y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [29]:
# Record the full hyper-parameter set of the fitted model in the log.
logging.warning("Params: {}".format(str(model_rf.get_params())))

### Permutation Importance

In [30]:
# Permutation importance of the already-fitted forest, measured on the
# evaluation set (missing values filled with the training medians in fill_num).
# random_state is fixed because the column shuffling is random: without a seed
# the importances change from run to run and the saved CSV is not reproducible.
# NOTE(review): with `scoring` omitted, eli5 falls back to the estimator's own
# score method (accuracy for a classifier) — confirm that is the metric wanted.
perm_rf = PermutationImportance(model_rf, random_state=42).fit(X_test.fillna(fill_num), y_test)

In [34]:
# Render the permutation weights (up to the 500 highest-ranked features),
# labelled with the training column names.
eli5.show_weights(
    perm_rf,
    top=500,
    feature_names=list(X_train.columns),
)

Weight,Feature
0.0009  ± 0.0001,R1
0.0003  ± 0.0000,V258
0.0003  ± 0.0000,C7
0.0003  ± 0.0000,V45
0.0003  ± 0.0000,C1
0.0003  ± 0.0000,C12
0.0002  ± 0.0000,C8
0.0002  ± 0.0000,C4
0.0002  ± 0.0000,V257
0.0002  ± 0.0000,V201


In [35]:
# Record the raw importance array (one value per input column, in X_train
# column order) in the log.
logging.warning("Importances: {}".format(str(perm_rf.feature_importances_)))

In [36]:
# Pair each input column with its permutation importance and rank them from
# most to least important; show the top 20.
df_perm = (
    pd.DataFrame({
        'feature': X_train.columns,
        'Importance': perm_rf.feature_importances_,
    })
    .sort_values(by='Importance', ascending=False)
)
df_perm.head(20)

Unnamed: 0,feature,Importance
639,R1,0.000858
308,V258,0.000271
20,C7,0.000269
95,V45,0.000261
14,C1,0.000257
25,C12,0.000251
21,C8,0.000215
17,C4,0.000206
307,V257,0.000199
251,V201,0.000193


In [37]:
# Build the output path, prefixed with today's date as YYYYMMDD.
today = datetime.date.today()
D = format(today, '%Y%m%d')

file_name = 'docs/{0}_PermitationImportance_Rf5.csv'.format(D)

In [38]:
# Save the ranked importances with a header row and without the index column.
df_perm.to_csv(file_name, index=False)

In [39]:
# Closing marker for this study in the log file.
logging.warning("End")