# Permutation Importance

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import logging
from sklearn.ensemble import RandomForestClassifier
from eli5.sklearn import PermutationImportance
import eli5
import datetime

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
# Send this notebook's messages to a dedicated log file. Everything is logged at
# WARNING level, which is also the threshold configured here.
LOG_NAME = 'logs/Permutation_importances.log'
logging.basicConfig(
    filename=LOG_NAME,
    level=logging.WARNING,
    format='%(asctime)s %(message)s',
)
# An empty line followed by a banner marks the start of a new run in the log.
for banner in ("", "##### New Permutation Importance Study #####"):
    logging.warning(banner)

### Data

In [3]:
# Load a previously saved feature-importance table; its `feature` column is read in a later cell.
df_imp = pd.read_csv('docs/ft_importances_20190831.csv')

In [4]:
# Row count of the importance table, written to the log as the number of features studied.
# NOTE(review): X_cols is rebuilt from train.columns in a later cell, so this count may
# not match the features that are actually modelled - confirm the log message is still accurate.
n = len(df_imp)
logging.warning("Studying first {0} most important features".format(n))

In [5]:
# Names of the first n rows of the importance table.
# NOTE(review): this list is replaced when X_cols is rebuilt from train.columns further down.
X_cols = df_imp['feature'].iloc[:n].tolist()

In [6]:
# Sanity check: number of feature names taken from df_imp.
len(X_cols)

583

In [7]:
# Directory (relative to the working directory) that holds the train/test CSV files.
data_folder = 'input'

In [8]:
# Load the "ft_eng_2" train/test CSVs using the dtypes declared in schema_ft_eng_2.
# The two commented-out lines below are the earlier "ft_eng_1" variant, which restricted
# loading to X_cols (plus isFraud and TransactionDT for train).
# NOTE(review): `test` is not referenced by any later cell shown here - confirm it is needed.
# train = pd.read_csv(data_folder+'/train_ft_eng_1.zip', dtype = schema_ft_eng_1, usecols=X_cols+['isFraud', 'TransactionDT'])
# test = pd.read_csv(data_folder+'/test_ft_eng_1.zip', dtype = schema_ft_eng_1, usecols=X_cols)
train = pd.read_csv(data_folder+'/train_ft_eng_2.csv', dtype = schema_ft_eng_2)
test = pd.read_csv(data_folder+'/test_ft_eng_2.csv', dtype = schema_ft_eng_2)

In [13]:
# List every column whose name contains a capital "C". This is a plain substring match,
# so names such as ProductCD are included alongside the C1..C14 columns.
list(filter(lambda col: 'C' in col, train.columns))

['C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'ProductCD',
 'ProductCD_fe1',
 'ProductCD_fe2']

In [25]:
# Columns kept out of the feature matrix: the label (isFraud), TransactionDT, the D1..D15
# and C1..C14 families, and addr1/addr2/card1. Sorting the generated names as strings
# gives the order D1, D10, ..., D15, D2, ..., D9 (and likewise for the C columns).
drop_cols = (
    ['isFraud', 'TransactionDT']
    + sorted('D{}'.format(i) for i in range(1, 16))
    + sorted('C{}'.format(i) for i in range(1, 15))
    + ['addr1', 'addr2', 'card1']
)

In [26]:
# Feature list = every column of train that is not in drop_cols, in the original column order.
# This replaces the X_cols list that was derived from df_imp earlier.
X_cols = train.columns[~train.columns.isin(drop_cols)].tolist()

In [27]:
# Number of columns kept as model features after removing drop_cols.
len(X_cols)

622

In [29]:
# Row position separating the first 80% of `train` (used for fitting) from the last 20%.
train_split = int(0.8 * train.shape[0])
train_split

472432

In [30]:
# Positional (unshuffled) split of `train`: rows before train_split are used for fitting,
# the remaining rows form the hold-out set on which permutation importance is measured.
X_train = train.iloc[:train_split][X_cols]
y_train = train['isFraud'].iloc[:train_split]

X_test = train.iloc[train_split:][X_cols]
y_test = train['isFraud'].iloc[train_split:]

### Model

#### RandomForest

In [31]:
# Record the exact feature list used for this run in the log file.
logging.warning("Used columns: {}".format(X_cols))

In [32]:
# Record which estimator the permutation importances are computed for.
logging.warning("Model used to PermitationImportance: {}".format('Random Forest'))

In [33]:
# Random forest whose permutation importances are studied below: 400 trees of depth <= 11,
# seeded for reproducibility, trained on all available cores (n_jobs=-1).
# NOTE(review): oob_score=True is requested, but no later cell shown here reads the OOB score.
model_rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=11,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

In [34]:
# Per-column fill values for missing data: the training-set median of every column that is
# not listed in cat_ft. Medians come from X_train only, so the hold-out rows do not
# influence the imputation values.
fill_num = {
    col: X_train[col].median()
    for col in X_train.columns
    if col not in cat_ft
}
fill_num

{'TransactionAmt': 68.94,
 'V1': 1.0,
 'V10': 0.0,
 'V100': 0.0,
 'V101': 0.0,
 'V102': 0.0,
 'V103': 0.0,
 'V104': 0.0,
 'V105': 0.0,
 'V106': 0.0,
 'V107': 1.0,
 'V108': 1.0,
 'V109': 1.0,
 'V11': 0.0,
 'V110': 1.0,
 'V111': 1.0,
 'V112': 1.0,
 'V113': 1.0,
 'V114': 1.0,
 'V115': 1.0,
 'V116': 1.0,
 'V117': 1.0,
 'V118': 1.0,
 'V119': 1.0,
 'V12': 1.0,
 'V120': 1.0,
 'V121': 1.0,
 'V122': 1.0,
 'V123': 1.0,
 'V124': 1.0,
 'V125': 1.0,
 'V126': 0.0,
 'V127': 0.0,
 'V128': 0.0,
 'V129': 0.0,
 'V13': 1.0,
 'V130': 0.0,
 'V131': 0.0,
 'V132': 0.0,
 'V133': 0.0,
 'V134': 0.0,
 'V135': 0.0,
 'V136': 0.0,
 'V137': 0.0,
 'V138': 0.0,
 'V139': 1.0,
 'V14': 1.0,
 'V140': 1.0,
 'V141': 0.0,
 'V142': 0.0,
 'V143': 0.0,
 'V144': 0.0,
 'V145': 0.0,
 'V146': 0.0,
 'V147': 0.0,
 'V148': 1.0,
 'V149': 1.0,
 'V15': 0.0,
 'V150': 1.0,
 'V151': 1.0,
 'V152': 1.0,
 'V153': 1.0,
 'V154': 1.0,
 'V155': 1.0,
 'V156': 1.0,
 'V157': 1.0,
 'V158': 1.0,
 'V159': 0.0,
 'V16': 0.0,
 'V160': 0.0,
 'V161': 0.0,
 'V

In [35]:
# Fit on the training rows. NaNs in columns that have an entry in fill_num are replaced by
# that column's training-set median; columns in cat_ft have no entry, so fillna leaves them as-is.
model_rf.fit(X_train.fillna(fill_num), y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=11, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [36]:
# Log the full hyper-parameter set returned by the fitted model's get_params().
logging.warning("Params: {}".format(str(model_rf.get_params())))

### Permutation Importance

In [37]:
# Measure permutation importance of the already-fitted forest on the hold-out rows,
# imputing missing values with the same training-set medians used for fitting.
# random_state is fixed so the column shuffles (and therefore the reported importances)
# are reproducible across runs, matching the seed used for the forest itself.
# NOTE(review): no `scoring` is passed, so the estimator's default score is used; if isFraud
# is heavily imbalanced that compresses the importances toward zero (the weights shown
# below are around 1e-4) - consider scoring='roc_auc' after confirming the class balance.
perm_rf = PermutationImportance(model_rf, random_state=42).fit(X_test.fillna(fill_num), y_test)

In [38]:
# Render the permutation-importance table with readable feature names.
# `top` is tied to the current number of features instead of `n`: `n` is the row count of
# df_imp (an earlier feature list) and stopped matching X_train once X_cols was rebuilt
# from train.columns, so using it silently truncated the table.
eli5.show_weights(perm_rf, feature_names=X_train.columns.tolist(),
                  top=X_train.shape[1])

Weight,Feature
0.0011  ± 0.0001,R1
0.0002  ± 0.0000,V258
0.0001  ± 0.0000,V201
0.0001  ± 0.0000,V257
0.0001  ± 0.0000,R71
0.0001  ± 0.0000,V200
0.0001  ± 0.0000,R43
0.0001  ± 0.0000,V199
0.0001  ± 0.0000,V189
0.0001  ± 0.0000,R68


In [39]:
# Write the raw importance array to the log; a later cell pairs it with X_train.columns.
logging.warning("Importances: {}".format(str(perm_rf.feature_importances_)))

In [40]:
# Pair each feature name with its permutation importance and rank from most to least important.
df_perm = (
    pd.DataFrame({
        'feature': X_train.columns,
        'Importance': perm_rf.feature_importances_,
    })
    .sort_values(by='Importance', ascending=False)
)
# Preview the 20 highest-ranked features.
df_perm.head(20)

Unnamed: 0,feature,Importance
549,R1,0.00107
191,V258,0.0002
129,V201,0.000139
190,V257,0.000112
619,R71,0.000108
128,V200,9e-05
591,R43,8.8e-05
125,V199,8.5e-05
114,V189,8.3e-05
616,R68,8.1e-05


In [41]:
# Build a date-stamped output path so that runs on different days do not overwrite each other.
today = datetime.date.today()
D = today.strftime('%Y%m%d')  # e.g. '20190831'

# NOTE(review): "Permitation" in the file name is a typo; it is kept as-is because changing
# the path would change where the results are written.
file_name = 'docs/{0}_PermitationImportance_Rf4.csv'.format(D)

In [42]:
# Save the ranked importances with a header row and without the DataFrame index column.
df_perm.to_csv(file_name, index=False)

In [43]:
# Closing marker for this run in the log file.
logging.warning("End")