# Permutation Importance

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import datetime
import gc
import logging
import os
from collections import Counter, defaultdict

import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
# File that collects the log records of every run of this study.
LOG_NAME = 'logs/Permutation_importances.log'

# basicConfig opens the log file right away and fails with FileNotFoundError
# when its parent directory is missing (e.g. on a fresh checkout), so make
# sure the directory exists before configuring logging.
os.makedirs(os.path.dirname(LOG_NAME), exist_ok=True)

# WARNING is the threshold, so every message of the study is emitted with
# logging.warning(); each record is prefixed with its timestamp.
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
# An empty record followed by a banner separates this run from earlier ones.
logging.warning("")
logging.warning("##### New Permutation Importance Study #####")

### Data

In [3]:
# Previously saved feature-importance table; its `feature` column supplies
# the list of model inputs used in the rest of the notebook.
df_imp = pd.read_csv('docs/ft_importances_20190831.csv')

In [4]:
# Number of features to study: every row of the importance table.
n = len(df_imp)
logging.warning("Studying first %s most important features", n)

In [5]:
# Names of the first `n` features of the importance table, in file order.
X_cols = df_imp['feature'].iloc[:n].tolist()

In [6]:
# Sanity check: how many feature columns were selected.
len(X_cols)

583

In [7]:
# Directory (relative to the notebook) that holds the engineered train/test files.
data_folder = 'input'

In [8]:
# Read only the columns that are needed: the selected features, plus the
# target (`isFraud`) and `TransactionDT` for the training file.
# NOTE(review): `test` and `TransactionDT` are not referenced by any other
# visible cell - confirm they are still needed before paying for the load.
train = pd.read_csv(
    '{}/train_ft_eng_1.zip'.format(data_folder),
    usecols=X_cols + ['isFraud', 'TransactionDT'],
    dtype=schema_ft_eng_1,
)
test = pd.read_csv(
    '{}/test_ft_eng_1.zip'.format(data_folder),
    usecols=X_cols,
    dtype=schema_ft_eng_1,
)

In [9]:
# Row position that separates the first 80% of `train` from the last 20%.
train_split = int(0.8 * train.shape[0])
train_split

472432

In [10]:
# Positional split: rows before `train_split` fit the model, the remaining
# rows are held out for the permutation-importance scoring.
# NOTE(review): this is only a chronological hold-out if the file is already
# ordered by time; `TransactionDT` is loaded but never used to sort - confirm.
X_train = train[X_cols].iloc[:train_split]
X_test = train[X_cols].iloc[train_split:]

y_train = train['isFraud'].iloc[:train_split]
y_test = train['isFraud'].iloc[train_split:]

### Model

#### RandomForest

In [11]:
# Keep the exact feature list of this run in the log.
logging.warning("Used columns: %s", X_cols)

In [12]:
# Record which estimator the importances are computed for.
logging.warning("Model used to PermitationImportance: %s", 'Random Forest')

In [13]:
# Random forest whose predictions are probed by the permutation study:
# 400 trees capped at depth 11, built on all available cores with a fixed
# seed; oob_score=True additionally asks for the out-of-bag estimate.
model_rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=11,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

In [14]:
# Median of every column that is not listed in `cat_ft`, computed on the
# training rows only; used below to fill missing values before fitting
# and scoring.
# NOTE(review): `cat_ft` is not defined in this notebook - presumably it
# comes from one of the `utils` star imports; confirm.
fill_num = {
    col: X_train[col].median()
    for col in X_train.columns
    if col not in cat_ft
}
fill_num

{'R1': 0.9165,
 'R12': 34245.0,
 'TransactionAmt': 68.94,
 'N6': 36.0,
 'C1': 1.0,
 'N1': 1404.0,
 'N5': 4891.0,
 'R4': 0.15099017413413526,
 'R11': 0.782,
 'R3': 0.4128,
 'R7': 1.9274553571428568,
 'N3': 136.0,
 'R29': 33048.0,
 'C13': 2.0,
 'R8': 0.5054,
 'R28': 0.0,
 'R9': 0.839,
 'C11': 1.0,
 'R17': 0.03793,
 'D2': 97.0,
 'C14': 1.0,
 'R13': 0.5083,
 'D1': 1.0,
 'R5': 0.0052,
 'C6': 1.0,
 'Transaction_hour': 16.0,
 'dist1': 9.0,
 'C2': 1.0,
 'R21': 0.01839,
 'R2': 0.689,
 'N2': 38887.0,
 'V307': 0.0,
 'id_02': 124418.0,
 'R19': 0.005222,
 'N4': 141694.0,
 'Transaction_day_of_week': 3.0,
 'V258': 1.0,
 'V257': 1.0,
 'C12': 0.0,
 'C9': 1.0,
 'V317': 0.0,
 'V45': 1.0,
 'V308': 0.0,
 'R20': 0.168,
 'V315': 0.0,
 'V310': 0.0,
 'V53': 1.0,
 'C8': 0.0,
 'V313': 0.0,
 'V201': 1.0,
 'R18': 0.1054,
 'C4': 0.0,
 'V314': 0.0,
 'V283': 1.0,
 'V282': 1.0,
 'V285': 0.0,
 'V200': 1.0,
 'C10': 0.0,
 'V83': 1.0,
 'R14': 0.3992,
 'R27': 0.0187,
 'V189': 1.0,
 'V294': 0.0,
 'R22': 0.1268,
 'C7': 0.0,


In [15]:
# Fit on the training rows after filling missing values with the
# training medians collected in `fill_num`.
model_rf.fit(
    X_train.fillna(value=fill_num),
    y_train,
)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=11, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [16]:
# Keep the complete hyper-parameter set of the fitted model in the log.
logging.warning("Params: " + str(model_rf.get_params()))

### Permutation Importance

In [17]:
# Permutation importance of the already-fitted forest, measured on the
# held-out rows (filled with the same training medians used for fitting).
# random_state pins the column shuffles: without it every run produces
# different weights even though the forest itself is seeded.
# NOTE(review): with `scoring` left unset eli5 falls back to model_rf.score,
# i.e. plain accuracy for a classifier; if `isFraud` is heavily imbalanced a
# ranking metric such as scoring='roc_auc' may be more informative - confirm.
perm_rf = PermutationImportance(model_rf, random_state=42).fit(X_test.fillna(fill_num), y_test)

In [18]:
# Render the permutation weights for all `n` studied features, labelled
# with the training column names.
eli5.show_weights(
    perm_rf,
    top=n,
    feature_names=X_train.columns.to_list(),
)

Weight,Feature
0.0013  ± 0.0001,R1
0.0004  ± 0.0001,C1
0.0002  ± 0.0000,C13
0.0002  ± 0.0000,V258
0.0002  ± 0.0000,C14
0.0002  ± 0.0000,V201
0.0001  ± 0.0000,C8
0.0001  ± 0.0000,V199
0.0001  ± 0.0000,V274
0.0001  ± 0.0000,C7


In [19]:
# Store the raw importance vector (same order as the model's columns) in the log.
logging.warning("Importances: %s", perm_rf.feature_importances_)

In [20]:
# Pair every feature name with its permutation importance and rank the
# table from most to least important.
df_perm = (
    pd.DataFrame({
        'feature': X_train.columns,
        'Importance': perm_rf.feature_importances_,
    })
    .sort_values(by='Importance', ascending=False)
)
df_perm.head(n=20)

Unnamed: 0,feature,Importance
0,R1,0.001278
5,C1,0.000373
16,C13,0.00024
51,V258,0.000227
24,C14,0.000185
73,V201,0.000159
70,C8,0.000137
149,V199,0.000134
170,V274,0.000113
95,C7,0.000108


In [21]:
# Prefix the output file with today's date (YYYYMMDD) so that results of
# different days do not overwrite each other.
today = datetime.date.today()
D = '{:%Y%m%d}'.format(today)

file_name = 'docs/' + D + '_PermitationImportance_Rf3.csv'

In [22]:
# Persist the ranked importances with a header row and without the row index.
# `index` is documented as a boolean; False states the intent explicitly
# instead of relying on None being treated as falsy.
df_perm.to_csv(file_name, header=True, index=False)

In [23]:
# Mark the end of this run in the log file.
logging.warning("End")