# Permutation Importance

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import datetime
import gc
import logging
import os
from collections import Counter, defaultdict

import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
# File that collects the log entries of every permutation-importance study.
LOG_NAME = 'logs/Permutation_importances.log'
# basicConfig opens the log file immediately and fails with FileNotFoundError
# when its parent directory is missing (e.g. on a fresh checkout), so create it first.
os.makedirs(os.path.dirname(LOG_NAME), exist_ok=True)
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
# Blank entry followed by a banner to separate this run from earlier ones in the file.
logging.warning("")
logging.warning("##### New Permutation Importance Study #####")

### Data

In [3]:
# Feature-importance table; presumably produced by an earlier importance run
# (per the file name). Later cells read its `feature` column.
importances_path = 'docs/ft_importances_20190831.csv'
df_imp = pd.read_csv(importances_path)

In [4]:
# Number of leading rows of the importance table to keep as model features.
n = 300
study_msg = "Studying first {0} most important features".format(n)
logging.warning(study_msg)

In [5]:
# First n entries of the `feature` column, as a plain Python list of column names.
X_cols = df_imp['feature'].iloc[:n].tolist()

In [6]:
# Sanity check: number of selected feature columns.
len(X_cols)

300

In [7]:
# Directory that holds the engineered train/test archives read below.
data_folder = 'input'

In [8]:
# Only the selected features are read; the training file additionally needs
# the target (`isFraud`) and the `TransactionDT` column.
train_usecols = X_cols + ['isFraud', 'TransactionDT']
train = pd.read_csv(
    '{}/train_ft_eng_1.zip'.format(data_folder),
    dtype=schema_ft_eng_1,
    usecols=train_usecols,
)
test = pd.read_csv(
    '{}/test_ft_eng_1.zip'.format(data_folder),
    dtype=schema_ft_eng_1,
    usecols=X_cols,
)

In [9]:
# Row position separating the leading 80% of `train` from the trailing 20%.
train_split = int(0.8 * len(train))
train_split

472432

In [10]:
# Positional split: rows before `train_split` fit the model, the rest form the holdout
# on which permutation importance is measured.
# NOTE(review): this relies on the file's row order (presumably chronological);
# TransactionDT is loaded but not used to sort here - confirm.
X_train, y_train = train.loc[:, X_cols].iloc[:train_split], train['isFraud'].iloc[:train_split]

X_test, y_test = train.loc[:, X_cols].iloc[train_split:], train['isFraud'].iloc[train_split:]

### Model

#### RandomForest

In [11]:
# Record the exact feature list in the study log.
used_cols_msg = "Used columns: {}".format(X_cols)
logging.warning(used_cols_msg)

In [12]:
# Record which estimator family this study uses.
model_msg = "Model used to PermitationImportance: {}".format('Random Forest')
logging.warning(model_msg)

In [13]:
# Forest configuration: 400 trees, all available cores, fixed seed;
# oob_score=True additionally requests an out-of-bag score during fit.
rf_params = dict(
    n_estimators=400,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
model_rf = RandomForestClassifier(**rf_params)

In [14]:
# Missing values are replaced with the sentinel -1 before fitting.
model_rf.fit(X=X_train.fillna(value=-1), y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [15]:
# Record the full hyper-parameter dict of the fitted forest.
params_msg = "Params: {}".format(model_rf.get_params())
logging.warning(params_msg)

### Permutation Importance

In [16]:
# Permutation importance of the already-fitted forest, measured on the holdout rows
# (same -1 fill for missing values as used at fit time).
# random_state pins the column shuffles so the reported importances are reproducible
# across kernel restarts; without it every run gives slightly different numbers.
# NOTE(review): no `scoring` is passed, so eli5 falls back to the estimator's default
# score (accuracy for a classifier); if the target is heavily imbalanced, a ranking
# metric such as scoring='roc_auc' may be more informative - confirm before changing.
perm_rf = PermutationImportance(model_rf, random_state=42).fit(X_test.fillna(-1), y_test)

In [17]:
# Render the importance table, labelled with the training column names, for up to n features.
eli5.show_weights(
    perm_rf,
    feature_names=list(X_train.columns),
    top=n,
)

Weight,Feature
0.0016  ± 0.0001,R1
0.0004  ± 0.0001,C1
0.0004  ± 0.0001,C13
0.0004  ± 0.0000,C14
0.0002  ± 0.0000,TransactionAmt
0.0002  ± 0.0001,D1
0.0002  ± 0.0000,N5
0.0002  ± 0.0001,R12
0.0002  ± 0.0000,card2
0.0002  ± 0.0000,C2


In [18]:
# Record the raw importance array (same order as the training columns) in the study log.
importances_msg = "Importances: {}".format(perm_rf.feature_importances_)
logging.warning(importances_msg)

In [19]:
# Pair each training column with its permutation importance, most important first.
perm_table = pd.DataFrame({
    'feature': X_train.columns,
    'Importance': perm_rf.feature_importances_,
})
df_perm = perm_table.sort_values(by='Importance', ascending=False)
df_perm.head(20)

Unnamed: 0,feature,Importance
0,R1,0.001588
5,C1,0.000395
16,C13,0.000374
24,C14,0.000366
3,TransactionAmt,0.000193
26,D1,0.000185
7,N5,0.000176
2,R12,0.000163
8,card2,0.000159
32,C2,0.000156


In [20]:
# Date stamp (YYYYMMDD) that prefixes the output file name.
today = datetime.date.today()
D = format(today, '%Y%m%d')

# Destination CSV for the sorted importance table.
file_name = 'docs/{0}_PermitationImportance_Rf2.csv'.format(D)

In [21]:
# Persist the sorted table with a header row and without the DataFrame index.
df_perm.to_csv(file_name, header=True, index=False)

In [22]:
# Closing marker for this study's entries in the log.
logging.warning("End")