# Permutation Importance

### Libraries

In [25]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from eli5.sklearn import PermutationImportance
import lightgbm as lgb
import eli5
import datetime
import os

### Logging

In [4]:
if not os.path.exists('logs'):
    os.makedirs('logs')

LOG_NAME = 'logs/Permutation_importances.log'
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
logging.warning("")
logging.warning("##### New Permutation Importance Study #####")

### Data

In [4]:
logging.warning("Studying most important features")

In [5]:
train = pd.read_csv('data/reduce_train.csv')

In [10]:
_target = 'accuracy_group'
_id = 'installation_id'

In [7]:
for i, j in zip(train.columns, train.iloc[0,:]):
    print(i, j)

Clip 11
Activity 3
Assessment 0
Game 4
acc_Mushroom Sorter (Assessment) -1.0
acc_Cauldron Filler (Assessment) -1.0
acc_Chest Sorter (Assessment) -1.0
acc_Bird Measurer (Assessment) -1.0
acc_Cart Balancer (Assessment) -1.0
2050 6
4100 0
4230 0
5000 0
4235 0
2060 0
4110 0
5010 0
2070 0
2075 0
2080 4
2081 1
2083 2
3110 77
4010 4
3120 7
3121 9
4020 92
4021 14
4022 31
4025 19
4030 121
4031 0
3010 79
4035 1
4040 0
3020 7
3021 9
4045 0
2000 18
4050 0
2010 0
2020 20
4070 94
2025 4
2030 18
4080 0
2035 0
2040 6
4090 4
4220 0
4095 0
16dffff1 0
0a08139c 0
3babcb9b 0
48349b14 0
73757a5e 6
160654fd 0
ab3136ba 0
a8a78786 0
46b50ba8 0
cf82af56 14
907a054b 0
28a4eb9a 0
a7640a16 0
5e812b27 21
6088b756 0
ac92046e 3
736f9581 0
1cc7cfca 27
d185d3ea 0
1bb5fbdb 19
e7e44842 0
bd612267 0
d3f1e122 0
4bb2f698 0
8d84fa81 0
3d0b9317 0
d122731b 0
3bf1cf26 0
93b353f2 0
d2278a3b 0
832735e1 0
83c6c409 0
795e4a37 0
b120f2ac 4
3323d7e9 0
d38c2fd7 0
3b2048ee 0
37c53127 6
f28c589a 0
f806dc10 0
dcb55a27 0
9ce586dd 0
15a43e

In [11]:
select_cols = [x for x in train.columns if x not in [_target, _id]]

#### RandomForest

In [12]:
logging.warning("Model used to PermitationImportance: {}".format('Random Forest'))

In [20]:
model_rf = RandomForestClassifier(n_estimators=200, oob_score=True, n_jobs=-1,
                                  random_state=42, max_depth=5, criterion='entropy')

In [26]:
X = train[select_cols]
y = train[_target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
model_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [28]:
logging.warning("Params: {}".format(str(model_rf.get_params())))

### Permitation Importance

In [29]:
perm_rf = PermutationImportance(model_rf).fit(X_test, y_test)

In [30]:
# eli5.show_weights(perm_rf,feature_names=X_train.columns.tolist(),
#                   top=500)

In [31]:
logging.warning("Importances: {}".format(str(perm_rf.feature_importances_)))

In [32]:
df_perm = pd.DataFrame({'feature': X_train.columns, 'Importance': perm_rf.feature_importances_}).sort_values('Importance', ascending=False)
df_perm.head(20)

Unnamed: 0,feature,Importance
973,session_title,0.021406
362,3393b68b,0.002261
991,accumulated_accuracy_group,0.00147
274,7525289a,0.001394
903,Chest Sorter (Assessment)_3121,0.001319
295,45d01abe,0.001244
910,Crystal Caves - Level 3_2000,0.001093
531,Crystal Caves - Level 3,0.00098
697,Bird Measurer (Assessment)_4100,0.000942
986,accumulated_accuracy,0.000942


In [33]:
today = datetime.date.today()
D = today.strftime('%Y%m%d')

file_name = 'importances/{0}_PermutationImportance_RF.csv'.format(D)

In [34]:
df_perm.to_csv(file_name, header=True, index=None)

In [35]:
logging.warning("End")

In [40]:
df_perm[df_perm.Importance > 0].shape

(168, 2)

In [41]:
df_perm[df_perm.Importance > 0]['feature'].tolist()

['session_title',
 '3393b68b',
 'accumulated_accuracy_group',
 '7525289a',
 'Chest Sorter (Assessment)_3121',
 '45d01abe',
 'Crystal Caves - Level 3_2000',
 'Crystal Caves - Level 3',
 'Bird Measurer (Assessment)_4100',
 'accumulated_accuracy',
 '8fee50e2',
 'acc_Chest Sorter (Assessment)',
 'Bird Measurer (Assessment)_3021',
 '222660ff',
 'e4f1efe6',
 'acc_Bird Measurer (Assessment)',
 'Bird Measurer (Assessment)_2010',
 '9c5ef70c',
 'f6947f54',
 '3020',
 'Pan Balance_3120',
 'Cart Balancer (Assessment)_4100',
 'a592d54e',
 'Chest Sorter (Assessment)_4030',
 '0db6d71d',
 '3020_g',
 '3120',
 '070a5291',
 'Pan Balance',
 'Welcome to Lost Lagoon!',
 '38074c54',
 '363d3849',
 '12 Monkeys',
 'Crystal Caves - Level 1_2000',
 '160654fd',
 '1c178d24',
 'Chest Sorter (Assessment)_2030',
 'Chest Sorter (Assessment)_2010',
 'Cauldron Filler (Assessment)_2000',
 '2050',
 'Leaf Leader_2060',
 '4090',
 'Egg Dropper (Activity)_2000',
 'Mushroom Sorter (Assessment)_2010',
 '15f99afc',
 '3afb49e6',
 '