In [1]:
import sys
import warnings
if '../' not in sys.path:
    sys.path.append('../')

warnings.filterwarnings("ignore")

In [2]:
import src.utils as ut

In [3]:
conf = ut.get_conf()

# Introduction
One of the most basic questions we might ask of a model is: What features have the biggest impact on predictions?

This concept is called **feature importance**.

There are multiple ways to measure feature importance. Some approaches answer subtly different versions of the question above. Other approaches have documented shortcomings.

In this lesson, we'll focus on **permutation importance**. Compared to most other approaches, permutation importance is:

fast to calculate,
widely used and understood, and
consistent with properties we would want a feature importance measure to have.

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [6]:
data = ut.read_csv_file(conf['save_paths']['fifa_data'], 'FIFA 2018 Statistics')
y = (data['Man of the Match'] == "Yes")  # Convert from string "Yes"/"No" to binary
feature_names = [i for i in data.columns if data[i].dtype in [np.int64]]
X = data[feature_names]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
rf_model = RandomForestClassifier(n_estimators=100,
                                  random_state=0).fit(train_X, train_y)

#### Using ELI5 Framework


In [7]:
import eli5
from eli5.sklearn import PermutationImportance
eli5.show_weights(rf_model)
perm = PermutationImportance(rf_model).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())

Weight,Feature
0.1625  ± 0.0729,Goal Scored
0.0750  ± 0.0637,Distance Covered (Kms)
0.0437  ± 0.0750,On-Target
0.0375  ± 0.0468,Yellow Card
0.0312  ± 0.0559,Off-Target
0.0250  ± 0.0729,Free Kicks
0.0250  ± 0.0468,Fouls Committed
0.0125  ± 0.0637,Ball Possession %
0.0063  ± 0.0829,Offsides
0.0063  ± 0.1000,Corners
