# Différentes Méthodes pour calculer l'importance de features

### -1) import and functions

In [269]:
from sklearn.tree import _tree
from scipy import stats
import numpy as np
from sklearn import datasets
from collections import OrderedDict

In [270]:
def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print "def tree({}):".format(", ".join(feature_names))
    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print "{}if {} <= {}:".format(indent, name, threshold)
            recurse(tree_.children_left[node], depth + 1)
            print "{}else:  # if {} > {}".format(indent, name, threshold)
            recurse(tree_.children_right[node], depth + 1)
        else:
            print "{}return {}".format(indent, tree_.value[node])

    recurse(0, 1)

### 0) loading data

In [271]:
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

In [272]:
r = OrderedDict()

### 1) importance des features

### 1.1) régression linéaire

In [273]:
from sklearn.linear_model import LinearRegression
algorithme = LinearRegression()
model = algorithme.fit(X, y)
r["sk LinearRegression.coef"] = model.coef_
model.coef_

array([ -10.01219782, -239.81908937,  519.83978679,  324.39042769,
       -792.18416163,  476.74583782,  101.04457032,  177.06417623,
        751.27932109,   67.62538639])

### avec stats model : pvalues

In [286]:
import statsmodels.api as sm

In [287]:
intercept_x = sm.add_constant(X)
estimateur  = sm.OLS(y, intercept_x)
modele      = estimateur.fit()
print(modele.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.518
Model:                            OLS   Adj. R-squared:                  0.507
Method:                 Least Squares   F-statistic:                     46.27
Date:                Wed, 11 Apr 2018   Prob (F-statistic):           3.83e-62
Time:                        07:38:02   Log-Likelihood:                -2386.0
No. Observations:                 442   AIC:                             4794.
Df Residuals:                     431   BIC:                             4839.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const        152.1335      2.576     59.061      0.0

In [288]:
r["sm OLS_coef"] = modele.params[1:]

In [289]:
r["sm OLS_pvalues"] = modele.pvalues[1:]

### 1.2) Tree

In [278]:
from sklearn import tree
algorithme = tree.DecisionTreeRegressor(max_depth=3)
modele = algorithme.fit(X, y)

In [279]:
tree_to_code(modele, [str(x) for x in range(10)])

def tree(0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
  if 8 <= -0.00376178603619:
    if 2 <= 0.00618888484314:
      if 6 <= 0.0210278164595:
        return [[ 108.8045977]]
      else:  # if 6 > 0.0210278164595
        return [[ 83.36904762]]
    else:  # if 2 > 0.00618888484314
      if 0 <= -0.0799815952778:
        return [[ 274.]]
      else:  # if 0 > -0.0799815952778
        return [[ 154.66666667]]
  else:  # if 8 > -0.00376178603619
    if 2 <= 0.0148113816977:
      if 2 <= -0.0218342300504:
        return [[ 137.69047619]]
      else:  # if 2 > -0.0218342300504
        return [[ 176.86486486]]
    else:  # if 2 > 0.0148113816977
      if 2 <= 0.0687019824982:
        return [[ 208.57142857]]
      else:  # if 2 > 0.0687019824982
        return [[ 268.87096774]]


In [280]:
r["DecisionTreeRegressor"] = modele.feature_importances_

### 1.3) features importances par random forest

In [281]:
from sklearn.ensemble import RandomForestRegressor

In [282]:
algo = RandomForestRegressor()
modele = algo.fit(X, y)
r["RandomForestRegressor"] = modele.feature_importances_

### 1.4) ExtraTreesClassifier

In [283]:
from sklearn.ensemble import ExtraTreesClassifier

In [284]:
algo = ExtraTreesClassifier()
modele = algo.fit(X, y)
modele.feature_importances_
r["ExtraTreesClassifier"] = modele.feature_importances_

# 3) display of column importance differences

In [285]:
import pandas as pd
df = pd.DataFrame(r)
df["num_col"] = list(range(1,11))
df["total"] = df.DecisionTreeRegressor + df.ExtraTreesClassifier + df.RandomForestRegressor

df.sort_values("total", ascending=False)

Unnamed: 0,sk LinearRegression.coef,sm OLS_coef,sm OLS_pvalues,DecisionTreeRegressor,RandomForestRegressor,ExtraTreesClassifier,num_col,total
8,751.279321,751.279321,1.556021e-05,0.582301,0.337915,0.104612,9,1.024828
2,519.839787,519.839787,4.299558e-14,0.375849,0.251091,0.119965,3,0.746906
3,324.390428,324.390428,1.023819e-06,0.0,0.098816,0.119599,4,0.218415
0,-10.012198,-10.012198,0.8669998,0.02078,0.06363,0.118122,1,0.202532
9,67.625386,67.625386,0.3059983,0.0,0.084474,0.106609,10,0.191083
6,101.04457,101.04457,0.6347207,0.02107,0.041628,0.112076,7,0.174774
5,476.745838,476.745838,0.1603892,0.0,0.047571,0.11042,6,0.15799
4,-792.184162,-792.184162,0.05794735,0.0,0.042791,0.11067,5,0.153461
7,177.064176,177.064176,0.2734557,0.0,0.018083,0.075829,8,0.093911
1,-239.819089,-239.819089,0.0001041429,0.0,0.014001,0.022097,2,0.036099
