In [12]:
# Reference: https://github.com/andosa/treeinterpreter
# Blog: http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/

from treeinterpreter import treeinterpreter as ti
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

In [27]:
from sklearn.datasets import load_boston
boston = load_boston()
rf = RandomForestRegressor()
print(len(boston.feature_names))

13


In [14]:
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [23]:
pd.DataFrame(boston.data[:300,]).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [28]:
rf = RandomForestRegressor()
# Selecting 300 sample rows from the data set
fit1 = rf.fit(boston.data[:300], boston.target[:300])

In [29]:
fit1

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [32]:
# Randomly pick 2 instances
instances = boston.data[[300, 309]]
print(pd.DataFrame(instances))
print "Instance 0 prediction:", rf.predict(instances[0].reshape(1,13))
print "Instance 1 prediction:", rf.predict(instances[1].reshape(1,13))

        0     1     2    3      4      5     6       7    8      9     10  \
0  0.04417  70.0  2.24  0.0  0.400  6.871  47.4  7.8278  5.0  358.0  14.8   
1  0.34940   0.0  9.90  0.0  0.544  5.972  76.7  3.1025  4.0  304.0  18.4   

       11    12  
0  390.86  6.07  
1  396.24  9.97  
Instance 0 prediction: [ 29.08]
Instance 1 prediction: [ 22.64]


In [33]:
# It uses feature response from multiple tree path 
prediction, bias, contributions = ti.predict(rf, instances)

In [34]:
for i in range(len(instances)):
    print "Instance", i
    print "Bias (trainset mean)", bias[i]
    print "Feature contributions:"
    for c, feature in sorted(zip(contributions[i], 
                                 boston.feature_names), 
                             key=lambda x: -abs(x[0])):
        print feature, round(c, 2)
    print "-"*20

Instance 0
Bias (trainset mean) 25.5101
Feature contributions:
RM 2.62
LSTAT 1.95
TAX -0.92
INDUS 0.46
PTRATIO 0.45
NOX -0.33
ZN -0.28
DIS -0.26
B -0.2
CRIM -0.16
RAD 0.13
AGE 0.1
CHAS 0.0
--------------------
Instance 1
Bias (trainset mean) 25.5101
Feature contributions:
RM -5.98
LSTAT 2.7
CRIM 0.9
AGE -0.34
B -0.17
PTRATIO 0.14
TAX -0.12
NOX 0.11
RAD -0.07
DIS -0.06
ZN 0.02
INDUS 0.01
CHAS 0.0
--------------------


In [51]:
#print prediction
#print(contributions[0])
print(bias[0] + np.sum(contributions[0]))
print(bias[1] + np.sum(contributions[1]))

29.08
22.64


In [61]:
np.sum(contributions, axis=1)

array([ 4.20146667, -3.11853333])

In [43]:
#  the basic feature importance feature provided by sklearn
fit1.feature_importances_

array([ 0.01602412,  0.00129815,  0.00440821,  0.00173291,  0.00392606,
        0.81480729,  0.01405132,  0.00794089,  0.00397147,  0.0120137 ,
        0.01252419,  0.01006302,  0.09723867])

In [44]:
# treeinterpreter uses the apply function to retrieve the leave indicies with the help of which, 
# the tree path is retrieves
rf.apply

<bound method RandomForestRegressor.apply of RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)>

In [47]:
rf.apply(instances)

array([[311, 283, 265, 296,  95, 118, 254, 308, 104, 301],
       [117,  79,  74,  82,  48,  33, 121, 104,  73,  63]])