In [125]:
from sklearn.tree import DecisionTreeClassifier as Tree
from sklearn.ensemble import RandomForestClassifier as Forest
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import time
%matplotlib qt

def Rotate(coords,theta):
    out = np.zeros(coords.shape)
    out[:,0] = coords[:,0]*np.cos(theta) + coords[:,1]*np.sin(theta)
    out[:,1] = -coords[:,0]*np.sin(theta) + coords[:,1]*np.cos(theta)
    return out

def permutation_importance(tree,test_data,test_target): # estimate variable importance using test data
    is_verbose = tree.get_params()['verbose']
    tree.set_params(verbose=False)
    importances = np.zeros(test_data.shape[1])
    original_score = tree.score(test_data,test_target)
    for i in xrange(test_data.shape[1]): # scramble each column and get % increase in error rate (Breinman importance)
        local = test_data.copy()
        np.random.shuffle(local[:,i])
        importances[i] = (original_score - tree.score(local,test_target))/(1-original_score)
    tree.set_params(verbose=is_verbose)
    return importances

train_data = pd.read_csv('train.csv')
train_labels = pd.read_csv('train_label.csv')
N = train_data.shape[0]

X = train_data['longitude']
Y = train_data['latitude']
XY = np.array([X,Y]).T
target = np.zeros(X.shape[0],dtype=np.int)
target[np.array(train_labels['status_group']=='non functional')] = 0
target[np.array(train_labels['status_group']=='functional needs repair')] = 1
target[np.array(train_labels['status_group']=='functional')] = 2

In [139]:
s = 0.8
random_indices = np.arange(0,N)
np.random.shuffle(random_indices)
train_ind = random_indices[0:int(s*N)]
test_ind = random_indices[int(s*N):]

Ru = np.random.uniform(0.0,1.0,size=N) # random continuous data
Rb = np.random.binomial(1,0.5,size=N) # random binary data
Ri = np.arange(0,N)
np.random.shuffle(Ri)
not_GPS_data = np.array([Ru,Rb,Ri]).T

In [140]:
P2 = np.c_[X, Y, X+Y, X*Y, X*X+Y, Y*Y+X]#, not_GPS_data]

In [143]:
qforest = Forest(n_estimators=100,criterion='gini',oob_score=True,max_features=3,verbose=True,n_jobs=4)
qforest.fit(P2[train_ind,:],target[train_ind])

[Parallel(n_jobs=4)]: Done   1 out of 100 | elapsed:    0.2s remaining:   18.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    4.4s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=True, random_state=None, verbose=True,
            warm_start=False)

In [144]:
qforest.score(P2[test_ind,:],target[test_ind])

[Parallel(n_jobs=4)]: Done   1 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


0.67289562289562288

In [137]:
qforest.feature_importances_

array([ 0.17366609,  0.15679718,  0.18894286,  0.15807817,  0.17051755,
        0.15199815])

In [138]:
pi=permutation_importance(qforest,P2[test_ind,:],target[test_ind])
print pi/np.max(pi)

[ 0.93989983  0.27045075  1.          0.48080134  0.56761269  0.09348915]


In [124]:
qforest.get_params()['verbose']

True

In [10]:
zscores

array([ 19.07275377,  25.62822108,  26.4666061 ,  16.06312031,
        17.71412804,  17.57257608,  15.70427369,   8.13137713])

In [11]:
qforest.oob_decision_function_

array([[ 0.6       ,  0.3       ,  0.1       ],
       [ 0.        ,  0.        ,  1.        ],
       [ 0.4       ,  0.        ,  0.6       ],
       ..., 
       [ 0.66666667,  0.        ,  0.33333333],
       [ 0.63636364,  0.18181818,  0.18181818],
       [ 0.6       ,  0.        ,  0.4       ]])

[Parallel(n_jobs=4)]: Done   1 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished


array([[15388, 16997, 14944,  8185, 15251,  6356,  9922, 18248,  6719,
        19148, 16496,  8472, 16155, 19325,  5652,  9790, 14662,  3874,
        10524, 15453,  8006,  8264,  4763, 12829,  6161, 16463,  6833,
        10144, 17412, 15405,   309,   655, 17003, 11583, 15733, 15679,
         5784, 18875,  9882, 17494,   965, 12543, 19217, 15325, 13614,
        16398, 15456,  1265, 17736,  4916]])

In [40]:
T = qforest.estimators_[0]

In [92]:
# Print number of times each tree split on each of the categories
print [-2]+range(P2.shape[1])
print '-----------------------------------------------------------'
for T in qforest.estimators_:
    print [np.sum(T.tree_.feature==f) for f in [-2]+range(P2.shape[1])]/np.sum(T.tree_.feature>-1,dtype=np.float)

[-2, 0, 1, 2, 3, 4, 5, 6, 7, 8]
-----------------------------------------------------------
[ 1.0001085   0.10827818  0.11348595  0.12303353  0.11272648  0.11760877
  0.11164153  0.14429858  0.02495389  0.14397309]
[ 1.00010752  0.1116009   0.10998817  0.11708418  0.11375121  0.1171917
  0.11643909  0.14536071  0.02644877  0.14213525]
[ 1.00010872  0.11643836  0.11404653  0.11785171  0.11252446  0.12046097
  0.10969776  0.13872581  0.02826701  0.14198739]
[ 1.00010753  0.11483871  0.10677419  0.11860215  0.11688172  0.12182796
  0.10978495  0.1444086   0.02483871  0.14204301]
[ 1.00010852  0.10971243  0.10602279  0.12707542  0.11578947  0.11427021
  0.11220836  0.14107434  0.0284319   0.14541508]
[ 1.00010808  0.11575875  0.1104626   0.12278426  0.11846087  0.11608301
  0.10948984  0.14169909  0.02702118  0.13824038]
[ 1.00010893  0.11285403  0.10947712  0.12298475  0.11742919  0.11111111
  0.11122004  0.14237473  0.02875817  0.14379085]
[ 1.00010873  0.11840818  0.11405893  0.11927803

array([0, 3, 3])

In [83]:
np.bincount(T.tree_.feature)

ValueError: The first argument of bincount must be non-negative