In [570]:
from sklearn.tree import DecisionTreeClassifier as Tree
from sklearn.ensemble import RandomForestClassifier as Forest
from sklearn.cross_validation import train_test_split as sk_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys
%matplotlib qt

def permutation_importance(tree,test_data,test_target): # estimate variable importance using test data
    is_verbose = tree.get_params()['verbose']
    tree.set_params(verbose=False)
    importances = np.zeros(test_data.shape[1])
    original_score = tree.score(test_data,test_target)
    for i in xrange(test_data.shape[1]): # scramble each column and get % increase in error rate (Breinman importance)
        local = test_data.copy()
        np.random.shuffle(local[:,i])
        importances[i] = (original_score - tree.score(local,test_target))/(1-original_score)
        if is_verbose:
            sys.stdout.write('.')
            
    tree.set_params(verbose=is_verbose)
    return importances

train_data = pd.read_csv('train.csv')
train_labels = pd.read_csv('train_label.csv')
N = train_data.shape[0]

target = np.zeros(N,dtype=np.int)
target[np.array(train_labels['status_group']=='non functional')] = 0
target[np.array(train_labels['status_group']=='functional needs repair')] = 1
target[np.array(train_labels['status_group']=='functional')] = 2

In [571]:
# Manual data processing

# Convert date recorded to days since the first recording
s = train_data['date_recorded']
s = s.apply(lambda date_string: np.datetime64(date_string))
min_date = s.min()
s = (s-s.min())/np.timedelta64(1,'D')
train_data['date_recorded']=s

train_data['region_code'] = 'r'+train_data['region_code'].astype(np.str) # Regions are categorical

train_data.drop('recorded_by', axis=1, inplace=True) # uniform
train_data.drop('quantity_group',axis = 1, inplace=True) # duplicate of quantity
train_data.drop('id',axis=1,inplace=True) # unique value for every well

train_data.fillna("was_nan",inplace=True)

# Random data to help assess variable importance
train_data['random_1'] = np.random.uniform(0.0,1.0,N)
train_data['random_2'] = np.random.uniform(0.0,1.0,N)
train_data['random_3'] = np.random.binomial(1,0.5,N)
train_data['random_4'] = np.random.binomial(1,0.1,N)

In [572]:
# For all categorical data: create binary columns for each category that represents at least p% of the data
p = 0.01
for f in list(train_data.columns)[1:]:
    # Only modify string data
    if train_data[f].dtype == np.object:
        sizes = train_data.groupby(f).size()/N
        sizes.sort(ascending=False)
        sizes = sizes[sizes>p]
#        print sizes[sizes>p]
        print ""
        print f
        appended = np.zeros(N,dtype=np.bool)
        # The list of categories with at least p% of the training data for feature f
        salient_categories = list(sizes.keys())
        for cat in salient_categories:
            # Append the binary column
            train_data[f+'__'+str(cat)] = (train_data[f]==cat)
            print "\t", cat
        train_data.drop(f,axis=1,inplace=True)
    # Done!


funder
	Government Of Tanzania
	was_nan
	Danida
	Hesawa
	Rwssp
	World Bank
	Kkkt
	World Vision
	Unicef
	Tasaf
	District Council
	Dhv
	Private Individual
	Dwsp
	0
	Norad
	Germany Republi
	Tcrs

installer
	DWE
	was_nan
	Government
	RWE
	Commu
	DANIDA
	KKKT
	Hesawa
	0
	TCRS
	Central government
	CES

wpt_name
	none
	Shuleni
	Zahanati

basin
	Lake Victoria
	Pangani
	Rufiji
	Internal
	Lake Tanganyika
	Wami / Ruvu
	Lake Nyasa
	Ruvuma / Southern Coast
	Lake Rukwa

subvillage

region
	Iringa
	Shinyanga
	Mbeya
	Kilimanjaro
	Morogoro
	Arusha
	Kagera
	Mwanza
	Kigoma
	Ruvuma
	Pwani
	Tanga
	Dodoma
	Singida
	Mara
	Tabora
	Rukwa
	Mtwara
	Manyara
	Lindi
	Dar es Salaam

region_code
	r11
	r17
	r12
	r3
	r5
	r18
	r19
	r2
	r16
	r10
	r4
	r1
	r13
	r14
	r20
	r15
	r6
	r21
	r80
	r60
	r90
	r7

lga
	Njombe
	Arusha Rural
	Moshi Rural
	Bariadi
	Rungwe
	Kilosa
	Kasulu
	Mbozi
	Meru
	Bagamoyo
	Singida Rural
	Kilombero
	Same
	Kibondo
	Kyela
	Kahama
	Magu
	Kigoma Rural
	Maswa
	Karagwe
	Mbinga
	Iringa Rural
	Serengeti
	N

In [573]:
data_matrix = train_data.as_matrix().astype(np.float)

In [574]:
data_train, data_test, target_train, target_test = sk_split(data_matrix,target,test_size=0.2)

In [575]:
forest = Forest(n_estimators=160, criterion='gini', n_jobs=4,verbose=True,max_features=10)
forest.fit(data_train,target_train)
forest.score(data_test,target_test)

[Parallel(n_jobs=4)]: Done   1 out of 160 | elapsed:    0.1s remaining:   22.9s
[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed:    6.5s finished
[Parallel(n_jobs=4)]: Done   1 out of 160 | elapsed:    0.0s remaining:    5.4s
[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed:    0.4s finished


0.80900673400673395

In [577]:
pi = permutation_importance(forest,data_test,target_test)
current += [pi/np.max(pi)]
zscores = np.sqrt(len(current))*np.mean(current,axis=0)/np.std(current,axis=0)
plt.figure()
plt.bar(range(len(list(train_data.columns))),pi/np.max(pi))
ax = plt.gca()
ax.set_xticks(np.arange(len(list(train_data.columns)))+0.4)
ax.set_xticklabels(list(train_data.columns))
plt.plot(zscores)
bad_features = np.array(train_data.columns)[pi/np.max(pi)<=-0.00]
bad_features

..............................................................................................................................................................................................................................................

array(['amount_tsh', 'random_1', 'random_3', 'random_4', 'funder__was_nan',
       'funder__Kkkt', 'funder__World Vision',
       'funder__Private Individual', 'funder__0', 'funder__Norad',
       'funder__Germany Republi', 'installer__was_nan', 'installer__Commu',
       'installer__DANIDA', 'installer__0', 'installer__TCRS',
       'installer__Central government', 'installer__CES',
       'wpt_name__Zahanati', 'basin__Lake Victoria', 'basin__Rufiji',
       'basin__Lake Nyasa', 'basin__Ruvuma / Southern Coast',
       'region__Mbeya', 'region__Kilimanjaro', 'region__Kagera',
       'region__Mwanza', 'region__Kigoma', 'region__Ruvuma',
       'region__Pwani', 'region__Singida', 'region__Mara', 'region__Rukwa',
       'region__Manyara', 'region__Lindi', 'region__Dar es Salaam',
       'region_code__r17', 'region_code__r3', 'region_code__r18',
       'region_code__r19', 'region_code__r2', 'region_code__r16',
       'region_code__r10', 'region_code__r13', 'region_code__r14',
       'regi

In [578]:
for i,f in enumerate(train_data.columns):
    print zscores[i],"\t", f

-inf 	amount_tsh
inf 	date_recorded
inf 	gps_height
inf 	longitude
inf 	latitude
inf 	num_private
inf 	district_code
inf 	population
inf 	construction_year
-inf 	random_1
inf 	random_2
-inf 	random_3
nan 	random_4
inf 	funder__Government Of Tanzania
-inf 	funder__was_nan
inf 	funder__Danida
inf 	funder__Hesawa
inf 	funder__Rwssp
inf 	funder__World Bank
-inf 	funder__Kkkt
nan 	funder__World Vision
inf 	funder__Unicef
inf 	funder__Tasaf
inf 	funder__District Council
inf 	funder__Dhv
-inf 	funder__Private Individual
inf 	funder__Dwsp
nan 	funder__0
-inf 	funder__Norad
nan 	funder__Germany Republi
inf 	funder__Tcrs
inf 	installer__DWE
nan 	installer__was_nan
inf 	installer__Government
inf 	installer__RWE
nan 	installer__Commu
nan 	installer__DANIDA
inf 	installer__KKKT
inf 	installer__Hesawa
nan 	installer__0
-inf 	installer__TCRS
nan 	installer__Central government
-inf 	installer__CES
inf 	wpt_name__none
inf 	wpt_name__Shuleni
nan 	wpt_name__Zahanati
-inf 	basin__Lake Victoria
inf 	basin_

In [557]:
current

[array([ 0.18623962,  0.08540925,  0.15776987,  0.14472123,  0.13997628,
         0.26927639,  0.01779359,  0.00118624,  0.07710558, -0.01897983,
         0.00711744,  0.00711744,  0.00355872,  0.01660735,  0.00474496,
        -0.00118624,  0.00237248, -0.0059312 , -0.00118624, -0.00830368,
         0.00355872, -0.00118624,  0.0059312 , -0.00355872, -0.02135231,
         0.00237248,  0.0059312 , -0.01897983,  0.        ,  0.00355872,
        -0.00118624,  0.00237248,  0.0059312 ,  0.00118624,  0.00474496,
        -0.00948992, -0.01067616,  0.00355872, -0.00355872, -0.0059312 ,
         0.00118624, -0.00474496,  0.00830368,  0.        ,  0.00474496,
         0.00948992,  0.00237248, -0.00237248,  0.00355872,  0.00237248,
         0.00118624,  0.00237248,  0.00355872,  0.00355872, -0.00118624,
         0.        , -0.00474496,  0.00474496,  0.        ,  0.00474496,
        -0.00237248,  0.        ,  0.04982206,  0.02016607,  0.0118624 ,
         0.00948992,  0.00355872, -0.01067616, -0.0

In [566]:
bad_features = np.array(train_data.columns)[zscores<=-1.0]
bad_features

ValueError: too many boolean indices

In [532]:
for bf in bad_features:
    train_data.drop(bf,axis=1,inplace=True)

In [567]:
len(list(train_data.columns))

108

In [569]:
zscores.shape

(115,)

In [398]:
current = [pi/np.max(pi)]

array([  5.99455624,   3.90814219,   2.47833661,   7.14716107,
         9.2131852 ,  -0.46717987,   2.45825333,   9.94879845,
        38.36256938,   8.94148061,  -0.48812796,  -0.34796203,
        -0.33947186,  -0.16322957,   2.9372056 ,   2.00888894,
         3.41940673,   4.29352424,   0.20132394,   5.09860486,
         0.51617802,   2.56167367,   0.52900146,   5.34262529,
         2.34396135,  16.04571247,   2.15069796,          inf,
         4.66980474,   3.25710355,   2.30761225,   6.71978059,   1.09274678])

In [None]:
trees = forest.estimators_

In [None]:
tree = trees[0]

In [576]:
current=[]