In [2]:
from sklearn.tree import DecisionTreeClassifier as Tree
from sklearn.ensemble import RandomForestClassifier as Forest
from sklearn.cross_validation import train_test_split as sk_split
from sklearn.neighbors import KNeighborsClassifier as KNN
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys
#%matplotlib qt

def permutation_importance(tree,test_data,test_target): # estimate variable importance using test data
    is_verbose = tree.get_params()['verbose']
    tree.set_params(verbose=False)
    importances = np.zeros(test_data.shape[1])
    original_score = tree.score(test_data,test_target)
    for i in xrange(test_data.shape[1]): # scramble each column and get % increase in error rate (Breinman importance)
        local = test_data.copy()
        np.random.shuffle(local[:,i])
        importances[i] = (original_score - tree.score(local,test_target))/(1-original_score)
        if is_verbose:
            sys.stdout.write('.')
            
    tree.set_params(verbose=is_verbose)
    return importances

def Rotate(coords,theta):
    out = np.zeros(coords.shape)
    out[:,0] = coords[:,0]*np.cos(theta) + coords[:,1]*np.sin(theta)
    out[:,1] = -coords[:,0]*np.sin(theta) + coords[:,1]*np.cos(theta)
    return out

train_data = pd.read_csv('train.csv')
train_labels = pd.read_csv('train_label.csv')
compete_data = pd.read_csv('test.csv')
compete_id = compete_data.id
N = train_data.shape[0]

target = np.zeros(N,dtype=np.int)
target[np.array(train_labels['status_group']=='non functional')] = 0
target[np.array(train_labels['status_group']=='functional needs repair')] = 1
target[np.array(train_labels['status_group']=='functional')] = 2

In [None]:
from sklearn.ensemble import ExtraTreesClassifier as EForest
%matplotlib qt

In [None]:
def oob_permuation_importance(forest,test_data,test_target):
    votes = np.zeros(test_data.shape[0],len(tree.classes_))
    perm_votes = np.zeros(test_data.shape[0],tree.n_classes_,test_data.shape[1])
    for tree in forest.estimators_:
        oob = np.logical_not(tree.indices_)
        oob_data = test_data[oob,:].copy()
        tree_votes = tree.predict(oob_data)
        for i,c in enumerate(tree.classes_):
            votes(:)
        

In [None]:
# Manual data processing

# Convert date recorded to days since the first recording
s = train_data['date_recorded']
sc = compete_data['date_recorded']
s = s.apply(lambda date_string: np.datetime64(date_string))
sc = sc.apply(lambda date_string: np.datetime64(date_string))
min_date = s.min()
min_datec = sc.min()
s = (s-s.min())/np.timedelta64(1,'D')
sc = (sc-sc.min())/np.timedelta64(1,'D')
train_data['date_recorded']=s
compete_data['date_recorded']=sc

train_data['region_code'] = 'r'+train_data['region_code'].astype(np.str) # Regions are categorical
compete_data['region_code'] = 'r'+compete_data['region_code'].astype(np.str)

XY = np.array([train_data.longitude,train_data.latitude]).T
XYc = np.array([compete_data.longitude,compete_data.latitude]).T
r = 5 # total number of rotations (including original coordinates)
for i,theta in enumerate(np.linspace(0,np.pi/4,r)):
    if i>0:
        coords = Rotate(XY, theta)
        train_data['longitude_r%d'%(i)] = coords[:,0]
        train_data['latitude_r%d'%(i)] = coords[:,1]
        coordsc = Rotate(XYc, theta)
        compete_data['longitude_r%d'%(i)] = coordsc[:,0]
        compete_data['latitude_r%d'%(i)] = coordsc[:,1]

# Include kNN information using the tuned k=30 results from the initial investigation using kNN classifiers
# This gives the forest access to (unnormalized) kNN information
knn = KNN(n_neighbors=30,weights='distance')
knn.fit(XY,target)
dists,neighbors = knn.kneighbors(return_distance=True)
dists[dists==0]=np.inf
dists = 1.0/dists
class_sums = np.zeros((XY.shape[0],3))
for i in range(XY.shape[0]):
    for c in range(3):
        class_sums[i,c] = np.sum(dists[i,target[neighbors[i]]==c])
for c in range(3):
    train_data['knn_%d'%(c)] = class_sums[:,c]

dists,neighbors = knn.kneighbors(XYc,return_distance=True)
dists[dists==0]=np.inf
dists = 1.0/dists
class_sums = np.zeros((XYc.shape[0],3))
for i in range(XYc.shape[0]):
    for c in range(3):
        class_sums[i,c] = np.sum(dists[i,target[neighbors[i]]==c])
for c in range(3):
    compete_data['knn_%d'%(c)] = class_sums[:,c]
    
    
#                uniform       copy of quantity      unique
for feature in ['recorded_by','quantity_group','id','num_private']:
    train_data.drop(feature, axis=1, inplace=True) # uniform
    compete_data.drop(feature,axis=1,inplace=True)
####### EMPRICAL GUESSES
## General features that will correlate with higher-detail versions of that feature, e.g. source_type generalized source
for feature in ['extraction_type_group','extraction_type_class','payment_type',
                'quality_group','source_type','source_class','waterpoint_type_group',
                'scheme_management','scheme_name','date_recorded','management_group','basin']:
    train_data.drop(feature,axis=1,inplace=True)
    compete_data.drop(feature,axis=1,inplace=True)
    
# Region proxy variables
for feature in ['region','region_code','district_code','lga','wpt_name','ward','subvillage']:
    train_data.drop(feature,axis=1,inplace=True)
    compete_data.drop(feature,axis=1,inplace=True)
    
# After removing all of the above, the following has mean permutation importances
# of <=0.0003 each (as assessed over 15 samples of 50 trees each)
#######

train_data.fillna("was_nan",inplace=True)
compete_data.fillna("was_nan",inplace=True)
# Random data to help assess variable importance
#train_data['random_1'] = np.random.uniform(0.0,1.0,N)
#train_data['random_2'] = np.random.uniform(0.0,1.0,N)
#train_data['random_3'] = np.random.binomial(1,0.5,N)
#train_data['random_4'] = np.random.binomial(1,0.1,N)

In [None]:
train_data['knn_0'][0:30]

In [None]:
# For all categorical data: create binary columns for each category that represents at least p% of the data
p = 0.01
feature_factors = {}
for f in list(train_data.columns):
    # Only modify string data
    if train_data[f].dtype == np.object:
        sizes = train_data.groupby(f).size()/N
        sizes.sort(ascending=False)
        sizes = sizes[sizes>p]
#        print sizes[sizes>p]
        print ""
        print f
        appended = np.zeros(N,dtype=np.bool)
        # The list of categories with at least p% of the training data for feature f
        salient_categories = list(sizes.keys())
        feature_factors[f] = salient_categories
        for cat in salient_categories:
            # Append the binary column
            train_data[f+'__'+str(cat)] = (train_data[f]==cat)
            print "\t", cat
        train_data.drop(f,axis=1,inplace=True)
    # Done!

In [None]:
### Perform the same one-hot encoding on the competition data
for f in list(compete_data.columns):
    # Only modify string (categorical) data
    if compete_data[f].dtype == np.object:
        for cat in feature_factors[f]:
            # Create binary column for trained factors for this feature
            compete_data[f+'__'+str(cat)] = (compete_data[f]==cat)
        # remove the original column
        compete_data.drop(f,axis=1,inplace=True)

In [None]:
## After removing all of the above, the following has mean permutation importances
## of <=0.0003 each (as assessed over 10 samples of 50 trees each)
#for feature in ['funder__Rwssp', 'funder__District Council','funder__0', 'funder__Germany Republi', 'funder__Tcrs',
#       'installer__Commu', 'installer__DANIDA', 'installer__0', 'installer__TCRS', 'installer__Central government',
#       'installer__CES', 'public_meeting__was_nan', 'permit__was_nan', 'extraction_type__swn 80', 
#       'extraction_type__afridev', 'extraction_type__ksb', 'management__parastatal',
#       'payment__unknown', 'water_quality__soft', 'water_quality__milky', 'quantity__unknown']:
for feature in ['funder__Unicef','funder__0','funder__Kkkt','permit__was_nan',
                'management__water board','water_quality__salty','quantity__unknown',
                'source__lake']:
    train_data.drop(feature,axis=1,inplace=True)
    compete_data.drop(feature,axis=1,inplace=True)

In [None]:
list(train_data.columns)

In [221]:
data_matrix = train_data.as_matrix().astype(np.float);
data_matrix_compete = compete_data.as_matrix().astype(np.float);
print data_matrix.shape
print data_matrix_compete.shape

(59400, 89)
(14850, 89)


In [31]:
data_train, data_test, target_train, target_test = sk_split(data_matrix,target,test_size=0.1)

In [32]:
# Plot accuracy vs number of trees
accuracy = []
n_est = []
forest = Forest(n_estimators=1, criterion='gini', n_jobs=4,verbose=False,max_features=13,bootstrap=True,oob_score=False,
                warm_start=True)
forest.fit(data_train,target_train)
accuracy += [forest.score(data_test,target_test)]
n_est += [forest.n_estimators]
for i in range(1,20):
    forest.set_params(n_estimators=i*20)
    forest.fit(data_train,target_train)
    accuracy += [forest.score(data_test,target_test)]
    n_est += [forest.n_estimators]
    print accuracy[-1:]
plt.plot(n_est,accuracy,'-o')

[0.80303030303030298]
[0.80808080808080807]
[0.8101010101010101]
[0.80909090909090908]


KeyboardInterrupt: 

In [59]:
plt.show()

In [147]:
plt.legend([.2,.2,.2,.1,.1,.1])

<matplotlib.legend.Legend at 0x7f9148c4f4d0>

In [933]:
forest.set_params(verbose=False)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=420, n_jobs=4,
            oob_score=True, random_state=None, verbose=False,
            warm_start=True)

In [913]:
forest.fit(data_train,target_train,n_estimators=20,warm_start=True)
print forest.score(data_test,target_test)
print forest.n_estimators

TypeError: fit() got an unexpected keyword argument 'n_estimators'

In [821]:
pi = permutation_importance(forest,data_test,target_test)
current += [pi]
zscores = np.sqrt(len(current))*np.mean(current,axis=0)/np.std(current,axis=0)
plt.figure()
plt.bar(range(len(list(train_data.columns))),np.mean(current,axis=0),yerr=np.std(current,axis=0)/np.sqrt(len(current)),error_kw=dict(ecolor='red'))
ax = plt.gca()
ax.set_xticks(np.arange(len(list(train_data.columns)))+0.4)
ax.set_xticklabels(list(train_data.columns))
plt.plot(zscores)
bad_features = np.array(train_data.columns)[pi/np.max(pi)<=-0.00]
bad_features

..........

KeyboardInterrupt: 

In [176]:
# Bar plot with standard error bars for permutation importance
pi=[]
current=[]
for i in range(5):
    data_train, data_test, target_train, target_test = sk_split(data_matrix,target,test_size=0.2)
    forest = Forest(n_estimators=50, criterion='gini', n_jobs=4,verbose=True,max_features=10,oob_score=True,
                    min_samples_split=7)
    forest.fit(data_train,target_train)
    print forest.score(data_test,target_test)
    pi = permutation_importance(forest,data_test,target_test)
    current += [pi]
zscores = np.sqrt(len(current))*np.mean(current,axis=0)/np.std(current,axis=0)
plt.figure()
plt.bar(range(len(list(train_data.columns))),np.mean(current,axis=0),yerr=np.std(current,axis=0)/np.sqrt(len(current)),error_kw=dict(ecolor='red'))
ax = plt.gca()
ax.set_xticks(np.arange(len(list(train_data.columns)))+0.4)
ax.set_xticklabels(list(train_data.columns))
#plt.plot(zscores)
bad_features = np.array(train_data.columns)[pi/np.max(pi)<=-0.00]
bad_features

[Parallel(n_jobs=4)]: Done   1 out of  50 | elapsed:    0.2s remaining:    7.5s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=4)]: Done   1 out of  15 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


0.814646464646
...............................................................................................

[Parallel(n_jobs=4)]: Done   1 out of  50 | elapsed:    0.2s remaining:    8.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=4)]: Done   1 out of   8 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


.0.811531986532
...............................................................................................

[Parallel(n_jobs=4)]: Done   1 out of  50 | elapsed:    0.2s remaining:    7.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=4)]: Done   1 out of  18 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


.0.81734006734
...............................................................................................

[Parallel(n_jobs=4)]: Done   1 out of  50 | elapsed:    0.2s remaining:    7.5s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=4)]: Done   1 out of  23 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


.0.818097643098
...............................................................................................

[Parallel(n_jobs=4)]: Done   1 out of  50 | elapsed:    0.1s remaining:    6.8s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=4)]: Done   1 out of  50 | elapsed:    0.0s remaining:    0.4s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


.0.820538720539
................................................................................................

array(['funder__World Bank', 'funder__World Vision', 'funder__Unicef',
       'funder__District Council', 'funder__0', 'funder__Norad',
       'funder__Germany Republi', 'installer__was_nan', 'installer__Commu',
       'installer__Hesawa', 'installer__Central government',
       'installer__CES', 'permit__True', 'extraction_type__afridev',
       'management__water board', 'management__private operator',
       'management__parastatal', 'management__water authority',
       'management__company', 'payment__pay monthly',
       'payment__pay when scheme fails', 'payment__pay annually',
       'water_quality__salty', 'quantity__unknown', 'source__spring',
       'source__shallow well', 'source__machine dbh', 'source__river',
       'source__hand dtw', 'waterpoint_type__improved spring'], dtype=object)

In [15]:
plt.show()

In [33]:
mean_pi = np.mean(current,axis=0)
small_mask = np.abs(mean_pi)<=0.0003
small_features = np.array(train_data.columns)[small_mask]
small_features

array(['longitude_r6', 'funder__Rwssp', 'funder__District Council',
       'funder__0', 'funder__Germany Republi', 'funder__Tcrs',
       'installer__Commu', 'installer__DANIDA', 'installer__0',
       'installer__TCRS', 'installer__Central government',
       'installer__CES', 'public_meeting__was_nan', 'permit__was_nan',
       'extraction_type__swn 80', 'extraction_type__afridev',
       'extraction_type__ksb', 'management__parastatal',
       'payment__unknown', 'water_quality__soft', 'water_quality__milky',
       'quantity__unknown'], dtype=object)

In [None]:
plt.figure()
plt.bar(range(len(list(train_data.columns))),np.mean(current,axis=0),yerr=np.std(current,axis=0)/np.sqrt(len(current)),error_kw=dict(ecolor='red'))
ax = plt.gca()
ax.set_xticks(np.arange(len(list(train_data.columns)))+0.4)
ax.set_xticklabels(list(train_data.columns))
plt.plot(zscores)
bad_features = np.array(train_data.columns)[pi/np.max(pi)<=-0.00]
bad_features

In [731]:
len(current)

3

In [860]:
bad_features = np.array(train_data.columns)[np.mean(current,axis=0)<=0.03]
print len(bad_features)/(1.0*len(list(train_data.columns)))
bad_features

0.2


array(['random_1', 'random_2', 'random_3', 'random_4'], dtype=object)

In [122]:
dropped_after_87= ['funder__Rwssp', 'installer__0', 'region__Manyara',
       'lga__Kigoma Rural', 'public_meeting__was_nan',
       'scheme_management__Water Board', 'payment__pay monthly',
       'waterpoint_type__improved spring']
dropped_after_79 = ['funder__Danida', 'installer__TCRS',
       'basin__Lake Victoria', 'lga__Ngara']
dropped_after_75 = ['region__Kilimanjaro',
       'region__Dar es Salaam', 'scheme_management__Company',
       'extraction_type_class__submersible', 'management__other',
       'payment__other', 'quantity__unknown', 'source__lake']
dropped_after_67 = ['funder__Hesawa', 'funder__World Bank',
       'funder__World Vision', 'installer__DWE', 'basin__Pangani',
       'basin__Ruvuma / Southern Coast', 'region__Kigoma',
       'lga__Moshi Rural', 'lga__Singida Rural', 'lga__Kibondo',
       'scheme_management__Private operator', 'permit__was_nan',
       'extraction_type_class__other', 'quality_group__salty',
       'quantity__seasonal', 'source__rainwater harvesting',
       'source_type__rainwater harvesting', 'waterpoint_type__hand pump',
       'waterpoint_type_group__improved spring']
dropped_after_48 = ['region_code__r20',
       'lga__Mvomero', 'lga__Ulanga', 'extraction_type_group__afridev']
dropped_after_44 = ['lga__Kilosa',
       'lga__Kahama', 'scheme_management__WUG',
       'scheme_management__was_nan', 'extraction_type_group__mono',
       'management_group__commercial', 'payment_type__per bucket']
dropped_after_37 = ['funder__Dwsp',
       'installer__Government', 'scheme_name__K', 'permit__True',
       'extraction_type__gravity', 'extraction_type__ksb']
dropped_after_31 = ['extraction_type__other', 'extraction_type_group__other',
       'water_quality__unknown', 'waterpoint_type_group__other']
dropped_after_27 = ['public_meeting__False', 'scheme_name__was_nan',
       'extraction_type__nira/tanira', 'payment__unknown',
       'quantity__insufficient', 'source__machine dbh',
       'source_type__spring']
dropped_after_20 = ['random_1', 'random_2', 'random_3', 'random_4']
remaining = ['amount_tsh',
 'date_recorded',
 'gps_height',
 'longitude',
 'latitude',
 'district_code',
 'population',
 'construction_year',
 'funder__Government Of Tanzania',
 'extraction_type_group__gravity',
 'payment_type__never pay',
 'quantity__enough',
 'quantity__dry',
 'waterpoint_type__communal standpipe',
 'waterpoint_type__other',
 'waterpoint_type__communal standpipe multiple']
remaining += ['longitude_r%d'%(i) for i in range(1,r)]
remaining += ['latitude_r%d'%(i) for i in range(1,r)]

In [123]:
features = remaining + dropped_after_27 + dropped_after_31 + dropped_after_37 + dropped_after_44 + dropped_after_48 + dropped_after_67 + dropped_after_75 + dropped_after_79 + dropped_after_87
features

['amount_tsh',
 'date_recorded',
 'gps_height',
 'longitude',
 'latitude',
 'district_code',
 'population',
 'construction_year',
 'funder__Government Of Tanzania',
 'extraction_type_group__gravity',
 'payment_type__never pay',
 'quantity__enough',
 'quantity__dry',
 'waterpoint_type__communal standpipe',
 'waterpoint_type__other',
 'waterpoint_type__communal standpipe multiple',
 'longitude_r1',
 'latitude_r1',
 'public_meeting__False',
 'scheme_name__was_nan',
 'extraction_type__nira/tanira',
 'payment__unknown',
 'quantity__insufficient',
 'source__machine dbh',
 'source_type__spring',
 'extraction_type__other',
 'extraction_type_group__other',
 'water_quality__unknown',
 'waterpoint_type_group__other',
 'funder__Dwsp',
 'installer__Government',
 'scheme_name__K',
 'permit__True',
 'extraction_type__gravity',
 'extraction_type__ksb',
 'lga__Kilosa',
 'lga__Kahama',
 'scheme_management__WUG',
 'scheme_management__was_nan',
 'extraction_type_group__mono',
 'management_group__commercia

In [124]:
not_features = np.setdiff1d(list(train_data.columns),features)
for f in not_features:
    train_data.drop(f,axis=1,inplace=True)

In [863]:
for bf in bad_features[:]:
    train_data.drop(bf,axis=1,inplace=True)

In [865]:
remaining = train_data.columns

In [867]:
list(remaining)

['amount_tsh',
 'date_recorded',
 'gps_height',
 'longitude',
 'latitude',
 'district_code',
 'population',
 'construction_year',
 'funder__Government Of Tanzania',
 'extraction_type_group__gravity',
 'payment_type__never pay',
 'quantity__enough',
 'quantity__dry',
 'waterpoint_type__communal standpipe',
 'waterpoint_type__other',
 'waterpoint_type__communal standpipe multiple']

In [81]:
len(list(train_data.columns))

87

In [706]:
current=[]

In [759]:
a = [1,2,3,4]; a[1:-1]

[2, 3]

In [186]:
trees = forest.estimators_

In [187]:
tree = trees[0]

In [203]:
tree.tree_.weighted_n_node_samples

array([  5.34600000e+04,   7.69200000e+03,   6.95800000e+03, ...,
         2.00000000e+00,   1.00000000e+00,   3.00000000e+00])

In [188]:
type(tree)

sklearn.tree.tree.DecisionTreeClassifier

In [226]:
tree.set_params(splitter='random')

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=18, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=311854994, splitter='random')

In [617]:
tree.indices_.shape

(47520,)

In [623]:
np.sum(tree.indices_,dtype=np.int)

30062

In [717]:
current = current[3:]

In [718]:
for c in current:
    print len(c)

131
131
131


In [882]:
plt.figure(); plt.plot(train_data.population,train_data.construction_year,'ko')

[<matplotlib.lines.Line2D at 0x7f007995b610>]

In [877]:
plt.figure(); plt.plot(np.array(train_data.construction_year),'ko')

[<matplotlib.lines.Line2D at 0x7f0075bc1b90>]

In [884]:
np.cov(train_data.population,train_data.construction_year)/(np.std(train_data.population)*np.std(train_data.construction_year))

array([[ 0.49546021,  0.26091467],
       [ 0.26091467,  2.01839352]])

In [7]:
forest.n_features_

NameError: name 'forest' is not defined

In [222]:
forest = Forest(n_estimators=2000, criterion='gini', n_jobs=4,verbose=True,max_features=10,bootstrap=True,
                warm_start=False,min_samples_split=7,min_samples_leaf=1,oob_score=False)#True
forest.fit(data_matrix,target)
#forest.oob_score_

[Parallel(n_jobs=4)]: Done   1 out of 2000 | elapsed:    0.2s remaining:  7.0min
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:  1.9min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=4,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)

In [40]:
eforest = EForest(n_estimators=1000, criterion='gini', n_jobs=4,verbose=True,max_features=10,
                  min_samples_split=7,bootstrap=False)
eforest.fit(data_matrix,target)
#eforest.oob_score_

[Parallel(n_jobs=4)]: Done   1 out of 1000 | elapsed:    0.2s remaining:  3.4min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   50.2s finished


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=10, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=7,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
           oob_score=False, random_state=None, verbose=True,
           warm_start=False)

In [209]:
for i in range(3):
    p = np.argmax(forest.oob_decision_function_,axis=1)
    print i, " ", np.sum((target==i)&(p==i))/np.sum(target==i,dtype=np.float)*100.0

AttributeError: 'RandomForestClassifier' object has no attribute 'oob_decision_function_'

In [223]:
predictions = forest.predict(data_matrix_compete)

[Parallel(n_jobs=4)]: Done   1 out of  13 | elapsed:    0.5s remaining:    6.0s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   47.9s finished


In [224]:
predictions_for_export = np.zeros(predictions.shape,dtype=np.object)
predictions_for_export[predictions==0] = 'non functional'
predictions_for_export[predictions==1] = 'functional needs repair'
predictions_for_export[predictions==2] = 'functional'
predictions_for_export = np.array([compete_id, predictions_for_export]).T

In [225]:
predictions_for_export.shape

(14850, 2)

In [226]:
predictions_for_export

array([[50785, 'non functional'],
       [51630, 'functional'],
       [17168, 'functional'],
       ..., 
       [28749, 'functional'],
       [33492, 'functional'],
       [68707, 'non functional']], dtype=object)

In [227]:
np.savetxt("submit9.csv",predictions_for_export,fmt='%s',delimiter=',',header='id,status_group')
##### REMEMBER: EDIT CSV FILE TO REMOVE HEADER'S LEADING # AND SPACE

In [206]:
r

3

NameError: name 'train_data' is not defined

In [3]:
train_data

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000,2011-03-14,Roman,1390,Roman,34.938093,-9.856322e+00,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466e+00,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329e+00,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0,2013-01-28,Unicef,263,UNICEF,38.486161,-1.115530e+01,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359e+00,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
5,9944,20,2011-03-13,Mkinga Distric Coun,0,DWE,39.172796,-4.765587e+00,Tajiri,0,...,per bucket,salty,salty,enough,enough,other,other,unknown,communal standpipe multiple,communal standpipe
6,19816,0,2012-10-01,Dwsp,0,DWSP,33.362410,-3.766365e+00,Kwa Ngomho,0,...,never pay,soft,good,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump
7,54551,0,2012-10-09,Rwssp,0,DWE,32.620617,-4.226198e+00,Tushirikiane,0,...,unknown,milky,milky,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
8,53934,0,2012-11-03,Wateraid,0,Water Aid,32.711100,-5.146712e+00,Kwa Ramadhan Musa,0,...,never pay,salty,salty,seasonal,seasonal,machine dbh,borehole,groundwater,hand pump,hand pump
9,46144,0,2011-08-03,Isingiro Ho,0,Artisan,30.626991,-1.257051e+00,Kwapeto,0,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
