In [1]:
from sklearn.tree import DecisionTreeClassifier as Tree
from sklearn.ensemble import RandomForestClassifier as Forest
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
%matplotlib qt

def ZScores(forest): # Compute the mean/std z-score for each feature in the forest
    importances = [tree.feature_importances_ for tree in forest.estimators_]
    return np.mean(importances,axis=0)/np.std(importances,axis=0)

In [None]:
def Rotate(coords,theta):
    out = np.zeros(coords.shape)
    out[:,0] = coords[:,0]*np.cos(theta) + coords[:,1]*np.sin(theta)
    out[:,1] = -coords[:,0]*np.sin(theta) + coords[:,1]*np.cos(theta)
    return out

In [None]:
sklearn.__version__

In [None]:
train_data = pd.read_csv('train.csv')
train_labels = pd.read_csv('train_label.csv')
N = train_data.shape[0]

In [None]:
X = train_data['longitude']
Y = train_data['latitude']
XY = np.array([X,Y]).T
XYr15 = Rotate(XY,np.pi/12)
XYr30 = Rotate(XY,np.pi/6)
XYr45 = Rotate(XY,np.pi/4)
#Qe = (train_data['quantity'] == 'enough')
#Qi = (train_data['quantity'] == 'insufficient')
#Qd = (train_data['quantity'] == 'dry')
#Qs = (train_data['quantity'] == 'seasonal')
#L = (train_data['source']=='lake')
#A = train_data['amount_tsh']
#Eg = (train_data['extraction_type_class']=='gravity')
#Eh = (train_data['extraction_type_class']=='handpump')
#Es = (train_data['extraction_type_class']=='submersible')
#P = train_data['population']
#H = train_data['gps_height']
#Y = train_data['construction_year']
R = np.random.uniform(0.0,1.0,size=N)
R2 = np.random.binomial(1,0.5,size=N)
target = train_labels.status_group != 'non functional'
Dgps = np.array([X,Y],dtype=np.float).T
Dall = np.array([X,Y,Qe,Qi,Qd,Qs,L,A,Eg,Eh,Es,P,H,Y,R*0,R2*0],dtype=np.float).T
Dquantity = np.array([Qe,Qi,Qd,Qs,R],dtype=np.float).T

In [None]:
s = 0.8
random_indices = np.arange(0,N)
np.random.shuffle(random_indices)
train_ind = random_indices[0:int(s*N)]
test_ind = random_indices[int(s*N):]

In [8]:
### Train on rotated trees
forestR0 = Forest(n_estimators=20,criterion='gini')
forestR0.fit(Dgps[train_ind,:],target[train_ind])
forestR15 = Forest(n_estimators=20,criterion='gini')
forestR15.fit(XYr15[train_ind,:],target[train_ind])
forestR30 = Forest(n_estimators=20,criterion='gini')
forestR30.fit(XYr30[train_ind,:],target[train_ind])
forestR45 = Forest(n_estimators=20,criterion='gini')
forestR45.fit(XYr45[train_ind,:],target[train_ind])
forestFull = Forest(n_estimators=80,criterion='gini')
forestFull.fit(Dgps[train_ind,:],target[train_ind])

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=80, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)

In [72]:
p0 = forestR0.predict_proba(Rotate(XY[test_ind,:],0.0))
p15 = forestR15.predict_proba(Rotate(XYr15[test_ind,:],np.pi/12))
p30 = forestR30.predict_proba(Rotate(XYr30[test_ind,:],np.pi/6))
p45 = forestR45.predict_proba(Rotate(XYr45[test_ind,:],np.pi/4))
pF = forestFull.predict_proba(Rotate(XY[test_ind,:],0.0))

In [80]:
ptot = p0+p15+p30+p45
np.sum( (ptot[:,0]<=ptot[:,1])==target[test_ind] ) / (1.0*test_ind.shape[0])

0.71826599326599327

In [96]:
r = 100
forests = []
ptot = np.zeros((test_ind.shape[0],2))
for theta in np.linspace(0,np.pi/4,r):
    f = Forest(n_estimators = 100/r,criterion='gini')
    f.fit(Rotate(Dgps[train_ind,:],theta),target[train_ind])
    forests += [f]

In [97]:
p = np.zeros((test_ind.shape[0],2))
for i,theta in enumerate(np.linspace(0,np.pi/4,r)):
    p += forests[i].predict_proba(Rotate(Dgps[test_ind,:],theta))

np.sum( (p[:,0]<=p[:,1])==target[test_ind])/(1.0*test_ind.shape[0])

0.73670033670033674

In [99]:
nx = 1000; ny = 1000
xx,yy = np.meshgrid(np.linspace(35.0, 37,   nx),
                    np.linspace(-7.0, -9,   ny))
v = np.array([xx.ravel(),yy.ravel()]).T
p = np.zeros(v.shape[0])
for i,theta in enumerate(np.linspace(0,np.pi/4,r)):
    p += forests[i].predict_proba(Rotate(v,theta))[:,0]
p = np.reshape(p,(nx,ny))/r
plt.contourf(xx,yy,p,np.linspace(0.0,1.0,5))
plt.plot(X[(train_labels.status_group!='non functional')],Y[(train_labels.status_group!='non functional')],'go');
plt.plot(X[(train_labels.status_group=='non functional')],Y[(train_labels.status_group=='non functional')],'ro'); plt.gca().set_aspect("equal")

In [87]:
np.sum( (pF[:,0]<=pF[:,1])==target[test_ind] ) / (1.0*test_ind.shape[0])

0.73021885521885521

In [104]:
### FOREST CLASSIFIER
myforest = Forest(n_estimators=100,criterion='entropy')
myforest.fit(Dgps[train_ind,:],target[train_ind])

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='entropy', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)

In [105]:
predictions = myforest.predict(Dgps[test_ind])
np.sum(predictions == target[test_ind])/(1.0*test_ind.shape[0])

0.7263468013468013

In [100]:
myforest.score(Dall[test_ind,:],target[test_ind])
myforest.feature_importances_

array([ 0.39320319,  0.05531299,  0.02785572,  0.014544  ,  0.10050808,
        0.01048493,  0.00414511,  0.04985363,  0.03122495,  0.0284956 ,
        0.00797111,  0.08312579,  0.13425121,  0.05902369,  0.        ,  0.        ])

In [268]:
R

array([ 0.79620773,  0.508857  ,  0.49224358, ...,  0.30268173,
        0.770254  ,  0.87442789])

In [107]:
plt.gcf().tight_layout()

In [106]:
plt.figure(); plt.gca().set_aspect("equal")
nx = 1000; ny = 1000
xx,yy = np.meshgrid(np.linspace(35.0, 37,   nx),
                    np.linspace(-7.0, -9,   ny))
probs = myforest.predict_proba(np.array([xx.ravel(),yy.ravel()]).T)[:,0]
probs = np.reshape(probs,(nx,ny))
plt.contourf(xx,yy,probs,np.linspace(0.0,1.0,5))
plt.plot(X[(train_labels.status_group!='non functional')],Y[(train_labels.status_group!='non functional')],'go');
plt.plot(X[(train_labels.status_group=='non functional')],Y[(train_labels.status_group=='non functional')],'ro'); plt.gca().set_aspect("equal")

In [121]:
probs

array([[ 0.25,  0.75],
       [ 0.26,  0.74],
       [ 0.23,  0.77],
       ..., 
       [ 0.35,  0.65],
       [ 0.5 ,  0.5 ],
       [ 0.24,  0.76]])

In [112]:
xx.ravel()

array([ 30.4       ,  30.40606061,  30.41212121, ...,  30.98787879,
        30.99393939,  31.        ])

In [86]:
myforest.feature_importances_

array([ 0.03387232,  0.00753619,  0.09897791,  0.00718113,  0.85243245])

In [62]:
### DECISION TREE CLASSIFIER
mytree = Tree()
mytree.fit(Dall[train_ind,:],train_labels.status_group[train_ind])

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, min_density=None,
            min_samples_leaf=1, min_samples_split=2, random_state=None,
            splitter='best')

In [63]:
predictions = mytree.predict(Dall[test_ind,:])
np.sum(predictions == train_labels.status_group[test_ind])/(1.0*N)

0.13488215488215488

In [57]:
mytree.feature_importances_

array([ 0.3226293 ,  0.32493605,  0.01043847,  0.00603807,  0.13204862,
        0.00486731,  0.19904219])

In [107]:
plt.plot(X[(train_labels.status_group!='non functional')],Y[(train_labels.status_group!='non functional')],'go');
plt.plot(X[(train_labels.status_group=='non functional')],Y[(train_labels.status_group=='non functional')],'ro'); plt.gca().set_aspect("equal")