In [19]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier as Forest

In [20]:
%matplotlib qt

In [85]:
post_mrmr = np.load('processed_train_data')
target = np.load('processed_targets')
# Column      Feature
#   0           id
#   1       amount_tsh
#   2      date recorded
#   3       gps_height
#   4       longitude
#   5        latitude
#   6        population
#   7    construction year
#   8-9    rotation 1
# 10-11    rotation 2
# 12-13    rotation 3
# 14-15    rotation 4
# 16-18    1/d by class
# 19-21    short_sigmoid by class
# 22-24    long_sigmoid by class
# 25-27    NN by class

In [77]:
post_mrmr[0:30,18]

array([ 0.26674073,  0.48061879,  0.52061743,  0.29243779,  0.91912138,
        0.35825617,  0.34887252,  0.59733005,  0.7796757 ,  0.40829032,
        0.14706701,  0.48169796,  0.66271635,  0.29327937,  1.68885862,
        0.86615059,  0.34626475,  0.43008496,  0.1532668 ,  0.89737101,
        0.38637802,  0.        ,  0.36375111,  0.16015436,  0.57380795,
        0.1339315 ,  1.24771772,  0.42865829,  0.13501701,  0.23529248])

In [50]:
plt.plot(post_mrmr[:,14],post_mrmr[:,15],'ko')

[<matplotlib.lines.Line2D at 0x7f78e7ce5610>]

In [70]:
post_mrmr[:,24]

array([ 0.02333878,  0.03005595,  0.02583791, ...,  0.00429199,
        0.00738748,  0.01280966])

In [80]:
plt.plot(post_mrmr[:,16])

[<matplotlib.lines.Line2D at 0x7f78e6fd4290>]

In [92]:
plt.figure()
for t,c in enumerate(['ro','yo','bo']):
    plt.plot(post_mrmr[target==t,27],post_mrmr[target==t,25],c,alpha=0.1)

In [93]:
post_mrmr.shape

(59400, 95)

In [128]:
forest = Forest(n_estimators=200, criterion='gini', n_jobs=4,
                verbose=True, max_features='auto',bootstrap=True,
                min_samples_split=1, min_samples_leaf=1, oob_score=True,
                warm_start=True)
forest.fit(post_mrmr,target)
forest.oob_score_

[Parallel(n_jobs=4)]: Done   1 out of 200 | elapsed:    0.3s remaining:  1.0min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:   14.7s finished


0.83050505050505052

In [112]:
# No GPS data at all
print [4,5]+range(8,28)
no_gps = np.delete(post_mrmr,[4,5]+range(8,28),1)
forest = Forest(n_estimators=200, criterion='gini', n_jobs=4,
                verbose=True, max_features='auto',bootstrap=True,
                min_samples_split=1, min_samples_leaf=1, oob_score=True,
                warm_start=True)
forest.fit(no_gps,target)
forest.oob_score_

[Parallel(n_jobs=4)]: Done   1 out of 200 | elapsed:    0.1s remaining:   25.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    7.8s finished


[4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]


0.79356902356902359

In [117]:
# with raw GPS
print range(8,28)
no_gps = np.delete(post_mrmr,range(8,28),1)
forest = Forest(n_estimators=200, criterion='gini', n_jobs=4,
                verbose=True, max_features='auto',bootstrap=True,
                min_samples_split=1, min_samples_leaf=1, oob_score=True,
                warm_start=True)
forest.fit(no_gps,target)
forest.oob_score_

[Parallel(n_jobs=4)]: Done   1 out of 200 | elapsed:    0.2s remaining:   39.8s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    8.8s finished


[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]


0.80806397306397304

In [122]:
# only rotations
print range(16,28)
no_gps = np.delete(post_mrmr,range(16,28),1)
forest = Forest(n_estimators=200, criterion='gini', n_jobs=4,
                verbose=True, max_features='auto',bootstrap=True,
                min_samples_split=1, min_samples_leaf=1, oob_score=True,
                warm_start=True)
forest.fit(no_gps,target)
forest.oob_score_

[Parallel(n_jobs=4)]: Done   1 out of 200 | elapsed:    0.4s remaining:  1.4min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:   12.8s finished


[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]


0.81208754208754208

In [125]:
# only kNN
print [4,5] + range(8,16)
no_gps = np.delete(post_mrmr,[4,5]+range(8,16),1)
forest = Forest(n_estimators=200, criterion='gini', n_jobs=4,
                verbose=True, max_features='auto',bootstrap=True,
                min_samples_split=1, min_samples_leaf=1, oob_score=True,
                warm_start=True)
forest.fit(no_gps,target)
forest.oob_score_

[Parallel(n_jobs=4)]: Done   1 out of 200 | elapsed:    0.2s remaining:   35.9s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:   11.4s finished


[4, 5, 8, 9, 10, 11, 12, 13, 14, 15]


0.82489898989898991

In [139]:
# OOB Accuracy vs. number of trees
n_trees = 0
n = []
oob_votes = np.zeros((post_mrmr.shape[0],3))
oob_score = []
for i in range(1000):
    n_trees += 50
    n += [n_trees]
    forest = Forest(n_estimators=n_trees, criterion='gini', n_jobs=4,
                verbose=False, max_features='auto',bootstrap=True,
                min_samples_split=1, min_samples_leaf=1, oob_score=True,
                warm_start=False)
    forest.fit(post_mrmr,target)
    oob_votes += forest.oob_decision_function_
    oob_score += [np.sum(np.argmax(oob_votes,axis=1)==target,dtype=np.float)
                     /post_mrmr.shape[0]]
    print n_trees,"\t",oob_score[-1]

50 	0.819730639731
100 	0.828367003367
150 	0.830538720539
200 	0.831430976431
250 	0.832121212121
300 	0.831835016835
350 	0.832037037037
400 	0.831919191919
450 	0.831868686869
500 	0.832121212121
550 	0.832306397306
600 	0.832457912458
650 	0.832188552189
700 	0.832138047138
750 	0.832356902357
800 	0.832272727273
850 	0.832171717172
900 	0.832306397306
950 	0.832356902357
1000 	0.832373737374
1050 	0.832542087542
1100 	0.832373737374
1150 	0.832474747475
1200 	0.832424242424
1250 	0.832508417508
1300 	0.832558922559
1350 	0.832491582492
1400 	0.832491582492
1450 	0.832407407407
1500 	0.832373737374
1550 	0.832390572391
1600 	0.832306397306
1650 	0.832272727273
1700 	0.83234006734
1750 	0.832373737374
1800 	0.832407407407
1850 	0.832356902357
1900 	0.832441077441
1950 	0.832424242424
2000 	0.832323232323
2050 	0.832474747475
2100 	0.832441077441
2150 	0.832457912458
2200 	0.832407407407
2250 	0.832491582492
2300 	0.832508417508
2350 	0.832474747475
2400 	0.832457912458
2450 	0.83245

KeyboardInterrupt: 

In [137]:
oob_score=[]

In [144]:
np.argmax(oob_votes,1)

array([2, 2, 2, ..., 2, 2, 2])