In [45]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from time import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [46]:
X = pd.read_csv("X_whole.csv")
y = pd.read_csv("y_whole.csv").pop('SLIDE')

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [48]:
clf = RandomForestClassifier(n_estimators=1000,min_samples_split=5)

In [49]:
t0 = time()
clf.fit(X_train, y_train)
print ("Training time:", round(time() - t0, 3), "s")


('Training time:', 3.959, 's')


In [143]:
#Predict on test set
t1 = time()
pred = clf.predict(X_test)
print ("predicting time:", round(time() - t1, 3), "s")


('predicting time:', 0.32, 's')


In [52]:
accuracy_score(pred, y_test)

0.85068493150684932

0.85104375737621119

In [63]:
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],axis=0)
indices = np.argsort(importances)[::-1]
features = pd.DataFrame(list(X))

In [64]:
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d : %s : (%f)" % (f + 1, indices[f], features[0][indices[f]], importances[indices[f]]))


Feature ranking:
1. feature 0 : Slope : (0.139160)
2. feature 48 : COVERAGE_Developed, Open : (0.117253)
3. feature 2 : Precip_Mon : (0.066584)
4. feature 6 : Soil_Temp : (0.058225)
5. feature 3 : Soil_Moist : (0.057162)
6. feature 5 : Vegetation : (0.055479)
7. feature 4 : Soil_Moist_Depth : (0.051851)
8. feature 8 : maxtempi : (0.048139)
9. feature 11 : maxpressurei : (0.043433)
10. feature 9 : mintempi : (0.042171)
11. feature 7 : maxwspdi : (0.038947)
12. feature 10 : maxhumidity : (0.032646)
13. feature 49 : COVERAGE_Evergreen Forest : (0.031893)
14. feature 55 : COVERAGE_Shrub : (0.029951)
15. feature 46 : COVERAGE_Developed, Low : (0.019791)
16. feature 1 : precipi : (0.019338)
17. feature 40 : GEO_GENL_U_volcanic rocks : (0.016319)
18. feature 32 : GEO_GENL_U_marine sedimentary rocks : (0.014188)
19. feature 52 : COVERAGE_Mixed Forest : (0.010003)
20. feature 13 : AGE_NAME_Eocene : (0.009554)
21. feature 37 : GEO_GENL_U_sediments : (0.009193)
22. feature 27 : AGE_NAME_Quaternar

In [141]:
plt.figure(figsize=(12,6))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [181]:
y_df = pd.DataFrame(y_test).reset_index()
y_df = y_df.rename(columns={0:'SLIDE'})

In [158]:
pred_df = pd.DataFrame(pred)

In [159]:
X_test = X_test.reset_index()

In [182]:
frames = [X_test,y_df,pred_df]
answers = pd.concat(frames,axis=1)

In [58]:
wrong_answers = answers[answers['SLIDE'] != answers[0]]

In [None]:
pred = clf.predict_proba(X_test)
pred_df = pd.DataFrame(pred)

In [None]:
frames = [X_test,y_df,pred_df]
answers = pd.concat(frames,axis=1)

In [206]:
answers['check'] = np.where(answers[1]>0.3, 1, 0)

In [197]:
check = answers.pop('check')

In [198]:
accuracy_score(np.asarray(check), y_test)

0.78767123287671237

In [207]:
wrong_answers = answers[answers['SLIDE'] != answers['check']]

In [208]:
wrong_answers['check'].value_counts()

1    131
0     24
Name: check, dtype: int64

In [202]:
wrong_answers['check'].describe()

count    155.000000
mean       0.845161
std        0.362923
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: check, dtype: float64