In [1]:
import pandas as pd
import numpy as np
import requests
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
plt.style.use("seaborn")



### Defining Features and Classifier

In [2]:
df_common = pd.read_csv("Trees_common.csv", index_col=False)

In [12]:
df_common.head(1)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type,Soil_Type
0,2596,51,3,258,0,510,221,232,148,6279,0,29


In [3]:
X = df_common.drop("Cover_Type",axis=1)
y = df_common["Cover_Type"]

In [4]:
#Splitting the set (test = 0.3) in a stratified manner
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30,stratify=y, random_state=4444)

In [5]:
#scaling the features
sca = StandardScaler()
sca.fit_transform(X_train)
sca.transform(X_test);

In [16]:
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [10]:
def grid_estimator(estimator, param_grid, X_train, y_train, X_test, y_test, cv=5):
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv, n_jobs=1)
    grid.fit(X_train, y_train)
    print("Accuracy training: " + str(grid.best_score_))
    y_pred = grid.predict(X_test)
    print("Accuracy test: " + str(accuracy_score(y_pred,y_test)))
    print("Recall test: " + str(recall_score(y_pred,y_test)))
    print("Precision test: " + str(precision_score(y_pred,y_test)))
    print("F1 score test: " + str(f1_score(y_pred,y_test)))
    print(grid.best_estimator_)


In [18]:
param_grid = {}
grid_estimator(RandomForestClassifier(), param_grid, X_train, y_train, X_test, y_test)

Accuracy training: 0.9499321380449856
Accuracy test: 0.9543785570038553
Recall test: 0.9594346374045801
Precision test: 0.9464525238263325
F1 score test: 0.9528993662263816
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [19]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
feature_viewer = {}
for col, score in zip(X.loc[:,:].columns, rf.feature_importances_):
    feature_viewer[col] = score
pd.Series(feature_viewer)

Elevation                             0.293780
Aspect                                0.046303
Slope                                 0.028672
Horizontal_Distance_To_Hydrology      0.063661
Vertical_Distance_To_Hydrology        0.059778
Horizontal_Distance_To_Roadways       0.139501
Hillshade_9am                         0.036844
Hillshade_Noon                        0.040541
Hillshade_3pm                         0.036617
Horizontal_Distance_To_Fire_Points    0.133337
Soil_Type                             0.120965
dtype: float64

In [7]:
X_train.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Soil_Type'],
      dtype='object')

In [16]:
param_grid = {}
col_out = ["Elevation","Horizontal_Distance_To_Roadways","Horizontal_Distance_To_Fire_Points", "Soil_Type"]
X_train_out = X_train.drop(col_out, axis = 1)
X_test_out = X_test.drop(col_out, axis = 1)
grid_estimator(RandomForestClassifier(), param_grid, X_train_out, y_train, X_test_out, y_test)

Accuracy training: 0.5795902711527681
Accuracy test: 0.5809562603267854
Recall test: 0.5815108394613695
Precision test: 0.501506059536416
F1 score test: 0.5385533872017285
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [19]:
#Droping feature importances < 0.5
param_grid = {}
col_in = ["Elevation","Horizontal_Distance_To_Roadways","Horizontal_Distance_To_Fire_Points", "Soil_Type",
          "Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology"]
X_train_in = X_train[col_in]
X_test_in = X_test[col_in]
grid_estimator(RandomForestClassifier(), param_grid, X_train_in, y_train, X_test_in, y_test)

Accuracy training: 0.9568929059669345
Accuracy test: 0.9610967046080411
Recall test: 0.9648490900229426
Precision test: 0.9550064713495705
F1 score test: 0.9599025503657318
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [None]:
param_grid = {'n_estimators': [10,100,1000], 'max_depth': [80,100,120], 'max_leaf_nodes':[100,1000,10000,100000,1000000,10000000]}
grid_estimator(RandomForestClassifier(), param_grid, X_train, y_train, X_test, y_test)