In [5]:
import pandas as pd
import numpy as np
import requests
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
plt.style.use("seaborn")

### Defining Features and Classifier

In [6]:
df_all = pd.read_csv("Trees_clean.csv", index_col=False)

In [7]:
df_all.head(1)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type,Soil_Type
0,2596,51,3,258,0,510,221,232,148,6279,5,29


In [8]:
X = df_all.drop("Cover_Type",axis=1)
y = df_all["Cover_Type"]

In [9]:
#Splitting the set (test = 0.3) in a stratified manner
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30,stratify=y, random_state=4444)

In [10]:
#scaling the features
sca = StandardScaler()
sca.fit_transform(X_train)
sca.transform(X_test);

In [11]:
DecisionTreeClassifier().get_params().keys()

dict_keys(['class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [17]:
def alg_class(estimator, X_train, y_train, X_test, y_test):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    print("Accuracy_train: " + str(estimator.score(X_train, y_train)))
    print("Accuracy_test: " + str(accuracy_score(y_pred,y_test)))    

In [18]:
#Trying with default parameters
alg_class(DecisionTreeClassifier(), X_train, y_train, X_test, y_test)

Accuracy_train: 1.0
Accuracy_test: 0.9286361758766294


In [21]:
def grid_estimator(estimator, param_grid, X_train, y_train, X_test, y_test, cv=5):
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv, n_jobs=1)
    grid.fit(X_train, y_train)
    print("Accuracy training: " + str(grid.best_score_))
    y_pred = grid.predict(X_test)
    print("Accuracy test: " + str(accuracy_score(y_pred,y_test)))
    print(grid.best_estimator_)

### Selecting features determind to be important by Random Forest

In [22]:
col_in = ["Elevation","Horizontal_Distance_To_Roadways","Horizontal_Distance_To_Fire_Points", "Soil_Type",
          "Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology"]
X_in = X[col_in]
X_train_in = X_train[col_in]
X_test_in = X_test[col_in]

In [23]:
param_grid = {'max_depth': [80,100,120], 'max_leaf_nodes':[100,1000,10000,100000,1000000,10000000]}
grid_estimator(DecisionTreeClassifier(), param_grid, X_train_in, y_train, X_test_in, y_test)

Accuracy training: 0.930657867561002
Accuracy test: 0.936977923627685
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
            max_features=None, max_leaf_nodes=1000000,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [26]:
param_grid = {'max_depth': [50, 80, 100, 120, 160, 200], 'max_leaf_nodes':[500000,1000000,1500000,2000000]}
grid_estimator(DecisionTreeClassifier(), param_grid, X_train_in, y_train, X_test_in, y_test)

Accuracy training: 0.9306775376928902
Accuracy test: 0.9375229484119699
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=160,
            max_features=None, max_leaf_nodes=2000000,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [27]:
param_grid = {'max_depth': [160, 200, 300, 400], 'max_leaf_nodes':[2000000, 4000000, 6000000, 8000000, 10000000]}
grid_estimator(DecisionTreeClassifier(), param_grid, X_train_in, y_train, X_test_in, y_test)

Accuracy training: 0.9307217954896387
Accuracy test: 0.9374311547640903
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=300,
            max_features=None, max_leaf_nodes=4000000,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
