In [1]:
import pandas as pd
import numpy as np
import requests
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
plt.style.use("seaborn")



### Defining Features and Classifier

In [2]:
df_common = pd.read_csv("Trees_common.csv", index_col=False)

In [3]:
df_common.head(1)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type,Soil_Type
0,2596,51,3,258,0,510,221,232,148,6279,0,29


In [4]:
X = df_common.drop("Cover_Type",axis=1)
y = df_common["Cover_Type"]

In [5]:
#Splitting the set (test = 0.3) in a stratified manner
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30,stratify=y, random_state=4444)

In [6]:
#scaling the features
sca = StandardScaler()
sca.fit_transform(X_train)
sca.transform(X_test);

In [7]:
DecisionTreeClassifier().get_params().keys()

dict_keys(['class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [8]:
def alg_class(estimator, X_train, y_train, X_test, y_test):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    print("Accuracy_train: " + str(estimator.score(X_train, y_train)))
    acc = accuracy_score(y_pred,y_test)
    rec = recall_score(y_pred,y_test)
    prec = precision_score(y_pred,y_test)
    f1 = f1_score(y_pred,y_test)
    
    return ("Accuracy: " + str(acc)), ("Recall: " + str(rec)), ("Precision: " + str(prec)), ("F1: " + str(f1))

In [9]:
#Trying with default parameters
alg_class(DecisionTreeClassifier(), X_train, y_train, X_test, y_test)

Accuracy_train: 1.0


('Accuracy: 0.9402079126124472',
 'Recall: 0.9392450696260691',
 'Precision: 0.9380515354747617',
 'F1: 0.9386479231421305')

In [10]:
def grid_estimator(estimator, param_grid, X_train, y_train, X_test, y_test, cv=5):
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv, n_jobs=1)
    grid.fit(X_train, y_train)
    print("Accuracy training: " + str(grid.best_score_))
    y_pred = grid.predict(X_test)
    print("Accuracy test: " + str(accuracy_score(y_pred,y_test)))
    print("Recall test: " + str(recall_score(y_pred,y_test)))
    print("Precision test: " + str(precision_score(y_pred,y_test)))
    print("F1 score test: " + str(f1_score(y_pred,y_test)))
    print(grid.best_estimator_)

In [11]:
param_grid = {}
grid_estimator(DecisionTreeClassifier(), param_grid, X_train, y_train, X_test, y_test)

Accuracy training: 0.9331412217118916
Accuracy test: 0.9403914999082064
Recall test: 0.9392576192943615
Precision test: 0.938439816448994
F1 score test: 0.93884853978082
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [12]:
param_grid = {'max_depth': [80,100,120], 'max_leaf_nodes':[100,1000,10000,100000,1000000,10000000]}
grid_estimator(DecisionTreeClassifier(), param_grid, X_train, y_train, X_test, y_test)

Accuracy training: 0.9334141447918408
Accuracy test: 0.940362814393244
Recall test: 0.9389232127096204
Precision test: 0.9387575008824568
F1 score test: 0.9388403494837172
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=120,
            max_features=None, max_leaf_nodes=10000000,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [13]:
param_grid = {'max_depth': [80, 120, 160, 200], 'max_leaf_nodes':[500000,1000000,1500000,2000000]}
grid_estimator(DecisionTreeClassifier(), param_grid, X_train, y_train, X_test, y_test)

Accuracy training: 0.9336526451409857
Accuracy test: 0.9406037727189278
Recall test: 0.9393771707776535
Precision test: 0.938769266972585
F1 score test: 0.9390731204943358
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=160,
            max_features=None, max_leaf_nodes=2000000,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [14]:
param_grid = {'max_depth': [200, 250, 300, 350], 'max_leaf_nodes':[1000000]}
grid_estimator(DecisionTreeClassifier(), param_grid, X_train, y_train, X_test, y_test)

Accuracy training: 0.933426438624271
Accuracy test: 0.9401620157885074
Recall test: 0.939166902271228
Precision test: 0.9380397693846335
F1 score test: 0.9386029974452255
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=300,
            max_features=None, max_leaf_nodes=1000000,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
