In [None]:
import pandas as pd
import numpy as np
import requests
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn imports.
# `train_test_split` is imported from `sklearn.model_selection`: the old
# `sklearn.cross_validation` module was deprecated in 0.18 and removed in 0.20,
# and this notebook already depends on `model_selection` for GridSearchCV.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


# Silence DeprecationWarning messages for the rest of the session so library
# deprecation notices do not clutter the cell outputs.
import warnings

warnings.filterwarnings(action="ignore", category=DeprecationWarning)

%matplotlib inline

# Widen the notebook container so wide tables and figures use most of the page.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` through `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Newer matplotlib releases renamed the bundled "seaborn" style sheet to
# "seaborn-v0_8", so use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")

### Defining Features and Classifier

In [None]:
# Load the dataset from the working directory. `index_col=False` tells pandas
# not to use the first column as the row index.
df_common = pd.read_csv(
    "Trees_common.csv",
    index_col=False,
)

In [None]:
# Peek at the first row to check the column layout.
df_common.head(n=1)

In [None]:
# Target vector: the "Cover_Type" column.
y = df_common["Cover_Type"]
# Feature matrix: every other column of the frame.
X = df_common.drop("Cover_Type", axis="columns")

In [None]:
# Hold out 30% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    stratify=y,
    random_state=4444,
)

In [None]:
# Standardize the features (zero mean, unit variance per column).
# The scaler is fitted on the training split only and then re-used on the test
# split, so no test-set statistics enter the preprocessing.
# StandardScaler returns new arrays rather than modifying its input, so the
# results have to be assigned back; previously they were discarded and the
# models were trained on unscaled data. Wrapping the arrays in DataFrames keeps
# the original column names and row index.
sca = StandardScaler()
X_train = pd.DataFrame(sca.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(sca.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# List the hyperparameter names of a default DecisionTreeClassifier
# (the candidates that can be put into a grid-search `param_grid`).
DecisionTreeClassifier().get_params(deep=True).keys()

In [None]:
def alg_class(estimator, X_train, y_train, X_test, y_test):
    """Fit ``estimator`` on the training split and report test-set metrics.

    Prints the estimator's own ``score`` on the training data, then compares
    its predictions for ``X_test`` with ``y_test``.

    Parameters
    ----------
    estimator : object exposing ``fit``, ``predict`` and ``score``
        Fitted in place on ``X_train`` / ``y_train``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    tuple of str
        ``("Accuracy: ...", "Recall: ...", "Precision: ...", "F1: ...")``.

    Notes
    -----
    The metric helpers are called with their default settings.
    NOTE(review): this presumably relies on the target being binary — confirm
    against the contents of ``Cover_Type``.
    """
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    print("Accuracy_train: " + str(estimator.score(X_train, y_train)))

    # scikit-learn metrics take (y_true, y_pred) in that order. Passing them
    # reversed silently swaps recall and precision.
    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return ("Accuracy: " + str(acc)), ("Recall: " + str(rec)), ("Precision: " + str(prec)), ("F1: " + str(f1))

In [None]:
# Baseline run: a decision tree with its default hyperparameters.
alg_class(
    estimator=DecisionTreeClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
def grid_estimator(estimator, param_grid, X_train, y_train, X_test, y_test, cv=5, n_jobs=1):
    """Grid-search ``estimator`` over ``param_grid`` and print test-set metrics.

    Runs a cross-validated ``GridSearchCV`` on the training split, then uses
    the fitted search object to predict ``X_test`` and prints accuracy,
    recall, precision and F1 followed by the best estimator found.

    Parameters
    ----------
    estimator : estimator object handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train : training features and labels used for the search.
    X_test, y_test : held-out features and labels used for the final report.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    n_jobs : int, default 1
        Number of parallel jobs forwarded to ``GridSearchCV``.

    Returns
    -------
    None
        All results are printed.

    Notes
    -----
    The metric helpers are called with their default settings.
    NOTE(review): this presumably relies on the target being binary — confirm.
    """
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv, n_jobs=n_jobs)
    grid.fit(X_train, y_train)

    # `best_score_` is the mean cross-validated score of the best parameter
    # setting, not a score on the full training set. The printed label is kept
    # unchanged so existing output stays comparable.
    print("Accuracy training: " + str(grid.best_score_))

    y_pred = grid.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order. Passing them
    # reversed silently swaps recall and precision.
    print("Accuracy test: " + str(accuracy_score(y_test, y_pred)))
    print("Recall test: " + str(recall_score(y_test, y_pred)))
    print("Precision test: " + str(precision_score(y_test, y_pred)))
    print("F1 score test: " + str(f1_score(y_test, y_pred)))
    print(grid.best_estimator_)

In [None]:
# Empty grid: nothing is tuned, so the search only evaluates the tree with its
# default hyperparameters.
param_grid = {}
grid_estimator(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Coarse search: three tree depths crossed with leaf-node limits spanning
# several orders of magnitude.
param_grid = {
    'max_depth': [80, 100, 120],
    'max_leaf_nodes': [100, 1000, 10000, 100000, 1000000, 10000000],
}
grid_estimator(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Second pass: a wider range of depths with leaf-node limits between
# 500k and 2M.
param_grid = {
    'max_depth': [80, 120, 160, 200],
    'max_leaf_nodes': [500000, 1000000, 1500000, 2000000],
}
grid_estimator(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Final pass: leaf-node limit fixed at 1M while exploring deeper trees.
param_grid = {
    'max_depth': [200, 250, 300, 350],
    'max_leaf_nodes': [1000000],
}
grid_estimator(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)