In [1]:
import pandas as pd
import numpy as np
import requests
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

from collections import Counter

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
plt.style.use("seaborn")



### Defining Features and Classifier

In [2]:
# Load the cleaned data set. index_col=False stops pandas from using the first
# CSV column as the index, so that column is read as ordinary data.
df_all = pd.read_csv(filepath_or_buffer="Trees_clean.csv", index_col=False)

In [3]:
# Show the first row as a quick check of the loaded columns.
df_all.head(1)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type,Soil_Type
0,2596,51,3,258,0,510,221,232,148,6279,5,29


In [4]:
# Feature matrix: every column except the target. "Unnamed: 0" is dropped as
# well -- it appears in the loaded frame with value 0 on the first row and looks
# like a row index saved into the CSV rather than a measurement (TODO confirm).
# errors="ignore" keeps the cell working if the file no longer has that column.
X = df_all.drop(["Cover_Type", "Unnamed: 0"], axis=1, errors="ignore")
# Target vector: the cover type label of each row.
y = df_all["Cover_Type"]

In [5]:
# Hold out 30% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=4444,
)

In [6]:
# Scaling the features.
# fit_transform/transform return new arrays and leave their input untouched, so
# the results must be stored or the scaling is lost. They are kept under new
# names (as DataFrames with the original column labels and row index) so that
# X_train / X_test stay available, unscaled, for label-based column selection.
# The scaler is fitted on the training part only and then applied to the test part.
sca = StandardScaler()
X_train_sca = pd.DataFrame(sca.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_sca = pd.DataFrame(sca.transform(X_test), columns=X_test.columns, index=X_test.index)

In [7]:
# List the hyperparameter names RandomForestClassifier exposes through get_params().
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

### Now with a reduced feature subset (the distance to fire points is still included)

In [21]:
# Columns kept for the reduced model.
col_in = [
    "Elevation",
    "Horizontal_Distance_To_Roadways",
    "Horizontal_Distance_To_Fire_Points",
    "Soil_Type",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
]
# Apply the same column selection to the full set and to both halves of the split.
X_in, X_train_in, X_test_in = (frame[col_in] for frame in (X, X_train, X_test))

In [22]:
# Fit a random forest on the training part of the reduced feature set and
# predict the held-out test part. random_state is fixed (same seed as the
# train/test split) so the forest, and the scores computed from y_pred, are
# reproducible from one run to the next.
rf = RandomForestClassifier(
    max_leaf_nodes=1000000,
    max_depth=50,
    n_estimators=200,
    random_state=4444,
)
rf.fit(X_train_in, y_train)
y_pred = rf.predict(X_test_in)

In [23]:
# Per-class recall on the test set, in percent: correct predictions for a class
# divided by the number of rows that truly belong to it.
# The confusion matrix is computed once instead of once per class, and the
# label order is pinned to 1..7 so row/column i always refers to cover type i+1.
cm = confusion_matrix(y_test, y_pred, labels=list(range(1, 8)))
true_counts = cm.sum(axis=1)  # rows of the matrix are the true classes
print("Recall for cover:")
for i in range(7):
    print("Type {}:{}".format(i + 1, 100 * cm[i, i] / true_counts[i]))

Recall for cover:
Type 1:96.0835221550856
Type 2:97.45973103034439
Type 3:95.64609360432594
Type 4:82.76699029126213
Type 5:82.83005617977528
Type 6:88.54126679462571
Type 7:95.40061758491792


In [24]:
# Per-class precision on the test set, in percent: correct predictions for a
# class divided by the number of rows predicted as that class.
# The confusion matrix is computed once instead of once per class, and the
# label order is pinned to 1..7 so row/column i always refers to cover type i+1.
cm = confusion_matrix(y_test, y_pred, labels=list(range(1, 8)))
predicted_counts = cm.sum(axis=0)  # columns of the matrix are the predicted classes
print("Precision for cover:")
for i in range(7):
    print("Type {}:{}".format(i + 1, 100 * cm[i, i] / predicted_counts[i]))

Precision for cover:
Type 1:96.73499778214308
Type 2:96.37791610913956
Type 3:94.10199963309485
Type 4:86.11111111111111
Type 5:93.68546465448769
Type 6:92.85426731078906
Type 7:96.94467382328654


### Training with the whole set

In [25]:
# Refit the forest on the whole reduced feature set. random_state is fixed so
# the fitted model is reproducible.
# NOTE: y_pred below is predicted on the very rows the model was trained on, so
# scores computed from it describe the fit to the training data, not
# performance on unseen data.
rf = RandomForestClassifier(
    max_leaf_nodes=1000000,
    max_depth=50,
    n_estimators=200,
    random_state=4444,
)
rf.fit(X_in, y)
y_pred = rf.predict(X_in)

In [26]:
# Per-class recall on the whole set, in percent: correct predictions for a
# class divided by the number of rows that truly belong to it.
# The confusion matrix is computed once instead of once per class, and the
# label order is pinned to 1..7 so row/column i always refers to cover type i+1.
cm = confusion_matrix(y, y_pred, labels=list(range(1, 8)))
true_counts = cm.sum(axis=1)  # rows of the matrix are the true classes
print("Recall for cover:")
for i in range(7):
    print("Type {}:{}".format(i + 1, 100 * cm[i, i] / true_counts[i]))

Recall for cover:
Type 1:100.0
Type 2:100.0
Type 3:100.0
Type 4:100.0
Type 5:100.0
Type 6:100.0
Type 7:100.0


In [27]:
# Per-class precision on the whole set, in percent: correct predictions for a
# class divided by the number of rows predicted as that class.
# The confusion matrix is computed once instead of once per class, and the
# label order is pinned to 1..7 so row/column i always refers to cover type i+1.
cm = confusion_matrix(y, y_pred, labels=list(range(1, 8)))
predicted_counts = cm.sum(axis=0)  # columns of the matrix are the predicted classes
print("Precision for cover:")
for i in range(7):
    print("Type {}:{}".format(i + 1, 100 * cm[i, i] / predicted_counts[i]))

Precision for cover:
Type 1:100.0
Type 2:100.0
Type 3:100.0
Type 4:100.0
Type 5:100.0
Type 6:100.0
Type 7:100.0
