 Define Problem.
    Prepare Data.
    Evaluate Algorithms.
    Improve Results.
    Present Results.
    
    http://archive.ics.uci.edu/ml/datasets/Covertype


In [1]:
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import preprocessing

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from statistics import mean
from numpy.linalg import norm
from matplotlib.colors import ListedColormap
import random

In [2]:
file = "covtype.data"
#nome dos atributos
#estamos descartando os atributos que descrevem o tipo do solo
names = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area_Rawah', 'Wilderness_Area_Neota', 'Wilderness_Area_Comanche', 'Wilderness_Area_Cache', 'Cover_Type']
#names = ['Elevation', 'Aspect', 'Slope', 'Cover_Type']

#usecols = [0..13, 54]

usecols = list(range(0, 14)) + [54]
#usecols = list(range(0, 3)) + [54]

#especifico o tipo de alguns parametros(os que não são simplesmente numéricos)
dtype = {'Cover_Type': 'category', 'Wilderness_Area_Rawah' : bool, 'Wilderness_Area_Neota' : bool, 'Wilderness_Area_Comanche' : bool, 'Wilderness_Area_Cache' : bool}

dataset = pandas.read_csv(file, header = None, usecols = usecols, names = names, dtype = dtype)

#adicionando uma coluna adicional para sintetizar os 4 boleanos que representam a Wilderness_area. 
#para uma única instância, somente um dos 4 booleanos pode ser verdadeiro, logo eles, em realidade, funcionam como uma categorização
#
new_column = pandas.Series([1 if dataset['Wilderness_Area_Rawah'][i] else 
                            2 if dataset['Wilderness_Area_Neota'][i] else
                            3 if dataset['Wilderness_Area_Comanche'][i] else
                            4 for i in range(len(dataset.index)) ], dtype="category")
#elimina as colunas reduzidas
dataset = dataset.drop(columns=['Wilderness_Area_Rawah', 'Wilderness_Area_Neota', 'Wilderness_Area_Comanche', 'Wilderness_Area_Cache'])
#insere nova coluna na posição 10
dataset.insert(loc = 10, column = 'Wilderness_Area', value = new_column)

names = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area', 'Cover_Type']

In [None]:
print(dataset.dtypes)

In [None]:
print(dataset.shape)
#print(dataset.head(5))
dataset.describe()



In [None]:
#quantidade de exemplares por classificação
dataset.groupby('Cover_Type').size()

In [None]:
plt.rcParams["figure.figsize"] = (20,40)

dataset.plot(kind='box', subplots=True, layout=(4,4), sharex=False, sharey=False)

plt.show()

In [None]:
gp = dataset.groupby('Cover_Type')
for name in names:
    print(name)
    display(gp[name].describe())

In [None]:
dataset['Elevation'].hist()
plt.show()

In [None]:
scatter_matrix(dataset)
plt.show()

In [None]:
#faz um scatter plot maneiro, seleto
#para cada classificação, seleciona N amostras aleatórias pra serem plotadas
seed = 7
N = 250
#markerTypes = {'1':'.', '2': 'x', '3': 'o', '4': '^', '5': '1', '6': '2', '7': '3'}
markerTypes = {'1':'.', '2': '.', '3': '.', '4': '.', '5': '.', '6': '.', '7': '.'}
points = []
markers = []
i = 0
gp = dataset.groupby('Cover_Type', sort = False)
samples = gp.apply(lambda x: x.sample(n = N, random_state = seed))

#plota grafico name1 x name2
name1, name2 = 'Elevation', 'Slope'
i = 0
fig, ax = plt.subplots()
for cover_type, group in samples.groupby('Cover_Type'):
    
    plt.scatter(group[name1], group[name2], marker = markerTypes[cover_type])
    
ax.set(xlabel=name1, ylabel=name2,
       title=name1 + ' x ' + name2)
plt.show()

#plota grafico name1 x name2
name1, name2 = 'Elevation', 'Horizontal_Distance_To_Roadways'
i = 0
fig, ax = plt.subplots()
for cover_type, group in samples.groupby('Cover_Type'):
    
    plt.scatter(group[name1], group[name2], marker = markerTypes[cover_type])
    
ax.set(xlabel=name1, ylabel=name2,
       title=name1 + ' x ' + name2)
plt.show()

#plota grafico name1 x name2
name1, name2 = 'Elevation', 'Horizontal_Distance_To_Fire_Points'
i = 0
fig, ax = plt.subplots()
for cover_type, group in samples.groupby('Cover_Type'):
    
    plt.scatter(group[name1], group[name2], marker = markerTypes[cover_type])
    
ax.set(xlabel=name1, ylabel=name2,
       title=name1 + ' x ' + name2)
plt.show()


In [3]:
array = dataset.values
X = array[:, 0:11]
Y = array[:, 11]
#preprocessa dataset
X_scale_temp = preprocessing.scale(X[:, 0:10])
print(X[:, 10].reshape(-1, 1).shape)
X_scaled = np.append(X_scale_temp, X[:, 10].reshape(-1, 1), axis = 1)


validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size = validation_size, random_state = seed)



(581012, 1)


In [None]:
print(X_scaled[0, :])

In [4]:
#test options and evaluation metric
seed = 7
scoring = 'accuracy'

In [5]:
#testando varios algoritmos

#custom metric:
#calcula a distancia entre os vetores u e v. 
#É necessário customizar essa métrica porque o 'aspecto' não funciona da mesma maneira que as outras medidas: um aspecto de 359 grau é próximo ao de 0 grau, e não distante
def custom_distance(u, v):
    #para todos os campos "normais", só subtraio valores
    custom = u - v
    #mas, para o aspect, faço mod 360 . [1] é aspect
    custom[1] = (u[1] - v[1]) % 360
    dist = norm(u-v)
    return dist



models = []
#models.append(('LR', LogisticRegression()))
#models.append(('LDA', LinearDiscriminantAnalysis()))
#usar uma função customizada aumentou estupidamente o tempo de processamento, sem gerar benefício algum
#models.append(('KNN-Custom', KNeighborsClassifier(metric = custom_distance)))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
#models.append(('NB', GaussianNB()))
#models.append(('SVM', SVC()))

#avalia cada um deles
results = []
model_names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits = 10, random_state = seed)
    print("here")
    cv_results = model_selection.cross_val_score(model, X, Y, cv = kfold, scoring = scoring)
    print("there")
    results.append(cv_results)
    model_names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    


here
there
KNN: 0.631658 (0.026439)
here
there
CART: 0.665721 (0.045751)


In [None]:
#tentar descobrir o nivel de importancia dos atributos baseado na arvore de decisão
dtc = DecisionTreeClassifier(random_state = seed)
dtc.fit(X_train, Y_train)
dtc.score(X_validation, Y_validation)

In [None]:
fi = dtc.feature_importances_
for i in range(len(fi)):
    msg = "%s: %f" % (names[i], fi[i])
    print(msg)

In [None]:
#https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html

estimator = dtc

n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
threshold = estimator.tree_.threshold





# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
              "node %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i],
                 ))
print()

In [None]:
print("Node count:", n_nodes)
print("Leaf node count:", sum(b for b in is_leaves))
print("Max node depth:", node_depth.max())

In [None]:
# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

X_test = X_validation

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] == node_id:
        print("leaf node:",  node_id, "classification", estimator.predict(X_test[sample_id].reshape(1, -1)))
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))