# Capítulo 06 - Machine Learning - Arvore de Decisão

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree

#### Carregando a base de dados bank-numeric

In [0]:
bank = pd.read_csv('bank-numeric.csv')
bank.head()

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,recent_pdays,deposit_cat,job_blue-collar,job_entrepreneur,job_other,job_pink-collar,job_self-employed,job_technician,job_white-collar,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
0,59,2343,1042,1,0,0,1,0,0.0001,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1
1,56,45,1467,1,0,0,0,0,0.0001,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1
2,41,1270,1389,1,0,0,1,0,0.0001,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1
3,55,2476,579,1,0,0,1,0,0.0001,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1
4,54,184,673,2,0,0,0,0,0.0001,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1


#### Separando as features das classes

In [0]:
bank_data = bank.drop('deposit_cat', 1)
bank_target = bank.deposit_cat

#### Dividindo os dados em treino e teste

In [0]:
X_train, X_test, y_train, y_test = train_test_split(bank_data,bank_target,test_size=0.3)

#### Função para treinar o modelo de arvore de decisão com o parametro max_depth

In [0]:
def compara_modelos(maxdepth):
    if maxdepth == 0:
        dt = tree.DecisionTreeClassifier(random_state=1)
    else:   
        dt = tree.DecisionTreeClassifier(random_state=1, max_depth=maxdepth)
    dt.fit(X_train, y_train)
    train_score = dt.score(X_train, y_train)
    test_score = dt.score(X_test, y_test)
    return train_score,test_score

compara_modelos(2)

(0.7299372840138231, 0.7240967452971037)

In [0]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos(2))))
print('{:1}         {} '.format(3,str(compara_modelos(3))))
print('{:1}         {} '.format(4,str(compara_modelos(4))))
print('{:1}         {} '.format(10,str(compara_modelos(10))))
print('{:1}         {} '.format(15,str(compara_modelos(15))))
print('{:1}         {} '.format('Full',str(compara_modelos(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.7299372840138231, 0.7240967452971037) 
3         (0.7657749904006144, 0.7673932517169304) 
4         (0.7862536797644951, 0.787996416840848) 
10         (0.8675284781773966, 0.7769483427888922) 
15         (0.9482913093562012, 0.7411167512690355) 
Full         (1.0, 0.7330546431770678) 


#### Verificando as features mais importantes para o modelo de arvore de decisão treinado

##### Treinando o modelo utilizando o valor de max_depth igual a 4

In [0]:
dt = tree.DecisionTreeClassifier(max_depth=4)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

##### Listando as features e sua importância para o modelo

In [0]:
features = dt.feature_importances_

l = len(bank.columns)
for i in range(0,len(bank.columns)-1):
    print('{:.<20} {:3}'.format(features[i],fi[i]))

0.0033553729713..... 0.0033553729713
0.00212350316418.... 0.00212350316418
0.669133838704...... 0.669133838704
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.112145036782...... 0.112145036782
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.00224028581684.... 0.00224028581684
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.211001962562...... 0.211001962562
0.0................. 0.0
